A PyTorch implementation of the paper "CSI: a hybrid deep neural network for fake news detection"
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

data preprocessing.ipynb 24KB


  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "from __future__ import division\n",
  10. "import re\n",
  11. "from collections import Counter\n",
  12. "import pickle\n",
  13. "import numpy as np\n",
  14. "import os\n",
  15. "from tqdm.notebook import tqdm\n",
  16. "from matplotlib import pyplot as plt\n",
  17. "import jieba, re\n",
  18. "import time\n",
  19. "from sklearn.utils.extmath import randomized_svd\n",
  20. "\n",
  21. "\n",
  22. "from gensim import utils\n",
  23. "from gensim.models.doc2vec import TaggedDocument\n",
  24. "from gensim.models import Doc2Vec\n",
  25. "\n",
  26. "import json"
  27. ]
  28. },
  29. {
  30. "cell_type": "markdown",
  31. "metadata": {},
  32. "source": [
  33. "## data dictionary"
  34. ]
  35. },
  36. {
  37. "cell_type": "code",
  38. "execution_count": 23,
  39. "metadata": {},
  40. "outputs": [
  41. {
  42. "data": {
  43. "application/vnd.jupyter.widget-view+json": {
  44. "model_id": "45d59671b76b4f8ab7fac5fe8c9228c8",
  45. "version_major": 2,
  46. "version_minor": 0
  47. },
  48. "text/plain": [
  49. "HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))"
  50. ]
  51. },
  52. "metadata": {},
  53. "output_type": "display_data"
  54. },
  55. {
  56. "name": "stdout",
  57. "output_type": "stream",
  58. "text": [
  59. "\n"
  60. ]
  61. },
  62. {
  63. "data": {
  64. "application/vnd.jupyter.widget-view+json": {
  65. "model_id": "ffa76ec995ac4712a7518f649aaca4b3",
  66. "version_major": 2,
  67. "version_minor": 0
  68. },
  69. "text/plain": [
  70. "HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))"
  71. ]
  72. },
  73. "metadata": {},
  74. "output_type": "display_data"
  75. },
  76. {
  77. "name": "stdout",
  78. "output_type": "stream",
  79. "text": [
  80. "\n"
  81. ]
  82. }
  83. ],
  84. "source": [
  85. "path = '/media/external_3TB/3TB/rafie/master/model-inputs'\n",
  86. "dataset = 'twitter'\n",
  87. "split = 'validation'\n",
  88. "\n",
  89. "f = open(f'{path}/{dataset}/{split}.txt', \"r\")\n",
  90. "lines = f.readlines()\n",
  91. "f.close()\n",
  92. "\n",
  93. "events = {}\n",
  94. "for line in tqdm(lines):\n",
  95. " line = json.loads(line.strip())\n",
  96. " events[str(line[0]['eid'])] = {'label' : line[1]}\n",
  97. " \n",
  98. "def process_tweet(tweet):\n",
  99. "# return tweet['t'], tweet['uid'], tweet['text']\n",
  100. " t = int(time.mktime(time.strptime(tweet['created_at'],\"%a %b %d %H:%M:%S +0000 %Y\")))\n",
  101. " uid = tweet['user']['id']\n",
  102. " text = tweet['text']\n",
  103. " return t, uid, text\n",
  104. " \n",
  105. "\n",
  106. "path = f'/media/external_3TB/3TB/rafie/master/{dataset}-raw-data/{dataset.capitalize()}'\n",
  107. "for event in tqdm(events):\n",
  108. " timestamps = []\n",
  109. " uids = []\n",
  110. " texts = []\n",
  111. " \n",
  112. " for file in os.listdir(f\"{path}/{event}-{events[event]['label']}\"):\n",
  113. " file = json.load(open(f'{path}/{event}-{events[event][\"label\"]}/{file}'))\n",
  114. "# tweets = json.load(open(f'{path}/{event}.json'))\n",
  115. " for tweet in [file['tweet']] + file['retweets']:\n",
  116. " t, uid, text = process_tweet(tweet)\n",
  117. " timestamps.append(t) #\n",
  118. " uids.append(uid) # tweet['user_id']\n",
  119. " texts.append(text)\n",
  120. " \n",
  121. " events[event]['timestamps'] = timestamps\n",
  122. " events[event]['uid'] = uids\n",
  123. " events[event]['text'] = texts\n",
  124. " "
  125. ]
  126. },
  127. {
  128. "cell_type": "markdown",
  129. "metadata": {},
  130. "source": [
  131. "## user features"
  132. ]
  133. },
  134. {
  135. "cell_type": "code",
  136. "execution_count": 24,
  137. "metadata": {},
  138. "outputs": [],
  139. "source": [
  140. "cnt = Counter()\n",
  141. "for event in events:\n",
  142. " cnt.update(events[event]['uid'])\n",
  143. "\n",
  144. "topk = 20000\n",
  145. "top_users = list(map(lambda x: x[0], cnt.most_common(topk)))\n",
  146. "all_users = list(map(lambda x: x[0], cnt.most_common()))"
  147. ]
  148. },
  149. {
  150. "cell_type": "code",
  151. "execution_count": 25,
  152. "metadata": {},
  153. "outputs": [],
  154. "source": [
  155. "top_users_index = {}\n",
  156. "for ii, uid in enumerate(top_users):\n",
  157. " top_users_index[uid] = ii\n",
  158. "\n",
  159. "all_users_index = {}\n",
  160. "for ii, uid in enumerate(all_users):\n",
  161. " all_users_index[uid] = ii\n",
  162. " \n",
  163. "events_index = {}\n",
  164. "for ii, eid in enumerate(events):\n",
  165. " events_index[eid] = ii"
  166. ]
  167. },
  168. {
  169. "cell_type": "code",
  170. "execution_count": 26,
  171. "metadata": {},
  172. "outputs": [
  173. {
  174. "data": {
  175. "application/vnd.jupyter.widget-view+json": {
  176. "model_id": "36ad18cb3f634d92a890dbd9b20f53b5",
  177. "version_major": 2,
  178. "version_minor": 0
  179. },
  180. "text/plain": [
  181. "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
  182. ]
  183. },
  184. "metadata": {},
  185. "output_type": "display_data"
  186. },
  187. {
  188. "name": "stdout",
  189. "output_type": "stream",
  190. "text": [
  191. "\n"
  192. ]
  193. },
  194. {
  195. "data": {
  196. "application/vnd.jupyter.widget-view+json": {
  197. "model_id": "5964cfe1f2b44f44bbec0c061e2ed9fa",
  198. "version_major": 2,
  199. "version_minor": 0
  200. },
  201. "text/plain": [
  202. "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
  203. ]
  204. },
  205. "metadata": {},
  206. "output_type": "display_data"
  207. },
  208. {
  209. "name": "stdout",
  210. "output_type": "stream",
  211. "text": [
  212. "\n",
  213. "top_users_events_matrix shape : (20000, 78)\n",
  214. "Sparsity : 1.616 %\n",
  215. "matrix_main shape: (39948, 78)\n",
  216. "Sparsity : 1.449 % \n"
  217. ]
  218. }
  219. ],
  220. "source": [
  221. "from scipy.sparse import csr_matrix\n",
  222. "\n",
  223. "def get_user_in_event(eid, users):\n",
  224. " event_users = set(events[eid]['uid'])\n",
  225. " return list(set(users).intersection(event_users))\n",
  226. "\n",
  227. "def get_user_event_matrix(users, users_index):\n",
  228. " row = []\n",
  229. " col = []\n",
  230. " data = []\n",
  231. " for ii, (eid, value) in tqdm(enumerate(events.items())):\n",
  232. " user_in_event = get_user_in_event(eid, users)\n",
  233. " for uid in user_in_event:\n",
  234. " uind = users_index[uid]\n",
  235. " col.append(ii)\n",
  236. " row.append(uind)\n",
  237. " data.append(1)\n",
  238. " return csr_matrix((data, (row, col)), shape=(len(users), len(events)))\n",
  239. "\n",
  240. "sub_matrix = get_user_event_matrix(top_users, top_users_index)\n",
  241. "main_matrix = get_user_event_matrix(all_users, all_users_index)\n",
  242. "\n",
  243. "print(\"top_users_events_matrix shape : {}\".format(sub_matrix.shape))\n",
  244. "print(\"Sparsity : {:.3f} %\".format(sub_matrix.count_nonzero()/np.prod(sub_matrix.shape) * 100))\n",
  245. "print(\"matrix_main shape: {}\".format(main_matrix.shape))\n",
  246. "print(\"Sparsity : {:.3f} % \".format(main_matrix.count_nonzero()/np.prod(main_matrix.shape) * 100))"
  247. ]
  248. },
  249. {
  250. "cell_type": "code",
  251. "execution_count": 27,
  252. "metadata": {},
  253. "outputs": [],
  254. "source": [
  255. "RELOAD = False\n",
  256. "save_path = f'assets/{dataset}/{split}'\n",
  257. "os.makedirs(save_path, exist_ok=True)\n",
  258. "\n",
  259. "if RELOAD:\n",
  260. " \n",
  261. " u_main = np.load(open(f'{save_path}/u_main.npy','rb'))\n",
  262. " sigma_main = np.load(open(f'{save_path}/sigma_main.npy','rb'))\n",
  263. " vt_main = np.load(open(f'{save_path}/vt_main.npy','rb'))\n",
  264. " all_users_features = u_main@np.diag(sigma_main)\n",
  265. " \n",
  266. " u_sub = np.load(open(f'{save_path}/u_sub.npy','rb'))\n",
  267. " sigma_sub = np.load(open(f'{save_path}/sigma_sub.npy','rb'))\n",
  268. " vt_sub = np.load(open(f'{save_path}/vt_sub.npy','rb'))\n",
  269. " top_users_features = u_sub@np.diag(sigma_sub)\n",
  270. "else:\n",
  271. " num_main_features = 20 # 10 for weibo, 20 for tweet\n",
  272. " n_iter = 7 # 15 for weibo, 7 for tweet\n",
  273. " \n",
  274. " u_main, sigma_main, vt_main = randomized_svd(main_matrix, n_components=num_main_features, n_iter=n_iter, random_state=42)\n",
  275. " all_users_features = u_main@np.diag(sigma_main)\n",
  276. " \n",
  277. " \n",
  278. " num_sub_features = 50\n",
  279. " \n",
  280. " u_sub, sigma_sub, vt_sub = randomized_svd(sub_matrix@sub_matrix.T, n_components=num_sub_features, n_iter=n_iter, random_state=42) # random_state=42\n",
  281. " top_users_features = u_sub@np.diag(sigma_sub)\n",
  282. " \n",
  283. " np.save(f'{save_path}/u_main.npy',u_main)\n",
  284. " np.save(f'{save_path}/sigma_main.npy',sigma_main)\n",
  285. " np.save(f'{save_path}/vt_main.npy',vt_main)\n",
  286. " \n",
  287. " np.save(f'{save_path}/u_sub.npy',u_sub)\n",
  288. " np.save(f'{save_path}/sigma_sub.npy',sigma_sub)\n",
  289. " np.save(f'{save_path}/vt_sub.npy',vt_sub)"
  290. ]
  291. },
  292. {
  293. "cell_type": "code",
  294. "execution_count": 28,
  295. "metadata": {},
  296. "outputs": [
  297. {
  298. "data": {
  299. "text/plain": [
  300. "((39948, 20), (20000, 50))"
  301. ]
  302. },
  303. "execution_count": 28,
  304. "metadata": {},
  305. "output_type": "execute_result"
  306. }
  307. ],
  308. "source": [
  309. "all_users_features.shape, top_users_features.shape"
  310. ]
  311. },
  312. {
  313. "cell_type": "markdown",
  314. "metadata": {},
  315. "source": [
  316. "## text features"
  317. ]
  318. },
  319. {
  320. "cell_type": "code",
  321. "execution_count": 8,
  322. "metadata": {},
  323. "outputs": [
  324. {
  325. "data": {
  326. "application/vnd.jupyter.widget-view+json": {
  327. "model_id": "e3a6d76d2fee4a4188f90cf440e44c63",
  328. "version_major": 2,
  329. "version_minor": 0
  330. },
  331. "text/plain": [
  332. "HBox(children=(FloatProgress(value=0.0, max=806.0), HTML(value='')))"
  333. ]
  334. },
  335. "metadata": {},
  336. "output_type": "display_data"
  337. },
  338. {
  339. "name": "stdout",
  340. "output_type": "stream",
  341. "text": [
  342. "\n",
  343. "train # sentences : 66081\n"
  344. ]
  345. }
  346. ],
  347. "source": [
  348. "binsize = 3600\n",
  349. "threshold = 120*24\n",
  350. "chinese_stopwords = '、 。 〃 〄 々 〆 〇 〈〉 《 》 「 」 『 』 【】 〒 〓 〔 〕 〖 〗 〘〙 〚 〛 〛 〜 〝 〞 〟,'\n",
  351. "rx = '[' + re.escape(''.join(chinese_stopwords.split())) + ']'\n",
  352. "\n",
  353. "\n",
  354. "def get_sentences():\n",
  355. " sentences = []\n",
  356. " for eid in tqdm(events):\n",
  357. " ts = sorted(events[eid]['timestamps'])\n",
  358. " cnt, bins = np.histogram(ts, bins=range(ts[0],ts[0]+threshold*binsize,binsize))\n",
  359. "\n",
  360. " nonzero_bins_ind = np.nonzero(cnt)[0]\n",
  361. " nonzero_bins = bins[nonzero_bins_ind]\n",
  362. " hist = cnt[nonzero_bins_ind]\n",
  363. " inv = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]\n",
  364. " intervals = np.insert(inv,0,0)\n",
  365. "\n",
  366. " for bid, bin_left in enumerate(nonzero_bins):\n",
  367. " bin_right = bin_left + binsize\n",
  368. " doc = ''\n",
  369. " for tid, t in enumerate(ts):\n",
  370. " if t<bin_left:\n",
  371. " continue\n",
  372. " elif t>=bin_right:\n",
  373. " break\n",
  374. " string = events[eid]['text'][tid]\n",
  375. " string = re.sub(r\"http\\S+\", \"\", string)\n",
  376. " string = re.sub(\"[?!.,:;()'@#$%^&*-=+/\\[\\[\\]\\]]\", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\\\"\n",
  377. " doc += string\n",
  378. " sentences.append(TaggedDocument(\n",
  379. " words=doc, #jieba.lcut(doc), \n",
  380. " tags=[eid+'_%s' % bid]\n",
  381. " ))\n",
  382. " \n",
  383. " return sentences\n",
  384. " \n",
  385. "train_sentences = get_sentences()\n",
  386. "\n",
  387. "print(\"train # sentences : {}\".format(len(train_sentences)))"
  388. ]
  389. },
  390. {
  391. "cell_type": "code",
  392. "execution_count": 9,
  393. "metadata": {},
  394. "outputs": [
  395. {
  396. "data": {
  397. "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY8AAAD8CAYAAACPWyg8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEyJJREFUeJzt3XuMXOV5x/HvExsIzc0GtpZlO13SWIpM1RKYglGiKCUK2CSK+SOKjKpiUSuWCkhJL0pMI5XcKoVULQQ1IaGFYtIkxrkJi0Ad1yC1UsVlHYjBEMfLTdgC7MRcGkUihTz9Y94Nx8vs7rzrGc9s/P1IoznnOe8555kzu/PzzDmzjsxEkqQarxt0A5KkucfwkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUbf6gG5itU045JUdHRwfdhiTNGTt37vxZZo70YltzNjxGR0cZGxsbdBuSNGdExJO92pYfW0mSqhkekqRqXYVHRDwREQ9GxAMRMVZqJ0XE9ojYW+4XlnpExLURMR4RuyLijMZ21pXxeyNiXaN+Ztn+eFk3ev1AJUm9U/PO408y8/TMbJX5jcCOzFwO7CjzAKuB5eW2AbgO2mEDXAmcDZwFXDkROGXMRxvrrZr1I5Ik9d2RfGy1BthUpjcBFzbqN2fb3cCCiFgMnA9sz8xDmfkcsB1YVZa9OTPvzvZ/LnJzY1uSpCHUbXgk8MOI2BkRG0ptUWY+XaafARaV6SXAU41195XadPV9HeqSpCHV7aW6787M/RHxu8D2iPhJc2FmZkT0/b8kLMG1AeCtb31rv3cnSZpCV+88MnN/uT8AfJ/2OYtny0dOlPsDZfh+YFlj9aWlNl19aYd6pz6uz8xWZrZGRnryPRdJ0izMGB4R8YaIeNPENHAe8BCwFZi4YmodcGuZ3gpcXK66Wgm8UD7e2gacFxELy4ny84BtZdmLEbGyXGV1cWNbkqQh1M3HVouA75erZ+cD38zM/4iI+4AtEbEeeBL4SBl/O3ABMA78ErgEIDMPRcTngPvKuM9m5qEyfSlwE3AicEe5SZKGVLQvcJp7Wq1W+udJJKl7EbGz8XWLI+I3zCVJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVug6PiJgXEfdHxG1l/tSIuCcixiPilog4vtRPKPPjZfloYxtXlPqeiDi/UV9VauMRsbF3D0+S1A817zw+BjzSmL8KuDoz3w48B6wv9fXAc6V+dRlHRKwA1gKnAauAr5RAmgd8GVgNrAAuKmMlSUOqq/CIiKXAB4B/LfMBnAt8pwzZBFxYpteUecry95Xxa4DNmflSZj4OjANnldt4Zj6Wmb8CNpexkqQh1e07j2uATwC/LvMnA89n5stlfh+wpEwvAZ4CKMtfKON/U5+0zlR1SdKQmjE8IuKDwIHM3HkU+pmplw0RMRYRYwcPHhx0O5J0zOrmnce7gA9FxBO0P1I6F/gSsCAi5pcxS4H9ZXo/sAygLH8L8PNmfdI6U9VfIzOvz8xWZrZGRka6aF2S1A8zhkdmXpGZSzNzlPYJ7zsz80+Bu4APl2HrgFvL9NYyT1l+Z2Zmqa8tV2OdCiwH7gXuA5aXq7eOL/vY2pNHJ0nqi/kzD5nSJ4HNEfF54H7ghlK/Afh6RIwDh2iHAZm5OyK2AA8DLwOXZeYrABFxObANmAfcmJm7j6AvSVKfRftNwdzTarVybGxs0G1I0pwRETszs9WLbfkNc0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCR
J1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVK1GcMjIl4fEfdGxI8jYndEfKbUT42IeyJiPCJuiYjjS/2EMj9elo82tnVFqe+JiPMb9VWlNh4RG3v/MCVJvdTNO4+XgHMz84+A04FVEbESuAq4OjPfDjwHrC/j1wPPlfrVZRwRsQJYC5wGrAK+EhHzImIe8GVgNbACuKiMlSQNqRnDI9t+UWaPK7cEzgW+U+qbgAvL9JoyT1n+voiIUt+cmS9l5uPAOHBWuY1n5mOZ+StgcxkrSRpSXZ3zKO8QHgAOANuBR4HnM/PlMmQfsKRMLwGeAijLXwBObtYnrTNVXZI0pLoKj8x8JTNPB5bSfqfwjr52NYWI2BARYxExdvDgwUG0IEmi8mqrzHweuAs4B1gQEfPLoqXA/jK9H1gGUJa/Bfh5sz5pnanqnfZ/fWa2MrM1MjJS07okqYe6udpqJCIWlOkTgfcDj9AOkQ+XYeuAW8v01jJPWX5nZmapry1XY50KLAfuBe4Dlpert46nfVJ9ay8enCSpP+bPPITFwKZyVdTrgC2ZeVtEPAxsjojPA/cDN5TxNwBfj4hx4BDtMCAzd0fEFuBh4GXgssx8BSAiLge2AfOAGzNzd88eoSSp56L9pmDuabVaOTY2Nug2JGnOiIidmdnqxbb8hrkkqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySp2ozhERHLIuKuiHg4InZHxMdK/aSI2B4Re8v9wlKPiLg2IsYjYldEnNHY1royfm9ErGvUz4yIB8s610ZE9OPBSpJ6o5t3Hi8Df52ZK4CVwGURsQLYCOzIzOXAjjIPsBpYXm4bgOugHTbAlcDZwFnAlROBU8Z8tLHeqiN/aJKkfpkxPDLz6cz8UZn+X+ARYAmwBthUhm0CLizTa4Cbs+1uYEFELAbOB7Zn5qHMfA7YDqwqy96cmXdnZgI3N7YlSRpCVec8ImIUeCdwD7AoM58ui54BFpXpJcBTjdX2ldp09X0d6pKkIdV1eETEG4HvAh/PzBeby8o7huxxb5162BARYxExdvDgwX7vTpI0ha7CIyKOox0c38jM75Xys+UjJ8r9gVLfDyxrrL601KarL+1Qf43MvD4zW5nZGhkZ6aZ1SVIfdHO1VQA3AI9k5j81Fm0FJq6YWgfc2qhfXK66Wgm8UD7e2gacFxELy4ny84BtZdmLEbGy7OvixrYkSUNofhdj3gX8GfBgRDxQan8LfAHYEhHrgSeBj5RltwMXAOPAL4FLADLzUER8DrivjPtsZh4q05cCNwEnAneUmyRpSEX7dMXc02q1cmxsbNBtSNKcERE7M7PVi235DXNJUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lStRnDIyJujIgDEfFQo3ZSRGyPiL3lfmGpR0RcGxHjEbErIs5orLOujN8bEesa9TMj4sGyzrUREb1+kJKk3urmncdNwKpJtY3AjsxcDuwo8wCrgeXltgG4DtphA1wJnA2cBVw5EThlzEcb603elyRpyMwYHpn5X8ChSeU
1wKYyvQm4sFG/OdvuBhZExGLgfGB7Zh7KzOeA7cCqsuzNmXl3ZiZwc2NbkqQhNdtzHosy8+ky/QywqEwvAZ5qjNtXatPV93WoS5KG2BGfMC/vGLIHvcwoIjZExFhEjB08ePBo7FKS1MFsw+PZ8pET5f5Aqe8HljXGLS216epLO9Q7yszrM7OVma2RkZFZti5JOlKzDY+twMQVU+uAWxv1i8tVVyuBF8rHW9uA8yJiYTlRfh6wrSx7MSJWlqusLm5sS5I0pObPNCAivgW8FzglIvbRvmrqC8CWiFgPPAl8pAy/HbgAGAd+CVwCkJmHIuJzwH1l3Gczc+Ik/KW0r+g6Ebij3CRJQyzapyzmnlarlWNjY4NuQ5LmjIjYmZmtXmzLb5hLkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqodk+ExuvEHg25Bkua0YzI8JElHxvCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdWGJjwiYlVE7ImI8YjYOOh+JElTG4rwiIh5wJeB1cAK4KKIWNHPffpFQUmavaEID+AsYDwzH8vMXwGbgTX93qkBIkmzMyzhsQR4qjG/r9T6bnTjDwwRSao0f9AN1IiIDcCGMvuLiNgzy02dAvzssG1fdSSd9dRrehsi9jY79jY7w9wbDHd/U/X2e73awbCEx35gWWN+aakdJjOvB64/0p1FxFhmto50O/1gb7Njb7Njb7M3zP0djd6G5WOr+4DlEXFqRBwPrAW2DrgnSdIUhuKdR2a+HBGXA9uAecCNmbl7wG1JkqYwFOEBkJm3A7cfpd0d8UdffWRvs2Nvs2NvszfM/fW9t8jMfu9DkvRbZljOeUiS5pLMPGZuwCpgDzAObOzzvp4AHgQeAMZK7SRgO7C33C8s9QCuLX3tAs5obGddGb8XWNeon1m2P17WjWl6uRE4ADzUqPW9l6n20UVvn6Z9td0D5XZBY9kVZT97gPNnem6BU4F7Sv0W4PhSP6HMj5flox16WwbcBTwM7AY+NizHbpreBn7sgNcD9wI/Lr19Zrbb61XPXfR2E/B447idPojfhzJuHnA/cNuwHLeOffbzBXSYbuUJeRR4G3B8+eFZ0cf9PQGcMqn2xYknDNgIXFWmLwDuKD+oK4F7Gj9sj5X7hWV64oXq3jI2yrqrp+nlPcAZHP4C3fdeptpHF719GvibDmNXlOfthPLD/mh5Xqd8boEtwNoy/VXgL8r0pcBXy/Ra4JYO+1tMebEA3gT8tPQw8GM3TW8DP3blsbyxTB9H+0VpZe32etlzF73dBHy4w3E7qr8PZdlfAd/k1fAY+HHr2Ge/XjyH7QacA2xrzF8BXNHH/T3Ba8NjD7C4TC8G9pTprwEXTR4HXAR8rVH/WqktBn7SqB82bop+Rjn8BbrvvUy1jy56+zSdXwAPe85oX513zlTPbfnl/Rkwf/LPwMS6ZXp+GTflu7cy7lbg/cN07Dr0NlTHDvgd4EfA2bXb62XPXfR2E53D46g+p7S/47YDOBe4bTbPQ7+P28TtWDrncbT/BEoCP4yIneWb8QCLMvPpMv0MsGiG3qar7+tQr3E0eplqH924PCJ2RcSNEbFwlr2dDDyfmS936O0365TlL5TxHUXEKPBO2v9SHapjN6k3GIJjFxHzIuIB2h9Jbqf9L97a7fWy5yl7y8yJ4/b35bhdHREnTO6tyx6O9Dm9BvgE8OsyP5vnoS/HbbJjKTyOtndn5hm0/1LwZRHxnubCbEd8DqSzSY5GL5X7uA74feB04GngH/vVVzci4o3Ad4GPZ+aLzWWDPnYdehuKY5eZr2Tm6bT/JX0W8I5B9NHJ5N4i4g9o/wv8HcAf0/4o6pN97uE1z2lEfBA4kJk7+7nvXjmWwqO
rP4HSK5m5v9wfAL5P+xfo2YhYDFDuD8zQ23T1pR3qNY5GL1PtY1qZ+Wz5Bf818C+0j91sevs5sCAi5k+qH7atsvwtZfxhIuI42i/O38jM783wuI7qsevU2zAdu9LP87RP7J8zi+31sufpeluVmU9n20vAvzH743Ykz+m7gA9FxBO0/7L4ucCXpnlMAzluvzHT51q/LTfanwk+RvsE0sTJotP6tK83AG9qTP8P7asc/oHDT5h9sUx/gMNPyt1b6ifRvgJkYbk9DpxUlk0+KXfBDD2Ncvh5hb73MtU+uuhtcWP6L4HNZfo0Dj8R+Bjtk4BTPrfAtzn8ROClZfoyDj/ZuKVDXwHcDFwzqT7wYzdNbwM/dsAIsKBMnwj8N/DB2u31sucuelvcOK7XAF8Y1O9DWf5eXj1hPvDj1rHHfrx4DuuN9pUTP6X9+eun+rift5UnZuJywE+V+sm0T4btBf6z8cMWtP8zrEdpX+LXamzrz2lfPjcOXNKot4CHyjr/zPSX6n6L9kcY/0f788z1R6OXqfbRRW9fL/veRftvnDVfED9V9rOHxhVmUz235bm4t/T8beCEUn99mR8vy9/Wobd30/5oYReNS1+H4dhN09vAjx3wh7QvNd1VHtvfzXZ7veq5i97uLMftIeDfefWKrKP6+9DYxnt5NTwGftw63fyGuSSp2rF0zkOS1COGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqr9P21J550AQwc7AAAAAElFTkSuQmCC\n",
  398. "text/plain": [
  399. "<Figure size 432x288 with 1 Axes>"
  400. ]
  401. },
  402. "metadata": {
  403. "needs_background": "light"
  404. },
  405. "output_type": "display_data"
  406. }
  407. ],
  408. "source": [
  409. "lens = [len(s.words) for s in train_sentences]\n",
  410. "plt.hist(lens, 1000)\n",
  411. "plt.show()"
  412. ]
  413. },
  414. {
  415. "cell_type": "code",
  416. "execution_count": 10,
  417. "metadata": {},
  418. "outputs": [
  419. {
  420. "name": "stdout",
  421. "output_type": "stream",
  422. "text": [
  423. "build_vocab is done.\n",
  424. "doc2vec training is done.\n"
  425. ]
  426. }
  427. ],
  428. "source": [
  429. "reload = False\n",
  430. "if reload:\n",
  431. " doc_vectorizer = Doc2Vec.load(f'assets/{dataset}/doc2vec.model')\n",
  432. " print(\"doc_vectorizer is loaded.\")\n",
  433. "else:\n",
  434. " doc_vectorizer = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-4, negative=5, workers=8)\n",
  435. " doc_vectorizer.build_vocab(train_sentences)\n",
  436. " print(\"build_vocab is done.\")\n",
  437. " doc_vectorizer.train(train_sentences,total_examples=doc_vectorizer.corpus_count,epochs=10)\n",
  438. " print(\"doc2vec training is done.\")\n",
  439. " doc_vectorizer.save(f'assets/{dataset}/doc2vec.model')"
  440. ]
  441. },
  442. {
  443. "cell_type": "markdown",
  444. "metadata": {},
  445. "source": [
  446. "## gather all features"
  447. ]
  448. },
  449. {
  450. "cell_type": "code",
  451. "execution_count": 29,
  452. "metadata": {},
  453. "outputs": [],
  454. "source": [
  455. "threshold = 120*24\n",
  456. "binsize = 3600\n",
  457. "\n",
  458. "def get_event_features(eid):\n",
  459. " \n",
  460. " ## capture\n",
  461. " ts = sorted(events[eid]['timestamps'])\n",
  462. " cnt, bins = np.histogram(ts, bins=range(ts[0],ts[0]+threshold*binsize,binsize))\n",
  463. "\n",
  464. " nonzero_bins_ind = np.nonzero(cnt)[0]\n",
  465. " nonzero_bins = bins[nonzero_bins_ind]\n",
  466. "\n",
  467. " # num_engagements and time intervals\n",
  468. " hist = cnt[nonzero_bins_ind].reshape(-1,1)\n",
  469. " deltas = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]\n",
  470. " deltas = np.insert(deltas,0,0).reshape(-1, 1)\n",
  471. " \n",
  472. " # user features\n",
  473. " X_users = []\n",
  474. " for bid, bin_left in enumerate(nonzero_bins):\n",
  475. " bin_userlist = []\n",
  476. " bin_right = bin_left + binsize\n",
  477. " bin_user_feature = []\n",
  478. " for tid, t in enumerate(ts):\n",
  479. " if t<bin_left:\n",
  480. " continue\n",
  481. " elif t>=bin_right:\n",
  482. " break\n",
  483. " uind = all_users_index[events[eid]['uid'][tid]]\n",
  484. " bin_user_feature += [all_users_features[uind].reshape(1,-1)] # (1,n_components)\n",
  485. "\n",
  486. " X_users += [np.concatenate(bin_user_feature, axis=0).mean(axis=0).reshape(1,-1)]\n",
  487. " X_users = np.concatenate(X_users, axis=0)\n",
  488. " \n",
  489. " # text features\n",
  490. " X_text = []\n",
  491. " for bid, bin_left in enumerate(nonzero_bins):\n",
  492. " bin_right = bin_left + binsize\n",
  493. " doc = ''\n",
  494. " for tid, t in enumerate(ts):\n",
  495. " if t<bin_left:\n",
  496. " continue\n",
  497. " elif t>=bin_right:\n",
  498. " break\n",
  499. " string = events[eid]['text'][tid]\n",
  500. " string = re.sub(r\"http\\S+\", \"\", string)\n",
  501. " string = re.sub(\"[?!.,:;()'@#$%^&*-=+/\\[\\[\\]\\]]\", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\\\"\n",
  502. " doc += string\n",
  503. " X_text += [doc_vectorizer.infer_vector(\n",
  504. "# jieba.lcut(doc)\n",
  505. " doc\n",
  506. " ).reshape(1,-1)]\n",
  507. " X_text = np.concatenate(X_text, axis=0)\n",
  508. " \n",
  509. " # stack all\n",
  510. " X_capture = np.hstack([hist, deltas, X_users, X_text])\n",
  511. " \n",
  512. " ## score\n",
  513. " eind = events_index[eid]\n",
  514. " top_active_users = np.where(sub_matrix[:, eind].toarray() > 0)[0]\n",
  515. " X_score = top_users_features[top_active_users]\n",
  516. " \n",
  517. " ## label\n",
  518. " y = events[eid]['label']\n",
  519. " \n",
  520. " return X_capture, X_score, y"
  521. ]
  522. },
  523. {
  524. "cell_type": "code",
  525. "execution_count": 30,
  526. "metadata": {},
  527. "outputs": [],
  528. "source": [
  529. "# xc, xs, y = get_event_features('9833726676')\n",
  530. "# xc.shape, xs.shape, y"
  531. ]
  532. },
  533. {
  534. "cell_type": "code",
  535. "execution_count": 31,
  536. "metadata": {},
  537. "outputs": [
  538. {
  539. "data": {
  540. "application/vnd.jupyter.widget-view+json": {
  541. "model_id": "f4d178f5971648f589d3b1637893930a",
  542. "version_major": 2,
  543. "version_minor": 0
  544. },
  545. "text/plain": [
  546. "HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))"
  547. ]
  548. },
  549. "metadata": {},
  550. "output_type": "display_data"
  551. },
  552. {
  553. "name": "stdout",
  554. "output_type": "stream",
  555. "text": [
  556. "\n"
  557. ]
  558. }
  559. ],
  560. "source": [
  561. "save_path = f'assets/{dataset}/{split}/pkls/'\n",
  562. "os.makedirs(save_path, exist_ok=True)\n",
  563. "\n",
  564. "for eid in tqdm(events):\n",
  565. " xc, xs, y = get_event_features(eid)\n",
  566. " pickle.dump(\n",
  567. " {'x_capture': xc, 'x_score':xs, 'label': y}, \n",
  568. " open(save_path + eid + '.pkl', 'wb')\n",
  569. " )"
  570. ]
  571. },
  572. {
  573. "cell_type": "code",
  574. "execution_count": null,
  575. "metadata": {},
  576. "outputs": [],
  577. "source": [
  578. " "
  579. ]
  580. }
  581. ],
  582. "metadata": {
  583. "kernelspec": {
  584. "display_name": "Python 3",
  585. "language": "python",
  586. "name": "python3"
  587. },
  588. "language_info": {
  589. "codemirror_mode": {
  590. "name": "ipython",
  591. "version": 3
  592. },
  593. "file_extension": ".py",
  594. "mimetype": "text/x-python",
  595. "name": "python",
  596. "nbconvert_exporter": "python",
  597. "pygments_lexer": "ipython3",
  598. "version": "3.6.8"
  599. }
  600. },
  601. "nbformat": 4,
  602. "nbformat_minor": 2
  603. }