{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from __future__ import division\n", "import re\n", "from collections import Counter\n", "import pickle\n", "import numpy as np\n", "import os\n", "from tqdm.notebook import tqdm\n", "from matplotlib import pyplot as plt\n", "import jieba, re\n", "import time\n", "from sklearn.utils.extmath import randomized_svd\n", "\n", "\n", "from gensim import utils\n", "from gensim.models.doc2vec import TaggedDocument\n", "from gensim.models import Doc2Vec\n", "\n", "import json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## data dictionary" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "45d59671b76b4f8ab7fac5fe8c9228c8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ffa76ec995ac4712a7518f649aaca4b3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "path = '/media/external_3TB/3TB/rafie/master/model-inputs'\n", "dataset = 'twitter'\n", "split = 'validation'\n", "\n", "f = open(f'{path}/{dataset}/{split}.txt', \"r\")\n", "lines = f.readlines()\n", "f.close()\n", "\n", "events = {}\n", "for line in tqdm(lines):\n", " line = json.loads(line.strip())\n", " events[str(line[0]['eid'])] = {'label' : line[1]}\n", " \n", "def process_tweet(tweet):\n", "# return tweet['t'], tweet['uid'], tweet['text']\n", " t = int(time.mktime(time.strptime(tweet['created_at'],\"%a %b %d %H:%M:%S +0000 
%Y\")))\n", " uid = tweet['user']['id']\n", " text = tweet['text']\n", " return t, uid, text\n", " \n", "\n", "path = f'/media/external_3TB/3TB/rafie/master/{dataset}-raw-data/{dataset.capitalize()}'\n", "for event in tqdm(events):\n", " timestamps = []\n", " uids = []\n", " texts = []\n", " \n", " for file in os.listdir(f\"{path}/{event}-{events[event]['label']}\"):\n", " file = json.load(open(f'{path}/{event}-{events[event][\"label\"]}/{file}'))\n", "# tweets = json.load(open(f'{path}/{event}.json'))\n", " for tweet in [file['tweet']] + file['retweets']:\n", " t, uid, text = process_tweet(tweet)\n", " timestamps.append(t) #\n", " uids.append(uid) # tweet['user_id']\n", " texts.append(text)\n", " \n", " events[event]['timestamps'] = timestamps\n", " events[event]['uid'] = uids\n", " events[event]['text'] = texts\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## user features" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "cnt = Counter()\n", "for event in events:\n", " cnt.update(events[event]['uid'])\n", "\n", "topk = 20000\n", "top_users = list(map(lambda x: x[0], cnt.most_common(topk)))\n", "all_users = list(map(lambda x: x[0], cnt.most_common()))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "top_users_index = {}\n", "for ii, uid in enumerate(top_users):\n", " top_users_index[uid] = ii\n", "\n", "all_users_index = {}\n", "for ii, uid in enumerate(all_users):\n", " all_users_index[uid] = ii\n", " \n", "events_index = {}\n", "for ii, eid in enumerate(events):\n", " events_index[eid] = ii" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "36ad18cb3f634d92a890dbd9b20f53b5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" ] }, "metadata": {}, 
"output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5964cfe1f2b44f44bbec0c061e2ed9fa", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "top_users_events_matrix shape : (20000, 78)\n", "Sparsity : 1.616 %\n", "matrix_main shape: (39948, 78)\n", "Sparsity : 1.449 % \n" ] } ], "source": [ "from scipy.sparse import csr_matrix\n", "\n", "def get_user_in_event(eid, users):\n", " event_users = set(events[eid]['uid'])\n", " return list(set(users).intersection(event_users))\n", "\n", "def get_user_event_matrix(users, users_index):\n", " row = []\n", " col = []\n", " data = []\n", " for ii, (eid, value) in tqdm(enumerate(events.items())):\n", " user_in_event = get_user_in_event(eid, users)\n", " for uid in user_in_event:\n", " uind = users_index[uid]\n", " col.append(ii)\n", " row.append(uind)\n", " data.append(1)\n", " return csr_matrix((data, (row, col)), shape=(len(users), len(events)))\n", "\n", "sub_matrix = get_user_event_matrix(top_users, top_users_index)\n", "main_matrix = get_user_event_matrix(all_users, all_users_index)\n", "\n", "print(\"top_users_events_matrix shape : {}\".format(sub_matrix.shape))\n", "print(\"Sparsity : {:.3f} %\".format(sub_matrix.count_nonzero()/np.prod(sub_matrix.shape) * 100))\n", "print(\"matrix_main shape: {}\".format(main_matrix.shape))\n", "print(\"Sparsity : {:.3f} % \".format(main_matrix.count_nonzero()/np.prod(main_matrix.shape) * 100))" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "RELOAD = False\n", "save_path = f'assets/{dataset}/{split}'\n", "os.makedirs(save_path, exist_ok=True)\n", "\n", "if RELOAD:\n", " \n", " u_main = 
np.load(open(f'{save_path}/u_main.npy','rb'))\n", " sigma_main = np.load(open(f'{save_path}/sigma_main.npy','rb'))\n", " vt_main = np.load(open(f'{save_path}/vt_main.npy','rb'))\n", " all_users_features = u_main@np.diag(sigma_main)\n", " \n", " u_sub = np.load(open(f'{save_path}/u_sub.npy','rb'))\n", " sigma_sub = np.load(open(f'{save_path}/sigma_sub.npy','rb'))\n", " vt_sib = np.load(open(f'{save_path}/vt_sub.npy','rb'))\n", " top_users_features = u_sub@np.diag(sigma_sub)\n", "else:\n", " num_main_features = 20 # 10 for weibo, 20 for tweet\n", " n_iter = 7 # 15 for weibo, 7 for tweet\n", " \n", " u_main, sigma_main, vt_main = randomized_svd(main_matrix, n_components=num_main_features, n_iter=n_iter, random_state=42)\n", " all_users_features = u_main@np.diag(sigma_main)\n", " \n", " \n", " num_sub_features = 50\n", " \n", " u_sub, sigma_sub, vt_sub = randomized_svd(sub_matrix@sub_matrix.T, n_components=num_sub_features, n_iter=n_iter, random_state=42) # random_state=42\n", " top_users_features = u_sub@np.diag(sigma_sub)\n", " \n", " np.save(f'{save_path}/u_main.npy',u_main)\n", " np.save(f'{save_path}/sigma_main.npy',sigma_main)\n", " np.save(f'{save_path}/vt_main.npy',vt_main)\n", " \n", " np.save(f'{save_path}/u_sub.npy',u_sub)\n", " np.save(f'{save_path}/sigma_sub.npy',sigma_sub)\n", " np.save(f'{save_path}/vt_sub.npy',vt_sub)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((39948, 20), (20000, 50))" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_users_features.shape, top_users_features.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## text features" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e3a6d76d2fee4a4188f90cf440e44c63", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, 
# %%
binsize = 3600       # width of one time bin, in seconds
threshold = 120*24   # number of bin edges; bins span about 120 days from the first tweet
chinese_stopwords = '、 。 〃 〄 々 〆 〇 〈〉 《 》 「 」 『 』 【】 〒 〓 〔 〕 〖 〗 〘〙 〚 〛 〛 〜 〝 〞 〟,'
rx = '[' + re.escape(''.join(chinese_stopwords.split())) + ']'

_URL_RE = re.compile(r"http\S+")
# Same pattern as before, now a raw string (the non-raw form relied on invalid
# escape sequences).
# NOTE(review): inside the class, `*-=` is a character RANGE ('*' through '='),
# so digits and '<' are replaced as well. If a literal '-' was intended, escape
# it -- left unchanged here because that would alter the extracted features.
_PUNCT_RE = re.compile(r"[?!.,:;()'@#$%^&*-=+/\[\[\]\]]")


def _clean_text(string):
    """Remove URLs from ``string``, then replace the punctuation class with spaces."""
    return _PUNCT_RE.sub(' ', _URL_RE.sub('', string))


def _bin_event(eid):
    """Split the tweets of event ``eid`` into fixed-width time bins.

    Returns
    -------
    nonzero_bins_ind : np.ndarray
        Indices of the bins that contain at least one tweet.
    counts : np.ndarray
        Number of tweets in each of those bins.
    bin_members : list of np.ndarray
        For each non-empty bin, the positions (into the event's original
        ``timestamps`` / ``uid`` / ``text`` lists) of the tweets with
        ``bin_left <= t < bin_left + binsize``, in chronological order.

    Raises
    ------
    IndexError
        If the event has no timestamps.
    """
    timestamps = np.asarray(events[eid]['timestamps'])
    # Keep the permutation so uid/text can be looked up for the *same* tweet;
    # sorting the timestamps alone would break the alignment of the three lists.
    order = np.argsort(timestamps, kind='mergesort')  # mergesort is stable
    ts = timestamps[order]
    start = int(ts[0])
    # Tweets later than the last edge are not counted by np.histogram.
    cnt, bins = np.histogram(ts, bins=range(start, start + threshold*binsize, binsize))

    nonzero_bins_ind = np.nonzero(cnt)[0]
    bin_members = []
    for bin_left in bins[nonzero_bins_ind]:
        # ts is sorted, so each bin is one contiguous slice
        lo, hi = np.searchsorted(ts, [bin_left, bin_left + binsize], side='left')
        bin_members.append(order[lo:hi])
    return nonzero_bins_ind, cnt[nonzero_bins_ind], bin_members


def get_sentences():
    """Build one ``TaggedDocument`` per non-empty time bin of every event.

    The document is the concatenation of the cleaned tweet texts that fall in
    the bin; its tag is ``'<eid>_<bin number>'`` where the bin number counts
    non-empty bins only. Events without any tweet are skipped.
    """
    sentences = []
    for eid in tqdm(events):
        if not events[eid]['timestamps']:
            continue  # nothing to bin
        texts = events[eid]['text']
        _, _, bin_members = _bin_event(eid)
        for bid, members in enumerate(bin_members):
            doc = ''.join(_clean_text(texts[i]) for i in members)
            # NOTE(review): `words` receives a plain string (the jieba
            # tokenisation was commented out), so iterating it yields single
            # characters -- confirm this is intended for the Twitter data.
            sentences.append(TaggedDocument(
                words=doc,  # jieba.lcut(doc),
                tags=[eid + '_%s' % bid]
            ))
    return sentences


train_sentences = get_sentences()

print("train # sentences : {}".format(len(train_sentences)))

# %%
# Distribution of document lengths (len() of the `words` field).
lens = [len(s.words) for s in train_sentences]
fig, ax = plt.subplots()
ax.hist(lens, 1000)
ax.set(title='Document length per time bin', xlabel='len(words)', ylabel='# documents')
plt.show()

# %%
reload = False
# NOTE(review): the model path does not include `split`, so running this cell
# with reload=False for another split overwrites the saved model -- confirm
# that is intended.
if reload:
    doc_vectorizer = Doc2Vec.load(f'assets/{dataset}/doc2vec.model')
    print("doc_vectorizer is loaded.")
else:
    doc_vectorizer = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-4, negative=5, workers=8)
    doc_vectorizer.build_vocab(train_sentences)
    print("build_vocab is done.")
    doc_vectorizer.train(train_sentences, total_examples=doc_vectorizer.corpus_count, epochs=10)
    print("doc2vec training is done.")
    doc_vectorizer.save(f'assets/{dataset}/doc2vec.model')

# %% [markdown]
# ## gather all features


# %%
def get_event_features(eid):
    """Assemble the model inputs for one event.

    Parameters
    ----------
    eid : str
        Key into ``events``.

    Returns
    -------
    X_capture : np.ndarray
        One row per non-empty time bin, made of (in this column order) the
        tweet count, the gap in bins since the previous non-empty bin (0 for
        the first), the mean ``all_users_features`` row of the tweets in the
        bin, and the Doc2Vec vector inferred from the bin's cleaned text.
    X_score : np.ndarray
        Rows of ``top_users_features`` for the top users active in the event.
    y
        The event label, as stored in ``events[eid]['label']``.
    """
    ## capture
    nonzero_bins_ind, counts, bin_members = _bin_event(eid)

    # num_engagements and time intervals
    hist = counts.reshape(-1, 1)
    deltas = np.insert(np.diff(nonzero_bins_ind), 0, 0).reshape(-1, 1)

    # user features: average the feature rows of every tweet author in the bin
    uids = events[eid]['uid']
    X_users = np.vstack([
        all_users_features[[all_users_index[uids[i]] for i in members]].mean(axis=0)
        for members in bin_members
    ])

    # text features
    texts = events[eid]['text']
    X_text = np.vstack([
        doc_vectorizer.infer_vector(
            # jieba.lcut(doc)
            ''.join(_clean_text(texts[i]) for i in members)
        )
        for members in bin_members
    ])

    # stack all
    X_capture = np.hstack([hist, deltas, X_users, X_text])

    ## score
    eind = events_index[eid]
    top_active_users = np.where(sub_matrix[:, eind].toarray() > 0)[0]
    X_score = top_users_features[top_active_users]

    ## label
    y = events[eid]['label']

    return X_capture, X_score, y


# %%
save_path = f'assets/{dataset}/{split}/pkls/'
os.makedirs(save_path, exist_ok=True)

for eid in tqdm(events):
    xc, xs, y = get_event_features(eid)
    # context manager so each pickle file is flushed and closed
    with open(save_path + eid + '.pkl', 'wb') as fh:
        pickle.dump({'x_capture': xc, 'x_score': xs, 'label': y}, fh)