123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "from __future__ import division\n",
- "import re\n",
- "from collections import Counter\n",
- "import pickle\n",
- "import numpy as np\n",
- "import os\n",
- "from tqdm.notebook import tqdm\n",
- "from matplotlib import pyplot as plt\n",
- "import jieba, re\n",
- "import time\n",
- "from sklearn.utils.extmath import randomized_svd\n",
- "\n",
- "\n",
- "from gensim import utils\n",
- "from gensim.models.doc2vec import TaggedDocument\n",
- "from gensim.models import Doc2Vec\n",
- "\n",
- "import json"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## data dictionary"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "45d59671b76b4f8ab7fac5fe8c9228c8",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "ffa76ec995ac4712a7518f649aaca4b3",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "path = '/media/external_3TB/3TB/rafie/master/model-inputs'\n",
- "dataset = 'twitter'\n",
- "split = 'validation'\n",
- "\n",
- "f = open(f'{path}/{dataset}/{split}.txt', \"r\")\n",
- "lines = f.readlines()\n",
- "f.close()\n",
- "\n",
- "events = {}\n",
- "for line in tqdm(lines):\n",
- " line = json.loads(line.strip())\n",
- " events[str(line[0]['eid'])] = {'label' : line[1]}\n",
- " \n",
- "def process_tweet(tweet):\n",
- "# return tweet['t'], tweet['uid'], tweet['text']\n",
- " t = int(time.mktime(time.strptime(tweet['created_at'],\"%a %b %d %H:%M:%S +0000 %Y\")))\n",
- " uid = tweet['user']['id']\n",
- " text = tweet['text']\n",
- " return t, uid, text\n",
- " \n",
- "\n",
- "path = f'/media/external_3TB/3TB/rafie/master/{dataset}-raw-data/{dataset.capitalize()}'\n",
- "for event in tqdm(events):\n",
- " timestamps = []\n",
- " uids = []\n",
- " texts = []\n",
- " \n",
- " for file in os.listdir(f\"{path}/{event}-{events[event]['label']}\"):\n",
- " file = json.load(open(f'{path}/{event}-{events[event][\"label\"]}/{file}'))\n",
- "# tweets = json.load(open(f'{path}/{event}.json'))\n",
- " for tweet in [file['tweet']] + file['retweets']:\n",
- " t, uid, text = process_tweet(tweet)\n",
- " timestamps.append(t) #\n",
- " uids.append(uid) # tweet['user_id']\n",
- " texts.append(text)\n",
- " \n",
- " events[event]['timestamps'] = timestamps\n",
- " events[event]['uid'] = uids\n",
- " events[event]['text'] = texts\n",
- " "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## user features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "cnt = Counter()\n",
- "for event in events:\n",
- " cnt.update(events[event]['uid'])\n",
- "\n",
- "topk = 20000\n",
- "top_users = list(map(lambda x: x[0], cnt.most_common(topk)))\n",
- "all_users = list(map(lambda x: x[0], cnt.most_common()))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "top_users_index = {}\n",
- "for ii, uid in enumerate(top_users):\n",
- " top_users_index[uid] = ii\n",
- "\n",
- "all_users_index = {}\n",
- "for ii, uid in enumerate(all_users):\n",
- " all_users_index[uid] = ii\n",
- " \n",
- "events_index = {}\n",
- "for ii, eid in enumerate(events):\n",
- " events_index[eid] = ii"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "36ad18cb3f634d92a890dbd9b20f53b5",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "5964cfe1f2b44f44bbec0c061e2ed9fa",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "top_users_events_matrix shape : (20000, 78)\n",
- "Sparsity : 1.616 %\n",
- "matrix_main shape: (39948, 78)\n",
- "Sparsity : 1.449 % \n"
- ]
- }
- ],
- "source": [
- "from scipy.sparse import csr_matrix\n",
- "\n",
- "def get_user_in_event(eid, users):\n",
- " event_users = set(events[eid]['uid'])\n",
- " return list(set(users).intersection(event_users))\n",
- "\n",
- "def get_user_event_matrix(users, users_index):\n",
- " row = []\n",
- " col = []\n",
- " data = []\n",
- " for ii, (eid, value) in tqdm(enumerate(events.items())):\n",
- " user_in_event = get_user_in_event(eid, users)\n",
- " for uid in user_in_event:\n",
- " uind = users_index[uid]\n",
- " col.append(ii)\n",
- " row.append(uind)\n",
- " data.append(1)\n",
- " return csr_matrix((data, (row, col)), shape=(len(users), len(events)))\n",
- "\n",
- "sub_matrix = get_user_event_matrix(top_users, top_users_index)\n",
- "main_matrix = get_user_event_matrix(all_users, all_users_index)\n",
- "\n",
- "print(\"top_users_events_matrix shape : {}\".format(sub_matrix.shape))\n",
- "print(\"Sparsity : {:.3f} %\".format(sub_matrix.count_nonzero()/np.prod(sub_matrix.shape) * 100))\n",
- "print(\"matrix_main shape: {}\".format(main_matrix.shape))\n",
- "print(\"Sparsity : {:.3f} % \".format(main_matrix.count_nonzero()/np.prod(main_matrix.shape) * 100))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [],
- "source": [
- "RELOAD = False\n",
- "save_path = f'assets/{dataset}/{split}'\n",
- "os.makedirs(save_path, exist_ok=True)\n",
- "\n",
- "if RELOAD:\n",
- " \n",
- " u_main = np.load(open(f'{save_path}/u_main.npy','rb'))\n",
- " sigma_main = np.load(open(f'{save_path}/sigma_main.npy','rb'))\n",
- " vt_main = np.load(open(f'{save_path}/vt_main.npy','rb'))\n",
- " all_users_features = u_main@np.diag(sigma_main)\n",
- " \n",
- " u_sub = np.load(open(f'{save_path}/u_sub.npy','rb'))\n",
- " sigma_sub = np.load(open(f'{save_path}/sigma_sub.npy','rb'))\n",
- " vt_sub = np.load(open(f'{save_path}/vt_sub.npy','rb'))\n",
- " top_users_features = u_sub@np.diag(sigma_sub)\n",
- "else:\n",
- " num_main_features = 20 # 10 for weibo, 20 for tweet\n",
- " n_iter = 7 # 15 for weibo, 7 for tweet\n",
- " \n",
- " u_main, sigma_main, vt_main = randomized_svd(main_matrix, n_components=num_main_features, n_iter=n_iter, random_state=42)\n",
- " all_users_features = u_main@np.diag(sigma_main)\n",
- " \n",
- " \n",
- " num_sub_features = 50\n",
- " \n",
- " u_sub, sigma_sub, vt_sub = randomized_svd(sub_matrix@sub_matrix.T, n_components=num_sub_features, n_iter=n_iter, random_state=42) # random_state=42\n",
- " top_users_features = u_sub@np.diag(sigma_sub)\n",
- " \n",
- " np.save(f'{save_path}/u_main.npy',u_main)\n",
- " np.save(f'{save_path}/sigma_main.npy',sigma_main)\n",
- " np.save(f'{save_path}/vt_main.npy',vt_main)\n",
- " \n",
- " np.save(f'{save_path}/u_sub.npy',u_sub)\n",
- " np.save(f'{save_path}/sigma_sub.npy',sigma_sub)\n",
- " np.save(f'{save_path}/vt_sub.npy',vt_sub)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "((39948, 20), (20000, 50))"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "all_users_features.shape, top_users_features.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## text features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "e3a6d76d2fee4a4188f90cf440e44c63",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(FloatProgress(value=0.0, max=806.0), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "train # sentences : 66081\n"
- ]
- }
- ],
- "source": [
- "binsize = 3600\n",
- "threshold = 120*24\n",
- "chinese_stopwords = '、 。 〃 〄 々 〆 〇 〈〉 《 》 「 」 『 』 【】 〒 〓 〔 〕 〖 〗 〘〙 〚 〛 〛 〜 〝 〞 〟,'\n",
- "rx = '[' + re.escape(''.join(chinese_stopwords.split())) + ']'\n",
- "\n",
- "\n",
- "def get_sentences():\n",
- " sentences = []\n",
- " for eid in tqdm(events):\n",
- " ts = sorted(events[eid]['timestamps'])\n",
- " cnt, bins = np.histogram(ts, bins=range(ts[0],ts[0]+threshold*binsize,binsize))\n",
- "\n",
- " nonzero_bins_ind = np.nonzero(cnt)[0]\n",
- " nonzero_bins = bins[nonzero_bins_ind]\n",
- " hist = cnt[nonzero_bins_ind]\n",
- " inv = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]\n",
- " intervals = np.insert(inv,0,0)\n",
- "\n",
- " for bid, bin_left in enumerate(nonzero_bins):\n",
- " bin_right = bin_left + binsize\n",
- " doc = ''\n",
- " for tid, t in enumerate(ts):\n",
- " if t<bin_left:\n",
- " continue\n",
- " elif t>=bin_right:\n",
- " break\n",
- " string = events[eid]['text'][tid]\n",
- " string = re.sub(r\"http\\S+\", \"\", string)\n",
- " string = re.sub(\"[?!.,:;()'@#$%^&*-=+/\\[\\[\\]\\]]\", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\\\"\n",
- " doc += string\n",
- " sentences.append(TaggedDocument(\n",
- " words=doc, #jieba.lcut(doc), \n",
- " tags=[eid+'_%s' % bid]\n",
- " ))\n",
- " \n",
- " return sentences\n",
- " \n",
- "train_sentences = get_sentences()\n",
- "\n",
- "print(\"train # sentences : {}\".format(len(train_sentences)))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY8AAAD8CAYAAACPWyg8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEyJJREFUeJzt3XuMXOV5x/HvExsIzc0GtpZlO13SWIpM1RKYglGiKCUK2CSK+SOKjKpiUSuWCkhJL0pMI5XcKoVULQQ1IaGFYtIkxrkJi0Ad1yC1UsVlHYjBEMfLTdgC7MRcGkUihTz9Y94Nx8vs7rzrGc9s/P1IoznnOe8555kzu/PzzDmzjsxEkqQarxt0A5KkucfwkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUbf6gG5itU045JUdHRwfdhiTNGTt37vxZZo70YltzNjxGR0cZGxsbdBuSNGdExJO92pYfW0mSqhkekqRqXYVHRDwREQ9GxAMRMVZqJ0XE9ojYW+4XlnpExLURMR4RuyLijMZ21pXxeyNiXaN+Ztn+eFk3ev1AJUm9U/PO408y8/TMbJX5jcCOzFwO7CjzAKuB5eW2AbgO2mEDXAmcDZwFXDkROGXMRxvrrZr1I5Ik9d2RfGy1BthUpjcBFzbqN2fb3cCCiFgMnA9sz8xDmfkcsB1YVZa9OTPvzvZ/LnJzY1uSpCHUbXgk8MOI2BkRG0ptUWY+XaafARaV6SXAU41195XadPV9HeqSpCHV7aW6787M/RHxu8D2iPhJc2FmZkT0/b8kLMG1AeCtb31rv3cnSZpCV+88MnN/uT8AfJ/2OYtny0dOlPsDZfh+YFlj9aWlNl19aYd6pz6uz8xWZrZGRnryPRdJ0izMGB4R8YaIeNPENHAe8BCwFZi4YmodcGuZ3gpcXK66Wgm8UD7e2gacFxELy4ny84BtZdmLEbGyXGV1cWNbkqQh1M3HVouA75erZ+cD38zM/4iI+4AtEbEeeBL4SBl/O3ABMA78ErgEIDMPRcTngPvKuM9m5qEyfSlwE3AicEe5SZKGVLQvcJp7Wq1W+udJJKl7EbGz8XWLI+I3zCVJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVug6PiJgXEfdHxG1l/tSIuCcixiPilog4vtRPKPPjZfloYxtXlPqeiDi/UV9VauMRsbF3D0+S1A817zw+BjzSmL8KuDoz3w48B6wv9fXAc6V+dRlHRKwA1gKnAauAr5RAmgd8GVgNrAAuKmMlSUOqq/CIiKXAB4B/LfMBnAt8pwzZBFxYpteUecry95Xxa4DNmflSZj4OjANnldt4Zj6Wmb8CNpexkqQh1e07j2uATwC/LvMnA89n5stlfh+wpEwvAZ4CKMtfKON/U5+0zlR1SdKQmjE8IuKDwIHM3HkU+pmplw0RMRYRYwcPHhx0O5J0zOrmnce7gA9FxBO0P1I6F/gSsCAi5pcxS4H9ZXo/sAygLH8L8PNmfdI6U9VfIzOvz8xWZrZGRka6aF2S1A8zhkdmXpGZSzNzlPYJ7zsz80+Bu4APl2HrgFvL9NYyT1l+Z2Zmqa8tV2OdCiwH7gXuA5aXq7eOL/vY2pNHJ0nqi/kzD5nSJ4HNEfF54H7ghlK/Afh6RIwDh2iHAZm5OyK2AA8DLwOXZeYrABFxObANmAfcmJm7j6AvSVKfRftNwdzTarVybGxs0G1I0pwRETszs9WLbfkNc0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwP
SVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVK1GcMjIl4fEfdGxI8jYndEfKbUT42IeyJiPCJuiYjjS/2EMj9elo82tnVFqe+JiPMb9VWlNh4RG3v/MCVJvdTNO4+XgHMz84+A04FVEbESuAq4OjPfDjwHrC/j1wPPlfrVZRwRsQJYC5wGrAK+EhHzImIe8GVgNbACuKiMlSQNqRnDI9t+UWaPK7cEzgW+U+qbgAvL9JoyT1n+voiIUt+cmS9l5uPAOHBWuY1n5mOZ+StgcxkrSRpSXZ3zKO8QHgAOANuBR4HnM/PlMmQfsKRMLwGeAijLXwBObtYnrTNVXZI0pLoKj8x8JTNPB5bSfqfwjr52NYWI2BARYxExdvDgwUG0IEmi8mqrzHweuAs4B1gQEfPLoqXA/jK9H1gGUJa/Bfh5sz5pnanqnfZ/fWa2MrM1MjJS07okqYe6udpqJCIWlOkTgfcDj9AOkQ+XYeuAW8v01jJPWX5nZmapry1XY50KLAfuBe4Dlpert46nfVJ9ay8enCSpP+bPPITFwKZyVdTrgC2ZeVtEPAxsjojPA/cDN5TxNwBfj4hx4BDtMCAzd0fEFuBh4GXgssx8BSAiLge2AfOAGzNzd88eoSSp56L9pmDuabVaOTY2Nug2JGnOiIidmdnqxbb8hrkkqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySp2ozhERHLIuKuiHg4InZHxMdK/aSI2B4Re8v9wlKPiLg2IsYjYldEnNHY1royfm9ErGvUz4yIB8s610ZE9OPBSpJ6o5t3Hi8Df52ZK4CVwGURsQLYCOzIzOXAjjIPsBpYXm4bgOugHTbAlcDZwFnAlROBU8Z8tLHeqiN/aJKkfpkxPDLz6cz8UZn+X+ARYAmwBthUhm0CLizTa4Cbs+1uYEFELAbOB7Zn5qHMfA7YDqwqy96cmXdnZgI3N7YlSRpCVec8ImIUeCdwD7AoM58ui54BFpXpJcBTjdX2ldp09X0d6pKkIdV1eETEG4HvAh/PzBeby8o7huxxb5162BARYxExdvDgwX7vTpI0ha7CIyKOox0c38jM75Xys+UjJ8r9gVLfDyxrrL601KarL+1Qf43MvD4zW5nZGhkZ6aZ1SVIfdHO1VQA3AI9k5j81Fm0FJq6YWgfc2qhfXK66Wgm8UD7e2gacFxELy4ny84BtZdmLEbGy7OvixrYkSUNofhdj3gX8GfBgRDxQan8LfAHYEhHrgSeBj5RltwMXAOPAL4FLADLzUER8DrivjPtsZh4q05cCNwEnAneUmyRpSEX7dMXc02q1cmxsbNBtSNKcERE7M7PVi235DXNJUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lSNcNDklTN8JAkVTM8JEnVDA9JUjXDQ5JUzfCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdUMD0lStRnDIyJujIgDEfFQo3ZSRGyPiL3lfmGpR0RcGxHjEbErIs5orLOujN8bEesa9TMj4sGyzrUREb1+kJKk3urmncdNwKpJtY3AjsxcDuwo8wCrgeXltgG4DtphA1wJnA2cBVw5EThlzEcb603elyRpyMwYHpn5X8ChSeU1wKYy
vQm4sFG/OdvuBhZExGLgfGB7Zh7KzOeA7cCqsuzNmXl3ZiZwc2NbkqQhNdtzHosy8+ky/QywqEwvAZ5qjNtXatPV93WoS5KG2BGfMC/vGLIHvcwoIjZExFhEjB08ePBo7FKS1MFsw+PZ8pET5f5Aqe8HljXGLS216epLO9Q7yszrM7OVma2RkZFZti5JOlKzDY+twMQVU+uAWxv1i8tVVyuBF8rHW9uA8yJiYTlRfh6wrSx7MSJWlqusLm5sS5I0pObPNCAivgW8FzglIvbRvmrqC8CWiFgPPAl8pAy/HbgAGAd+CVwCkJmHIuJzwH1l3Gczc+Ik/KW0r+g6Ebij3CRJQyzapyzmnlarlWNjY4NuQ5LmjIjYmZmtXmzLb5hLkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqoZHpKkaoaHJKma4SFJqmZ4SJKqGR6SpGqGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqodk+ExuvEHg25Bkua0YzI8JElHxvCQJFUzPCRJ1QwPSVI1w0OSVM3wkCRVMzwkSdWGJjwiYlVE7ImI8YjYOOh+JElTG4rwiIh5wJeB1cAK4KKIWNHPffpFQUmavaEID+AsYDwzH8vMXwGbgTX93qkBIkmzMyzhsQR4qjG/r9T6bnTjDwwRSao0f9AN1IiIDcCGMvuLiNgzy02dAvzssG1fdSSd9dRrehsi9jY79jY7w9wbDHd/U/X2e73awbCEx35gWWN+aakdJjOvB64/0p1FxFhmto50O/1gb7Njb7Njb7M3zP0djd6G5WOr+4DlEXFqRBwPrAW2DrgnSdIUhuKdR2a+HBGXA9uAecCNmbl7wG1JkqYwFOEBkJm3A7cfpd0d8UdffWRvs2Nvs2NvszfM/fW9t8jMfu9DkvRbZljOeUiS5pLMPGZuwCpgDzAObOzzvp4AHgQeAMZK7SRgO7C33C8s9QCuLX3tAs5obGddGb8XWNeon1m2P17WjWl6uRE4ADzUqPW9l6n20UVvn6Z9td0D5XZBY9kVZT97gPNnem6BU4F7Sv0W4PhSP6HMj5flox16WwbcBTwM7AY+NizHbpreBn7sgNcD9wI/Lr19Zrbb61XPXfR2E/B447idPojfhzJuHnA/cNuwHLeOffbzBXSYbuUJeRR4G3B8+eFZ0cf9PQGcMqn2xYknDNgIXFWmLwDuKD+oK4F7Gj9sj5X7hWV64oXq3jI2yrqrp+nlPcAZHP4C3fdeptpHF719GvibDmNXlOfthPLD/mh5Xqd8boEtwNoy/VXgL8r0pcBXy/Ra4JYO+1tMebEA3gT8tPQw8GM3TW8DP3blsbyxTB9H+0VpZe32etlzF73dBHy4w3E7qr8PZdlfAd/k1fAY+HHr2Ge/XjyH7QacA2xrzF8BXNHH/T3Ba8NjD7C4TC8G9pTprwEXTR4HXAR8rVH/WqktBn7SqB82bop+Rjn8BbrvvUy1jy56+zSdXwAPe85oX513zlTPbfnl/Rkwf/LPwMS6ZXp+GTflu7cy7lbg/cN07Dr0NlTHDvgd4EfA2bXb62XPXfR2E53D46g+p7S/47YDOBe4bTbPQ7+P28TtWDrncbT/BEoCP4yIneWb8QCLMvPpMv0MsGiG3qar7+tQr3E0eplqH924PCJ2RcSNEbFwlr2dDDyfmS936O0365TlL5TxHUXEKPBO2v9SHapjN6k3GIJjFxHzIuIB2h9Jbqf9L97a7fWy5yl7y8yJ4/b35bhdHREnTO6tyx6O9Dm9BvgE8OsyP5vnoS/HbbJjKTyOtndn5hm0/1LwZRHxnubCbEd8DqSzSY5GL5X7uA74feB04GngH/vVVzci4o3Ad4GPZ+aLzWWDPnYdehuKY5eZr2Tm6bT/JX0W8I5B9NHJ5N4i4g9o/wv8HcAf0/4o6pN97uE1z2lEfBA4kJk7+7nvXjmWwqOrP4HS
K5m5v9wfAL5P+xfo2YhYDFDuD8zQ23T1pR3qNY5GL1PtY1qZ+Wz5Bf818C+0j91sevs5sCAi5k+qH7atsvwtZfxhIuI42i/O38jM783wuI7qsevU2zAdu9LP87RP7J8zi+31sufpeluVmU9n20vAvzH743Ykz+m7gA9FxBO0/7L4ucCXpnlMAzluvzHT51q/LTfanwk+RvsE0sTJotP6tK83AG9qTP8P7asc/oHDT5h9sUx/gMNPyt1b6ifRvgJkYbk9DpxUlk0+KXfBDD2Ncvh5hb73MtU+uuhtcWP6L4HNZfo0Dj8R+Bjtk4BTPrfAtzn8ROClZfoyDj/ZuKVDXwHcDFwzqT7wYzdNbwM/dsAIsKBMnwj8N/DB2u31sucuelvcOK7XAF8Y1O9DWf5eXj1hPvDj1rHHfrx4DuuN9pUTP6X9+eun+rift5UnZuJywE+V+sm0T4btBf6z8cMWtP8zrEdpX+LXamzrz2lfPjcOXNKot4CHyjr/zPSX6n6L9kcY/0f788z1R6OXqfbRRW9fL/veRftvnDVfED9V9rOHxhVmUz235bm4t/T8beCEUn99mR8vy9/Wobd30/5oYReNS1+H4dhN09vAjx3wh7QvNd1VHtvfzXZ7veq5i97uLMftIeDfefWKrKP6+9DYxnt5NTwGftw63fyGuSSp2rF0zkOS1COGhySpmuEhSapmeEiSqhkekqRqhockqZrhIUmqZnhIkqr9P21J550AQwc7AAAAAElFTkSuQmCC\n",
- "text/plain": [
- "<Figure size 432x288 with 1 Axes>"
- ]
- },
- "metadata": {
- "needs_background": "light"
- },
- "output_type": "display_data"
- }
- ],
- "source": [
- "lens = [len(s.words) for s in train_sentences]\n",
- "plt.hist(lens, 1000)\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "build_vocab is done.\n",
- "doc2vec training is done.\n"
- ]
- }
- ],
- "source": [
- "reload = False\n",
- "if reload:\n",
- " doc_vectorizer = Doc2Vec.load(f'assets/{dataset}/doc2vec.model')\n",
- " print(\"doc_vectorizer is loaded.\")\n",
- "else:\n",
- " doc_vectorizer = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-4, negative=5, workers=8)\n",
- " doc_vectorizer.build_vocab(train_sentences)\n",
- " print(\"build_vocab is done.\")\n",
- " doc_vectorizer.train(train_sentences,total_examples=doc_vectorizer.corpus_count,epochs=10)\n",
- " print(\"doc2vec training is done.\")\n",
- " doc_vectorizer.save(f'assets/{dataset}/doc2vec.model')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## gather all features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "threshold = 120*24\n",
- "binsize = 3600\n",
- "\n",
- "def get_event_features(eid):\n",
- " \n",
- " ## capture\n",
- " ts = sorted(events[eid]['timestamps'])\n",
- " cnt, bins = np.histogram(ts, bins=range(ts[0],ts[0]+threshold*binsize,binsize))\n",
- "\n",
- " nonzero_bins_ind = np.nonzero(cnt)[0]\n",
- " nonzero_bins = bins[nonzero_bins_ind]\n",
- "\n",
- " # num_engagements and time intervals\n",
- " hist = cnt[nonzero_bins_ind].reshape(-1,1)\n",
- " deltas = nonzero_bins_ind[1:]-nonzero_bins_ind[:-1]\n",
- " deltas = np.insert(deltas,0,0).reshape(-1, 1)\n",
- " \n",
- " # user features\n",
- " X_users = []\n",
- " for bid, bin_left in enumerate(nonzero_bins):\n",
- " bin_userlist = []\n",
- " bin_right = bin_left + binsize\n",
- " bin_user_feature = []\n",
- " for tid, t in enumerate(ts):\n",
- " if t<bin_left:\n",
- " continue\n",
- " elif t>=bin_right:\n",
- " break\n",
- " uind = all_users_index[events[eid]['uid'][tid]]\n",
- " bin_user_feature += [all_users_features[uind].reshape(1,-1)] # (1,n_components)\n",
- "\n",
- " X_users += [np.concatenate(bin_user_feature, axis=0).mean(axis=0).reshape(1,-1)]\n",
- " X_users = np.concatenate(X_users, axis=0)\n",
- " \n",
- " # text features\n",
- " X_text = []\n",
- " for bid, bin_left in enumerate(nonzero_bins):\n",
- " bin_right = bin_left + binsize\n",
- " doc = ''\n",
- " for tid, t in enumerate(ts):\n",
- " if t<bin_left:\n",
- " continue\n",
- " elif t>=bin_right:\n",
- " break\n",
- " string = events[eid]['text'][tid]\n",
- " string = re.sub(r\"http\\S+\", \"\", string)\n",
- " string = re.sub(\"[?!.,:;()'@#$%^&*-=+/\\[\\[\\]\\]]\", ' ', string) # !.,:;()'@#$%^&*-_{}=+/\\\"\n",
- " doc += string\n",
- " X_text += [doc_vectorizer.infer_vector(\n",
- "# jieba.lcut(doc)\n",
- " doc\n",
- " ).reshape(1,-1)]\n",
- " X_text = np.concatenate(X_text, axis=0)\n",
- " \n",
- " # stack all\n",
- " X_capture = np.hstack([hist, deltas, X_users, X_text])\n",
- " \n",
- " ## score\n",
- " eind = events_index[eid]\n",
- " top_active_users = np.where(sub_matrix[:, eind].toarray() > 0)[0]\n",
- " X_score = top_users_features[top_active_users]\n",
- " \n",
- " ## label\n",
- " y = events[eid]['label']\n",
- " \n",
- " return X_capture, X_score, y"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "# xc, xs, y = get_event_features('9833726676')\n",
- "# xc.shape, xs.shape, y"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "f4d178f5971648f589d3b1637893930a",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "save_path = f'assets/{dataset}/{split}/pkls/'\n",
- "os.makedirs(save_path, exist_ok=True)\n",
- "\n",
- "for eid in tqdm(events):\n",
- " xc, xs, y = get_event_features(eid)\n",
- " pickle.dump(\n",
- " {'x_capture': xc, 'x_score':xs, 'label': y}, \n",
- " open(save_path + eid + '.pkl', 'wb')\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- " "
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|