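"""Preprocessing script: builds scaled user-feature files, train/test/validation
propagation-path splits, and per-class Doc2Vec text models from Twitter
fake-news event dumps (Politifact / Gossipcop / rumdect style directories)."""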
import os
import json
import random
from multiprocessing import Pool
from pathlib import Path
import calendar
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# from nltk.tokenize import word_tokenize
from sklearn import preprocessing

import numpy as np
from tqdm import tqdm
import argparse

from utils import TUser

# Module-level counters and configuration
all_event = 0
less_than_limit_event = 0
users = {}
db_config = {}
limit = None  # padding/truncation length for make_propagation_path; must be set before use
train_ratio = 0.8
t = 60
f_model_addr = 'd2v/fd2v_t.model'
r_model_addr = 'd2v/rd2v_t.model'
cas_lens = [50, 40, 30, 20, 10]  # cascade prefix lengths written per event


def make_all_users(data_addr, data_class_addr, users_addr):
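    """Collect every user engaged in any event, min-max scale the first six
    numeric features of each user vector, and write one JSON line per user
    ([scaled_vector, user_id]) to users_addr.

    data_class_addr is expected to contain one line per event of the form
    'eid:<event_id> label:<0|1> <tweet_id> <tweet_id> ...'; each tweet id maps
    to a <tweet_id>.txt JSON file under '<data_addr>/<eid>-<label>/'.
    """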
    engaged_users = {}
    user_id = {}
    labels = {}
    seqs = {}
    with open(data_class_addr) as f:
        for line in f.readlines():
            eid = str(line[line.index('eid:') + 4: line.index('label') - 1])
            label = int(line[line.index('label:') + 6])
            labels[eid] = label
            seq = line[line.index('label:') + 8: line.index('\n')]
            seq = seq.split()
            seqs[eid] = seq

    for eid, label in labels.items():
        for tweet in seqs[eid]:
            filename = tweet + '.txt'
            addr = data_addr + '/' + eid + '-' + str(label)

            if Path(addr + '/' + filename).exists():
                with open(addr + '/' + filename, encoding="utf8") as f:
                    event_data = json.load(f)
                    user = event_data['tweet']['user']
                    user = TUser(user)
                    if user.id in engaged_users:
                        engaged_users[user.id] += 1
                    else:
                        engaged_users[user.id] = 1
                    user_id[user.id] = user

    print('before all users')
    all_users = list(set(user_id[userid] for userid, count in engaged_users.items() if count > 0))

    print('after all users')
    user_embedding = []
    for user in all_users:
        user_embedding.append(user.vector[0:6])
    # Min-max scale the first six (numeric) user features across all users
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(user_embedding)
    user_embedding = scaler.transform(user_embedding)
    print('before embedding')

    with open(users_addr, 'wt') as users_f:
        for i, user in enumerate(all_users):
            new_user_vector = user.vector.tolist()
            new_user_vector[0:6] = user_embedding[i]
            uid = user.id  # local name; avoid clobbering the user_id dict above
            json_data = json.dumps([new_user_vector, int(uid)])
            users_f.write(json_data + '\n')


def make_all_pro_paths(data_addr, data_class_addr, train_addr='', test_addr='', validation_adr='', eid_addr=''):
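    """Build user-id cascades for every labelled event and split them into
    train / test / validation files (train with probability train_ratio,
    the remainder split roughly 50/50). For each event, one cascade prefix is
    written per length in cas_lens, provided the event has more than
    cas_len + 1 engaged users. eid_addr is currently unused (its load is
    commented out below).
    """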
    empty_eids = 0
    all_tweets = 0
    empty_tweets = 0
    fake_events = 0
    real_events = 0
    all_len = []
    all_time = []

    # with open(eid_addr) as file:
    #     eids = json.load(file)

    labels = {}
    seqs = {}
    # r_model = Doc2Vec.load(r_model_addr)
    # f_model = Doc2Vec.load(f_model_addr)
    with open(data_class_addr) as f:
        for line in f.readlines():
            eid = str(line[line.index('eid:') + 4: line.index('label') - 1])
            label = int(line[line.index('label:') + 6])
            seq = line[line.index('label:') + 8: line.index('\n')]
            seq = seq.split()
            seqs[eid] = seq
            labels[eid] = label
            all_tweets += len(seq)

    with open(train_addr, 'wt') as train_f, open(test_addr, 'wt') as test_f, open(validation_adr, 'wt') as valid_f:
        for eid, label in labels.items():

            engaged_users = []
            for tweet in seqs[eid]:
                filename = tweet + '.txt'
                addr = data_addr + '/' + eid + '-' + str(label)

                if Path(addr + '/' + filename).exists():
                    with open(addr + '/' + filename, encoding="utf8") as f:
                        event_data = json.load(f)
                        user = event_data['tweet']['user']
                        engaged_users.append(user)

            if label == 1 or label == 0:
                for cas_len in cas_lens:
                    if len(engaged_users) > cas_len + 1:
                        cascade = [TUser(user).id for user in engaged_users]
                        cascade = cascade[0:cas_len]
                        cascade = convert_to_seq(cascade)

                        # roughly train_ratio / rest split, with the rest halved
                        if random.random() < train_ratio:
                            train_f.write(cascade + '\n')
                        elif random.random() < 0.5:
                            test_f.write(cascade + '\n')
                        else:
                            valid_f.write(cascade + '\n')


def convert_to_seq(cascade):
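    """Serialize a cascade of user ids as '<id>,0 <id>,0 ... ', pairing each
    id with a constant 0."""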
    seq_str = ''
    for user in cascade:
        seq_str += str(user) + ',' + str(0) + ' '
    return seq_str


def make_propagation_path(event):
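    """Return the feature vectors of the users engaged in one event, padded
    (by re-sampling already engaged users) or truncated to exactly `limit`
    entries. Updates the module-level all_event / less_than_limit_event
    counters."""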
    global all_event
    global less_than_limit_event
    all_event += 1
    engaged_users = []
    for tweet in event:
        engaged_users.append(tweet)
    if len(engaged_users) < limit:
        # Pad short cascades by re-sampling users already in the cascade
        engaged_users.extend([random.choice(engaged_users) for i in range(limit - len(engaged_users))])
        less_than_limit_event += 1
    engaged_users = engaged_users[0:limit]

    engaged_users_vector = [TUser(user).vector for user in engaged_users]
    return engaged_users_vector


def read_all_events(data_addr, data_class_addr, eids_addr):
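    """Load every tweet JSON file referenced by the class file into a dict
    keyed by tweet id, each entry holding the raw tweet data and the event
    label."""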
    labels = {}
    all_events = {}
    seqs = {}
    with open(eids_addr) as file:
        eids = json.load(file)
    with open(data_class_addr) as f:
        for line in f.readlines():
            eid = str(line[line.index('eid:') + 4: line.index('label') - 1])
            label = int(line[line.index('label:') + 6])
            seq = line[line.index('label:') + 8: line.index('\n')]
            seq = seq.split()
            seqs[eid] = seq
            labels[eid] = label

    for eid, label in labels.items():
        # if eid in eids['train']:
        for tweet in seqs[eid]:
            filename = tweet + '.txt'
            addr = data_addr + '/' + eid + '-' + str(label)

            if Path(addr + '/' + filename).exists():
                with open(addr + '/' + filename, encoding="utf8") as f:
                    event_data = json.load(f)
                    all_events[tweet] = {}
                    all_events[tweet]['tweet'] = event_data
                    all_events[tweet]['label'] = label
    return all_events


def save_w2v_models(data_addr, data_class_addr):
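    """Train and save one Doc2Vec model on the texts of real (label 0) tweets
    and another on fake (label 1) tweets, using the module-level model paths."""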
    real_texts = []
    fake_texts = []
    print('reading events..')
    all_events = read_all_events(data_addr, data_class_addr, '/media/external_3TB/3TB/rafie/paper/twitter_all/eids')
    print('reading texts..')

    for tweet_id, tweet in all_events.items():
        label = tweet['label']
        tw = tweet['tweet']['tweet']  # local name; avoid shadowing the module-level t
        if label == 0:
            real_texts.append(tw['text'])
        else:
            fake_texts.append(tw['text'])

    learn_model(real_texts, r_model_addr)
    learn_model(fake_texts, f_model_addr)


def learn_model(data, addr):
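    """Train a distributed-memory Doc2Vec model on the given texts and save it
    to addr. Note: the `size` parameter and `model.iter` attribute below follow
    the pre-4.0 gensim API; on gensim >= 4.0 they are `vector_size` and
    `model.epochs`."""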
    model = Doc2Vec(size=50,  # gensim < 4.0 API; vector_size=50 on gensim >= 4.0
                    alpha=0.025,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)
    tagged_data = [TaggedDocument(_d.lower().split(), [i]) for i, _d in enumerate(data)]

    model.build_vocab(tagged_data)

    for epoch in range(10):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.iter)  # model.epochs on gensim >= 4.0
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha

    model.save(addr)
    print("Model saved in", addr)


if __name__ == '__main__':
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--train', type=str, required=True)
    # parser.add_argument('--test', type=str, required=True)
    # parser.add_argument('--validation', type=str, required=True)
    # args = parser.parse_args()

    # save_w2v_models('/media/external_3TB/3TB/rafie/results-retweet', '/media/external_3TB/3TB/rafie/rumdect/Twitter.txt')

    make_all_users('/media/external_3TB/3TB/rafie/politifact-raw-data/Politifact',
                   '/media/external_3TB/3TB/rafie/politifact-raw-data/Politifact.txt',
                   '/home/rafie/NeuralDiffusionModel-master/data/politifact/users_limited.txt')

    # make_all_pro_paths('/media/external_3TB/3TB/rafie/gossipcop-raw-data/Gossipcop', '/media/external_3TB/3TB/rafie/gossipcop-raw-data/Gossipcop.txt', '/home/rafie/NeuralDiffusionModel-master/data/gossipcop/'+args.train, '/home/rafie/NeuralDiffusionModel-master/data/gossipcop/'+args.test, '/home/rafie/NeuralDiffusionModel-master/data/gossipcop/'+args.validation, '/media/external_3TB/3TB/rafie/paper/twitter_all/eids')
    print('done')