
prepare_twitter_cas.py 9.5KB

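"""Data-preparation script for Twitter rumor/fake-news cascades (purpose inferred from the code below).

Reads per-tweet JSON files grouped by event, builds min-max-scaled user feature
vectors, and writes user records and propagation-path (cascade) sequences for a
neural diffusion model. Also contains helpers to train Doc2Vec models on the
real and fake tweet texts.
"""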
import os
import json
import random
from multiprocessing import Pool
from io import open
from pathlib import Path
import calendar
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# from nltk.tokenize import word_tokenize
from sklearn import preprocessing
import numpy as np
from tqdm import tqdm
import argparse
from utils import TUser

# Counters updated by make_propagation_path().
all_event = 0
less_than_limit_event = 0
users = {}
db_config = {}
# `limit` (the fixed cascade length used by make_propagation_path) is expected to be
# set as a module-level global before that function is called; it is never assigned here.
train_ratio = 0.8
t = 60
f_model_addr = 'd2v/fd2v_t.model'
r_model_addr = 'd2v/rd2v_t.model'
cas_lens = [50, 40, 30, 20, 10]
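
# Each line of the class file (e.g. Politifact.txt / Twitter.txt) is assumed, based on the
# string slicing below, to look like:
#   eid:<event id> label:<0 or 1> <tweet_id_1> <tweet_id_2> ...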
def make_all_users(data_addr, data_class_addr, users_addr):
    """Collect every user engaged in any event, scale the first six features, and dump one JSON record per user."""
    engaged_users = {}
    user_by_id = {}
    labels = {}
    seqs = {}
    with open(data_class_addr) as f:
        for line in f.readlines():
            eid = str(line[line.index('eid:') + 4: line.index('label') - 1])
            label = int(line[line.index('label:') + 6])
            labels[eid] = label
            seq = line[line.index('label:') + 8: line.index('\n')]
            seqs[eid] = seq.split()
    for eid, label in labels.items():
        tweet_ind = 0
        while tweet_ind < len(seqs[eid]):
            tweet = seqs[eid][tweet_ind]
            tweet_ind += 1
            filename = tweet + '.txt'
            addr = data_addr + '/' + eid + '-' + str(label)
            if Path(addr + '/' + filename).exists():
                with open(addr + '/' + filename, encoding="utf8") as f:
                    event_data = json.load(f)
                    user = TUser(event_data['tweet']['user'])
                    if user.id in engaged_users:
                        engaged_users[user.id] += 1
                    else:
                        engaged_users[user.id] = 1
                    user_by_id[user.id] = user
    print('before all users')
    all_users = list(set([user_by_id[userid] for userid, count in engaged_users.items() if count > 0]))
    print('after all users')
    user_embedding = []
    for user in all_users:
        user_embedding.append(user.vector[0:6])
    # Min-max scale the first six user features across all users.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(user_embedding)
    user_embedding = scaler.transform(user_embedding)
    print('before embedding')
    with open(users_addr, 'wt') as users_f:
        for i, user in enumerate(all_users):
            new_user_vector = user.vector.tolist()
            new_user_vector[0:6] = user_embedding[i]
            json_data = json.dumps([new_user_vector, int(user.id)])
            users_f.write(json_data + '\n')
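
# Each line of the users file written above is a JSON array of the form (inferred from the
# json.dumps call): [[scaled feature vector ...], <numeric user id>]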
def make_all_pro_paths(data_addr, data_class_addr, train_addr='', test_addr='', validation_addr='', eid_addr=''):
    """Build fixed-length user-id cascades per event and split them across train/test/validation files."""
    empty_eids = 0
    all_tweets = 0
    empty_tweets = 0
    fake_events = 0
    real_events = 0
    all_len = []
    all_time = []
    # with open(eid_addr) as file:
    #     eids = json.load(file)
    labels = {}
    seqs = {}
    # r_model = Doc2Vec.load(r_model_addr)
    # f_model = Doc2Vec.load(f_model_addr)
    with open(data_class_addr) as f:
        for line in f.readlines():
            eid = str(line[line.index('eid:') + 4: line.index('label') - 1])
            label = int(line[line.index('label:') + 6])
            seq = line[line.index('label:') + 8: line.index('\n')]
            seq = seq.split()
            seqs[eid] = seq
            labels[eid] = label
            all_tweets += len(seq)
    with open(train_addr, 'wt') as train_f, open(test_addr, 'wt') as test_f, open(validation_addr, 'wt') as valid_f:
        for eid, label in labels.items():
            engaged_users = []
            tweet_ind = 0
            while tweet_ind < len(seqs[eid]):
                tweet = seqs[eid][tweet_ind]
                tweet_ind += 1
                filename = tweet + '.txt'
                addr = data_addr + '/' + eid + '-' + str(label)
                if Path(addr + '/' + filename).exists():
                    with open(addr + '/' + filename, encoding="utf8") as f:
                        event_data = json.load(f)
                        engaged_users.append(event_data['tweet']['user'])
            if label == 1 or label == 0:
                for cas_len in cas_lens:
                    if len(engaged_users) > cas_len + 1:
                        cascade = [TUser(user).id for user in engaged_users]
                        cascade = cascade[0:cas_len]
                        cascade = convert_to_seq(cascade)
                        # Roughly 80/10/10 split between train, test and validation.
                        if random.random() < 0.8:
                            train_f.write(cascade + '\n')
                        elif random.random() < 0.5:
                            test_f.write(cascade + '\n')
                        else:
                            valid_f.write(cascade + '\n')


def convert_to_seq(cascade):
    """Serialize a cascade as space-separated '<user_id>,0' tokens."""
    seq_str = ''
    for user in cascade:
        seq_str += str(user) + ',' + str(0) + ' '
    return seq_str
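
# Each cascade line written by make_all_pro_paths is the output of convert_to_seq, i.e.
# space-separated "<user_id>,0" tokens with a trailing space, for example (made-up ids):
#   1234,0 5678,0 9012,0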
def make_propagation_path(event):
    """Pad or truncate an event's engaged users to exactly `limit` entries and return their feature vectors."""
    global all_event
    global less_than_limit_event
    global limit
    all_event += 1
    engaged_users = []
    for tweet in event:
        engaged_users.append(tweet)
    if len(engaged_users) < limit:
        # Pad short cascades by resampling already-engaged users at random.
        engaged_users.extend([random.choice(engaged_users) for i in range(limit - len(engaged_users))])
        less_than_limit_event += 1
    engaged_users = engaged_users[0:limit]
    # The original referenced an undefined `User`; TUser (imported from utils) appears to be intended.
    engaged_users_vector = [TUser(user).vector for user in engaged_users]
    return engaged_users_vector
def read_all_events(data_addr, data_class_addr, eids_addr):
    labels = {}
    all_events = {}
    seqs = {}
    with open(eids_addr) as file:
        eids = json.load(file)
    with open(data_class_addr) as f:
        for line in f.readlines():
            eid = str(line[line.index('eid:') + 4: line.index('label') - 1])
            label = int(line[line.index('label:') + 6])
            seq = line[line.index('label:') + 8: line.index('\n')]
            seq = seq.split()
            seqs[eid] = seq
            labels[eid] = label
    for eid, label in labels.items():
        tweet_ind = 0
        # if eid in eids['train']:
        while tweet_ind < len(seqs[eid]):
            tweet = seqs[eid][tweet_ind]
            tweet_ind += 1
            filename = tweet + '.txt'
            addr = data_addr + '/' + eid + '-' + str(label)
            if Path(addr + '/' + filename).exists():
                with open(addr + '/' + filename, encoding="utf8") as f:
                    event_data = json.load(f)
                    all_events[tweet] = {}
                    all_events[tweet]['tweet'] = event_data
                    all_events[tweet]['label'] = label
    return all_events
def save_w2v_models(data_addr, data_class_addr):
    real_texts = []
    fake_texts = []
    print('reading events..')
    all_events = read_all_events(data_addr, data_class_addr, '/media/external_3TB/3TB/rafie/paper/twitter_all/eids')
    print('reading texts..')
    for tweet_id, tweet in all_events.items():
        label = tweet['label']
        t = tweet['tweet']['tweet']
        if label == 0:
            real_texts.append(t['text'])
        else:
            fake_texts.append(t['text'])
    learn_model(real_texts, r_model_addr)
    learn_model(fake_texts, f_model_addr)
def learn_model(data, addr):
    # Note: `size` and `model.iter` are the gensim 3.x Doc2Vec API; gensim 4+ renamed
    # them to `vector_size` and `epochs`.
    model = Doc2Vec(size=50,
                    alpha=0.025,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)
    tagged_data = [TaggedDocument(_d.lower().split(), [i]) for i, _d in enumerate(data)]
    model.build_vocab(tagged_data)
    for epoch in range(10):
        print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha
    model.save(addr)
    print("Model saved in", addr)
if __name__ == '__main__':
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--train', type=str, required=True)
    # parser.add_argument('--test', type=str, required=True)
    # parser.add_argument('--validation', type=str, required=True)
    # args = parser.parse_args()
    # save_w2v_models('/media/external_3TB/3TB/rafie/results-retweet', '/media/external_3TB/3TB/rafie/rumdect/Twitter.txt')
    make_all_users('/media/external_3TB/3TB/rafie/politifact-raw-data/Politifact',
                   '/media/external_3TB/3TB/rafie/politifact-raw-data/Politifact.txt',
                   '/home/rafie/NeuralDiffusionModel-master/data/politifact/users_limited.txt')
    # make_all_pro_paths('/media/external_3TB/3TB/rafie/gossipcop-raw-data/Gossipcop', '/media/external_3TB/3TB/rafie/gossipcop-raw-data/Gossipcop.txt', '/home/rafie/NeuralDiffusionModel-master/data/gossipcop/' + args.train, '/home/rafie/NeuralDiffusionModel-master/data/gossipcop/' + args.test, '/home/rafie/NeuralDiffusionModel-master/data/gossipcop/' + args.validation, '/media/external_3TB/3TB/rafie/paper/twitter_all/eids')
    print('done')