
cluster_text.py 14KB

import logging
import os
import pickle

import numpy as np
import pandas as pd
import gower
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import DistanceMetric
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.preprocessing import MaxAbsScaler
from kmodes import kprototypes

ROOT_DIR = os.path.dirname(os.path.realpath(__file__))
model = SentenceTransformer('paraphrase-distilroberta-base-v1')
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    filename='cluster_text.log', level=logging.INFO,
                    datefmt='%Y-%m-%d %H:%M:%S')


def embed_user_cascades(user_cascades):
    """Average the sentence embeddings of a user's cascade texts and return it
    together with the fraction of cascades flagged as fake (cascade[1])."""
    global model
    fake_casc_percent = 0
    user_tweets_embed = np.zeros(768)
    for cascade in user_cascades:
        fake_casc_percent += int(cascade[1])
        user_tweets_embed = np.add(user_tweets_embed, model.encode(cascade[2]))
    user_tweets_embed = user_tweets_embed / len(user_cascades)
    fake_casc_percent = fake_casc_percent / len(user_cascades)
    return user_tweets_embed, fake_casc_percent


def user_ids(users_dict):
    """Return the list of user ids in the users dictionary."""
    user_ids = []
    for user_id, user_info in users_dict.items():
        user_ids += [user_id]
    return user_ids


def user_embedding(users_dict):
    """Embed every user's bio and cascade texts, then reduce both embedding
    matrices to 5 dimensions with PCA."""
    global model
    logging.info("start embedding.")
    user_ids = []
    users_bio = None
    users_tweets_embed = None
    ctr = 0
    for user_id, user_info in users_dict.items():
        # if ctr >= 5000:
        #     break
        ctr += 1
        user_ids += [user_id]
        user_bio_embed = model.encode(
            user_info['profile_features']['description'])
        user_tweets_embed, fake_casc_percent = embed_user_cascades(
            user_info['cascades_feature'])
        if users_bio is None:
            users_bio = [user_bio_embed.tolist()]
            users_tweets_embed = [user_tweets_embed.tolist()]
        else:
            users_bio = np.append(users_bio, [user_bio_embed.tolist()], axis=0)
            users_tweets_embed = np.append(
                users_tweets_embed, [user_tweets_embed.tolist()], axis=0)
    logging.info("start pca.")
    bio_pca = PCA(n_components=5)
    tweet_pca = PCA(n_components=5)
    users_bio = bio_pca.fit_transform(users_bio)
    logging.info("users pca finished.")
    users_tweets_embed = tweet_pca.fit_transform(users_tweets_embed)
    logging.info("tweets pca finished.")
    return user_ids, users_bio, users_tweets_embed


def users_feature_extraction_with_profile(users_ids, my_data):
    """Build an 8-dimensional, profile-only feature vector per user and pickle it."""
    data_set = {}
    logging.info("start making feature vectors of users.")
    for i in range(len(users_ids)):
        user = users_ids[i]
        profile_background_tile = 1 if my_data[user]['profile_features']['profile_background_tile'] else 0
        profile_use_background_image = 1 if my_data[user][
            'profile_features']['profile_use_background_image'] else 0
        screen_name = len(my_data[user]['profile_features']['screen_name'])
        verified = 1 if my_data[user]['profile_features']['verified'] else 0
        statuses_count = my_data[user]['profile_features']['statuses_count']
        favourites_count = my_data[user]['profile_features']['favourites_count']
        has_extended_profile = 1 if my_data[user]['profile_features']['has_extended_profile'] else 0
        friends_count = my_data[user]['profile_features']['friends_count']
        followers_count = my_data[user]['profile_features']['followers_count']
        number_cascades = len(my_data[user]['cascades_feature'])
        user_feature_vect = [
            profile_background_tile,
            verified,
            statuses_count,
            favourites_count,
            has_extended_profile,
            friends_count,
            followers_count,
            number_cascades
        ]
        data_set[user] = user_feature_vect
        logging.info("user {0} added. index = {1}.".format(user, i))
    logging.info('writing output to users_feature_8d.p')
    with open('users_feature_8d.p', 'wb') as file_to_write:
        pickle.dump(data_set, file_to_write)
    logging.info("finished.")


def users_feature_extraction(users_ids, users_bio, users_tweet, my_data):
    """Build a 20-dimensional feature vector per user (10 profile features +
    5-d bio embedding + 5-d tweet embedding) and pickle it."""
    data_set = {}
    logging.info("start making feature vectors of users.")
    for i in range(len(users_ids)):
        user = users_ids[i]
        user_bio = users_bio[i].tolist()
        user_tweet = users_tweet[i].tolist()
        profile_background_tile = 1 if my_data[user]['profile_features']['profile_background_tile'] else 0
        profile_use_background_image = 1 if my_data[user][
            'profile_features']['profile_use_background_image'] else 0
        screen_name = len(my_data[user]['profile_features']['screen_name'])
        verified = 1 if my_data[user]['profile_features']['verified'] else 0
        statuses_count = my_data[user]['profile_features']['statuses_count']
        favourites_count = my_data[user]['profile_features']['favourites_count']
        has_extended_profile = 1 if my_data[user]['profile_features']['has_extended_profile'] else 0
        friends_count = my_data[user]['profile_features']['friends_count']
        followers_count = my_data[user]['profile_features']['followers_count']
        number_cascades = len(my_data[user]['cascades_feature'])
        user_feature_vect = [
            profile_background_tile,
            profile_use_background_image,
            screen_name,
            verified,
            statuses_count,
            favourites_count,
            has_extended_profile,
            friends_count,
            followers_count,
            number_cascades
        ] + user_bio + user_tweet
        data_set[user] = user_feature_vect
        logging.info("user {0} added. index = {1}.".format(user, i))
    logging.info('writing output to users_feature_20d.p')
    with open('users_feature_20d.p', 'wb') as file_to_write:
        pickle.dump(data_set, file_to_write)
    logging.info("finished.")


def users_clustering(users_ids, users_bio, users_tweet, my_data):
    """Cluster the 20-dimensional user vectors with k-prototypes (mixed
    categorical/numerical features) and pickle the cluster assignments."""
    users_dataset = []
    data_set = {}
    for i in range(len(users_ids)):
        user = users_ids[i]
        user_bio = users_bio[i].tolist()
        user_tweet = users_tweet[i].tolist()
        profile_background_tile = 1 if my_data[user]['profile_features']['profile_background_tile'] else 0
        profile_use_background_image = 1 if my_data[user][
            'profile_features']['profile_use_background_image'] else 0
        screen_name = len(my_data[user]['profile_features']['screen_name'])
        verified = 1 if my_data[user]['profile_features']['verified'] else 0
        statuses_count = my_data[user]['profile_features']['statuses_count']
        favourites_count = my_data[user]['profile_features']['favourites_count']
        has_extended_profile = 1 if my_data[user]['profile_features']['has_extended_profile'] else 0
        friends_count = my_data[user]['profile_features']['friends_count']
        followers_count = my_data[user]['profile_features']['followers_count']
        number_cascades = len(my_data[user]['cascades_feature'])
        users_dataset.append([
            profile_background_tile,
            profile_use_background_image,
            screen_name,
            verified,
            statuses_count,
            favourites_count,
            has_extended_profile,
            friends_count,
            followers_count,
            number_cascades
        ] + user_bio + user_tweet)
        data_set[i] = user
    logging.info("making data matrix finished.")
    users_dataset = np.array(users_dataset)
    logging.info('data set created')
    kproto_init = kprototypes.KPrototypes(
        n_clusters=3600, init="Huang", verbose=2, n_init=1)
    logging.info('go for learning clusters')
    result = kproto_init.fit_predict(users_dataset, categorical=[0, 1, 3, 6])
    logging.info("model fit-predict result:{0}".format(result))
    pickle.dump(result, open('results1_text.p', 'wb'))
    pickle.dump(data_set, open('results11_text.p', 'wb'))
    with open('results1_text.txt', 'w') as f:
        f.write("\n".join(str(label) for label in result))


def cluster_from_pickle(number_of_clusters=3600):
    """Load pickled user feature vectors and cluster them with k-prototypes."""
    user_features = pickle.load(
        open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb'))
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    print(users_dataset[1])
    kproto_init = kprototypes.KPrototypes(
        n_clusters=number_of_clusters, init="Huang", verbose=2, n_init=1)
    result = kproto_init.fit_predict(users_dataset, categorical=[0, 1, 3, 6])
    clustering_result = {}
    for i in range(len(result)):
        if result[i] in clustering_result:
            clustering_result[result[i]] += [users_features_vectors[i]]
        else:
            clustering_result[result[i]] = [users_features_vectors[i]]
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
    # cluster_vectors = np.array([[0. for i in range(len(users_dataset[0]))] for i in range(number_of_clusters)])
    # for i in range(len(result)):
    #     cluster_vectors[result[i]] = np.add(cluster_vectors[result[i]], users_dataset[i])
    # return cluster_vectors


def gower_distance(X):
    """Compute the pairwise Gower distance between the rows of X.

    X is expected to be a pandas DataFrame (or convertible to one) with the
    features along the columns. Columns of object dtype are treated as nominal
    variables; all other columns are treated as numeric variables.

    Distance metrics used:
    - Nominal variables: Dice distance (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    - Numeric variables: Manhattan distance normalized by the range of the variable (https://en.wikipedia.org/wiki/Taxicab_geometry)
    """
    X = pd.DataFrame(X)
    individual_variable_distances = []
    for i in range(X.shape[1]):
        feature = X.iloc[:, [i]]
        if feature.dtypes.iloc[0] == object:
            feature_dist = DistanceMetric.get_metric(
                'dice').pairwise(pd.get_dummies(feature))
        else:
            feature_dist = DistanceMetric.get_metric(
                'manhattan').pairwise(feature) / np.ptp(feature.values)
        individual_variable_distances.append(feature_dist)
    return np.array(individual_variable_distances).mean(0)
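
# Minimal usage sketch for gower_distance (illustrative only; the column names
# and values below are made up). It shows the expected input -- a small
# mixed-type DataFrame -- and that the result is a symmetric
# n_samples x n_samples distance matrix with zeros on the diagonal:
#
#     toy = pd.DataFrame({
#         'verified': ['yes', 'no', 'yes'],    # object dtype -> Dice distance
#         'followers_count': [10, 200, 50],    # numeric -> range-normalized Manhattan
#     })
#     dist = gower_distance(toy)               # ndarray of shape (3, 3)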


def scikit_clustering_ver2(number_of_clusters=1700):
    """Cluster user vectors with k-means and pickle the per-cluster mean
    vectors and the user-to-cluster mapping."""
    user_features = pickle.load(
        open(os.path.join(ROOT_DIR, '../../Trial/data2/idx2vec.pickle'), 'rb'))
    users_features_vectors = []
    users_id = []
    for user_id, user_vector in user_features.items():
        users_features_vectors += [user_vector]
        users_id += [user_id]
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    # columns 0, 1, 3 and 6 are binary profile flags; the remaining profile
    # columns are counts/lengths that get scaled by their maximum absolute value
    df[0] = df[0].astype('category')
    df[1] = df[1].astype('category')
    df[3] = df[3].astype('category')
    df[6] = df[6].astype('category')
    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])
    clustering = KMeans(n_clusters=number_of_clusters, verbose=1).fit(df)
    result = clustering.labels_
    logging.info("result: {0}".format(result))
    clusters_vectors = {}
    cluster_size = {}
    user_to_cluster = {}
    for i in range(len(result)):
        if result[i] in clusters_vectors:
            clusters_vectors[result[i]] = np.add(clusters_vectors[result[i]], users_features_vectors[i])
            cluster_size[result[i]] += 1
        else:
            clusters_vectors[result[i]] = np.array(users_features_vectors[i])
            cluster_size[result[i]] = 1  # count the first member of the cluster
        user_to_cluster[users_id[i]] = result[i]
    for cluster_ind, cluster_vec in clusters_vectors.items():
        clusters_vectors[cluster_ind] = cluster_vec / cluster_size[cluster_ind]
    # clustering_result = {}
    # for i in range(len(result)):
    #     if result[i] in clustering_result:
    #         clustering_result[result[i]] += [users_features_vectors[i]]
    #     else:
    #         clustering_result[result[i]] = [users_features_vectors[i]]
    # file_to_write = open('users_vectors_clustering.p', 'wb')
    # pickle.dump(clustering_result, file_to_write)
    with open('clusters_vectors_20d.p', 'wb') as file_to_write:
        pickle.dump(clusters_vectors, file_to_write)
    with open('users_to_cluster_20d.p', 'wb') as file_to_write:
        pickle.dump(user_to_cluster, file_to_write)


def scikit_clustering(number_of_clusters=3600):
    """Cluster the 66-dimensional user vectors with complete-linkage
    agglomerative clustering on a precomputed Gower distance matrix."""
    user_features = pickle.load(
        open(os.path.join(ROOT_DIR, 'users_feature_66d.p'), 'rb'))
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    df[0] = df[0].astype('category')
    df[1] = df[1].astype('category')
    df[3] = df[3].astype('category')
    df[6] = df[6].astype('category')
    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])
    print(df.iloc[:, [0]].dtypes[0])
    # compute the Gower distance matrix up front and hand it to
    # AgglomerativeClustering as a precomputed affinity
    gower_dist = gower.gower_matrix(df)
    clustering = AgglomerativeClustering(
        n_clusters=number_of_clusters, affinity='precomputed',
        linkage='complete').fit(gower_dist)
    result = clustering.labels_
    clustering_result = {}
    for i in range(len(result)):
        if result[i] in clustering_result:
            clustering_result[result[i]] += [users_features_vectors[i]]
        else:
            clustering_result[result[i]] = [users_features_vectors[i]]
    with open('users_vectors_clustering_66d.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)


if __name__ == '__main__':
    user_dict = pickle.load(open('../twitter-raw-data/users_data.p', "rb"))
    # user_ids = user_ids(user_dict)
    # users_feature_extraction_with_profile(user_ids, user_dict)
    # user_ids, users_bio, users_tweet = user_embedding(user_dict)
    # users_feature_extraction(user_ids, users_bio, users_tweet, user_dict)
    scikit_clustering_ver2()
    # user_embedding({'12': {'description': 'hi to you', 'cascades_feature': [[12, 1, 'this is a test']]},
    #                 '13': {'description': 'hi to me', 'cascades_feature': [[12, 1, 'this is not a test']]}})