# Scraped-page header removed — original file: motif.py (~18 KB).
  1. import os
  2. import numpy as np
  3. from numpy import exp
  4. from math import log
  5. # z motif count matrix
  6. def create_motif_count_matrix(mat):
  7. z = np.zeros([num_of_nodes, num_of_nodes])
  8. freq = 0
  9. motif_count_vector = [set() for i in range(7)]
  10. for i in range(num_of_nodes):
  11. for j in range(num_of_nodes):
  12. if mat[i][j] != 0:
  13. for k in range(num_of_nodes):
  14. if mat[j][k] != 0 and mat[k][i] != 0:
  15. motif_count_vector[0].add(repr(sorted([(i, j), (j, k), (k, i)])))
  16. if mat[j][k] != 0 and mat[k][i] != 0 and mat[k][j] != 0:
  17. motif_count_vector[1].add(repr(sorted([(i, j), (j, k), (k, i), (k, j)])))
  18. if mat[j][k] != 0 and mat[k][i] != 0 and mat[j][i] != 0 and mat[i][k] != 0:
  19. motif_count_vector[2].add(repr(sorted([(i, j), (j, k), (k, i), (j, i), (i, k)])))
  20. if mat[j][i] != 0 and mat[j][k] != 0 and mat[k][j] != 0 and mat[k][i] != 0 and mat[i][k] != 0:
  21. motif_count_vector[3].add(repr(sorted([(i, j), (j, k), (k, i), (j, i), (k, j), (i, k)])))
  22. if mat[k][j] != 0 and mat[i][k] != 0:
  23. motif_count_vector[4].add(repr(sorted([(i, j), (k, j), (i, k)])))
  24. if mat[i][k] != 0 and mat[j][k] != 0 and mat[k][j] != 0:
  25. motif_count_vector[5].add(repr(sorted([(i, j), (i, k), (j, k), (k, j)])))
  26. if mat[i][k] != 0 and mat[k][i] != 0 and mat[k][j] != 0:
  27. motif_count_vector[6].add(repr(sorted([(i, j), (i, k), (k, i), (k, j)])))
  28. # if mat[j][k] != 0 and mat[k][i] != 0:
  29. # motif_count_vector[0].add(repr({(i, j), (j, k), (k, i)}))
  30. # freq = adjust_motif_count(z, i, j, k, freq, 0)
  31. # if mat[j][k] != 0 and mat[k][i] != 0 and (mat[j][i] != 0 or mat[k][j] != 0 or mat[i][k] != 0):
  32. # motif_count_vector[1].add(repr({}))
  33. # freq = adjust_motif_count(z, i, j, k, freq, 1)
  34. # if mat[j][k] != 0 and mat[k][i] != 0 and \
  35. # ((mat[j][i] != 0 and mat[k][j] != 0) or (mat[j][i] != 0 and mat[i][k] != 0) or (
  36. # mat[k][j] != 0 and mat[i][k] != 0)):
  37. # freq = adjust_motif_count(z, i, j, k, freq, 1)
  38. # if mat[j][i] != 0 and mat[j][k] != 0 and mat[k][j] != 0 and mat[k][i] != 0 and mat[i][k] != 0:
  39. # freq = adjust_motif_count(z, i, j, k, freq, 0)
  40. # if (mat[i][k] != 0 and mat[k][j] != 0) or \
  41. # (mat[i][k] != 0 and mat[j][k] != 0) or (mat[k][i] != 0 and mat[k][j] != 0):
  42. # freq = adjust_motif_count(z, i, j, k, freq, 1)
  43. # if (mat[i][k] != 0 and mat[k][j] != 0 and mat[j][k] != 0) or \
  44. # (mat[j][i] != 0 and mat[k][i] != 0 and mat[k][j] != 0):
  45. # freq = adjust_motif_count(z, i, j, k, freq, 2)
  46. # if (mat[i][k] != 0 and mat[k][i] != 0 and mat[k][j] != 0) or \
  47. # (mat[j][i] != 0 and mat[j][k] != 0 and mat[i][k] != 0):
  48. # freq = adjust_motif_count(z, i, j, k, freq, 2)
  49. for motif_vector in motif_count_vector:
  50. for x in motif_vector:
  51. for (i, j) in eval(x):
  52. z[i][j] += 1
  53. return freq, z
  54. # def adjust_motif_count(z, i, j, k, freq, motif_type):
  55. # if motif_type == 0:
  56. # freq += 1 / 3
  57. # z[i][j] += 1 / 3
  58. # z[j][k] += 1 / 3
  59. # z[k][i] += 1 / 3
  60. # elif motif_type == 1:
  61. # freq += 1
  62. # z[i][j] += 1
  63. # z[j][k] += 1
  64. # z[k][i] += 1
  65. # elif motif_type == 2:
  66. # freq += 1 / 2
  67. # z[i][j] += 1 / 2
  68. # z[j][k] += 1 / 2
  69. # z[k][i] += 1 / 2
  70. # return freq
  71. # def check_motif():
  72. # arr = np.array([])
  73. # for i in range(10):
  74. # rand = np.repeat(s, 1, axis=1)
  75. # np.random.shuffle(rand)
  76. # f = check_exist(rand)
  77. # print(f)
  78. # arr += [f]
  79. # print(arr.mean(), arr.std(), (freq[0] - arr.mean()) / arr.std())
  80. def normalize(mat):
  81. s = np.amax(mat)
  82. return np.vectorize(lambda val: val / s)(np.repeat(mat, 1, axis=1))
  83. def find_r(mat, freq_mat):
  84. global num_of_nodes, finish_time
  85. summation = 0.
  86. for i in range(num_of_nodes):
  87. for j in range(num_of_nodes):
  88. summation += abs(mat[i][j] / (freq_mat[i][j] + 1))
  89. return summation
  90. # this assumes that cascades are sorted by time
  91. def likelihood_cascade(cascade, mat):
  92. likelihood = 1.
  93. for i in range(len(cascade['timing'])):
  94. tup = cascade['timing'][i]
  95. # not infecteds from him:
  96. for m in cascade['not_infecteds']:
  97. likelihood *= survival(tup[1], mat[tup[0]][m], finish_time)
  98. # nodes he might be infected from
  99. cumulative_hazards = 0.
  100. for infected in cascade['timing'][:i]:
  101. cumulative_hazards += hazard(infected[1], mat[infected[0]][tup[0]], tup[1])
  102. likelihood *= cumulative_hazards
  103. for infected in cascade['timing'][:i]:
  104. likelihood *= survival(infected[1], mat[infected[0]][tup[0]], tup[1])
  105. return likelihood
  106. def hazard(start_time: float, rate: float, end_time: float) -> float:
  107. return probability(start_time, rate, end_time) / survival(start_time, rate, end_time)
  108. def survival(start_time: float, rate: float, end_time: float) -> float:
  109. x = exp(-rate * (end_time - start_time))
  110. return x
  111. def probability(start_time: float, rate: float, end_time: float) -> float:
  112. return rate * exp(-rate * (end_time - start_time))
def gradient(mat, cascades, freq_mat):
    """Gradient of the cascade objective with respect to the rate matrix `mat`,
    plus a motif-based regularization term derived from `freq_mat`.

    Relies on module globals `num_of_nodes` and `finish_time` set by read_data().
    NOTE(review): appears to follow a NetRate-style formulation — confirm signs
    against the derivation.
    """
    gradient = np.zeros([num_of_nodes, num_of_nodes])
    for cascade in cascades:
        # Survival terms: each infected node failed to infect every
        # never-infected node for the rest of the observation window.
        for non_infected in cascade['not_infecteds']:
            for infected in cascade['timing']:
                gr = finish_time - infected[1]
                gradient[infected[0]][non_infected] -= gr
        # Pairwise terms between an earlier infection j and a later infection k.
        for k in range(len(cascade['timing'])):
            infected_k = cascade['timing'][k]
            for infected_j in cascade['timing'][:k]:
                # sigma: total incoming rate into node k from all earlier infections.
                sigma = 0
                for infected_l in cascade['timing'][:k]:
                    sigma += mat[infected_l[0]][infected_k[0]]
                gr = (infected_k[1] - infected_j[1]) - (1 / sigma) if sigma != 0 else infected_k[1] - infected_j[1]
                # NOTE(review): plain assignment overwrites contributions from
                # earlier cascades for the same (j, k) edge; accumulation
                # (`+=`) looks intended — confirm.
                gradient[infected_j[0]][infected_k[0]] = gr
    # not sure
    # Regularization: edges frequent in motifs get a smaller additive push.
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            gradient[i][j] += 1 / (freq_mat[i][j] + 1)
    # print('gradient:\n', gradient)
    return gradient
def gradient_descent(mat, cascades, freq_mat, learning_rate=0.01):
    """Projected gradient descent on the rate matrix `mat`.

    Each step applies the gradient, rescales with normalize(), clamps negative
    rates to zero, and tracks the mean absolute per-entry change as the
    convergence error. Relies on module globals set by motif_aware_inference():
    gradient_descent_steps, min_gradient_descent_steps,
    gradient_descent_threshold, min_error_dif_jump, and num_of_nodes.
    Returns the updated matrix.
    """
    errors = []
    ii = 0
    while ii < gradient_descent_steps:
        # np.repeat(..., 1, axis=1) serves as a copy so prev_mat is not aliased.
        prev_mat = np.repeat(mat, 1, axis=1)
        mat -= learning_rate * gradient(mat, cascades, freq_mat)
        mat = normalize(mat)
        # NOTE(review): `likelihood` is only used by the commented-out print
        # below — this is dead work on every iteration.
        likelihood = 1.
        for casc in cascades:
            likelihood *= likelihood_cascade(casc, mat)
        # print('likelihood:', likelihood - find_r(mat, freq_mat))
        # Projection: transmission rates must be non-negative.
        for i in range(num_of_nodes):
            for j in range(num_of_nodes):
                mat[i][j] = 0 if mat[i][j] < 0 else mat[i][j]
        error = 0
        for j in range(num_of_nodes):
            for k in range(num_of_nodes):
                # if mat[j][k] - prev_mat[j][k] != 0:
                # print('different values seen in:', j, k, 'with value:', prev_mat[j][k], 'becoming:', mat[j][k])
                error += abs(mat[j][k] - prev_mat[j][k])
        error /= (num_of_nodes ** 2)
        errors += [error]
        # print('gradient descent error in step', ii, ' :', error)
        if error < gradient_descent_threshold:
            break
        # Stop early if the error jumps back up after the minimum step count
        # (errors[-2] is safe here: at least two errors exist by then).
        if ii > min_gradient_descent_steps and error - errors[-2] > min_error_dif_jump:
            break
        ii += 1
    print('gradient descent errors ranged from:\n', errors[0], 'to:', errors[-1])
    return mat
  164. def read_result(name: str, num_of_nodes: int = 32):
  165. mat = np.zeros([num_of_nodes, num_of_nodes])
  166. if os.path.isfile('./{}'.format(name)):
  167. with open(name, 'r') as outfile:
  168. for line in outfile:
  169. if '.' in line:
  170. stripped = line.split(',')
  171. i = int(stripped[0])
  172. j = int(stripped[1])
  173. mat[i][j] = float(stripped[-1])
  174. return mat
  175. def print_mat(mat, num_of_nodes=32):
  176. for i in range(num_of_nodes):
  177. for j in range(num_of_nodes):
  178. if mat[i][j] != 0:
  179. print((i, j, mat[i][j]), end=' ')
  180. print()
  181. def new_print(mat):
  182. for x in mat:
  183. print(x)
  184. def round_up(mat):
  185. return np.vectorize(lambda val: 1. if val > 0. else 0.)(np.repeat(mat, 1, axis=1))
  186. def diff_is_ignorable(a, b, lim):
  187. return abs(a - b) < lim
  188. def diff_is_huge(a, b, lim):
  189. return abs(a - b) > lim
  190. def accuracy(mat_guess, mat_answer, num_of_nodes, lim_huge=0.7, lim_ignore=0.1, lim_diff_zero=0.3, lim_diff_one=0.3):
  191. tp, fp, tn, fn = 0, 0, 0, 0
  192. for i in range(num_of_nodes):
  193. for j in range(num_of_nodes):
  194. if diff_is_ignorable(mat_guess[i][j], mat_answer[i][j], lim_ignore) and diff_is_ignorable(mat_guess[i][j],
  195. 1, lim_diff_one):
  196. tp += 1
  197. elif diff_is_ignorable(mat_guess[i][j], mat_answer[i][j], lim_ignore) and diff_is_ignorable(mat_guess[i][j],
  198. 0,
  199. lim_diff_zero):
  200. tn += 1
  201. elif diff_is_huge(mat_guess[i][j], mat_answer[i][j], lim_huge) and diff_is_ignorable(mat_guess[i][j], 1,
  202. lim_diff_one):
  203. fp += 1
  204. elif diff_is_huge(mat_guess[i][j], mat_answer[i][j], lim_huge) and diff_is_ignorable(mat_guess[i][j], 0,
  205. lim_diff_zero):
  206. fn += 1
  207. try:
  208. precision = tp / (tp + fp)
  209. recall = tp / (tp + fn)
  210. f_measure = 2 * precision * recall / (precision + recall)
  211. return precision, recall, f_measure
  212. except Exception as e:
  213. print('error:', 'tp', tp, 'tn', tn, 'fp', fp, 'fn', fn)
  214. return None, None, None
# reading data and creating occurrence matrix
def read_data(name, window_width, with_cascade_id, semicolon):
    """Parse a cascade file and build the co-occurrence matrix.

    File layout (reconstructed from the parsing code — confirm against real
    data files):
      * plain lines (no '.', ';' and not blank): one per node, counted into
        `num_of_nodes`;
      * a blank line: triggers allocation of the occurrence matrix;
      * cascade lines: "node,time,node,time,..." (comma format), or
        "node,time;node,time;..." when `semicolon`, or "id;node,time,..."
        when `with_cascade_id`. Events are assumed sorted by time.

    Sets module globals `num_of_nodes` and `finish_time` as side effects.
    Returns (occurrence_matrix, cascades, window_width).

    NOTE(review): if the file is missing, or no blank line precedes the first
    cascade line, `occurrence_matrix` is unbound at `return` -> NameError.
    NOTE(review): original indentation was lost; the trailing `else: break` is
    attached to the time-window check on the assumption that the backward scan
    over earlier events can stop at the first one outside the window — confirm.
    """
    global num_of_nodes, finish_time
    num_of_nodes, finish_time = 0, 0.
    cascades = []
    if os.path.isfile('./{}'.format(name)):
        with open(name, 'r') as outfile:
            for line in outfile:
                try:
                    if line.strip() == '':
                        # separator line: node counting is done, allocate matrix
                        occurrence_matrix = np.zeros((num_of_nodes, num_of_nodes))
                    elif ';' in line or '.' in line:
                        cascade_not_infecteds = list(range(num_of_nodes))
                        cascade_timing = []
                        if with_cascade_id:
                            # drop the cascade id before the first ';'
                            stripped = line.strip().split(';')[1].split(',')
                            for i in range(len(stripped)):
                                if i % 2 == 0:
                                    cascade_timing += [(int(stripped[i]), float(stripped[i + 1]))]
                                    cascade_not_infecteds.remove(cascade_timing[-1][0])
                                    if cascade_timing[-1][1] > finish_time:
                                        finish_time = cascade_timing[-1][1]
                        else:
                            if semicolon:
                                stripped = line.strip().split(';')
                                for event_str in stripped:
                                    event = (int(event_str.split(',')[0]), float(event_str.split(',')[1]))
                                    cascade_timing += [event]
                                    cascade_not_infecteds.remove(event[0])
                                    if cascade_timing[-1][1] > finish_time:
                                        finish_time = cascade_timing[-1][1]
                                    # count co-occurrence with earlier events in the window
                                    for previous_event in cascade_timing[:-1][::-1]:
                                        if previous_event[1] < event[1] < previous_event[1] + window_width:
                                            occurrence_matrix[previous_event[0]][event[0]] += 1
                            else:
                                stripped = line.strip().split(',')
                                for i in range(len(stripped)):
                                    if i % 2 == 0:
                                        event = (int(stripped[i]), float(stripped[i + 1]))
                                        cascade_timing += [event]
                                        cascade_not_infecteds.remove(event[0])
                                        if cascade_timing[-1][1] > finish_time:
                                            finish_time = cascade_timing[-1][1]
                                        # scan earlier events newest-first
                                        for previous_event in cascade_timing[:-1][::-1]:
                                            if previous_event[1] < event[1] < previous_event[1] + window_width:
                                                occurrence_matrix[previous_event[0]][event[0]] += 1
                                                # event[0]] += 1 if occur_type == 'simple' else np.random.exponential(
                                                # event[1] - previous_event[1]) if occur_type == 'exp' else np.random.rayleigh(
                                                # event[1] - previous_event[1])
                                            else:
                                                break
                        cascade = {'timing': cascade_timing, 'not_infecteds': cascade_not_infecteds}
                        cascades += [cascade]
                    else:
                        # header line: one node per line before the cascades
                        num_of_nodes += 1
                except Exception as e:
                    # NOTE(review): broad catch silently tolerates malformed lines;
                    # consider narrowing to (ValueError, IndexError).
                    print(e)
    return occurrence_matrix, cascades, window_width
def motif_aware_inference(name, result_name, semicolon, if_id, occur_type='simple', algo_steps=100, min_algo_steps=10,
                          algo_threshold=0.000001):
    """End-to-end driver: read cascades, estimate significant pairwise
    influences, then alternate gradient descent on the rate matrix with motif
    re-counting until the matrix stops changing. Prints diagnostics and
    accuracy against the ground-truth network in `result_name`; returns the
    inferred matrix.

    NOTE(review): `occur_type` is accepted but unused in the visible code.
    """
    # Gradient-descent hyper-parameters are published as module globals for
    # gradient_descent() to read.
    global gradient_descent_threshold, min_gradient_descent_steps, gradient_descent_steps, min_error_dif_jump
    gradient_descent_steps = 100
    min_gradient_descent_steps = 10
    gradient_descent_threshold = 0.000001
    min_error_dif_jump = 0.0001
    # np.set_printoptions(edgeitems=6)
    # np.core.arrayprint._line_width = 1000000
    occurrence_matrix, cascades, window_width = read_data(name=name, window_width=0.7, with_cascade_id=if_id,
                                                          semicolon=semicolon)
    # print('Cascades:\n', cascades)
    print('Occurrence Matrix:', occurrence_matrix)
    print('Num of Nodes:', num_of_nodes)
    print('Finish Time:', finish_time)
    print('Window Width:', window_width)
    mean = occurrence_matrix.mean()
    std = occurrence_matrix.std()
    print('Mean:', mean, 'Standard Deviation:', std)
    if std == 0:
        std = 1
    # filter out
    # s: significant pairwise influences — z-score clipped at zero, then rescaled
    s = np.vectorize(lambda val: 0. if (val - mean) / std < 0 else (val - mean) / std)(
        np.repeat(occurrence_matrix, 1, axis=1))
    s = normalize(s)
    print('Significant Pairwise Influences:\n', s)
    frequency, z = create_motif_count_matrix(s)
    print('motif frequency matrix:\n', z)
    print('Num of Motifs Seen:', frequency)
    mat = np.zeros([num_of_nodes, num_of_nodes])
    i = 0
    errors = []
    while i < algo_steps:
        # NOTE(review): `mats` is re-created each iteration, so it only ever
        # holds the current matrix and is never read — looks vestigial.
        mats = []
        print('\nstarted run:', i)
        prev_mat = np.repeat(mat, 1, axis=1)
        mat = gradient_descent(mat, cascades, z)
        frequency, z = create_motif_count_matrix(mat)
        print('in run', i, 'result adjacency matrix:')
        print(mat)
        mats += [mat]
        print('result motif frequency matrix:')
        print(z)
        # for xy in l:
        # print('important edge:', xy[0], xy[1], mat[xy[0]][xy[1]])
        # print('error in step', i, ' :', error)
        i += 1
        # mean absolute per-entry change between outer iterations
        error = 0
        for j in range(num_of_nodes):
            for k in range(num_of_nodes):
                error += abs(mat[j][k] - prev_mat[j][k])
        error /= (num_of_nodes ** 2)
        errors += [error]
        if i > min_algo_steps and error < algo_threshold:
            break
        # Revert and stop if the error jumps back up (divergence guard).
        if i > min_algo_steps and error - errors[-2] > min_error_dif_jump:
            mat = prev_mat
            break
    # NOTE(review): `errors[1]` prints the second error, not the last one —
    # `errors[-1]` was probably intended.
    print('\n\nerrors ranged from:', errors[0], 'to:', errors[1])
    print('\nA:\n', mat)
    # z-score threshold again, then rescale to [0, 1]
    mean_m, std_m = mat.mean(), mat.std()
    # NOTE(review): unlike `std` above, std_m == 0 is not guarded here
    # (division by zero for a constant matrix).
    mat = np.vectorize(lambda val: 0. if (val - mean_m) / std_m < 0 else (val - mean_m) / std_m)(mat)
    mat = normalize(mat)
    res = read_result(result_name, num_of_nodes=num_of_nodes)
    print('ground truth result:\n', res)
    pr, rec, f1 = accuracy(mat, res, num_of_nodes=num_of_nodes)
    print('precision', pr, 'recall', rec, 'f-score', f1)
    mat_round = round_up(mat)
    print('rounded up matrix\n', mat_round)
    res_round = round_up(res)
    print('rounded up ground truth\n', res_round)
    pr_round, rec_round, f1_round = accuracy(mat_round, res_round, num_of_nodes=num_of_nodes)
    print('precision', pr_round, 'recall', rec_round, 'f-score', f1_round)
    # print(util.print_mat(mat, num_of_nodes))
    # mat = np.zeros([4, 4])
    # mat[0][1] = 1
    # mat[1][2] = 1
    # mat[1][3] = 1
    # mat[2][3] = 1
    # mat[3][2] = 1
    # print(likelihood_cascade(cascades[0], mat))
    return mat
  356. mat = motif_aware_inference(name='cascades3.txt', result_name='network3.txt', semicolon=False, if_id=False)