123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408 |
- import os
- import numpy as np
- from numpy import exp
- from math import log
-
- # z motif count matrix
def create_motif_count_matrix(mat):
    """Count directed 3-node motifs in the weighted adjacency matrix *mat*.

    mat -- square (n x n) matrix; a nonzero entry mat[i][j] means edge i->j.

    Returns (freq, z):
      freq -- total number of distinct motif instances found (the original
              code always returned 0 because the counter was never updated)
      z    -- n x n matrix; z[i][j] = number of motif instances containing
              edge (i, j), counted once per motif type the instance matches.
    """
    n = len(mat)  # derive size from the input instead of a module global
    z = np.zeros([n, n])
    # One set per motif type; storing the sorted edge tuple de-duplicates
    # instances discovered from several (i, j, k) orderings.  Tuples replace
    # the original repr()/eval() round-trip.
    motif_instances = [set() for _ in range(7)]
    for i in range(n):
        for j in range(n):
            if mat[i][j] == 0:
                continue
            for k in range(n):
                if mat[j][k] != 0 and mat[k][i] != 0:
                    motif_instances[0].add(tuple(sorted([(i, j), (j, k), (k, i)])))
                if mat[j][k] != 0 and mat[k][i] != 0 and mat[k][j] != 0:
                    motif_instances[1].add(tuple(sorted([(i, j), (j, k), (k, i), (k, j)])))
                if mat[j][k] != 0 and mat[k][i] != 0 and mat[j][i] != 0 and mat[i][k] != 0:
                    motif_instances[2].add(tuple(sorted([(i, j), (j, k), (k, i), (j, i), (i, k)])))
                if mat[j][i] != 0 and mat[j][k] != 0 and mat[k][j] != 0 and mat[k][i] != 0 and mat[i][k] != 0:
                    motif_instances[3].add(tuple(sorted([(i, j), (j, k), (k, i), (j, i), (k, j), (i, k)])))
                if mat[k][j] != 0 and mat[i][k] != 0:
                    motif_instances[4].add(tuple(sorted([(i, j), (k, j), (i, k)])))
                if mat[i][k] != 0 and mat[j][k] != 0 and mat[k][j] != 0:
                    motif_instances[5].add(tuple(sorted([(i, j), (i, k), (j, k), (k, j)])))
                if mat[i][k] != 0 and mat[k][i] != 0 and mat[k][j] != 0:
                    motif_instances[6].add(tuple(sorted([(i, j), (i, k), (k, i), (k, j)])))
    # freq fix: count every de-duplicated motif instance across all types
    freq = sum(len(instances) for instances in motif_instances)
    for instances in motif_instances:
        for edges in instances:
            for (a, b) in edges:
                z[a][b] += 1
    return freq, z
-
-
- # def adjust_motif_count(z, i, j, k, freq, motif_type):
- # if motif_type == 0:
- # freq += 1 / 3
- # z[i][j] += 1 / 3
- # z[j][k] += 1 / 3
- # z[k][i] += 1 / 3
- # elif motif_type == 1:
- # freq += 1
- # z[i][j] += 1
- # z[j][k] += 1
- # z[k][i] += 1
- # elif motif_type == 2:
- # freq += 1 / 2
- # z[i][j] += 1 / 2
- # z[j][k] += 1 / 2
- # z[k][i] += 1 / 2
- # return freq
-
-
- # def check_motif():
- # arr = np.array([])
- # for i in range(10):
- # rand = np.repeat(s, 1, axis=1)
- # np.random.shuffle(rand)
- # f = check_exist(rand)
- # print(f)
- # arr += [f]
- # print(arr.mean(), arr.std(), (freq[0] - arr.mean()) / arr.std())
-
-
def normalize(mat):
    """Scale *mat* so its maximum entry becomes 1.0.

    Returns a new float array; *mat* itself is not modified.  When the
    maximum is 0 the matrix is returned as an unscaled copy -- the original
    np.vectorize version divided by zero and produced NaN/inf there.
    """
    arr = np.array(mat, dtype=float)  # explicit copy, like np.repeat(..., 1, axis=1)
    peak = np.amax(arr)
    if peak == 0:
        return arr
    return arr / peak
-
-
def find_r(mat, freq_mat):
    """Regularization term: sum over all entries of |mat[i][j] / (freq_mat[i][j] + 1)|.

    The +1 keeps the denominator nonzero for edges appearing in no motif.
    Vectorized, and sized from the arrays themselves rather than the module
    global ``num_of_nodes`` (the original O(n^2) Python loop read the global
    but both arguments already carry their own shape).
    """
    mat = np.asarray(mat, dtype=float)
    freq_mat = np.asarray(freq_mat, dtype=float)
    return float(np.abs(mat / (freq_mat + 1)).sum())
-
-
# this assumes that cascades are sorted by time
def likelihood_cascade(cascade, mat):
    """Likelihood of one infection cascade under transmission-rate matrix *mat*.

    cascade -- dict with:
      'timing'        : list of (node, infection_time) tuples sorted by time
      'not_infecteds' : nodes never infected in this cascade
    mat -- pairwise transmission rates; mat[a][b] is the rate a -> b.
    Reads the module global ``finish_time`` (observation horizon).

    NOTE(review): for the first event (i == 0) ``cumulative_hazards`` stays
    0.0, so the product is multiplied by zero and the whole likelihood
    becomes 0 for any non-empty cascade -- confirm the seed node is meant to
    contribute a hazard term.
    """
    likelihood = 1.
    for i in range(len(cascade['timing'])):
        tup = cascade['timing'][i]
        # survival terms: tup[0] failed to infect every never-infected node
        # before the end of the observation window
        for m in cascade['not_infecteds']:
            likelihood *= survival(tup[1], mat[tup[0]][m], finish_time)
        # hazard terms: sum over earlier-infected nodes that could have
        # infected tup[0] at time tup[1]
        cumulative_hazards = 0.
        for infected in cascade['timing'][:i]:
            cumulative_hazards += hazard(infected[1], mat[infected[0]][tup[0]], tup[1])
        likelihood *= cumulative_hazards
        # survival terms: earlier nodes did not infect tup[0] before tup[1]
        for infected in cascade['timing'][:i]:
            likelihood *= survival(infected[1], mat[infected[0]][tup[0]], tup[1])
    return likelihood
-
-
def hazard(start_time: float, rate: float, end_time: float) -> float:
    """Hazard of the exponential transmission model.

    For the exponential model probability/survival reduces exactly to
    ``rate`` (the hazard is constant), so it is returned directly.  The
    original division raised ZeroDivisionError whenever survival()
    underflowed to 0.0 for large rate * (end_time - start_time).
    The (start_time, end_time) parameters are kept for interface
    compatibility with callers.
    """
    return rate
-
-
def survival(start_time: float, rate: float, end_time: float) -> float:
    """Exponential survival probability over [start_time, end_time] at *rate*."""
    elapsed = end_time - start_time
    return exp(-rate * elapsed)
-
-
def probability(start_time: float, rate: float, end_time: float) -> float:
    """Exponential density of an infection at *end_time* given exposure at *start_time*."""
    elapsed = end_time - start_time
    return rate * exp(-rate * elapsed)
-
-
def gradient(mat, cascades, freq_mat):
    """Gradient of the penalized objective with respect to the rate matrix *mat*.

    cascades -- list of cascade dicts ('timing', 'not_infecteds').
    freq_mat -- motif-frequency matrix z used as a regularizer.
    Reads the module globals ``num_of_nodes`` and ``finish_time``.

    NOTE(review): the inner update uses plain assignment
    (``gradient[...] = gr``), so a later cascade overwrites -- rather than
    accumulates -- the contribution for the same (j, k) edge, while the
    surrounding terms use ``-=``/``+=``.  Confirm whether ``+=`` was intended.
    """
    gradient = np.zeros([num_of_nodes, num_of_nodes])
    for cascade in cascades:
        # survival-term gradient: each infected node failed to infect the
        # never-infected nodes up to the observation horizon
        for non_infected in cascade['not_infecteds']:
            for infected in cascade['timing']:
                gr = finish_time - infected[1]
                gradient[infected[0]][non_infected] -= gr
        for k in range(len(cascade['timing'])):
            infected_k = cascade['timing'][k]
            for infected_j in cascade['timing'][:k]:
                # sigma: total incoming rate into node k from earlier-infected nodes
                sigma = 0
                for infected_l in cascade['timing'][:k]:
                    sigma += mat[infected_l[0]][infected_k[0]]
                # drop the 1/sigma term when no earlier node has a nonzero rate
                gr = (infected_k[1] - infected_j[1]) - (1 / sigma) if sigma != 0 else infected_k[1] - infected_j[1]

                gradient[infected_j[0]][infected_k[0]] = gr
    # not sure
    # regularization from the motif-frequency matrix (+1 avoids division by zero)
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            gradient[i][j] += 1 / (freq_mat[i][j] + 1)
    # print('gradient:\n', gradient)
    return gradient
-
-
def gradient_descent(mat, cascades, freq_mat, learning_rate=0.01):
    """Projected gradient descent on the transmission-rate matrix *mat*.

    Reads the module-level hyperparameters ``gradient_descent_steps``,
    ``min_gradient_descent_steps``, ``gradient_descent_threshold`` and
    ``min_error_dif_jump``, plus ``num_of_nodes``.  Stops early when the
    mean absolute per-entry change falls below the threshold, or when the
    error bounces back up after the minimum number of steps.

    Returns the updated (normalized, non-negative) matrix.

    Note: the original body also recomputed the full cascade likelihood on
    every iteration only to discard it (its sole consumer was a
    commented-out print); that dead, expensive loop is removed.
    """
    errors = []
    step = 0
    while step < gradient_descent_steps:
        prev_mat = np.repeat(mat, 1, axis=1)  # snapshot for the convergence check
        mat -= learning_rate * gradient(mat, cascades, freq_mat)
        mat = normalize(mat)
        # project onto the feasible region: rates cannot be negative
        mat = np.clip(mat, 0, None)
        # mean absolute change per entry since the previous iterate
        error = float(np.abs(mat - prev_mat).sum()) / (num_of_nodes ** 2)
        errors += [error]
        if error < gradient_descent_threshold:
            break
        if step > min_gradient_descent_steps and error - errors[-2] > min_error_dif_jump:
            break
        step += 1
    print('gradient descent errors ranged from:\n', errors[0], 'to:', errors[-1])

    return mat
-
-
def read_result(name: str, num_of_nodes: int = 32):
    """Load a ground-truth adjacency matrix from a "i,j,...,weight" text file.

    Lines without a '.' are skipped; a missing file yields an all-zero matrix.
    """
    mat = np.zeros([num_of_nodes, num_of_nodes])
    if not os.path.isfile('./{}'.format(name)):
        return mat
    with open(name, 'r') as handle:
        for row in handle:
            if '.' not in row:
                continue
            fields = row.split(',')
            mat[int(fields[0])][int(fields[1])] = float(fields[-1])
    return mat
-
-
def print_mat(mat, num_of_nodes=32):
    """Print nonzero entries as (i, j, value) triples, one matrix row per line."""
    for row in range(num_of_nodes):
        nonzero = [(row, col, mat[row][col]) for col in range(num_of_nodes) if mat[row][col] != 0]
        for entry in nonzero:
            print(entry, end=' ')
        print()
-
-
def new_print(mat):
    """Print each row of *mat* on its own line."""
    for row in mat:
        print(row)
-
-
def round_up(mat):
    """Binarize a matrix: entries > 0 become 1.0, everything else 0.0.

    Uses a vectorized boolean mask instead of np.vectorize (which invokes a
    Python lambda per element) and works for any array shape, not only the
    2-D arrays the original np.repeat(..., axis=1) copy required.
    """
    return (np.asarray(mat) > 0.).astype(float)
-
-
def diff_is_ignorable(a, b, lim):
    """True when |a - b| stays strictly below the tolerance *lim*."""
    gap = a - b
    if gap < 0:
        gap = -gap
    return gap < lim
-
-
def diff_is_huge(a, b, lim):
    """True when |a - b| strictly exceeds the threshold *lim*."""
    magnitude = abs(a - b)
    return lim < magnitude
-
-
def accuracy(mat_guess, mat_answer, num_of_nodes, lim_huge=0.7, lim_ignore=0.1, lim_diff_zero=0.3, lim_diff_one=0.3):
    """Precision / recall / F-measure of *mat_guess* against *mat_answer*.

    Each entry is classified as:
      tp -- close to the answer (within lim_ignore) and close to 1
      tn -- close to the answer and close to 0
      fp -- far from the answer (beyond lim_huge) and close to 1
      fn -- far from the answer and close to 0
    Entries matching none of the four cases are ignored.

    Returns (precision, recall, f_measure), or (None, None, None) when a
    zero denominator makes them undefined.  The original caught the broad
    ``Exception``, which would also have masked genuine bugs; only
    ZeroDivisionError can legitimately occur here.
    """
    tp, fp, tn, fn = 0, 0, 0, 0
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            guess = mat_guess[i][j]
            answer = mat_answer[i][j]
            matches = diff_is_ignorable(guess, answer, lim_ignore)
            if matches and diff_is_ignorable(guess, 1, lim_diff_one):
                tp += 1
            elif matches and diff_is_ignorable(guess, 0, lim_diff_zero):
                tn += 1
            elif diff_is_huge(guess, answer, lim_huge) and diff_is_ignorable(guess, 1, lim_diff_one):
                fp += 1
            elif diff_is_huge(guess, answer, lim_huge) and diff_is_ignorable(guess, 0, lim_diff_zero):
                fn += 1
    try:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f_measure = 2 * precision * recall / (precision + recall)
        return precision, recall, f_measure
    except ZeroDivisionError:  # no positives at all -> metrics undefined
        print('error:', 'tp', tp, 'tn', tn, 'fp', fp, 'fn', fn)
        return None, None, None
-
-
# reading data and creating occurrence matrix
def read_data(name, window_width, with_cascade_id, semicolon):
    """Parse a cascade file; return (occurrence_matrix, cascades, window_width).

    File layout (inferred from the parser -- confirm against the data files):
      * leading lines containing neither ';' nor '.' each count one node;
      * a blank line then allocates the num_of_nodes x num_of_nodes
        occurrence matrix;
      * each remaining line is one cascade: "node,time,node,time,..."
        (or "id;events" when with_cascade_id, or ';'-separated "node,time"
        events when semicolon).

    Side effects: sets the module globals ``num_of_nodes`` and
    ``finish_time`` (largest infection time seen).

    NOTE(review): if the file lacks a blank separator line,
    ``occurrence_matrix`` is never created and the final ``return`` raises
    NameError (the try/except sits inside the loop, so it cannot catch it).
    """
    global num_of_nodes, finish_time
    num_of_nodes, finish_time = 0, 0.
    cascades = []
    if os.path.isfile('./{}'.format(name)):
        with open(name, 'r') as outfile:
            for line in outfile:
                try:
                    if line.strip() == '':
                        # blank line: the node-count header is finished
                        occurrence_matrix = np.zeros((num_of_nodes, num_of_nodes))
                    elif ';' in line or '.' in line:
                        cascade_not_infecteds = list(range(num_of_nodes))
                        cascade_timing = []
                        if with_cascade_id:
                            # "<cascade id>;node,time,node,time,..."
                            stripped = line.strip().split(';')[1].split(',')
                            for i in range(len(stripped)):
                                if i % 2 == 0:
                                    cascade_timing += [(int(stripped[i]), float(stripped[i + 1]))]
                                    cascade_not_infecteds.remove(cascade_timing[-1][0])
                                    if cascade_timing[-1][1] > finish_time:
                                        finish_time = cascade_timing[-1][1]
                        else:
                            if semicolon:
                                # "node,time;node,time;..."
                                stripped = line.strip().split(';')
                                for event_str in stripped:
                                    event = (int(event_str.split(',')[0]), float(event_str.split(',')[1]))
                                    cascade_timing += [event]
                                    cascade_not_infecteds.remove(event[0])
                                    if cascade_timing[-1][1] > finish_time:
                                        finish_time = cascade_timing[-1][1]
                                    # count co-occurrences inside the sliding time window
                                    for previous_event in cascade_timing[:-1][::-1]:
                                        if previous_event[1] < event[1] < previous_event[1] + window_width:
                                            occurrence_matrix[previous_event[0]][event[0]] += 1
                            else:
                                # "node,time,node,time,..."
                                stripped = line.strip().split(',')
                                for i in range(len(stripped)):
                                    if i % 2 == 0:
                                        event = (int(stripped[i]), float(stripped[i + 1]))
                                        cascade_timing += [event]
                                        cascade_not_infecteds.remove(event[0])
                                        if cascade_timing[-1][1] > finish_time:
                                            finish_time = cascade_timing[-1][1]
                                        # count co-occurrences inside the sliding time window
                                        for previous_event in cascade_timing[:-1][::-1]:
                                            if previous_event[1] < event[1] < previous_event[1] + window_width:
                                                occurrence_matrix[previous_event[0]][event[0]] += 1
                                        # event[0]] += 1 if occur_type == 'simple' else np.random.exponential(
                                        # event[1] - previous_event[1]) if occur_type == 'exp' else np.random.rayleigh(
                                        # event[1] - previous_event[1])
                                    else:
                                        break
                        cascade = {'timing': cascade_timing, 'not_infecteds': cascade_not_infecteds}
                        cascades += [cascade]
                    else:
                        # header line: one line per node
                        num_of_nodes += 1
                except Exception as e:
                    # NOTE(review): broad catch -- malformed lines are printed and skipped
                    print(e)
    return occurrence_matrix, cascades, window_width
-
-
def motif_aware_inference(name, result_name, semicolon, if_id, occur_type='simple', algo_steps=100, min_algo_steps=10,
                          algo_threshold=0.000001):
    """End-to-end motif-aware network inference from a cascade file.

    name        -- cascade file to read
    result_name -- ground-truth network file used for evaluation
    semicolon   -- whether cascade events are ';'-separated
    if_id       -- whether each cascade line starts with a cascade id
    occur_type  -- kept for interface compatibility (currently unused)
    algo_steps / min_algo_steps / algo_threshold -- outer-loop controls

    Sets the module-level gradient-descent hyperparameters, then alternates
    gradient descent on the rate matrix with motif re-counting until the
    outer error converges.  Prints progress and evaluation metrics and
    returns the inferred (normalized) adjacency matrix.
    """
    global gradient_descent_threshold, min_gradient_descent_steps, gradient_descent_steps, min_error_dif_jump
    gradient_descent_steps = 100
    min_gradient_descent_steps = 10
    gradient_descent_threshold = 0.000001
    min_error_dif_jump = 0.0001
    occurrence_matrix, cascades, window_width = read_data(name=name, window_width=0.7, with_cascade_id=if_id,
                                                          semicolon=semicolon)
    print('Occurrence Matrix:', occurrence_matrix)
    print('Num of Nodes:', num_of_nodes)
    print('Finish Time:', finish_time)
    print('Window Width:', window_width)
    mean = occurrence_matrix.mean()
    std = occurrence_matrix.std()
    print('Mean:', mean, 'Standard Deviation:', std)
    if std == 0:
        std = 1  # avoid dividing by zero when all counts are equal

    # s: significant pairwise influences (positive z-scores of co-occurrence)
    s = np.vectorize(lambda val: 0. if (val - mean) / std < 0 else (val - mean) / std)(
        np.repeat(occurrence_matrix, 1, axis=1))
    s = normalize(s)
    print('Significant Pairwise Influences:\n', s)
    frequency, z = create_motif_count_matrix(s)
    print('motif frequency matrix:\n', z)
    print('Num of Motifs Seen:', frequency)
    mat = np.zeros([num_of_nodes, num_of_nodes])
    i = 0
    errors = []
    while i < algo_steps:
        print('\nstarted run:', i)
        prev_mat = np.repeat(mat, 1, axis=1)
        mat = gradient_descent(mat, cascades, z)
        frequency, z = create_motif_count_matrix(mat)

        print('in run', i, 'result adjacency matrix:')
        print(mat)
        print('result motif frequency matrix:')
        print(z)

        i += 1
        # mean absolute change of the adjacency matrix in this outer step
        error = float(np.abs(mat - prev_mat).sum()) / (num_of_nodes ** 2)
        errors += [error]

        if i > min_algo_steps and error < algo_threshold:
            break
        if i > min_algo_steps and error - errors[-2] > min_error_dif_jump:
            # error bounced back up: roll back to the previous iterate and stop
            mat = prev_mat
            break
    # bug fix: was errors[1] -- crashed after a single outer step and
    # reported the wrong value (cf. the errors[-1] print in gradient_descent)
    print('\n\nerrors ranged from:', errors[0], 'to:', errors[-1])
    print('\nA:\n', mat)
    mean_m, std_m = mat.mean(), mat.std()
    if std_m == 0:
        std_m = 1  # same zero-std guard as for the occurrence matrix above
    mat = np.vectorize(lambda val: 0. if (val - mean_m) / std_m < 0 else (val - mean_m) / std_m)(mat)
    mat = normalize(mat)

    res = read_result(result_name, num_of_nodes=num_of_nodes)
    print('ground truth result:\n', res)
    pr, rec, f1 = accuracy(mat, res, num_of_nodes=num_of_nodes)
    print('precision', pr, 'recall', rec, 'f-score', f1)

    mat_round = round_up(mat)
    print('rounded up matrix\n', mat_round)
    res_round = round_up(res)
    print('rounded up ground truth\n', res_round)
    pr_round, rec_round, f1_round = accuracy(mat_round, res_round, num_of_nodes=num_of_nodes)
    print('precision', pr_round, 'recall', rec_round, 'f-score', f1_round)

    return mat
-
-
if __name__ == '__main__':
    # Run the inference pipeline only when executed as a script, not on import.
    mat = motif_aware_inference(name='cascades3.txt', result_name='network3.txt', semicolon=False, if_id=False)
|