"""Motif-aware diffusion-network inference from information cascades.

Reads node/cascade data, builds a co-occurrence matrix, and alternates
gradient descent on an exponential-survival likelihood with re-estimation
of a three-node-motif count matrix used as a regulariser.
"""

import os
import numpy as np
from numpy import exp
from math import log  # noqa: F401 -- kept from the original module

# Shared module state.  read_data() / motif_aware_inference() assign these;
# the defaults let the individual helpers be exercised standalone.
num_of_nodes = 0
finish_time = 0.0
gradient_descent_steps = 100
min_gradient_descent_steps = 10
gradient_descent_threshold = 0.000001
min_error_dif_jump = 0.0001


def create_motif_count_matrix(mat):
    """Count three-node motifs in the weighted adjacency matrix ``mat``.

    An edge (a, b) exists when ``mat[a][b] != 0``.  For every present edge
    (i, j) and every third node k, seven motif templates are checked; each
    discovered motif is canonicalised (sorted tuple of its edges) so the same
    motif reached from different (i, j, k) orderings is counted only once.

    Returns:
        (freq, z): ``freq`` is the total number of distinct motifs found
        (bug fix -- the original always returned 0 because its counting code
        was commented out); ``z[a][b]`` counts motif edges incident to the
        directed edge (a, b).
    """
    n = len(mat)
    z = np.zeros([n, n])
    # One set of canonical motifs per template so duplicates collapse.
    motifs_by_type = [set() for _ in range(7)]
    for i in range(n):
        for j in range(n):
            if mat[i][j] == 0:
                continue
            for k in range(n):
                # Edges (besides (i, j)) that must be present for each of
                # the seven motif templates, in the original's order.
                required_by_type = (
                    [(j, k), (k, i)],
                    [(j, k), (k, i), (k, j)],
                    [(j, k), (k, i), (j, i), (i, k)],
                    [(j, i), (j, k), (k, j), (k, i), (i, k)],
                    [(k, j), (i, k)],
                    [(i, k), (j, k), (k, j)],
                    [(i, k), (k, i), (k, j)],
                )
                for motif_type, required in enumerate(required_by_type):
                    if all(mat[a][b] != 0 for a, b in required):
                        motifs_by_type[motif_type].add(
                            tuple(sorted([(i, j)] + required)))
    freq = sum(len(found) for found in motifs_by_type)
    for found in motifs_by_type:
        for edges in found:
            for a, b in edges:
                z[a][b] += 1
    return freq, z


def normalize(mat):
    """Scale ``mat`` into [0, 1] by dividing by its maximum entry.

    Returns a new float array.  An all-zero matrix is returned as zeros
    (the original divided by zero here).
    """
    scaled = np.array(mat, dtype=float)
    peak = np.amax(scaled)
    if peak != 0:
        scaled /= peak
    return scaled


def find_r(mat, freq_mat):
    """Motif regulariser value: sum of |mat / (freq_mat + 1)| over all cells."""
    return float(np.abs(np.asarray(mat) / (np.asarray(freq_mat) + 1)).sum())


def likelihood_cascade(cascade, mat):
    """Survival-model likelihood of one cascade under rate matrix ``mat``.

    ``cascade['timing']`` is a time-sorted list of (node, infection_time)
    pairs; ``cascade['not_infecteds']`` lists nodes never infected.  Uses the
    module-global ``finish_time`` as the observation horizon.

    NOTE(review): the first event has no earlier sources, so its
    ``cumulative_hazards`` is 0 and the whole product becomes 0 -- kept from
    the original; confirm this is intended.
    """
    likelihood = 1.0
    timing = cascade['timing']
    for idx, (node, t) in enumerate(timing):
        # Nodes this one failed to infect survive until the horizon.
        for m in cascade['not_infecteds']:
            likelihood *= survival(t, mat[node][m], finish_time)
        # Hazard contributed by every earlier infection ...
        cumulative_hazards = 0.0
        for src, src_t in timing[:idx]:
            cumulative_hazards += hazard(src_t, mat[src][node], t)
        likelihood *= cumulative_hazards
        # ... and survival against each earlier infection up to this time.
        for src, src_t in timing[:idx]:
            likelihood *= survival(src_t, mat[src][node], t)
    return likelihood


def hazard(start_time: float, rate: float, end_time: float) -> float:
    """Exponential hazard: density / survival (mathematically == rate)."""
    return probability(start_time, rate, end_time) / survival(start_time, rate, end_time)


def survival(start_time: float, rate: float, end_time: float) -> float:
    """Exponential survival function exp(-rate * (end - start))."""
    return exp(-rate * (end_time - start_time))


def probability(start_time: float, rate: float, end_time: float) -> float:
    """Exponential density rate * exp(-rate * (end - start))."""
    return rate * exp(-rate * (end_time - start_time))


def gradient(mat, cascades, freq_mat):
    """Gradient used by gradient_descent (mirrors the original formulation).

    Combines per-cascade survival/hazard terms with a motif regulariser
    term 1 / (freq_mat + 1).  Uses the module-global ``finish_time``.
    """
    n = len(mat)
    grad = np.zeros([n, n])
    for cascade in cascades:
        timing = cascade['timing']
        # Survival terms towards nodes that were never infected.
        for non_infected in cascade['not_infecteds']:
            for node, t in timing:
                grad[node][non_infected] -= finish_time - t
        for k_idx, (node_k, t_k) in enumerate(timing):
            earlier = timing[:k_idx]
            # sigma is invariant of the inner loop -- hoisted (the original
            # recomputed it for every j).
            sigma = sum(mat[node_l][node_k] for node_l, _ in earlier)
            for node_j, t_j in earlier:
                base = t_k - t_j
                # Assignment (not accumulation) kept from the original,
                # which marked it "# not sure".
                grad[node_j][node_k] = base - (1 / sigma) if sigma != 0 else base
    # Regularisation pulls towards edges supported by many motifs.
    grad += 1.0 / (np.asarray(freq_mat) + 1)
    return grad


def gradient_descent(mat, cascades, freq_mat, learning_rate=0.01):
    """Run projected gradient descent on the rate matrix ``mat``.

    Stops after ``gradient_descent_steps`` iterations, when the mean absolute
    per-cell change drops below ``gradient_descent_threshold``, or when the
    error jumps back up by more than ``min_error_dif_jump`` after the minimum
    number of steps.  Returns the normalised, non-negative matrix.

    (The original also computed an unused per-step cascade likelihood here;
    removed as dead work.)
    """
    errors = []
    step = 0
    while step < gradient_descent_steps:
        prev_mat = np.array(mat, copy=True)
        mat = mat - learning_rate * gradient(mat, cascades, freq_mat)
        mat = normalize(mat)
        # Project onto the feasible region: rates cannot be negative.
        mat = np.clip(mat, 0, None)
        error = float(np.abs(mat - prev_mat).mean())
        errors.append(error)
        if error < gradient_descent_threshold:
            break
        if step > min_gradient_descent_steps and error - errors[-2] > min_error_dif_jump:
            break
        step += 1
    print('gradient descent errors ranged from:\n', errors[0], 'to:', errors[-1])
    return mat


def read_result(name: str, num_of_nodes: int = 32):
    """Read a ground-truth adjacency matrix from ``name``.

    Each data line looks like ``i,j,...,weight``; lines without a '.' are
    skipped.  Returns a zero matrix when the file does not exist.
    """
    mat = np.zeros([num_of_nodes, num_of_nodes])
    if os.path.isfile('./{}'.format(name)):
        with open(name, 'r') as infile:
            for line in infile:
                if '.' not in line:
                    continue
                parts = line.split(',')
                mat[int(parts[0])][int(parts[1])] = float(parts[-1])
    return mat


def print_mat(mat, num_of_nodes=32):
    """Print the non-zero cells of ``mat`` as (i, j, value) triples."""
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            if mat[i][j] != 0:
                print((i, j, mat[i][j]), end=' ')
    print()


def new_print(mat):
    """Print the matrix one row per line."""
    for row in mat:
        print(row)


def round_up(mat):
    """Binarise ``mat``: strictly positive cells become 1., everything else 0."""
    return (np.asarray(mat) > 0).astype(float)


def diff_is_ignorable(a, b, lim):
    """True when |a - b| is strictly below ``lim``."""
    return abs(a - b) < lim


def diff_is_huge(a, b, lim):
    """True when |a - b| is strictly above ``lim``."""
    return abs(a - b) > lim


def accuracy(mat_guess, mat_answer, num_of_nodes, lim_huge=0.7, lim_ignore=0.1,
             lim_diff_zero=0.3, lim_diff_one=0.3):
    """Edge-level precision / recall / F-measure of ``mat_guess``.

    A cell is a true positive when it matches the answer (within
    ``lim_ignore``) and looks like a 1 (within ``lim_diff_one``); a true
    negative when it matches and looks like a 0; a false positive/negative
    when it is far from the answer (beyond ``lim_huge``) while looking like a
    1 / 0.  Cells in the grey zone between thresholds are not counted.

    Returns (None, None, None) when precision, recall or the F-measure is
    undefined (division by zero).
    """
    tp = fp = tn = fn = 0
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            guess, answer = mat_guess[i][j], mat_answer[i][j]
            matches = diff_is_ignorable(guess, answer, lim_ignore)
            far_off = diff_is_huge(guess, answer, lim_huge)
            looks_one = diff_is_ignorable(guess, 1, lim_diff_one)
            looks_zero = diff_is_ignorable(guess, 0, lim_diff_zero)
            if matches and looks_one:
                tp += 1
            elif matches and looks_zero:
                tn += 1
            elif far_off and looks_one:
                fp += 1
            elif far_off and looks_zero:
                fn += 1
    try:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f_measure = 2 * precision * recall / (precision + recall)
        return precision, recall, f_measure
    except ZeroDivisionError:
        print('error:', 'tp', tp, 'tn', tn, 'fp', fp, 'fn', fn)
        return None, None, None


def read_data(name, window_width, with_cascade_id, semicolon):
    """Read cascades and build the co-occurrence matrix.

    Expected layout: one line per node (anything neither blank nor containing
    ';' or '.') counting ``num_of_nodes``; then a blank line; then one cascade
    per line as alternating ``node,time`` values -- either comma-only,
    ';'-separated events, or prefixed with ``cascade_id;``.

    Sets the module globals ``num_of_nodes`` and ``finish_time``.  Returns
    (occurrence_matrix, cascades, window_width) where occurrence_matrix[a][b]
    counts how often b fired within ``window_width`` after a.
    """
    global num_of_nodes, finish_time
    num_of_nodes, finish_time = 0, 0.
    cascades = []
    # Re-created once the node-count header ends (blank line).  Bug fix: the
    # original left this unbound when the file was missing or had no blank
    # line, crashing at the return with UnboundLocalError.
    occurrence_matrix = np.zeros((0, 0))
    if os.path.isfile('./{}'.format(name)):
        with open(name, 'r') as infile:
            for line in infile:
                try:
                    if line.strip() == '':
                        occurrence_matrix = np.zeros((num_of_nodes, num_of_nodes))
                    elif ';' in line or '.' in line:
                        not_infecteds = list(range(num_of_nodes))
                        timing = []
                        if with_cascade_id:
                            # "id;n0,t0,n1,t1,..." -- this branch never
                            # updated the occurrence matrix in the original;
                            # kept as-is.
                            fields = line.strip().split(';')[1].split(',')
                            for pos in range(0, len(fields), 2):
                                event = (int(fields[pos]), float(fields[pos + 1]))
                                timing.append(event)
                                not_infecteds.remove(event[0])
                                if event[1] > finish_time:
                                    finish_time = event[1]
                        elif semicolon:
                            # "n0,t0;n1,t1;..."
                            for event_str in line.strip().split(';'):
                                pieces = event_str.split(',')
                                event = (int(pieces[0]), float(pieces[1]))
                                timing.append(event)
                                not_infecteds.remove(event[0])
                                if event[1] > finish_time:
                                    finish_time = event[1]
                                # Window counting (no early break in this
                                # branch in the original -- kept as-is).
                                for prev in timing[:-1][::-1]:
                                    if prev[1] < event[1] < prev[1] + window_width:
                                        occurrence_matrix[prev[0]][event[0]] += 1
                        else:
                            # "n0,t0,n1,t1,..."
                            fields = line.strip().split(',')
                            for pos in range(0, len(fields), 2):
                                event = (int(fields[pos]), float(fields[pos + 1]))
                                timing.append(event)
                                not_infecteds.remove(event[0])
                                if event[1] > finish_time:
                                    finish_time = event[1]
                                for prev in timing[:-1][::-1]:
                                    if prev[1] < event[1] < prev[1] + window_width:
                                        occurrence_matrix[prev[0]][event[0]] += 1
                                    else:
                                        # Events are time-sorted, so once one
                                        # falls outside the window the rest do.
                                        break
                        cascades.append({'timing': timing, 'not_infecteds': not_infecteds})
                    else:
                        num_of_nodes += 1
                except Exception as e:
                    # Best-effort parsing: report and skip the bad line.
                    print(e)
    return occurrence_matrix, cascades, window_width


def motif_aware_inference(name, result_name, semicolon, if_id, occur_type='simple',
                          algo_steps=100, min_algo_steps=10, algo_threshold=0.000001):
    """Infer a diffusion network from cascades, regularised by motif counts.

    Alternates gradient descent on the rate matrix with re-estimation of the
    motif count matrix until the mean per-cell change stabilises, then prints
    precision/recall/F-measure against ``result_name`` and returns the
    inferred matrix.  (``occur_type`` is accepted for compatibility but
    currently unused.)
    """
    global gradient_descent_threshold, min_gradient_descent_steps, gradient_descent_steps, min_error_dif_jump
    gradient_descent_steps = 100
    min_gradient_descent_steps = 10
    gradient_descent_threshold = 0.000001
    min_error_dif_jump = 0.0001
    occurrence_matrix, cascades, window_width = read_data(
        name=name, window_width=0.7, with_cascade_id=if_id, semicolon=semicolon)
    print('Occurrence Matrix:', occurrence_matrix)
    print('Num of Nodes:', num_of_nodes)
    print('Finish Time:', finish_time)
    print('Window Width:', window_width)
    mean = occurrence_matrix.mean()
    std = occurrence_matrix.std()
    print('Mean:', mean, 'Standard Deviation:', std)
    if std == 0:
        std = 1
    # Keep only pairwise influences above the mean: z-score, clip at zero,
    # then rescale to [0, 1].
    s = np.clip((occurrence_matrix - mean) / std, 0, None)
    s = normalize(s)
    print('Significant Pairwise Influences:\n', s)
    frequency, z = create_motif_count_matrix(s)
    print('motif frequency matrix:\n', z)
    print('Num of Motifs Seen:', frequency)
    mat = np.zeros([num_of_nodes, num_of_nodes])
    step = 0
    errors = []
    while step < algo_steps:
        print('\nstarted run:', step)
        prev_mat = np.array(mat, copy=True)
        mat = gradient_descent(mat, cascades, z)
        frequency, z = create_motif_count_matrix(mat)
        print('in run', step, 'result adjacency matrix:')
        print(mat)
        print('result motif frequency matrix:')
        print(z)
        step += 1
        error = float(np.abs(mat - prev_mat).mean())
        errors.append(error)
        if step > min_algo_steps and error < algo_threshold:
            break
        if step > min_algo_steps and error - errors[-2] > min_error_dif_jump:
            # The error started growing again: roll back to the previous matrix.
            mat = prev_mat
            break
    # Bug fix: the original printed errors[1] (second entry) instead of the last.
    print('\n\nerrors ranged from:', errors[0], 'to:', errors[-1])
    print('\nA:\n', mat)
    mean_m, std_m = mat.mean(), mat.std()
    if std_m == 0:
        # Bug fix: guard against division by zero on a constant matrix
        # (mirrors the guard used for the occurrence matrix above).
        std_m = 1
    mat = np.clip((mat - mean_m) / std_m, 0, None)
    mat = normalize(mat)
    res = read_result(result_name, num_of_nodes=num_of_nodes)
    print('ground truth result:\n', res)
    pr, rec, f1 = accuracy(mat, res, num_of_nodes=num_of_nodes)
    print('precision', pr, 'recall', rec, 'f-score', f1)
    mat_round = round_up(mat)
    print('rounded up matrix\n', mat_round)
    res_round = round_up(res)
    print('rounded up ground truth\n', res_round)
    pr_round, rec_round, f1_round = accuracy(mat_round, res_round, num_of_nodes=num_of_nodes)
    print('precision', pr_round, 'recall', rec_round, 'f-score', f1_round)
    return mat


if __name__ == '__main__':
    # Guarded so the module can be imported without running the inference.
    mat = motif_aware_inference(name='cascades3.txt', result_name='network3.txt',
                                semicolon=False, if_id=False)