farhadi
/
motif-aware-network-inference


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
							import os
import numpy as np
from numpy import exp
from math import log

# z motif count matrix
def create_motif_count_matrix(mat):
    z = np.zeros([num_of_nodes, num_of_nodes])
    freq = 0
    motif_count_vector = [set() for i in range(7)]
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            if mat[i][j] != 0:
                for k in range(num_of_nodes):
                    if mat[j][k] != 0 and mat[k][i] != 0:
                        motif_count_vector[0].add(repr(sorted([(i, j), (j, k), (k, i)])))
                    if mat[j][k] != 0 and mat[k][i] != 0 and mat[k][j] != 0:
                        motif_count_vector[1].add(repr(sorted([(i, j), (j, k), (k, i), (k, j)])))
                    if mat[j][k] != 0 and mat[k][i] != 0 and mat[j][i] != 0 and mat[i][k] != 0:
                        motif_count_vector[2].add(repr(sorted([(i, j), (j, k), (k, i), (j, i), (i, k)])))
                    if mat[j][i] != 0 and mat[j][k] != 0 and mat[k][j] != 0 and mat[k][i] != 0 and mat[i][k] != 0:
                        motif_count_vector[3].add(repr(sorted([(i, j), (j, k), (k, i), (j, i), (k, j), (i, k)])))
                    if mat[k][j] != 0 and mat[i][k] != 0:
                        motif_count_vector[4].add(repr(sorted([(i, j), (k, j), (i, k)])))
                    if mat[i][k] != 0 and mat[j][k] != 0 and mat[k][j] != 0:
                        motif_count_vector[5].add(repr(sorted([(i, j), (i, k), (j, k), (k, j)])))
                    if mat[i][k] != 0 and mat[k][i] != 0 and mat[k][j] != 0:
                        motif_count_vector[6].add(repr(sorted([(i, j), (i, k), (k, i), (k, j)])))
                    # if mat[j][k] != 0 and mat[k][i] != 0:
                    #     motif_count_vector[0].add(repr({(i, j), (j, k), (k, i)}))
                    #     freq = adjust_motif_count(z, i, j, k, freq, 0)
                    # if mat[j][k] != 0 and mat[k][i] != 0 and (mat[j][i] != 0 or mat[k][j] != 0 or mat[i][k] != 0):
                    #     motif_count_vector[1].add(repr({}))
                    #     freq = adjust_motif_count(z, i, j, k, freq, 1)
                    # if mat[j][k] != 0 and mat[k][i] != 0 and \
                    #         ((mat[j][i] != 0 and mat[k][j] != 0) or (mat[j][i] != 0 and mat[i][k] != 0) or (
                    #                 mat[k][j] != 0 and mat[i][k] != 0)):
                    #     freq = adjust_motif_count(z, i, j, k, freq, 1)
                    # if mat[j][i] != 0 and mat[j][k] != 0 and mat[k][j] != 0 and mat[k][i] != 0 and mat[i][k] != 0:
                    #     freq = adjust_motif_count(z, i, j, k, freq, 0)
                    # if (mat[i][k] != 0 and mat[k][j] != 0) or \
                    #         (mat[i][k] != 0 and mat[j][k] != 0) or (mat[k][i] != 0 and mat[k][j] != 0):
                    #     freq = adjust_motif_count(z, i, j, k, freq, 1)
                    # if (mat[i][k] != 0 and mat[k][j] != 0 and mat[j][k] != 0) or \
                    #         (mat[j][i] != 0 and mat[k][i] != 0 and mat[k][j] != 0):
                    #     freq = adjust_motif_count(z, i, j, k, freq, 2)
                    # if (mat[i][k] != 0 and mat[k][i] != 0 and mat[k][j] != 0) or \
                    #         (mat[j][i] != 0 and mat[j][k] != 0 and mat[i][k] != 0):
                    #     freq = adjust_motif_count(z, i, j, k, freq, 2)
    for motif_vector in motif_count_vector:
        for x in motif_vector:
            for (i, j) in eval(x):
                z[i][j] += 1
    return freq, z


# def adjust_motif_count(z, i, j, k, freq, motif_type):
# if motif_type == 0:
#     freq += 1 / 3
#     z[i][j] += 1 / 3
#     z[j][k] += 1 / 3
#     z[k][i] += 1 / 3
# elif motif_type == 1:
#     freq += 1
#     z[i][j] += 1
#     z[j][k] += 1
#     z[k][i] += 1
# elif motif_type == 2:
#     freq += 1 / 2
#     z[i][j] += 1 / 2
#     z[j][k] += 1 / 2
#     z[k][i] += 1 / 2
# return freq


# def check_motif():
#     arr = np.array([])
#     for i in range(10):
#         rand = np.repeat(s, 1, axis=1)
#         np.random.shuffle(rand)
#         f = check_exist(rand)
#         print(f)
#         arr += [f]
#     print(arr.mean(), arr.std(), (freq[0] - arr.mean()) / arr.std())


def normalize(mat):
    s = np.amax(mat)
    return np.vectorize(lambda val: val / s)(np.repeat(mat, 1, axis=1))


def find_r(mat, freq_mat):
    global num_of_nodes, finish_time
    summation = 0.
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            summation += abs(mat[i][j] / (freq_mat[i][j] + 1))
    return summation


# this assumes that cascades are sorted by time
def likelihood_cascade(cascade, mat):
    likelihood = 1.
    for i in range(len(cascade['timing'])):
        tup = cascade['timing'][i]
        # not infecteds from him:
        for m in cascade['not_infecteds']:
            likelihood *= survival(tup[1], mat[tup[0]][m], finish_time)
        # nodes he might be infected from
        cumulative_hazards = 0.
        for infected in cascade['timing'][:i]:
            cumulative_hazards += hazard(infected[1], mat[infected[0]][tup[0]], tup[1])
        likelihood *= cumulative_hazards
        for infected in cascade['timing'][:i]:
            likelihood *= survival(infected[1], mat[infected[0]][tup[0]], tup[1])
    return likelihood


def hazard(start_time: float, rate: float, end_time: float) -> float:
    return probability(start_time, rate, end_time) / survival(start_time, rate, end_time)


def survival(start_time: float, rate: float, end_time: float) -> float:
    x = exp(-rate * (end_time - start_time))
    return x


def probability(start_time: float, rate: float, end_time: float) -> float:
    return rate * exp(-rate * (end_time - start_time))


def gradient(mat, cascades, freq_mat):
    gradient = np.zeros([num_of_nodes, num_of_nodes])
    for cascade in cascades:
        for non_infected in cascade['not_infecteds']:
            for infected in cascade['timing']:
                gr = finish_time - infected[1]
                gradient[infected[0]][non_infected] -= gr
        for k in range(len(cascade['timing'])):
            infected_k = cascade['timing'][k]
            for infected_j in cascade['timing'][:k]:
                sigma = 0
                for infected_l in cascade['timing'][:k]:
                    sigma += mat[infected_l[0]][infected_k[0]]
                gr = (infected_k[1] - infected_j[1]) - (1 / sigma) if sigma != 0 else infected_k[1] - infected_j[1]

                gradient[infected_j[0]][infected_k[0]] = gr
    # not sure
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            gradient[i][j] += 1 / (freq_mat[i][j] + 1)
    # print('gradient:\n', gradient)
    return gradient


def gradient_descent(mat, cascades, freq_mat, learning_rate=0.01):
    errors = []
    ii = 0
    while ii < gradient_descent_steps:
        prev_mat = np.repeat(mat, 1, axis=1)
        mat -= learning_rate * gradient(mat, cascades, freq_mat)
        mat = normalize(mat)
        likelihood = 1.
        for casc in cascades:
            likelihood *= likelihood_cascade(casc, mat)
        # print('likelihood:', likelihood - find_r(mat, freq_mat))
        for i in range(num_of_nodes):
            for j in range(num_of_nodes):
                mat[i][j] = 0 if mat[i][j] < 0 else mat[i][j]
        error = 0
        for j in range(num_of_nodes):
            for k in range(num_of_nodes):
                # if mat[j][k] - prev_mat[j][k] != 0:
                #     print('different values seen in:', j, k, 'with value:', prev_mat[j][k], 'becoming:', mat[j][k])
                error += abs(mat[j][k] - prev_mat[j][k])
        error /= (num_of_nodes ** 2)
        errors += [error]
        # print('gradient descent error in step', ii, ' :', error)
        if error < gradient_descent_threshold:
            break
        if ii > min_gradient_descent_steps and error - errors[-2] > min_error_dif_jump:
            break
        ii += 1
    print('gradient descent errors ranged from:\n', errors[0], 'to:', errors[-1])

    return mat


def read_result(name: str, num_of_nodes: int = 32):
    mat = np.zeros([num_of_nodes, num_of_nodes])
    if os.path.isfile('./{}'.format(name)):
        with open(name, 'r') as outfile:
            for line in outfile:
                if '.' in line:
                    stripped = line.split(',')
                    i = int(stripped[0])
                    j = int(stripped[1])
                    mat[i][j] = float(stripped[-1])
    return mat


def print_mat(mat, num_of_nodes=32):
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            if mat[i][j] != 0:
                print((i, j, mat[i][j]), end=' ')
        print()


def new_print(mat):
    for x in mat:
        print(x)


def round_up(mat):
    return np.vectorize(lambda val: 1. if val > 0. else 0.)(np.repeat(mat, 1, axis=1))


def diff_is_ignorable(a, b, lim):
    return abs(a - b) < lim


def diff_is_huge(a, b, lim):
    return abs(a - b) > lim


def accuracy(mat_guess, mat_answer, num_of_nodes, lim_huge=0.7, lim_ignore=0.1, lim_diff_zero=0.3, lim_diff_one=0.3):
    tp, fp, tn, fn = 0, 0, 0, 0
    for i in range(num_of_nodes):
        for j in range(num_of_nodes):
            if diff_is_ignorable(mat_guess[i][j], mat_answer[i][j], lim_ignore) and diff_is_ignorable(mat_guess[i][j],
                                                                                                      1, lim_diff_one):
                tp += 1
            elif diff_is_ignorable(mat_guess[i][j], mat_answer[i][j], lim_ignore) and diff_is_ignorable(mat_guess[i][j],
                                                                                                        0,
                                                                                                        lim_diff_zero):
                tn += 1
            elif diff_is_huge(mat_guess[i][j], mat_answer[i][j], lim_huge) and diff_is_ignorable(mat_guess[i][j], 1,
                                                                                                 lim_diff_one):
                fp += 1
            elif diff_is_huge(mat_guess[i][j], mat_answer[i][j], lim_huge) and diff_is_ignorable(mat_guess[i][j], 0,
                                                                                                 lim_diff_zero):
                fn += 1
    try:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f_measure = 2 * precision * recall / (precision + recall)
        return precision, recall, f_measure
    except Exception as e:
        print('error:', 'tp', tp, 'tn', tn, 'fp', fp, 'fn', fn)
    return None, None, None


# reading data and creating occurrence matrix
def read_data(name, window_width, with_cascade_id, semicolon):
    global num_of_nodes, finish_time
    num_of_nodes, finish_time = 0, 0.
    cascades = []
    if os.path.isfile('./{}'.format(name)):
        with open(name, 'r') as outfile:
            for line in outfile:
                try:
                    if line.strip() == '':
                        occurrence_matrix = np.zeros((num_of_nodes, num_of_nodes))
                    elif ';' in line or '.' in line:
                        cascade_not_infecteds = list(range(num_of_nodes))
                        cascade_timing = []
                        if with_cascade_id:
                            stripped = line.strip().split(';')[1].split(',')
                            for i in range(len(stripped)):
                                if i % 2 == 0:
                                    cascade_timing += [(int(stripped[i]), float(stripped[i + 1]))]
                                    cascade_not_infecteds.remove(cascade_timing[-1][0])
                                    if cascade_timing[-1][1] > finish_time:
                                        finish_time = cascade_timing[-1][1]
                        else:
                            if semicolon:
                                stripped = line.strip().split(';')
                                for event_str in stripped:
                                    event = (int(event_str.split(',')[0]), float(event_str.split(',')[1]))
                                    cascade_timing += [event]
                                    cascade_not_infecteds.remove(event[0])
                                    if cascade_timing[-1][1] > finish_time:
                                        finish_time = cascade_timing[-1][1]
                                    for previous_event in cascade_timing[:-1][::-1]:
                                        if previous_event[1] < event[1] < previous_event[1] + window_width:
                                            occurrence_matrix[previous_event[0]][event[0]] += 1
                            else:
                                stripped = line.strip().split(',')
                                for i in range(len(stripped)):
                                    if i % 2 == 0:
                                        event = (int(stripped[i]), float(stripped[i + 1]))
                                        cascade_timing += [event]
                                        cascade_not_infecteds.remove(event[0])
                                        if cascade_timing[-1][1] > finish_time:
                                            finish_time = cascade_timing[-1][1]
                                        for previous_event in cascade_timing[:-1][::-1]:
                                            if previous_event[1] < event[1] < previous_event[1] + window_width:
                                                occurrence_matrix[previous_event[0]][event[0]] += 1
                                                # event[0]] += 1 if occur_type == 'simple' else np.random.exponential(
                                                # event[1] - previous_event[1]) if occur_type == 'exp' else np.random.rayleigh(
                                                # event[1] - previous_event[1])
                                            else:
                                                break
                        cascade = {'timing': cascade_timing, 'not_infecteds': cascade_not_infecteds}
                        cascades += [cascade]
                    else:
                        num_of_nodes += 1
                except Exception as e:
                    print(e)
            return occurrence_matrix, cascades, window_width


def motif_aware_inference(name, result_name, semicolon, if_id, occur_type='simple', algo_steps=100, min_algo_steps=10,
                          algo_threshold=0.000001):
    global gradient_descent_threshold, min_gradient_descent_steps, gradient_descent_steps, min_error_dif_jump
    gradient_descent_steps = 100
    min_gradient_descent_steps = 10
    gradient_descent_threshold = 0.000001
    min_error_dif_jump = 0.0001
    # np.set_printoptions(edgeitems=6)
    # np.core.arrayprint._line_width = 1000000
    occurrence_matrix, cascades, window_width = read_data(name=name, window_width=0.7, with_cascade_id=if_id,
                                                          semicolon=semicolon)
    # print('Cascades:\n', cascades)
    print('Occurrence Matrix:', occurrence_matrix)
    print('Num of Nodes:', num_of_nodes)
    print('Finish Time:', finish_time)
    print('Window Width:', window_width)
    mean = occurrence_matrix.mean()
    std = occurrence_matrix.std()
    print('Mean:', mean, 'Standard Deviation:', std)
    if std == 0:
        std = 1

    # filter out
    # s: significant pairwise influences
    s = np.vectorize(lambda val: 0. if (val - mean) / std < 0 else (val - mean) / std)(
        np.repeat(occurrence_matrix, 1, axis=1))
    s = normalize(s)
    print('Significant Pairwise Influences:\n', s)
    frequency, z = create_motif_count_matrix(s)
    print('motif frequency matrix:\n', z)
    print('Num of Motifs Seen:', frequency)
    mat = np.zeros([num_of_nodes, num_of_nodes])
    i = 0
    errors = []
    while i < algo_steps:
        mats = []
        print('\nstarted run:', i)
        prev_mat = np.repeat(mat, 1, axis=1)
        mat = gradient_descent(mat, cascades, z)
        frequency, z = create_motif_count_matrix(mat)

        print('in run', i, 'result adjacency matrix:')
        print(mat)
        mats += [mat]
        print('result motif frequency matrix:')
        print(z)

        # for xy in l:
        #     print('important edge:', xy[0], xy[1], mat[xy[0]][xy[1]])
        # print('error in step', i, ' :', error)

        i += 1
        error = 0
        for j in range(num_of_nodes):
            for k in range(num_of_nodes):
                error += abs(mat[j][k] - prev_mat[j][k])
        error /= (num_of_nodes ** 2)
        errors += [error]

        if i > min_algo_steps and error < algo_threshold:
            break
        if i > min_algo_steps and error - errors[-2] > min_error_dif_jump:
            mat = prev_mat
            break
    print('\n\nerrors ranged from:', errors[0], 'to:', errors[1])
    print('\nA:\n', mat)
    mean_m, std_m = mat.mean(), mat.std()
    mat = np.vectorize(lambda val: 0. if (val - mean_m) / std_m < 0 else (val - mean_m) / std_m)(mat)
    mat = normalize(mat)

    res = read_result(result_name, num_of_nodes=num_of_nodes)
    print('ground truth result:\n', res)
    pr, rec, f1 = accuracy(mat, res, num_of_nodes=num_of_nodes)
    print('precision', pr, 'recall', rec, 'f-score', f1)

    mat_round = round_up(mat)
    print('rounded up matrix\n', mat_round)
    res_round = round_up(res)
    print('rounded up ground truth\n', res_round)
    pr_round, rec_round, f1_round = accuracy(mat_round, res_round, num_of_nodes=num_of_nodes)
    print('precision', pr_round, 'recall', rec_round, 'f-score', f1_round)

    # print(util.print_mat(mat, num_of_nodes))

    # mat = np.zeros([4, 4])
    # mat[0][1] = 1
    # mat[1][2] = 1
    # mat[1][3] = 1
    # mat[2][3] = 1
    # mat[3][2] = 1
    # print(likelihood_cascade(cascades[0], mat))
    return mat


mat = motif_aware_inference(name='cascades3.txt', result_name='network3.txt', semicolon=False, if_id=False)