2 years ago · fb0c5820c7
--- a/data_loader.py
+++ b/data_loader.py
@@ -0,0 +1,120 @@
 import scipy.io as sio
 import numpy as np
 import torch

 class DataLoader:
    """Load a pre-processed dynamic graph from a ``.mat`` file and split it by time.

    The ``.mat`` file must contain ``A_labels_subs`` / ``A_labels_vals`` (indices
    and values of the edge-label tensor) and ``C_subs`` / ``C_vals`` (indices and
    values of the transformed adjacency tensor).  Row 0 of each index array is
    the time index; rows 1 and 2 are node indices.
    """

    def __init__(self, S_train, S_val, S_test, data_dir, mat_file_path):
        """Read the ``.mat`` file and build the sparse tensors and node features.

        Args:
            S_train: number of leading time steps used for training.
            S_val: number of time steps after the training range used for validation.
            S_test: number of time steps after the validation range used for testing.
            data_dir: directory prefix, concatenated verbatim with ``mat_file_path``.
            mat_file_path: file name appended to ``data_dir``.
        """
        self.S_train = S_train
        self.S_val = S_val
        self.S_test = S_test
        self.data_root_dir = data_dir
        self.mat_file_path = mat_file_path

        self.saved_content = sio.loadmat(self.data_root_dir + self.mat_file_path)
        # Indices are zero-based, so the extent along each axis is max index + 1.
        self.T = np.max(self.saved_content["A_labels_subs"][0, :]) + 1
        print("Number of total graphs: {}".format(self.T))
        self.N = max(np.max(self.saved_content["A_labels_subs"][1, :]),
                     np.max(self.saved_content["A_labels_subs"][2, :])) + 1
        print("Number of nodes in each graph: {}".format(self.N))

        self.A_size = torch.Size([self.T, self.N, self.N])  # size of the adjacency matrix
        self.C_size = torch.Size([self.T, self.N, self.N])  # similar to the adjacency matrix, C tensor has TxNxN shape
        # labels of the edges
        self.A_labels = torch.sparse.FloatTensor(torch.tensor(self.saved_content["A_labels_subs"], dtype=torch.long),
                                                 torch.squeeze(torch.tensor(self.saved_content["A_labels_vals"])),
                                                 self.A_size).coalesce()

        # Laplacian Transformed of adjacency matrix
        self.C = torch.sparse.FloatTensor(torch.tensor(self.saved_content["C_subs"], dtype=torch.long),
                                          torch.squeeze(torch.tensor(self.saved_content["C_vals"])),
                                          self.C_size).coalesce()

        # adjacency matrix: same sparsity pattern as the labels, every stored value set to 1
        self.A = torch.sparse.FloatTensor(self.A_labels._indices(),
                                          torch.ones(self.A_labels._values().shape),
                                          self.A_size).coalesce()
        # create node features
        self.X = self.create_node_features()

    def _graphs_in_range(self, start, stop):
        """Return one 2-D sparse matrix per time index in ``[start, stop)``, taken from ``self.C``."""
        time_index = self.C._indices()[0]
        graphs = []
        for j in range(start, stop):
            mask = time_index == j
            # No explicit size is passed, so the matrix size is inferred from the indices.
            graphs.append(torch.sparse.FloatTensor(self.C._indices()[1:3, mask],
                                                   self.C._values()[mask]))
        return graphs

    def split_data(self):
        """Split ``self.C`` and ``self.X`` into train / validation / test parts.

        Returns:
            ``(data, C)`` where ``data`` maps ``'X_train'``, ``'X_val'``, ``'X_test'``
            to double-precision feature tensors and ``C`` maps ``'C_train'``,
            ``'C_val'``, ``'C_test'`` to lists of per-time-step sparse matrices.
        """
        val_start = self.S_train
        # The test range starts right after the validation range.  The previous
        # code started it at S_train + S_test, which selected the wrong graphs
        # (and the wrong number of them) whenever S_val != S_test.
        test_start = self.S_train + self.S_val
        test_stop = test_start + self.S_test

        C = {'C_train': self._graphs_in_range(0, val_start),
             'C_val': self._graphs_in_range(val_start, test_start),
             'C_test': self._graphs_in_range(test_start, test_stop)}

        data = {'X_train': self.X[0:val_start].double(),
                'X_val': self.X[val_start:test_start].double(),
                # Unlike C_test, the feature slice runs to the last time step.
                'X_test': self.X[test_start:].double()}

        return data, C

    def get_edges_and_labels(self):
        """Return binary edge targets and edge indices for each split.

        A target is 0 where the sign of the edge label is -1 and 1 otherwise.
        Time indices of the validation and test edges are shifted so that each
        split starts at 0.

        Returns:
            ``(targets, edges)``: dictionaries keyed ``'target_*'`` / ``'edges_*'``
            for ``train``, ``val`` and ``test``.
        """
        # training
        subs_train = self.A_labels._indices()[0] < self.S_train
        edges_train = self.A_labels._indices()[:, subs_train]
        labels_train = torch.sign(self.A_labels._values()[subs_train])
        target_train = (labels_train != -1).long()    # element = 0 if class = -1; and 1 if class is 0 or +1

        # validation
        subs_val = (self.A_labels._indices()[0] >= self.S_train) & (self.A_labels._indices()[0] < self.S_train + self.S_val)
        # Boolean-mask indexing returns a copy, so the in-place shift below
        # does not modify self.A_labels.
        edges_val = self.A_labels._indices()[:, subs_val]
        edges_val[0] -= self.S_train
        labels_val = torch.sign(self.A_labels._values()[subs_val])
        target_val = (labels_val != -1).long()

        # Testing: every edge from the end of the validation range onwards
        subs_test = (self.A_labels._indices()[0] >= self.S_train + self.S_val)
        edges_test = self.A_labels._indices()[:, subs_test]
        edges_test[0] -= (self.S_train + self.S_val)
        labels_test = torch.sign(self.A_labels._values()[subs_test])
        target_test = (labels_test != -1).long()

        targets = {'target_train': target_train,
                   'target_val': target_val,
                   'target_test': target_test}

        edges = {'edges_train': edges_train,
                 'edges_val': edges_val,
                 'edges_test': edges_test}

        return targets, edges

    def create_node_features(self):
        """Build a ``(T, N, 2)`` feature tensor from sums of ``self.A`` over each node axis."""
        X = torch.zeros(self.A.shape[0], self.A.shape[1], 2)
        # Feature 0: sum over dim 1 (labelled "number of outgoing edges" by the original author).
        X[:, :, 0] = torch.sparse.sum(self.A, 1).to_dense()
        # Feature 1: sum over dim 2 (labelled "number of in coming edges" by the original author).
        X[:, :, 1] = torch.sparse.sum(self.A, 2).to_dense()

        return X

    def load_data(self):
        """Return ``(data, C, targets, edges)`` from ``split_data`` and ``get_edges_and_labels``."""
        print("Loading the data...")
        data, C = self.split_data()
        targets, edges = self.get_edges_and_labels()
        print("======================")
        return data, C, targets, edges



--- a/read_data.py
+++ b/read_data.py
@@ -0,0 +1,291 @@
 import numpy as np
 import math
 import torch
 import scipy.io as sio
 import argparse

 # Command-line interface of the pre-processing script.
 parser = argparse.ArgumentParser()
 parser.add_argument('--data_path', type=str, help='path to the root of data directory')
 parser.add_argument('--dataset', type=str, help='dataset name')
 # Three integers: number of train, validation and test time slices.
 # ``type=list`` would split the raw argument string into single characters
 # (e.g. "95" -> ['9', '5']), which breaks the integer arithmetic done on
 # these values further down, so parse three ints explicitly instead.
 parser.add_argument('--time_partitioning', type=int, nargs=3, required=True,
                     metavar=('TRAIN', 'VAL', 'TEST'),
                     help='time partitioning for train/test/val split')

 args = parser.parse_args()

 #Settings
 edge_life = True          # accumulate edges over a sliding window of time slices
 edge_life_window = 10     # width of that window, in time slices
 no_diag = 20              # number of diagonals filled in the M matrix
 make_symmetric = True     # symmetrise every adjacency slice before further processing
 print(args.dataset)

 def print_tensor(A, str):
    """Print a labelled dump of sparse tensor ``A`` and the sum of its stored values.

    Args:
        A: sparse tensor; only ``A._values()`` is read besides printing ``A`` itself.
        str: label printed above the tensor.
    """
    rule = '------------------------'
    stored_sum = torch.sum(A._values())
    for item in (rule, str, A, stored_sum, rule):
        print(item)

 # Per-dataset loading options and output location.  This must be a single
 # if/elif chain: with independent ``if`` statements the trailing ``else``
 # also fired for the Bitcoin datasets and printed 'Invalid dataset'.
 if args.dataset == 'Bitcoin Alpha':
    data = np.loadtxt(args.data_path, delimiter=',')
    save_file_location = 'data/Bitcoin_Alpha/'
    save_file_name = 'saved_content_bitcoin_alpha.mat'
    time_delta = 60*60*24*14 # 2 weeks
 elif args.dataset == 'Bitcoin OTC':
    data = np.loadtxt(args.data_path, delimiter=',')
    save_file_location = 'data/Bitcoin_OTC/'
    save_file_name = 'saved_content_bitcoin_otc.mat'
    time_delta = 60*60*24*14 # 2 weeks
 elif args.dataset == 'Reddit':
    data = np.loadtxt(args.data_path)
    save_file_location = 'data/reddit/'
    save_file_name = 'saved_content_reddit.mat'
    time_delta = 60 * 60 * 24 * 14  # 2 weeks
 elif args.dataset == 'Chess':
    data = np.loadtxt(args.data_path, delimiter=',', skiprows=1)
    #data = readmatrix('./data/chess/out.chess.csv')
    # NOTE(review): machine-specific absolute path, unlike the relative paths
    # used for the other datasets -- confirm before running elsewhere.
    save_file_location = '/home/shivmaran/Desktop/Tensor-GCN/data/chess/'
    save_file_name = 'saved_content_python_chess.mat'
    time_delta = 60*60*24*31 # 31 days
 else:
    print('Invalid dataset')
    # Stop here: nothing below can run without ``data`` and the settings above.
    raise SystemExit(1)

 # Train / validation / test sizes, identical handling for every dataset.
 time_slices = args.time_partitioning
 no_train_samples, no_val_samples, no_test_samples = time_slices[0], time_slices[1], time_slices[2]

 # Convert the loaded array to a tensor.  From the way the columns are used below:
 # columns 0 and 1 hold node ids, column 2 the edge label and column 3 the timestamp.
 data = torch.tensor(data)
 # Create full tensor
 if args.dataset == 'Chess':
    # Chess: one time slice per distinct value in the timestamp column.
    dates = np.unique(data[:,3])
    no_time_slices = len(dates)
 else:
    # Other datasets: number of whole time_delta-wide windows spanned by the timestamps.
    no_time_slices = math.floor((max(data[:,3]) - min(data[:,3]))/time_delta)

 # N is the largest node id; ids are shifted down by one when used as indices below
 # (presumably the raw ids are 1-based -- confirm against the data files).
 N = int(max(max(data[:,0]), max(data[:,1])))
 T = int(no_train_samples)
 TT = int(no_time_slices)

 #Create M
 # M is a T x T matrix with ones on the main diagonal and on the sub-diagonals
 # below it (up to no_diag diagonals in total); each row is then divided by its sum.
 M = np.zeros((T,T))
 for i in range(no_diag):
    # A is a view into M whose main diagonal is M's i-th sub-diagonal,
    # so filling it writes directly into M.
    A = M[i:, :T-i]
    np.fill_diagonal(A, 1)
 L = np.sum(M, axis=1)
 M = M/L[:,None]
 M = torch.tensor(M) 

 #Create A and A_labels
 if not args.dataset == 'Chess':
    # Drop rows whose timestamp falls after the last whole time window.
    data = data[data[:,3] < min(data[:,3])+TT*time_delta]
    start_time = min(data[:,3]);

 # One row per edge: (time slice, source index, target index), a unit value
 # for the adjacency tensor and the edge label for the label tensor.
 tensor_idx = torch.zeros([data.size()[0], 3], dtype=torch.long)
 tensor_val = torch.ones([data.size()[0]], dtype=torch.double)
 tensor_labels = torch.zeros([data.size()[0]], dtype=torch.double)

 for t in range(TT):
    if args.dataset == 'Chess':
        idx = data[:,3] == dates[t]
    else:
        # Half-open window [start_time, end_time); advanced by time_delta each iteration.
        end_time = start_time + time_delta
        idx = (data[:, 3] >= start_time) & (data[:, 3] < end_time)
        start_time = end_time
    
    # Node ids are shifted down by one to become tensor indices.
    tensor_idx[idx, 1:3] = (data[idx, 0:2] - 1).type('torch.LongTensor')
    tensor_idx[idx, 0] = t
    tensor_labels[idx] = data[idx, 2].type('torch.DoubleTensor')

 # NOTE: the name A (used above for a view into M) is rebound here to the adjacency tensor.
 # coalesce() sums the values of repeated (t, i, j) entries.
 A =  torch.sparse.DoubleTensor(tensor_idx.transpose(1,0), tensor_val, torch.Size([TT, N, N])).coalesce()
 A_labels = torch.sparse.DoubleTensor(tensor_idx.transpose(1,0), tensor_labels, torch.Size([TT, N, N])).coalesce()

 def func_make_symmetric(sparse_tensor, N, TT):
    """Symmetrise every time slice of a sparse ``TT x N x N`` tensor.

    Each slice ``S`` is replaced by ``(S + S^T) / 2``.

    Args:
        sparse_tensor: sparse tensor whose first index is the time slice.
        N: number of nodes (size of each slice).
        TT: number of time slices.

    Returns:
        A coalesced double-precision sparse tensor of size ``[TT, N, N]``.
    """
    A_idx = sparse_tensor._indices()
    A_val = sparse_tensor._values()
    # Collect the per-slice pieces and concatenate once at the end instead of
    # growing the result with torch.cat on every iteration.
    idx_parts = []
    val_parts = []
    for j in range(TT):
        idx = A_idx[0] == j
        mat = torch.sparse_coo_tensor(A_idx[1:3, idx], A_val[idx], torch.Size([N, N]))
        sym_mat = (mat + mat.transpose(1, 0)) / 2
        # Prepend the time index j to every (row, col) pair of this slice.
        time = torch.full((1, sym_mat._nnz()), j, dtype=torch.long)
        idx_parts.append(torch.cat((time, sym_mat._indices()), 0))
        val_parts.append(sym_mat._values())
    if idx_parts:
        tensor_idx = torch.cat(idx_parts, 1)
        tensor_val = torch.cat(val_parts, 0)
    else:
        # TT == 0: return an empty tensor of the requested size.
        tensor_idx = torch.zeros((3, 0), dtype=torch.long)
        tensor_val = torch.zeros(0, dtype=torch.double)
    # coalesce() merges duplicate coordinates by summing their values.
    return torch.sparse_coo_tensor(tensor_idx, tensor_val, torch.Size([TT, N, N]),
                                   dtype=torch.double).coalesce()

 # B is the symmetrised adjacency tensor when make_symmetric is set, otherwise A itself.
 B = func_make_symmetric(A, N, TT) if make_symmetric else A

 def func_edge_life(A, N, TT):
    """Accumulate edges over a trailing window of time slices.

    Slice ``t`` of the result is the sum of slices
    ``max(0, t - edge_life_window + 1) .. t`` of ``A`` (``edge_life_window`` is a
    module-level setting).

    Args:
        A: sparse ``TT x N x N`` tensor whose first index is the time slice.
        N: number of nodes.
        TT: number of time slices.

    Returns:
        The accumulated tensor, coalesced.
    """
    slice_of_entry = A._indices()[0]
    # Start from a copy of A with all stored values zeroed.
    accumulated = A.clone()
    accumulated._values()[:] = 0
    for t in range(TT):
        window_start = max(0, t - edge_life_window + 1)
        in_window = (slice_of_entry >= window_start) & (slice_of_entry <= t)
        block = torch.sparse.DoubleTensor(A._indices()[0:3, in_window], A._values()[in_window], torch.Size([TT, N, N]))
        # Relabel every selected entry as belonging to slice t.
        block._indices()[0] = t
        accumulated = accumulated + block
    return accumulated.coalesce()

 # Apply the edge-life accumulation only when the edge_life setting is enabled.
 B = func_edge_life(B, N, TT) if edge_life else B

 def func_laplacian_transformation(B, N, TT):
    """Return ``D^{-1/2} (B + I) D^{-1/2}`` for every time slice of ``B``.

    ``I`` is the identity added to each slice and ``D`` is the diagonal matrix
    of row sums of ``B + I`` for that slice.

    Args:
        B: sparse double tensor of size ``[TT, N, N]``.
        N: number of nodes.
        TT: number of time slices.

    Returns:
        A coalesced sparse tensor of size ``[TT, N, N]``.
    """
    size = torch.Size([TT, N, N])
    # Sparse identity for all slices at once: entries (t, n, n) = 1.
    node = torch.arange(N, dtype=torch.long).repeat(TT)
    time = torch.arange(TT, dtype=torch.long).unsqueeze(1).expand(TT, N).reshape(-1)
    I = torch.sparse_coo_tensor(torch.stack((time, node, node)),
                                torch.ones(TT * N, dtype=torch.double), size)
    C = (B + I).coalesce()
    idx = C._indices()
    # Row sums of every slice, shape (TT, N); same quantity as slice @ ones.
    row_sum = torch.sparse.sum(C, dim=2).to_dense()
    inv_sqrt_degree = 1 / torch.sqrt(row_sum)
    # Scale entry (t, i, j) by d[t, i]^-1/2 * d[t, j]^-1/2 in one vectorised step
    # instead of a Python loop over every stored entry.
    values = C._values() * inv_sqrt_degree[idx[0], idx[1]] * inv_sqrt_degree[idx[0], idx[2]]
    return torch.sparse_coo_tensor(idx, values, size).coalesce()

 # Normalised tensor, plus a copy Ct that is large enough along the time axis
 # to hold train + validation + test slices.
 C = func_laplacian_transformation(B, N, TT)
 Ct = C.clone().coalesce()
 slices_needed = T + no_val_samples + no_test_samples
 if TT >= slices_needed:
    TTT = TT
 else:
    # Fewer slices than the splits require: keep the entries, enlarge the time dimension.
    TTT = slices_needed
    Ct = torch.sparse.DoubleTensor(Ct._indices(), Ct._values(), torch.Size([TTT, N, N])).coalesce()

 def func_create_sparse(A, N, TTT, T, start, end):
    """Extract time slices ``[start, end)`` of ``A`` as a new ``T x N x N`` sparse tensor.

    Args:
        A: sparse tensor whose first index is the time slice.
        N: number of nodes.
        TTT: unused.
        T: number of slices in the result; must equal ``end - start``.
        start: first time slice to keep (inclusive).
        end: last time slice to keep (exclusive).

    Returns:
        A coalesced sparse tensor whose time indices are shifted to start at 0.

    Raises:
        AssertionError: if ``end - start != T``.
    """
    assert (end-start) == T
    all_indices = A._indices()
    keep = (all_indices[0] >= start) & (all_indices[0] < end)
    # Boolean-mask indexing copies, so shifting the time row leaves A untouched.
    index = all_indices[0:3, keep]
    index[0] = index[0] - start
    sub = torch.sparse.DoubleTensor(index, A._values()[keep], torch.Size([T, N, N]))
    return sub.coalesce()


 # Each split is a window of T consecutive time slices of Ct:
 #   train: [0, T), val: [no_val_samples, T + no_val_samples),
 #   test:  [no_val_samples + no_test_samples, TTT).
 # func_create_sparse asserts end - start == T, so the test window only passes
 # when TTT == T + no_val_samples + no_test_samples.
 C_train = func_create_sparse(Ct, N, TTT, T, 0, T)#There is a bug in matlab Bitcoin Alpha
 C_val = func_create_sparse(Ct, N, TTT, T, no_val_samples, T+no_val_samples)
 C_test = func_create_sparse(Ct, N, TTT, T, no_val_samples+no_test_samples, TTT)

 def to_sparse(x):
    """Convert dense tensor ``x`` to a sparse tensor of the matching legacy type.

    The sparse class is looked up on ``torch.sparse`` by the last component of
    ``torch.typename(x)`` (e.g. ``FloatTensor``).
    """
    type_name = torch.typename(x).split('.')[-1]
    sparse_ctor = getattr(torch.sparse, type_name)

    nonzero_coords = torch.nonzero(x)
    if len(nonzero_coords.shape) == 0:  # if all elements are zeros
        return sparse_ctor(*x.shape)
    # One row per dimension, one column per non-zero element.
    nonzero_coords = nonzero_coords.t()
    per_dim = tuple(nonzero_coords[d] for d in range(nonzero_coords.shape[0]))
    return sparse_ctor(nonzero_coords, x[per_dim], x.size())

 # Small random 3x3 matrix with three entries zeroed out.
 # NOTE(review): not referenced again in the visible part of this script --
 # presumably a leftover example input for to_sparse; confirm before removing.
 dense = torch.randn(3,3)
 dense[[0,0,1], [1,2,0]] = 0 # make sparse

 def func_MProduct(C, M):
    """Mix the time slices of sparse tensor ``C`` with matrix ``M``.

    Computes ``C_new[i] = sum_j M[i, j] * C[j]`` over the non-zero entries of ``M``.

    Args:
        C: sparse double tensor whose first index is the time slice.
        M: dense 2-D tensor whose first dimension matches ``C.size()[0]``.

    Returns:
        A coalesced sparse tensor with the same size as ``C``.

    Raises:
        AssertionError: if the sizes disagree, or a column of ``M`` has more
            than ``no_diag`` (module-level setting) non-zero entries.
    """
    assert C.size()[0] == M.size()[0]
    Tr = C.size()[0]
    N = C.size()[1]
    C_new = torch.sparse.DoubleTensor(C.size())
    for j in range(Tr):
        idx = C._indices()[0] == j
        mat = torch.sparse.DoubleTensor(C._indices()[1:3,idx], C._values()[idx], torch.Size([N,N]))
        # Rows 1-2 hold the (row, col) indices of slice j; row 0 is set per target slice below.
        tensor_idx = torch.zeros([3, mat._nnz()], dtype=torch.long)
        tensor_idx[1:3] = mat._indices()[0:2]
        indices = torch.nonzero(M[:,j])
        assert indices.size()[0] <= no_diag
        for i in range(indices.size()[0]):
            tensor_idx[0] = indices[i]
            tensor_val = M[indices[i], j] * mat._values()
            C_new = C_new + torch.sparse.DoubleTensor(tensor_idx, tensor_val , C.size())
        # coalesce() returns a new tensor; the result has to be kept, otherwise
        # duplicate entries keep piling up across iterations.
        C_new = C_new.coalesce()
    return C_new.coalesce()
        
 # Apply the M-product (mixing of time slices with M) to each split.
 # M is T x T, and func_MProduct asserts that this matches the number of
 # slices in its first argument.
 Ct_train = func_MProduct(C_train, M)#There is a bug in matlab Bitcoin Alpha
 Ct_val = func_MProduct(C_val, M)
 Ct_test = func_MProduct(C_test, M)

 # Index ("subs") and value ("vals") arrays of every sparse tensor that is saved.
 A_subs, A_vals = A._indices(), A._values()
 A_labels_subs, A_labels_vals = A_labels._indices(), A_labels._values()
 C_subs, C_vals = C._indices(), C.values()
 C_train_subs, C_train_vals = C_train._indices(), C_train.values()
 C_val_subs, C_val_vals = C_val._indices(), C_val.values()
 C_test_subs, C_test_vals = C_test._indices(), C_test.values()
 Ct_train_subs, Ct_train_vals = Ct_train._indices(), Ct_train.values()
 Ct_val_subs, Ct_val_vals = Ct_val._indices(), Ct_val.values()
 Ct_test_subs, Ct_test_vals = Ct_test._indices(), Ct_test.values()

 # Everything is converted to NumPy arrays and written to a single .mat file.
 mat_contents = {
        'tensor_idx': np.array(tensor_idx),
        'tensor_labels': np.array(tensor_labels),
        'A_labels_subs': np.array(A_labels_subs),
        'A_labels_vals': np.array(A_labels_vals),
        'A_subs': np.array(A_subs),
        'A_vals': np.array(A_vals),
        'C_subs': np.array(C_subs),
        'C_vals': np.array(C_vals),
        'C_train_subs': np.array(C_train_subs),
        'C_train_vals': np.array(C_train_vals),
        'C_val_subs': np.array(C_val_subs),
        'C_val_vals': np.array(C_val_vals),
        'C_test_subs': np.array(C_test_subs),
        'C_test_vals': np.array(C_test_vals),
        'Ct_train_subs': np.array(Ct_train_subs),
        'Ct_train_vals': np.array(Ct_train_vals),
        'Ct_val_subs': np.array(Ct_val_subs),
        'Ct_val_vals': np.array(Ct_val_vals),
        'Ct_test_subs': np.array(Ct_test_subs),
        'Ct_test_vals': np.array(Ct_test_vals),
        'M': np.array(M),
 }
 sio.savemat(save_file_location + save_file_name, mat_contents)
--- a/run.sh
+++ b/run.sh
@@ -0,0 +1,6 @@
 #PBS -N job1
 #PBS -m ae
 #PBS -M [email protected]
 #PBS -l nodes=1:ppn=5
 # The PBS directives above need an ASCII hyphen before the option letter;
 # the typographic dash used previously is not an option prefix.
 cd ./
 # -nodisplay/-nodesktop/-nojvm are MATLAB flags that the Python interpreter
 # rejects, so the script is run directly.  read_data.py reads --data_path,
 # --dataset and --time_partitioning from the command line; any arguments
 # given to this job script are forwarded to it.
 python ./read_data.py "$@" > result.txt
--- a/utils.py
+++ b/utils.py
@@ -0,0 +1,39 @@
 import torch
 import pickle
 import os

 def f1_score(y_pred, y_true):
    """Compute F1, precision and recall, treating label 0 as the positive class.

    Args:
        y_pred: tensor of predicted labels.
        y_true: tensor of ground-truth labels, same shape as ``y_pred``.

    Returns:
        ``(f1, precision, recall)`` as float64 scalar tensors.  A zero
        denominator yields NaN.
    """
    pred_is_pos = y_pred == 0
    true_is_pos = y_true == 0

    tp = torch.sum(pred_is_pos & true_is_pos, dtype=torch.float64)
    fp = torch.sum(pred_is_pos & ~true_is_pos, dtype=torch.float64)
    fn = torch.sum(~pred_is_pos & true_is_pos, dtype=torch.float64)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    numerator = 2 * (precision * recall)
    f1 = numerator / (precision + recall)

    return f1, precision, recall

 if __name__ == "__main__":
    # Scan every pickled result table in the results directory and report the
    # file and row with the highest mean of columns 2, 4 and 6 (presumably the
    # F1 scores of three data splits -- confirm against the code that writes
    # these files).

    results_dir = 'results_edge_classification_bitcoin_otc/'
    max_f1 = 0
    fname = None
    epoch = None
    each_split = []

    for bin_fname in os.listdir(results_dir):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # trusted result files.  The context manager closes the handle, which
        # the previous code left open for every file.
        with open(results_dir + bin_fname, "rb") as bin_file:
            data = pickle.load(bin_file)
        assert data.shape[1] == 12

        for i in range(len(data)):
            split_scores = [data[i, 2], data[i, 4], data[i, 6]]
            # Rows in which any of the three scores is exactly 1 are skipped.
            if split_scores[0] == 1 or split_scores[1] == 1 or split_scores[2] == 1:
                continue
            mean_f1 = (split_scores[0] + split_scores[1] + split_scores[2]) / 3
            if mean_f1 > max_f1:
                max_f1 = mean_f1
                fname = bin_fname
                epoch = i
                each_split = split_scores

    print(fname, epoch, each_split,  str(max_f1))