import torch
import torchvision as tv
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt
from random import shuffle

import networkx as nx
import pickle as pkl
import scipy.sparse as sp
import logging

import random
import shutil
import os
import time
from model import *
from utils import *


# load ENZYMES and PROTEIN and DD dataset
def Graph_load_batch(min_num_nodes = 20, max_num_nodes = 1000, name = 'ENZYMES',node_attributes = True,graph_labels=True):
    '''
    load many graphs, e.g. enzymes
    :return: a list of graphs
    '''
    print('Loading graph dataset: '+str(name))
    G = nx.Graph()
    # load data
    path = 'dataset/'+name+'/'
    data_adj = np.loadtxt(path+name+'_A.txt', delimiter=',').astype(int)
    if node_attributes:
        data_node_att = np.loadtxt(path+name+'_node_attributes.txt', delimiter=',')
    data_node_label = np.loadtxt(path+name+'_node_labels.txt', delimiter=',').astype(int)
    data_graph_indicator = np.loadtxt(path+name+'_graph_indicator.txt', delimiter=',').astype(int)
    if graph_labels:
        data_graph_labels = np.loadtxt(path+name+'_graph_labels.txt', delimiter=',').astype(int)


    data_tuple = list(map(tuple, data_adj))
    # print(len(data_tuple))
    # print(data_tuple[0])

    # add edges
    G.add_edges_from(data_tuple)
    # add node attributes
    for i in range(data_node_label.shape[0]):
        if node_attributes:
            G.add_node(i+1, feature = data_node_att[i])
        G.add_node(i+1, label = data_node_label[i])
    G.remove_nodes_from(list(nx.isolates(G)))

    # print(G.number_of_nodes())
    # print(G.number_of_edges())

    # split into graphs
    graph_num = data_graph_indicator.max()
    node_list = np.arange(data_graph_indicator.shape[0])+1
    graphs = []
    max_nodes = 0
    for i in range(graph_num):
        # find the nodes for each graph
        nodes = node_list[data_graph_indicator==i+1]
        G_sub = G.subgraph(nodes)
        if graph_labels:
            G_sub.graph['label'] = data_graph_labels[i]
        # print('nodes', G_sub.number_of_nodes())
        # print('edges', G_sub.number_of_edges())
        # print('label', G_sub.graph)
        if G_sub.number_of_nodes()>=min_num_nodes and G_sub.number_of_nodes()<=max_num_nodes:
            graphs.append(G_sub)
            if G_sub.number_of_nodes() > max_nodes:
                max_nodes = G_sub.number_of_nodes()
            # print(G_sub.number_of_nodes(), 'i', i)
    # print('Graph dataset name: {}, total graph num: {}'.format(name, len(graphs)))
    # logging.warning('Graphs loaded, total num: {}'.format(len(graphs)))
    print('Loaded')
    return graphs

def test_graph_load_DD():
    graphs, max_num_nodes = Graph_load_batch(min_num_nodes=10,name='DD',node_attributes=False,graph_labels=True)
    shuffle(graphs)
    plt.switch_backend('agg')
    plt.hist([len(graphs[i]) for i in range(len(graphs))], bins=100)
    plt.savefig('figures/test.png')
    plt.close()
    row = 4
    col = 4
    draw_graph_list(graphs[0:row*col], row=row,col=col, fname='figures/test')
    print('max num nodes',max_num_nodes)


def parse_index_file(filename):
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index

# load cora, citeseer and pubmed dataset
def Graph_load(dataset = 'cora'):
    '''
    Load a single graph dataset
    :param dataset: dataset name
    :return:
    '''
    names = ['x', 'tx', 'allx', 'graph']
    objects = []
    for i in range(len(names)):
        load = pkl.load(open("dataset/ind.{}.{}".format(dataset, names[i]), 'rb'), encoding='latin1')
        # print('loaded')
        objects.append(load)
        # print(load)
    x, tx, allx, graph = tuple(objects)
    test_idx_reorder = parse_index_file("dataset/ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    G = nx.from_dict_of_lists(graph)
    adj = nx.adjacency_matrix(G)
    return adj, features, G


######### code test ########
# adj, features,G = Graph_load()
# print(adj)
# print(G.number_of_nodes(), G.number_of_edges())

# _,_,G = Graph_load(dataset='citeseer')
# G = max(nx.connected_component_subgraphs(G), key=len)
# G = nx.convert_node_labels_to_integers(G)
#
# count = 0
# max_node = 0
# for i in range(G.number_of_nodes()):
#     G_ego = nx.ego_graph(G, i, radius=3)
#     # draw_graph(G_ego,prefix='test'+str(i))
#     m = G_ego.number_of_nodes()
#     if m>max_node:
#         max_node = m
#     if m>=50:
#         print(i, G_ego.number_of_nodes(), G_ego.number_of_edges())
#         count += 1
# print('count', count)
# print('max_node', max_node)


def bfs_seq(G, start_id):
    '''
    get a bfs node sequence
    :param G:
    :param start_id:
    :return:
    '''
    dictionary = dict(nx.bfs_successors(G, start_id))
    start = [start_id]
    output = [start_id]
    while len(start) > 0:
        next = []
        while len(start) > 0:
            current = start.pop(0)
            neighbor = dictionary.get(current)
            if neighbor is not None:
                #### a wrong example, should not permute here!
                # shuffle(neighbor)
                next = next + neighbor
        output = output + next
        start = next
    return output


def encode_adj(adj, max_prev_node=10, is_full = False):
    '''

    :param adj: n*n, rows means time step, while columns are input dimension
    :param max_degree: we want to keep row number, but truncate column numbers
    :return:
    '''
    if is_full:
        max_prev_node = adj.shape[0]-1

    # pick up lower tri
    adj = np.tril(adj, k=-1)
    n = adj.shape[0]
    adj = adj[1:n, 0:n-1]

    # use max_prev_node to truncate
    # note: now adj is a (n-1)*(n-1) matrix
    adj_output = np.zeros((adj.shape[0], max_prev_node))
    for i in range(adj.shape[0]):
        input_start = max(0, i - max_prev_node + 1)
        input_end = i + 1
        output_start = max_prev_node + input_start - input_end
        output_end = max_prev_node
        adj_output[i, output_start:output_end] = adj[i, input_start:input_end]
        adj_output[i,:] = adj_output[i,:][::-1] # reverse order

    return adj_output

def decode_adj(adj_output):
    '''
        recover to adj from adj_output
        note: here adj_output have shape (n-1)*m
    '''
    max_prev_node = adj_output.shape[1]
    adj = np.zeros((adj_output.shape[0], adj_output.shape[0]))
    for i in range(adj_output.shape[0]):
        input_start = max(0, i - max_prev_node + 1)
        input_end = i + 1
        output_start = max_prev_node + max(0, i - max_prev_node + 1) - (i + 1)
        output_end = max_prev_node
        adj[i, input_start:input_end] = adj_output[i,::-1][output_start:output_end] # reverse order
    adj_full = np.zeros((adj_output.shape[0]+1, adj_output.shape[0]+1))
    n = adj_full.shape[0]
    adj_full[1:n, 0:n-1] = np.tril(adj, 0)
    adj_full = adj_full + adj_full.T

    return adj_full


def encode_adj_flexible(adj):
    '''
    return a flexible length of output
    note that here there is no loss when encoding/decoding an adj matrix
    :param adj: adj matrix
    :return:
    '''
    # pick up lower tri
    adj = np.tril(adj, k=-1)
    n = adj.shape[0]
    adj = adj[1:n, 0:n-1]

    adj_output = []
    input_start = 0
    for i in range(adj.shape[0]):
        input_end = i + 1
        adj_slice = adj[i, input_start:input_end]
        adj_output.append(adj_slice)
        non_zero = np.nonzero(adj_slice)[0]
        input_start = input_end-len(adj_slice)+np.amin(non_zero)

    return adj_output


def decode_adj_flexible(adj_output):
    '''
    return a flexible length of output
    note that here there is no loss when encoding/decoding an adj matrix
    :param adj: adj matrix
    :return:
    '''
    adj = np.zeros((len(adj_output), len(adj_output)))
    for i in range(len(adj_output)):
        output_start = i+1-len(adj_output[i])
        output_end = i+1
        adj[i, output_start:output_end] = adj_output[i]
    adj_full = np.zeros((len(adj_output)+1, len(adj_output)+1))
    n = adj_full.shape[0]
    adj_full[1:n, 0:n-1] = np.tril(adj, 0)
    adj_full = adj_full + adj_full.T

    return adj_full

def test_encode_decode_adj():
######## code test ###########
    G = nx.ladder_graph(5)
    G = nx.grid_2d_graph(20,20)
    G = nx.ladder_graph(200)
    G = nx.karate_club_graph()
    G = nx.connected_caveman_graph(2,3)
    print(G.number_of_nodes())
    
    adj = np.asarray(nx.to_numpy_matrix(G))
    G = nx.from_numpy_matrix(adj)
    #
    start_idx = np.random.randint(adj.shape[0])
    x_idx = np.array(bfs_seq(G, start_idx))
    adj = adj[np.ix_(x_idx, x_idx)]
    
    print('adj\n',adj)
    adj_output = encode_adj(adj,max_prev_node=5)
    print('adj_output\n',adj_output)
    adj_recover = decode_adj(adj_output,max_prev_node=5)
    print('adj_recover\n',adj_recover)
    print('error\n',np.amin(adj_recover-adj),np.amax(adj_recover-adj))
    
    
    adj_output = encode_adj_flexible(adj)
    for i in range(len(adj_output)):
        print(len(adj_output[i]))
    adj_recover = decode_adj_flexible(adj_output)
    print(adj_recover)
    print(np.amin(adj_recover-adj),np.amax(adj_recover-adj))


def encode_adj_full(adj):
    '''
    return a n-1*n-1*2 tensor, the first dimension is an adj matrix, the second show if each entry is valid
    :param adj: adj matrix
    :return:
    '''
    # pick up lower tri
    adj = np.tril(adj, k=-1)
    n = adj.shape[0]
    adj = adj[1:n, 0:n-1]
    adj_output = np.zeros((adj.shape[0],adj.shape[1],2))
    adj_len = np.zeros(adj.shape[0])

    for i in range(adj.shape[0]):
        non_zero = np.nonzero(adj[i,:])[0]
        input_start = np.amin(non_zero)
        input_end = i + 1
        adj_slice = adj[i, input_start:input_end]
        # write adj
        adj_output[i,0:adj_slice.shape[0],0] = adj_slice[::-1] # put in reverse order
        # write stop token (if token is 0, stop)
        adj_output[i,0:adj_slice.shape[0],1] = 1 # put in reverse order
        # write sequence length
        adj_len[i] = adj_slice.shape[0]

    return adj_output,adj_len

def decode_adj_full(adj_output):
    '''
    return an adj according to adj_output
    :param
    :return:
    '''
    # pick up lower tri
    adj = np.zeros((adj_output.shape[0]+1,adj_output.shape[1]+1))

    for i in range(adj_output.shape[0]):
        non_zero = np.nonzero(adj_output[i,:,1])[0] # get valid sequence
        input_end = np.amax(non_zero)
        adj_slice = adj_output[i, 0:input_end+1, 0] # get adj slice
        # write adj
        output_end = i+1
        output_start = i+1-input_end-1
        adj[i+1,output_start:output_end] = adj_slice[::-1] # put in reverse order
    adj = adj + adj.T
    return adj

def test_encode_decode_adj_full():
########### code test #############
    # G = nx.ladder_graph(10)
    G = nx.karate_club_graph()
    # get bfs adj
    adj = np.asarray(nx.to_numpy_matrix(G))
    G = nx.from_numpy_matrix(adj)
    start_idx = np.random.randint(adj.shape[0])
    x_idx = np.array(bfs_seq(G, start_idx))
    adj = adj[np.ix_(x_idx, x_idx)]
    
    adj_output, adj_len = encode_adj_full(adj)
    print('adj\n',adj)
    print('adj_output[0]\n',adj_output[:,:,0])
    print('adj_output[1]\n',adj_output[:,:,1])
    # print('adj_len\n',adj_len)
    
    adj_recover = decode_adj_full(adj_output)
    print('adj_recover\n', adj_recover)
    print('error\n',adj_recover-adj)
    print('error_sum\n',np.amax(adj_recover-adj), np.amin(adj_recover-adj))


########## use pytorch dataloader
class Graph_sequence_sampler_pytorch(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node
        if max_prev_node is None:
            print('calculating max previous node, total iteration: {}'.format(iteration))
            self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
            print('max previous node: {}'.format(self.max_prev_node))
        else:
            self.max_prev_node = max_prev_node

        # self.max_prev_node = max_prev_node

        # # sort Graph in descending order
        # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
        # self.len_all = [self.len_all[i] for i in len_batch_order]
        # self.adj_all = [self.adj_all[i] for i in len_batch_order]
    def __len__(self):
        return len(self.adj_all)
    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graph
        x_batch[0,:] = 1 # the first input token is all ones
        y_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graph
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        x_idx = np.random.permutation(adj_copy.shape[0])
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_copy_matrix = np.asmatrix(adj_copy)
        G = nx.from_numpy_matrix(adj_copy_matrix)
        # then do bfs in the permuted G
        start_idx = np.random.randint(adj_copy.shape[0])
        x_idx = np.array(bfs_seq(G, start_idx))
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
        # get x and y and adj
        # for small graph the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x':x_batch,'y':y_batch, 'len':len_batch}

    def calc_max_prev_node(self, iter=20000,topk=10):
        max_prev_node = []
        for i in range(iter):
            if i % (iter / 5) == 0:
                print('iter {} times'.format(i))
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('Graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # encode adj
            adj_encoded = encode_adj_flexible(adj_copy.copy())
            max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
            max_prev_node.append(max_encoded_len)
        max_prev_node = sorted(max_prev_node)[-1*topk:]
        return max_prev_node


########## use pytorch dataloader
class Graph_sequence_sampler_pytorch_nobfs(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node
    def __len__(self):
        return len(self.adj_all)
    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.n-1))  # here zeros are padded for small graph
        x_batch[0,:] = 1 # the first input token is all ones
        y_batch = np.zeros((self.n, self.n-1))  # here zeros are padded for small graph
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        x_idx = np.random.permutation(adj_copy.shape[0])
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.n-1)
        # get x and y and adj
        # for small graph the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x':x_batch,'y':y_batch, 'len':len_batch}

# dataset = Graph_sequence_sampler_pytorch_nobfs(graphs)
# print(dataset[1]['x'])
# print(dataset[1]['y'])
# print(dataset[1]['len'])


########## use pytorch dataloader
########## for completion task
class Graph_sequence_sampler_pytorch_nobfs_for_completion(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node

    def __len__(self):
        return len(self.adj_all)

    def shift_left(self, vector, idx):
        shifted_vector = vector
        length = max(np.shape(vector))
        for i in range(length):
            if i >= idx and i < length - 1:
                shifted_vector[:, i:i + 1] = vector[:, i + 1:i + 2]
            elif i == length - 1:
                shifted_vector[:, i:i + 1] = 0
        return shifted_vector

    def get_graph(self, adj):
        '''
        get a graph from zero-padded adj
        :param adj:
        :return:
        '''
        # remove all zeros rows and columns
        adj = adj[~np.all(adj == 0, axis=1)]
        adj = adj[:, ~np.all(adj == 0, axis=0)]
        adj = np.asmatrix(adj)
        G = nx.from_numpy_matrix(adj)
        return G

    def get_matrix_permutation_for_node_completion(self, input_matrix):
        G = self.get_graph(input_matrix)
        # graph_show(G)
        nodes = G.nodes()
        idx = random.randint(0, len(nodes) - 1)
        G.remove_node(nodes[idx])
        # graph_show(G)
        incomplete_matrix = np.asarray(nx.to_numpy_matrix(G))
        removed_vector = self.shift_left(input_matrix[idx:idx + 1, :], idx)
        incomplete_matrix = np.append(incomplete_matrix, removed_vector[:, :-1], axis=0)
        permuted_complete_matrix = np.append(incomplete_matrix, removed_vector.T, axis=1)
        # graph_show(get_graph(permuted_complete_matrix))
        return permuted_complete_matrix

    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.n-1))  # here zeros are padded for small graph
        x_batch[0,:] = 1 # the first input token is all ones
        y_batch = np.zeros((self.n, self.n-1))  # here zeros are padded for small graph
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        x_idx = np.random.permutation(adj_copy.shape[0])
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_copy = self.get_matrix_permutation_for_node_completion(adj_copy)
        adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.n-1)
        # get x and y and adj
        # for small graph the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x':x_batch,'y':y_batch, 'len':len_batch}


########## use pytorch dataloader
class Graph_sequence_sampler_pytorch_canonical(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node
        if max_prev_node is None:
            # print('calculating max previous node, total iteration: {}'.format(iteration))
            # self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
            # print('max previous node: {}'.format(self.max_prev_node))
            self.max_prev_node = self.n-1
        else:
            self.max_prev_node = max_prev_node

        # self.max_prev_node = max_prev_node

        # # sort Graph in descending order
        # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
        # self.len_all = [self.len_all[i] for i in len_batch_order]
        # self.adj_all = [self.adj_all[i] for i in len_batch_order]
    def __len__(self):
        return len(self.adj_all)
    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graph
        x_batch[0,:] = 1 # the first input token is all ones
        y_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graph
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        # adj_copy_matrix = np.asmatrix(adj_copy)
        # G = nx.from_numpy_matrix(adj_copy_matrix)
        # then do bfs in the permuted G
        # start_idx = G.number_of_nodes()-1
        # x_idx = np.array(bfs_seq(G, start_idx))
        # adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_encoded = encode_adj(adj_copy, max_prev_node=self.max_prev_node)
        # get x and y and adj
        # for small graph the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x':x_batch,'y':y_batch, 'len':len_batch}

    def calc_max_prev_node(self, iter=20000,topk=10):
        max_prev_node = []
        for i in range(iter):
            if i % (iter / 5) == 0:
                print('iter {} times'.format(i))
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('Graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # encode adj
            adj_encoded = encode_adj_flexible(adj_copy.copy())
            max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
            max_prev_node.append(max_encoded_len)
        max_prev_node = sorted(max_prev_node)[-1*topk:]
        return max_prev_node


########## use pytorch dataloader
class Graph_sequence_sampler_pytorch_nll(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            adj = np.asarray(nx.to_numpy_matrix(G))
            adj_temp = self.calc_adj(adj)
            self.adj_all.extend(adj_temp)
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node
        if max_prev_node is None:
            # print('calculating max previous node, total iteration: {}'.format(iteration))
            # self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
            # print('max previous node: {}'.format(self.max_prev_node))
            self.max_prev_node = self.n-1
        else:
            self.max_prev_node = max_prev_node

        # self.max_prev_node = max_prev_node

        # # sort Graph in descending order
        # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
        # self.len_all = [self.len_all[i] for i in len_batch_order]
        # self.adj_all = [self.adj_all[i] for i in len_batch_order]
    def __len__(self):
        return len(self.adj_all)
    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graph
        x_batch[0,:] = 1 # the first input token is all ones
        y_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graph
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        # adj_copy_matrix = np.asmatrix(adj_copy)
        # G = nx.from_numpy_matrix(adj_copy_matrix)
        # then do bfs in the permuted G
        # start_idx = G.number_of_nodes()-1
        # x_idx = np.array(bfs_seq(G, start_idx))
        # adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_encoded = encode_adj(adj_copy, max_prev_node=self.max_prev_node)
        # get x and y and adj
        # for small graph the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x':x_batch,'y':y_batch, 'len':len_batch}

    def calc_adj(self,adj):
        max_iter = 10000
        adj_all = [adj]
        adj_all_len = 1
        i_old = 0
        for i in range(max_iter):
            adj_copy = adj.copy()
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            add_flag = True
            for adj_exist in adj_all:
                if np.array_equal(adj_exist, adj_copy):
                    add_flag = False
                    break
            if add_flag:
                adj_all.append(adj_copy)
                adj_all_len += 1
            if adj_all_len % 10 ==0:
                print('adj found:',adj_all_len,'iter used',i)
        return adj_all


# graphs = [nx.barabasi_albert_graph(20,3)]
# graphs = [nx.grid_2d_graph(4,4)]
# dataset = Graph_sequence_sampler_pytorch_nll(graphs)


############## below are codes not used in current version
############## they are based on pytorch default data loader, we should consider reimplement them in current datasets, since they are more efficient


# normal version
class Graph_sequence_sampler_truncate():
    '''
    the output will truncate according to the max_prev_node
    '''
    def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node = 25):
        self.batch_size = batch_size
        self.n = max_node_num
        self.max_prev_node = max_prev_node

        self.adj_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))

    def sample(self):
        # batch, length, feature
        x_batch = np.zeros((self.batch_size, self.n, self.max_prev_node)) # here zeros are padded for small graph
        y_batch = np.zeros((self.batch_size, self.n, self.max_prev_node)) # here zeros are padded for small graph
        len_batch = np.zeros(self.batch_size)
        # generate input x, y pairs
        for i in range(self.batch_size):
            # first sample and get a permuted adj
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            len_batch[i] = adj_copy.shape[0]
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
            # get x and y and adj
            # for small graph the rest are zero padded
            y_batch[i, 0:adj_encoded.shape[0], :] = adj_encoded
            x_batch[i, 1:adj_encoded.shape[0]+1, :] = adj_encoded
        # sort in descending order
        len_batch_order = np.argsort(len_batch)[::-1]
        len_batch = len_batch[len_batch_order]
        x_batch = x_batch[len_batch_order,:,:]
        y_batch = y_batch[len_batch_order,:,:]

        return torch.from_numpy(x_batch).float(), torch.from_numpy(y_batch).float(), len_batch.astype('int').tolist()
    def calc_max_prev_node(self,iter):
        max_prev_node = []
        for i in range(iter):
            if i%(iter/10)==0:
                print(i)
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('Graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            time1 = time.time()
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # encode adj
            adj_encoded = encode_adj_flexible(adj_copy.copy())
            max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
            max_prev_node.append(max_encoded_len)
        max_prev_node = sorted(max_prev_node)[-100:]
        return max_prev_node


# graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='DD',node_attributes=False)
# dataset = Graph_sequence_sampler_truncate([nx.karate_club_graph()])
# max_prev_nodes = dataset.calc_max_prev_node(iter=10000)
# print(max_prev_nodes)
# x,y,len = dataset.sample()
# print('x',x)
# print('y',y)
# print(len)


# only output y_batch (which is needed in batch version of new model)
class Graph_sequence_sampler_fast():
    def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node = 25):
        self.batch_size = batch_size
        self.G_list = G_list
        self.n = max_node_num
        self.max_prev_node = max_prev_node

        self.adj_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))


    def sample(self):
        # batch, length, feature
        y_batch = np.zeros((self.batch_size, self.n, self.max_prev_node))  # here zeros are padded for small graph
        # generate input x, y pairs
        for i in range(self.batch_size):
            # first sample and get a permuted adj
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('graph size',adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # get the feature for the permuted G
            # dict = nx.bfs_successors(G, start_idx)
            # print('dict', dict, 'node num', self.G.number_of_nodes())
            # print('x idx', x_idx, 'len', len(x_idx))

            # print('adj')
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_copy.shape[0]):
            #     print(adj_copy[print_i].astype(int))
            # adj_before = adj_copy.copy()

            # encode adj
            adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
            # print('adj encoded')
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_copy.shape[0]):
            #     print(adj_copy[print_i].astype(int))


            # decode adj
            # print('adj recover error')
            # adj_decode = decode_adj(adj_encoded.copy(), max_prev_node=self.max_prev_node)
            # adj_err = adj_decode-adj_copy
            # print(np.sum(adj_err))
            # if np.sum(adj_err)!=0:
            #     print(adj_err)
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_err.shape[0]):
            #     print(adj_err[print_i].astype(int))

            # get x and y and adj
            # for small graph the rest are zero padded
            y_batch[i, 0:adj_encoded.shape[0], :] = adj_encoded


            # np.set_printoptions(linewidth=200,precision=3)
            # print('y\n')
            # for print_i in range(self.y_batch[i,:,:].shape[0]):
            #     print(self.y_batch[i,:,:][print_i].astype(int))
            # print('x\n')
            # for print_i in range(self.x_batch[i, :, :].shape[0]):
            #     print(self.x_batch[i, :, :][print_i].astype(int))
            # print('adj\n')
            # for print_i in range(self.adj_batch[i, :, :].shape[0]):
            #     print(self.adj_batch[i, :, :][print_i].astype(int))
            # print('adj_norm\n')
            # for print_i in range(self.adj_norm_batch[i, :, :].shape[0]):
            #     print(self.adj_norm_batch[i, :, :][print_i].astype(float))
            # print('feature\n')
            # for print_i in range(self.feature_batch[i, :, :].shape[0]):
            #     print(self.feature_batch[i, :, :][print_i].astype(float))


        # print('x_batch\n',self.x_batch)
        # print('y_batch\n',self.y_batch)

        return torch.from_numpy(y_batch).float()

# graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='PROTEINS_full')
# print(max_num_nodes)
# G = nx.ladder_graph(100)
# # G1 = nx.karate_club_graph()
# # G2 = nx.connected_caveman_graph(4,5)
# G_list = [G]
# dataset = Graph_sequence_sampler_fast(graphs, batch_size=128, max_node_num=max_num_nodes, max_prev_node=30)
# for i in range(5):
#     time0 = time.time()
#     y = dataset.sample()
#     time1 = time.time()
#     print(i,'time', time1 - time0)


# output size is flexible (using list to represent), batch size is 1
class Graph_sequence_sampler_flexible():
    def __init__(self, G_list):
        self.G_list = G_list
        self.adj_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))

        self.y_batch = []
    def sample(self):
        # generate input x, y pairs
        # first sample and get a permuted adj
        adj_idx = np.random.randint(len(self.adj_all))
        adj_copy = self.adj_all[adj_idx].copy()
        # print('graph size',adj_copy.shape[0])
        x_idx = np.random.permutation(adj_copy.shape[0])
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_copy_matrix = np.asmatrix(adj_copy)
        G = nx.from_numpy_matrix(adj_copy_matrix)
        # then do bfs in the permuted G
        start_idx = np.random.randint(adj_copy.shape[0])
        x_idx = np.array(bfs_seq(G, start_idx))
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        # get the feature for the permuted G
        # dict = nx.bfs_successors(G, start_idx)
        # print('dict', dict, 'node num', self.G.number_of_nodes())
        # print('x idx', x_idx, 'len', len(x_idx))

        # print('adj')
        # np.set_printoptions(linewidth=200)
        # for print_i in range(adj_copy.shape[0]):
        #     print(adj_copy[print_i].astype(int))
        # adj_before = adj_copy.copy()

        # encode adj
        adj_encoded = encode_adj_flexible(adj_copy.copy())
        # print('adj encoded')
        # np.set_printoptions(linewidth=200)
        # for print_i in range(adj_copy.shape[0]):
        #     print(adj_copy[print_i].astype(int))


        # decode adj
        # print('adj recover error')
        # adj_decode = decode_adj(adj_encoded.copy(), max_prev_node=self.max_prev_node)
        # adj_err = adj_decode-adj_copy
        # print(np.sum(adj_err))
        # if np.sum(adj_err)!=0:
        #     print(adj_err)
        # np.set_printoptions(linewidth=200)
        # for print_i in range(adj_err.shape[0]):
        #     print(adj_err[print_i].astype(int))

        # get x and y and adj
        # for small graph the rest are zero padded
        self.y_batch=adj_encoded


        # np.set_printoptions(linewidth=200,precision=3)
        # print('y\n')
        # for print_i in range(self.y_batch[i,:,:].shape[0]):
        #     print(self.y_batch[i,:,:][print_i].astype(int))
        # print('x\n')
        # for print_i in range(self.x_batch[i, :, :].shape[0]):
        #     print(self.x_batch[i, :, :][print_i].astype(int))
        # print('adj\n')
        # for print_i in range(self.adj_batch[i, :, :].shape[0]):
        #     print(self.adj_batch[i, :, :][print_i].astype(int))
        # print('adj_norm\n')
        # for print_i in range(self.adj_norm_batch[i, :, :].shape[0]):
        #     print(self.adj_norm_batch[i, :, :][print_i].astype(float))
        # print('feature\n')
        # for print_i in range(self.feature_batch[i, :, :].shape[0]):
        #     print(self.feature_batch[i, :, :][print_i].astype(float))

        return self.y_batch,adj_copy


# G = nx.ladder_graph(5)
# # G = nx.grid_2d_graph(20,20)
# # G = nx.ladder_graph(200)
# graphs = [G]
#
# graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='ENZYMES')
# sampler = Graph_sequence_sampler_flexible(graphs)
#
# y_max_all = []
# for i in range(10000):
#     y_raw,adj_copy = sampler.sample()
#     y_max = max(len(y_raw[i]) for i in range(len(y_raw)))
#     y_max_all.append(y_max)
#     # print('max bfs node',y_max)
# print('max', max(y_max_all))
# print(y[1])
# print(Variable(torch.FloatTensor(y[1])).cuda(CUDA))


########### potential use: an encoder along with the GraphRNN decoder
# preprocess the adjacency matrix
def preprocess(A):
    # Get size of the adjacency matrix
    size = len(A)
    # Get the degrees for each node
    degrees = np.sum(A, axis=1)+1

    # Create diagonal matrix D from the degrees of the nodes
    D = np.diag(np.power(degrees, -0.5).flatten())
    # Cholesky decomposition of D
    # D = np.linalg.cholesky(D)
    # Inverse of the Cholesky decomposition of D
    # D = np.linalg.inv(D)
    # Create an identity matrix of size x size
    I = np.eye(size)
    # Create A hat
    A_hat = A + I
    # Return A_hat
    A_normal = np.dot(np.dot(D,A_hat),D)
    return A_normal


# truncate the output seqence to save representation, and allowing for infinite generation
# now having a list of graphs
class Graph_sequence_sampler_bfs_permute_truncate_multigraph():
    def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node = 25, feature = None):
        self.batch_size = batch_size
        self.G_list = G_list
        self.n = max_node_num
        self.max_prev_node = max_prev_node

        self.adj_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
        self.has_feature = feature

    def sample(self):

        # batch, length, feature
        # self.x_batch = np.ones((self.batch_size, self.n - 1, self.max_prev_node))
        x_batch = np.zeros((self.batch_size, self.n, self.max_prev_node)) # here zeros are padded for small graph
        # self.x_batch[:,0,:] = np.ones((self.batch_size, self.max_prev_node))  # first input is all ones
        # batch, length, feature
        y_batch = np.zeros((self.batch_size, self.n, self.max_prev_node)) # here zeros are padded for small graph
        # batch, length, length
        adj_batch = np.zeros((self.batch_size, self.n, self.n)) # here zeros are padded for small graph
        # batch, size, size
        adj_norm_batch = np.zeros((self.batch_size, self.n, self.n))  # here zeros are padded for small graph
        # batch, size, feature_len: degree and clustering coefficient
        if self.has_feature is None:
            feature_batch = np.zeros((self.batch_size, self.n, self.n)) # use one hot feature
        else:
            feature_batch = np.zeros((self.batch_size, self.n, 2))

        # generate input x, y pairs
        for i in range(self.batch_size):
            time0 = time.time()
            # first sample and get a permuted adj
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('Graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            time1 = time.time()
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # get the feature for the permuted G
            node_list = [G.nodes()[i] for i in x_idx]
            feature_degree = np.array(list(G.degree(node_list).values()))[:,np.newaxis]
            feature_clustering = np.array(list(nx.clustering(G,nodes=node_list).values()))[:,np.newaxis]
            time2 = time.time()

            # dict = nx.bfs_successors(G, start_idx)
            # print('dict', dict, 'node num', self.G.number_of_nodes())
            # print('x idx', x_idx, 'len', len(x_idx))

            # print('adj')
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_copy.shape[0]):
            #     print(adj_copy[print_i].astype(int))
            # adj_before = adj_copy.copy()

            # encode adj
            adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
            # print('adj encoded')
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_copy.shape[0]):
            #     print(adj_copy[print_i].astype(int))


            # decode adj
            # print('adj recover error')
            # adj_decode = decode_adj(adj_encoded.copy(), max_prev_node=self.max_prev_node)
            # adj_err = adj_decode-adj_copy
            # print(np.sum(adj_err))
            # if np.sum(adj_err)!=0:
            #     print(adj_err)
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_err.shape[0]):
            #     print(adj_err[print_i].astype(int))

            # get x and y and adj
            # for small graph the rest are zero padded
            y_batch[i, 0:adj_encoded.shape[0], :] = adj_encoded
            x_batch[i, 1:adj_encoded.shape[0]+1, :] = adj_encoded
            adj_batch[i, 0:adj_copy.shape[0], 0:adj_copy.shape[0]] = adj_copy
            adj_copy_norm = preprocess(adj_copy)
            time3 = time.time()
            adj_norm_batch[i, 0:adj_copy.shape[0], 0:adj_copy.shape[0]] = adj_copy_norm

            if self.has_feature is None:
                feature_batch[i, 0:adj_copy.shape[0], 0:adj_copy.shape[0]] = np.eye(adj_copy.shape[0])
            else:
                feature_batch[i,0:adj_copy.shape[0],:] = np.concatenate((feature_degree,feature_clustering),axis=1)


            # np.set_printoptions(linewidth=200,precision=3)
            # print('y\n')
            # for print_i in range(self.y_batch[i,:,:].shape[0]):
            #     print(self.y_batch[i,:,:][print_i].astype(int))
            # print('x\n')
            # for print_i in range(self.x_batch[i, :, :].shape[0]):
            #     print(self.x_batch[i, :, :][print_i].astype(int))
            # print('adj\n')
            # for print_i in range(self.adj_batch[i, :, :].shape[0]):
            #     print(self.adj_batch[i, :, :][print_i].astype(int))
            # print('adj_norm\n')
            # for print_i in range(self.adj_norm_batch[i, :, :].shape[0]):
            #     print(self.adj_norm_batch[i, :, :][print_i].astype(float))
            # print('feature\n')
            # for print_i in range(self.feature_batch[i, :, :].shape[0]):
            #     print(self.feature_batch[i, :, :][print_i].astype(float))
            time4 = time.time()
            # print('1 ',time1-time0)
            # print('2 ',time2-time1)
            # print('3 ',time3-time2)
            # print('4 ',time4-time3)

        # print('x_batch\n',self.x_batch)
        # print('y_batch\n',self.y_batch)

        return torch.from_numpy(x_batch).float(), torch.from_numpy(y_batch).float(),\
               torch.from_numpy(adj_batch).float(), torch.from_numpy(adj_norm_batch).float(), torch.from_numpy(feature_batch).float()


# generate own synthetic dataset
def Graph_synthetic(seed):
    G = nx.Graph()
    np.random.seed(seed)
    base = np.repeat(np.eye(5), 20, axis=0)
    rand = np.random.randn(100, 5) * 0.05
    node_features = base + rand

    # # print('node features')
    # for i in range(node_features.shape[0]):
    #     print(np.around(node_features[i], decimals=4))

    node_distance_l1 = np.ones((node_features.shape[0], node_features.shape[0]))
    node_distance_np = np.zeros((node_features.shape[0], node_features.shape[0]))
    for i in range(node_features.shape[0]):
        for j in range(node_features.shape[0]):
            if i != j:
                node_distance_l1[i,j] = np.sum(np.abs(node_features[i] - node_features[j]))
                # print('node distance', node_distance_l1[i,j])
                node_distance_np[i, j] = 1 / np.sum(np.abs(node_features[i] - node_features[j]) ** 2)

    print('node distance max', np.max(node_distance_l1))
    print('node distance min', np.min(node_distance_l1))
    node_distance_np_sum = np.sum(node_distance_np, axis=1, keepdims=True)
    embedding_dist = node_distance_np / node_distance_np_sum

    # generate the graph
    average_degree = 9
    for i in range(node_features.shape[0]):
        for j in range(i + 1, embedding_dist.shape[0]):
            p = np.random.rand()
            if p < embedding_dist[i, j] * average_degree:
                G.add_edge(i, j)

    G.remove_nodes_from(nx.isolates(G))
    print('num of nodes', G.number_of_nodes())
    print('num of edges', G.number_of_edges())

    G_deg = nx.degree_histogram(G)
    G_deg_sum = [a * b for a, b in zip(G_deg, range(0, len(G_deg)))]
    print('average degree', sum(G_deg_sum) / G.number_of_nodes())
    print('average path length', nx.average_shortest_path_length(G))
    print('diameter', nx.diameter(G))
    G_cluster = sorted(list(nx.clustering(G).values()))
    print('average clustering coefficient', sum(G_cluster) / len(G_cluster))
    print('Graph generation complete!')
    # node_features = np.concatenate((node_features, np.zeros((1,node_features.shape[1]))),axis=0)

    return G, node_features

# G = Graph_synthetic(10)


# return adj and features from a single graph
class GraphDataset_adj(torch.utils.data.Dataset):
    """Graph Dataset"""
    def __init__(self, G, features=None):
        self.G = G
        self.n = G.number_of_nodes()
        adj = np.asarray(nx.to_numpy_matrix(self.G))

        # permute adj
        subgraph_idx = np.random.permutation(self.n)
        # subgraph_idx = np.arange(self.n)
        adj = adj[np.ix_(subgraph_idx, subgraph_idx)]

        self.adj = torch.from_numpy(adj+np.eye(len(adj))).float()
        self.adj_norm = torch.from_numpy(preprocess(adj)).float()
        if features is None:
            self.features = torch.Tensor(self.n, self.n)
            self.features = nn.init.eye(self.features)
        else:
            features = features[subgraph_idx,:]
            self.features = torch.from_numpy(features).float()
        print('embedding size', self.features.size())
    def __len__(self):
        return 1
    def __getitem__(self, idx):
        sample = {'adj':self.adj,'adj_norm':self.adj_norm, 'features':self.features}
        return sample

# G = nx.karate_club_graph()
# dataset = GraphDataset_adj(G)
# train_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True, num_workers=1)
# for data in train_loader:
#     print(data)


# return adj and features from a list of graphs
class GraphDataset_adj_batch(torch.utils.data.Dataset):
    """Graph Dataset"""
    def __init__(self, graphs, has_feature = True, num_nodes = 20):
        self.graphs = graphs
        self.has_feature = has_feature
        self.num_nodes = num_nodes
    def __len__(self):
        return len(self.graphs)
    def __getitem__(self, idx):
        adj_raw = np.asarray(nx.to_numpy_matrix(self.graphs[idx]))
        np.fill_diagonal(adj_raw,0) # in case the self connection already exists

        # sample num_nodes size subgraph
        subgraph_idx = np.random.permutation(adj_raw.shape[0])[0:self.num_nodes]
        adj_raw = adj_raw[np.ix_(subgraph_idx,subgraph_idx)]

        adj = torch.from_numpy(adj_raw+np.eye(len(adj_raw))).float()
        adj_norm = torch.from_numpy(preprocess(adj_raw)).float()
        adj_raw = torch.from_numpy(adj_raw).float()
        if self.has_feature:
            dictionary = nx.get_node_attributes(self.graphs[idx], 'feature')
            features = np.zeros((self.num_nodes, list(dictionary.values())[0].shape[0]))
            for i in range(self.num_nodes):
                features[i, :] = list(dictionary.values())[subgraph_idx[i]]
            # normalize
            features -= np.mean(features, axis=0)
            epsilon = 1e-6
            features /= (np.std(features, axis=0)+epsilon)
            features = torch.from_numpy(features).float()
        else:
            n = self.num_nodes
            features = torch.Tensor(n, n)
            features = nn.init.eye(features)

        sample = {'adj':adj,'adj_norm':adj_norm, 'features':features, 'adj_raw':adj_raw}
        return sample

# return adj and features from a list of graphs, batch size = 1, so that graphs can have various size each time
class GraphDataset_adj_batch_1(torch.utils.data.Dataset):
    """Graph Dataset"""

    def __init__(self, graphs, has_feature=True):
        self.graphs = graphs
        self.has_feature = has_feature

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        adj_raw = np.asarray(nx.to_numpy_matrix(self.graphs[idx]))
        np.fill_diagonal(adj_raw, 0)  # in case the self connection already exists
        n = adj_raw.shape[0]
        # give a permutation
        subgraph_idx = np.random.permutation(n)
        # subgraph_idx = np.arange(n)

        adj_raw = adj_raw[np.ix_(subgraph_idx, subgraph_idx)]

        adj = torch.from_numpy(adj_raw + np.eye(len(adj_raw))).float()
        adj_norm = torch.from_numpy(preprocess(adj_raw)).float()

        if self.has_feature:
            dictionary = nx.get_node_attributes(self.graphs[idx], 'feature')
            features = np.zeros((n, list(dictionary.values())[0].shape[0]))
            for i in range(n):
                features[i, :] = list(dictionary.values())[i]
            features = features[subgraph_idx, :]
            # normalize
            features -= np.mean(features, axis=0)
            epsilon = 1e-6
            features /= (np.std(features, axis=0) + epsilon)
            features = torch.from_numpy(features).float()
        else:
            features = torch.Tensor(n, n)
            features = nn.init.eye(features)

        sample = {'adj': adj, 'adj_norm': adj_norm, 'features': features}
        return sample


# get one node at a time, for a single graph
class GraphDataset(torch.utils.data.Dataset):
    """Graph Dataset"""
    def __init__(self, G, hops = 1, max_degree = 5, vocab_size = 35, embedding_dim = 35, embedding = None,  shuffle_neighbour = True):
        self.G = G
        self.shuffle_neighbour = shuffle_neighbour
        self.hops = hops
        self.max_degree = max_degree
        if embedding is None:
            self.embedding = torch.Tensor(vocab_size, embedding_dim)
            self.embedding = nn.init.eye(self.embedding)
        else:
            self.embedding = torch.from_numpy(embedding).float()
        print('embedding size', self.embedding.size())
    def __len__(self):
        return len(self.G.nodes())
    def __getitem__(self, idx):
        idx = idx+1
        idx_list = [idx]
        node_list = [self.embedding[idx].view(-1, self.embedding.size(1))]
        node_count_list = []
        for i in range(self.hops):
            # sample this hop
            adj_list = np.array([])
            adj_count_list = np.array([])
            for idx in idx_list:
                if self.shuffle_neighbour:
                    adj_list_new = list(self.G.adj[idx - 1])
                    random.shuffle(adj_list_new)
                    adj_list_new = np.array(adj_list_new) + 1
                else:
                    adj_list_new = np.array(list(self.G.adj[idx-1]))+1
                adj_count_list_new = np.array([len(adj_list_new)])
                adj_list = np.concatenate((adj_list, adj_list_new), axis=0)
                adj_count_list = np.concatenate((adj_count_list, adj_count_list_new), axis=0)
            # print(i, adj_list)
            # print(i, embedding(Variable(torch.from_numpy(adj_list)).long()))
            index = torch.from_numpy(adj_list).long()
            adj_list_emb = self.embedding[index]
            node_list.append(adj_list_emb)
            node_count_list.append(adj_count_list)
            idx_list = adj_list


        # padding, used as target
        idx_list = [idx]
        node_list_pad = [self.embedding[idx].view(-1, self.embedding.size(1))]
        node_count_list_pad = []
        node_adj_list = []
        for i in range(self.hops):
            adj_list = np.zeros(self.max_degree ** (i + 1))
            adj_count_list = np.ones(self.max_degree ** (i)) * self.max_degree
            for j, idx in enumerate(idx_list):
                if idx == 0:
                    adj_list_new = np.zeros(self.max_degree)
                else:
                    if self.shuffle_neighbour:
                        adj_list_new = list(self.G.adj[idx - 1])
                        # random.shuffle(adj_list_new)
                        adj_list_new = np.array(adj_list_new) + 1
                    else:
                        adj_list_new = np.array(list(self.G.adj[idx-1]))+1
                start_idx = j * self.max_degree
                incre_idx = min(self.max_degree, adj_list_new.shape[0])
                adj_list[start_idx:start_idx + incre_idx] = adj_list_new[:incre_idx]
            index = torch.from_numpy(adj_list).long()
            adj_list_emb = self.embedding[index]
            node_list_pad.append(adj_list_emb)
            node_count_list_pad.append(adj_count_list)
            idx_list = adj_list
            # calc adj matrix
            node_adj = torch.zeros(index.size(0),index.size(0))
            for first in range(index.size(0)):
                for second in range(first, index.size(0)):
                    if index[first]==index[second]:
                        node_adj[first,second] = 1
                        node_adj[second,first] = 1
                    elif self.G.has_edge(index[first],index[second]):
                        node_adj[first, second] = 0.5
                        node_adj[second, first] = 0.5
            node_adj_list.append(node_adj)


        node_list = list(reversed(node_list))
        node_count_list = list(reversed(node_count_list))
        node_list_pad = list(reversed(node_list_pad))
        node_count_list_pad = list(reversed(node_count_list_pad))
        node_adj_list = list(reversed(node_adj_list))
        sample = {'node_list':node_list, 'node_count_list':node_count_list,
                  'node_list_pad':node_list_pad, 'node_count_list_pad':node_count_list_pad, 'node_adj_list':node_adj_list}
        return sample