You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

data.py 52KB

5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392
  1. import torch
  2. import torchvision as tv
  3. import torch.nn as nn
  4. from torch.autograd import Variable
  5. import matplotlib.pyplot as plt
  6. from random import shuffle
  7. import networkx as nx
  8. import pickle as pkl
  9. import scipy.sparse as sp
  10. import logging
  11. import random
  12. import shutil
  13. import os
  14. import time
  15. from model import *
  16. from utils import *
  17. # load ENZYMES and PROTEIN and DD dataset
  18. def Graph_load_batch(min_num_nodes = 20, max_num_nodes = 1000, name = 'ENZYMES',node_attributes = True,graph_labels=True):
  19. '''
  20. load many graphs, e.g. enzymes
  21. :return: a list of graphs
  22. '''
  23. print('Loading graph dataset: '+str(name))
  24. G = nx.Graph()
  25. # load data
  26. path = 'dataset/'+name+'/'
  27. data_adj = np.loadtxt(path+name+'_A.txt', delimiter=',').astype(int)
  28. if node_attributes:
  29. data_node_att = np.loadtxt(path+name+'_node_attributes.txt', delimiter=',')
  30. data_node_label = np.loadtxt(path+name+'_node_labels.txt', delimiter=',').astype(int)
  31. data_graph_indicator = np.loadtxt(path+name+'_graph_indicator.txt', delimiter=',').astype(int)
  32. if graph_labels:
  33. data_graph_labels = np.loadtxt(path+name+'_graph_labels.txt', delimiter=',').astype(int)
  34. data_tuple = list(map(tuple, data_adj))
  35. # print(len(data_tuple))
  36. # print(data_tuple[0])
  37. # add edges
  38. G.add_edges_from(data_tuple)
  39. # add node attributes
  40. for i in range(data_node_label.shape[0]):
  41. if node_attributes:
  42. G.add_node(i+1, feature = data_node_att[i])
  43. G.add_node(i+1, label = data_node_label[i])
  44. G.remove_nodes_from(list(nx.isolates(G)))
  45. # print(G.number_of_nodes())
  46. # print(G.number_of_edges())
  47. # split into graphs
  48. graph_num = data_graph_indicator.max()
  49. node_list = np.arange(data_graph_indicator.shape[0])+1
  50. graphs = []
  51. max_nodes = 0
  52. for i in range(graph_num):
  53. # find the nodes for each graph
  54. nodes = node_list[data_graph_indicator==i+1]
  55. G_sub = G.subgraph(nodes)
  56. if graph_labels:
  57. G_sub.graph['label'] = data_graph_labels[i]
  58. # print('nodes', G_sub.number_of_nodes())
  59. # print('edges', G_sub.number_of_edges())
  60. # print('label', G_sub.graph)
  61. if G_sub.number_of_nodes()>=min_num_nodes and G_sub.number_of_nodes()<=max_num_nodes:
  62. graphs.append(G_sub)
  63. if G_sub.number_of_nodes() > max_nodes:
  64. max_nodes = G_sub.number_of_nodes()
  65. # print(G_sub.number_of_nodes(), 'i', i)
  66. # print('Graph dataset name: {}, total graph num: {}'.format(name, len(graphs)))
  67. # logging.warning('Graphs loaded, total num: {}'.format(len(graphs)))
  68. print('Loaded')
  69. return graphs
  70. def test_graph_load_DD():
  71. graphs, max_num_nodes = Graph_load_batch(min_num_nodes=10,name='DD',node_attributes=False,graph_labels=True)
  72. shuffle(graphs)
  73. plt.switch_backend('agg')
  74. plt.hist([len(graphs[i]) for i in range(len(graphs))], bins=100)
  75. plt.savefig('figures/test.png')
  76. plt.close()
  77. row = 4
  78. col = 4
  79. draw_graph_list(graphs[0:row*col], row=row,col=col, fname='figures/test')
  80. print('max num nodes',max_num_nodes)
  81. def parse_index_file(filename):
  82. index = []
  83. for line in open(filename):
  84. index.append(int(line.strip()))
  85. return index
  86. # load cora, citeseer and pubmed dataset
  87. def Graph_load(dataset = 'cora'):
  88. '''
  89. Load a single graph dataset
  90. :param dataset: dataset name
  91. :return:
  92. '''
  93. names = ['x', 'tx', 'allx', 'graph']
  94. objects = []
  95. for i in range(len(names)):
  96. load = pkl.load(open("dataset/ind.{}.{}".format(dataset, names[i]), 'rb'), encoding='latin1')
  97. # print('loaded')
  98. objects.append(load)
  99. # print(load)
  100. x, tx, allx, graph = tuple(objects)
  101. test_idx_reorder = parse_index_file("dataset/ind.{}.test.index".format(dataset))
  102. test_idx_range = np.sort(test_idx_reorder)
  103. if dataset == 'citeseer':
  104. # Fix citeseer dataset (there are some isolated nodes in the graph)
  105. # Find isolated nodes, add them as zero-vecs into the right position
  106. test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
  107. tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
  108. tx_extended[test_idx_range - min(test_idx_range), :] = tx
  109. tx = tx_extended
  110. features = sp.vstack((allx, tx)).tolil()
  111. features[test_idx_reorder, :] = features[test_idx_range, :]
  112. G = nx.from_dict_of_lists(graph)
  113. adj = nx.adjacency_matrix(G)
  114. return adj, features, G
  115. ######### code test ########
  116. # adj, features,G = Graph_load()
  117. # print(adj)
  118. # print(G.number_of_nodes(), G.number_of_edges())
  119. # _,_,G = Graph_load(dataset='citeseer')
  120. # G = max(nx.connected_component_subgraphs(G), key=len)
  121. # G = nx.convert_node_labels_to_integers(G)
  122. #
  123. # count = 0
  124. # max_node = 0
  125. # for i in range(G.number_of_nodes()):
  126. # G_ego = nx.ego_graph(G, i, radius=3)
  127. # # draw_graph(G_ego,prefix='test'+str(i))
  128. # m = G_ego.number_of_nodes()
  129. # if m>max_node:
  130. # max_node = m
  131. # if m>=50:
  132. # print(i, G_ego.number_of_nodes(), G_ego.number_of_edges())
  133. # count += 1
  134. # print('count', count)
  135. # print('max_node', max_node)
  136. def bfs_seq(G, start_id):
  137. '''
  138. get a bfs node sequence
  139. :param G:
  140. :param start_id:
  141. :return:
  142. '''
  143. dictionary = dict(nx.bfs_successors(G, start_id))
  144. start = [start_id]
  145. output = [start_id]
  146. while len(start) > 0:
  147. next = []
  148. while len(start) > 0:
  149. current = start.pop(0)
  150. neighbor = dictionary.get(current)
  151. if neighbor is not None:
  152. #### a wrong example, should not permute here!
  153. # shuffle(neighbor)
  154. next = next + neighbor
  155. output = output + next
  156. start = next
  157. return output
  158. def encode_adj(adj, max_prev_node=10, is_full = False):
  159. '''
  160. :param adj: n*n, rows means time step, while columns are input dimension
  161. :param max_degree: we want to keep row number, but truncate column numbers
  162. :return:
  163. '''
  164. if is_full:
  165. max_prev_node = adj.shape[0]-1
  166. # pick up lower tri
  167. adj = np.tril(adj, k=-1)
  168. n = adj.shape[0]
  169. adj = adj[1:n, 0:n-1]
  170. # use max_prev_node to truncate
  171. # note: now adj is a (n-1)*(n-1) matrix
  172. adj_output = np.zeros((adj.shape[0], max_prev_node))
  173. for i in range(adj.shape[0]):
  174. input_start = max(0, i - max_prev_node + 1)
  175. input_end = i + 1
  176. output_start = max_prev_node + input_start - input_end
  177. output_end = max_prev_node
  178. adj_output[i, output_start:output_end] = adj[i, input_start:input_end]
  179. adj_output[i,:] = adj_output[i,:][::-1] # reverse order
  180. return adj_output
  181. def decode_adj(adj_output):
  182. '''
  183. recover to adj from adj_output
  184. note: here adj_output have shape (n-1)*m
  185. '''
  186. max_prev_node = adj_output.shape[1]
  187. adj = np.zeros((adj_output.shape[0], adj_output.shape[0]))
  188. for i in range(adj_output.shape[0]):
  189. input_start = max(0, i - max_prev_node + 1)
  190. input_end = i + 1
  191. output_start = max_prev_node + max(0, i - max_prev_node + 1) - (i + 1)
  192. output_end = max_prev_node
  193. adj[i, input_start:input_end] = adj_output[i,::-1][output_start:output_end] # reverse order
  194. adj_full = np.zeros((adj_output.shape[0]+1, adj_output.shape[0]+1))
  195. n = adj_full.shape[0]
  196. adj_full[1:n, 0:n-1] = np.tril(adj, 0)
  197. adj_full = adj_full + adj_full.T
  198. return adj_full
  199. def encode_adj_flexible(adj):
  200. '''
  201. return a flexible length of output
  202. note that here there is no loss when encoding/decoding an adj matrix
  203. :param adj: adj matrix
  204. :return:
  205. '''
  206. # pick up lower tri
  207. adj = np.tril(adj, k=-1)
  208. n = adj.shape[0]
  209. adj = adj[1:n, 0:n-1]
  210. adj_output = []
  211. input_start = 0
  212. for i in range(adj.shape[0]):
  213. input_end = i + 1
  214. adj_slice = adj[i, input_start:input_end]
  215. adj_output.append(adj_slice)
  216. non_zero = np.nonzero(adj_slice)[0]
  217. input_start = input_end-len(adj_slice)+np.amin(non_zero)
  218. return adj_output
  219. def decode_adj_flexible(adj_output):
  220. '''
  221. return a flexible length of output
  222. note that here there is no loss when encoding/decoding an adj matrix
  223. :param adj: adj matrix
  224. :return:
  225. '''
  226. adj = np.zeros((len(adj_output), len(adj_output)))
  227. for i in range(len(adj_output)):
  228. output_start = i+1-len(adj_output[i])
  229. output_end = i+1
  230. adj[i, output_start:output_end] = adj_output[i]
  231. adj_full = np.zeros((len(adj_output)+1, len(adj_output)+1))
  232. n = adj_full.shape[0]
  233. adj_full[1:n, 0:n-1] = np.tril(adj, 0)
  234. adj_full = adj_full + adj_full.T
  235. return adj_full
  236. def test_encode_decode_adj():
  237. ######## code test ###########
  238. G = nx.ladder_graph(5)
  239. G = nx.grid_2d_graph(20,20)
  240. G = nx.ladder_graph(200)
  241. G = nx.karate_club_graph()
  242. G = nx.connected_caveman_graph(2,3)
  243. print(G.number_of_nodes())
  244. adj = np.asarray(nx.to_numpy_matrix(G))
  245. G = nx.from_numpy_matrix(adj)
  246. #
  247. start_idx = np.random.randint(adj.shape[0])
  248. x_idx = np.array(bfs_seq(G, start_idx))
  249. adj = adj[np.ix_(x_idx, x_idx)]
  250. print('adj\n',adj)
  251. adj_output = encode_adj(adj,max_prev_node=5)
  252. print('adj_output\n',adj_output)
  253. adj_recover = decode_adj(adj_output,max_prev_node=5)
  254. print('adj_recover\n',adj_recover)
  255. print('error\n',np.amin(adj_recover-adj),np.amax(adj_recover-adj))
  256. adj_output = encode_adj_flexible(adj)
  257. for i in range(len(adj_output)):
  258. print(len(adj_output[i]))
  259. adj_recover = decode_adj_flexible(adj_output)
  260. print(adj_recover)
  261. print(np.amin(adj_recover-adj),np.amax(adj_recover-adj))
  262. def encode_adj_full(adj):
  263. '''
  264. return a n-1*n-1*2 tensor, the first dimension is an adj matrix, the second show if each entry is valid
  265. :param adj: adj matrix
  266. :return:
  267. '''
  268. # pick up lower tri
  269. adj = np.tril(adj, k=-1)
  270. n = adj.shape[0]
  271. adj = adj[1:n, 0:n-1]
  272. adj_output = np.zeros((adj.shape[0],adj.shape[1],2))
  273. adj_len = np.zeros(adj.shape[0])
  274. for i in range(adj.shape[0]):
  275. non_zero = np.nonzero(adj[i,:])[0]
  276. input_start = np.amin(non_zero)
  277. input_end = i + 1
  278. adj_slice = adj[i, input_start:input_end]
  279. # write adj
  280. adj_output[i,0:adj_slice.shape[0],0] = adj_slice[::-1] # put in reverse order
  281. # write stop token (if token is 0, stop)
  282. adj_output[i,0:adj_slice.shape[0],1] = 1 # put in reverse order
  283. # write sequence length
  284. adj_len[i] = adj_slice.shape[0]
  285. return adj_output,adj_len
  286. def decode_adj_full(adj_output):
  287. '''
  288. return an adj according to adj_output
  289. :param
  290. :return:
  291. '''
  292. # pick up lower tri
  293. adj = np.zeros((adj_output.shape[0]+1,adj_output.shape[1]+1))
  294. for i in range(adj_output.shape[0]):
  295. non_zero = np.nonzero(adj_output[i,:,1])[0] # get valid sequence
  296. input_end = np.amax(non_zero)
  297. adj_slice = adj_output[i, 0:input_end+1, 0] # get adj slice
  298. # write adj
  299. output_end = i+1
  300. output_start = i+1-input_end-1
  301. adj[i+1,output_start:output_end] = adj_slice[::-1] # put in reverse order
  302. adj = adj + adj.T
  303. return adj
  304. def test_encode_decode_adj_full():
  305. ########### code test #############
  306. # G = nx.ladder_graph(10)
  307. G = nx.karate_club_graph()
  308. # get bfs adj
  309. adj = np.asarray(nx.to_numpy_matrix(G))
  310. G = nx.from_numpy_matrix(adj)
  311. start_idx = np.random.randint(adj.shape[0])
  312. x_idx = np.array(bfs_seq(G, start_idx))
  313. adj = adj[np.ix_(x_idx, x_idx)]
  314. adj_output, adj_len = encode_adj_full(adj)
  315. print('adj\n',adj)
  316. print('adj_output[0]\n',adj_output[:,:,0])
  317. print('adj_output[1]\n',adj_output[:,:,1])
  318. # print('adj_len\n',adj_len)
  319. adj_recover = decode_adj_full(adj_output)
  320. print('adj_recover\n', adj_recover)
  321. print('error\n',adj_recover-adj)
  322. print('error_sum\n',np.amax(adj_recover-adj), np.amin(adj_recover-adj))
  323. ########## use pytorch dataloader
  324. class Graph_sequence_sampler_pytorch(torch.utils.data.Dataset):
  325. def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
  326. self.adj_all = []
  327. self.len_all = []
  328. for G in G_list:
  329. self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
  330. self.len_all.append(G.number_of_nodes())
  331. if max_num_node is None:
  332. self.n = max(self.len_all)
  333. else:
  334. self.n = max_num_node
  335. if max_prev_node is None:
  336. print('calculating max previous node, total iteration: {}'.format(iteration))
  337. self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
  338. print('max previous node: {}'.format(self.max_prev_node))
  339. else:
  340. self.max_prev_node = max_prev_node
  341. # self.max_prev_node = max_prev_node
  342. # # sort Graph in descending order
  343. # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
  344. # self.len_all = [self.len_all[i] for i in len_batch_order]
  345. # self.adj_all = [self.adj_all[i] for i in len_batch_order]
  346. def __len__(self):
  347. return len(self.adj_all)
  348. def __getitem__(self, idx):
  349. adj_copy = self.adj_all[idx].copy()
  350. x_batch = np.zeros((self.n, self.max_prev_node)) # here zeros are padded for small graph
  351. x_batch[0,:] = 1 # the first input token is all ones
  352. y_batch = np.zeros((self.n, self.max_prev_node)) # here zeros are padded for small graph
  353. # generate input x, y pairs
  354. len_batch = adj_copy.shape[0]
  355. x_idx = np.random.permutation(adj_copy.shape[0])
  356. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  357. adj_copy_matrix = np.asmatrix(adj_copy)
  358. G = nx.from_numpy_matrix(adj_copy_matrix)
  359. # then do bfs in the permuted G
  360. start_idx = np.random.randint(adj_copy.shape[0])
  361. x_idx = np.array(bfs_seq(G, start_idx))
  362. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  363. adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
  364. # get x and y and adj
  365. # for small graph the rest are zero padded
  366. y_batch[0:adj_encoded.shape[0], :] = adj_encoded
  367. x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
  368. return {'x':x_batch,'y':y_batch, 'len':len_batch}
  369. def calc_max_prev_node(self, iter=20000,topk=10):
  370. max_prev_node = []
  371. for i in range(iter):
  372. if i % (iter / 5) == 0:
  373. print('iter {} times'.format(i))
  374. adj_idx = np.random.randint(len(self.adj_all))
  375. adj_copy = self.adj_all[adj_idx].copy()
  376. # print('Graph size', adj_copy.shape[0])
  377. x_idx = np.random.permutation(adj_copy.shape[0])
  378. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  379. adj_copy_matrix = np.asmatrix(adj_copy)
  380. G = nx.from_numpy_matrix(adj_copy_matrix)
  381. # then do bfs in the permuted G
  382. start_idx = np.random.randint(adj_copy.shape[0])
  383. x_idx = np.array(bfs_seq(G, start_idx))
  384. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  385. # encode adj
  386. adj_encoded = encode_adj_flexible(adj_copy.copy())
  387. max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
  388. max_prev_node.append(max_encoded_len)
  389. max_prev_node = sorted(max_prev_node)[-1*topk:]
  390. return max_prev_node
  391. ########## use pytorch dataloader
  392. class Graph_sequence_sampler_pytorch_nobfs(torch.utils.data.Dataset):
  393. def __init__(self, G_list, max_num_node=None):
  394. self.adj_all = []
  395. self.len_all = []
  396. for G in G_list:
  397. self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
  398. self.len_all.append(G.number_of_nodes())
  399. if max_num_node is None:
  400. self.n = max(self.len_all)
  401. else:
  402. self.n = max_num_node
  403. def __len__(self):
  404. return len(self.adj_all)
  405. def __getitem__(self, idx):
  406. adj_copy = self.adj_all[idx].copy()
  407. x_batch = np.zeros((self.n, self.n-1)) # here zeros are padded for small graph
  408. x_batch[0,:] = 1 # the first input token is all ones
  409. y_batch = np.zeros((self.n, self.n-1)) # here zeros are padded for small graph
  410. # generate input x, y pairs
  411. len_batch = adj_copy.shape[0]
  412. x_idx = np.random.permutation(adj_copy.shape[0])
  413. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  414. adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.n-1)
  415. # get x and y and adj
  416. # for small graph the rest are zero padded
  417. y_batch[0:adj_encoded.shape[0], :] = adj_encoded
  418. x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
  419. return {'x':x_batch,'y':y_batch, 'len':len_batch}
  420. # dataset = Graph_sequence_sampler_pytorch_nobfs(graphs)
  421. # print(dataset[1]['x'])
  422. # print(dataset[1]['y'])
  423. # print(dataset[1]['len'])
  424. ########## use pytorch dataloader
  425. class Graph_sequence_sampler_pytorch_canonical(torch.utils.data.Dataset):
  426. def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
  427. self.adj_all = []
  428. self.len_all = []
  429. for G in G_list:
  430. self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
  431. self.len_all.append(G.number_of_nodes())
  432. if max_num_node is None:
  433. self.n = max(self.len_all)
  434. else:
  435. self.n = max_num_node
  436. if max_prev_node is None:
  437. # print('calculating max previous node, total iteration: {}'.format(iteration))
  438. # self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
  439. # print('max previous node: {}'.format(self.max_prev_node))
  440. self.max_prev_node = self.n-1
  441. else:
  442. self.max_prev_node = max_prev_node
  443. # self.max_prev_node = max_prev_node
  444. # # sort Graph in descending order
  445. # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
  446. # self.len_all = [self.len_all[i] for i in len_batch_order]
  447. # self.adj_all = [self.adj_all[i] for i in len_batch_order]
  448. def __len__(self):
  449. return len(self.adj_all)
  450. def __getitem__(self, idx):
  451. adj_copy = self.adj_all[idx].copy()
  452. x_batch = np.zeros((self.n, self.max_prev_node)) # here zeros are padded for small graph
  453. x_batch[0,:] = 1 # the first input token is all ones
  454. y_batch = np.zeros((self.n, self.max_prev_node)) # here zeros are padded for small graph
  455. # generate input x, y pairs
  456. len_batch = adj_copy.shape[0]
  457. # adj_copy_matrix = np.asmatrix(adj_copy)
  458. # G = nx.from_numpy_matrix(adj_copy_matrix)
  459. # then do bfs in the permuted G
  460. # start_idx = G.number_of_nodes()-1
  461. # x_idx = np.array(bfs_seq(G, start_idx))
  462. # adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  463. adj_encoded = encode_adj(adj_copy, max_prev_node=self.max_prev_node)
  464. # get x and y and adj
  465. # for small graph the rest are zero padded
  466. y_batch[0:adj_encoded.shape[0], :] = adj_encoded
  467. x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
  468. return {'x':x_batch,'y':y_batch, 'len':len_batch}
  469. def calc_max_prev_node(self, iter=20000,topk=10):
  470. max_prev_node = []
  471. for i in range(iter):
  472. if i % (iter / 5) == 0:
  473. print('iter {} times'.format(i))
  474. adj_idx = np.random.randint(len(self.adj_all))
  475. adj_copy = self.adj_all[adj_idx].copy()
  476. # print('Graph size', adj_copy.shape[0])
  477. x_idx = np.random.permutation(adj_copy.shape[0])
  478. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  479. adj_copy_matrix = np.asmatrix(adj_copy)
  480. G = nx.from_numpy_matrix(adj_copy_matrix)
  481. # then do bfs in the permuted G
  482. start_idx = np.random.randint(adj_copy.shape[0])
  483. x_idx = np.array(bfs_seq(G, start_idx))
  484. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  485. # encode adj
  486. adj_encoded = encode_adj_flexible(adj_copy.copy())
  487. max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
  488. max_prev_node.append(max_encoded_len)
  489. max_prev_node = sorted(max_prev_node)[-1*topk:]
  490. return max_prev_node
  491. ########## use pytorch dataloader
  492. class Graph_sequence_sampler_pytorch_nll(torch.utils.data.Dataset):
  493. def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
  494. self.adj_all = []
  495. self.len_all = []
  496. for G in G_list:
  497. adj = np.asarray(nx.to_numpy_matrix(G))
  498. adj_temp = self.calc_adj(adj)
  499. self.adj_all.extend(adj_temp)
  500. self.len_all.append(G.number_of_nodes())
  501. if max_num_node is None:
  502. self.n = max(self.len_all)
  503. else:
  504. self.n = max_num_node
  505. if max_prev_node is None:
  506. # print('calculating max previous node, total iteration: {}'.format(iteration))
  507. # self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
  508. # print('max previous node: {}'.format(self.max_prev_node))
  509. self.max_prev_node = self.n-1
  510. else:
  511. self.max_prev_node = max_prev_node
  512. # self.max_prev_node = max_prev_node
  513. # # sort Graph in descending order
  514. # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
  515. # self.len_all = [self.len_all[i] for i in len_batch_order]
  516. # self.adj_all = [self.adj_all[i] for i in len_batch_order]
  517. def __len__(self):
  518. return len(self.adj_all)
  519. def __getitem__(self, idx):
  520. adj_copy = self.adj_all[idx].copy()
  521. x_batch = np.zeros((self.n, self.max_prev_node)) # here zeros are padded for small graph
  522. x_batch[0,:] = 1 # the first input token is all ones
  523. y_batch = np.zeros((self.n, self.max_prev_node)) # here zeros are padded for small graph
  524. # generate input x, y pairs
  525. len_batch = adj_copy.shape[0]
  526. # adj_copy_matrix = np.asmatrix(adj_copy)
  527. # G = nx.from_numpy_matrix(adj_copy_matrix)
  528. # then do bfs in the permuted G
  529. # start_idx = G.number_of_nodes()-1
  530. # x_idx = np.array(bfs_seq(G, start_idx))
  531. # adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  532. adj_encoded = encode_adj(adj_copy, max_prev_node=self.max_prev_node)
  533. # get x and y and adj
  534. # for small graph the rest are zero padded
  535. y_batch[0:adj_encoded.shape[0], :] = adj_encoded
  536. x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
  537. return {'x':x_batch,'y':y_batch, 'len':len_batch}
  538. def calc_adj(self,adj):
  539. max_iter = 10000
  540. adj_all = [adj]
  541. adj_all_len = 1
  542. i_old = 0
  543. for i in range(max_iter):
  544. adj_copy = adj.copy()
  545. x_idx = np.random.permutation(adj_copy.shape[0])
  546. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  547. adj_copy_matrix = np.asmatrix(adj_copy)
  548. G = nx.from_numpy_matrix(adj_copy_matrix)
  549. # then do bfs in the permuted G
  550. start_idx = np.random.randint(adj_copy.shape[0])
  551. x_idx = np.array(bfs_seq(G, start_idx))
  552. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  553. add_flag = True
  554. for adj_exist in adj_all:
  555. if np.array_equal(adj_exist, adj_copy):
  556. add_flag = False
  557. break
  558. if add_flag:
  559. adj_all.append(adj_copy)
  560. adj_all_len += 1
  561. if adj_all_len % 10 ==0:
  562. print('adj found:',adj_all_len,'iter used',i)
  563. return adj_all
  564. # graphs = [nx.barabasi_albert_graph(20,3)]
  565. # graphs = [nx.grid_2d_graph(4,4)]
  566. # dataset = Graph_sequence_sampler_pytorch_nll(graphs)
  567. ############## below are codes not used in current version
  568. ############## they are based on pytorch default data loader, we should consider reimplement them in current datasets, since they are more efficient
  569. # normal version
  570. class Graph_sequence_sampler_truncate():
  571. '''
  572. the output will truncate according to the max_prev_node
  573. '''
  574. def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node = 25):
  575. self.batch_size = batch_size
  576. self.n = max_node_num
  577. self.max_prev_node = max_prev_node
  578. self.adj_all = []
  579. for G in G_list:
  580. self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
  581. def sample(self):
  582. # batch, length, feature
  583. x_batch = np.zeros((self.batch_size, self.n, self.max_prev_node)) # here zeros are padded for small graph
  584. y_batch = np.zeros((self.batch_size, self.n, self.max_prev_node)) # here zeros are padded for small graph
  585. len_batch = np.zeros(self.batch_size)
  586. # generate input x, y pairs
  587. for i in range(self.batch_size):
  588. # first sample and get a permuted adj
  589. adj_idx = np.random.randint(len(self.adj_all))
  590. adj_copy = self.adj_all[adj_idx].copy()
  591. len_batch[i] = adj_copy.shape[0]
  592. x_idx = np.random.permutation(adj_copy.shape[0])
  593. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  594. adj_copy_matrix = np.asmatrix(adj_copy)
  595. G = nx.from_numpy_matrix(adj_copy_matrix)
  596. # then do bfs in the permuted G
  597. start_idx = np.random.randint(adj_copy.shape[0])
  598. x_idx = np.array(bfs_seq(G, start_idx))
  599. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  600. adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
  601. # get x and y and adj
  602. # for small graph the rest are zero padded
  603. y_batch[i, 0:adj_encoded.shape[0], :] = adj_encoded
  604. x_batch[i, 1:adj_encoded.shape[0]+1, :] = adj_encoded
  605. # sort in descending order
  606. len_batch_order = np.argsort(len_batch)[::-1]
  607. len_batch = len_batch[len_batch_order]
  608. x_batch = x_batch[len_batch_order,:,:]
  609. y_batch = y_batch[len_batch_order,:,:]
  610. return torch.from_numpy(x_batch).float(), torch.from_numpy(y_batch).float(), len_batch.astype('int').tolist()
  611. def calc_max_prev_node(self,iter):
  612. max_prev_node = []
  613. for i in range(iter):
  614. if i%(iter/10)==0:
  615. print(i)
  616. adj_idx = np.random.randint(len(self.adj_all))
  617. adj_copy = self.adj_all[adj_idx].copy()
  618. # print('Graph size', adj_copy.shape[0])
  619. x_idx = np.random.permutation(adj_copy.shape[0])
  620. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  621. adj_copy_matrix = np.asmatrix(adj_copy)
  622. G = nx.from_numpy_matrix(adj_copy_matrix)
  623. time1 = time.time()
  624. # then do bfs in the permuted G
  625. start_idx = np.random.randint(adj_copy.shape[0])
  626. x_idx = np.array(bfs_seq(G, start_idx))
  627. adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
  628. # encode adj
  629. adj_encoded = encode_adj_flexible(adj_copy.copy())
  630. max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
  631. max_prev_node.append(max_encoded_len)
  632. max_prev_node = sorted(max_prev_node)[-100:]
  633. return max_prev_node
  634. # graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='DD',node_attributes=False)
  635. # dataset = Graph_sequence_sampler_truncate([nx.karate_club_graph()])
  636. # max_prev_nodes = dataset.calc_max_prev_node(iter=10000)
  637. # print(max_prev_nodes)
  638. # x,y,len = dataset.sample()
  639. # print('x',x)
  640. # print('y',y)
  641. # print(len)
  642. # only output y_batch (which is needed in batch version of new model)
  class Graph_sequence_sampler_fast():
      """Random batch sampler that returns only the encoded target ``y``.

      Every input graph is converted to a dense adjacency array once at
      construction time; ``sample`` then draws graphs with replacement.
      """

      def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node = 25):
          """Store the batch geometry and cache one dense adjacency per graph.

          Args:
              G_list: iterable of graphs accepted by ``nx.to_numpy_matrix``.
              max_node_num: length of the sequence axis of the returned batch.
              batch_size: number of sequences per call to ``sample``.
              max_prev_node: width passed to ``encode_adj`` and size of the
                  feature axis of the returned batch.
          """
          self.batch_size = batch_size
          self.G_list = G_list
          self.n = max_node_num
          self.max_prev_node = max_prev_node
          self.adj_all = [np.asarray(nx.to_numpy_matrix(graph)) for graph in G_list]

      def sample(self):
          """Draw one batch of encoded adjacency sequences.

          Returns:
              torch.FloatTensor of shape
              ``(batch_size, max_node_num, max_prev_node)``; positions beyond
              the encoded length of a graph stay zero.
          """
          # batch, length, feature -- zeros act as padding for small graphs
          targets = np.zeros((self.batch_size, self.n, self.max_prev_node))
          for slot in range(self.batch_size):
              # pick a graph and relabel its nodes at random
              chosen = np.random.randint(len(self.adj_all))
              adj = self.adj_all[chosen].copy()
              perm = np.random.permutation(adj.shape[0])
              adj = adj[np.ix_(perm, perm)]
              permuted_graph = nx.from_numpy_matrix(np.asmatrix(adj))
              # reorder along the sequence bfs_seq yields from a random start
              start = np.random.randint(adj.shape[0])
              order = np.array(bfs_seq(permuted_graph, start))
              adj = adj[np.ix_(order, order)]
              # truncated encoding of the reordered adjacency
              encoded = encode_adj(adj.copy(), max_prev_node=self.max_prev_node)
              targets[slot, 0:encoded.shape[0], :] = encoded
          return torch.from_numpy(targets).float()
  716. # graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='PROTEINS_full')
  717. # print(max_num_nodes)
  718. # G = nx.ladder_graph(100)
  719. # # G1 = nx.karate_club_graph()
  720. # # G2 = nx.connected_caveman_graph(4,5)
  721. # G_list = [G]
  722. # dataset = Graph_sequence_sampler_fast(graphs, batch_size=128, max_node_num=max_num_nodes, max_prev_node=30)
  723. # for i in range(5):
  724. # time0 = time.time()
  725. # y = dataset.sample()
  726. # time1 = time.time()
  727. # print(i,'time', time1 - time0)
  728. # output size is flexible (using list to represent), batch size is 1
  class Graph_sequence_sampler_flexible():
      """Single-graph sampler whose encoded output has flexible row lengths.

      The encoding comes from ``encode_adj_flexible`` and is returned as-is,
      so no padding or batching is performed here.
      """

      def __init__(self, G_list):
          """Cache a dense adjacency array for every graph in ``G_list``."""
          self.G_list = G_list
          self.adj_all = [np.asarray(nx.to_numpy_matrix(graph)) for graph in G_list]
          # holds the most recent encoding produced by sample()
          self.y_batch = []

      def sample(self):
          """Encode one randomly chosen, randomly relabelled graph.

          Returns:
              tuple: ``(encoded, adj)`` where ``encoded`` is the output of
              ``encode_adj_flexible`` (also stored on ``self.y_batch``) and
              ``adj`` is the reordered adjacency array it was computed from.
          """
          # pick a graph and relabel its nodes at random
          chosen = np.random.randint(len(self.adj_all))
          adj = self.adj_all[chosen].copy()
          perm = np.random.permutation(adj.shape[0])
          adj = adj[np.ix_(perm, perm)]
          permuted_graph = nx.from_numpy_matrix(np.asmatrix(adj))
          # reorder along the sequence bfs_seq yields from a random start
          start = np.random.randint(adj.shape[0])
          order = np.array(bfs_seq(permuted_graph, start))
          adj = adj[np.ix_(order, order)]
          # variable-length encoding of the reordered adjacency
          self.y_batch = encode_adj_flexible(adj.copy())
          return self.y_batch, adj
  795. # G = nx.ladder_graph(5)
  796. # # G = nx.grid_2d_graph(20,20)
  797. # # G = nx.ladder_graph(200)
  798. # graphs = [G]
  799. #
  800. # graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='ENZYMES')
  801. # sampler = Graph_sequence_sampler_flexible(graphs)
  802. #
  803. # y_max_all = []
  804. # for i in range(10000):
  805. # y_raw,adj_copy = sampler.sample()
  806. # y_max = max(len(y_raw[i]) for i in range(len(y_raw)))
  807. # y_max_all.append(y_max)
  808. # # print('max bfs node',y_max)
  809. # print('max', max(y_max_all))
  810. # print(y[1])
  811. # print(Variable(torch.FloatTensor(y[1])).cuda(CUDA))
  812. ########### potential use: an encoder along with the GraphRNN decoder
  813. # preprocess the adjacency matrix
  def preprocess(A):
      """Return the symmetrically normalised adjacency with self-loops.

      Computes ``D^-1/2 (A + I) D^-1/2`` where ``D`` is the diagonal matrix
      with ``D[i, i] = sum_j A[i, j] + 1``.

      Args:
          A: square adjacency matrix (ndarray, nested list or ``np.matrix``).

      Returns:
          numpy.ndarray: float array with the same shape as ``A``.
      """
      # Coerce to a plain ndarray: with np.matrix input, flatten() stays 2-D
      # and np.diag would extract a diagonal instead of building one.
      adj = np.asarray(A, dtype=float)
      # degree of each node, +1 for the self-loop that is added below
      inv_sqrt_degree = np.power(adj.sum(axis=1) + 1, -0.5)
      adj_hat = adj + np.eye(adj.shape[0])
      # Scaling rows and columns is the same as D @ A_hat @ D for diagonal D,
      # without building D or doing two dense matrix products.
      return inv_sqrt_degree[:, np.newaxis] * adj_hat * inv_sqrt_degree[np.newaxis, :]
  832. # truncate the output sequence to save representation, and allow for infinite generation
  833. # now having a list of graphs
  class Graph_sequence_sampler_bfs_permute_truncate_multigraph():
      """Batch sampler over a list of graphs with truncated adjacency encoding.

      Besides the encoded input/target sequences, each batch also carries the
      raw adjacency, the normalised adjacency from ``preprocess`` and a node
      feature tensor.
      """

      def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node = 25, feature = None):
          """Store the batch geometry and cache one dense adjacency per graph.

          Args:
              G_list: iterable of graphs accepted by ``nx.to_numpy_matrix``.
              max_node_num: size of the node axis of every returned tensor.
              batch_size: number of graphs per call to ``sample``.
              max_prev_node: width passed to ``encode_adj``.
              feature: ``None`` selects one-hot node features; any other value
                  selects the two-column (degree, clustering) features.
          """
          self.batch_size = batch_size
          self.G_list = G_list
          self.n = max_node_num
          self.max_prev_node = max_prev_node
          self.adj_all = []
          for G in G_list:
              self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
          self.has_feature = feature

      def sample(self):
          """Draw one batch.

          Returns:
              tuple of five float tensors
              ``(x, y, adj, adj_norm, features)``:
              ``x`` and ``y`` are ``(batch, n, max_prev_node)``, where ``x`` is
              ``y`` shifted down by one row with an all-zero first row;
              ``adj`` and ``adj_norm`` are ``(batch, n, n)``; ``features`` is
              ``(batch, n, n)`` for one-hot features and ``(batch, n, 2)``
              otherwise. Unused positions are zero padded.
          """
          one_hot = self.has_feature is None
          # batch, length, feature -- zeros act as padding for small graphs
          x_batch = np.zeros((self.batch_size, self.n, self.max_prev_node))
          y_batch = np.zeros((self.batch_size, self.n, self.max_prev_node))
          # batch, size, size
          adj_batch = np.zeros((self.batch_size, self.n, self.n))
          adj_norm_batch = np.zeros((self.batch_size, self.n, self.n))
          # one-hot features need n columns, degree + clustering need two
          feature_batch = np.zeros((self.batch_size, self.n, self.n if one_hot else 2))

          for i in range(self.batch_size):
              # pick a graph and relabel its nodes at random
              adj_idx = np.random.randint(len(self.adj_all))
              adj_copy = self.adj_all[adj_idx].copy()
              x_idx = np.random.permutation(adj_copy.shape[0])
              adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
              G = nx.from_numpy_matrix(np.asmatrix(adj_copy))
              # reorder along the sequence bfs_seq yields from a random start
              start_idx = np.random.randint(adj_copy.shape[0])
              x_idx = np.array(bfs_seq(G, start_idx))
              adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
              size = adj_copy.shape[0]

              adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
              steps = adj_encoded.shape[0]
              y_batch[i, 0:steps, :] = adj_encoded
              # the input is the target delayed by one step
              x_batch[i, 1:steps + 1, :] = adj_encoded
              adj_batch[i, 0:size, 0:size] = adj_copy
              adj_norm_batch[i, 0:size, 0:size] = preprocess(adj_copy)

              if one_hot:
                  feature_batch[i, 0:size, 0:size] = np.eye(size)
              else:
                  # Only computed when requested; it used to run for every
                  # sample even though the one-hot path discards the result.
                  # NOTE(review): G.nodes()[i] and .values() on G.degree()
                  # look like the networkx 1.x API -- confirm the pinned
                  # networkx version before relying on this branch.
                  node_list = [G.nodes()[i] for i in x_idx]
                  feature_degree = np.array(list(G.degree(node_list).values()))[:,np.newaxis]
                  feature_clustering = np.array(list(nx.clustering(G,nodes=node_list).values()))[:,np.newaxis]
                  feature_batch[i, 0:size, :] = np.concatenate((feature_degree, feature_clustering), axis=1)

          return torch.from_numpy(x_batch).float(), torch.from_numpy(y_batch).float(),\
              torch.from_numpy(adj_batch).float(), torch.from_numpy(adj_norm_batch).float(), torch.from_numpy(feature_batch).float()
  942. # generate own synthetic dataset
  def Graph_synthetic(seed):
      """Generate a random 100-node feature graph and print its statistics.

      Node features are five one-hot prototypes, each repeated 20 times, plus
      Gaussian noise of scale 0.05. An edge ``(i, j)`` is drawn with
      probability proportional to the row-normalised inverse squared distance
      between the features of ``i`` and ``j``.

      Args:
          seed: value passed to ``np.random.seed``; note that this reseeds the
              global NumPy generator.

      Returns:
          tuple: ``(G, node_features)`` -- the generated ``nx.Graph`` and the
          ``(100, 5)`` feature array.
      """
      G = nx.Graph()
      np.random.seed(seed)
      base = np.repeat(np.eye(5), 20, axis=0)
      rand = np.random.randn(100, 5) * 0.05
      node_features = base + rand
      num_nodes = node_features.shape[0]

      # The L1 diagonal stays 1 so the printed minimum ignores self-distance;
      # the inverse-distance diagonal stays 0 so no self edges get weight.
      node_distance_l1 = np.ones((num_nodes, num_nodes))
      node_distance_np = np.zeros((num_nodes, num_nodes))
      for i in range(num_nodes):
          for j in range(num_nodes):
              if i != j:
                  abs_diff = np.abs(node_features[i] - node_features[j])
                  node_distance_l1[i, j] = np.sum(abs_diff)
                  node_distance_np[i, j] = 1 / np.sum(abs_diff ** 2)
      print('node distance max', np.max(node_distance_l1))
      print('node distance min', np.min(node_distance_l1))
      node_distance_np_sum = np.sum(node_distance_np, axis=1, keepdims=True)
      embedding_dist = node_distance_np / node_distance_np_sum

      # generate the graph
      average_degree = 9
      for i in range(num_nodes):
          for j in range(i + 1, num_nodes):
              p = np.random.rand()
              if p < embedding_dist[i, j] * average_degree:
                  G.add_edge(i, j)
      # materialise first: nx.isolates may be a lazy generator over G
      G.remove_nodes_from(list(nx.isolates(G)))

      print('num of nodes', G.number_of_nodes())
      print('num of edges', G.number_of_edges())
      # The statistics are diagnostics only; they must not abort generation
      # when they are undefined for the sampled graph.
      if G.number_of_nodes() > 0:
          G_deg = nx.degree_histogram(G)
          G_deg_sum = [count * degree for degree, count in enumerate(G_deg)]
          print('average degree', sum(G_deg_sum) / G.number_of_nodes())
          if nx.is_connected(G):
              print('average path length', nx.average_shortest_path_length(G))
              print('diameter', nx.diameter(G))
          else:
              print('average path length and diameter undefined: graph is disconnected')
          G_cluster = sorted(list(nx.clustering(G).values()))
          print('average clustering coefficient', sum(G_cluster) / len(G_cluster))
      print('Graph generation complete!')
      return G, node_features
  984. # G = Graph_synthetic(10)
  985. # return adj and features from a single graph
  class GraphDataset_adj(torch.utils.data.Dataset):
      """Single-graph dataset exposing adjacency, normalised adjacency and features.

      The node order is randomly permuted once at construction time and the
      dataset then always yields that one sample.
      """

      def __init__(self, G, features=None):
          """Build the tensors for graph ``G``.

          Args:
              G: graph accepted by ``nx.to_numpy_matrix``.
              features: optional ``(num_nodes, dim)`` numpy array of node
                  features; when ``None`` an identity (one-hot) matrix is used.
          """
          self.G = G
          self.n = G.number_of_nodes()
          adj = np.asarray(nx.to_numpy_matrix(self.G))
          # permute adj
          subgraph_idx = np.random.permutation(self.n)
          adj = adj[np.ix_(subgraph_idx, subgraph_idx)]
          # adjacency with self-loops, and its normalised counterpart
          self.adj = torch.from_numpy(adj+np.eye(len(adj))).float()
          self.adj_norm = torch.from_numpy(preprocess(adj)).float()
          if features is None:
              # torch.eye replaces the deprecated nn.init.eye on an
              # uninitialised tensor; the resulting values are identical
              self.features = torch.eye(self.n, self.n)
          else:
              # keep the feature rows aligned with the permuted adjacency
              features = features[subgraph_idx,:]
              self.features = torch.from_numpy(features).float()
          print('embedding size', self.features.size())

      def __len__(self):
          """The dataset holds exactly one graph."""
          return 1

      def __getitem__(self, idx):
          """Return the single precomputed sample; ``idx`` is ignored."""
          sample = {'adj':self.adj,'adj_norm':self.adj_norm, 'features':self.features}
          return sample
  1010. # G = nx.karate_club_graph()
  1011. # dataset = GraphDataset_adj(G)
  1012. # train_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True, num_workers=1)
  1013. # for data in train_loader:
  1014. # print(data)
  1015. # return adj and features from a list of graphs
  class GraphDataset_adj_batch(torch.utils.data.Dataset):
      """Dataset over a list of graphs that yields fixed-size random subgraphs."""

      def __init__(self, graphs, has_feature = True, num_nodes = 20):
          """Store the configuration.

          Args:
              graphs: indexable collection of graphs.
              has_feature: when true, read the ``'feature'`` node attribute;
                  otherwise use identity (one-hot) features.
              num_nodes: number of nodes sampled from each graph.
          """
          self.graphs = graphs
          self.has_feature = has_feature
          self.num_nodes = num_nodes

      def __len__(self):
          """Number of graphs."""
          return len(self.graphs)

      def __getitem__(self, idx):
          """Sample a random ``num_nodes``-node induced subgraph of graph ``idx``.

          Returns:
              dict with float tensors ``'adj'`` (adjacency plus identity),
              ``'adj_norm'`` (output of ``preprocess``), ``'features'`` and
              ``'adj_raw'`` (adjacency without self-loops).
          """
          adj_raw = np.asarray(nx.to_numpy_matrix(self.graphs[idx]))
          np.fill_diagonal(adj_raw,0) # in case the self connection already exists
          # sample num_nodes size subgraph
          # NOTE(review): a graph with fewer than num_nodes nodes yields a
          # smaller adjacency than feature matrix -- callers presumably
          # guarantee large enough graphs; confirm.
          subgraph_idx = np.random.permutation(adj_raw.shape[0])[0:self.num_nodes]
          adj_raw = adj_raw[np.ix_(subgraph_idx,subgraph_idx)]
          adj = torch.from_numpy(adj_raw+np.eye(len(adj_raw))).float()
          adj_norm = torch.from_numpy(preprocess(adj_raw)).float()
          adj_raw = torch.from_numpy(adj_raw).float()
          if self.has_feature:
              dictionary = nx.get_node_attributes(self.graphs[idx], 'feature')
              # materialise once; rebuilding the list per row was quadratic
              # NOTE(review): assumes the attribute dict iterates in the same
              # node order as the adjacency rows -- confirm.
              node_features = list(dictionary.values())
              features = np.zeros((self.num_nodes, node_features[0].shape[0]))
              for i in range(self.num_nodes):
                  features[i, :] = node_features[subgraph_idx[i]]
              # normalize each feature column to zero mean / unit variance
              features -= np.mean(features, axis=0)
              epsilon = 1e-6  # avoids division by zero for constant columns
              features /= (np.std(features, axis=0)+epsilon)
              features = torch.from_numpy(features).float()
          else:
              # torch.eye replaces the deprecated nn.init.eye on an
              # uninitialised tensor; the resulting values are identical
              features = torch.eye(self.num_nodes, self.num_nodes)
          sample = {'adj':adj,'adj_norm':adj_norm, 'features':features, 'adj_raw':adj_raw}
          return sample
  1049. # return adj and features from a list of graphs, batch size = 1, so that graphs can have various size each time
  class GraphDataset_adj_batch_1(torch.utils.data.Dataset):
      """Dataset over a list of graphs that yields each whole graph, permuted.

      Samples keep the size of their graph, so sizes can differ between items.
      """

      def __init__(self, graphs, has_feature=True):
          """Store the configuration.

          Args:
              graphs: indexable collection of graphs.
              has_feature: when true, read the ``'feature'`` node attribute;
                  otherwise use identity (one-hot) features.
          """
          self.graphs = graphs
          self.has_feature = has_feature

      def __len__(self):
          """Number of graphs."""
          return len(self.graphs)

      def __getitem__(self, idx):
          """Return graph ``idx`` under a fresh random node permutation.

          Returns:
              dict with float tensors ``'adj'`` (adjacency plus identity),
              ``'adj_norm'`` (output of ``preprocess``) and ``'features'``.
          """
          adj_raw = np.asarray(nx.to_numpy_matrix(self.graphs[idx]))
          np.fill_diagonal(adj_raw, 0) # in case the self connection already exists
          n = adj_raw.shape[0]
          # give a permutation
          subgraph_idx = np.random.permutation(n)
          adj_raw = adj_raw[np.ix_(subgraph_idx, subgraph_idx)]
          adj = torch.from_numpy(adj_raw + np.eye(len(adj_raw))).float()
          adj_norm = torch.from_numpy(preprocess(adj_raw)).float()
          if self.has_feature:
              dictionary = nx.get_node_attributes(self.graphs[idx], 'feature')
              # materialise once; rebuilding the list per row was quadratic
              # NOTE(review): assumes the attribute dict iterates in the same
              # node order as the adjacency rows -- confirm.
              node_features = list(dictionary.values())
              features = np.zeros((n, node_features[0].shape[0]))
              for i in range(n):
                  features[i, :] = node_features[i]
              # apply the same permutation as the adjacency
              features = features[subgraph_idx, :]
              # normalize each feature column to zero mean / unit variance
              features -= np.mean(features, axis=0)
              epsilon = 1e-6  # avoids division by zero for constant columns
              features /= (np.std(features, axis=0) + epsilon)
              features = torch.from_numpy(features).float()
          else:
              # torch.eye replaces the deprecated nn.init.eye on an
              # uninitialised tensor; the resulting values are identical
              features = torch.eye(n, n)
          sample = {'adj': adj, 'adj_norm': adj_norm, 'features': features}
          return sample
  1083. # get one node at a time, for a single graph
  class GraphDataset(torch.utils.data.Dataset):
      """Per-node dataset yielding the multi-hop neighbourhood of one node.

      Embedding row ``k`` (``k >= 1``) belongs to graph node ``k - 1``; row 0
      is used as padding in the fixed-size ("pad") outputs.
      """

      def __init__(self, G, hops = 1, max_degree = 5, vocab_size = 35, embedding_dim = 35, embedding = None, shuffle_neighbour = True):
          """Store the graph and build the embedding table.

          Args:
              G: graph exposing ``adj``, ``nodes()`` and ``has_edge``.
              hops: number of neighbourhood expansions per sample.
              max_degree: neighbours kept per node in the padded outputs.
              vocab_size: rows of the default identity embedding.
              embedding_dim: columns of the default identity embedding.
              embedding: optional numpy array used instead of the identity.
              shuffle_neighbour: shuffle each neighbour list in the unpadded
                  outputs (the padded outputs are never shuffled).
          """
          self.G = G
          self.shuffle_neighbour = shuffle_neighbour
          self.hops = hops
          self.max_degree = max_degree
          if embedding is None:
              # torch.eye replaces the deprecated nn.init.eye on an
              # uninitialised tensor; the resulting values are identical
              self.embedding = torch.eye(vocab_size, embedding_dim)
          else:
              self.embedding = torch.from_numpy(embedding).float()
          print('embedding size', self.embedding.size())

      def __len__(self):
          """Number of nodes in the graph."""
          return len(self.G.nodes())

      def _neighbour_rows(self, row, shuffle):
          """Return the embedding rows of the neighbours of embedding row ``row``."""
          neighbours = list(self.G.adj[int(row) - 1])
          if shuffle:
              random.shuffle(neighbours)
          return np.array(neighbours) + 1

      def __getitem__(self, idx):
          """Build the neighbourhood sample rooted at graph node ``idx``.

          Returns:
              dict with five lists, each ordered from the outermost hop to the
              root: ``'node_list'`` / ``'node_count_list'`` (variable-size
              neighbourhoods and per-parent neighbour counts),
              ``'node_list_pad'`` / ``'node_count_list_pad'`` (fixed-size,
              zero-padded counterparts) and ``'node_adj_list'`` (one square
              matrix per hop over the padded rows).
          """
          # The root row lives in its own variable. The loops below used to
          # reuse ``idx`` and overwrite it, so for hops > 1 the padded pass
          # started from the last visited neighbour instead of the root.
          root = idx + 1
          root_emb = self.embedding[root].view(-1, self.embedding.size(1))

          # variable-size neighbourhoods
          frontier = [root]
          node_list = [root_emb]
          node_count_list = []
          for _ in range(self.hops):
              # the leading empty float arrays keep the float64 dtype the
              # concatenated results always had
              row_parts = [np.array([])]
              count_parts = [np.array([])]
              for row in frontier:
                  new_rows = self._neighbour_rows(row, self.shuffle_neighbour)
                  row_parts.append(new_rows)
                  count_parts.append(np.array([len(new_rows)]))
              adj_list = np.concatenate(row_parts, axis=0)
              adj_count_list = np.concatenate(count_parts, axis=0)
              index = torch.from_numpy(adj_list).long()
              node_list.append(self.embedding[index])
              node_count_list.append(adj_count_list)
              frontier = adj_list

          # padding, used as target
          frontier = [root]
          node_list_pad = [self.embedding[root].view(-1, self.embedding.size(1))]
          node_count_list_pad = []
          node_adj_list = []
          for hop in range(self.hops):
              adj_list = np.zeros(self.max_degree ** (hop + 1))
              adj_count_list = np.ones(self.max_degree ** hop) * self.max_degree
              for slot, row in enumerate(frontier):
                  if row == 0:
                      # padding rows only produce more padding
                      new_rows = np.zeros(self.max_degree)
                  else:
                      new_rows = self._neighbour_rows(row, False)
                  start_idx = slot * self.max_degree
                  incre_idx = min(self.max_degree, new_rows.shape[0])
                  adj_list[start_idx:start_idx + incre_idx] = new_rows[:incre_idx]
              index = torch.from_numpy(adj_list).long()
              node_list_pad.append(self.embedding[index])
              node_count_list_pad.append(adj_count_list)
              frontier = adj_list
              # calc adj matrix: 1 where two slots hold the same row,
              # 0.5 where has_edge reports an edge
              # NOTE(review): has_edge receives 1-based embedding rows (and
              # tensor scalars), while graph nodes are addressed as row - 1
              # everywhere else -- looks like an off-by-one; left unchanged,
              # confirm the intended behaviour.
              size = index.size(0)
              node_adj = torch.zeros(size, size)
              for first in range(size):
                  for second in range(first, size):
                      if index[first]==index[second]:
                          node_adj[first,second] = 1
                          node_adj[second,first] = 1
                      elif self.G.has_edge(index[first],index[second]):
                          node_adj[first, second] = 0.5
                          node_adj[second, first] = 0.5
              node_adj_list.append(node_adj)

          # callers receive every list ordered from the outermost hop inwards
          node_list = list(reversed(node_list))
          node_count_list = list(reversed(node_count_list))
          node_list_pad = list(reversed(node_list_pad))
          node_count_list_pad = list(reversed(node_count_list_pad))
          node_adj_list = list(reversed(node_adj_list))
          sample = {'node_list':node_list, 'node_count_list':node_count_list,
                    'node_list_pad':node_list_pad, 'node_count_list_pad':node_count_list_pad, 'node_adj_list':node_adj_list}
          return sample