
data.py

import pickle as pkl
import random
from random import shuffle
import time

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn

from model import *
from utils import *


# load the ENZYMES, PROTEINS and DD datasets
def Graph_load_batch(min_num_nodes=20, max_num_nodes=1000, name='ENZYMES', node_attributes=True, graph_labels=True):
    '''
    load many graphs, e.g. ENZYMES
    :return: a list of graphs, plus the maximum node count among them
    '''
    print('Loading graph dataset: ' + str(name))
    G = nx.Graph()
    # load data
    path = 'dataset/' + name + '/'
    data_adj = np.loadtxt(path + name + '_A.txt', delimiter=',').astype(int)
    if node_attributes:
        data_node_att = np.loadtxt(path + name + '_node_attributes.txt', delimiter=',')
    data_node_label = np.loadtxt(path + name + '_node_labels.txt', delimiter=',').astype(int)
    data_graph_indicator = np.loadtxt(path + name + '_graph_indicator.txt', delimiter=',').astype(int)
    if graph_labels:
        data_graph_labels = np.loadtxt(path + name + '_graph_labels.txt', delimiter=',').astype(int)
    data_tuple = list(map(tuple, data_adj))
    # print(len(data_tuple))
    # print(data_tuple[0])
    # add edges
    G.add_edges_from(data_tuple)
    # add node attributes (nodes are 1-indexed in the raw files)
    for i in range(data_node_label.shape[0]):
        if node_attributes:
            G.add_node(i + 1, feature=data_node_att[i])
        G.add_node(i + 1, label=data_node_label[i])
    G.remove_nodes_from(list(nx.isolates(G)))
    # print(G.number_of_nodes())
    # print(G.number_of_edges())
    # split into graphs
    graph_num = data_graph_indicator.max()
    node_list = np.arange(data_graph_indicator.shape[0]) + 1
    graphs = []
    max_nodes = 0
    for i in range(graph_num):
        # find the nodes for each graph
        nodes = node_list[data_graph_indicator == i + 1]
        G_sub = G.subgraph(nodes)
        if graph_labels:
            G_sub.graph['label'] = data_graph_labels[i]
        # print('nodes', G_sub.number_of_nodes())
        # print('edges', G_sub.number_of_edges())
        # print('label', G_sub.graph)
        if min_num_nodes <= G_sub.number_of_nodes() <= max_num_nodes:
            graphs.append(G_sub)
            if G_sub.number_of_nodes() > max_nodes:
                max_nodes = G_sub.number_of_nodes()
            # print(G_sub.number_of_nodes(), 'i', i)
    # print('Graph dataset name: {}, total graph num: {}'.format(name, len(graphs)))
    # logging.warning('Graphs loaded, total num: {}'.format(len(graphs)))
    print('Loaded')
    return graphs, max_nodes

def test_graph_load_DD():
    graphs, max_num_nodes = Graph_load_batch(min_num_nodes=10, name='DD', node_attributes=False, graph_labels=True)
    shuffle(graphs)
    plt.switch_backend('agg')
    plt.hist([len(graphs[i]) for i in range(len(graphs))], bins=100)
    plt.savefig('figures/test.png')
    plt.close()
    row = 4
    col = 4
    draw_graph_list(graphs[0:row * col], row=row, col=col, fname='figures/test')
    print('max num nodes', max_num_nodes)

def parse_index_file(filename):
    index = []
    for line in open(filename):
        index.append(int(line.strip()))
    return index

# load the cora, citeseer and pubmed datasets
def Graph_load(dataset='cora'):
    '''
    Load a single graph dataset
    :param dataset: dataset name
    :return: adjacency matrix, features, and the networkx graph
    '''
    names = ['x', 'tx', 'allx', 'graph']
    objects = []
    for i in range(len(names)):
        load = pkl.load(open("dataset/ind.{}.{}".format(dataset, names[i]), 'rb'), encoding='latin1')
        # print('loaded')
        objects.append(load)
        # print(load)
    x, tx, allx, graph = tuple(objects)
    test_idx_reorder = parse_index_file("dataset/ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)
    if dataset == 'citeseer':
        # fix the citeseer dataset (there are some isolated nodes in the graph)
        # find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    G = nx.from_dict_of_lists(graph)
    adj = nx.adjacency_matrix(G)
    return adj, features, G

######### code test ########
# adj, features, G = Graph_load()
# print(adj)
# print(G.number_of_nodes(), G.number_of_edges())
# _, _, G = Graph_load(dataset='citeseer')
# G = max(nx.connected_component_subgraphs(G), key=len)
# G = nx.convert_node_labels_to_integers(G)
#
# count = 0
# max_node = 0
# for i in range(G.number_of_nodes()):
#     G_ego = nx.ego_graph(G, i, radius=3)
#     # draw_graph(G_ego, prefix='test'+str(i))
#     m = G_ego.number_of_nodes()
#     if m > max_node:
#         max_node = m
#     if m >= 50:
#         print(i, G_ego.number_of_nodes(), G_ego.number_of_edges())
#         count += 1
# print('count', count)
# print('max_node', max_node)

def bfs_seq(G, start_id):
    '''
    get a BFS node sequence
    :param G: graph
    :param start_id: starting node
    :return: list of node ids in BFS order
    '''
    dictionary = dict(nx.bfs_successors(G, start_id))
    start = [start_id]
    output = [start_id]
    while len(start) > 0:
        next_level = []
        while len(start) > 0:
            current = start.pop(0)
            neighbor = dictionary.get(current)
            if neighbor is not None:
                #### a wrong example, should not permute here!
                # shuffle(neighbor)
                next_level = next_level + neighbor
        output = output + next_level
        start = next_level
    return output

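# a minimal sketch (not part of the original file) of what bfs_seq produces,
# following the test-function convention used elsewhere in this file; assumes a
# 4-node path graph 0-1-2-3, so a BFS from node 2 should reach its neighbors
# 1 and 3 before the remaining node 0
def test_bfs_seq():
    G = nx.path_graph(4)
    print(bfs_seq(G, 2))  # expected: [2, 1, 3, 0]
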
def encode_adj(adj, max_prev_node=10, is_full=False):
    '''
    :param adj: n*n adjacency matrix; rows are time steps, columns are input dimensions
    :param max_prev_node: keep the number of rows, but truncate the number of columns
    :return: (n-1)*max_prev_node encoded matrix
    '''
    if is_full:
        max_prev_node = adj.shape[0] - 1
    # pick up the lower triangle
    adj = np.tril(adj, k=-1)
    n = adj.shape[0]
    adj = adj[1:n, 0:n - 1]
    # use max_prev_node to truncate
    # note: now adj is a (n-1)*(n-1) matrix
    adj_output = np.zeros((adj.shape[0], max_prev_node))
    for i in range(adj.shape[0]):
        input_start = max(0, i - max_prev_node + 1)
        input_end = i + 1
        output_start = max_prev_node + input_start - input_end
        output_end = max_prev_node
        adj_output[i, output_start:output_end] = adj[i, input_start:input_end]
        adj_output[i, :] = adj_output[i, :][::-1]  # reverse order
    return adj_output

def decode_adj(adj_output):
    '''
    recover the adj matrix from adj_output
    note: here adj_output has shape (n-1)*m
    '''
    max_prev_node = adj_output.shape[1]
    adj = np.zeros((adj_output.shape[0], adj_output.shape[0]))
    for i in range(adj_output.shape[0]):
        input_start = max(0, i - max_prev_node + 1)
        input_end = i + 1
        output_start = max_prev_node + input_start - input_end
        output_end = max_prev_node
        adj[i, input_start:input_end] = adj_output[i, ::-1][output_start:output_end]  # reverse order
    adj_full = np.zeros((adj_output.shape[0] + 1, adj_output.shape[0] + 1))
    n = adj_full.shape[0]
    adj_full[1:n, 0:n - 1] = np.tril(adj, 0)
    adj_full = adj_full + adj_full.T
    return adj_full

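# a minimal round-trip sketch (not part of the original file): encode a 4-node
# path graph with max_prev_node=2 and check that decode_adj recovers the original
# adjacency; each encoded row comes out as [1, 0], i.e. each node connects only
# to the immediately previous node in the ordering
def test_encode_decode_path():
    adj = np.asarray(nx.to_numpy_matrix(nx.path_graph(4)))
    adj_output = encode_adj(adj, max_prev_node=2)  # shape (3, 2)
    adj_recover = decode_adj(adj_output)
    assert np.array_equal(adj, adj_recover)
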
def encode_adj_flexible(adj):
    '''
    return a list of variable-length rows;
    note that there is no loss when encoding/decoding an adj matrix this way
    :param adj: adj matrix
    :return: list of 1-d arrays, one per time step
    '''
    # pick up the lower triangle
    adj = np.tril(adj, k=-1)
    n = adj.shape[0]
    adj = adj[1:n, 0:n - 1]
    adj_output = []
    input_start = 0
    for i in range(adj.shape[0]):
        input_end = i + 1
        adj_slice = adj[i, input_start:input_end]
        adj_output.append(adj_slice)
        non_zero = np.nonzero(adj_slice)[0]
        input_start = input_end - len(adj_slice) + np.amin(non_zero)
    return adj_output

def decode_adj_flexible(adj_output):
    '''
    recover the adj matrix from a variable-length encoding;
    note that there is no loss when encoding/decoding an adj matrix this way
    :param adj_output: list of 1-d arrays produced by encode_adj_flexible
    :return: full adj matrix
    '''
    adj = np.zeros((len(adj_output), len(adj_output)))
    for i in range(len(adj_output)):
        output_start = i + 1 - len(adj_output[i])
        output_end = i + 1
        adj[i, output_start:output_end] = adj_output[i]
    adj_full = np.zeros((len(adj_output) + 1, len(adj_output) + 1))
    n = adj_full.shape[0]
    adj_full[1:n, 0:n - 1] = np.tril(adj, 0)
    adj_full = adj_full + adj_full.T
    return adj_full

def test_encode_decode_adj():
    ######## code test ###########
    # pick one test graph (the uncommented assignment is used)
    # G = nx.ladder_graph(5)
    # G = nx.grid_2d_graph(20, 20)
    # G = nx.ladder_graph(200)
    # G = nx.karate_club_graph()
    G = nx.connected_caveman_graph(2, 3)
    print(G.number_of_nodes())
    adj = np.asarray(nx.to_numpy_matrix(G))
    G = nx.from_numpy_matrix(adj)
    start_idx = np.random.randint(adj.shape[0])
    x_idx = np.array(bfs_seq(G, start_idx))
    adj = adj[np.ix_(x_idx, x_idx)]
    print('adj\n', adj)
    adj_output = encode_adj(adj, max_prev_node=5)
    print('adj_output\n', adj_output)
    adj_recover = decode_adj(adj_output)
    print('adj_recover\n', adj_recover)
    print('error\n', np.amin(adj_recover - adj), np.amax(adj_recover - adj))
    adj_output = encode_adj_flexible(adj)
    for i in range(len(adj_output)):
        print(len(adj_output[i]))
    adj_recover = decode_adj_flexible(adj_output)
    print(adj_recover)
    print(np.amin(adj_recover - adj), np.amax(adj_recover - adj))

def encode_adj_full(adj):
    '''
    return an (n-1)*(n-1)*2 tensor: [:, :, 0] holds the adj rows,
    [:, :, 1] marks whether each entry is valid
    :param adj: adj matrix
    :return: encoded tensor and the per-row sequence lengths
    '''
    # pick up the lower triangle
    adj = np.tril(adj, k=-1)
    n = adj.shape[0]
    adj = adj[1:n, 0:n - 1]
    adj_output = np.zeros((adj.shape[0], adj.shape[1], 2))
    adj_len = np.zeros(adj.shape[0])
    for i in range(adj.shape[0]):
        non_zero = np.nonzero(adj[i, :])[0]
        input_start = np.amin(non_zero)
        input_end = i + 1
        adj_slice = adj[i, input_start:input_end]
        # write adj (in reverse order)
        adj_output[i, 0:adj_slice.shape[0], 0] = adj_slice[::-1]
        # write stop token (if token is 0, stop)
        adj_output[i, 0:adj_slice.shape[0], 1] = 1
        # write sequence length
        adj_len[i] = adj_slice.shape[0]
    return adj_output, adj_len

def decode_adj_full(adj_output):
    '''
    recover an adj matrix from adj_output
    :param adj_output: tensor produced by encode_adj_full
    :return: full adj matrix
    '''
    # pick up the lower triangle
    adj = np.zeros((adj_output.shape[0] + 1, adj_output.shape[1] + 1))
    for i in range(adj_output.shape[0]):
        non_zero = np.nonzero(adj_output[i, :, 1])[0]  # get the valid sequence
        input_end = np.amax(non_zero)
        adj_slice = adj_output[i, 0:input_end + 1, 0]  # get the adj slice
        # write adj
        output_end = i + 1
        output_start = i + 1 - input_end - 1
        adj[i + 1, output_start:output_end] = adj_slice[::-1]  # put back in reverse order
    adj = adj + adj.T
    return adj

def test_encode_decode_adj_full():
    ########### code test #############
    # G = nx.ladder_graph(10)
    G = nx.karate_club_graph()
    # get bfs adj
    adj = np.asarray(nx.to_numpy_matrix(G))
    G = nx.from_numpy_matrix(adj)
    start_idx = np.random.randint(adj.shape[0])
    x_idx = np.array(bfs_seq(G, start_idx))
    adj = adj[np.ix_(x_idx, x_idx)]
    adj_output, adj_len = encode_adj_full(adj)
    print('adj\n', adj)
    print('adj_output[0]\n', adj_output[:, :, 0])
    print('adj_output[1]\n', adj_output[:, :, 1])
    # print('adj_len\n', adj_len)
    adj_recover = decode_adj_full(adj_output)
    print('adj_recover\n', adj_recover)
    print('error\n', adj_recover - adj)
    print('error_sum\n', np.amax(adj_recover - adj), np.amin(adj_recover - adj))

########## use pytorch dataloader
class Graph_sequence_sampler_pytorch(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node
        if max_prev_node is None:
            print('calculating max previous node, total iteration: {}'.format(iteration))
            self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
            print('max previous node: {}'.format(self.max_prev_node))
        else:
            self.max_prev_node = max_prev_node
        # self.max_prev_node = max_prev_node
        # # sort graphs in descending order
        # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
        # self.len_all = [self.len_all[i] for i in len_batch_order]
        # self.adj_all = [self.adj_all[i] for i in len_batch_order]

    def __len__(self):
        return len(self.adj_all)

    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graphs
        x_batch[0, :] = 1  # the first input token is all ones
        y_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graphs
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        x_idx = np.random.permutation(adj_copy.shape[0])
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_copy_matrix = np.asmatrix(adj_copy)
        G = nx.from_numpy_matrix(adj_copy_matrix)
        # then do bfs in the permuted G
        start_idx = np.random.randint(adj_copy.shape[0])
        x_idx = np.array(bfs_seq(G, start_idx))
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
        # get x and y and adj
        # for small graphs the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x': x_batch, 'y': y_batch, 'len': len_batch}

    def calc_max_prev_node(self, iter=20000, topk=10):
        max_prev_node = []
        for i in range(iter):
            if i % (iter // 5) == 0:
                print('iter {} times'.format(i))
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('Graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # encode adj
            adj_encoded = encode_adj_flexible(adj_copy.copy())
            max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
            max_prev_node.append(max_encoded_len)
        max_prev_node = sorted(max_prev_node)[-1 * topk:]
        return max_prev_node

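# a minimal usage sketch (not part of the original file): since the sampler is a
# torch Dataset, it can be wrapped in a standard DataLoader; the graphs, batch
# size and max_prev_node below are illustrative choices only
def test_graph_sequence_sampler_pytorch():
    graphs = [nx.karate_club_graph(), nx.ladder_graph(10)]
    dataset = Graph_sequence_sampler_pytorch(graphs, max_prev_node=20)
    loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
    for batch in loader:
        print(batch['x'].shape, batch['y'].shape, batch['len'])
        break
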
########## use pytorch dataloader
class Graph_sequence_sampler_pytorch_nobfs(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node

    def __len__(self):
        return len(self.adj_all)

    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.n - 1))  # here zeros are padded for small graphs
        x_batch[0, :] = 1  # the first input token is all ones
        y_batch = np.zeros((self.n, self.n - 1))  # here zeros are padded for small graphs
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        x_idx = np.random.permutation(adj_copy.shape[0])
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.n - 1)
        # get x and y and adj
        # for small graphs the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x': x_batch, 'y': y_batch, 'len': len_batch}

# dataset = Graph_sequence_sampler_pytorch_nobfs(graphs)
# print(dataset[1]['x'])
# print(dataset[1]['y'])
# print(dataset[1]['len'])

########## use pytorch dataloader
class Graph_sequence_sampler_pytorch_canonical(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node
        if max_prev_node is None:
            # print('calculating max previous node, total iteration: {}'.format(iteration))
            # self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
            # print('max previous node: {}'.format(self.max_prev_node))
            self.max_prev_node = self.n - 1
        else:
            self.max_prev_node = max_prev_node
        # self.max_prev_node = max_prev_node
        # # sort graphs in descending order
        # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
        # self.len_all = [self.len_all[i] for i in len_batch_order]
        # self.adj_all = [self.adj_all[i] for i in len_batch_order]

    def __len__(self):
        return len(self.adj_all)

    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graphs
        x_batch[0, :] = 1  # the first input token is all ones
        y_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graphs
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        # adj_copy_matrix = np.asmatrix(adj_copy)
        # G = nx.from_numpy_matrix(adj_copy_matrix)
        # # then do bfs in the permuted G
        # start_idx = G.number_of_nodes() - 1
        # x_idx = np.array(bfs_seq(G, start_idx))
        # adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_encoded = encode_adj(adj_copy, max_prev_node=self.max_prev_node)
        # get x and y and adj
        # for small graphs the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x': x_batch, 'y': y_batch, 'len': len_batch}

    def calc_max_prev_node(self, iter=20000, topk=10):
        max_prev_node = []
        for i in range(iter):
            if i % (iter // 5) == 0:
                print('iter {} times'.format(i))
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('Graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # encode adj
            adj_encoded = encode_adj_flexible(adj_copy.copy())
            max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
            max_prev_node.append(max_encoded_len)
        max_prev_node = sorted(max_prev_node)[-1 * topk:]
        return max_prev_node

########## use pytorch dataloader
class Graph_sequence_sampler_pytorch_nll(torch.utils.data.Dataset):
    def __init__(self, G_list, max_num_node=None, max_prev_node=None, iteration=20000):
        self.adj_all = []
        self.len_all = []
        for G in G_list:
            adj = np.asarray(nx.to_numpy_matrix(G))
            adj_temp = self.calc_adj(adj)
            self.adj_all.extend(adj_temp)
            self.len_all.append(G.number_of_nodes())
        if max_num_node is None:
            self.n = max(self.len_all)
        else:
            self.n = max_num_node
        if max_prev_node is None:
            # print('calculating max previous node, total iteration: {}'.format(iteration))
            # self.max_prev_node = max(self.calc_max_prev_node(iter=iteration))
            # print('max previous node: {}'.format(self.max_prev_node))
            self.max_prev_node = self.n - 1
        else:
            self.max_prev_node = max_prev_node
        # self.max_prev_node = max_prev_node
        # # sort graphs in descending order
        # len_batch_order = np.argsort(np.array(self.len_all))[::-1]
        # self.len_all = [self.len_all[i] for i in len_batch_order]
        # self.adj_all = [self.adj_all[i] for i in len_batch_order]

    def __len__(self):
        return len(self.adj_all)

    def __getitem__(self, idx):
        adj_copy = self.adj_all[idx].copy()
        x_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graphs
        x_batch[0, :] = 1  # the first input token is all ones
        y_batch = np.zeros((self.n, self.max_prev_node))  # here zeros are padded for small graphs
        # generate input x, y pairs
        len_batch = adj_copy.shape[0]
        # adj_copy_matrix = np.asmatrix(adj_copy)
        # G = nx.from_numpy_matrix(adj_copy_matrix)
        # # then do bfs in the permuted G
        # start_idx = G.number_of_nodes() - 1
        # x_idx = np.array(bfs_seq(G, start_idx))
        # adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_encoded = encode_adj(adj_copy, max_prev_node=self.max_prev_node)
        # get x and y and adj
        # for small graphs the rest are zero padded
        y_batch[0:adj_encoded.shape[0], :] = adj_encoded
        x_batch[1:adj_encoded.shape[0] + 1, :] = adj_encoded
        return {'x': x_batch, 'y': y_batch, 'len': len_batch}

    def calc_adj(self, adj):
        max_iter = 10000
        adj_all = [adj]
        adj_all_len = 1
        i_old = 0
        for i in range(max_iter):
            adj_copy = adj.copy()
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            add_flag = True
            for adj_exist in adj_all:
                if np.array_equal(adj_exist, adj_copy):
                    add_flag = False
                    break
            if add_flag:
                adj_all.append(adj_copy)
                adj_all_len += 1
            if adj_all_len % 10 == 0:
                print('adj found:', adj_all_len, 'iter used', i)
        return adj_all

# graphs = [nx.barabasi_albert_graph(20, 3)]
# graphs = [nx.grid_2d_graph(4, 4)]
# dataset = Graph_sequence_sampler_pytorch_nll(graphs)

############## below is code not used in the current version
############## it is based on the pytorch default data loader; we should consider reimplementing the current datasets the same way, since it is more efficient

# normal version
class Graph_sequence_sampler_truncate():
    '''
    the output will be truncated according to max_prev_node
    '''
    def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node=25):
        self.batch_size = batch_size
        self.n = max_node_num
        self.max_prev_node = max_prev_node
        self.adj_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))

    def sample(self):
        # batch, length, feature
        x_batch = np.zeros((self.batch_size, self.n, self.max_prev_node))  # here zeros are padded for small graphs
        y_batch = np.zeros((self.batch_size, self.n, self.max_prev_node))  # here zeros are padded for small graphs
        len_batch = np.zeros(self.batch_size)
        # generate input x, y pairs
        for i in range(self.batch_size):
            # first sample and get a permuted adj
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            len_batch[i] = adj_copy.shape[0]
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
            # get x and y and adj
            # for small graphs the rest are zero padded
            y_batch[i, 0:adj_encoded.shape[0], :] = adj_encoded
            x_batch[i, 1:adj_encoded.shape[0] + 1, :] = adj_encoded
        # sort in descending order
        len_batch_order = np.argsort(len_batch)[::-1]
        len_batch = len_batch[len_batch_order]
        x_batch = x_batch[len_batch_order, :, :]
        y_batch = y_batch[len_batch_order, :, :]
        return torch.from_numpy(x_batch).float(), torch.from_numpy(y_batch).float(), len_batch.astype('int').tolist()

    def calc_max_prev_node(self, iter):
        max_prev_node = []
        for i in range(iter):
            if i % (iter // 10) == 0:
                print(i)
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('Graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            time1 = time.time()
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # encode adj
            adj_encoded = encode_adj_flexible(adj_copy.copy())
            max_encoded_len = max([len(adj_encoded[i]) for i in range(len(adj_encoded))])
            max_prev_node.append(max_encoded_len)
        max_prev_node = sorted(max_prev_node)[-100:]
        return max_prev_node

# graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='DD', node_attributes=False)
# dataset = Graph_sequence_sampler_truncate([nx.karate_club_graph()])
# max_prev_nodes = dataset.calc_max_prev_node(iter=10000)
# print(max_prev_nodes)
# x, y, len = dataset.sample()
# print('x', x)
# print('y', y)
# print(len)

# only output y_batch (which is needed in the batch version of the new model)
class Graph_sequence_sampler_fast():
    def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node=25):
        self.batch_size = batch_size
        self.G_list = G_list
        self.n = max_node_num
        self.max_prev_node = max_prev_node
        self.adj_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))

    def sample(self):
        # batch, length, feature
        y_batch = np.zeros((self.batch_size, self.n, self.max_prev_node))  # here zeros are padded for small graphs
        # generate input x, y pairs
        for i in range(self.batch_size):
            # first sample and get a permuted adj
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # get the feature for the permuted G
            # dict = nx.bfs_successors(G, start_idx)
            # print('dict', dict, 'node num', self.G.number_of_nodes())
            # print('x idx', x_idx, 'len', len(x_idx))
            # print('adj')
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_copy.shape[0]):
            #     print(adj_copy[print_i].astype(int))
            # adj_before = adj_copy.copy()
            # encode adj
            adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
            # print('adj encoded')
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_copy.shape[0]):
            #     print(adj_copy[print_i].astype(int))
            # decode adj
            # print('adj recover error')
            # adj_decode = decode_adj(adj_encoded.copy(), max_prev_node=self.max_prev_node)
            # adj_err = adj_decode - adj_copy
            # print(np.sum(adj_err))
            # if np.sum(adj_err) != 0:
            #     print(adj_err)
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_err.shape[0]):
            #     print(adj_err[print_i].astype(int))
            # get x and y and adj
            # for small graphs the rest are zero padded
            y_batch[i, 0:adj_encoded.shape[0], :] = adj_encoded
            # np.set_printoptions(linewidth=200, precision=3)
            # print('y\n')
            # for print_i in range(self.y_batch[i, :, :].shape[0]):
            #     print(self.y_batch[i, :, :][print_i].astype(int))
            # print('x\n')
            # for print_i in range(self.x_batch[i, :, :].shape[0]):
            #     print(self.x_batch[i, :, :][print_i].astype(int))
            # print('adj\n')
            # for print_i in range(self.adj_batch[i, :, :].shape[0]):
            #     print(self.adj_batch[i, :, :][print_i].astype(int))
            # print('adj_norm\n')
            # for print_i in range(self.adj_norm_batch[i, :, :].shape[0]):
            #     print(self.adj_norm_batch[i, :, :][print_i].astype(float))
            # print('feature\n')
            # for print_i in range(self.feature_batch[i, :, :].shape[0]):
            #     print(self.feature_batch[i, :, :][print_i].astype(float))
        # print('x_batch\n', self.x_batch)
        # print('y_batch\n', self.y_batch)
        return torch.from_numpy(y_batch).float()

# graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='PROTEINS_full')
# print(max_num_nodes)
# G = nx.ladder_graph(100)
# # G1 = nx.karate_club_graph()
# # G2 = nx.connected_caveman_graph(4, 5)
# G_list = [G]
# dataset = Graph_sequence_sampler_fast(graphs, batch_size=128, max_node_num=max_num_nodes, max_prev_node=30)
# for i in range(5):
#     time0 = time.time()
#     y = dataset.sample()
#     time1 = time.time()
#     print(i, 'time', time1 - time0)

# output size is flexible (using a list to represent it), batch size is 1
class Graph_sequence_sampler_flexible():
    def __init__(self, G_list):
        self.G_list = G_list
        self.adj_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
        self.y_batch = []

    def sample(self):
        # generate input x, y pairs
        # first sample and get a permuted adj
        adj_idx = np.random.randint(len(self.adj_all))
        adj_copy = self.adj_all[adj_idx].copy()
        # print('graph size', adj_copy.shape[0])
        x_idx = np.random.permutation(adj_copy.shape[0])
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        adj_copy_matrix = np.asmatrix(adj_copy)
        G = nx.from_numpy_matrix(adj_copy_matrix)
        # then do bfs in the permuted G
        start_idx = np.random.randint(adj_copy.shape[0])
        x_idx = np.array(bfs_seq(G, start_idx))
        adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
        # get the feature for the permuted G
        # dict = nx.bfs_successors(G, start_idx)
        # print('dict', dict, 'node num', self.G.number_of_nodes())
        # print('x idx', x_idx, 'len', len(x_idx))
        # print('adj')
        # np.set_printoptions(linewidth=200)
        # for print_i in range(adj_copy.shape[0]):
        #     print(adj_copy[print_i].astype(int))
        # adj_before = adj_copy.copy()
        # encode adj
        adj_encoded = encode_adj_flexible(adj_copy.copy())
        # print('adj encoded')
        # np.set_printoptions(linewidth=200)
        # for print_i in range(adj_copy.shape[0]):
        #     print(adj_copy[print_i].astype(int))
        # decode adj
        # print('adj recover error')
        # adj_decode = decode_adj(adj_encoded.copy(), max_prev_node=self.max_prev_node)
        # adj_err = adj_decode - adj_copy
        # print(np.sum(adj_err))
        # if np.sum(adj_err) != 0:
        #     print(adj_err)
        # np.set_printoptions(linewidth=200)
        # for print_i in range(adj_err.shape[0]):
        #     print(adj_err[print_i].astype(int))
        # get x and y and adj
        # for small graphs the rest are zero padded
        self.y_batch = adj_encoded
        # np.set_printoptions(linewidth=200, precision=3)
        # print('y\n')
        # for print_i in range(self.y_batch[i, :, :].shape[0]):
        #     print(self.y_batch[i, :, :][print_i].astype(int))
        # print('x\n')
        # for print_i in range(self.x_batch[i, :, :].shape[0]):
        #     print(self.x_batch[i, :, :][print_i].astype(int))
        # print('adj\n')
        # for print_i in range(self.adj_batch[i, :, :].shape[0]):
        #     print(self.adj_batch[i, :, :][print_i].astype(int))
        # print('adj_norm\n')
        # for print_i in range(self.adj_norm_batch[i, :, :].shape[0]):
        #     print(self.adj_norm_batch[i, :, :][print_i].astype(float))
        # print('feature\n')
        # for print_i in range(self.feature_batch[i, :, :].shape[0]):
        #     print(self.feature_batch[i, :, :][print_i].astype(float))
        return self.y_batch, adj_copy

# G = nx.ladder_graph(5)
# # G = nx.grid_2d_graph(20, 20)
# # G = nx.ladder_graph(200)
# graphs = [G]
#
# graphs, max_num_nodes = Graph_load_batch(min_num_nodes=6, name='ENZYMES')
# sampler = Graph_sequence_sampler_flexible(graphs)
#
# y_max_all = []
# for i in range(10000):
#     y_raw, adj_copy = sampler.sample()
#     y_max = max(len(y_raw[i]) for i in range(len(y_raw)))
#     y_max_all.append(y_max)
#     # print('max bfs node', y_max)
# print('max', max(y_max_all))
# print(y[1])
# print(Variable(torch.FloatTensor(y[1])).cuda(CUDA))

########### potential use: an encoder along with the GraphRNN decoder
# preprocess the adjacency matrix
def preprocess(A):
    # get the size of the adjacency matrix
    size = len(A)
    # get the degree of each node (+1 accounts for the self-loop added below)
    degrees = np.sum(A, axis=1) + 1
    # create the diagonal matrix D^{-1/2} from the node degrees
    D = np.diag(np.power(degrees, -0.5).flatten())
    # Cholesky decomposition of D
    # D = np.linalg.cholesky(D)
    # inverse of the Cholesky decomposition of D
    # D = np.linalg.inv(D)
    # create an identity matrix of size x size
    I = np.eye(size)
    # create A hat (A with self-loops)
    A_hat = A + I
    # return the normalized adjacency D^{-1/2} (A + I) D^{-1/2}
    A_normal = np.dot(np.dot(D, A_hat), D)
    return A_normal

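# a small check (not part of the original file) that preprocess computes the
# GCN-style normalization D^{-1/2} (A + I) D^{-1/2}, where D is the degree
# matrix of A + I: the result is symmetric and its diagonal entries equal
# 1 / (deg_i + 1)
def test_preprocess():
    A = np.asarray(nx.to_numpy_matrix(nx.path_graph(3)))
    A_normal = preprocess(A)
    assert np.allclose(A_normal, A_normal.T)
    assert np.allclose(np.diag(A_normal), 1 / (A.sum(axis=1) + 1))
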
# truncate the output sequence to save representation, and allow for infinite generation
# now handles a list of graphs
class Graph_sequence_sampler_bfs_permute_truncate_multigraph():
    def __init__(self, G_list, max_node_num=25, batch_size=4, max_prev_node=25, feature=None):
        self.batch_size = batch_size
        self.G_list = G_list
        self.n = max_node_num
        self.max_prev_node = max_prev_node
        self.adj_all = []
        for G in G_list:
            self.adj_all.append(np.asarray(nx.to_numpy_matrix(G)))
        self.has_feature = feature

    def sample(self):
        # batch, length, feature
        # self.x_batch = np.ones((self.batch_size, self.n - 1, self.max_prev_node))
        x_batch = np.zeros((self.batch_size, self.n, self.max_prev_node))  # here zeros are padded for small graphs
        # self.x_batch[:, 0, :] = np.ones((self.batch_size, self.max_prev_node))  # first input is all ones
        # batch, length, feature
        y_batch = np.zeros((self.batch_size, self.n, self.max_prev_node))  # here zeros are padded for small graphs
        # batch, length, length
        adj_batch = np.zeros((self.batch_size, self.n, self.n))  # here zeros are padded for small graphs
        # batch, size, size
        adj_norm_batch = np.zeros((self.batch_size, self.n, self.n))  # here zeros are padded for small graphs
        # batch, size, feature_len: degree and clustering coefficient
        if self.has_feature is None:
            feature_batch = np.zeros((self.batch_size, self.n, self.n))  # use one-hot features
        else:
            feature_batch = np.zeros((self.batch_size, self.n, 2))
        # generate input x, y pairs
        for i in range(self.batch_size):
            time0 = time.time()
            # first sample and get a permuted adj
            adj_idx = np.random.randint(len(self.adj_all))
            adj_copy = self.adj_all[adj_idx].copy()
            # print('Graph size', adj_copy.shape[0])
            x_idx = np.random.permutation(adj_copy.shape[0])
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            adj_copy_matrix = np.asmatrix(adj_copy)
            G = nx.from_numpy_matrix(adj_copy_matrix)
            time1 = time.time()
            # then do bfs in the permuted G
            start_idx = np.random.randint(adj_copy.shape[0])
            x_idx = np.array(bfs_seq(G, start_idx))
            adj_copy = adj_copy[np.ix_(x_idx, x_idx)]
            # get the features for the permuted G
            node_list = [G.nodes()[i] for i in x_idx]
            feature_degree = np.array(list(G.degree(node_list).values()))[:, np.newaxis]
            feature_clustering = np.array(list(nx.clustering(G, nodes=node_list).values()))[:, np.newaxis]
            time2 = time.time()
            # dict = nx.bfs_successors(G, start_idx)
            # print('dict', dict, 'node num', self.G.number_of_nodes())
            # print('x idx', x_idx, 'len', len(x_idx))
            # print('adj')
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_copy.shape[0]):
            #     print(adj_copy[print_i].astype(int))
            # adj_before = adj_copy.copy()
            # encode adj
            adj_encoded = encode_adj(adj_copy.copy(), max_prev_node=self.max_prev_node)
            # print('adj encoded')
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_copy.shape[0]):
            #     print(adj_copy[print_i].astype(int))
            # decode adj
            # print('adj recover error')
            # adj_decode = decode_adj(adj_encoded.copy(), max_prev_node=self.max_prev_node)
            # adj_err = adj_decode - adj_copy
            # print(np.sum(adj_err))
            # if np.sum(adj_err) != 0:
            #     print(adj_err)
            # np.set_printoptions(linewidth=200)
            # for print_i in range(adj_err.shape[0]):
            #     print(adj_err[print_i].astype(int))
            # get x and y and adj
            # for small graphs the rest are zero padded
            y_batch[i, 0:adj_encoded.shape[0], :] = adj_encoded
            x_batch[i, 1:adj_encoded.shape[0] + 1, :] = adj_encoded
            adj_batch[i, 0:adj_copy.shape[0], 0:adj_copy.shape[0]] = adj_copy
            adj_copy_norm = preprocess(adj_copy)
            time3 = time.time()
            adj_norm_batch[i, 0:adj_copy.shape[0], 0:adj_copy.shape[0]] = adj_copy_norm
            if self.has_feature is None:
                feature_batch[i, 0:adj_copy.shape[0], 0:adj_copy.shape[0]] = np.eye(adj_copy.shape[0])
            else:
                feature_batch[i, 0:adj_copy.shape[0], :] = np.concatenate((feature_degree, feature_clustering), axis=1)
            # np.set_printoptions(linewidth=200, precision=3)
            # print('y\n')
            # for print_i in range(self.y_batch[i, :, :].shape[0]):
            #     print(self.y_batch[i, :, :][print_i].astype(int))
            # print('x\n')
            # for print_i in range(self.x_batch[i, :, :].shape[0]):
            #     print(self.x_batch[i, :, :][print_i].astype(int))
            # print('adj\n')
            # for print_i in range(self.adj_batch[i, :, :].shape[0]):
            #     print(self.adj_batch[i, :, :][print_i].astype(int))
            # print('adj_norm\n')
            # for print_i in range(self.adj_norm_batch[i, :, :].shape[0]):
            #     print(self.adj_norm_batch[i, :, :][print_i].astype(float))
            # print('feature\n')
            # for print_i in range(self.feature_batch[i, :, :].shape[0]):
            #     print(self.feature_batch[i, :, :][print_i].astype(float))
            time4 = time.time()
            # print('1 ', time1 - time0)
            # print('2 ', time2 - time1)
            # print('3 ', time3 - time2)
            # print('4 ', time4 - time3)
        # print('x_batch\n', self.x_batch)
        # print('y_batch\n', self.y_batch)
        return torch.from_numpy(x_batch).float(), torch.from_numpy(y_batch).float(), \
            torch.from_numpy(adj_batch).float(), torch.from_numpy(adj_norm_batch).float(), \
            torch.from_numpy(feature_batch).float()

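# a minimal usage sketch (not part of the original file); note this sampler uses
# the networkx 1.x API (G.nodes()[i], dict-returning G.degree), so it is assumed
# to run under the same networkx version as the rest of this file
# sampler = Graph_sequence_sampler_bfs_permute_truncate_multigraph(
#     [nx.karate_club_graph()], max_node_num=34, batch_size=2, max_prev_node=30)
# x, y, adj, adj_norm, feature = sampler.sample()
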
# generate our own synthetic dataset
def Graph_synthetic(seed):
    G = nx.Graph()
    np.random.seed(seed)
    base = np.repeat(np.eye(5), 20, axis=0)
    rand = np.random.randn(100, 5) * 0.05
    node_features = base + rand
    # print('node features')
    # for i in range(node_features.shape[0]):
    #     print(np.around(node_features[i], decimals=4))
    node_distance_l1 = np.ones((node_features.shape[0], node_features.shape[0]))
    node_distance_np = np.zeros((node_features.shape[0], node_features.shape[0]))
    for i in range(node_features.shape[0]):
        for j in range(node_features.shape[0]):
            if i != j:
                node_distance_l1[i, j] = np.sum(np.abs(node_features[i] - node_features[j]))
                # print('node distance', node_distance_l1[i, j])
                node_distance_np[i, j] = 1 / np.sum(np.abs(node_features[i] - node_features[j]) ** 2)
    print('node distance max', np.max(node_distance_l1))
    print('node distance min', np.min(node_distance_l1))
    node_distance_np_sum = np.sum(node_distance_np, axis=1, keepdims=True)
    embedding_dist = node_distance_np / node_distance_np_sum
    # generate the graph
    average_degree = 9
    for i in range(node_features.shape[0]):
        for j in range(i + 1, embedding_dist.shape[0]):
            p = np.random.rand()
            if p < embedding_dist[i, j] * average_degree:
                G.add_edge(i, j)
    G.remove_nodes_from(list(nx.isolates(G)))
    print('num of nodes', G.number_of_nodes())
    print('num of edges', G.number_of_edges())
    G_deg = nx.degree_histogram(G)
    G_deg_sum = [a * b for a, b in zip(G_deg, range(0, len(G_deg)))]
    print('average degree', sum(G_deg_sum) / G.number_of_nodes())
    print('average path length', nx.average_shortest_path_length(G))
    print('diameter', nx.diameter(G))
    G_cluster = sorted(list(nx.clustering(G).values()))
    print('average clustering coefficient', sum(G_cluster) / len(G_cluster))
    print('Graph generation complete!')
    # node_features = np.concatenate((node_features, np.zeros((1, node_features.shape[1]))), axis=0)
    return G, node_features

# G = Graph_synthetic(10)

# return adj and features from a single graph
class GraphDataset_adj(torch.utils.data.Dataset):
    """Graph Dataset"""
    def __init__(self, G, features=None):
        self.G = G
        self.n = G.number_of_nodes()
        adj = np.asarray(nx.to_numpy_matrix(self.G))
        # permute adj
        subgraph_idx = np.random.permutation(self.n)
        # subgraph_idx = np.arange(self.n)
        adj = adj[np.ix_(subgraph_idx, subgraph_idx)]
        self.adj = torch.from_numpy(adj + np.eye(len(adj))).float()
        self.adj_norm = torch.from_numpy(preprocess(adj)).float()
        if features is None:
            self.features = torch.Tensor(self.n, self.n)
            self.features = nn.init.eye(self.features)
        else:
            features = features[subgraph_idx, :]
            self.features = torch.from_numpy(features).float()
        print('embedding size', self.features.size())

    def __len__(self):
        return 1

    def __getitem__(self, idx):
        sample = {'adj': self.adj, 'adj_norm': self.adj_norm, 'features': self.features}
        return sample

# G = nx.karate_club_graph()
# dataset = GraphDataset_adj(G)
# train_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True, num_workers=1)
# for data in train_loader:
#     print(data)

# return adj and features from a list of graphs
class GraphDataset_adj_batch(torch.utils.data.Dataset):
    """Graph Dataset"""
    def __init__(self, graphs, has_feature=True, num_nodes=20):
        self.graphs = graphs
        self.has_feature = has_feature
        self.num_nodes = num_nodes

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        adj_raw = np.asarray(nx.to_numpy_matrix(self.graphs[idx]))
        np.fill_diagonal(adj_raw, 0)  # in case self connections already exist
        # sample a num_nodes-sized subgraph
        subgraph_idx = np.random.permutation(adj_raw.shape[0])[0:self.num_nodes]
        adj_raw = adj_raw[np.ix_(subgraph_idx, subgraph_idx)]
        adj = torch.from_numpy(adj_raw + np.eye(len(adj_raw))).float()
        adj_norm = torch.from_numpy(preprocess(adj_raw)).float()
        adj_raw = torch.from_numpy(adj_raw).float()
        if self.has_feature:
            dictionary = nx.get_node_attributes(self.graphs[idx], 'feature')
            features = np.zeros((self.num_nodes, list(dictionary.values())[0].shape[0]))
            for i in range(self.num_nodes):
                features[i, :] = list(dictionary.values())[subgraph_idx[i]]
            # normalize
            features -= np.mean(features, axis=0)
            epsilon = 1e-6
            features /= (np.std(features, axis=0) + epsilon)
            features = torch.from_numpy(features).float()
        else:
            n = self.num_nodes
            features = torch.Tensor(n, n)
            features = nn.init.eye(features)
        sample = {'adj': adj, 'adj_norm': adj_norm, 'features': features, 'adj_raw': adj_raw}
        return sample

# return adj and features from a list of graphs, batch size = 1, so graphs can vary in size
class GraphDataset_adj_batch_1(torch.utils.data.Dataset):
    """Graph Dataset"""
    def __init__(self, graphs, has_feature=True):
        self.graphs = graphs
        self.has_feature = has_feature

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        adj_raw = np.asarray(nx.to_numpy_matrix(self.graphs[idx]))
        np.fill_diagonal(adj_raw, 0)  # in case self connections already exist
        n = adj_raw.shape[0]
        # apply a random permutation
        subgraph_idx = np.random.permutation(n)
        # subgraph_idx = np.arange(n)
        adj_raw = adj_raw[np.ix_(subgraph_idx, subgraph_idx)]
        adj = torch.from_numpy(adj_raw + np.eye(len(adj_raw))).float()
        adj_norm = torch.from_numpy(preprocess(adj_raw)).float()
        if self.has_feature:
            dictionary = nx.get_node_attributes(self.graphs[idx], 'feature')
            features = np.zeros((n, list(dictionary.values())[0].shape[0]))
            for i in range(n):
                features[i, :] = list(dictionary.values())[i]
            features = features[subgraph_idx, :]
            # normalize
            features -= np.mean(features, axis=0)
            epsilon = 1e-6
            features /= (np.std(features, axis=0) + epsilon)
            features = torch.from_numpy(features).float()
        else:
            features = torch.Tensor(n, n)
            features = nn.init.eye(features)
        sample = {'adj': adj, 'adj_norm': adj_norm, 'features': features}
        return sample

# get one node at a time, for a single graph
class GraphDataset(torch.utils.data.Dataset):
    """Graph Dataset"""
    def __init__(self, G, hops=1, max_degree=5, vocab_size=35, embedding_dim=35, embedding=None,
                 shuffle_neighbour=True):
        self.G = G
        self.shuffle_neighbour = shuffle_neighbour
        self.hops = hops
        self.max_degree = max_degree
        if embedding is None:
            self.embedding = torch.Tensor(vocab_size, embedding_dim)
            self.embedding = nn.init.eye(self.embedding)
        else:
            self.embedding = torch.from_numpy(embedding).float()
        print('embedding size', self.embedding.size())

    def __len__(self):
        return len(self.G.nodes())

    def __getitem__(self, idx):
        idx = idx + 1
        idx_list = [idx]
        node_list = [self.embedding[idx].view(-1, self.embedding.size(1))]
        node_count_list = []
        for i in range(self.hops):
            # sample this hop
            adj_list = np.array([])
            adj_count_list = np.array([])
            for idx in idx_list:
                if self.shuffle_neighbour:
                    adj_list_new = list(self.G.adj[idx - 1])
                    random.shuffle(adj_list_new)
                    adj_list_new = np.array(adj_list_new) + 1
                else:
                    adj_list_new = np.array(list(self.G.adj[idx - 1])) + 1
                adj_count_list_new = np.array([len(adj_list_new)])
                adj_list = np.concatenate((adj_list, adj_list_new), axis=0)
                adj_count_list = np.concatenate((adj_count_list, adj_count_list_new), axis=0)
            # print(i, adj_list)
            # print(i, embedding(Variable(torch.from_numpy(adj_list)).long()))
            index = torch.from_numpy(adj_list).long()
            adj_list_emb = self.embedding[index]
            node_list.append(adj_list_emb)
            node_count_list.append(adj_count_list)
            idx_list = adj_list
        # padding, used as target
        idx_list = [idx]
        node_list_pad = [self.embedding[idx].view(-1, self.embedding.size(1))]
        node_count_list_pad = []
        node_adj_list = []
        for i in range(self.hops):
            adj_list = np.zeros(self.max_degree ** (i + 1))
            adj_count_list = np.ones(self.max_degree ** i) * self.max_degree
            for j, idx in enumerate(idx_list):
                if idx == 0:
                    adj_list_new = np.zeros(self.max_degree)
                else:
                    if self.shuffle_neighbour:
                        adj_list_new = list(self.G.adj[idx - 1])
                        # random.shuffle(adj_list_new)
                        adj_list_new = np.array(adj_list_new) + 1
                    else:
                        adj_list_new = np.array(list(self.G.adj[idx - 1])) + 1
                start_idx = j * self.max_degree
                incre_idx = min(self.max_degree, adj_list_new.shape[0])
                adj_list[start_idx:start_idx + incre_idx] = adj_list_new[:incre_idx]
            index = torch.from_numpy(adj_list).long()
            adj_list_emb = self.embedding[index]
            node_list_pad.append(adj_list_emb)
            node_count_list_pad.append(adj_count_list)
            idx_list = adj_list
            # calc adj matrix
            node_adj = torch.zeros(index.size(0), index.size(0))
            for first in range(index.size(0)):
                for second in range(first, index.size(0)):
                    if index[first] == index[second]:
                        node_adj[first, second] = 1
                        node_adj[second, first] = 1
                    elif self.G.has_edge(index[first], index[second]):
                        node_adj[first, second] = 0.5
                        node_adj[second, first] = 0.5
            node_adj_list.append(node_adj)
        node_list = list(reversed(node_list))
        node_count_list = list(reversed(node_count_list))
        node_list_pad = list(reversed(node_list_pad))
        node_count_list_pad = list(reversed(node_count_list_pad))
        node_adj_list = list(reversed(node_adj_list))
        sample = {'node_list': node_list, 'node_count_list': node_count_list,
                  'node_list_pad': node_list_pad, 'node_count_list_pad': node_count_list_pad,
                  'node_adj_list': node_adj_list}
        return sample
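
# a minimal usage sketch (not part of the original file); vocab_size must exceed
# the node count, since node ids are shifted by +1 before the embedding lookup
# dataset = GraphDataset(nx.karate_club_graph(), hops=1, max_degree=5, vocab_size=36, embedding_dim=36)
# sample = dataset[0]
# print(len(sample['node_list']), sample['node_adj_list'][0].shape)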