You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

utils.py 19KB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  1. import networkx as nx
  2. import numpy as np
  3. import torch
  4. import torch.nn as nn
  5. import torch.nn.init as init
  6. from torch.autograd import Variable
  7. import matplotlib.pyplot as plt
  8. import torch.nn.functional as F
  9. from torch import optim
  10. from torch.optim.lr_scheduler import MultiStepLR
  11. # import node2vec.src.main as nv
  12. from sklearn.decomposition import PCA
  13. import community
  14. import pickle
  15. import re
  16. import data
  17. def citeseer_ego():
  18. _, _, G = data.Graph_load(dataset='citeseer')
  19. G = max(nx.connected_component_subgraphs(G), key=len)
  20. G = nx.convert_node_labels_to_integers(G)
  21. graphs = []
  22. for i in range(G.number_of_nodes()):
  23. G_ego = nx.ego_graph(G, i, radius=3)
  24. if G_ego.number_of_nodes() >= 50 and (G_ego.number_of_nodes() <= 400):
  25. graphs.append(G_ego)
  26. return graphs
  27. def caveman_special(c=2, k=20, p_path=0.1, p_edge=0.3):
  28. p = p_path
  29. path_count = max(int(np.ceil(p * k)), 1)
  30. G = nx.caveman_graph(c, k)
  31. # remove 50% edges
  32. p = 1 - p_edge
  33. for (u, v) in list(G.edges()):
  34. if np.random.rand() < p and ((u < k and v < k) or (u >= k and v >= k)):
  35. G.remove_edge(u, v)
  36. # add path_count links
  37. for i in range(path_count):
  38. u = np.random.randint(0, k)
  39. v = np.random.randint(k, k * 2)
  40. G.add_edge(u, v)
  41. G = max(nx.connected_component_subgraphs(G), key=len)
  42. return G
  43. def n_community(c_sizes, p_inter=0.01):
  44. graphs = [nx.gnp_random_graph(c_sizes[i], 0.7, seed=i) for i in range(len(c_sizes))]
  45. G = nx.disjoint_union_all(graphs)
  46. communities = list(nx.connected_component_subgraphs(G))
  47. for i in range(len(communities)):
  48. subG1 = communities[i]
  49. nodes1 = list(subG1.nodes())
  50. for j in range(i + 1, len(communities)):
  51. subG2 = communities[j]
  52. nodes2 = list(subG2.nodes())
  53. has_inter_edge = False
  54. for n1 in nodes1:
  55. for n2 in nodes2:
  56. if np.random.rand() < p_inter:
  57. G.add_edge(n1, n2)
  58. has_inter_edge = True
  59. if not has_inter_edge:
  60. G.add_edge(nodes1[0], nodes2[0])
  61. # print('connected comp: ', len(list(nx.connected_component_subgraphs(G))))
  62. return G
  63. def perturb(graph_list, p_del, p_add=None):
  64. ''' Perturb the list of graphs by adding/removing edges.
  65. Args:
  66. p_add: probability of adding edges. If None, estimate it according to graph density,
  67. such that the expected number of added edges is equal to that of deleted edges.
  68. p_del: probability of removing edges
  69. Returns:
  70. A list of graphs that are perturbed from the original graphs
  71. '''
  72. perturbed_graph_list = []
  73. for G_original in graph_list:
  74. G = G_original.copy()
  75. trials = np.random.binomial(1, p_del, size=G.number_of_edges())
  76. edges = list(G.edges())
  77. i = 0
  78. for (u, v) in edges:
  79. if trials[i] == 1:
  80. G.remove_edge(u, v)
  81. i += 1
  82. if p_add is None:
  83. num_nodes = G.number_of_nodes()
  84. p_add_est = np.sum(trials) / (num_nodes * (num_nodes - 1) / 2 -
  85. G.number_of_edges())
  86. else:
  87. p_add_est = p_add
  88. nodes = list(G.nodes())
  89. tmp = 0
  90. for i in range(len(nodes)):
  91. u = nodes[i]
  92. trials = np.random.binomial(1, p_add_est, size=G.number_of_nodes())
  93. j = 0
  94. for j in range(i + 1, len(nodes)):
  95. v = nodes[j]
  96. if trials[j] == 1:
  97. tmp += 1
  98. G.add_edge(u, v)
  99. j += 1
  100. perturbed_graph_list.append(G)
  101. return perturbed_graph_list
  102. def perturb_new(graph_list, p):
  103. ''' Perturb the list of graphs by adding/removing edges.
  104. Args:
  105. p_add: probability of adding edges. If None, estimate it according to graph density,
  106. such that the expected number of added edges is equal to that of deleted edges.
  107. p_del: probability of removing edges
  108. Returns:
  109. A list of graphs that are perturbed from the original graphs
  110. '''
  111. perturbed_graph_list = []
  112. for G_original in graph_list:
  113. G = G_original.copy()
  114. edge_remove_count = 0
  115. for (u, v) in list(G.edges()):
  116. if np.random.rand() < p:
  117. G.remove_edge(u, v)
  118. edge_remove_count += 1
  119. # randomly add the edges back
  120. for i in range(edge_remove_count):
  121. while True:
  122. u = np.random.randint(0, G.number_of_nodes())
  123. v = np.random.randint(0, G.number_of_nodes())
  124. if (not G.has_edge(u, v)) and (u != v):
  125. break
  126. G.add_edge(u, v)
  127. perturbed_graph_list.append(G)
  128. return perturbed_graph_list
  129. def imsave(fname, arr, vmin=None, vmax=None, cmap=None, format=None, origin=None):
  130. from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
  131. from matplotlib.figure import Figure
  132. fig = Figure(figsize=arr.shape[::-1], dpi=1, frameon=False)
  133. canvas = FigureCanvas(fig)
  134. fig.figimage(arr, cmap=cmap, vmin=vmin, vmax=vmax, origin=origin)
  135. fig.savefig(fname, dpi=1, format=format)
  136. def save_prediction_histogram(y_pred_data, fname_pred, max_num_node, bin_n=20):
  137. bin_edge = np.linspace(1e-6, 1, bin_n + 1)
  138. output_pred = np.zeros((bin_n, max_num_node))
  139. for i in range(max_num_node):
  140. output_pred[:, i], _ = np.histogram(y_pred_data[:, i, :], bins=bin_edge, density=False)
  141. # normalize
  142. output_pred[:, i] /= np.sum(output_pred[:, i])
  143. imsave(fname=fname_pred, arr=output_pred, origin='upper', cmap='Greys_r', vmin=0.0, vmax=3.0 / bin_n)
  144. # draw a single graph G
  145. def draw_graph(G, prefix='test'):
  146. parts = community.best_partition(G)
  147. values = [parts.get(node) for node in G.nodes()]
  148. colors = []
  149. for i in range(len(values)):
  150. if values[i] == 0:
  151. colors.append('red')
  152. if values[i] == 1:
  153. colors.append('green')
  154. if values[i] == 2:
  155. colors.append('blue')
  156. if values[i] == 3:
  157. colors.append('yellow')
  158. if values[i] == 4:
  159. colors.append('orange')
  160. if values[i] == 5:
  161. colors.append('pink')
  162. if values[i] == 6:
  163. colors.append('black')
  164. # spring_pos = nx.spring_layout(G)
  165. plt.switch_backend('agg')
  166. plt.axis("off")
  167. pos = nx.spring_layout(G)
  168. nx.draw_networkx(G, with_labels=True, node_size=35, node_color=colors, pos=pos)
  169. # plt.switch_backend('agg')
  170. # options = {
  171. # 'node_color': 'black',
  172. # 'node_size': 10,
  173. # 'width': 1
  174. # }
  175. # plt.figure()
  176. # plt.subplot()
  177. # nx.draw_networkx(G, **options)
  178. plt.savefig('figures/graph_view_' + prefix + '.png', dpi=200)
  179. plt.close()
  180. plt.switch_backend('agg')
  181. G_deg = nx.degree_histogram(G)
  182. G_deg = np.array(G_deg)
  183. # plt.plot(range(len(G_deg)), G_deg, 'r', linewidth = 2)
  184. plt.loglog(np.arange(len(G_deg))[G_deg > 0], G_deg[G_deg > 0], 'r', linewidth=2)
  185. plt.savefig('figures/degree_view_' + prefix + '.png', dpi=200)
  186. plt.close()
  187. # degree_sequence = sorted(nx.degree(G).values(), reverse=True) # degree sequence
  188. # plt.loglog(degree_sequence, 'b-', marker='o')
  189. # plt.title("Degree rank plot")
  190. # plt.ylabel("degree")
  191. # plt.xlabel("rank")
  192. # plt.savefig('figures/degree_view_' + prefix + '.png', dpi=200)
  193. # plt.close()
  194. # G = nx.grid_2d_graph(8,8)
  195. # G = nx.karate_club_graph()
  196. # draw_graph(G)
  197. # draw a list of graphs [G]
  198. def draw_graph_list(G_list, row, col, fname='figures/test', layout='spring', is_single=False, k=1, node_size=55,
  199. alpha=1, width=1.3):
  200. # # draw graph view
  201. # from pylab import rcParams
  202. # rcParams['figure.figsize'] = 12,3
  203. plt.switch_backend('agg')
  204. for i, G in enumerate(G_list):
  205. plt.subplot(row, col, i + 1)
  206. plt.subplots_adjust(left=0, bottom=0, right=1, top=1,
  207. wspace=0, hspace=0)
  208. # if i%2==0:
  209. # plt.title('real nodes: '+str(G.number_of_nodes()), fontsize = 4)
  210. # else:
  211. # plt.title('pred nodes: '+str(G.number_of_nodes()), fontsize = 4)
  212. # plt.title('num of nodes: '+str(G.number_of_nodes()), fontsize = 4)
  213. # parts = community.best_partition(G)
  214. # values = [parts.get(node) for node in G.nodes()]
  215. # colors = []
  216. # for i in range(len(values)):
  217. # if values[i] == 0:
  218. # colors.append('red')
  219. # if values[i] == 1:
  220. # colors.append('green')
  221. # if values[i] == 2:
  222. # colors.append('blue')
  223. # if values[i] == 3:
  224. # colors.append('yellow')
  225. # if values[i] == 4:
  226. # colors.append('orange')
  227. # if values[i] == 5:
  228. # colors.append('pink')
  229. # if values[i] == 6:
  230. # colors.append('black')
  231. plt.axis("off")
  232. if layout == 'spring':
  233. pos = nx.spring_layout(G, k=k / np.sqrt(G.number_of_nodes()), iterations=100)
  234. # pos = nx.spring_layout(G)
  235. elif layout == 'spectral':
  236. pos = nx.spectral_layout(G)
  237. # # nx.draw_networkx(G, with_labels=True, node_size=2, width=0.15, font_size = 1.5, node_color=colors,pos=pos)
  238. # nx.draw_networkx(G, with_labels=False, node_size=1.5, width=0.2, font_size = 1.5, linewidths=0.2, node_color = 'k',pos=pos,alpha=0.2)
  239. if is_single:
  240. # node_size default 60, edge_width default 1.5
  241. nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color='#336699', alpha=1, linewidths=0,
  242. font_size=0)
  243. nx.draw_networkx_edges(G, pos, alpha=alpha, width=width)
  244. else:
  245. nx.draw_networkx_nodes(G, pos, node_size=1.5, node_color='#336699', alpha=1, linewidths=0.2, font_size=1.5)
  246. nx.draw_networkx_edges(G, pos, alpha=0.3, width=0.2)
  247. # plt.axis('off')
  248. # plt.title('Complete Graph of Odd-degree Nodes')
  249. # plt.show()
  250. plt.tight_layout()
  251. plt.savefig(fname + '.png', dpi=600)
  252. plt.close()
  253. # # draw degree distribution
  254. # plt.switch_backend('agg')
  255. # for i, G in enumerate(G_list):
  256. # plt.subplot(row, col, i + 1)
  257. # G_deg = np.array(list(G.degree(G.nodes()).values()))
  258. # bins = np.arange(20)
  259. # plt.hist(np.array(G_deg), bins=bins, align='left')
  260. # plt.xlabel('degree', fontsize = 3)
  261. # plt.ylabel('count', fontsize = 3)
  262. # G_deg_mean = 2*G.number_of_edges()/float(G.number_of_nodes())
  263. # # if i % 2 == 0:
  264. # # plt.title('real average degree: {:.2f}'.format(G_deg_mean), fontsize=4)
  265. # # else:
  266. # # plt.title('pred average degree: {:.2f}'.format(G_deg_mean), fontsize=4)
  267. # plt.title('average degree: {:.2f}'.format(G_deg_mean), fontsize=4)
  268. # plt.tick_params(axis='both', which='major', labelsize=3)
  269. # plt.tick_params(axis='both', which='minor', labelsize=3)
  270. # plt.tight_layout()
  271. # plt.savefig(fname+'_degree.png', dpi=600)
  272. # plt.close()
  273. #
  274. # # draw clustering distribution
  275. # plt.switch_backend('agg')
  276. # for i, G in enumerate(G_list):
  277. # plt.subplot(row, col, i + 1)
  278. # G_cluster = list(nx.clustering(G).values())
  279. # bins = np.linspace(0,1,20)
  280. # plt.hist(np.array(G_cluster), bins=bins, align='left')
  281. # plt.xlabel('clustering coefficient', fontsize=3)
  282. # plt.ylabel('count', fontsize=3)
  283. # G_cluster_mean = sum(G_cluster) / len(G_cluster)
  284. # # if i % 2 == 0:
  285. # # plt.title('real average clustering: {:.4f}'.format(G_cluster_mean), fontsize=4)
  286. # # else:
  287. # # plt.title('pred average clustering: {:.4f}'.format(G_cluster_mean), fontsize=4)
  288. # plt.title('average clustering: {:.4f}'.format(G_cluster_mean), fontsize=4)
  289. # plt.tick_params(axis='both', which='major', labelsize=3)
  290. # plt.tick_params(axis='both', which='minor', labelsize=3)
  291. # plt.tight_layout()
  292. # plt.savefig(fname+'_clustering.png', dpi=600)
  293. # plt.close()
  294. #
  295. # # draw circle distribution
  296. # plt.switch_backend('agg')
  297. # for i, G in enumerate(G_list):
  298. # plt.subplot(row, col, i + 1)
  299. # cycle_len = []
  300. # cycle_all = nx.cycle_basis(G)
  301. # for item in cycle_all:
  302. # cycle_len.append(len(item))
  303. #
  304. # bins = np.arange(20)
  305. # plt.hist(np.array(cycle_len), bins=bins, align='left')
  306. # plt.xlabel('cycle length', fontsize=3)
  307. # plt.ylabel('count', fontsize=3)
  308. # G_cycle_mean = 0
  309. # if len(cycle_len)>0:
  310. # G_cycle_mean = sum(cycle_len) / len(cycle_len)
  311. # # if i % 2 == 0:
  312. # # plt.title('real average cycle: {:.4f}'.format(G_cycle_mean), fontsize=4)
  313. # # else:
  314. # # plt.title('pred average cycle: {:.4f}'.format(G_cycle_mean), fontsize=4)
  315. # plt.title('average cycle: {:.4f}'.format(G_cycle_mean), fontsize=4)
  316. # plt.tick_params(axis='both', which='major', labelsize=3)
  317. # plt.tick_params(axis='both', which='minor', labelsize=3)
  318. # plt.tight_layout()
  319. # plt.savefig(fname+'_cycle.png', dpi=600)
  320. # plt.close()
  321. #
  322. # # draw community distribution
  323. # plt.switch_backend('agg')
  324. # for i, G in enumerate(G_list):
  325. # plt.subplot(row, col, i + 1)
  326. # parts = community.best_partition(G)
  327. # values = np.array([parts.get(node) for node in G.nodes()])
  328. # counts = np.sort(np.bincount(values)[::-1])
  329. # pos = np.arange(len(counts))
  330. # plt.bar(pos,counts,align = 'edge')
  331. # plt.xlabel('community ID', fontsize=3)
  332. # plt.ylabel('count', fontsize=3)
  333. # G_community_count = len(counts)
  334. # # if i % 2 == 0:
  335. # # plt.title('real average clustering: {}'.format(G_community_count), fontsize=4)
  336. # # else:
  337. # # plt.title('pred average clustering: {}'.format(G_community_count), fontsize=4)
  338. # plt.title('average clustering: {}'.format(G_community_count), fontsize=4)
  339. # plt.tick_params(axis='both', which='major', labelsize=3)
  340. # plt.tick_params(axis='both', which='minor', labelsize=3)
  341. # plt.tight_layout()
  342. # plt.savefig(fname+'_community.png', dpi=600)
  343. # plt.close()
  344. # plt.switch_backend('agg')
  345. # G_deg = nx.degree_histogram(G)
  346. # G_deg = np.array(G_deg)
  347. # # plt.plot(range(len(G_deg)), G_deg, 'r', linewidth = 2)
  348. # plt.loglog(np.arange(len(G_deg))[G_deg>0], G_deg[G_deg>0], 'r', linewidth=2)
  349. # plt.savefig('figures/degree_view_' + prefix + '.png', dpi=200)
  350. # plt.close()
  351. # degree_sequence = sorted(nx.degree(G).values(), reverse=True) # degree sequence
  352. # plt.loglog(degree_sequence, 'b-', marker='o')
  353. # plt.title("Degree rank plot")
  354. # plt.ylabel("degree")
  355. # plt.xlabel("rank")
  356. # plt.savefig('figures/degree_view_' + prefix + '.png', dpi=200)
  357. # plt.close()
  358. # directly get graph statistics from adj, obsoleted
  359. def decode_graph(adj, prefix):
  360. adj = np.asmatrix(adj)
  361. G = nx.from_numpy_matrix(adj)
  362. # G.remove_nodes_from(nx.isolates(G))
  363. print('num of nodes: {}'.format(G.number_of_nodes()))
  364. print('num of edges: {}'.format(G.number_of_edges()))
  365. G_deg = nx.degree_histogram(G)
  366. G_deg_sum = [a * b for a, b in zip(G_deg, range(0, len(G_deg)))]
  367. print('average degree: {}'.format(sum(G_deg_sum) / G.number_of_nodes()))
  368. if nx.is_connected(G):
  369. print('average path length: {}'.format(nx.average_shortest_path_length(G)))
  370. print('average diameter: {}'.format(nx.diameter(G)))
  371. G_cluster = sorted(list(nx.clustering(G).values()))
  372. print('average clustering coefficient: {}'.format(sum(G_cluster) / len(G_cluster)))
  373. cycle_len = []
  374. cycle_all = nx.cycle_basis(G, 0)
  375. for item in cycle_all:
  376. cycle_len.append(len(item))
  377. print('cycles', cycle_len)
  378. print('cycle count', len(cycle_len))
  379. draw_graph(G, prefix=prefix)
  380. def get_graph(adj):
  381. '''
  382. get a graph from zero-padded adj
  383. :param adj:
  384. :return:
  385. '''
  386. # remove all zeros rows and columns
  387. adj = adj[~np.all(adj == 0, axis=1)]
  388. adj = adj[:, ~np.all(adj == 0, axis=0)]
  389. adj = np.asmatrix(adj)
  390. G = nx.from_numpy_matrix(adj)
  391. return G
  392. # save a list of graphs
  393. def save_graph_list(G_list, fname):
  394. with open(fname, "wb") as f:
  395. pickle.dump(G_list, f)
  396. # pick the first connected component
  397. def pick_connected_component(G):
  398. node_list = nx.node_connected_component(G, 0)
  399. return G.subgraph(node_list)
  400. def pick_connected_component_new(G):
  401. adj_list = G.adjacency_list()
  402. for id, adj in enumerate(adj_list):
  403. id_min = min(adj)
  404. if id < id_min and id >= 1:
  405. # if id<id_min and id>=4:
  406. break
  407. node_list = list(range(id)) # only include node prior than node "id"
  408. G = G.subgraph(node_list)
  409. G = max(nx.connected_component_subgraphs(G), key=len)
  410. return G
  411. # load a list of graphs
  412. def load_graph_list(fname, is_real=True):
  413. with open(fname, "rb") as f:
  414. graph_list = pickle.load(f)
  415. for i in range(len(graph_list)):
  416. edges_with_selfloops = graph_list[i].selfloop_edges()
  417. if len(edges_with_selfloops) > 0:
  418. graph_list[i].remove_edges_from(edges_with_selfloops)
  419. if is_real:
  420. graph_list[i] = max(nx.connected_component_subgraphs(graph_list[i]), key=len)
  421. graph_list[i] = nx.convert_node_labels_to_integers(graph_list[i])
  422. else:
  423. graph_list[i] = pick_connected_component_new(graph_list[i])
  424. return graph_list
  425. def export_graphs_to_txt(g_list, output_filename_prefix):
  426. i = 0
  427. for G in g_list:
  428. f = open(output_filename_prefix + '_' + str(i) + '.txt', 'w+')
  429. for (u, v) in G.edges():
  430. idx_u = G.nodes().index(u)
  431. idx_v = G.nodes().index(v)
  432. f.write(str(idx_u) + '\t' + str(idx_v) + '\n')
  433. i += 1
  434. def snap_txt_output_to_nx(in_fname):
  435. G = nx.Graph()
  436. with open(in_fname, 'r') as f:
  437. for line in f:
  438. if not line[0] == '#':
  439. splitted = re.split('[ \t]', line)
  440. # self loop might be generated, but should be removed
  441. u = int(splitted[0])
  442. v = int(splitted[1])
  443. if not u == v:
  444. G.add_edge(int(u), int(v))
  445. return G
  446. def test_perturbed():
  447. graphs = []
  448. for i in range(100, 101):
  449. for j in range(4, 5):
  450. for k in range(500):
  451. graphs.append(nx.barabasi_albert_graph(i, j))
  452. g_perturbed = perturb(graphs, 0.9)
  453. print([g.number_of_edges() for g in graphs])
  454. print([g.number_of_edges() for g in g_perturbed])
  455. if __name__ == '__main__':
  456. # test_perturbed()
  457. graphs = load_graph_list('graphs/' + 'GraphRNN_RNN_grid_4_128_train_0.dat')
  458. # graphs = load_graph_list('graphs/' + 'GraphRNN_RNN_community4_4_128_pred_2500_1.dat')
  459. # graphs = load_graph_list('eval_results/mmsb/' + 'community41.dat')
  460. for i in range(0, 160, 16):
  461. draw_graph_list(graphs[i:i + 16], 4, 4, fname='figures/community4_' + str(i))