123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154 |
- """Stochastic block model."""
-
- import argparse
- import os
- from time import time
-
- import edward as ed
- import networkx as nx
- import numpy as np
- import tensorflow as tf
-
- from edward.models import Bernoulli, Multinomial, Beta, Dirichlet, PointMass, Normal
- from observations import karate
- from sklearn.metrics.cluster import adjusted_rand_score
-
- import utils
-
# GPU index exported to CUDA_VISIBLE_DEVICES in the __main__ block below.
CUDA = 2
# Seed Edward's RNG from the wall clock so every run differs;
# uncomment the fixed seed below for reproducible runs.
ed.set_seed(int(time()))
#ed.set_seed(42)

# DATA
# Alternative dataset: Zachary's karate club (adjacency + ground-truth labels).
#X_data, Z_true = karate("data")
-
def disjoint_cliques_test_graph(num_cliques, clique_size):
    """Build the adjacency matrix of `num_cliques` disjoint cliques.

    Each clique is a complete graph on `clique_size` nodes; the cliques
    share no edges, giving a block-diagonal adjacency matrix.
    """
    cliques = []
    for _ in range(num_cliques):
        cliques.append(nx.complete_graph(clique_size))
    combined = nx.disjoint_union_all(cliques)
    return nx.to_numpy_matrix(combined)
-
def mmsb(N, K, data):
    """Fit a mixed-membership stochastic block model to one graph via MAP.

    Args:
        N: number of vertices in the graph.
        K: number of latent blocks.
        data: N x N binary adjacency matrix, bound to the observed X.

    Returns:
        Tuple (Z_mean, Pi_hat): the N x K point-mass estimate of block
        memberships and the K x K block-connectivity estimate, both
        evaluated to numpy arrays in the current session.
    """
    # sparsity: global thinning factor applied to all edge probabilities
    rho = 0.3
    # MODEL
    # probability of belonging to each of K blocks for each node
    gamma = Dirichlet(concentration=tf.ones([K]))
    # block connectivity
    Pi = Beta(concentration0=tf.ones([K, K]), concentration1=tf.ones([K, K]))
    # probability of belonging to each of K blocks for all nodes
    # (total_count=1.0 makes each of the N rows a one-hot block draw)
    Z = Multinomial(total_count=1.0, probs=gamma, sample_shape=N)
    # adjacency: edge probability is the block-pair connectivity Z Pi Z^T,
    # scaled down by (1 - rho) to encourage sparsity
    X = Bernoulli(probs=(1 - rho) * tf.matmul(Z, tf.matmul(Pi, tf.transpose(Z))))

    # INFERENCE (EM algorithm)
    # Point-mass variational families make this MAP estimation; softmax /
    # sigmoid map the unconstrained variables onto the simplex / (0, 1).
    qgamma = PointMass(params=tf.nn.softmax(tf.Variable(tf.random_normal([K]))))
    qPi = PointMass(params=tf.nn.sigmoid(tf.Variable(tf.random_normal([K, K]))))
    qZ = PointMass(params=tf.nn.softmax(tf.Variable(tf.random_normal([N, K]))))

    # Alternative: full variational (KLqp) families, kept for reference.
    #qgamma = Normal(loc=tf.get_variable("qgamma/loc", [K]),
    #                scale=tf.nn.softplus(
    #                    tf.get_variable("qgamma/scale", [K])))
    #qPi = Normal(loc=tf.get_variable("qPi/loc", [K, K]),
    #             scale=tf.nn.softplus(
    #                 tf.get_variable("qPi/scale", [K, K])))
    #qZ = Normal(loc=tf.get_variable("qZ/loc", [N, K]),
    #            scale=tf.nn.softplus(
    #                tf.get_variable("qZ/scale", [N, K])))

    #inference = ed.KLqp({gamma: qgamma, Pi: qPi, Z: qZ}, data={X: data})
    inference = ed.MAP({gamma: qgamma, Pi: qPi, Z: qZ}, data={X: data})

    # Manual update loop instead of inference.run() so progress can be printed.
    #inference.run()
    n_iter = 6000
    inference.initialize(optimizer=tf.train.AdamOptimizer(learning_rate=0.01), n_iter=n_iter)

    # NOTE(review): .run()/.eval() below rely on a default TF session being
    # active (Edward installs one via ed.get_session) — confirm for this setup.
    tf.global_variables_initializer().run()

    for _ in range(inference.n_iter):
        info_dict = inference.update()
        inference.print_progress(info_dict)

    inference.finalize()
    print('qgamma after: ', qgamma.mean().eval())
    return qZ.mean().eval(), qPi.eval()
-
def arg_parse():
    """Parse command-line options for the MMSB experiment.

    Defaults: dataset='community', K=4 blocks, 1 sample per graph.
    """
    parser = argparse.ArgumentParser(description='MMSB arguments.')
    parser.add_argument('--dataset', dest='dataset', help='Input dataset.')
    parser.add_argument('--K', dest='K', type=int, help='Number of blocks.')
    parser.add_argument(
        '--samples-per-G', dest='samples', type=int,
        help='Number of samples for every graph.')
    parser.set_defaults(dataset='community', K=4, samples=1)
    return parser.parse_args()
-
def graph_gen_from_blockmodel(B, Z):
    """Sample a random graph from a fitted block model.

    Args:
        B: K x K block-connectivity matrix.
        Z: N x K (soft) block-membership matrix.

    Returns:
        A networkx graph whose adjacency entries are independent
        Bernoulli draws with probability 0.3 * (Z B Z^T).
    """
    # Removed unused local `n_blocks` (was len(B), never read).
    B = np.array(B)
    Z = np.array(Z)
    # Edge probabilities implied by the block model.
    adj_prob = np.dot(Z, np.dot(B, np.transpose(Z)))
    # NOTE(review): the 0.3 thinning factor here differs from mmsb(),
    # which scales by (1 - rho) = 0.7 — confirm which is intended.
    # Entries are drawn independently, so the sampled adjacency matrix
    # is not forced to be symmetric.
    adj = np.random.binomial(1, adj_prob * 0.3)
    return nx.from_numpy_matrix(adj)
-
if __name__ == '__main__':
    prog_args = arg_parse()
    # Restrict TensorFlow to the configured GPU before any session exists.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(CUDA)
    print('CUDA', CUDA)

    # Build the list of adjacency matrices to fit, one per input graph.
    X_dataset = []
    #X_data = nx.to_numpy_matrix(nx.connected_caveman_graph(4, 7))
    if prog_args.dataset == 'clique_test':
        X_data = disjoint_cliques_test_graph(4, 7)
        X_dataset.append(X_data)
    elif prog_args.dataset == 'citeseer':
        graphs = utils.citeseer_ego()
        X_dataset = [nx.to_numpy_matrix(g) for g in graphs]
    elif prog_args.dataset == 'community':
        graphs = []
        for i in range(2, 3):
            for j in range(30, 81):
                for k in range(10):
                    graphs.append(utils.caveman_special(i, j, p_edge=0.3))
        X_dataset = [nx.to_numpy_matrix(g) for g in graphs]
    elif prog_args.dataset == 'grid':
        graphs = []
        for i in range(10, 20):
            for j in range(10, 20):
                graphs.append(nx.grid_2d_graph(i, j))
        X_dataset = [nx.to_numpy_matrix(g) for g in graphs]
    elif prog_args.dataset.startswith('communi'):
        # 'communityN': N random communities (bare 'community' handled above).
        graphs = []
        # BUGFIX: parse the whole numeric suffix instead of only the last
        # character, so e.g. 'community12' works, not just 'community1'-'community9'.
        num_communities = int(prog_args.dataset[len('community'):])
        print('Creating dataset with ', num_communities, ' communities')
        c_sizes = np.random.choice([12, 13, 14, 15, 16, 17], num_communities)
        for k in range(3000):
            graphs.append(utils.n_community(c_sizes, p_inter=0.01))
        X_dataset = [nx.to_numpy_matrix(g) for g in graphs]

    print('Number of graphs: ', len(X_dataset))
    K = prog_args.K  # number of clusters
    gen_graphs = []
    for i, X_data in enumerate(X_dataset):
        if i % 5 == 0:
            print(i)  # lightweight progress indicator
        N = X_data.shape[0]  # number of vertices

        # Fit the block model, then sample new graphs from the fit.
        Zp, B = mmsb(N, K, X_data)
        #print("Block: ", B)
        Z_pred = Zp.argmax(axis=1)
        print("Result (label flip can happen):")
        #print("prob: ", Zp)
        print("Predicted")
        print(Z_pred)
        #print(Z_true)
        #print("Adjusted Rand Index =", adjusted_rand_score(Z_pred, Z_true))
        for j in range(prog_args.samples):
            gen_graphs.append(graph_gen_from_blockmodel(B, Zp))

    # NOTE(review): hard-coded, host-specific output directory — confirm it
    # exists (save will fail otherwise).
    save_path = '/lfs/local/0/rexy/graph-generation/eval_results/mmsb/'
    utils.save_graph_list(gen_graphs, os.path.join(save_path, prog_args.dataset + '.dat'))
|