You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

MissingNodeIdentification.m 11KB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. function [ graph_predicted, clustering ] = MissingNodeIdentification( graph_visible, num_placeholders, num_missing_nodes, affinity_calculation_type, reduce_dimensions )
  2. %MissingNodeIdentification
  3. % Inputs:
  4. % graph_visible - The visible graph adjacency matrix. The known nodes
  5. % should be first, followed by the placeholders
  6. % num_placeholders - The number of placeholders
  7. % num_missing_nodes - The number of missing nodes. If it is unkown,
  8. % enter 0 and the algorithm will try to estimate it.
  9. % affinity_calculation_type - which affinity to use (default is Adamic/Adar)
  10. % reduce_dimensions - If the algorithm should use dimension reduction
  11. % (much faster but slightly worse performance - default is 1)
  12. %
  13. % Outputs:
  14. % graph_predicted - the predicted graph
  15. % clustering - a vector containing the cluster index of each
  16. % placeholder
  17. global affinity_calculation_shortest_path;
  18. global affinity_calculation_euclid;
  19. global affinity_calculation_common_friends;
  20. global affinity_calculation_random_clustering;
  21. global affinity_calculation_adamic_adar;
  22. global affinity_calculation_katz_beta_0_5;
  23. global affinity_calculation_katz_beta_0_05;
  24. global affinity_calculation_katz_beta_0_005;
  25. affinity_calculation_shortest_path = 0;
  26. affinity_calculation_euclid = 1;
  27. affinity_calculation_common_friends = 2;
  28. affinity_calculation_random_clustering = 3;
  29. affinity_calculation_adamic_adar = 4;
  30. affinity_calculation_katz_beta_0_5 = 5;
  31. affinity_calculation_katz_beta_0_05 = 6;
  32. affinity_calculation_katz_beta_0_005 = 7;
  33. if nargin < 5
  34. reduce_dimensions = 1;
  35. end
  36. if nargin < 4
  37. affinity_calculation_type = affinity_calculation_adamic_adar;
  38. end
  39. graph_visible = sparse(graph_visible);
  40. %size of the visible graph
  41. visible_size = size(graph_visible,1);
  42. %size of the actual (original) graph
  43. actual_graph_size = visible_size - num_placeholders + num_missing_nodes;
  44. %%index of the first unknown node
  45. first_unk_node = visible_size - num_placeholders + 1;
  46. if num_missing_nodes <= 0
  47. num_clusters_known = 0;
  48. else
  49. num_clusters_known = 1;
  50. end
  51. %calculate the affinity matrix
  52. affinity = CalcAffinity( graph_visible, affinity_calculation_type, actual_graph_size, num_missing_nodes);
  53. %reduce dimension of the affinity matrix (only include nodes which affect
  54. %the placeholders clustering)
  55. if reduce_dimensions > 0
  56. [affinity, ~, first_unk_node] = ReduceDimensions(affinity, first_unk_node);
  57. end
  58. %determine the number of clusters (if it is known then the number of
  59. %missing nodes will be used as the number of clusters)
  60. k = DetermineNumberOfClusters(num_clusters_known, graph_visible, num_missing_nodes, num_placeholders);
  61. %predict the output graph
  62. [ graph_predicted, clustering] = PredictGraph(affinity, k, graph_visible, num_placeholders, affinity_calculation_type, 1);
  63. end
  64. function [affinity] = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes)
  65. global affinity_calculation_shortest_path;
  66. global affinity_calculation_euclid;
  67. global affinity_calculation_common_friends;
  68. global affinity_calculation_random_clustering;
  69. global affinity_calculation_adamic_adar;
  70. global affinity_calculation_katz_beta_0_5;
  71. global affinity_calculation_katz_beta_0_05;
  72. global affinity_calculation_katz_beta_0_005;
  73. if affinity_calculation_type == affinity_calculation_euclid
  74. sp_mat = graphallshortestpaths(data);
  75. %remove INF values
  76. max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
  77. sp_mat_euclid = sp_mat;
  78. sp_mat_euclid(sp_mat == Inf) = max_value;
  79. affinity = CalculateAffinity(sp_mat_euclid);
  80. %affinity = exp(-(sp_mat.^2))/(2 * 0.3^2);
  81. elseif affinity_calculation_type == affinity_calculation_shortest_path
  82. % max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
  83. % sp_mat_euclid = sp_mat;
  84. % sp_mat_euclid(sp_mat == Inf) = max_value;
  85. % affinity = (sp_mat_euclid + 1).^(-affinity_exp_factor);
  86. %affinity = spfun(affinityFunc, data);
  87. affinity = graphallshortestpaths(data);
  88. affinity = affinity .^ -2;
  89. affinity(affinity == Inf) = 1; %added on 05/11/11
  90. elseif affinity_calculation_type == affinity_calculation_common_friends
  91. affinity = CalcAffinityByCommonNeighbors_Sparse(data, actual_graph_size, num_missing_nodes);
  92. %affinity = CalcAffinityByCommonNeighbors(data, actual_graph_size, num_missing_nodes);
  93. elseif affinity_calculation_type == affinity_calculation_random_clustering
  94. affinity = data; %just a placeholder...
  95. elseif affinity_calculation_type == affinity_calculation_adamic_adar
  96. affinity = CalculateAffinityByAdamicAdar_Sparse( data, actual_graph_size, num_missing_nodes, 1 );
  97. if nnz(affinity) < 5
  98. x = 8;
  99. end
  100. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_5
  101. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.5, 3 );
  102. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
  103. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.05, 4 );
  104. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_005
  105. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.005, 4 );
  106. end
  107. end
  108. function [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
  109. global affinity_calculation_shortest_path;
  110. global affinity_calculation_euclid;
  111. global affinity_calculation_common_friends;
  112. global affinity_calculation_random_clustering;
  113. global affinity_calculation_adamic_adar;
  114. global affinity_calculation_katz_beta_0_5;
  115. global affinity_calculation_katz_beta_0_05;
  116. global affinity_calculation_katz_beta_0_005;
  117. first_unk_node = size(affinity,1) - num_placeholders + 1;
  118. diagonal = sum(affinity, 2); %sum the rows
  119. D = sparse(diag(diagonal)); %D is the matrix whose diagonal is the sum of the rows of A
  120. clear('diagonal');
  121. fprintf('Calculating NL\n');
  122. D = sqrt(D);
  123. NL1 = D * affinity * D;
  124. clear('D');
  125. fprintf('calculating U\n');
  126. fail = 0;
  127. try
  128. [nEigVec,eigValues] = eigs(NL1,k);
  129. catch ME1
  130. opts.tol = 1e-1;
  131. try
  132. [nEigVec,eigValues] = eigs(NL1,k, 'LM', opts);
  133. catch ME2
  134. fail = 1;
  135. end
  136. end
  137. % select k largest eigen vectors
  138. if fail == 0
  139. U = [];
  140. % construct the normalized matrix U from the obtained eigen vectors
  141. for i=1:size(nEigVec,1)
  142. n = sqrt(sum(nEigVec(i,:).^2));
  143. U(i,:) = nEigVec(i,:) ./ n;
  144. end
  145. num_samples = size(affinity,1) - first_unk_node + 1;
  146. if cluster_only_missing_nodes == 1
  147. U(1:first_unk_node - 1 ,:) = []; %cluster only the missing nodes
  148. end
  149. fprintf('kmeans clustering\n');
  150. % perform kmeans clustering on the matrix U
  151. fail = 1;
  152. while fail > 0
  153. try
  154. currK = k
  155. [IDX,C, SUMD, D] = kmeans(U,currK, 'EmptyAction', 'singleton'); %in case of an empty cluster just drop it
  156. fail = 0;
  157. catch ME1
  158. fail = fail + 1;
  159. if fail < 100
  160. disp('error in kmeans clustering. trying again...');
  161. else
  162. %give up on clustering and select random clusters...
  163. IDX = randi(currK, size(U));
  164. fail = 0;
  165. end
  166. end
  167. end
  168. test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));
  169. %if it's random just replace everything...
  170. if affinity_calculation_type == affinity_calculation_random_clustering
  171. test_clustering = randi(k, size(test_clustering,1), size(test_clustering,2));
  172. end
  173. else
  174. disp('Failed in finding eigenvectors - using random!');
  175. if cluster_only_missing_nodes == 0
  176. num_samples = size(affinity,1);
  177. else
  178. num_samples = num_placeholders;
  179. end
  180. test_clustering = randi(k, num_samples, 1);
  181. end
  182. end
  183. function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
  184. last_known_node = size(data,1) - num_placeholders;
  185. first_unk_node = last_known_node + 1;
  186. [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
  187. newNodes = CreateNewNodesFromClusters(data, test_clustering);
  188. newData = [data(1:last_known_node,1:last_known_node), newNodes(:, 1:last_known_node)';...
  189. newNodes(:,1:last_known_node), zeros(size(newNodes,1))];
  190. end
  191. function [k] = DetermineNumberOfClusters(num_clusters_known, data_untouched, num_missing_nodes, num_placeholders)
  192. %determine k - the number of clusters
  193. if num_clusters_known == 1
  194. k = num_missing_nodes;
  195. else
  196. known_graph_size = size(data_untouched,1) - num_placeholders;
  197. k = mean(sum(data_untouched(1 : known_graph_size, 1 : known_graph_size)));
  198. %fprintf('Average number of neighbors for known nodes = %d, number of added nodes = %d\n', k, num_added_nodes);
  199. k = round(num_placeholders / floor(k));
  200. fprintf('Rounding k to %d\n', k);
  201. end
  202. end
  203. function [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node)
  204. num_placeholders = size(affinity,1) - first_unk_node + 1;
  205. affinity_sum = sum(affinity(first_unk_node:size(affinity,1),:)); %the sum of the affinity of placeholders to all other nodes
  206. nodes_to_keep = (affinity_sum > 0); %keep only nodes which have some affinity to the placeholders
  207. nodes_to_keep(first_unk_node:size(affinity,1)) = 1; %keep all the placeholders even if for some reason they have a sum of zero...
  208. affinity = affinity(nodes_to_keep, nodes_to_keep);
  209. first_unk_node = size(affinity,1) - num_placeholders + 1;
  210. end
  211. function [true_clustering] = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node)
  212. true_clustering = []; %zeros(size(test_clustering, 1), 1);
  213. for i = 2 : size(missing_nodes_mapping, 1)
  214. for j = 1 : size(missing_nodes_mapping,2)
  215. if missing_nodes_mapping(i,j) ~= 0
  216. true_clustering(missing_nodes_mapping(i,j) - actual_graph_size + num_missing_nodes, 1) = j; % missing_nodes_mapping(1, j);
  217. end
  218. end
  219. end
  220. if percent_known_placeholders < 1
  221. true_clustering(placeholders_to_remove - last_known_node) = [];
  222. end
  223. end