You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

MissingNodes_Sparse.m 35KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668
  1. function [ rand_score, purity, p_triads, missing_nodes_mapping, removed_nodes] = MissingNodes_Sparse( dataFilePath, dataFileName, affinityType, percentKnownPlaceholdersVec, missingNodes )
  2. %UNTITLED2 Summary of this function goes here
  3. % Detailed explanation goes here
  4. %[SUCCESS,MESSAGE,MESSAGEID] = mkdir('C:\\MissingNodes\\Code\\Results\\',dataFileName);
  5. global g_threshold;
  6. % addpath 'Spectral Clustering' % sigal 13.8.12
  7. % addpath 'mex'% sigal 13.8.12
  8. p_triads = [];
  9. %create a log file for this run
  10. %date_now = clock;
  11. %date_now = strcat(num2str(date_now(1)),'_',num2str(date_now(2)),'_', num2str(date_now(3)),'_', num2str(date_now(4)), num2str(date_now(5)),'_', num2str(date_now(6)));
  12. %diary(strcat('C:\missingnodes\Code\Log\log', date_now,'.log'));
  13. %affinity calculation types
  14. global affinity_calculation_shortest_path;
  15. global affinity_calculation_euclid;
  16. global affinity_calculation_common_friends;
  17. global affinity_calculation_random_clustering;
  18. global affinity_calculation_adamic_adar;
  19. global affinity_calculation_katz_beta_0_5;
  20. global affinity_calculation_katz_beta_0_05;
  21. global affinity_calculation_katz_beta_0_005;
  22. affinity_calculation_shortest_path = 0;
  23. affinity_calculation_euclid = 1;
  24. affinity_calculation_common_friends = 2;
  25. affinity_calculation_random_clustering = 3;
  26. affinity_calculation_adamic_adar = 4;
  27. affinity_calculation_katz_beta_0_5 = 5;
  28. affinity_calculation_katz_beta_0_05 = 6;
  29. affinity_calculation_katz_beta_0_005 = 7;
  30. %percent_known_placeholders_vec = [0.9, 0.8, 0.7, 0.6, 0.5,1];
  31. %%%%% for distance as function of num placeholders %%%%
  32. if nargin < 4
  33. percent_known_placeholders_vec = [0.5];
  34. else
  35. percent_known_placeholders_vec = percentKnownPlaceholdersVec;
  36. end
  37. if nargin >= 5
  38. select_random_missing_nodes = 0;
  39. else
  40. select_random_missing_nodes = 1;
  41. end
  42. compensate_for_unknown_placeholers = 0;
  43. %configuration parameters
  44. %num_values = 100000; %number of values to read from the raw data
  45. %graph_sizes = [400 600 800 1000 1200 1400]; %size of the graph matrix
  46. %graph_sizes = [400 1000]; %size of the graph matrix
  47. num_missing_nodes_arr = 5 : 5 : 30; %number of missing nodes from the network
  48. num_missing_nodes_arr = [11,21,31,41,50]; % 20;
  49. num_missing_nodes_arr = [5 10];
  50. unite_common_friends = 0; %should UNK nodes be united in accordance with the "friend of my friend principle"
  51. cluster_only_missing_nodes = 1;
  52. % if affinity_calculation_type == affinity_calculation_shortest_path || affinity_calculation_type == affinity_calculation_euclid
  53. % non_neighbors_distance = Inf;
  54. % elseif affinity_calculation_type == affinity_calculation_common_friends
  55. % non_neighbors_distance = 0;
  56. % end
  57. non_neighbors_distance = 0;
  58. %read the full network, up to num_values links
  59. %disp('reading network information from file...');
  60. fprintf('reading network information from file %s%s ...\n', dataFilePath, dataFileName);
  61. load(strcat(dataFilePath, dataFileName), 'data');
  62. data = sparse(data);
  63. clear('graph');
  64. %rand_score = zeros(2, 2, size(num_missing_nodes_arr,2), 6, 2); %(normalized or not, unite common friends, num missing nodes, affinity calculation, cluster only missing)
  65. %rand_score_sq = rand_score;
  66. %purity = zeros(2, size(num_missing_nodes_arr,2), 6, 2);%(unite common friends, num missing nodes, affinity calculation, cluster only missing)
  67. %purity_sq = purity;
  68. rand_score = [];
  69. purity = [];
  70. graph_size = size(data,1);
  71. %num_missing_nodes_arr = round(num_missing_nodes_arr .* graph_size);
  72. %initialize the data matrix (binary adjacency)
  73. disp('generating network graph...');
  74. actual_graph_size = size(data,1);
  75. original_data = data;
  76. missing_nodes_mapping = [];
  77. for num_missing_nodes_idx = 1 : size(num_missing_nodes_arr,2)
  78. if select_random_missing_nodes
  79. num_missing_nodes = num_missing_nodes_arr(1, num_missing_nodes_idx);
  80. else
  81. num_missing_nodes = length(missingNodes);
  82. end
  83. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  84. %remove random nodes %
  85. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  86. %randomize the missing nodes indexes and sort in descending order
  87. %disp('selecting random missing nodes...');
  88. fprintf('selecting %d random missing nodes...\n',num_missing_nodes);
  89. if select_random_missing_nodes
  90. [data, missing_nodes_mapping] = RemoveRandomNodes( original_data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance );
  91. else
  92. [data, missing_nodes_mapping] = RemoveRandomNodes( original_data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance, missingNodes);
  93. end
  94. data_untouched = data;
  95. for percent_known_placeholders = percent_known_placeholders_vec
  96. data = data_untouched;
  97. S = zeros(1, size(data,2));
  98. num_placeholders_to_remove = 0;
  99. placeholders_to_remove = [];
  100. num_placeholders = size(data,1) - actual_graph_size + num_missing_nodes ;
  101. last_known_node = size(data,1) - num_placeholders;
  102. if percent_known_placeholders < 1
  103. num_placeholders_to_remove = round(num_placeholders * (1 - percent_known_placeholders));
  104. while size(placeholders_to_remove, 2) < num_placeholders_to_remove
  105. %randomly selecting unique placeholder indexes
  106. placeholders_to_remove = unique([placeholders_to_remove, randi(num_placeholders, 1, num_placeholders_to_remove - size(placeholders_to_remove, 2))]);
  107. end
  108. %rand_vec = rand(1, num_placeholders);
  109. %placeholders_to_remove = find(rand_vec > percent_known_placeholders) + last_known_node;
  110. placeholders_to_remove = placeholders_to_remove + last_known_node;
  111. %S is the group of neighbors of the unknown placeholders
  112. S = data(placeholders_to_remove(1), :);
  113. for i = placeholders_to_remove
  114. S = S | data(i,:);
  115. end
  116. %switch from binary vector to list of indexes
  117. % S = find(S);
  118. data_all_placeholders = data;
  119. data(placeholders_to_remove,:) = [];
  120. data(:,placeholders_to_remove) = [];
  121. num_placeholders = num_placeholders - num_placeholders_to_remove;
  122. end
  123. removed_nodes{num_missing_nodes_idx} = missing_nodes_mapping(1,:); %save the removed nodes in each iteration
  124. disp('finding all pairs shortest paths...');
  125. %if we got an argument for the affinity calculation type then use
  126. %it. otherwise loop over all the types
  127. if nargin >= 3
  128. affinity_types = affinityType;
  129. else
  130. affinity_types = affinity_calculation_shortest_path: affinity_calculation_katz_beta_0_005;
  131. end
  132. orig_S = S;
  133. for affinity_calculation_type = affinity_types
  134. for withAttr = 0:1
  135. fprintf('affinity_calculation_type = %d\n', affinity_calculation_type);
  136. %data = data_untouched;
  137. first_unk_node = actual_graph_size - num_missing_nodes + 1;
  138. num_added_nodes = size(data,1) - first_unk_node + 1;
  139. % calculate the affinity / similarity matrix
  140. fprintf('calculating affinity matrix, type %d...\n', affinity_calculation_type);
  141. tic %affinity_calc_time
  142. affinity = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes);
  143. affinity_calc_time = toc;%affinity_calc_time
  144. %TODO: extend the dimension reduction to adding missing
  145. %links / reclustering
  146. for reduce_dimensions = [1] %0 must be first because it does not change the affinity matrix
  147. reduce_dim_time = 0;
  148. if reduce_dimensions == 1
  149. tic %ReduceDimensions
  150. [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node);
  151. reduce_dim_time = toc; %ReduceDimensions
  152. end
  153. fprintf('calculating true clustering\n');
  154. true_clustering = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node);
  155. %figure,imshow(affinity,[]), title('Affinity Matrix')
  156. for num_clusters_known = [1] %[0 1]
  157. k = DetermineNumberOfClusters(num_clusters_known, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes);
  158. last_known_node = first_unk_node - 1;
  159. tic %graph_predict_time
  160. [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
  161. graph_predict_time = toc; %graph_predict_time
  162. if compensate_for_unknown_placeholers == 1
  163. S = orig_S;
  164. if size(newData,1) > size(S,2)
  165. %for breakpoint
  166. tttt = 98;
  167. end
  168. sigma = 1/4;
  169. S = S(1:size(newData,1));
  170. S = S + randn(size(S)) * sigma;
  171. sorted_S = sort(S, 'descend');
  172. %sum over the columns and find the columns which
  173. %indicate at least one neighbor
  174. neighbors_of_new_nodes = find(sum(newData(first_unk_node:size(newData,1), :)));
  175. first_united_node = size(newData,1) - k +1;
  176. newAffinity = CalcAffinityByKatzBeta_Sparse( newData, 0.05, 4 );
  177. %newAffinity = CalculateAffinityByAdamicAdar_Sparse(newData, size(newData,1), 0, 0);
  178. newNodesAffinity = newAffinity(first_united_node:size(newAffinity,1), :);
  179. newNodesAffinity(newNodesAffinity>=1) = 0;
  180. newNodesAffinity = newNodesAffinity / max(max(newNodesAffinity));
  181. %newNodesAffinity = newNodesAffinity / 2;
  182. newNodesAffinity(newData(first_united_node:size(newAffinity,1), :) >= 1) = 0;
  183. newNodesAffinity = (newNodesAffinity / max(max(newNodesAffinity)));
  184. %%%%trying to take only the
  185. %%%%k highest affinities
  186. sortedNewNodesAffinity = sort(newNodesAffinity(:), 'descend');
  187. %affinityThreshold = sortedNewNodesAffinity(k + size(neighbors_of_new_nodes, 2));
  188. newNodesAffinity_orig = newNodesAffinity;
  189. end
  190. if percent_known_placeholders < 1
  191. %calculating as a function
  192. %of number of links added
  193. maxNumLinksToAdd = mean(sum(data(1:last_known_node, 1:last_known_node))) * num_placeholders;
  194. maxNumLinksToAdd = min(maxNumLinksToAdd, 25);
  195. else
  196. maxNumLinksToAdd = 0;
  197. end
  198. %%%%% for distance as
  199. %%%%% function of num
  200. %%%%% placeholders %%%%
  201. %max_neighbors = S >= sorted_S(maxNumLinksToAdd);
  202. for numLinksToAdd = 0 : maxNumLinksToAdd
  203. if compensate_for_unknown_placeholers == 1
  204. newNodesAffinity = newNodesAffinity_orig;
  205. neighbors = [];
  206. if numLinksToAdd > 0
  207. neighbors = find(S >= sorted_S(numLinksToAdd), numLinksToAdd);
  208. end
  209. newDataWithMissingLinks = newData;
  210. newDataForClustering = data;
  211. for neighbor = neighbors
  212. [value, closest_new_node] = max(newNodesAffinity(:,neighbor));
  213. closest_new_node = closest_new_node(1);
  214. newDataWithMissingLinks(first_united_node + closest_new_node - 1, neighbor) = 1;
  215. newDataWithMissingLinks(neighbor, first_united_node + closest_new_node - 1) = 1;
  216. newPlaceholder = zeros(1, size(newDataForClustering,2));
  217. newPlaceholder(neighbor) = 1;
  218. newDataForClustering = [newDataForClustering, newPlaceholder';
  219. newPlaceholder, 0];
  220. end
  221. affinityWithS = CalcAffinity( newDataForClustering, affinity_calculation_type, actual_graph_size, num_missing_nodes);
  222. if reduce_dimensions == 1
  223. [affinityWithS, num_placeholdersWithS, first_unk_node_with_s] = ReduceDimensions(affinityWithS, first_united_node);
  224. else
  225. num_placeholdersWithS = num_placeholders + length(neighbors);
  226. first_unk_node_with_s = first_united_node;
  227. end
  228. [newPredictedGraph, newClustering] = PredictGraph(affinityWithS, k, newDataForClustering, num_placeholdersWithS, affinity_calculation_type, cluster_only_missing_nodes);
  229. %remap the original data so that
  230. %the known nodes match the
  231. %predicted data and the missing
  232. %nodes match the predicted nodes
  233. %created from each cluster
  234. perm_vector = 1:size(original_data,1);
  235. perm_vector(missing_nodes_mapping(1,:)) = [];
  236. perm_vector = [perm_vector, missing_nodes_mapping(1,:)];
  237. remapped_data = original_data(perm_vector,perm_vector);
  238. [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, first_united_node : size(remapped_data,1), neighbors); %changed from neighbors to perm(max_neighbors) - to be fair when there are more nieghbors
  239. %small_data2 = DecreaseGraphSize(newData, last_known_node+1 : size(newData,1));
  240. %new_nodes_affinity_sum = sum(sum(newNodesAffinity));
  241. %in case there is an empty cluster, the unrelated nodes may contain node index that does not exist
  242. %in the predicted graph (which may contain less nodes)
  243. indices_to_remove(indices_to_remove > size(newPredictedGraph,2)) = [];
  244. small_data2 = newData;
  245. small_data2(indices_to_remove,:) = [];
  246. small_data2(:,indices_to_remove) = [];
  247. %small_data3 = DecreaseGraphSize(newDataWithMissingLinks, last_known_node+1 : size(newDataWithMissingLinks,1));
  248. small_data3 = newDataWithMissingLinks;
  249. small_data3(indices_to_remove,:) = [];
  250. small_data3(:,indices_to_remove) = [];
  251. small_data4 = newPredictedGraph;
  252. small_data4(indices_to_remove,:) = [];
  253. small_data4(:,indices_to_remove) = [];
  254. %only calculate on
  255. %the first iteration
  256. %to save time
  257. if numLinksToAdd == 0
  258. edit_distance = GraphEditDistance( small_data, small_data2, num_missing_nodes );
  259. edit_distance2 = edit_distance;
  260. edit_distance3 = edit_distance;
  261. else
  262. edit_distance2 = GraphEditDistance( small_data, small_data3, num_missing_nodes );
  263. edit_distance3 = GraphEditDistance( small_data, small_data4, num_missing_nodes );
  264. end
  265. else
  266. perm_vector = 1:size(original_data,1);
  267. perm_vector(missing_nodes_mapping(1,:)) = [];
  268. perm_vector = [perm_vector, missing_nodes_mapping(1,:)];
  269. remapped_data = original_data(perm_vector,perm_vector);
  270. [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, (size(remapped_data,1) - num_missing_nodes + 1) : size(remapped_data,1), []); %changed from neighbors to perm(max_neighbors) - to be fair when there are more nieghbors
  271. indices_to_remove(indices_to_remove > size(newData,2)) = [];
  272. small_data2 = newData;
  273. small_data2(indices_to_remove,:) = [];
  274. small_data2(:,indices_to_remove) = [];
  275. edit_distance = GraphEditDistance( small_data, small_data2, num_missing_nodes );
  276. end
  277. %fprintf('num_links_to_add = %d; missing links distance = %d, new clustering dist = %d\n', numLinksToAdd, full(edit_distance2), full(edit_distance3));
  278. fprintf('calculating purity\n');
  279. curr_index = size(purity,2) + 1;
  280. %purity for actual clustering
  281. try
  282. temp_purity = ClusteringPurity(true_clustering, test_clustering);
  283. catch ME1
  284. temp_purity = 1; % Sigal 12.8.12 - tmp %
  285. ddddd = 1;
  286. end
  287. purity(curr_index).score = temp_purity;
  288. purity(curr_index).score_sq = temp_purity^2;
  289. purity(curr_index).edit_distance = edit_distance;
  290. if compensate_for_unknown_placeholers == 1
  291. purity(curr_index).numLinksToAdd = numLinksToAdd;
  292. purity(curr_index).edit_distance_missing_links = edit_distance2;
  293. purity(curr_index).edit_distance_new_clustering = edit_distance3;
  294. fprintf('numLinksToAdd - %d\nedit_distance - %d\nedit_distance_missing_links - %d\nedit_distance_new_clustering - %d\n', numLinksToAdd, full(edit_distance), full(edit_distance2), full(edit_distance3));
  295. end
  296. purity(curr_index).withAttr = withAttr;
  297. purity(curr_index).unite_common_friends = unite_common_friends;
  298. purity(curr_index).num_missing_nodes_idx = num_missing_nodes_idx;
  299. purity(curr_index).num_missing_nodes = num_missing_nodes_arr(num_missing_nodes_idx);
  300. purity(curr_index).affinity_calculation_type = affinity_calculation_type;
  301. purity(curr_index).cluster_only_missing_nodes = cluster_only_missing_nodes;
  302. purity(curr_index).num_clusters_known = num_clusters_known;
  303. purity(curr_index).num_placeholders = num_placeholders;
  304. purity(curr_index).num_placeholders_to_remove = num_placeholders_to_remove;
  305. purity(curr_index).iteration = 1;
  306. purity(curr_index).test_clustering = test_clustering;
  307. purity(curr_index).true_clustering = true_clustering;
  308. purity(curr_index).graph_size = graph_size;
  309. purity(curr_index).inverse_purity = 1; % Sigal 12.8.12 - tmp % CalculateInversePurity(true_clustering, test_clustering);
  310. purity(curr_index).NMI = 0; %CalcNormalizedMutualInformation(true_clustering, test_clustering);
  311. purity(curr_index).removed_nodes = removed_nodes;
  312. purity(curr_index).percent_known_placeholders = percent_known_placeholders;
  313. purity(curr_index).reduce_dimensions = reduce_dimensions;
  314. purity(curr_index).missing_nodes_mapping = missing_nodes_mapping;
  315. purity(curr_index).compensate_for_unknown_placeholers = compensate_for_unknown_placeholers;
  316. purity(curr_index).affinity_calc_time = affinity_calc_time;
  317. purity(curr_index).reduce_dim_time = reduce_dim_time;
  318. purity(curr_index).graph_predict_time = graph_predict_time;
  319. if compensate_for_unknown_placeholers == 0
  320. break
  321. end
  322. end
  323. %
  324. % purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
  325. % purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity;
  326. %
  327. % purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
  328. % purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity^2;
  329. %
  330. %fprintf('affinity_calculation_type = %d, unite_common_friends = %d\n', affinity_calculation_type, unite_common_friends);
  331. %fprintf('Graph size: %d, Number of missing nodes: %d, Purity: %f \n' ,graph_size, num_missing_nodes, temp_purity);
  332. %fprintf('============================================\n\n\n');
  333. %clear U;
  334. clear eigValues;
  335. clear eigVectors;
  336. end
  337. end
  338. clear('affinity');
  339. end
  340. end
  341. end
  342. end
  343. end
  344. function [affinity] = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes)
  345. global affinity_calculation_shortest_path;
  346. global affinity_calculation_euclid;
  347. global affinity_calculation_common_friends;
  348. global affinity_calculation_random_clustering;
  349. global affinity_calculation_adamic_adar;
  350. global affinity_calculation_katz_beta_0_5;
  351. global affinity_calculation_katz_beta_0_05;
  352. global affinity_calculation_katz_beta_0_005;
  353. if affinity_calculation_type == affinity_calculation_euclid
  354. sp_mat = graphallshortestpaths(data);
  355. %remove INF values
  356. max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
  357. sp_mat_euclid = sp_mat;
  358. sp_mat_euclid(sp_mat == Inf) = max_value;
  359. affinity = CalculateAffinity(sp_mat_euclid);
  360. %affinity = exp(-(sp_mat.^2))/(2 * 0.3^2);
  361. elseif affinity_calculation_type == affinity_calculation_shortest_path
  362. % max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
  363. % sp_mat_euclid = sp_mat;
  364. % sp_mat_euclid(sp_mat == Inf) = max_value;
  365. % affinity = (sp_mat_euclid + 1).^(-affinity_exp_factor);
  366. %affinity = spfun(affinityFunc, data);
  367. affinity = graphallshortestpaths(data);
  368. affinity = affinity .^ -2;
  369. affinity(affinity == Inf) = 1; %added on 05/11/11
  370. elseif affinity_calculation_type == affinity_calculation_common_friends
  371. affinity = CalcAffinityByCommonNeighbors_Sparse(data, actual_graph_size, num_missing_nodes);
  372. %affinity = CalcAffinityByCommonNeighbors(data, actual_graph_size, num_missing_nodes);
  373. elseif affinity_calculation_type == affinity_calculation_random_clustering
  374. affinity = data; %just a placeholder...
  375. elseif affinity_calculation_type == affinity_calculation_adamic_adar
  376. affinity = CalculateAffinityByAdamicAdar_Sparse( data, actual_graph_size, num_missing_nodes, 1 );
  377. if nnz(affinity) < 5
  378. x = 8;
  379. end
  380. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_5
  381. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.5, 3 );
  382. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
  383. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.05, 4 );
  384. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_005
  385. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.005, 4 );
  386. end
  387. end
  388. function [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
  389. global affinity_calculation_shortest_path;
  390. global affinity_calculation_euclid;
  391. global affinity_calculation_common_friends;
  392. global affinity_calculation_random_clustering;
  393. global affinity_calculation_adamic_adar;
  394. global affinity_calculation_katz_beta_0_5;
  395. global affinity_calculation_katz_beta_0_05;
  396. global affinity_calculation_katz_beta_0_005;
  397. first_unk_node = size(affinity,1) - num_placeholders + 1;
  398. diagonal = sum(affinity, 2); %sum the rows
  399. D = sparse(diag(diagonal)); %D is the matrix whose diagonal is the sum of the rows of A
  400. clear('diagonal');
  401. fprintf('Calculating NL\n');
  402. D = sqrt(D);
  403. NL1 = D * affinity * D;
  404. clear('D');
  405. fprintf('calculating U\n');
  406. fail = 0;
  407. try
  408. [nEigVec,eigValues] = eigs(NL1,k);
  409. catch ME1
  410. opts.tol = 1e-1;
  411. try
  412. [nEigVec,eigValues] = eigs(NL1,k, 'LM', opts);
  413. catch ME2
  414. fail = 1;
  415. end
  416. end
  417. % select k largest eigen vectors
  418. if fail == 0
  419. U = [];
  420. % construct the normalized matrix U from the obtained eigen vectors
  421. for i=1:size(nEigVec,1)
  422. n = sqrt(sum(nEigVec(i,:).^2));
  423. U(i,:) = nEigVec(i,:) ./ n;
  424. end
  425. num_samples = size(affinity,1) - first_unk_node + 1;
  426. if cluster_only_missing_nodes == 1
  427. U(1:first_unk_node - 1 ,:) = []; %cluster only the missing nodes
  428. end
  429. fprintf('kmeans clustering\n');
  430. % perform kmeans clustering on the matrix U
  431. fail = 1;
  432. while fail > 0
  433. try
  434. currK = k;
  435. [IDX,C, SUMD, D] = kmeans(U,currK, 'EmptyAction', 'singleton'); %in case of an empty cluster just drop it
  436. fail = 0;
  437. catch ME1
  438. fail = fail + 1;
  439. if fail < 100
  440. disp('error in kmeans clustering. trying again...');
  441. else
  442. %give up on clustering and select random clusters...
  443. IDX = randi(currK, size(U));
  444. fail = 0;
  445. end
  446. end
  447. end
  448. test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));
  449. %if it's random just replace everything...
  450. if affinity_calculation_type == affinity_calculation_random_clustering
  451. test_clustering = randi(k, size(test_clustering,1), size(test_clustering,2));
  452. end
  453. else
  454. disp('Failed in finding eigenvectors - using random!');
  455. if cluster_only_missing_nodes == 0
  456. num_samples = size(affinity,1);
  457. else
  458. num_samples = num_placeholders;
  459. end
  460. test_clustering = randi(k, num_samples, 1);
  461. end
  462. end
  463. function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
  464. last_known_node = size(data,1) - num_placeholders;
  465. first_unk_node = last_known_node + 1;
  466. [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
  467. newNodes = CreateNewNodesFromClusters(data, test_clustering);
  468. newData = [data(1:last_known_node,1:last_known_node), newNodes(:, 1:last_known_node)';...
  469. newNodes(:,1:last_known_node), zeros(size(newNodes,1))];
  470. end
  471. function [k] = DetermineNumberOfClusters(num_clusters_known, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes)
  472. %determine k - the number of clusters
  473. if num_clusters_known == 1
  474. k = num_missing_nodes;
  475. elseif num_clusters_known == 0
  476. k = mean(sum(data_untouched(1 : actual_graph_size - num_missing_nodes, 1 : actual_graph_size - num_missing_nodes)));
  477. %fprintf('Average number of neighbors for known nodes = %d, number of added nodes = %d\n', k, num_added_nodes);
  478. k = round(num_added_nodes / floor(k));
  479. fprintf('Rounding k to %d\n', k);
  480. fprintf('Actual number of clusters = %d\n', num_missing_nodes);
  481. elseif num_clusters_known == 2
  482. k = mean(sum(data_untouched(1 : actual_graph_size - num_missing_nodes, 1 : actual_graph_size - num_missing_nodes)));
  483. %fprintf('Average number of neighbors for known nodes = %d, number of added nodes = %d\n', k, num_added_nodes);
  484. k = 2*round(num_added_nodes / k);
  485. fprintf('Guessing upper limit for k: %d\n', k);
  486. fprintf('Actual number of clusters = %d\n', num_missing_nodes);
  487. end
  488. end
  489. function [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node)
  490. num_placeholders = size(affinity,1) - first_unk_node + 1;
  491. affinity_sum = sum(affinity(first_unk_node:size(affinity,1),:)); %the sum of the affinity of placeholders to all other nodes
  492. nodes_to_keep = (affinity_sum > 0); %keep only nodes which have some affinity to the placeholders
  493. nodes_to_keep(first_unk_node:size(affinity,1)) = 1; %keep all the placeholders even if for some reason they have a sum of zero...
  494. affinity = affinity(nodes_to_keep, nodes_to_keep);
  495. first_unk_node = size(affinity,1) - num_placeholders + 1;
  496. end
  497. function [true_clustering] = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node)
  498. true_clustering = []; %zeros(size(test_clustering, 1), 1);
  499. for i = 2 : size(missing_nodes_mapping, 1)
  500. for j = 1 : size(missing_nodes_mapping,2)
  501. if missing_nodes_mapping(i,j) ~= 0
  502. true_clustering(missing_nodes_mapping(i,j) - actual_graph_size + num_missing_nodes, 1) = j; % missing_nodes_mapping(1, j);
  503. end
  504. end
  505. end
  506. if percent_known_placeholders < 1
  507. true_clustering(placeholders_to_remove - last_known_node) = [];
  508. end
  509. end