You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

MissingNodes_Sparse2n.m 46KB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831
  1. function [ rand_score, purity, p_triads, missing_nodes_mapping, removed_nodes] = MissingNodes_Sparse2n( dataFilePath, dataFileName, affinityType, ...
  2. num_missing_nodes_arr, percentKnownPlaceholdersVec, dumpSmallFlag, dumpSmallDataPath, iter, ...
  3. noisePercVec, noiseTypeVec, missingNodes)
  4. %UNTITLED2 Summary of this function goes here
  5. % Detailed explanation goes here
  6. %[SUCCESS,MESSAGE,MESSAGEID] = mkdir('C:\\MissingNodes\\Code\\Results\\',dataFileName);
  7. %---------------------------------------------
  8. %MissingNodes_Sparse2n - run option with noise
  9. %---------------------------------------------
  10. %noisePercVec = [0 0.1]; % additional noise as percentage of num PHs
  11. %noiseTypeVec = [1 2 3];% 1=global, 2=local(upto dist 2), 3=local(upto dist 3)
  12. %---------------------------------------------
  13. global g_threshold;
  14. % addpath 'Spectral Clustering' % sigal 13.8.12
  15. % addpath 'mex'% sigal 13.8.12
  16. p_triads = [];
  17. %create a log file for this run
  18. %date_now = clock;
  19. %date_now = strcat(num2str(date_now(1)),'_',num2str(date_now(2)),'_', num2str(date_now(3)),'_', num2str(date_now(4)), num2str(date_now(5)),'_', num2str(date_now(6)));
  20. %diary(strcat('C:\missingnodes\Code\Log\log', date_now,'.log'));
  21. %affinity calculation types
  22. global affinity_calculation_shortest_path;
  23. global affinity_calculation_euclid;
  24. global affinity_calculation_common_friends;
  25. global affinity_calculation_random_clustering;
  26. global affinity_calculation_adamic_adar;
  27. global affinity_calculation_katz_beta_0_5;
  28. global affinity_calculation_katz_beta_0_05;
  29. global affinity_calculation_katz_beta_0_005;
  30. affinity_calculation_shortest_path = 0;
  31. affinity_calculation_euclid = 1;
  32. affinity_calculation_common_friends = 2;
  33. affinity_calculation_random_clustering = 3;
  34. affinity_calculation_adamic_adar = 4;
  35. affinity_calculation_katz_beta_0_5 = 5;
  36. affinity_calculation_katz_beta_0_05 = 6;
  37. affinity_calculation_katz_beta_0_005 = 7;
  38. percent_known_placeholders_vec = percentKnownPlaceholdersVec;
  39. if nargin >= 11
  40. select_random_missing_nodes = 0;
  41. else
  42. select_random_missing_nodes = 1;
  43. end
  44. compensate_for_unknown_placeholers = 0;
  45. compensate_vec = [0 0.3 0.65 1 1.5];
  46. %configuration parameters
  47. %num_values = 100000; %number of values to read from the raw data
  48. %graph_sizes = [400 600 800 1000 1200 1400]; %size of the graph matrix
  49. %graph_sizes = [400 1000]; %size of the graph matrix
  50. % num_missing_nodes_arr = 5 : 5 : 30; %number of missing nodes from the network
  51. % num_missing_nodes_arr = [11,21,31,41,50]; % 20;
  52. %%num_missing_nodes_arr = 1;
  53. unite_common_friends = 0; %should UNK nodes be united in accordance with the "friend of my friend principle"
  54. cluster_only_missing_nodes = 1;
  55. % if affinity_calculation_type == affinity_calculation_shortest_path || affinity_calculation_type == affinity_calculation_euclid
  56. % non_neighbors_distance = Inf;
  57. % elseif affinity_calculation_type == affinity_calculation_common_friends
  58. % non_neighbors_distance = 0;
  59. % end
  60. non_neighbors_distance = 0;
  61. withAttr = 0;
  62. %read the full network, up to num_values links
  63. %disp('reading network information from file...');
  64. fprintf('reading network information from file %s%s ...\n', dataFilePath, dataFileName);
  65. load(strcat(dataFilePath, dataFileName), 'data');
  66. data = sparse(data);
  67. clear('graph');
  68. %rand_score = zeros(2, 2, size(num_missing_nodes_arr,2), 6, 2); %(normalized or not, unite common friends, num missing nodes, affinity calculation, cluster only missing)
  69. %rand_score_sq = rand_score;
  70. %purity = zeros(2, size(num_missing_nodes_arr,2), 6, 2);%(unite common friends, num missing nodes, affinity calculation, cluster only missing)
  71. %purity_sq = purity;
  72. rand_score = [];
  73. purity = [];
  74. graph_size = size(data,1);
  75. graph_edges = nnz(data)/2; % number of edges
  76. %initialize the data matrix (binary adjacency)
  77. disp('generating network graph...');
  78. actual_graph_size = size(data,1);
  79. original_data = data;
  80. missing_nodes_mapping = [];
  81. for num_missing_nodes_idx = 1 : size(num_missing_nodes_arr,2)
  82. if select_random_missing_nodes
  83. num_missing_nodes = num_missing_nodes_arr(1, num_missing_nodes_idx);
  84. else
  85. num_missing_nodes = length(missingNodes);
  86. end
  87. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  88. %remove random nodes %
  89. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  90. %randomize the missing nodes indexes and sort in descending order
  91. %disp('selecting random missing nodes...');
  92. fprintf('selecting %d random missing nodes...\n',num_missing_nodes);
  93. if select_random_missing_nodes
  94. [data, missing_nodes_mapping] = RemoveRandomNodes( original_data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance );
  95. else
  96. [data, missing_nodes_mapping] = RemoveRandomNodes( original_data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance, missingNodes);
  97. end
  98. data_untouched = data;
  99. % loop over partial data options
  100. for percent_known_placeholders = percent_known_placeholders_vec
  101. %loop over additional noise options
  102. for noisePercent = noisePercVec
  103. if noisePercent == 0
  104. noiseTypes = 0;
  105. else
  106. noiseTypes = noiseTypeVec;
  107. end
  108. % loop over added noise types
  109. for noiseType = noiseTypes
  110. data = data_untouched;
  111. S = zeros(1, size(data,2));
  112. num_placeholders_to_remove = 0;
  113. placeholders_to_remove = [];
  114. num_placeholders = size(data,1) - actual_graph_size + num_missing_nodes ;
  115. last_known_node = size(data,1) - num_placeholders;
  116. if percent_known_placeholders < 1
  117. num_placeholders_to_remove = round(num_placeholders * (1 - percent_known_placeholders));
  118. while size(placeholders_to_remove, 2) < num_placeholders_to_remove
  119. %randomly selecting unique placeholder indexes
  120. placeholders_to_remove = unique([placeholders_to_remove, randi(num_placeholders, 1, num_placeholders_to_remove - size(placeholders_to_remove, 2))]);
  121. end
  122. %rand_vec = rand(1, num_placeholders);
  123. %placeholders_to_remove = find(rand_vec > percent_known_placeholders) + last_known_node;
  124. placeholders_to_remove = placeholders_to_remove + last_known_node;
  125. %S is the group of neighbors of the unknown placeholders
  126. S = data(placeholders_to_remove(1), :);
  127. for i = placeholders_to_remove
  128. S = S | data(i,:);
  129. end
  130. %switch from binary vector to list of indexes
  131. % S = find(S);
  132. data_all_placeholders = data;
  133. data(placeholders_to_remove,:) = [];
  134. data(:,placeholders_to_remove) = [];
  135. num_placeholders = num_placeholders - num_placeholders_to_remove;
  136. end
  137. % sigal - 9.7.13 adding PHs noise
  138. noiseNumAddedPHs = round(num_placeholders * noisePercent);
  139. if noiseNumAddedPHs > 0
  140. selectedPHs = [];
  141. addedPHsNgs = [];
  142. for i=1:noiseNumAddedPHs
  143. % choose a random neighbor according to noiseType
  144. if noiseType == 1 % global (i.e any PH noise)
  145. neighbor = randi(last_known_node);
  146. while find(addedPHsNgs==neighbor)
  147. neighbor = randi(last_known_node);
  148. end
  149. addedPHsNgs = [addedPHsNgs neighbor];
  150. elseif noiseType == 2 || noiseType == 3
  151. neighbor2 = [];
  152. % choose a PH and find his neighbor's neighbors
  153. while size(neighbor2,1) == 0 || size(neighbor2,2) == 0
  154. % choose a random PH
  155. selectedPH = last_known_node+randi(num_placeholders);
  156. while find(selectedPHs==selectedPH)
  157. selectedPH = last_known_node+randi(num_placeholders);
  158. end
  159. selectedPHs = [selectedPHs selectedPH];
  160. % find his neighbor's neighbors
  161. neighbor1 = find(data(selectedPH,:),1);
  162. neighbor2 = find(data(neighbor1,1:last_known_node));
  163. end
  164. if noiseType == 2 % local with dist upto 2
  165. neighbor = neighbor2(1,randi(size(neighbor2,2)));
  166. else % local with dist upto 3
  167. all_neighbor3 = [];
  168. for n2=1:size(neighbor2,2)
  169. neighbor3 = find(data(neighbor2(1,n2),1:last_known_node));
  170. all_neighbor3 = unique([all_neighbor3, neighbor3]);
  171. end
  172. neighbor = all_neighbor3(1,randi(size(all_neighbor3,2)));
  173. end
  174. else
  175. fprintf('>>>> ERROR: MissingNodes_Sparse2n - noiseType=%d\n',noiseType);
  176. return;
  177. end
  178. % add the noise PH to the data
  179. newPlaceholder = zeros(1, size(data,2));
  180. newPlaceholder(neighbor) = 1;
  181. data = [data, newPlaceholder'; newPlaceholder, 0];
  182. end
  183. %update the PHs number
  184. num_placeholders = num_placeholders+noiseNumAddedPHs;
  185. end
  186. removed_nodes{num_missing_nodes_idx} = missing_nodes_mapping(1,:); %save the removed nodes in each iteration
  187. orig_S = S;
  188. for affinity_calculation_type = affinityType
  189. fprintf('affinity_calculation_type = %d\n', affinity_calculation_type);
  190. %data = data_untouched;
  191. first_unk_node = actual_graph_size - num_missing_nodes + 1;
  192. num_added_nodes = size(data,1) - first_unk_node + 1;
  193. % calculate the affinity / similarity matrix
  194. fprintf('calculating affinity matrix, type %d...\n', affinity_calculation_type);
  195. tic %affinity_calc_time
  196. affinity = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes);
  197. affinity_calc_time = toc;%affinity_calc_time
  198. %TODO: extend the dimension reduction to adding missing
  199. %links / reclustering
  200. for reduce_dimensions = 1 %[0 1] %0 must be first because it does not change the affinity matrix
  201. reduce_dim_time = 0;
  202. if reduce_dimensions == 1
  203. tic %ReduceDimensions
  204. [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node);
  205. reduce_dim_time = toc; %ReduceDimensions
  206. end
  207. fprintf('calculating true clustering\n');
  208. true_clustering = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node);
  209. %figure,imshow(affinity,[]), title('Affinity Matrix')
  210. for num_clusters_known = 1 %%[1 4]
  211. k = DetermineNumberOfClusters(num_clusters_known, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes);
  212. debugEstimateK = 0;
  213. if debugEstimateK == 1 && affinity_calculation_type == 2
  214. for type=[0,3,4,8]
  215. estK = DetermineNumberOfClusters(type, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes);
  216. end
  217. end
  218. last_known_node = first_unk_node - 1;
  219. fprintf('calculating PredictGraph with k=%d and reduce_dimensions=%d ...\n',k,reduce_dimensions);
  220. tic %graph_predict_time
  221. [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, noiseNumAddedPHs);
  222. graph_predict_time = toc; %graph_predict_time
  223. if compensate_for_unknown_placeholers == 1
  224. fprintf('*** running with compensate_for_unknown_placeholers mode ...\n');
  225. S = orig_S;
  226. if size(newData,1) > size(S,2)
  227. %for breakpoint
  228. tttt = 98;
  229. end
  230. sigma = 1/4;
  231. S = S(1:size(newData,1));
  232. S = S + randn(size(S)) * sigma;
  233. sorted_S = sort(S, 'descend');
  234. %sum over the columns and find the columns which
  235. %indicate at least one neighbor
  236. %neighbors_of_new_nodes = find(sum(newData(first_unk_node:size(newData,1), :)));
  237. first_united_node = size(newData,1) - k +1;
  238. if affinity_calculation_type == affinity_calculation_katz_beta_0_05
  239. newAffinity = CalcAffinityByKatzBeta_Sparse( newData, 0.05, 4 );
  240. elseif affinity_calculation_type == affinity_calculation_adamic_adar
  241. newAffinity = CalculateAffinityByAdamicAdar_Sparse(newData, size(newData,1), 0, 0);
  242. elseif affinity_calculation_type == affinity_calculation_common_friends
  243. newAffinity = CalcAffinityByCommonNeighbors_Sparse(newData, size(newData,1), 0);
  244. end
  245. newNodesAffinity = newAffinity(first_united_node:size(newAffinity,1), :);
  246. newNodesAffinity(newNodesAffinity>=1) = 0;
  247. newNodesAffinity = newNodesAffinity / max(max(newNodesAffinity));
  248. %newNodesAffinity = newNodesAffinity / 2;
  249. newNodesAffinity(newData(first_united_node:size(newAffinity,1), :) >= 1) = 0;
  250. newNodesAffinity = (newNodesAffinity / max(max(newNodesAffinity)));
  251. %%%%trying to take only the
  252. %%%%k highest affinities
  253. sortedNewNodesAffinity = sort(newNodesAffinity(:), 'descend');
  254. %affinityThreshold = sortedNewNodesAffinity(k + size(neighbors_of_new_nodes, 2));
  255. newNodesAffinity_orig = newNodesAffinity;
  256. end
  257. if percent_known_placeholders < 1 && compensate_for_unknown_placeholers == 1
  258. %calculating as a function of number of links added
  259. meanNumLinks = mean(sum(data(1:last_known_node, 1:last_known_node)));
  260. maxNumLinksToAdd = meanNumLinks * num_placeholders;
  261. maxNumLinksToAdd = min(maxNumLinksToAdd, 25);
  262. %sigal 27.6.13
  263. linksToAdd = round(compensate_vec*num_missing_nodes);
  264. else
  265. maxNumLinksToAdd = 0;
  266. %sigal 27.6.13
  267. linksToAdd = 0;
  268. end
  269. %%%%% for distance as
  270. %%%%% function of num
  271. %%%%% placeholders %%%%
  272. %max_neighbors = S >= sorted_S(maxNumLinksToAdd);
  273. %sigal 27.6.13
  274. %for numLinksToAdd = 0 : maxNumLinksToAdd
  275. for linksInx = 1:size(linksToAdd,2)
  276. numLinksToAdd = linksToAdd(linksInx);
  277. withAttr = compensate_vec(linksInx)*100; % numLinksToAdd
  278. % sigal - not implemented yet
  279. if numLinksToAdd > 0 && noiseNumAddedPHs> 0
  280. fprintf('>>>> ERROR: MissingNodes_Sparse2n - numLinksToAdd > 0 and noiseNumAddedPHs> 0 - not implemented yet!!\n');
  281. return;
  282. end
  283. if compensate_for_unknown_placeholers == 1
  284. newNodesAffinity = newNodesAffinity_orig;
  285. neighbors = [];
  286. if numLinksToAdd > 0
  287. % get the indexes of the #numLinksToAdd neighbors that have the
  288. % top (higher) S value
  289. neighbors = find(S >= sorted_S(numLinksToAdd), numLinksToAdd);
  290. end
  291. newDataWithMissingLinks = newData; % partial graph with the clustered nodes
  292. newDataForClustering = data; % partial graph with partial PHs
  293. for neighbor = neighbors
  294. % missing links - add links to clustered nodes
  295. [value, closest_new_node] = max(newNodesAffinity(:,neighbor));
  296. closest_new_node = closest_new_node(1);
  297. newDataWithMissingLinks(first_united_node + closest_new_node - 1, neighbor) = 1;
  298. newDataWithMissingLinks(neighbor, first_united_node + closest_new_node - 1) = 1;
  299. % Speculative MISC - add PH to the partial data
  300. newPlaceholder = zeros(1, size(newDataForClustering,2));
  301. newPlaceholder(neighbor) = 1;
  302. newDataForClustering = [newDataForClustering, newPlaceholder'; newPlaceholder, 0];
  303. end
  304. affinityWithS = CalcAffinity( newDataForClustering, affinity_calculation_type, actual_graph_size, num_missing_nodes);
  305. if reduce_dimensions == 1
  306. [affinityWithS, num_placeholdersWithS, first_unk_node_with_s] = ReduceDimensions(affinityWithS, first_united_node);
  307. else
  308. num_placeholdersWithS = num_placeholders + length(neighbors);
  309. first_unk_node_with_s = first_united_node;
  310. end
  311. [newPredictedGraph, newClustering] = PredictGraph(affinityWithS, k, newDataForClustering, num_placeholdersWithS, affinity_calculation_type, cluster_only_missing_nodes, noiseNumAddedPHs);
  312. %remap the original data so that
  313. %the known nodes match the
  314. %predicted data and the missing
  315. %nodes match the predicted nodes
  316. %created from each cluster
  317. perm_vector = 1:size(original_data,1);
  318. perm_vector(missing_nodes_mapping(1,:)) = [];
  319. perm_vector = [perm_vector, missing_nodes_mapping(1,:)];
  320. remapped_data = original_data(perm_vector,perm_vector);
  321. [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, first_united_node : size(remapped_data,1), neighbors); %changed from neighbors to perm(max_neighbors) - to be fair when there are more nieghbors
  322. %small_data2 = DecreaseGraphSize(newData, last_known_node+1 : size(newData,1));
  323. %new_nodes_affinity_sum = sum(sum(newNodesAffinity));
  324. %in case there is an empty cluster, the unrelated nodes may contain node index that does not exist
  325. %in the predicted graph (which may contain less nodes)
  326. indices_to_remove(indices_to_remove > size(newPredictedGraph,2)) = [];
  327. small_data2 = newData;
  328. small_data2(indices_to_remove,:) = [];
  329. small_data2(:,indices_to_remove) = [];
  330. %small_data3 = DecreaseGraphSize(newDataWithMissingLinks, last_known_node+1 : size(newDataWithMissingLinks,1));
  331. small_data3 = newDataWithMissingLinks;
  332. small_data3(indices_to_remove,:) = [];
  333. small_data3(:,indices_to_remove) = [];
  334. small_data4 = newPredictedGraph;
  335. small_data4(indices_to_remove,:) = [];
  336. small_data4(:,indices_to_remove) = [];
  337. %%% Sigal 23.6.13 - save reduce graphs for GED
  338. if dumpSmallFlag == 1
  339. saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttr, num_missing_nodes, small_data, 1);
  340. saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttr, num_missing_nodes, small_data2, 2);
  341. end
  342. %only calculate on the first iteration to save time
  343. if numLinksToAdd == 0
  344. edit_distance = 99; %GraphEditDistance( small_data, small_data2, num_missing_nodes );
  345. edit_distance2 = edit_distance;
  346. edit_distance3 = edit_distance;
  347. else
  348. edit_distance2 = 99; %GraphEditDistance( small_data, small_data3, num_missing_nodes );
  349. edit_distance3 = 99; %GraphEditDistance( small_data, small_data4, num_missing_nodes );
  350. if dumpSmallFlag == 1
  351. saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttr, num_missing_nodes, small_data3, 3);
  352. saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttr, num_missing_nodes, small_data4, 4);
  353. end
  354. end
  355. else % without compensate
  356. perm_vector = 1:size(original_data,1);
  357. perm_vector(missing_nodes_mapping(1,:)) = [];
  358. perm_vector = [perm_vector, missing_nodes_mapping(1,:)];
  359. remapped_data = original_data(perm_vector,perm_vector);
  360. %changed from neighbors to perm(max_neighbors) - to be fair when there are more nieghbors
  361. [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, (size(remapped_data,1) - num_missing_nodes + 1) : size(remapped_data,1), []);
  362. indices_to_remove(indices_to_remove > size(newData,2)) = [];
  363. small_data2 = newData;
  364. small_data2(indices_to_remove,:) = [];
  365. small_data2(:,indices_to_remove) = [];
  366. % Sigal 23.6.13 - save reduce graphs for GED
  367. withAttr = (1 - percent_known_placeholders)*10;
  368. if noisePercent > 0
  369. withAttr = round(withAttr*1000+(noiseType*10+noisePercent)*100);
  370. end
  371. if dumpSmallFlag == 1
  372. saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttr, num_missing_nodes, small_data, 1);
  373. saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttr, num_missing_nodes, small_data2, 2);
  374. end
  375. %sigal 6.12.12 - *** TODO *** temporary for test only TODO
  376. %edit_distance = GraphEditDistance( small_data, small_data2, num_missing_nodes );
  377. edit_distance = 99;
  378. end %compensate
  379. %fprintf('num_links_to_add = %d; missing links distance = %d, new clustering dist = %d\n', numLinksToAdd, full(edit_distance2), full(edit_distance3));
  380. fprintf('calculating purity\n');
  381. curr_index = size(purity,2) + 1;
  382. %purity for actual clustering
  383. try
  384. temp_purity = ClusteringPurity(true_clustering, test_clustering);
  385. catch ME1
  386. temp_purity = 99; % Sigal 12.8.12 - tmp %
  387. ddddd = 1;
  388. end
  389. purity(curr_index).score = temp_purity;
  390. purity(curr_index).score_sq = temp_purity^2;
  391. purity(curr_index).unite_common_friends = unite_common_friends;
  392. purity(curr_index).num_missing_nodes_idx = num_missing_nodes_idx;
  393. purity(curr_index).num_missing_nodes = num_missing_nodes_arr(num_missing_nodes_idx);
  394. purity(curr_index).affinity_calculation_type = affinity_calculation_type;
  395. purity(curr_index).cluster_only_missing_nodes = cluster_only_missing_nodes;
  396. purity(curr_index).withAttr = withAttr; %sigal - 10.7.13
  397. purity(curr_index).num_clusters_known = num_clusters_known;
  398. purity(curr_index).k = k; % sigal 26.6.13 - the predict # of clusters
  399. purity(curr_index).num_placeholders = num_placeholders;
  400. purity(curr_index).num_placeholders_to_remove = num_placeholders_to_remove;
  401. %sigal - 9.7.13 - noise params
  402. purity(curr_index).withAttr = withAttr;
  403. purity(curr_index).noiseNumAddedPHs = noiseNumAddedPHs;
  404. purity(curr_index).noiseType = noiseType;
  405. purity(curr_index).noisePercent = noisePercent;
  406. %sigal - end noise params
  407. purity(curr_index).iteration = 1;
  408. purity(curr_index).test_clustering = test_clustering;
  409. purity(curr_index).true_clustering = true_clustering;
  410. purity(curr_index).graph_size = graph_size;
  411. purity(curr_index).graph_edges = graph_edges; % sigal - number of edges
  412. purity(curr_index).inverse_purity = 1; % Sigal 12.8.12 - tmp % CalculateInversePurity(true_clustering, test_clustering);
  413. purity(curr_index).NMI = 0; %CalcNormalizedMutualInformation(true_clustering, test_clustering);
  414. purity(curr_index).removed_nodes = removed_nodes; %missing nodes
  415. purity(curr_index).percent_known_placeholders = percent_known_placeholders;
  416. purity(curr_index).reduce_dimensions = reduce_dimensions;
  417. purity(curr_index).missing_nodes_mapping = missing_nodes_mapping;
  418. purity(curr_index).compensate_for_unknown_placeholers = compensate_for_unknown_placeholers;
  419. purity(curr_index).edit_distance = edit_distance;
  420. if compensate_for_unknown_placeholers == 1
  421. purity(curr_index).numLinksToAdd = numLinksToAdd;
  422. purity(curr_index).edit_distance_missing_links = edit_distance2;
  423. purity(curr_index).edit_distance_new_clustering = edit_distance3;
  424. fprintf('numLinksToAdd - %d\nedit_distance - %d\nedit_distance_missing_links - %d\nedit_distance_new_clustering - %d\n', numLinksToAdd, full(edit_distance), full(edit_distance2), full(edit_distance3));
  425. end
  426. purity(curr_index).affinity_calc_time = affinity_calc_time;
  427. purity(curr_index).reduce_dim_time = reduce_dim_time;
  428. purity(curr_index).graph_predict_time = graph_predict_time;
  429. if compensate_for_unknown_placeholers == 0
  430. break
  431. end
  432. end %linksToAdd
  433. %
  434. % purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
  435. % purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity;
  436. %
  437. % purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
  438. % purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity^2;
  439. %
  440. LogMsg(sprintf('Sparse2n: Size=%d,Miss=%d,PHs=%d,Affinity=%d,Att=%d,Purity=%.3f', ...
  441. graph_size,purity(curr_index).num_missing_nodes,num_placeholders,affinity_calculation_type,withAttr,temp_purity));
  442. %fprintf('affinity_calculation_type = %d, unite_common_friends = %d\n', affinity_calculation_type, unite_common_friends);
  443. %fprintf('Graph size: %d, Number of missing nodes: %d, Purity: %f \n' ,graph_size, num_missing_nodes, temp_purity);
  444. %fprintf('============================================\n\n\n');
  445. %clear U;
  446. clear eigValues;
  447. clear eigVectors;
  448. end %NumClusterKnown type
  449. end
  450. clear('affinity');
  451. end %affinityType
  452. end %noiseType
  453. end %noisePercent
  454. end %percent_known
  455. end %MissingNodes
  456. end %function
  457. function [affinity] = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes)
  458. global affinity_calculation_shortest_path;
  459. global affinity_calculation_euclid;
  460. global affinity_calculation_common_friends;
  461. global affinity_calculation_random_clustering;
  462. global affinity_calculation_adamic_adar;
  463. global affinity_calculation_katz_beta_0_5;
  464. global affinity_calculation_katz_beta_0_05;
  465. global affinity_calculation_katz_beta_0_005;
  466. if affinity_calculation_type == affinity_calculation_euclid
  467. sp_mat = graphallshortestpaths(data);
  468. %remove INF values
  469. max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
  470. sp_mat_euclid = sp_mat;
  471. sp_mat_euclid(sp_mat == Inf) = max_value;
  472. affinity = CalculateAffinity(sp_mat_euclid);
  473. %affinity = exp(-(sp_mat.^2))/(2 * 0.3^2);
  474. elseif affinity_calculation_type == affinity_calculation_shortest_path
  475. % max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
  476. % sp_mat_euclid = sp_mat;
  477. % sp_mat_euclid(sp_mat == Inf) = max_value;
  478. % affinity = (sp_mat_euclid + 1).^(-affinity_exp_factor);
  479. %affinity = spfun(affinityFunc, data);
  480. affinity = graphallshortestpaths(data);
  481. affinity = affinity .^ -2;
  482. affinity(affinity == Inf) = 1; %added on 05/11/11
  483. elseif affinity_calculation_type == affinity_calculation_common_friends
  484. affinity = CalcAffinityByCommonNeighbors_Sparse(data, actual_graph_size, num_missing_nodes);
  485. %affinity = CalcAffinityByCommonNeighbors(data, actual_graph_size, num_missing_nodes);
  486. elseif affinity_calculation_type == affinity_calculation_random_clustering
  487. affinity = data; %just a placeholder...
  488. elseif affinity_calculation_type == affinity_calculation_adamic_adar
  489. affinity = CalculateAffinityByAdamicAdar_Sparse( data, actual_graph_size, num_missing_nodes, 1 );
  490. if nnz(affinity) < 5
  491. x = 8;
  492. end
  493. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_5
  494. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.5, 3 );
  495. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
  496. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.05, 4 );
  497. elseif affinity_calculation_type == affinity_calculation_katz_beta_0_005
  498. affinity = CalcAffinityByKatzBeta_Sparse( data, 0.005, 4 );
  499. end
  500. end
  501. function [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
  502. global affinity_calculation_shortest_path;
  503. global affinity_calculation_euclid;
  504. global affinity_calculation_common_friends;
  505. global affinity_calculation_random_clustering;
  506. global affinity_calculation_adamic_adar;
  507. global affinity_calculation_katz_beta_0_5;
  508. global affinity_calculation_katz_beta_0_05;
  509. global affinity_calculation_katz_beta_0_005;
  510. first_unk_node = size(affinity,1) - num_placeholders + 1;
  511. diagonal = sum(affinity, 2); %sum the rows
  512. D = sparse(diag(diagonal)); %D is the matrix whose diagonal is the sum of the rows of A
  513. clear('diagonal');
  514. fprintf('Calculating NL\n');
  515. D = sqrt(D);
  516. NL1 = D * affinity * D;
  517. clear('D');
  518. fprintf('calculating U\n');
  519. fail = 0;
  520. try
  521. [nEigVec,eigValues] = eigs(NL1,k);
  522. catch ME1
  523. opts.tol = 1e-1;
  524. try
  525. [nEigVec,eigValues] = eigs(NL1,k, 'LM', opts);
  526. catch ME2
  527. fail = 1;
  528. end
  529. end
  530. % select k largest eigen vectors
  531. if fail == 0
  532. U = [];
  533. % construct the normalized matrix U from the obtained eigen vectors
  534. for i=1:size(nEigVec,1)
  535. n = sqrt(sum(nEigVec(i,:).^2));
  536. U(i,:) = nEigVec(i,:) ./ n;
  537. end
  538. num_samples = size(affinity,1) - first_unk_node + 1;
  539. if cluster_only_missing_nodes == 1
  540. U(1:first_unk_node - 1 ,:) = []; %cluster only the missing nodes
  541. end
  542. fprintf('kmeans clustering\n');
  543. % perform kmeans clustering on the matrix U
  544. fail = 1;
  545. while fail > 0
  546. try
  547. currK = k;
  548. [IDX,C, SUMD, D] = kmeans(U,currK, 'EmptyAction', 'singleton'); %in case of an empty cluster just drop it
  549. fail = 0;
  550. catch ME1
  551. fail = fail + 1;
  552. if fail < 100
  553. disp('error in kmeans clustering. trying again...');
  554. else
  555. %give up on clustering and select random clusters...
  556. IDX = randi(currK, size(U));
  557. fail = 0;
  558. end
  559. end
  560. end
  561. test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));
  562. %if it's random just replace everything...
  563. if affinity_calculation_type == affinity_calculation_random_clustering
  564. test_clustering = randi(k, size(test_clustering,1), size(test_clustering,2));
  565. end
  566. else
  567. disp('Failed in finding eigenvectors - using random!');
  568. if cluster_only_missing_nodes == 0
  569. num_samples = size(affinity,1);
  570. else
  571. num_samples = num_placeholders;
  572. end
  573. test_clustering = randi(k, num_samples, 1);
  574. end
  575. end
  576. function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, noiseNumAddedPHs)
  577. last_known_node = size(data,1) - num_placeholders;
  578. first_unk_node = last_known_node + 1;
  579. [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
  580. % sigal - 14.7.13 remove the added PHs noise from the clustring results
  581. if noiseNumAddedPHs > 0
  582. firstNoisePH = num_placeholders - noiseNumAddedPHs + 1;
  583. test_clustering(firstNoisePH:end, :) = [];
  584. end
  585. newNodes = CreateNewNodesFromClusters(data, test_clustering, noiseNumAddedPHs);
  586. newData = [data(1:last_known_node,1:last_known_node), newNodes(:, 1:last_known_node)';...
  587. newNodes(:,1:last_known_node), zeros(size(newNodes,1))];
  588. % sigal - 11.7.13 remove the added PHs noise ONLY from the clustring results
  589. % if noiseNumAddedPHs > 0
  590. % firstNoisePH = num_placeholders - noiseNumAddedPHs + 1;
  591. % test_clustering(firstNoisePH:end, :) = [];
  592. % end
  593. end
  594. function [k] = DetermineNumberOfClusters(num_clusters_known, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes)
  595. %determine k - the number of clusters
  596. if num_clusters_known == 1
  597. k = num_missing_nodes;
  598. else
  599. numKnownNodes = actual_graph_size - num_missing_nodes;
  600. sumKnownEdges = sum(sum(data_untouched(1 : numKnownNodes, 1 : numKnownNodes)));
  601. meanKnownEdges = sumKnownEdges/numKnownNodes;
  602. addedEdges = num_added_nodes*2; % undirect graph
  603. fprintf('EstimatedK: numKnownN=%d, meanKnownE=%.3f, addedE=%d, missN=%d, meanMissE=%.3f\n', ...
  604. numKnownNodes,full(meanKnownEdges),num_added_nodes,num_missing_nodes,addedEdges/num_missing_nodes);
  605. if num_clusters_known == 0
  606. %k = round(num_added_nodes / meanKnownEdges);
  607. k = round(num_added_nodes / floor(meanKnownEdges));
  608. %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
  609. elseif num_clusters_known == 2 %guessing upper limit
  610. k = 2*round(num_added_nodes / meanKnownEdges);
  611. %fprintf('EstimatedK: type=%d, actual=%d, guessing upper limit %d\n',num_clusters_known, num_missing_nodes, k);
  612. elseif num_clusters_known == 3 % e=a*n
  613. a = meanKnownEdges;
  614. e = sumKnownEdges+num_added_nodes;
  615. k = round(e/a-numKnownNodes);
  616. %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
  617. elseif num_clusters_known == 4 % e=a*n^2
  618. a = meanKnownEdges/numKnownNodes;
  619. e = sumKnownEdges+addedEdges;
  620. k = round(sqrt(e/a)-numKnownNodes);
  621. %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
  622. elseif num_clusters_known == 5 % e=a*n^2
  623. a = meanKnownEdges/numKnownNodes;
  624. e = sumKnownEdges+addedEdges;
  625. k = ceil(sqrt(e/a)-numKnownNodes);
  626. %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
  627. elseif num_clusters_known == 6 % e=a*n
  628. a = meanKnownEdges;
  629. e = sumKnownEdges+num_added_nodes;
  630. k = ceil(e/a-numKnownNodes);
  631. elseif num_clusters_known == 7
  632. k = ceil(num_added_nodes / meanKnownEdges);
  633. elseif num_clusters_known == 8
  634. k = round(num_added_nodes / meanKnownEdges);
  635. end
  636. LogMsg(sprintf('EstimatedK(size,PHs,type,actual,k):\t%d\t%d\t%d\t%d\t%d', ...
  637. actual_graph_size,num_added_nodes,num_clusters_known, num_missing_nodes, k),'EstimateK_Log2.txt');
  638. end
  639. end %function
  640. function [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node)
  641. size_before = size(affinity,1);
  642. num_placeholders = size(affinity,1) - first_unk_node + 1;
  643. affinity_sum = sum(affinity(first_unk_node:size(affinity,1),:)); %the sum of the affinity of placeholders to all other nodes
  644. nodes_to_keep = (affinity_sum > 0); %keep only nodes which have some affinity to the placeholders
  645. nodes_to_keep(first_unk_node:size(affinity,1)) = 1; %keep all the placeholders even if for some reason they have a sum of zero...
  646. affinity = affinity(nodes_to_keep, nodes_to_keep);
  647. size_after = size(affinity,1);
  648. first_unk_node = size(affinity,1) - num_placeholders + 1;
  649. LogMsg(sprintf('ReduceDimensions from %d to %d',size_before,size_after),'ReduceDimensions_Log.txt');
  650. end
  651. function [true_clustering] = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node)
  652. true_clustering = []; %zeros(size(test_clustering, 1), 1);
  653. for i = 2 : size(missing_nodes_mapping, 1)
  654. for j = 1 : size(missing_nodes_mapping,2)
  655. if missing_nodes_mapping(i,j) ~= 0
  656. true_clustering(missing_nodes_mapping(i,j) - actual_graph_size + num_missing_nodes, 1) = j; % missing_nodes_mapping(1, j);
  657. end
  658. end
  659. end
  660. if percent_known_placeholders < 1
  661. true_clustering(placeholders_to_remove - last_known_node) = [];
  662. end
  663. end
  664. function saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_type, withAttr, missNodes, small_data, i)
  665. %%% Sigal 24.1.13 - TODO
  666. outFile = sprintf('%s_%d_%d_%d_%d_small_data_%d', dataFileName, iter, missNodes, affinity_type, withAttr, i);
  667. if affinity_type == 9 % save instead a dummy size (1) and the best_alg
  668. SaveIntMatrixToFile(strcat(dumpSmallDataPath, outFile,'_edges.txt'), small_data, 1);
  669. else
  670. SaveAsciiGraph(dumpSmallDataPath, outFile, small_data, 1); %% also save graph size
  671. end
  672. end