You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RemoveRandomNodes2.m 12KB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  function [ data, dataWithAtt, missing_nodes_mapping ] = RemoveRandomNodes2( data, dataWithAtt, totalAttNum, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance )
  %RemoveRandomNodes2 Remove num_missing_nodes nodes from the graph matrices.
  %   Every entry equal to 1 between a remaining node and a removed node is
  %   replaced by a link to a newly appended placeholder ("UNK") node, and the
  %   removed nodes' rows and columns are then deleted.
  %
  %   Inputs:
  %     data                   - node-by-node matrix; an entry of 1 marks a link.
  %     dataWithAtt            - same graph with totalAttNum leading attribute
  %                              rows/columns (node k sits at k+totalAttNum).
  %     totalAttNum            - number of leading attribute rows/columns.
  %     num_missing_nodes      - total number of nodes that should be missing.
  %     missing_nodes_mapping  - mapping from an earlier call (row 1 is read as
  %                              the nodes already chosen) or [].
  %     non_neighbors_distance - fill value for placeholder entries that are
  %                              not links.
  %
  %   Outputs:
  %     data, dataWithAtt      - matrices after expansion and removal.
  %     missing_nodes_mapping  - row 1: removed node indices (descending);
  %                              rows 2..end: for each column, the
  %                              post-removal indices of the placeholders
  %                              created for that node, zero padded.

      % No attributes are copied onto placeholders in this version.
      numAttPerPH = 0;

      % A mapping wider than the requested count belongs to a finished outer
      % iteration: drop it and start over. Otherwise only top up the count.
      alreadyChosen = size(missing_nodes_mapping,2);
      if alreadyChosen > num_missing_nodes
          missing_nodes_mapping = [];
          num_nodes_to_remove = num_missing_nodes;
      else
          num_nodes_to_remove = num_missing_nodes - alreadyChosen;
      end

      % Randomly choose the nodes to remove.
      %missing_nodes = ChooseMissingNodes_old(num_nodes_to_remove, data, missing_nodes_mapping, non_neighbors_distance);
      missing_nodes = ChooseMissingNodes(num_nodes_to_remove, data, dataWithAtt, totalAttNum, numAttPerPH, missing_nodes_mapping);

      % Descending order is required by the deletion loop at the bottom.
      missing_nodes = sort( unique(missing_nodes) , 'descend');
      missing_nodes_mapping = missing_nodes;

      % Collect every column that is non-zero in the row of some removed node.
      touchedMask = zeros(1, size(data,2));
      for removedNode = missing_nodes
          touchedMask = touchedMask | data(removedNode,:);
      end
      touchedNodes = find(touchedMask);

      % Give each touched node one placeholder per removed neighbor.
      for nodeIdx = touchedNodes
          removedNeighbors = sort(intersect(find(data(nodeIdx,:)), missing_nodes), 'descend');
          for removedNeighbor = removedNeighbors
              if data(nodeIdx, removedNeighbor) ~= 1
                  continue;
              end

              % Append a row and a column for the placeholder in both matrices.
              data = ExpandDataByOne(data, removedNeighbor, nodeIdx, non_neighbors_distance, 0, 0);
              dataWithAtt = ExpandDataByOne(dataWithAtt, removedNeighbor, nodeIdx, non_neighbors_distance, totalAttNum, numAttPerPH);

              % Index the placeholder will have once the removed nodes are
              % gone: e.g. with 1000 nodes and 5 to remove, the first
              % placeholder is node 1001 now and 1001 - 5 = 996 afterwards.
              placeholderIdx = size(data,1) - num_missing_nodes;

              % Store it in the first empty (zero) cell of the removed node's
              % column, growing the mapping by one row when the column is full.
              mapCol = find(missing_nodes == removedNeighbor, 1);
              freeRow = find(missing_nodes_mapping(:, mapCol) == 0, 1);
              if isempty(freeRow)
                  missing_nodes_mapping = [missing_nodes_mapping; zeros(1, size(missing_nodes_mapping,2))];
                  freeRow = size(missing_nodes_mapping,1);
              end
              missing_nodes_mapping(freeRow, mapCol) = placeholderIdx;
          end
      end

      % Delete the removed nodes. Going from the highest index down keeps the
      % indices of the nodes still waiting to be deleted valid.
      for removedNode = missing_nodes
          data(:, removedNode) = [];
          dataWithAtt(:, removedNode+totalAttNum) = [];
          data(removedNode, :) = [];
          dataWithAtt(removedNode+totalAttNum, :) = [];
      end

      % debug = 1;
      % if debug == 1
      %     for i=graph_size+1-size(missing_nodes,2):size(data,2)
      %         neighbor = find(data(i,:), 1);
      %         if size(neighbor,2) == 0
      %             fprintf('RemoveRandomNodes2-debug: node %d has no neighbors\n', i);
      %         end
      %     end
      % end
  end %function
  86. %sigal - move old implementation to function
  function [missing_nodes] = ChooseMissingNodes(num_nodes_to_remove, data, dataWithAtt, totalAttNum, numAttPerPH, missing_nodes_mapping)
  %ChooseMissingNodes Randomly pick additional nodes to remove.
  %   Starts from the nodes in row 1 of missing_nodes_mapping (if any) and
  %   draws num_nodes_to_remove more, one at a time. A node can be drawn only
  %   if it is not excluded. Excluded nodes are:
  %     * nodes already selected, and every column that is non-zero in the
  %       data row of a selected node;
  %     * nodes whose column sum in data is exactly 1 or greater than 25;
  %     * nodes with fewer than numAttPerPH attributes in dataWithAtt.
  %
  %   Returns the previously selected nodes (sorted descending) followed by
  %   the newly drawn nodes in draw order.
  %
  %   Raises RemoveRandomNodes2:noValidNodes when no selectable node is left
  %   before num_nodes_to_remove nodes have been drawn.

      excluded = zeros(1, size(data,2));

      % Seed the selection with the nodes chosen by an earlier call.
      if size(missing_nodes_mapping,1) > 0
          missing_nodes = sort(missing_nodes_mapping(1,:) , 'descend');
          for curr_missing_node = missing_nodes
              excluded = excluded | data(curr_missing_node,:);
              excluded(1,curr_missing_node) = 1;
          end
      else
          missing_nodes = [];
      end

      % outlier1 - nodes with exactly one edge, or with more than 25 edges
      numEdges = sum(data,1);
      invalidNodes1a = (numEdges==1);
      excluded(1,invalidNodes1a) = 1;
      invalidNodes1b = (numEdges>25);
      excluded(1,invalidNodes1b) = 1;

      % outlier2 - nodes with less than numAttPerPH attributes
      numAttr = sum(dataWithAtt(totalAttNum+1:end,1:totalAttNum),2)';
      invalidNodes2 = (numAttr<numAttPerPH);
      excluded(1,invalidNodes2) = 1;

      % outlier statistics
      count = nnz(invalidNodes1a|invalidNodes1b|invalidNodes2);
      if count*1.5 > size(data,2)
          fprintf('RemoveRandomNodes2: too many outliers nodes %d.\n',count);
      end

      for i = 1:num_nodes_to_remove
          valid_nodes = find(excluded~=1);
          % With an empty pool the draw below would compute index 0 and fail
          % with an unrelated subscript error; report the real cause instead.
          if isempty(valid_nodes)
              error('RemoveRandomNodes2:noValidNodes', ...
                  'RemoveRandomNodes2: no valid node left to remove (selected %d of %d requested).', ...
                  i-1, num_nodes_to_remove);
          end
          inx = ceil(rand(1)*size(valid_nodes,2));
          node = valid_nodes(inx);
          % add selected node to missing_nodes list and exclude it together
          % with its neighbors from later draws
          missing_nodes = [missing_nodes node];
          excluded(1,node) = 1;
          excluded = excluded | data(node,:);
      end
  end %function
  125. %sigal - move old implementation to function
  function [missing_nodes] = ChooseMissingNodes_old(num_nodes_to_remove, data, missing_nodes_mapping, non_neighbors_distance)
  %ChooseMissingNodes_old Legacy random selection of the nodes to remove.
  %   Draws num_nodes_to_remove random node indices, adds them to the nodes in
  %   row 1 of missing_nodes_mapping (if any), re-draws until all indices are
  %   distinct, and then calls ReplaceNodesNeighbors up to 10 times to swap
  %   out unsuitable nodes. The result is sorted in descending order.
  %
  %   Raises RemoveRandomNodes2:tooManyNodes when more distinct nodes are
  %   requested than the graph contains.

      graph_size = size(data,1);
      maxRetries = 10;

      % Nodes selected by an earlier call (row 1 of the mapping), if any.
      if size(missing_nodes_mapping,1) > 0
          prev_missing_nodes = missing_nodes_mapping(1,:);
      else
          prev_missing_nodes = [];
      end

      % randomize a list of nodes to remove and sort it
      missing_nodes = sort( [prev_missing_nodes ceil(rand(1,num_nodes_to_remove).*graph_size)] , 'descend');

      % The duplicate-elimination loop below keeps the list length constant,
      % so it can never finish when the list is longer than the graph.
      if size(missing_nodes,2) > graph_size
          error('RemoveRandomNodes2:tooManyNodes', ...
              'RemoveRandomNodes2: cannot select %d distinct nodes from a graph of %d nodes.', ...
              size(missing_nodes,2), graph_size);
      end

      % replace duplicates with fresh random nodes until all are distinct
      sizeDiff = size(missing_nodes,2) - size(unique(missing_nodes),2);
      while sizeDiff > 0
          missing_nodes = sort( [unique(missing_nodes) ceil(rand(1,sizeDiff).*graph_size)] , 'descend');
          sizeDiff = size(missing_nodes,2) - size(unique(missing_nodes),2);
      end

      % ReplaceNodesNeighbors works on its own copy of the list. Take the
      % edited list back when the helper exposes it as a second output;
      % with a flag-only helper the call stays as it was.
      helperReturnsList = nargout(@ReplaceNodesNeighbors) >= 2;

      % make sure missing_nodes have their neighbors
      tryNo = 0;
      while tryNo < maxRetries
          if helperReturnsList
              [replacedAny, missing_nodes] = ReplaceNodesNeighbors(data, prev_missing_nodes, missing_nodes, non_neighbors_distance);
          else
              replacedAny = ReplaceNodesNeighbors(data, prev_missing_nodes, missing_nodes, non_neighbors_distance);
          end
          if ~replacedAny
              break;
          end
          tryNo = tryNo+1;
          if tryNo == maxRetries
              fprintf('RemoveRandomNodes2: last try %d to ReplaceNodesWithNoNeighbors...\n',tryNo);
          end
      end

      % Replacements are written in place, so restore the descending order.
      missing_nodes = sort(missing_nodes, 'descend');
  end %function
  159. % sigal - append col & row for the placeholder
  function [data] = ExpandDataByOne(data, orgNode, friend, non_neighbors_distance, totalAttNum, numAttPerPH)
  %ExpandDataByOne Append one placeholder row and column to data.
  %   The new node is linked (value 1) to node `friend`; every other new entry
  %   is non_neighbors_distance and the new diagonal entry is 0.
  %
  %   When totalAttNum > 0 and numAttPerPH > 0, the first totalAttNum
  %   rows/columns of data are treated as attributes and node k is stored at
  %   index k+totalAttNum. The placeholder then also receives the attributes
  %   of orgNode, randomly thinned to at most numAttPerPH of them.

      if totalAttNum>0 && numAttPerPH>0
          % attributes carried by the node being replaced
          attIndices = find(data(orgNode+totalAttNum, 1:totalAttNum)==1);
          % drop random ones until at most numAttPerPH remain
          while size(attIndices,2) > numAttPerPH
              inx = ceil(rand(1)*size(attIndices,2));
              attIndices(:,inx) = [];
          end
      else
          attIndices = [];
      end

      new_col = ones(size(data, 1), 1) * non_neighbors_distance;
      new_col(friend+totalAttNum) = 1;
      % Mark the sampled attributes themselves; marking positions 1..k would
      % always hand out the first k attributes.
      new_col(attIndices) = 1;
      data = [data new_col];

      new_row = ones(1,size(data, 2)) * non_neighbors_distance;
      new_row(friend+totalAttNum) = 1;
      new_row(attIndices) = 1;
      data = [data; new_row];

      % the placeholder has no link to itself
      data(end, end) = 0;
  end
  184. % sigal - choose another node and make sure it has neighbors after the removal
  function [node] = ChooseNewMissingNode(data, missing_nodes, non_neighbors_distance)
  %ChooseNewMissingNode Draw a random replacement node.
  %   Draws uniformly random node indices until InavlidNode accepts one or the
  %   attempt budget, min(graph size / 100, 50), is used up. The last node
  %   drawn is returned even if it was rejected.

      nodeCount = size(data,1);
      attemptBudget = min([nodeCount/100, 50]);
      attemptsUsed = 0;
      rejected = 1;
      while attemptsUsed < attemptBudget && rejected == 1
          node = ceil(rand(1)*nodeCount);
          rejected = InavlidNode(node, data, missing_nodes, non_neighbors_distance);
          attemptsUsed = attemptsUsed + 1;
      end
  end
  196. % sigal - check if we remove one of the neighbors of the selected node
  function [tf] = HasRemovedNeighbors(node, data, missing_nodes)
  %HasRemovedNeighbors True when a neighbor of node is listed in missing_nodes.
  %   Neighbors are the columns whose entry in row `node` of data equals 1.

      removedFriends = intersect(find(data(node,:)==1), missing_nodes);
      tf = (size(removedFriends,2) > 0);
  end
  202. % sigal - check whether this is an invalid node
  function [tf] = InavlidNode(node, data, missing_nodes, non_neighbors_distance)
  %InavlidNode True when node must not be selected for removal.
  %   A node is rejected when any of the following holds:
  %     * its row, ignoring the diagonal entry, sums to
  %       non_neighbors_distance * (number of columns - 1);
  %     * it is already listed in missing_nodes;
  %     * one of its neighbors is listed in missing_nodes;
  %     * it has more entries equal to 1 than
  %       max(2 * nnz(data) / number of columns, 10).

      colCount = size(data,2);
      nodeRow = data(node,:);

      edgeLimit = max(2*nnz(data)/colCount, 10);
      overLimit = nnz(nodeRow==1) > edgeLimit;

      alreadyListed = ismember(node, missing_nodes);
      touchesRemoved = HasRemovedNeighbors(node, data, missing_nodes);
      isolated = sum(nodeRow)-data(node,node) == non_neighbors_distance*(colCount-1);

      tf = isolated || alreadyListed || touchesRemoved || overLimit;
  end
  211. % sigal - try to replace selected nodes to ensure the PH will have a neighbor
  function [tf, missing_nodes] = ReplaceNodesNeighbors(data, prev_missing_nodes, missing_nodes, non_neighbors_distance)
  %ReplaceNodesNeighbors Swap out selected nodes that make the selection invalid.
  %   Pass 1: for each previously selected node that has a neighbor in the
  %   selection, the first newly selected neighbor is replaced by a node from
  %   ChooseNewMissingNode.
  %   Pass 2: each newly selected node rejected by InavlidNode is replaced by
  %   a node from ChooseNewMissingNode.
  %
  %   Outputs:
  %     tf            - 1 when at least one replacement was made, else 0.
  %     missing_nodes - the selection with the replacements applied. MATLAB
  %                     passes arguments by value, so this output is the only
  %                     way for a caller to see the replacements. Callers
  %                     that request only tf keep working unchanged.

      tf = 0;

      % make sure the prev selected nodes still have their neighbors
      if size(prev_missing_nodes,2) > 0
          for i = 1:size(missing_nodes,2)
              node = missing_nodes(i);
              if ismember(node, prev_missing_nodes) && HasRemovedNeighbors(node, data, missing_nodes)
                  % replace the first newly selected node linked to it
                  for j = 1:size(missing_nodes,2)
                      if ~ismember(missing_nodes(j), prev_missing_nodes) && data(node,missing_nodes(j))==1
                          %fprintf('RemoveRandomNodes2-prevNode: try to replace outlier %d...\n',j);
                          missing_nodes(j) = ChooseNewMissingNode(data, missing_nodes, non_neighbors_distance);
                          tf = 1;
                          break;
                      end
                  end
              end
          end
      end

      % make sure the latest selected nodes still have their neighbors
      for i = 1:size(missing_nodes,2)
          node = missing_nodes(i);
          % skip prev_missing_nodes
          if ismember(node, prev_missing_nodes)
              continue;
          end
          if InavlidNode(node, data, missing_nodes, non_neighbors_distance)
              missing_nodes(i) = ChooseNewMissingNode(data, missing_nodes, non_neighbors_distance);
              %fprintf('RemoveRandomNodes2-newNode: try to replace outlier %d...\n',i);
              tf = 1;
          end
      end
  end