You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

RemoveRandomNodes4.m 7.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  function [ data, attDataVec, missing_nodes_mapping ] = RemoveRandomNodes4( data, attDataVec, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance )
  %RemoveRandomNodes4 Remove num_missing_nodes nodes from data and from every
  %attribute matrix in attDataVec, replacing each link to a removed node with
  %a link to a freshly appended placeholder ("UNK") node.
  %
  % Inputs:
  %   data                   - node-by-node matrix; an entry equal to 1 marks a link.
  %   attDataVec             - struct array; each element has a .data matrix (one
  %                            row per node) and a .totalAttNum field.
  %   num_missing_nodes      - total number of nodes that should be missing.
  %   missing_nodes_mapping  - mapping from a previous call ([] if none); its
  %                            first row lists the nodes that are already chosen.
  %   non_neighbors_distance - value written into placeholder rows/columns for
  %                            every entry that is not a link.
  %
  % Outputs:
  %   data                   - matrix with the placeholders appended and the
  %                            missing nodes deleted.
  %   attDataVec             - same struct array with each .data updated the same way.
  %   missing_nodes_mapping  - row 1 holds the removed nodes (descending order);
  %                            the rows below hold, per column, the final indices
  %                            of the placeholders created for that node (0 = unused).
  %
  % Sigal 15.10.13
  % The same nodes are removed from all attribute vectors (used in MissingNodes_S4t).

      % Placeholders copy no attributes from the node they stand in for.
      numAttPerPH = 0;
      numAttData = size(attDataVec,2);

      % A mapping wider than the requested count belongs to a previous outer
      % iteration: discard it and choose everything from scratch. Otherwise only
      % the nodes still lacking are chosen.
      alreadyChosen = size(missing_nodes_mapping,2);
      if alreadyChosen > num_missing_nodes
          missing_nodes_mapping = [];
          num_nodes_to_remove = num_missing_nodes;
      else
          num_nodes_to_remove = num_missing_nodes - alreadyChosen;
      end

      % Randomly choose the missing nodes, then keep them unique and in
      % descending order; that list becomes the first row of the mapping.
      missing_nodes = ChooseMissingNodes(num_nodes_to_remove, data, attDataVec(1).data, attDataVec(1).totalAttNum, numAttPerPH, missing_nodes_mapping);
      missing_nodes = sort(unique(missing_nodes), 'descend');
      missing_nodes_mapping = missing_nodes;

      % Collect every node that has a non-zero entry in the row of a missing node.
      touchesMissing = zeros(1, size(data,2));
      for removedNode = missing_nodes
          touchesMissing = touchesMissing | data(removedNode,:);
      end
      affectedNodes = find(touchesMissing);

      % For every affected node, add one placeholder per link to a missing node.
      for nodeIdx = affectedNodes
          lostNeighbors = sort(intersect(find(data(nodeIdx,:)), missing_nodes), 'descend');
          for lostNeighbor = lostNeighbors
              % Only true links (value 1) get a placeholder.
              if data(nodeIdx, lostNeighbor) ~= 1
                  continue;
              end

              % Append the placeholder to the graph and to every attribute matrix.
              data = ExpandDataByOne(data, nodeIdx, non_neighbors_distance);
              for v = 1:numAttData
                  attDataVec(v).data = ExpandAttByOne(attDataVec(v).data, lostNeighbor, non_neighbors_distance, attDataVec(v).totalAttNum, numAttPerPH);
              end

              % Index the placeholder will have once the missing nodes are
              % deleted: e.g. 1000 nodes, 5 missing, first placeholder appended
              % at 1001 -> 1001 - 5 = 996, the next one 997, and so on.
              placeholderIdx = size(data,1) - num_missing_nodes;

              % Store it in the first free (zero) cell of the missing node's
              % column, growing the mapping by one row when the column is full.
              col = find(missing_nodes == lostNeighbor, 1);
              slot = find(missing_nodes_mapping(:, col) == 0, 1);
              if isempty(slot)
                  missing_nodes_mapping = [missing_nodes_mapping; zeros(1, size(missing_nodes_mapping,2))];
                  slot = size(missing_nodes_mapping,1);
              end
              missing_nodes_mapping(slot, col) = placeholderIdx;
          end
      end

      % Delete the missing nodes' columns and rows from the graph ...
      data(:, missing_nodes) = [];
      data(missing_nodes, :) = [];

      % ... and their rows from every attribute matrix.
      for v = 1:numAttData
          attDataVec(v).data(missing_nodes, :) = [];
      end
  end %function
  88. %sigal - move old implementation to function
  function [missing_nodes] = ChooseMissingNodes(num_nodes_to_remove, data, attData, totalAttNum, numAttPerPH, missing_nodes_mapping)
  %ChooseMissingNodes Randomly choose num_nodes_to_remove additional nodes to remove.
  %
  % Inputs:
  %   num_nodes_to_remove   - number of new nodes to pick.
  %   data                  - node-by-node matrix; row k marks the neighbors of node k.
  %   attData               - attribute matrix with one row per node.
  %   totalAttNum           - not used by this function (kept for the call signature).
  %   numAttPerPH           - nodes with fewer attributes than this are not eligible.
  %   missing_nodes_mapping - mapping whose first row lists nodes chosen earlier
  %                           ([] if none).
  %
  % Output:
  %   missing_nodes - previously chosen nodes (first row of the mapping, sorted
  %                   in descending order) followed by the newly picked nodes in
  %                   the order they were drawn.
  %
  % A node is not eligible when it is already chosen, is a neighbor of a chosen
  % node, has exactly one edge, has more than 15 edges, or has fewer than
  % numAttPerPH attributes.
  %
  % Raises RemoveRandomNodes4:noValidNodes when the eligible set runs out before
  % num_nodes_to_remove nodes were picked.

      % excluded(k) is non-zero when node k may not be picked.
      excluded = zeros(1, size(data,2));

      if size(missing_nodes_mapping,1) > 0
          missing_nodes = sort(missing_nodes_mapping(1,:), 'descend');
          % Exclude the nodes chosen earlier together with all their neighbors.
          for curr_missing_node = missing_nodes
              excluded = excluded | data(curr_missing_node,:);
              excluded(1,curr_missing_node) = 1;
          end
      else
          missing_nodes = [];
      end

      % outlier1 - nodes with exactly one edge, or with more than 15 edges
      % (sigal - 6.2.13 max=15 (sarit))
      numEdges = sum(data,1);
      invalidNodes1a = (numEdges == 1);
      excluded(1,invalidNodes1a) = 1;
      invalidNodes1b = (numEdges > 15);
      excluded(1,invalidNodes1b) = 1;

      % outlier2 - nodes with less than numAttPerPH attributes
      numAttr = sum(attData,2)';
      invalidNodes2 = (numAttr < numAttPerPH);
      excluded(1,invalidNodes2) = 1;

      % outlier statistics: report when more than two thirds of the nodes are outliers
      count = nnz(invalidNodes1a | invalidNodes1b | invalidNodes2);
      if count*1.5 > size(data,2)
          fprintf('RemoveRandomNodes4: too many outlier nodes %d.\n', count);
      end

      for i = 1:num_nodes_to_remove
          valid_nodes = find(excluded ~= 1);
          if isempty(valid_nodes)
              % Without this check the draw below would index position 0 and
              % fail with an uninformative subscript error.
              error('RemoveRandomNodes4:noValidNodes', ...
                  'ChooseMissingNodes: no eligible node left after choosing %d of %d requested nodes.', ...
                  i - 1, num_nodes_to_remove);
          end

          % Uniform draw among the nodes that are still eligible.
          inx = ceil(rand(1)*size(valid_nodes,2));
          node = valid_nodes(inx);

          % Record the pick and exclude it and its neighbors from later draws.
          missing_nodes = [missing_nodes node];
          excluded(1,node) = 1;
          excluded = excluded | data(node,:);
      end
  end %function
  127. % sigal - append col & row for the placeholder
  function [data] = ExpandDataByOne(data, friend, non_neighbors_distance)
  %ExpandDataByOne Append one placeholder node (a new last column and a new
  %last row) to data.
  %
  % The placeholder is linked (value 1) to node friend in both directions;
  % every other entry of the new column and row is non_neighbors_distance,
  % and the placeholder's own diagonal entry is 0.

      % New last column: one entry per existing node.
      linkCol = non_neighbors_distance * ones(size(data,1), 1);
      linkCol(friend) = 1;
      data = [data, linkCol];

      % New last row: spans the already widened matrix.
      linkRow = non_neighbors_distance * ones(1, size(data,2));
      linkRow(friend) = 1;
      data = [data; linkRow];

      % The placeholder has no link to itself.
      data(end, end) = 0;
  end
  137. % sigal - append row for the placeholder
  function [attData] = ExpandAttByOne(attData, orgNode, non_neighbors_distance, totalAttNum, numAttPerPH)
  %ExpandAttByOne Append one attribute row for a placeholder node.
  %
  % Inputs:
  %   attData                - attribute matrix, one row per node.
  %   orgNode                - row of the node the placeholder stands in for.
  %   non_neighbors_distance - fill value for every attribute the placeholder
  %                            does not receive.
  %   totalAttNum            - attributes are copied only when this is > 0.
  %   numAttPerPH            - maximum number of attributes copied from orgNode;
  %                            nothing is copied when this is 0.
  %
  % Output:
  %   attData - input matrix with the placeholder's row appended at the bottom.
  %
  % When copying is enabled, the placeholder receives up to numAttPerPH of the
  % attributes that orgNode has set to 1, chosen at random when orgNode has more.

      if totalAttNum > 0 && numAttPerPH > 0
          attIndices = find(attData(orgNode, :) == 1);
          % Randomly drop attributes until at most numAttPerPH remain.
          while size(attIndices,2) > numAttPerPH
              inx = ceil(rand(1)*size(attIndices,2));
              attIndices(:,inx) = [];
          end
      else
          attIndices = [];
      end

      new_row = ones(1, size(attData,2)) * non_neighbors_distance;
      % Mark the selected attribute columns themselves (not the first
      % numel(attIndices) columns of the row).
      new_row(attIndices) = 1;
      attData = [attData; new_row];
  end
  153. end