function [ data, dataWithAtt, missing_nodes_mapping ] = RemoveRandomNodes2( data, dataWithAtt, totalAttNum, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance ) %RemoveRandomNodes Remove num_missing_nodes from data. If some nodes are %removed already, provide missing_nodes_mapping % Detailed explanation goes here %%data_orig = data; numAttPerPH = 0; % if the mapping is larger than the number of nodes we want to remove, empty % it and start a new mapping. This can happen if we finished looping over % the number of missing nodes and started a new iteration of an outer loop. if size(missing_nodes_mapping,2) > num_missing_nodes missing_nodes_mapping = []; num_nodes_to_remove = num_missing_nodes; else num_nodes_to_remove = num_missing_nodes - size(missing_nodes_mapping,2); end % randomly choose missing nodes %missing_nodes = ChooseMissingNodes_old(num_nodes_to_remove, data, missing_nodes_mapping, non_neighbors_distance); missing_nodes = ChooseMissingNodes(num_nodes_to_remove, data, dataWithAtt, totalAttNum, numAttPerPH, missing_nodes_mapping); %sort the list and create a list of the new nodes that each missing node is mapped to - each link %to a missing node is replaced by a link to a new, "UNK" node missing_nodes = sort( unique(missing_nodes) , 'descend'); missing_nodes_mapping = missing_nodes; %replace each link to a missing node with a link to a new node %find all missing node neighbors missing_nodes_all_neighbors = zeros(1, size(data,2)); for curr_nissing_node = missing_nodes missing_nodes_all_neighbors = missing_nodes_all_neighbors | data(curr_nissing_node,:); end missing_nodes_all_neighbors = find(missing_nodes_all_neighbors); %for each node in missing_nodes_all_neighbors add edges to placeholder for i = missing_nodes_all_neighbors neighbors = find(data(i,:)); missing_neighbors = intersect(neighbors, missing_nodes); missing_neighbors = sort(missing_neighbors, 'descend'); for curr_missing_neighbor = missing_neighbors if data(i,curr_missing_neighbor) == 1 % append col & row for the placeholder data = ExpandDataByOne(data, curr_missing_neighbor, i, non_neighbors_distance, 0, 0); dataWithAtt = ExpandDataByOne(dataWithAtt, curr_missing_neighbor, i, non_neighbors_distance, totalAttNum, numAttPerPH); %add the new UNK node to the missing nodes mapping j is the index of the missing node %look for the first zero in column j of the missing nodes mapping and put the new node %index there added_node = 0; %add it in the first position which equals zero j = find(missing_nodes == curr_missing_neighbor, 1); for k = 1 : size(missing_nodes_mapping,1) if missing_nodes_mapping(k, j) == 0 %if we start with 1000 nodes, and we have 5 missing nodes, after %adding one node at this point, the size of the graph is 1001. 5 nodes %will be removed so the correct index of the new node will be 1001 - 5 = 996. %The next one is 997 and so on. missing_nodes_mapping(k, j) = size(data,1) - num_missing_nodes; added_node = 1; break; end end %if all the column is non-zero, add a new row and put it there if added_node == 0 missing_nodes_mapping = [missing_nodes_mapping; zeros(1, size(missing_nodes_mapping,2))]; missing_nodes_mapping(size(missing_nodes_mapping,1), j) = size(data,1) - num_missing_nodes; end end end end %remove the missing nodes from the matrix (missing nodes MUST be sorted in descending order!! %so that removing one does not affect the index of the others) for j = 1:size(missing_nodes,2) missing_node_idx = missing_nodes(j); %remove column data(:, missing_node_idx) = []; dataWithAtt(:, missing_node_idx+totalAttNum) = []; %remove row data(missing_node_idx, :) = []; dataWithAtt(missing_node_idx+totalAttNum, :) = []; end % debug = 1; % if debug == 1 % for i=graph_size+1-size(missing_nodes,2):size(data,2) % neighbor = find(data(i,:), 1); % if size(neighbor,2) == 0 % fprintf('RemoveRandomNodes2-debug: node %d has no neighbors\n', i); % end % end % end end %function %sigal - move old implementation to function function [missing_nodes] = ChooseMissingNodes(num_nodes_to_remove, data, dataWithAtt, totalAttNum, numAttPerPH, missing_nodes_mapping) missing_nodes_all_neighbors = zeros(1, size(data,2)); %randomize a list of nodes to remove and sort it if size(missing_nodes_mapping,1)> 0 missing_nodes = sort(missing_nodes_mapping(1,:) , 'descend'); %find all missing node neighbors for curr_missing_node = missing_nodes missing_nodes_all_neighbors = missing_nodes_all_neighbors | data(curr_missing_node,:); missing_nodes_all_neighbors(1,curr_missing_node)=1; end else missing_nodes = []; end % outlier1 - nodes with only one edge numEdges = sum(data,1); invalidNodes1a = (numEdges==1); %%numEdges<3); %%(numEdges==1); missing_nodes_all_neighbors(1,invalidNodes1a) = 1; invalidNodes1b = (numEdges>25); %%(numEdges==1); missing_nodes_all_neighbors(1,invalidNodes1b) = 1; % outlier2 - nodes with less than numAttPerPH attributes numAttr = sum(dataWithAtt(totalAttNum+1:end,1:totalAttNum),2)'; invalidNodes2 = (numAttr size(data,2) fprintf('RemoveRandomNodes2: too many outliers nodes %d.\n',count); end for i=1:num_nodes_to_remove valid_nodes = find(missing_nodes_all_neighbors~=1); inx = ceil(rand(1)*size(valid_nodes,2)); node = valid_nodes(inx); % add selected node to missing_nodes list and update the all neighbors list missing_nodes = [missing_nodes node]; missing_nodes_all_neighbors(1,node)=1; missing_nodes_all_neighbors = missing_nodes_all_neighbors | data(node,:); end end %function %sigal - move old implementation to function function [missing_nodes] = ChooseMissingNodes_old(num_nodes_to_remove, data, missing_nodes_mapping, non_neighbors_distance) graph_size = size(data,1); maxRetries = 10; %randomize a list of nodes to remove and sort it if size(missing_nodes_mapping,1) == 0 missing_nodes = sort(ceil(rand(1,num_nodes_to_remove).*graph_size), 'descend'); else missing_nodes = sort( [missing_nodes_mapping(1,:) ceil(rand(1,num_nodes_to_remove).*graph_size)] , 'descend'); end %check if there are doubles in the list sizeDiff = size(missing_nodes,2) - size(unique(missing_nodes),2); while sizeDiff > 0 %selecting new nodes randomly until we have enough missing_nodes = sort( [unique(missing_nodes) ceil(rand(1,sizeDiff).*graph_size)] , 'descend'); sizeDiff = size(missing_nodes,2) - size(unique(missing_nodes),2); end %check if we selected one of the nodes which is an outlier, i.e. not connected to any other node %check if the nodes has no neighbors after we remove the sleceted nodes % initialize prev_missing_nodes if size(missing_nodes_mapping,1) > 0 prev_missing_nodes = missing_nodes_mapping(1,:); else prev_missing_nodes = []; end % make sure missing_nodes have their neighbors tryNo = 0; while tryNo0 && numAttPerPH>0 attIndices = find(data(orgNode+totalAttNum, 1:totalAttNum)==1); while size(attIndices,2) > numAttPerPH inx = ceil(rand(1)*size(attIndices,2)); attIndices(:,inx) = []; end else attIndices=[]; end new_col = ones(size(data, 1), 1) * non_neighbors_distance; new_col(friend+totalAttNum) = 1; for i=1:size(attIndices,2) new_col(i)=1; end data = [data new_col]; new_row = ones(1,size(data, 2)) * non_neighbors_distance; new_row(friend+totalAttNum) = 1; for i=1:size(attIndices,2) new_row(i)=1; end data = [data; new_row]; data(size(data, 1), size(data,2)) = 0; end % sigal - choose another node and make sure it has neighbors after the removal function [node] = ChooseNewMissingNode(data, missing_nodes, non_neighbors_distance) graph_size = size(data,1); invalidNode = 1; tryNo = 0; maxTries = min([graph_size/100, 50]); while invalidNode == 1 && tryNo0); end % sigal - check if we this is an invalid node function [tf] = InavlidNode(node, data, missing_nodes, non_neighbors_distance) maxEdges = max(2*nnz(data)/size(data,2), 10); hasNoNeighbors = sum(data(node,:))-data(node,node)==non_neighbors_distance*(size(data,2)-1); hasRemovedNeighbors = HasRemovedNeighbors(node, data, missing_nodes); duplicate = ismember(node, missing_nodes); tooManyEdges = size(find(data(node,:)==1),2) >maxEdges; tf = hasNoNeighbors || duplicate || hasRemovedNeighbors || tooManyEdges; end % sigal - try to replace selected nodes to insure the PH will have a neighbor function [tf] = ReplaceNodesNeighbors(data, prev_missing_nodes, missing_nodes, non_neighbors_distance) tf = 0; % make sure the prev selected nodes still have their neighbors if size(prev_missing_nodes,2) > 0 for i = 1:size(missing_nodes,2) node = missing_nodes(i); if ismember(node, prev_missing_nodes) && HasRemovedNeighbors(node, data, missing_nodes) % try to replace one of the other nodes for j = 1:size(missing_nodes,2) if ismember(missing_nodes(j), prev_missing_nodes)==0 && data(node,missing_nodes(j))==1 %fprintf('RemoveRandomNodes2-prevNode: try to replace outlier %d...\n',j); missing_nodes(j) = ChooseNewMissingNode(data, missing_nodes, non_neighbors_distance); tf=1; break; end end end end end % make sure the latest selected nodes still have their neighbors for i = 1:size(missing_nodes,2); node = missing_nodes(i); % skip prev_missing_nodes if ismember(node, prev_missing_nodes) continue; end if InavlidNode(node, data, missing_nodes, non_neighbors_distance) missing_nodes(i) = ChooseNewMissingNode(data, missing_nodes, non_neighbors_distance); %fprintf('RemoveRandomNodes2-newNode: try to replace outlier %d...\n',i); tf=1; end end end