function [ data, missing_nodes_mapping ] = RemoveRandomNodes( data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance, missingNodesInput ) %RemoveRandomNodes Remove num_missing_nodes from data. If some nodes are %removed already, provide missing_nodes_mapping % Detailed explanation goes here graph_size = size(data,1); max_distance = max(max(data)); % if the mapping is larger than the number of nodes we want to remove, emty % it and start a new mapping. This can happen if we finished looping over % the number of missing nodes and started a new iteration of an outer loop. if size(missing_nodes_mapping,2) > num_missing_nodes missing_nodes_mapping = []; num_nodes_to_remove = num_missing_nodes; else num_nodes_to_remove = num_missing_nodes - size(missing_nodes_mapping,2); end cont = 0; while cont == 0 cont = 1; if nargin < 5 %randomize a list of nodes to remove and sort it if nargin == 2 || size(missing_nodes_mapping,1) == 0 missing_nodes = sort(ceil(rand(1,num_nodes_to_remove).*graph_size), 'descend'); else missing_nodes = sort( [missing_nodes_mapping(1,:) ceil(rand(1,num_nodes_to_remove).*graph_size)] , 'descend'); end %check if there are doubles in the list sizeDiff = size(missing_nodes,2) - size(unique(missing_nodes),2); while sizeDiff > 0 %selecting new nodes randomly until we have enough missing_nodes = sort( [unique(missing_nodes) ceil(rand(1,sizeDiff).*graph_size)] , 'descend'); sizeDiff = size(missing_nodes,2) - size(unique(missing_nodes),2); %fprintf('selected duplicates...\n'); %cont = 0; end else missing_nodes = missingNodesInput; end %check if we selected one of the nodes which is an %outlier, i.e. not connected to any other node %TODO: optimize %isOutlier = 1; if cont == 1 for i = 1:size(missing_nodes,2) %check if all the values of this node in the adjacency matrix equal %the non_neighbors_distance (except for the diagonal of the matrix) %This should not happen if the graph is connected (was an %attempt to deal with a corner case) while sum(data(missing_nodes(i),:)) - data(missing_nodes(i),missing_nodes(i)) == non_neighbors_distance * (size(data,2) - 1) newNode = ceil(rand(1,sizeDiff).*graph_size); while ismember(newNode, missing_nodes) newNode = ceil(rand(1,sizeDiff).*graph_size); end missing_nodes(i) = newNode; %isOutlier = 1; %fprintf('selected outlier...\n'); end % for j = 1:size(data, 2) % if data(missing_nodes(i),j) ~= non_neighbors_distance && missing_nodes(i) ~= j % isOutlier=0; % end % end end end missing_nodes = sort( unique(missing_nodes) , 'descend'); % if isOutlier == 1 % cont = 0; % end end %create a list of the new nodes that each missing node %is mapped to - each link to a missing node is replaced %by a link to a new, "UNK" node missing_nodes_mapping = missing_nodes; %replace each link to a missing node with a link to a new node missing_nodes_all_neighbors = zeros(1, size(data,2)); for curr_nissing_node = missing_nodes missing_nodes_all_neighbors = missing_nodes_all_neighbors | data(curr_nissing_node,:); end missing_nodes_all_neighbors = find(missing_nodes_all_neighbors); i = 1; %while i <= size(data,1) for i = missing_nodes_all_neighbors neighbors = find(data(i,:)); missing_neighbors = intersect(neighbors, missing_nodes); missing_neighbors = sort(missing_neighbors, 'descend'); for curr_missing_neighbor = missing_neighbors %for j= 1:size(missing_nodes,2) %missing nodes is sorted descending %missing_node_idx = missing_nodes(j); if data(i,curr_missing_neighbor) == 1 new_col = ones(size(data, 1), 1) * non_neighbors_distance; new_col(i) = 1; data = [data new_col]; new_row = ones(1,size(data, 2)) * non_neighbors_distance; new_row(i) = 1; data = [data; new_row]; data(size(data, 1), size(data,2)) = 0; %add the new UNK node to the missing nodes mapping %j is the index of the missing node %look for the first zero in column j of the %missing nodes mapping and put the new node %index there added_node = 0; %add it in the first position which equals zero j = find(missing_nodes == curr_missing_neighbor, 1); for k = 1 : size(missing_nodes_mapping,1) if missing_nodes_mapping(k, j) == 0 %if we start with 1000 nodes, and %we have 5 missing nodes, after %adding one node at this point, the %size of the graph is 1001. 5 nodes %will be removed so the correct %index of the new node will be 1001 - 5 = 996. %The next one is 997 and so on. missing_nodes_mapping(k, j) = size(data,1) - num_missing_nodes; added_node = 1; break; end end %if all the column is non-zero, add a new row and put it there if added_node == 0 missing_nodes_mapping = [missing_nodes_mapping; zeros(1, size(missing_nodes_mapping,2))]; missing_nodes_mapping(size(missing_nodes_mapping,1), j) = size(data,1) - num_missing_nodes; end end end %i = i+1; end %remove the missing nodes from the matrix %(missing nodes should be sorted in descending order!! %so that removing one does not affect the index of the %others) for j = 1:size(missing_nodes,2) missing_node_idx = missing_nodes(j); % if missing_node_idx == 1 % data = data(:, 2 : size(data,2)); % data = data(2 : size(data,1), :); % else %remove column data(:, missing_node_idx) = []; %data = [data(:, 1:missing_node_idx-1) data(:,missing_node_idx+1 : size(data,2))]; %remove row data(missing_node_idx, :) = []; %data = [data(1:missing_node_idx-1, :); data(missing_node_idx+1 : size(data,1), :)]; % end end end