function [ graph_predicted, clustering ] = MissingNodeIdentification( graph_visible, num_placeholders, num_missing_nodes, affinity_calculation_type, reduce_dimensions )
%MissingNodeIdentification
% Inputs:
%   graph_visible - The visible graph adjacency matrix. The known nodes
%                   should come first, followed by the placeholders.
%   num_placeholders - The number of placeholders.
%   num_missing_nodes - The number of missing nodes. If it is unknown,
%                       enter 0 and the algorithm will try to estimate it.
%   affinity_calculation_type - which affinity measure to use (see the
%                               constants below; default is Adamic/Adar).
%   reduce_dimensions - whether the algorithm should use dimension
%                       reduction (much faster, with slightly worse
%                       performance; default is 1).
%
% Outputs:
%   graph_predicted - the predicted graph
%   clustering - a vector containing the cluster index of each placeholder

global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;

affinity_calculation_shortest_path = 0;
affinity_calculation_euclid = 1;
affinity_calculation_common_friends = 2;
affinity_calculation_random_clustering = 3;
affinity_calculation_adamic_adar = 4;
affinity_calculation_katz_beta_0_5 = 5;
affinity_calculation_katz_beta_0_05 = 6;
affinity_calculation_katz_beta_0_005 = 7;

if nargin < 5
    reduce_dimensions = 1;
end
if nargin < 4
    affinity_calculation_type = affinity_calculation_adamic_adar;
end

graph_visible = sparse(graph_visible);

%size of the visible graph
visible_size = size(graph_visible,1);
%size of the actual (original) graph
actual_graph_size = visible_size - num_placeholders + num_missing_nodes;
%index of the first unknown node
first_unk_node = visible_size - num_placeholders + 1;

if num_missing_nodes <= 0
    num_clusters_known = 0;
else
    num_clusters_known = 1;
end

%calculate the affinity matrix
affinity = CalcAffinity( graph_visible, affinity_calculation_type, actual_graph_size, num_missing_nodes);

%reduce the dimension of the affinity matrix (keep only the nodes which
%affect the clustering of the placeholders)
if reduce_dimensions > 0
    [affinity, ~, first_unk_node] = ReduceDimensions(affinity, first_unk_node);
end

%determine the number of clusters (if the number of missing nodes is known,
%it is used as the number of clusters)
k = DetermineNumberOfClusters(num_clusters_known, graph_visible, num_missing_nodes, num_placeholders);

%predict the output graph
[ graph_predicted, clustering] = PredictGraph(affinity, k, graph_visible, num_placeholders, affinity_calculation_type, 1);

end
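%Usage sketch (illustrative only): the variable names and values below are
%assumptions made for the sake of the example and are not part of this file.
%Given a visible adjacency matrix whose last rows/columns are the
%placeholders, a call might look like:
%
%   A_visible = ...;   %n-by-n sparse adjacency matrix, known nodes first, then placeholders
%   [A_pred, clusters] = MissingNodeIdentification(A_visible, 12, 3);
%   %12 placeholders and 3 missing nodes; pass 0 as the third argument to
%   %let the algorithm estimate the number of missing nodes instead.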
function [affinity] = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes)

global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;

if affinity_calculation_type == affinity_calculation_euclid
    sp_mat = graphallshortestpaths(data);
    %remove Inf values
    max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
    sp_mat_euclid = sp_mat;
    sp_mat_euclid(sp_mat == Inf) = max_value;
    affinity = CalculateAffinity(sp_mat_euclid);
    %affinity = exp(-(sp_mat.^2))/(2 * 0.3^2);
elseif affinity_calculation_type == affinity_calculation_shortest_path
    %max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
    %sp_mat_euclid = sp_mat;
    %sp_mat_euclid(sp_mat == Inf) = max_value;
    %affinity = (sp_mat_euclid + 1).^(-affinity_exp_factor);
    %affinity = spfun(affinityFunc, data);
    %affinity decays as the inverse square of the shortest-path distance
    affinity = graphallshortestpaths(data);
    affinity = affinity .^ -2;
    affinity(affinity == Inf) = 1; %self-distances of 0 become Inf; set them to 1 (added on 05/11/11)
elseif affinity_calculation_type == affinity_calculation_common_friends
    affinity = CalcAffinityByCommonNeighbors_Sparse(data, actual_graph_size, num_missing_nodes);
    %affinity = CalcAffinityByCommonNeighbors(data, actual_graph_size, num_missing_nodes);
elseif affinity_calculation_type == affinity_calculation_random_clustering
    affinity = data; %just a placeholder...
elseif affinity_calculation_type == affinity_calculation_adamic_adar
    affinity = CalculateAffinityByAdamicAdar_Sparse( data, actual_graph_size, num_missing_nodes, 1 );
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_5
    affinity = CalcAffinityByKatzBeta_Sparse( data, 0.5, 3 );
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
    affinity = CalcAffinityByKatzBeta_Sparse( data, 0.05, 4 );
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_005
    affinity = CalcAffinityByKatzBeta_Sparse( data, 0.005, 4 );
end

end
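%The helper below is an illustrative sketch only: it shows, on a small dense
%matrix, the Adamic/Adar affinity that CalculateAffinityByAdamicAdar_Sparse
%(defined elsewhere) is assumed to compute, namely
%affinity(i,j) = sum over common neighbours z of i and j of 1/log(deg(z)).
%It is not called by the pipeline above; the name and implementation are
%assumptions for illustration.
function [affinity] = AdamicAdarAffinityDenseSketch( A )
A = double(full(A) ~= 0);               %treat the graph as unweighted
deg = sum(A, 2);                        %node degrees
w = zeros(size(deg));
w(deg > 1) = 1 ./ log(deg(deg > 1));    %1/log(deg), guarding deg <= 1 (log(1) = 0)
affinity = A * diag(w) * A;             %entry (i,j) sums 1/log(deg(z)) over common neighbours z
affinity(1:size(A,1)+1:end) = 0;        %ignore self-affinity on the diagonal
end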
function [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)

global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;

first_unk_node = size(affinity,1) - num_placeholders + 1;

diagonal = sum(affinity, 2); %sum the rows
D = sparse(diag(diagonal)); %D is the matrix whose diagonal is the sum of the rows of A
clear('diagonal');

fprintf('Calculating NL\n');
D = sqrt(D);
NL1 = D * affinity * D; %scale the affinity matrix by the square roots of the node degrees
clear('D');

fprintf('calculating U\n');
fail = 0;
try
    [nEigVec, eigValues] = eigs(NL1, k);
catch ME1
    %retry with a relaxed tolerance before giving up
    opts.tol = 1e-1;
    try
        [nEigVec, eigValues] = eigs(NL1, k, 'LM', opts);
    catch ME2
        fail = 1;
    end
end

%select the k largest eigenvectors
if fail == 0
    U = [];
    %construct the normalized matrix U from the obtained eigenvectors
    for i = 1:size(nEigVec,1)
        n = sqrt(sum(nEigVec(i,:).^2));
        U(i,:) = nEigVec(i,:) ./ n;
    end

    num_samples = size(affinity,1) - first_unk_node + 1;
    if cluster_only_missing_nodes == 1
        U(1:first_unk_node - 1, :) = []; %cluster only the missing nodes
    end

    fprintf('kmeans clustering\n');
    %perform kmeans clustering on the matrix U
    fail = 1;
    while fail > 0
        try
            currK = k;
            [IDX, C, SUMD, D] = kmeans(U, currK, 'EmptyAction', 'singleton'); %if a cluster comes up empty, keep it as a singleton instead of erroring
            fail = 0;
        catch ME1
            fail = fail + 1;
            if fail < 100
                disp('error in kmeans clustering. trying again...');
            else
                %give up on clustering and select random clusters...
                IDX = randi(currK, size(U,1), 1);
                fail = 0;
            end
        end
    end
    %keep only the labels of the placeholder rows
    test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));

    %if the clustering type is random, just replace everything...
    if affinity_calculation_type == affinity_calculation_random_clustering
        test_clustering = randi(k, size(test_clustering,1), size(test_clustering,2));
    end
else
    disp('Failed in finding eigenvectors - using random!');
    if cluster_only_missing_nodes == 0
        num_samples = size(affinity,1);
    else
        num_samples = num_placeholders;
    end
    test_clustering = randi(k, num_samples, 1);
end

end

function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)

last_known_node = size(data,1) - num_placeholders;
first_unk_node = last_known_node + 1;

[test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
newNodes = CreateNewNodesFromClusters(data, test_clustering);
%assemble the predicted graph: the known-known block, plus the edges
%between the predicted nodes and the known nodes
newData = [data(1:last_known_node, 1:last_known_node), newNodes(:, 1:last_known_node)'; ...
           newNodes(:, 1:last_known_node), zeros(size(newNodes,1))];

end

function [k] = DetermineNumberOfClusters(num_clusters_known, data_untouched, num_missing_nodes, num_placeholders)
%determine k - the number of clusters
if num_clusters_known == 1
    k = num_missing_nodes;
else
    %estimate k as the number of placeholders divided by the average degree
    %of the known nodes
    known_graph_size = size(data_untouched,1) - num_placeholders;
    k = mean(sum(data_untouched(1 : known_graph_size, 1 : known_graph_size)));
    %fprintf('Average number of neighbors for known nodes = %d, number of added nodes = %d\n', k, num_added_nodes);
    k = round(num_placeholders / floor(k));
    fprintf('Rounding k to %d\n', k);
end
end

function [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node)

num_placeholders = size(affinity,1) - first_unk_node + 1;
affinity_sum = sum(affinity(first_unk_node:size(affinity,1), :)); %the sum of the affinity of the placeholders to all other nodes
nodes_to_keep = (affinity_sum > 0); %keep only nodes which have some affinity to the placeholders
nodes_to_keep(first_unk_node:size(affinity,1)) = 1; %keep all the placeholders, even if for some reason their sum is zero...
affinity = affinity(nodes_to_keep, nodes_to_keep);
first_unk_node = size(affinity,1) - num_placeholders + 1;

end

function [true_clustering] = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node)

true_clustering = []; %zeros(size(test_clustering, 1), 1);
for i = 2 : size(missing_nodes_mapping, 1)
    for j = 1 : size(missing_nodes_mapping, 2)
        if missing_nodes_mapping(i,j) ~= 0
            true_clustering(missing_nodes_mapping(i,j) - actual_graph_size + num_missing_nodes, 1) = j; % missing_nodes_mapping(1, j);
        end
    end
end

if percent_known_placeholders < 1
    true_clustering(placeholders_to_remove - last_known_node) = [];
end

end
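%The helper below is likewise an illustrative sketch only: it shows, on a
%dense matrix, the truncated Katz measure that CalcAffinityByKatzBeta_Sparse
%(defined elsewhere) is assumed to compute, i.e. the weighted sum of
%beta^l * A^l over path lengths l = 1..max_path_len. The second and third
%arguments of the calls in CalcAffinity are taken to be beta and the maximum
%path length; this is an assumption. It is not called by the pipeline above.
function [affinity] = KatzBetaAffinityDenseSketch( A, beta, max_path_len )
A = double(full(A) ~= 0);               %treat the graph as unweighted
affinity = zeros(size(A));
A_power = eye(size(A,1));
for path_len = 1 : max_path_len
    A_power = A_power * A;              %A^path_len counts walks of that length
    affinity = affinity + (beta ^ path_len) * A_power;
end
end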