123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288 |
- function [ graph_predicted, clustering ] = MissingNodeIdentification( graph_visible, num_placeholders, num_missing_nodes, affinity_calculation_type, reduce_dimensions )
- %MissingNodeIdentification
- % Inputs:
- % graph_visible - The visible graph adjacency matrix. The known nodes
- % should be first, followed by the placeholders
- % num_placeholders - The number of placeholders
- % num_missing_nodes - The number of missing nodes. If it is unkown,
- % enter 0 and the algorithm will try to estimate it.
- % affinity_calculation_type - which affinity to use (default is Adamic/Adar)
- % reduce_dimensions - If the algorithm should use dimension reduction
- % (much faster but slightly worse performance - default is 1)
- %
- % Outputs:
- % graph_predicted - the predicted graph
- % clustering - a vector containing the cluster index of each
- % placeholder
-
-
- global affinity_calculation_shortest_path;
- global affinity_calculation_euclid;
- global affinity_calculation_common_friends;
- global affinity_calculation_random_clustering;
- global affinity_calculation_adamic_adar;
- global affinity_calculation_katz_beta_0_5;
- global affinity_calculation_katz_beta_0_05;
- global affinity_calculation_katz_beta_0_005;
-
- affinity_calculation_shortest_path = 0;
- affinity_calculation_euclid = 1;
- affinity_calculation_common_friends = 2;
- affinity_calculation_random_clustering = 3;
- affinity_calculation_adamic_adar = 4;
- affinity_calculation_katz_beta_0_5 = 5;
- affinity_calculation_katz_beta_0_05 = 6;
- affinity_calculation_katz_beta_0_005 = 7;
-
- if nargin < 5
- reduce_dimensions = 1;
- end
-
- if nargin < 4
- affinity_calculation_type = affinity_calculation_adamic_adar;
- end
-
- graph_visible = sparse(graph_visible);
-
- %size of the visible graph
- visible_size = size(graph_visible,1);
- %size of the actual (original) graph
- actual_graph_size = visible_size - num_placeholders + num_missing_nodes;
- %%index of the first unknown node
- first_unk_node = visible_size - num_placeholders + 1;
-
- if num_missing_nodes <= 0
- num_clusters_known = 0;
- else
- num_clusters_known = 1;
- end
-
- %calculate the affinity matrix
- affinity = CalcAffinity( graph_visible, affinity_calculation_type, actual_graph_size, num_missing_nodes);
-
- %reduce dimension of the affinity matrix (only include nodes which affect
- %the placeholders clustering)
- if reduce_dimensions > 0
- [affinity, ~, first_unk_node] = ReduceDimensions(affinity, first_unk_node);
- end
-
- %determine the number of clusters (if it is known then the number of
- %missing nodes will be used as the number of clusters)
- k = DetermineNumberOfClusters(num_clusters_known, graph_visible, num_missing_nodes, num_placeholders);
-
- %predict the output graph
- [ graph_predicted, clustering] = PredictGraph(affinity, k, graph_visible, num_placeholders, affinity_calculation_type, 1);
-
- end
-
-
- function [affinity] = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes)
- global affinity_calculation_shortest_path;
- global affinity_calculation_euclid;
- global affinity_calculation_common_friends;
- global affinity_calculation_random_clustering;
- global affinity_calculation_adamic_adar;
- global affinity_calculation_katz_beta_0_5;
- global affinity_calculation_katz_beta_0_05;
- global affinity_calculation_katz_beta_0_005;
-
-
- if affinity_calculation_type == affinity_calculation_euclid
- sp_mat = graphallshortestpaths(data);
- %remove INF values
- max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
- sp_mat_euclid = sp_mat;
- sp_mat_euclid(sp_mat == Inf) = max_value;
- affinity = CalculateAffinity(sp_mat_euclid);
- %affinity = exp(-(sp_mat.^2))/(2 * 0.3^2);
- elseif affinity_calculation_type == affinity_calculation_shortest_path
- % max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
- % sp_mat_euclid = sp_mat;
- % sp_mat_euclid(sp_mat == Inf) = max_value;
- % affinity = (sp_mat_euclid + 1).^(-affinity_exp_factor);
-
-
- %affinity = spfun(affinityFunc, data);
- affinity = graphallshortestpaths(data);
- affinity = affinity .^ -2;
-
- affinity(affinity == Inf) = 1; %added on 05/11/11
-
- elseif affinity_calculation_type == affinity_calculation_common_friends
- affinity = CalcAffinityByCommonNeighbors_Sparse(data, actual_graph_size, num_missing_nodes);
- %affinity = CalcAffinityByCommonNeighbors(data, actual_graph_size, num_missing_nodes);
- elseif affinity_calculation_type == affinity_calculation_random_clustering
- affinity = data; %just a placeholder...
- elseif affinity_calculation_type == affinity_calculation_adamic_adar
- affinity = CalculateAffinityByAdamicAdar_Sparse( data, actual_graph_size, num_missing_nodes, 1 );
- if nnz(affinity) < 5
- x = 8;
- end
- elseif affinity_calculation_type == affinity_calculation_katz_beta_0_5
- affinity = CalcAffinityByKatzBeta_Sparse( data, 0.5, 3 );
- elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
- affinity = CalcAffinityByKatzBeta_Sparse( data, 0.05, 4 );
- elseif affinity_calculation_type == affinity_calculation_katz_beta_0_005
- affinity = CalcAffinityByKatzBeta_Sparse( data, 0.005, 4 );
- end
-
- end
-
-
- function [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
-
- global affinity_calculation_shortest_path;
- global affinity_calculation_euclid;
- global affinity_calculation_common_friends;
- global affinity_calculation_random_clustering;
- global affinity_calculation_adamic_adar;
- global affinity_calculation_katz_beta_0_5;
- global affinity_calculation_katz_beta_0_05;
- global affinity_calculation_katz_beta_0_005;
-
- first_unk_node = size(affinity,1) - num_placeholders + 1;
-
- diagonal = sum(affinity, 2); %sum the rows
- D = sparse(diag(diagonal)); %D is the matrix whose diagonal is the sum of the rows of A
- clear('diagonal');
-
- fprintf('Calculating NL\n');
-
- D = sqrt(D);
- NL1 = D * affinity * D;
- clear('D');
-
- fprintf('calculating U\n');
-
- fail = 0;
- try
- [nEigVec,eigValues] = eigs(NL1,k);
- catch ME1
- opts.tol = 1e-1;
- try
- [nEigVec,eigValues] = eigs(NL1,k, 'LM', opts);
- catch ME2
- fail = 1;
- end
- end
-
- % select k largest eigen vectors
- if fail == 0
- U = [];
- % construct the normalized matrix U from the obtained eigen vectors
- for i=1:size(nEigVec,1)
- n = sqrt(sum(nEigVec(i,:).^2));
- U(i,:) = nEigVec(i,:) ./ n;
- end
-
-
- num_samples = size(affinity,1) - first_unk_node + 1;
-
- if cluster_only_missing_nodes == 1
-
- U(1:first_unk_node - 1 ,:) = []; %cluster only the missing nodes
-
- end
-
- fprintf('kmeans clustering\n');
-
- % perform kmeans clustering on the matrix U
- fail = 1;
- while fail > 0
- try
- currK = k
- [IDX,C, SUMD, D] = kmeans(U,currK, 'EmptyAction', 'singleton'); %in case of an empty cluster just drop it
- fail = 0;
- catch ME1
- fail = fail + 1;
- if fail < 100
- disp('error in kmeans clustering. trying again...');
- else
- %give up on clustering and select random clusters...
- IDX = randi(currK, size(U));
- fail = 0;
- end
- end
- end
-
- test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));
-
-
- %if it's random just replace everything...
- if affinity_calculation_type == affinity_calculation_random_clustering
- test_clustering = randi(k, size(test_clustering,1), size(test_clustering,2));
- end
-
-
- else
- disp('Failed in finding eigenvectors - using random!');
- if cluster_only_missing_nodes == 0
- num_samples = size(affinity,1);
- else
- num_samples = num_placeholders;
- end
- test_clustering = randi(k, num_samples, 1);
- end
- end
-
-
- function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
-
- last_known_node = size(data,1) - num_placeholders;
- first_unk_node = last_known_node + 1;
-
- [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
-
- newNodes = CreateNewNodesFromClusters(data, test_clustering);
-
- newData = [data(1:last_known_node,1:last_known_node), newNodes(:, 1:last_known_node)';...
- newNodes(:,1:last_known_node), zeros(size(newNodes,1))];
-
- end
-
- function [k] = DetermineNumberOfClusters(num_clusters_known, data_untouched, num_missing_nodes, num_placeholders)
-
- %determine k - the number of clusters
- if num_clusters_known == 1
- k = num_missing_nodes;
- else
- known_graph_size = size(data_untouched,1) - num_placeholders;
- k = mean(sum(data_untouched(1 : known_graph_size, 1 : known_graph_size)));
- %fprintf('Average number of neighbors for known nodes = %d, number of added nodes = %d\n', k, num_added_nodes);
- k = round(num_placeholders / floor(k));
- fprintf('Rounding k to %d\n', k);
- end
-
- end
-
-
- function [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node)
-
- num_placeholders = size(affinity,1) - first_unk_node + 1;
- affinity_sum = sum(affinity(first_unk_node:size(affinity,1),:)); %the sum of the affinity of placeholders to all other nodes
- nodes_to_keep = (affinity_sum > 0); %keep only nodes which have some affinity to the placeholders
- nodes_to_keep(first_unk_node:size(affinity,1)) = 1; %keep all the placeholders even if for some reason they have a sum of zero...
-
- affinity = affinity(nodes_to_keep, nodes_to_keep);
-
- first_unk_node = size(affinity,1) - num_placeholders + 1;
-
- end
-
- function [true_clustering] = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node)
-
- true_clustering = []; %zeros(size(test_clustering, 1), 1);
-
- for i = 2 : size(missing_nodes_mapping, 1)
- for j = 1 : size(missing_nodes_mapping,2)
- if missing_nodes_mapping(i,j) ~= 0
- true_clustering(missing_nodes_mapping(i,j) - actual_graph_size + num_missing_nodes, 1) = j; % missing_nodes_mapping(1, j);
- end
- end
- end
-
- if percent_known_placeholders < 1
- true_clustering(placeholders_to_remove - last_known_node) = [];
- end
-
- end
|