function [ rand_score, purity, p_triads, missing_nodes_mapping, removed_nodes] = MissingNodes_Sparse2( dataFilePath, dataFileName, affinityType, ...
    num_missing_nodes_arr, percentKnownPlaceholdersVec, dumpSmallFlag, dumpSmallDataPath, iter, missingNodes)
%MISSINGNODES_SPARSE2 Missing-node prediction experiment on a sparse graph.
%   Loads a sparse adjacency matrix (variable 'data') from
%   [dataFilePath dataFileName], removes sets of nodes (randomly, or the
%   nodes given in missingNodes when 9 args are supplied), then tries to
%   reconstruct the missing nodes by clustering "placeholder" nodes using
%   several affinity measures (shortest path, common friends, Adamic-Adar,
%   Katz, ...) and clustering methods (spectral / k-means variants).
%   For every configuration it records a clustering-purity result (plus
%   timing and bookkeeping fields) in the struct array 'purity'.
%
%   Outputs:
%     rand_score           - always returned empty in this version
%     purity               - struct array, one entry per experiment config
%     p_triads             - always returned empty in this version
%     missing_nodes_mapping- mapping of removed nodes to their placeholders
%                            (as built by RemoveRandomNodes)
%     removed_nodes        - cell array: removed node ids per missing-node count
%
%   NOTE(review): the input argument num_missing_nodes_arr is overwritten
%   by a hard-coded vector below, so the caller's value is ignored - confirm
%   whether this is intentional.
%[SUCCESS,MESSAGE,MESSAGEID] = mkdir('C:\\MissingNodes\\Code\\Results\\',dataFileName);

% Declared but not used in this file; presumably consumed by callees - TODO confirm.
global g_threshold;

% addpath 'Spectral Clustering' % sigal 13.8.12
% addpath 'mex'% sigal 13.8.12

p_triads = [];

%create a log file for this run
%date_now = clock;
%date_now = strcat(num2str(date_now(1)),'_',num2str(date_now(2)),'_', num2str(date_now(3)),'_', num2str(date_now(4)), num2str(date_now(5)),'_', num2str(date_now(6)));
%diary(strcat('C:\missingnodes\Code\Log\log', date_now,'.log'));

%affinity calculation types
% Global "enum" of affinity types, shared with CalcAffinity and the
% clustering helpers below.
global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;

affinity_calculation_shortest_path = 0;
affinity_calculation_euclid = 1;
affinity_calculation_common_friends = 2;
affinity_calculation_random_clustering = 3;
affinity_calculation_adamic_adar = 4;
affinity_calculation_katz_beta_0_5 = 5;
affinity_calculation_katz_beta_0_05 = 6;
affinity_calculation_katz_beta_0_005 = 7;

percent_known_placeholders_vec = percentKnownPlaceholdersVec;

% With 9 arguments the caller supplies the exact nodes to remove;
% otherwise they are picked at random inside RemoveRandomNodes.
if nargin >= 9
    select_random_missing_nodes = 0;
else
    select_random_missing_nodes = 1;
end

% When 1, runs the extra "add links for unknown placeholders" pipeline
% in the innermost loop below. Disabled in this version.
compensate_for_unknown_placeholers = 0;

%configuration parameters
%num_values = 100000; %number of values to read from the raw data
%graph_sizes = [400 600 800 1000 1200 1400]; %size of the graph matrix
%graph_sizes = [400 1000]; %size of the graph matrix
% num_missing_nodes_arr = 5 : 5 : 30; %number of missing nodes from the network
%
% NOTE(review): this overwrites the function argument of the same name.
num_missing_nodes_arr = [11,21,31,41,50]; % 20;
%%num_missing_nodes_arr = 1;

unite_common_friends = 0; %should UNK nodes be united in accordance with the "friend of my friend principle"
cluster_only_missing_nodes = 1;

% if affinity_calculation_type == affinity_calculation_shortest_path || affinity_calculation_type == affinity_calculation_euclid
%     non_neighbors_distance = Inf;
% elseif affinity_calculation_type == affinity_calculation_common_friends
%     non_neighbors_distance = 0;
% end
non_neighbors_distance = 0;

withAttr = 0;
%compare SP to Kmean, 0=SP, 1=kmean on PH (mxm), 2=Kmean on PH+Nodes (m*(m+n))
kmeanTypesVec = [0 1 2];

%read the full network, up to num_values links
%disp('reading network information from file...');
fprintf('reading network information from file %s%s ...\n', dataFilePath, dataFileName);
load(strcat(dataFilePath, dataFileName), 'data');
data = sparse(data);
clear('graph');

%rand_score = zeros(2, 2, size(num_missing_nodes_arr,2), 6, 2); %(normalized or not, unite common friends, num missing nodes, affinity calculation, cluster only missing)
%rand_score_sq = rand_score;
%purity = zeros(2, size(num_missing_nodes_arr,2), 6, 2);%(unite common friends, num missing nodes, affinity calculation, cluster only missing)
%purity_sq = purity;
rand_score = [];
purity = [];
graph_size = size(data,1);
graph_edges = nnz(data)/2; % number of edges

%initialize the data matrix (binary adjacency)
disp('generating network graph...');
actual_graph_size = size(data,1);
original_data = data;
missing_nodes_mapping = [];

% Outer loop: one experiment per missing-node count.
for num_missing_nodes_idx = 1 : size(num_missing_nodes_arr,2)
    if select_random_missing_nodes
        num_missing_nodes = num_missing_nodes_arr(1, num_missing_nodes_idx);
    else
        num_missing_nodes = length(missingNodes);
    end

    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %remove random nodes                %
    %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    %randomize the missing nodes indexes and sort in descending order
    %disp('selecting random missing nodes...');
    fprintf('selecting %d random missing nodes...\n',num_missing_nodes);
    if select_random_missing_nodes
        [data, missing_nodes_mapping] = RemoveRandomNodes( original_data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance );
    else
        [data, missing_nodes_mapping] = RemoveRandomNodes( original_data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance, missingNodes);
    end
    data_untouched = data;

    % One run per fraction of placeholders assumed to be known.
    for percent_known_placeholders = percent_known_placeholders_vec
        data = data_untouched;
        S = zeros(1, size(data,2));
        num_placeholders_to_remove = 0;
        placeholders_to_remove = [];
        % RemoveRandomNodes appends one placeholder node per missing-node
        % edge endpoint; placeholders occupy the last rows/columns of data.
        num_placeholders = size(data,1) - actual_graph_size + num_missing_nodes ;
        last_known_node = size(data,1) - num_placeholders;

        if percent_known_placeholders < 1
            num_placeholders_to_remove = round(num_placeholders * (1 - percent_known_placeholders));
            while size(placeholders_to_remove, 2) < num_placeholders_to_remove
                %randomly selecting unique placeholder indexes
                placeholders_to_remove = unique([placeholders_to_remove, randi(num_placeholders, 1, num_placeholders_to_remove - size(placeholders_to_remove, 2))]);
            end
            %rand_vec = rand(1, num_placeholders);
            %placeholders_to_remove = find(rand_vec > percent_known_placeholders) + last_known_node;
            placeholders_to_remove = placeholders_to_remove + last_known_node;
            %S is the group of neighbors of the unknown placeholders
            S = data(placeholders_to_remove(1), :);
            for i = placeholders_to_remove
                S = S | data(i,:);
            end
            %switch from binary vector to list of indexes
            % S = find(S);
            data_all_placeholders = data;
            data(placeholders_to_remove,:) = [];
            data(:,placeholders_to_remove) = [];
            num_placeholders = num_placeholders - num_placeholders_to_remove;
        end

        removed_nodes{num_missing_nodes_idx} = missing_nodes_mapping(1,:); %save the removed nodes in each iteration

        %disp('finding all pairs shortest paths...');
        %if we got an argument for the affinity calculation type then use
        %it. otherwise loop over all the types
        if nargin >= 3
            affinity_types = affinityType;
        else
            affinity_types = affinity_calculation_shortest_path: affinity_calculation_katz_beta_0_005;
        end

        orig_S = S;
        for affinity_calculation_type = affinity_types
            fprintf('affinity_calculation_type = %d\n', affinity_calculation_type);
            %data = data_untouched;
            first_unk_node = actual_graph_size - num_missing_nodes + 1;
            num_added_nodes = size(data,1) - first_unk_node + 1;

            % calculate the affinity / similarity matrix
            fprintf('calculating affinity matrix, type %d...\n', affinity_calculation_type);
            tic %affinity_calc_time
            affinity = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes);
            affinity_calc_time = toc;%affinity_calc_time

            %TODO: extend the dimension reduction to adding missing
            %links / reclustering
            for reduce_dimensions = 0%1 %[0 1] %0 must be first because it does not change the affinity matrix
                reduce_dim_time = 0;
                if reduce_dimensions == 1
                    tic %ReduceDimensions
                    [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node);
                    reduce_dim_time = toc; %ReduceDimensions
                end

                fprintf('calculating true clustering\n');
                true_clustering = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node);
                %figure,imshow(affinity,[]), title('Affinity Matrix')

                for num_clusters_known = 1 %%[1 4]
                    % sigal 29.7.13
                    % Test other clustering kmean types
                    if affinity_calculation_type == affinity_calculation_random_clustering
                        kmeanTypes = 0;
                    else
                        kmeanTypes = kmeanTypesVec;
                    end

                    % loop over added clustering types
                    for kmeanType = kmeanTypes
                        k = DetermineNumberOfClusters(num_clusters_known, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes);

                        debugEstimateK = 0; %(num_clusters_known == 1);
                        if debugEstimateK == 1 && affinity_calculation_type == 2
                            for type=[0 4 8]
                                estK = DetermineNumberOfClusters(type, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes);
                            end
                        end

                        % withAttr encodes the k-estimation variant (tens digit)
                        % plus the kmeanType (units digit) for logging/saving.
                        if num_clusters_known == 1
                            withAttr = 0;
                        elseif num_clusters_known == 0
                            withAttr = 10;
                        else
                            withAttr = num_clusters_known*10;
                        end
                        % sigal 29.7.13 - add kmeanType to alg type
                        withAttr = withAttr+kmeanType;

                        last_known_node = first_unk_node - 1;
                        fprintf('calculating PredictGraph with k=%d and reduce_dimensions=%d ...\n',k,reduce_dimensions);
                        tic %graph_predict_time
                        [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, kmeanType);
                        graph_predict_time = toc; %graph_predict_time

                        if compensate_for_unknown_placeholers == 1
                            fprintf('*** running with compensate_for_unknown_placeholers mode ...\n');
                            S = orig_S;
                            if size(newData,1) > size(S,2)
                                %for breakpoint
                                tttt = 98;
                            end
                            % Add Gaussian noise to the neighbor indicator so link
                            % selection below is not deterministic.
                            sigma = 1/4;
                            S = S(1:size(newData,1));
                            S = S + randn(size(S)) * sigma;
                            sorted_S = sort(S, 'descend');
                            %sum over the columns and find the columns which
                            %indicate at least one neighbor
                            neighbors_of_new_nodes = find(sum(newData(first_unk_node:size(newData,1), :)));
                            first_united_node = size(newData,1) - k +1;
                            newAffinity = CalcAffinityByKatzBeta_Sparse( newData, 0.05, 4 );
                            %newAffinity = CalculateAffinityByAdamicAdar_Sparse(newData, size(newData,1), 0, 0);
                            newNodesAffinity = newAffinity(first_united_node:size(newAffinity,1), :);
                            newNodesAffinity(newNodesAffinity>=1) = 0;
                            newNodesAffinity = newNodesAffinity / max(max(newNodesAffinity));
                            %newNodesAffinity = newNodesAffinity / 2;
                            % Zero the affinity of already-linked pairs, then renormalize.
                            newNodesAffinity(newData(first_united_node:size(newAffinity,1), :) >= 1) = 0;
                            newNodesAffinity = (newNodesAffinity / max(max(newNodesAffinity)));
                            %%%%trying to take only the
                            %%%%k highest affinities
                            sortedNewNodesAffinity = sort(newNodesAffinity(:), 'descend');
                            %affinityThreshold = sortedNewNodesAffinity(k + size(neighbors_of_new_nodes, 2));
                            newNodesAffinity_orig = newNodesAffinity;
                        end

                        if percent_known_placeholders < 1
                            %calculating as a function
                            %of number of links added
                            maxNumLinksToAdd = mean(sum(data(1:last_known_node, 1:last_known_node))) * num_placeholders;
                            maxNumLinksToAdd = min(maxNumLinksToAdd, 25);
                        else
                            maxNumLinksToAdd = 0;
                        end
                        %%%%% for distance as
                        %%%%% function of num
                        %%%%% placeholders %%%%
                        %max_neighbors = S >= sorted_S(maxNumLinksToAdd);

                        for numLinksToAdd = 0 : maxNumLinksToAdd
                            if compensate_for_unknown_placeholers == 1
                                newNodesAffinity = newNodesAffinity_orig;
                                neighbors = [];
                                if numLinksToAdd > 0
                                    neighbors = find(S >= sorted_S(numLinksToAdd), numLinksToAdd);
                                end
                                newDataWithMissingLinks = newData;
                                newDataForClustering = data;
                                % For each selected neighbor, link it to its most
                                % affine predicted node and append a new placeholder
                                % row/column for re-clustering.
                                for neighbor = neighbors
                                    [value, closest_new_node] = max(newNodesAffinity(:,neighbor));
                                    closest_new_node = closest_new_node(1);
                                    newDataWithMissingLinks(first_united_node + closest_new_node - 1, neighbor) = 1;
                                    newDataWithMissingLinks(neighbor, first_united_node + closest_new_node - 1) = 1;
                                    newPlaceholder = zeros(1, size(newDataForClustering,2));
                                    newPlaceholder(neighbor) = 1;
                                    newDataForClustering = [newDataForClustering, newPlaceholder'; newPlaceholder, 0];
                                end
                                affinityWithS = CalcAffinity( newDataForClustering, affinity_calculation_type, actual_graph_size, num_missing_nodes);
                                if reduce_dimensions == 1
                                    [affinityWithS, num_placeholdersWithS, first_unk_node_with_s] = ReduceDimensions(affinityWithS, first_united_node);
                                else
                                    num_placeholdersWithS = num_placeholders + length(neighbors);
                                    first_unk_node_with_s = first_united_node;
                                end
                                [newPredictedGraph, newClustering] = PredictGraph(affinityWithS, k, newDataForClustering, num_placeholdersWithS, affinity_calculation_type, cluster_only_missing_nodes);
                                %remap the original data so that
                                %the known nodes match the
                                %predicted data and the missing
                                %nodes match the predicted nodes
                                %created from each cluster
                                perm_vector = 1:size(original_data,1);
                                perm_vector(missing_nodes_mapping(1,:)) = [];
                                perm_vector = [perm_vector, missing_nodes_mapping(1,:)];
                                remapped_data = original_data(perm_vector,perm_vector);
                                [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, first_united_node : size(remapped_data,1), neighbors); %changed from neighbors to perm(max_neighbors) - to be fair when there are more nieghbors
                                %small_data2 = DecreaseGraphSize(newData, last_known_node+1 : size(newData,1));
                                %new_nodes_affinity_sum = sum(sum(newNodesAffinity));
                                %in case there is an empty cluster, the unrelated nodes may contain node index that does not exist
                                %in the predicted graph (which may contain less nodes)
                                indices_to_remove(indices_to_remove > size(newPredictedGraph,2)) = [];
                                small_data2 = newData;
                                small_data2(indices_to_remove,:) = [];
                                small_data2(:,indices_to_remove) = [];
                                %small_data3 = DecreaseGraphSize(newDataWithMissingLinks, last_known_node+1 : size(newDataWithMissingLinks,1));
                                small_data3 = newDataWithMissingLinks;
                                small_data3(indices_to_remove,:) = [];
                                small_data3(:,indices_to_remove) = [];
                                small_data4 = newPredictedGraph;
                                small_data4(indices_to_remove,:) = [];
                                small_data4(:,indices_to_remove) = [];
                                %%% Sigal 23.6.13 - save reduce graphs for GED
                                if dumpSmallFlag == 1
                                    saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, numLinksToAdd, num_missing_nodes, small_data, 1);
                                    saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, numLinksToAdd, num_missing_nodes, small_data2, 2);
                                end
                                %only calculate on the first iteration to save time
                                % NOTE(review): GraphEditDistance calls are disabled;
                                % 99 is a sentinel value, not a real distance.
                                if numLinksToAdd == 0
                                    edit_distance = 99; %GraphEditDistance( small_data, small_data2, num_missing_nodes );
                                    edit_distance2 = edit_distance;
                                    edit_distance3 = edit_distance;
                                else
                                    edit_distance2 = 99; %GraphEditDistance( small_data, small_data3, num_missing_nodes );
                                    edit_distance3 = 99; %GraphEditDistance( small_data, small_data4, num_missing_nodes );
                                    if dumpSmallFlag == 1
                                        saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, numLinksToAdd, num_missing_nodes, small_data3, 3);
                                        saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, numLinksToAdd, num_missing_nodes, small_data4, 4);
                                    end
                                end
                            else
                                perm_vector = 1:size(original_data,1);
                                perm_vector(missing_nodes_mapping(1,:)) = [];
                                perm_vector = [perm_vector, missing_nodes_mapping(1,:)];
                                remapped_data = original_data(perm_vector,perm_vector);
                                [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, (size(remapped_data,1) - num_missing_nodes + 1) : size(remapped_data,1), []); %changed from neighbors to perm(max_neighbors) - to be fair when there are more nieghbors
                                indices_to_remove(indices_to_remove > size(newData,2)) = [];
                                small_data2 = newData;
                                small_data2(indices_to_remove,:) = [];
                                small_data2(:,indices_to_remove) = [];
                                % Sigal 23.6.13 - save reduce graphs for GED
                                if dumpSmallFlag == 1
                                    saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttr, num_missing_nodes, small_data, 1);
                                    saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttr, num_missing_nodes, small_data2, 2);
                                end
                                %sigal 6.12.12 - *** TODO *** temporary for test only TODO
                                %edit_distance = GraphEditDistance( small_data, small_data2, num_missing_nodes );
                                edit_distance = 99;
                            end

                            %fprintf('num_links_to_add = %d; missing links distance = %d, new clustering dist = %d\n', numLinksToAdd, full(edit_distance2), full(edit_distance3));
                            fprintf('calculating purity\n');
                            curr_index = size(purity,2) + 1;
                            %purity for actual clustering
                            try
                                temp_purity = ClusteringPurity(true_clustering, test_clustering);
                            catch ME1
                                temp_purity = 99; % Sigal 12.8.12 - tmp
                                % ddddd = 1;
                            end

                            % Record this configuration's result and metadata.
                            purity(curr_index).score = temp_purity;
                            purity(curr_index).score_sq = temp_purity^2;
                            purity(curr_index).unite_common_friends = unite_common_friends;
                            purity(curr_index).num_missing_nodes_idx = num_missing_nodes_idx;
                            purity(curr_index).num_missing_nodes = num_missing_nodes_arr(num_missing_nodes_idx);
                            purity(curr_index).affinity_calculation_type = affinity_calculation_type;
                            purity(curr_index).withAttr = withAttr; %sigal - 10.7.13
                            purity(curr_index).cluster_only_missing_nodes = cluster_only_missing_nodes;
                            purity(curr_index).num_clusters_known = num_clusters_known;
                            purity(curr_index).k = k; % sigal 26.6.13 - the predict # of clusters
                            purity(curr_index).num_placeholders = num_placeholders;
                            purity(curr_index).num_placeholders_to_remove = num_placeholders_to_remove;
                            purity(curr_index).iteration = 1;
                            purity(curr_index).test_clustering = test_clustering;
                            purity(curr_index).true_clustering = true_clustering;
                            purity(curr_index).graph_size = graph_size;
                            purity(curr_index).graph_edges = graph_edges; % sigal - number of edges
                            purity(curr_index).inverse_purity = 1; % Sigal 12.8.12 - tmp % CalculateInversePurity(true_clustering, test_clustering);
                            purity(curr_index).NMI = 0; %CalcNormalizedMutualInformation(true_clustering, test_clustering);
                            purity(curr_index).removed_nodes = removed_nodes;
                            purity(curr_index).percent_known_placeholders = percent_known_placeholders;
                            purity(curr_index).reduce_dimensions = reduce_dimensions;
                            purity(curr_index).missing_nodes_mapping = missing_nodes_mapping;
                            purity(curr_index).compensate_for_unknown_placeholers = compensate_for_unknown_placeholers;
                            purity(curr_index).edit_distance = edit_distance;
                            if compensate_for_unknown_placeholers == 1
                                purity(curr_index).numLinksToAdd = numLinksToAdd;
                                purity(curr_index).edit_distance_missing_links = edit_distance2;
                                purity(curr_index).edit_distance_new_clustering = edit_distance3;
                                fprintf('numLinksToAdd - %d\nedit_distance - %d\nedit_distance_missing_links - %d\nedit_distance_new_clustering - %d\n', numLinksToAdd, full(edit_distance), full(edit_distance2), full(edit_distance3));
                            end
                            purity(curr_index).affinity_calc_time = affinity_calc_time;
                            purity(curr_index).reduce_dim_time = reduce_dim_time;
                            purity(curr_index).graph_predict_time = graph_predict_time;

                            % Without compensation the numLinksToAdd loop is a single pass.
                            if compensate_for_unknown_placeholers == 0
                                break
                            end
                        end

%                         purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
%                             purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity;
%
%                         purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
%                             purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity^2;
%
                        LogMsg(sprintf('Sparse2: Size=%d,Miss=%d,PHs=%d,Affinity=%d,Att=%d,Purity=%.3f', ...
                            graph_size,purity(curr_index).num_missing_nodes,num_placeholders,affinity_calculation_type,withAttr,temp_purity));

                        %fprintf('affinity_calculation_type = %d, unite_common_friends = %d\n', affinity_calculation_type, unite_common_friends);
                        %fprintf('Graph size: %d, Number of missing nodes: %d, Purity: %f \n' ,graph_size, num_missing_nodes, temp_purity);
                        %fprintf('============================================\n\n\n');
                        %clear U; clear eigValues; clear eigVectors;
                    end
                end
            end
            clear('affinity');
        end
    end
end
end

function [affinity] = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes)
%CALCAFFINITY Build a node-affinity matrix for the requested affinity type.
%   Dispatches on the global affinity-type "enum" to one of several
%   similarity measures (shortest path, common neighbors, Adamic-Adar,
%   Katz with various betas). For the random-clustering type the adjacency
%   matrix itself is returned as a placeholder.
global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;

if affinity_calculation_type == affinity_calculation_euclid
    sp_mat = graphallshortestpaths(data);
    %remove INF values
    % Unreachable pairs get (max finite distance + 1) instead of Inf.
    max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
    sp_mat_euclid = sp_mat;
    sp_mat_euclid(sp_mat == Inf) = max_value;
    affinity = CalculateAffinity(sp_mat_euclid);
    %affinity = exp(-(sp_mat.^2))/(2 * 0.3^2);
elseif affinity_calculation_type == affinity_calculation_shortest_path
    % max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
    % sp_mat_euclid = sp_mat;
    % sp_mat_euclid(sp_mat == Inf) = max_value;
    % affinity = (sp_mat_euclid + 1).^(-affinity_exp_factor);
    %affinity = spfun(affinityFunc, data);
    % Affinity = 1/d^2; a zero distance (self) produced Inf, mapped to 1.
    affinity = graphallshortestpaths(data);
    affinity = affinity .^ -2;
    affinity(affinity == Inf) = 1; %added on 05/11/11
elseif affinity_calculation_type == affinity_calculation_common_friends
    affinity = CalcAffinityByCommonNeighbors_Sparse(data, actual_graph_size, num_missing_nodes);
    %affinity = CalcAffinityByCommonNeighbors(data, actual_graph_size, num_missing_nodes);
elseif affinity_calculation_type == affinity_calculation_random_clustering
    affinity = data; %just a placeholder...
elseif affinity_calculation_type == affinity_calculation_adamic_adar
    affinity = CalculateAffinityByAdamicAdar_Sparse( data, actual_graph_size, num_missing_nodes, 1 );
    if nnz(affinity) < 5
        %for breakpoint: near-empty affinity matrix
        x = 8;
    end
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_5
    affinity = CalcAffinityByKatzBeta_Sparse( data, 0.5, 3 );
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
    affinity = CalcAffinityByKatzBeta_Sparse( data, 0.05, 4 );
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_005
    affinity = CalcAffinityByKatzBeta_Sparse( data, 0.005, 4 );
end
end

function [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
%SPECTRALCLUSTERING Cluster placeholder nodes via spectral decomposition.
%   Normalizes the affinity matrix, takes the k leading eigenvectors,
%   row-normalizes them, and k-means clusters the rows. Falls back to a
%   random clustering if eigs fails, or after 100 failed kmeans attempts.
%   Returns cluster labels for the last num_placeholders rows only.
global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;

first_unk_node = size(affinity,1) - num_placeholders + 1;
diagonal = sum(affinity, 2); %sum the rows
D = sparse(diag(diagonal)); %D is the matrix whose diagonal is the sum of the rows of A
clear('diagonal');
fprintf('Calculating NL\n');
% NOTE(review): this computes D^(1/2)*A*D^(1/2); the usual normalized
% affinity is D^(-1/2)*A*D^(-1/2) - confirm this is intentional.
D = sqrt(D);
NL1 = D * affinity * D;
clear('D');
fprintf('calculating U\n');
fail = 0;
try
    [nEigVec,eigValues] = eigs(NL1,k);
catch ME1
    % Retry with a looser tolerance before giving up.
    opts.tol = 1e-1;
    try
        [nEigVec,eigValues] = eigs(NL1,k, 'LM', opts);
    catch ME2
        fail = 1;
    end
end
% select k largest eigen vectors
if fail == 0
    U = [];
    % construct the normalized matrix U from the obtained eigen vectors
    for i=1:size(nEigVec,1)
        n = sqrt(sum(nEigVec(i,:).^2));
        U(i,:) = nEigVec(i,:) ./ n;
    end
    num_samples = size(affinity,1) - first_unk_node + 1;
    if cluster_only_missing_nodes == 1
        U(1:first_unk_node - 1 ,:) = []; %cluster only the missing nodes
    end
    fprintf('kmeans clustering\n');
    % perform kmeans clustering on the matrix U
    fail = 1;
    while fail > 0
        try
            currK = k;
            [IDX,C, SUMD, D] = kmeans(U,currK, 'EmptyAction', 'singleton'); %in case of an empty cluster just drop it
            fail = 0;
        catch ME1
            fail = fail + 1;
            if fail < 100
                disp('error in kmeans clustering. trying again...');
            else
                %give up on clustering and select random clusters...
                IDX = randi(currK, size(U));
                fail = 0;
            end
        end
    end
    test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));
    %if it's random just replace everything...
    if affinity_calculation_type == affinity_calculation_random_clustering
        test_clustering = randi(k, size(test_clustering,1), size(test_clustering,2));
    end
else
    disp('Failed in finding eigenvectors - using random!');
    if cluster_only_missing_nodes == 0
        num_samples = size(affinity,1);
    else
        num_samples = num_placeholders;
    end
    test_clustering = randi(k, num_samples, 1);
end
end

% Sigal 29.7.13
% add option for clustering with Kmean intead of SP
% options: kmean_type 0=SP, 1=kmean on PH (mxm), 2=Kmean on PH+Nodes (m*(m+n))
function [test_clustering] = KMeanClustering(affinity, k, num_placeholders, affinity_calculation_type, kmean_type)
%KMEANCLUSTERING Cluster placeholders by k-means directly on affinity rows.
%   kmean_type 1 uses only the placeholder-to-placeholder sub-matrix;
%   any other value uses the placeholders' full affinity rows.
%   Same retry/random-fallback policy as SpectralClustering.
global affinity_calculation_random_clustering;
first_unk_node = size(affinity,1) - num_placeholders + 1;
num_samples = num_placeholders;
if kmean_type == 1
    U(1:num_placeholders,1:num_placeholders) = affinity(first_unk_node:end,first_unk_node:end);
else
    U(1:num_placeholders,:) = affinity(first_unk_node:end,:);
end
fprintf('kmeans clustering type=%d\n',kmean_type);
% perform kmeans clustering on the matrix U
fail = 1;
while fail > 0
    try
        currK = k;
        [IDX,C, SUMD, D] = kmeans(U,currK, 'EmptyAction', 'singleton'); %in case of an empty cluster just drop it
        fail = 0;
    catch ME1
        fail = fail + 1;
        if fail < 100
            disp('error in kmeans clustering. trying again...');
        else
            %give up on clustering and select random clusters...
            IDX = randi(currK, size(U));
            fail = 0;
        end
    end
end
test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));
%if it's random just replace everything...
if affinity_calculation_type == affinity_calculation_random_clustering
    test_clustering = randi(k, size(test_clustering,1), size(test_clustering,2));
end
end %function KMeanClustering

% % Sigal 29.7.13
% % original implementation with SP, i.e use kmean_type = 0
% function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes)
%     [newData, test_clustering] = PredictGraph2(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, 0);
% end %function PredictGraph

% Sigal 29.7.13
% add option for clustering with Kmean intead of SP
% original implementation with SP, i.e use kmean_type = 0
function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, kmean_type)
%PREDICTGRAPH Cluster placeholders and splice predicted nodes into the graph.
%   Clusters the placeholder nodes (spectral when kmean_type==0, k-means
%   otherwise), merges each cluster into one predicted node via
%   CreateNewNodesFromClusters, and returns the known sub-graph with the
%   predicted nodes appended as new rows/columns.
% sigal - backwards compatibility
if nargin < 7
    kmean_type = 0;
end
last_known_node = size(data,1) - num_placeholders;
first_unk_node = last_known_node + 1;
if kmean_type == 0
    [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
else
    [test_clustering] = KMeanClustering(affinity, k, num_placeholders, affinity_calculation_type, kmean_type);
end
newNodes = CreateNewNodesFromClusters(data, test_clustering);
newData = [data(1:last_known_node,1:last_known_node), newNodes(:, 1:last_known_node)';...
    newNodes(:,1:last_known_node), zeros(size(newNodes,1))];
end

function [k] = DetermineNumberOfClusters(num_clusters_known, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes)
%DETERMINENUMBEROFCLUSTERS Choose/estimate k, the number of clusters.
%   num_clusters_known == 1 uses the true count (num_missing_nodes);
%   other values select different degree-based estimation heuristics
%   (see the e=a*n / e=a*n^2 edge-density models below). Estimates are
%   logged to EstimateK_Log2.txt.
%determine k - the number of clusters
if num_clusters_known == 1
    k = num_missing_nodes;
else
    numKnownNodes = actual_graph_size - num_missing_nodes;
    sumKnownEdges = sum(sum(data_untouched(1 : numKnownNodes, 1 : numKnownNodes)));
    meanKnownEdges = sumKnownEdges/numKnownNodes;
    addedEdges = num_added_nodes*2; % undirect graph
    fprintf('EstimatedK: numKnownN=%d, meanKnownE=%.3f, addedE=%d, missN=%d, meanMissE=%.3f\n', ...
        numKnownNodes,full(meanKnownEdges),num_added_nodes,num_missing_nodes,addedEdges/num_missing_nodes);
    if num_clusters_known == 0
        %k = round(num_added_nodes / meanKnownEdges);
        k = round(num_added_nodes / floor(meanKnownEdges));
        %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
    elseif num_clusters_known == 2
        %guessing upper limit
        k = 2*round(num_added_nodes / meanKnownEdges);
        %fprintf('EstimatedK: type=%d, actual=%d, guessing upper limit %d\n',num_clusters_known, num_missing_nodes, k);
    elseif num_clusters_known == 3
        % e=a*n
        a = meanKnownEdges;
        e = sumKnownEdges+num_added_nodes;
        k = round(e/a-numKnownNodes);
        %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
    elseif num_clusters_known == 4
        % e=a*n^2
        a = meanKnownEdges/numKnownNodes;
        e = sumKnownEdges+addedEdges;
        k = round(sqrt(e/a)-numKnownNodes);
        %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
    elseif num_clusters_known == 5
        % e=a*n^2
        a = meanKnownEdges/numKnownNodes;
        e = sumKnownEdges+addedEdges;
        k = ceil(sqrt(e/a)-numKnownNodes);
        %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
    elseif num_clusters_known == 6
        % e=a*n
        a = meanKnownEdges;
        e = sumKnownEdges+num_added_nodes;
        k = ceil(e/a-numKnownNodes);
    elseif num_clusters_known == 7
        k = ceil(num_added_nodes / meanKnownEdges);
    elseif num_clusters_known == 8
        k = round(num_added_nodes / meanKnownEdges);
    end
    LogMsg(sprintf('EstimatedK(size,PHs,type,actual,k):\t%d\t%d\t%d\t%d\t%d', ...
        actual_graph_size,num_added_nodes,num_clusters_known, num_missing_nodes, k),'EstimateK_Log2.txt');
end
end %function

function [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node)
%REDUCEDIMENSIONS Drop nodes with zero affinity to every placeholder.
%   Keeps all placeholder rows/columns unconditionally, plus any known node
%   with nonzero summed affinity to the placeholders. Returns the pruned
%   matrix and the updated placeholder count / first-placeholder index.
size_before = size(affinity,1);
num_placeholders = size(affinity,1) - first_unk_node + 1;
affinity_sum = sum(affinity(first_unk_node:size(affinity,1),:)); %the sum of the affinity of placeholders to all other nodes
nodes_to_keep = (affinity_sum > 0); %keep only nodes which have some affinity to the placeholders
nodes_to_keep(first_unk_node:size(affinity,1)) = 1; %keep all the placeholders even if for some reason they have a sum of zero...
affinity = affinity(nodes_to_keep, nodes_to_keep);
size_after = size(affinity,1);
first_unk_node = size(affinity,1) - num_placeholders + 1;
LogMsg(sprintf('ReduceDimensions from %d to %d',size_before,size_after),'ReduceDimensions_Log.txt');
end

function [true_clustering] = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node)
%BUILDTRUECLUSTERING Ground-truth labels: placeholder -> originating node.
%   Rows 2..end of missing_nodes_mapping hold placeholder indices per
%   missing node (column j); each placeholder gets label j. Placeholders
%   that were removed as "unknown" are deleted from the labeling.
true_clustering = []; %zeros(size(test_clustering, 1), 1);
for i = 2 : size(missing_nodes_mapping, 1)
    for j = 1 : size(missing_nodes_mapping,2)
        if missing_nodes_mapping(i,j) ~= 0
            true_clustering(missing_nodes_mapping(i,j) - actual_graph_size + num_missing_nodes, 1) = j; % missing_nodes_mapping(1, j);
        end
    end
end
if percent_known_placeholders < 1
    true_clustering(placeholders_to_remove - last_known_node) = [];
end
end

function saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_type, withAttr, missNodes, small_data, i)
%SAVESMALLDATA Dump a reduced graph to disk for offline GED evaluation.
%   File name encodes data set, iteration, missing-node count, affinity
%   type, attribute code and graph variant index i.
%%% Sigal 24.1.13 - TODO
outFile = sprintf('%s_%d_%d_%d_%d_small_data_%d', dataFileName, iter, missNodes, affinity_type, withAttr, i);
if affinity_type == 9
    % save instead a dummy size (1) and the best_alg
    SaveIntMatrixToFile(strcat(dumpSmallDataPath, outFile,'_edges.txt'), small_data, 1);
else
    SaveAsciiGraph(dumpSmallDataPath, outFile, small_data, 1); %% also save graph size
end
end