function [ rand_score, purity, p_triads, missing_nodes_mapping, removed_nodes] = MissingNodes_S8b(dataFilePath, dataFileName, attributes, attUpperRange, attWeightVec, addMissingAttVec, normFactorVec, affinityType, ...
    num_missing_nodes_arr, attAffinityThreshold, imagesData, numImagesProfiles, imgMissProb, imgSimType, imgSimProbDiff, percentKnownPlaceholdersVec, dumpSmallFlag, dumpSmallDataPath, iter, missingNodes )
%%global g_threshold; % Sigal - 15.10.13 remove warning
% addpath 'Spectral Clustering' % sigal 13.8.12
% addpath 'mex' % sigal 13.8.12
p_triads = [];
k = 0;

%create a log file for this run
%date_now = clock;
%date_now = strcat(num2str(date_now(1)),'_',num2str(date_now(2)),'_', num2str(date_now(3)),'_', num2str(date_now(4)), num2str(date_now(5)),'_', num2str(date_now(6)));
%dump dir for save reduce graphs for GED
%dumpSmallDataPath = sprintf('%sdumpSmallData_%s/', resultsDir, date_now);
%diary(strcat('C:\missingnodes\Code\Log\log', date_now,'.log'));

%affinity calculation types
global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;
global affinity_calculation_AA_RCN;
global affinity_boost;
global affinity_boost2;
affinity_calculation_shortest_path = 0;
affinity_calculation_euclid = 1;
affinity_calculation_common_friends = 2;
affinity_calculation_random_clustering = 3;
affinity_calculation_adamic_adar = 4;
affinity_calculation_katz_beta_0_5 = 5;
affinity_calculation_katz_beta_0_05 = 6;
affinity_calculation_katz_beta_0_005 = 7;
affinity_calculation_AA_RCN = 8;
% sigal 12.3.13 add BOOST option
affinity_boost = 9;
affinity_boost2 = 8;

%%%%% for distance as function of num placeholders %%%%
expectedParms = 19;
if nargin < expectedParms
  LogMsg(sprintf('*** ERROR: MissingNodes_S8b - Invalid # of parameters, expected %d got %d',expectedParms,nargin));
  return;
end
percent_known_placeholders_vec = percentKnownPlaceholdersVec;
if nargin >= expectedParms+1
  select_random_missing_nodes = 0;
else
  select_random_missing_nodes = 1;
end
affinity_types = affinityType;
compensate_for_unknown_placeholers = 0;
compensate_vec = [0 0.3 0.65 1 1.5];
unite_common_friends = 0; %should UNK nodes be united in accordance with the "friend of my friend" principle
cluster_only_missing_nodes = 1;
% if affinity_calculation_type == affinity_calculation_shortest_path || affinity_calculation_type == affinity_calculation_euclid
%   non_neighbors_distance = Inf;
% elseif affinity_calculation_type == affinity_calculation_common_friends
%   non_neighbors_distance = 0;
% end
non_neighbors_distance = 0;
ExpNormFactorVecLen = 3;
normFactorVecLen = size(normFactorVec,2);
if normFactorVecLen ~= ExpNormFactorVecLen
  LogMsg(sprintf('*** ERROR: MissingNodes_S8b - invalid normFactorVec expected len=%d got %d',ExpNormFactorVecLen, normFactorVecLen));
  return;
end
global netAffNormFactor1;
global netAffNormFactor2;
netAffNormFactor1 = normFactorVec(1);
netAffNormFactor2 = normFactorVec(2);
netAffNormFactor3 = normFactorVec(3);
%compare SC to Kmean, 0=SC, 3=k-mean on PH (mxm), 2=k-mean on PH+Nodes (m*(m+n))
%sigal 1=kmean on PH (with already cut affinity)
kmeanTypesVec = 1; %[0 1]; %1; %%2 3];
%compare with/without attr, 0=MISC, 1=SAMI-A, 2=SAMI-N, 3=SAMI-AK (k-mean), 4=SAMI-NK
% images => 5=PMI, 7=PMI+SAMI
samiAttrVec = [0 3 5 7]; % [0 4]; % [0 1 3]; % 2];
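% Overview: for each entry of num_missing_nodes_arr the routine removes that many
% nodes (their lost edges become placeholder nodes), optionally hides part of the
% placeholders, builds network/attribute/image affinities over the placeholders,
% clusters them with spectral clustering or k-means, and appends one purity record
% per configuration to the returned 'purity' struct array.
%
% Example call (illustrative only - the file names and parameter values below are
% hypothetical placeholders, not values that ship with this code):
%   [~, purity] = MissingNodes_S8b('../data/', 'net.mat', attributes, attUpperRange, ...
%       0.3, 0, [10 10 1], 4, [5 10], 0.5, imagesData, 20, 0.1, 1, 0.2, 1, ...
%       0, '../dumpSmallData/', 1);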
% Sigal - TODO - if no SAMI-N skip data_untouched_withAtt (~line 160)
run_SAMI_N = 0;

%read the full network, up to num_values links
%disp('reading network information from file...');
fprintf('reading network information from file %s%s ...\n', dataFilePath, dataFileName);
data = load(strcat(dataFilePath, dataFileName), 'data'); %use sparse data
%a = struct2table(data);
%LogMsg(sprintf('%s', a));
data = cell2mat(struct2cell((data)));
data = sparse(data); % this is the adjacency matrix

%sigal 25.11.12
%combine the attributes with data as first #totalAttNum cols/rows
%[dataWithAtt, totalAttNum] = CombineDataWithAttributes(data, attributes, attUpperRange, attWeight);
sami_ind = find(samiAttrVec == 1);
for sami = [1 2 3 4 7]
  sami_ind = sami_ind | find(samiAttrVec == sami);
end
if sum(sami_ind,2) > 0
  [attData, totalAttNum] = PreProcessDataAttributes(data, attributes, attUpperRange);
else
  % sigal 31.1.14 support runs without attributes, i.e. only images
  attData = 0;
  totalAttNum = 0;
end
clear('graph');

%rand_score = zeros(2, 2, size(num_missing_nodes_arr,2), 6, 2); %(normalized or not, unite common friends, num missing nodes, affinity calculation, cluster only missing)
%rand_score_sq = rand_score;
%purity = zeros(2, size(num_missing_nodes_arr,2), 6, 2); %(unite common friends, num missing nodes, affinity calculation, cluster only missing)
%purity_sq = purity;
rand_score = [];
purity = [];
graph_size = size(data,1); % data == adjacency matrix
graph_edges = nnz(data)/2; % number of edges
%graph_attr_edges = 0; %% sigal 3.1.13: nnz(dataWithAtt)/2 - graph_edges; % number of attribute edges
%num_missing_nodes_arr = round(num_missing_nodes_arr .* graph_size);

%initialize the data matrix (binary adjacency)
disp('generating network graph...');
original_graph_size = size(data,1);
original_data = data;
original_attData = attData;
graph_attr_edges = nnz(attData); % number of attribute edges
avg_attr_edges = graph_attr_edges/original_graph_size;
%original_dataWithAtt = dataWithAtt;
missing_nodes_mapping = [];

for num_missing_nodes_idx = 1 : size(num_missing_nodes_arr,2)
  if select_random_missing_nodes
    num_missing_nodes = num_missing_nodes_arr(1, num_missing_nodes_idx);
  else
    num_missing_nodes = length(missingNodes);
  end
  if num_missing_nodes > numImagesProfiles
    LogMsg(sprintf('*** ERROR: MissingNodes_S8b - invalid numImagesProfiles %d vs. numMissingNodes %d.',numImagesProfiles, num_missing_nodes));
    return;
  end
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %remove random nodes %
  %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  %randomize the missing nodes indexes and sort in descending order
  %disp('selecting random missing nodes...');
  fprintf('selecting %d random missing nodes...\n',num_missing_nodes);
  %sigal - 25.11.12 - remove same nodes from data and dataWithAtt
  if select_random_missing_nodes
    %[data, missing_nodes_mapping] = RemoveRandomNodes( original_data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance );
    fprintf('testtesttes');
    fprintf('testsalam%i', num_missing_nodes);
    [data, attData, missing_nodes_mapping] = RemoveRandomNodesWithImages( original_data, original_attData, totalAttNum, num_missing_nodes, missing_nodes_mapping, numImagesProfiles );
  else
    %Sigal 13.10.13 - TODO - add option to pre selected missing node
    %Sigal - 23.1.14 - %%%%TODO - add option for images
    fprintf('*** ERROR: pre selected missing node is not supported for images \n');
    [data, attData, missing_nodes_mapping] = RemoveRandomNodes3( original_data, original_attData, totalAttNum, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance, missingNodes);
  end
  data_untouched = data;
  if run_SAMI_N
    tic %att_combine_calc_time in seconds
    data_untouched_withAtt = CombineDataWithAttributes4(data, attData);
    att_combine_calc_time = toc; %att_combine_calc_time
  else
    fprintf('Combining attributes ...==> Skip\n');
    att_combine_calc_time = 0;
  end

  % loop over addMissingAtt options
  for addMissingAtt = addMissingAttVec
    LogMsg(sprintf('S8b:addMissingAtt=%.3f',addMissingAtt));
    % loop over partial data options
    for percent_known_placeholders = percent_known_placeholders_vec
      data0 = data_untouched; %sigal - TODO all reference to data0 and S ???
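      % The block below simulates partially known placeholders: when
      % percent_known_placeholders < 1 a random subset of placeholder rows/columns
      % is deleted from data0, and S keeps the union of the deleted placeholders'
      % neighborhoods (used later by the compensate_for_unknown_placeholers mode).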
      % S is the uncertainty vector used when we don't know the placeholders
      S = zeros(1, size(data0,2));
      num_placeholders_to_remove = 0;
      placeholders_to_remove = [];
      num_placeholders = size(data0,1) - original_graph_size + num_missing_nodes ;
      last_known_node = size(data0,1) - num_placeholders;
      if percent_known_placeholders < 1
        num_placeholders_to_remove = round(num_placeholders * (1 - percent_known_placeholders));
        while size(placeholders_to_remove, 2) < num_placeholders_to_remove
          %randomly selecting unique placeholder indexes
          placeholders_to_remove = unique([placeholders_to_remove, randi(num_placeholders, 1, num_placeholders_to_remove - size(placeholders_to_remove, 2))]);
        end
        %rand_vec = rand(1, num_placeholders);
        %placeholders_to_remove = find(rand_vec > percent_known_placeholders) + last_known_node;
        placeholders_to_remove = placeholders_to_remove + last_known_node;
        %S is the group of neighbors of the unknown placeholders
        S = data0(placeholders_to_remove(1), :);
        for i = placeholders_to_remove
          S = S | data0(i,:);
        end
        %switch from binary vector to list of indexes
        % S = find(S);
        %data_all_placeholders = data0;
        data0(placeholders_to_remove,:) = [];
        data0(:,placeholders_to_remove) = [];
        num_placeholders = num_placeholders - num_placeholders_to_remove;
      end
      %save the removed nodes in each iteration
      removed_nodes{num_missing_nodes_idx} = missing_nodes_mapping(1,:);
      %save S
      orig_S = S;

      %sigal - adjust to withAttr flag TOCHECK
      %sigal - 12.6.13 - move after calcAffinity so we can calculate only the reduced entries
      % tic %att_affinity_calc_time in seconds
      % fprintf('calculating attributes affinity matrix...\n');
      % attAffinity = CalcAttributesAffinity_S3(data0, attData, last_known_node, addMissingAtt, attAffinityThreshold);
      % att_affinity_calc_time = toc; %att_affinity_calc_time
      att_affinity_merge_time = 0;
      all_clustering = [];
      all_clustering_alg = [];
      all_att_clustering = [];
      all_att_clustering_alg = [];
      all_k1_clustering = [];
      all_k1_clustering_alg = [];
      all_k3_clustering = [];
      all_k3_clustering_alg = [];
      all_A2_clustering = [];
      all_A2_clustering_alg = [];
      all_A4_clustering = [];
      all_A4_clustering_alg = [];
      % sigal 12.6.13 - use as flag for first time calculation
      attAffinity = 0;
      phsAttAffinity = 0;
      phsAttAffinity2 = 0;
      phsImgAffinity = 0;

      % loop over different affinity_types
      for affinity_calculation_type = affinity_types
        % sigal 12.11.13 - use as flag for first time calculation
        phsNetAffinity = 0;
        if affinity_calculation_type == affinity_boost
          withAttrVec = affinity_boost; %[affinity_boost affinity_boost2];
        elseif affinity_calculation_type == affinity_calculation_random_clustering
          withAttrVec = 0;
        else
          % sigal 12.3.13 - run several times:
          % 0=original, 1=weighted affinity, 2=weighted dataWithAttr
          withAttrVec = samiAttrVec; %Sigal - 15.10.13
        end
        % sigal 25.11.12
        % run loop - once as original without attributes and next with attributes
        for withAttr = withAttrVec
          netAffinity = 0; % sigal 24.10.13 - use as flag for first time calculation
          % sigal 5.3.13 - loop over weights
          if withAttr == 0
            attWeightVector = 0;
          elseif withAttr == affinity_boost || withAttr == affinity_boost2
            attWeightVector = [0.1 0.2 0.4]; % sigal - use OASCA per Affinity/kmean
            if kmeanTypesVec==0 % find(kmeanTypesVec==0) %
              attWeightVector = [0 attWeightVector];
            end
          % elseif withAttr == 1 && addMissingAtt > 0 % sigal 23.10.13
          %   attWeightVector = 0.2:0.1:0.8; %0.8; %
          elseif affinity_calculation_type == affinity_calculation_common_friends
            if withAttr == 3
              attWeightVector = 0.3:0.1:0.5; %0.3:0.1:0.5;
            else
              attWeightVector = 0.2:0.1:0.4; %5; %%0.2:0.1:0.5; %0.3; %0.2:0.1:0.5; %0.3; %
            end
          elseif affinity_calculation_type == affinity_calculation_adamic_adar
            if withAttr == 4
              attWeightVector = 0.2:0.1:0.4; %0.3:0.1:0.5;
            else
              attWeightVector = 0.6:0.1:0.8; %0.4:0.1:0.6; %0.5:0.1:0.7; %8; %0.4:0.1:0.7; %0.8; %0.5:0.1:0.8;
            end
          elseif affinity_calculation_type == affinity_calculation_AA_RCN
            attWeightVector = 0.1:0.1:0.9; %0.3; %0.2:0.1:0.8; %
          elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
            attWeightVector = 0.1:0.1:0.4; %0.8; %0.2:0.1:0.8; %
          else
            attWeightVector = attWeightVec;
          end
          if find(withAttr == [3 5 7])
            attWeightVector = [0 attWeightVector 1];
          end

          % run loop for attWeight
          for attWeight = attWeightVector
            affinity = 0;
            phsAffinity = 0;
            withAttrWeight = (withAttr+attWeight)*10;
            % sigal 3.1.13 - data is the same, the change is in the attAffinity
            if find(withAttr == [0 1 3 5 7 affinity_boost affinity_boost2])
              actual_graph_size = original_graph_size;
              num_attr_nodes = 0;
              data = data_untouched;
            elseif withAttr == 2 || withAttr == 4
              actual_graph_size = original_graph_size+totalAttNum;
              num_attr_nodes = totalAttNum;
              data = data_untouched_withAtt;
            else
              exception = MException('MissingNodes_S8b:invalidWithAttr', 'Invalid Attribute Type %d', withAttr);
              throw(exception);
            end
            last_known_node = actual_graph_size - num_missing_nodes;
            first_unk_node = last_known_node + 1;
            num_added_nodes = size(data,1) - last_known_node;

            if withAttr == affinity_boost || withAttr == affinity_boost2
              fprintf('calculating best results for affinity matrix, type %d (withAttr=%d)...\n', affinity_calculation_type, withAttrWeight);
              % Sigal 10.3.13 - TODO calc best results
              if attWeight == 0.9
                [test_clustering, best_alg] = ChooseBestResults(all_att_clustering,all_att_clustering_alg,withAttr);
              elseif attWeight == 0.1
                [test_clustering, best_alg] = ChooseBestResults(all_k1_clustering,all_k1_clustering_alg,withAttr);
              elseif attWeight == 0.3
                [test_clustering, best_alg] = ChooseBestResults(all_k3_clustering,all_k3_clustering_alg,withAttr);
              elseif attWeight == 0.2
                [test_clustering, best_alg] = ChooseBestResults(all_A2_clustering,all_A2_clustering_alg,withAttr);
              elseif attWeight == 0.4
                [test_clustering, best_alg] = ChooseBestResults(all_A4_clustering,all_A4_clustering_alg,withAttr);
              else
                [test_clustering, best_alg] = ChooseBestResults(all_clustering,all_clustering_alg,withAttr);
              end
              % Sigal 10.3.13 - TODO sum all times
              affinity_calc_time = 0;
              graph_predict_time = 0;
              reduce_dim_time = 0;
              att_affinity_calc_time = 0;
              phs_att_affinity_calc_time = 0;
              phs_img_affinity_calc_time = 0;
            else
              % calculate the affinity / similarity matrix
              fprintf('calculating affinity matrix, type %d (withAttr=%d)...\n', affinity_calculation_type, withAttrWeight);
              if withAttr == 2
                tic %affinity_calc_time in seconds
                affinity = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, 0);
                affinity_calc_time = toc; %affinity_calc_time
              %sigal - 12.11.13 - calculate once for each type
              elseif nnz(netAffinity) == 0 && kmeanTypesVec==0 % find(kmeanTypesVec==0)%
                tic %affinity_calc_time
                %sigal - adjust to withAttr flag TOCHECK
                affinity = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
                affinity_calc_time = toc; %affinity_calc_time
                netAffinity = affinity;
                % if affinity_calculation_type ~= affinity_calculation_AA_RCN
                %   diffAff = affinity(first_unk_node:end,first_unk_node:end)-phsAffinity;
                %   fprintf('nnz diffAff %d \n',full(nnz(diffAff)));
                % end
              else
                affinity = netAffinity;
                affinity_calc_time = 0;
              end
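              % For the k-means variants only the placeholder rows of the affinity are
              % built (CalcPHsAffinity below); the resulting phsAffinity is then fused
              % with the attribute/image placeholder affinities, either by appending them
              % as extra columns (attWeight == 0) or by the convex combination
              % (1-attWeight)*network + attWeight*attribute/image.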
              if withAttr == 4
                tic %affinity_calc_time in seconds
                phsAffinity = CalcPHsAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
                affinity_calc_time = toc; %affinity_calc_time
              %sigal - 12.11.13 - calculate once for each type
              elseif nnz(phsNetAffinity) == 0 && kmeanTypesVec==1 % find(kmeanTypesVec==1)%
                tic %affinity_calc_time in seconds
                phsNetAffinity = CalcPHsAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
                phsAffinity = phsNetAffinity;
                affinity_calc_time = toc; %affinity_calc_time
              else
                %affinity_calc_time = 0;
                phsAffinity = phsNetAffinity;
              end

              %sigal - 12.6.13 - move after calcAffinity so we can calculate only the reduced entries
              if withAttr == 1 && nnz(attAffinity) == 0
                nodesToKeep = NodesToKeep(affinity, first_unk_node, 1);
                fprintf('nodesToKeep %d \n',full(sum(nodesToKeep)));
                % Sigal TODO - what if addMissingAtt > 0 ??
                tic %att_affinity_calc_time in seconds
                fprintf('calculating attributes affinity matrix...\n');
                debugCalcAttr = 0; %% Sigal 17.10.13 debugging nodesToKeep & maxAttStat
                debugAddMissingAtt = 0; %% sigal 16.12.13 backward - use 0 instead of addMissingAtt;
                if debugCalcAttr == 1
                  attAffinity = CalcAttributesAffinity_S5(data0, attData, last_known_node, debugAddMissingAtt, attAffinityThreshold);
                else
                  attAffinity = CalcAttributesAffinity_S5(data0, attData, last_known_node, debugAddMissingAtt, attAffinityThreshold, nodesToKeep);
                end
                att_affinity_calc_time = toc; %att_affinity_calc_time
              end
              %sigal - 12.11.13 - calculate once for each type
              if (withAttr == 3 || withAttr == 7) && nnz(phsAttAffinity) == 0
                tic %att_affinity_calc_time in seconds
                phsAttAffinity = netAffNormFactor3*CalcPHsAffinityByAttributes(data0, attData, last_known_node, addMissingAtt, attAffinityThreshold);
                phs_att_affinity_calc_time = toc; %att_affinity_calc_time
              end
              %sigal - 22.2.14 - calculate once for each type %%%%% TODO
              if (withAttr == 5 || withAttr == 7) && nnz(phsImgAffinity) == 0
                tic %img_affinity_calc_time in seconds
                phsImgAffinity = netAffNormFactor3*CalcPHsAffinityByImages(data0, imagesData, last_known_node, missing_nodes_mapping, imgMissProb, imgSimType, imgSimProbDiff);
                phs_img_affinity_calc_time = toc; %att_affinity_calc_time
              end
              if withAttr == 3 %sigal - 22.2.14 - SAMI_AK
                if attWeight == 0
                  if nnz(phsAttAffinity2) == 0
                    tic %att_affinity_calc_time in seconds
                    phsAttAffinity2 = netAffNormFactor3*CalcPHsAffinityByAttributes(data0, attData, last_known_node, addMissingAtt, attAffinityThreshold, 1);
                    phs_att_affinity_calc_time = toc; %att_affinity_calc_time
                  end
                  phsAffinity = [netAffNormFactor2*phsAffinity phsAttAffinity2]; % sigal 5.11.13 original order with factor 10
                else
                  phsAffinity = (1-attWeight)*phsAffinity+attWeight*phsAttAffinity;
                end
              elseif withAttr == 5 %sigal - 22.2.14 - PMI %%%%% TODO
                if attWeight == 0
                  phsAffinity = [netAffNormFactor2*phsAffinity phsImgAffinity]; % sigal 5.11.13 original order with factor 10
                else
                  phsAffinity = (1-attWeight)*phsAffinity+attWeight*phsImgAffinity;
                end
              elseif withAttr == 7 %sigal - 22.2.14 - PMI+SAMI %%%%% TODO
                if attWeight == 0
                  if nnz(phsAttAffinity2) == 0
                    tic %att_affinity_calc_time in seconds
                    phsAttAffinity2 = netAffNormFactor3*CalcPHsAffinityByAttributes(data0, attData, last_known_node, addMissingAtt, attAffinityThreshold, 1);
                    phs_att_affinity_calc_time = toc; %att_affinity_calc_time
                  end
                  phsAffinity = [netAffNormFactor2*phsAffinity phsAttAffinity2 phsImgAffinity];
                else
                  phsAffinity = (1-attWeight)*phsAffinity+attWeight*(phsAttAffinity+phsImgAffinity)/2;
                end
              end
            end
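            % SAMI-A merge below: blend the full network affinity with the attribute
            % affinity over the known-node block, affinity = (1-attWeight)*network +
            % attWeight*attribute; WeightedSum appears to be the C implementation
            % referenced in the comment inside the block.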
            % sigal 3.1.13 - weighted affinity
            if withAttr == 1
              fprintf('merge affinity matrix with attributes affinity\n');
              if debugAddMissingAtt > 0
                e = size(affinity,1);
              else
                e = last_known_node;
              end
              tic %att_affinity_calc_time
              %Sigal - 16.6.13 - use full C implementation
              affinity = WeightedSum(affinity, attAffinity, attWeight, e);
              %Sigal - 17.6.13 free memory
              if original_graph_size > 20000
                fprintf('free attAffinity memory\n');
                clear('attAffinity');
                attAffinity = 0;
                fprintf('free netAffinity memory\n');
                clear('netAffinity');
                netAffinity = 0;
              end
              % affinity(1:e, 1:e) = affinity(1:e,1:e)*(1-attWeight)+attAffinity(1:e,1:e)*attWeight;
              % nnz1 = nnz(affinity);
              % nnz2 = nnz(affinity2);
              % aaa = affinity2-affinity;
              % nnz3 = nnz(aaa);
              % fprintf('nnz affinity: nnz1=%d, nnz2=%d, nnz3=%d\n',nnz1,nnz2,nnz3);
              att_affinity_merge_time = toc; %att_affinity_merge_time
            end

            %TODO: extend the dimension reduction to adding missing links / reclustering
            %Sigal/ron - TO RECHECK ron TODO (done?)
            %Sigal - run always with 1 (ron)
            for reduce_dimensions = [1] %0 must be first because it does not change the affinity matrix
              reduce_dim_time = 0;
              skip_reduce_dimensions = find(withAttr == [3 4 5 7 affinity_boost affinity_boost2]); %Sigal 22.1.14 %%% TOCHECK
              if reduce_dimensions == 1 && nnz(affinity) > 0 && ~skip_reduce_dimensions
                fprintf('reduce dimensions\n');
                tic %ReduceDimensions
                [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node);
                reduce_dim_time = toc; %ReduceDimensions
                fprintf('new dimensions %d\n',size(affinity,1));
              end
              %sigal - why each iteration? simple calculation - can be done once
              %sigal - adjust to withAttr flag TODO
              fprintf('calculating true clustering\n');
              true_clustering = BuildTrueClustering(missing_nodes_mapping, original_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node);
              %figure,imshow(affinity,[]), title('Affinity Matrix')

              %sigal - use 0:1 if we want to compare with unknown #missNodes
              % (type=2 wasn't tested by ron)
              for num_clusters_known = [1] %[0, 1]
                % sigal 29.7.13
                % Test other clustering kmean types
                if affinity_calculation_type == affinity_calculation_random_clustering || affinity_calculation_type == affinity_boost || affinity_calculation_type == affinity_boost2
                  kmeanTypes = 1;
                elseif withAttr == 0
                  kmeanTypes = kmeanTypesVec;
                elseif withAttr == 1 || withAttr == 2
                  kmeanTypes = 0;
                elseif find(withAttr == [3 4 5 7])
                  kmeanTypes = 1;
                else
                  kmeanTypes = kmeanTypesVec;
                end
                % loop over added kmeanTypes (clustering types)
                for kmeanType = kmeanTypes
                  %sigal - adjust to withAttr flag TODO - which params? data_untouched, original_graph_size
                  k = DetermineNumberOfClusters(num_clusters_known, data_untouched, original_graph_size, num_missing_nodes, num_added_nodes);
                  debugEstimateK = 0;
                  if debugEstimateK == 1 && affinity_calculation_type == affinity_calculation_common_friends
                    for type = [0,3,4,8]
                      estK = DetermineNumberOfClusters(type, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes);
                      fprintf('debugEstimateK: type=%d, estK=%d\n',type,estK);
                    end
                  end
                  if num_clusters_known == 1
                    withAttrC = 0;
                  elseif num_clusters_known == 0
                    withAttrC = 10;
                  else
                    withAttrC = num_clusters_known*10;
                  end
                  % sigal 15.10.13 - add kmeanType & num_clusters to alg type
                  withAttrC = withAttrC+kmeanType;
                  withAttrWeight = withAttrC*100+(withAttr+attWeight)*10;
                  %sigal - first_unk_node might change after ReduceDimensions
                  last_known_node = first_unk_node - 1;
                  if withAttr ~= affinity_boost && withAttr ~= affinity_boost2
                    %sigal - adjust to withAttr flag TOCHECK
                    fprintf('predicting the graph\n');
                    tic %graph_predict_time
                    if kmeanType == 1
                      [newData, test_clustering] = PredictGraph(phsAffinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, kmeanType);
                    else
                      [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, kmeanType);
                    end
                    graph_predict_time = toc; %graph_predict_time
                  end
                  out_data = newData;
                  out_clusterrr = test_clustering;

                  %sigal - when to use - only if there are unknown placeholders
                  if compensate_for_unknown_placeholers == 1
                    fprintf('*** running with compensate_for_unknown_placeholers mode ...\n');
                    S = orig_S;
                    if size(newData,1) > size(S,2)
                      %for breakpoint
                      tttt = 98;
                    end
                    sigma = 1/4;
                    S = S(1:size(newData,1));
                    S = S + randn(size(S)) * sigma;
                    sorted_S = sort(S, 'descend');
                    %sum over the columns and find the columns which
                    %indicate at least one neighbor
                    %neighbors_of_new_nodes = find(sum(newData(first_unk_node:size(newData,1), :)));
                    first_united_node = size(newData,1) - k + 1;
                    if affinity_calculation_type == affinity_calculation_katz_beta_0_05
                      newAffinity = CalcAffinityByKatzBeta_Sparse( newData, 0.05, 4 );
                    elseif affinity_calculation_type == affinity_calculation_adamic_adar
                      newAffinity = CalculateAffinityByAdamicAdar_Sparse(newData, size(newData,1), 0, 0);
                    elseif affinity_calculation_type == affinity_calculation_common_friends
                      newAffinity = CalcAffinityByCommonNeighbors_Sparse(newData, size(newData,1), 0);
                    end
                    newNodesAffinity = newAffinity(first_united_node:size(newAffinity,1), :);
                    newNodesAffinity(newNodesAffinity>=1) = 0;
                    newNodesAffinity = newNodesAffinity / max(max(newNodesAffinity));
                    %newNodesAffinity = newNodesAffinity / 2;
                    newNodesAffinity(newData(first_united_node:size(newAffinity,1), :) >= 1) = 0;
                    newNodesAffinity = (newNodesAffinity / max(max(newNodesAffinity)));
                    %%%%trying to take only the
                    %%%%k highest affinities
                    sortedNewNodesAffinity = sort(newNodesAffinity(:), 'descend');
                    %affinityThreshold = sortedNewNodesAffinity(k + size(neighbors_of_new_nodes, 2));
                    newNodesAffinity_orig = newNodesAffinity;
                  end % compensate_for_unknown_placeholers == 1

                  if percent_known_placeholders < 1 && compensate_for_unknown_placeholers == 1
                    %calculating as a function of number of links added
                    meanNumLinks = mean(sum(data(1:last_known_node, 1:last_known_node)));
                    maxNumLinksToAdd = meanNumLinks * num_placeholders;
                    maxNumLinksToAdd = min(maxNumLinksToAdd, 25);
                    %sigal 27.6.13
                    linksToAdd = round(compensate_vec*num_missing_nodes);
                  else
                    maxNumLinksToAdd = 0;
                    %sigal 27.6.13
                    linksToAdd = 0;
                  end
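                  % Compensation loop: each entry of linksToAdd is a number of extra links
                  % to attach from the strongest entries of S (neighbors of the hidden
                  % placeholders) to the closest predicted node before re-evaluating; with
                  % compensate_for_unknown_placeholers == 0 the loop below runs exactly once.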
                  %%%%% for distance as function of num placeholders %%%%
                  %max_neighbors = S >= sorted_S(maxNumLinksToAdd);
                  %sigal 27.6.13 %for numLinksToAdd = 0 : maxNumLinksToAdd
                  origWithAttrWeight = withAttrWeight;
                  for linksInx = 1:size(linksToAdd,2)
                    numLinksToAdd = linksToAdd(linksInx);
                    %Sigal - 13.10.13 - TODO - fix withAttr flag calculation
                    withAttrXX = compensate_vec(linksInx)*100; % numLinksToAdd
                    withAttrWeight = origWithAttrWeight+1000*withAttrXX;
                    if compensate_for_unknown_placeholers == 1
                      newNodesAffinity = newNodesAffinity_orig;
                      neighbors = [];
                      if numLinksToAdd > 0
                        neighbors = find(S >= sorted_S(numLinksToAdd), numLinksToAdd);
                      end
                      newDataWithMissingLinks = newData; % partial graph with the clustered nodes
                      newDataForClustering = data0; % partial graph with partial PHs
                      for neighbor = neighbors
                        [value, closest_new_node] = max(newNodesAffinity(:,neighbor));
                        closest_new_node = closest_new_node(1);
                        newDataWithMissingLinks(first_united_node + closest_new_node - 1, neighbor) = 1;
                        newDataWithMissingLinks(neighbor, first_united_node + closest_new_node - 1) = 1;
                        newPlaceholder = zeros(1, size(newDataForClustering,2));
                        newPlaceholder(neighbor) = 1;
                        newDataForClustering = [newDataForClustering, newPlaceholder'; newPlaceholder, 0];
                      end
                      affinityWithS = CalcAffinity( newDataForClustering, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
                      if reduce_dimensions == 1
                        [affinityWithS, num_placeholdersWithS, first_unk_node_with_s] = ReduceDimensions(affinityWithS, first_united_node);
                      else
                        num_placeholdersWithS = num_placeholders + length(neighbors);
                        first_unk_node_with_s = first_united_node;
                      end
                      % Sigal 15.10.13 - we are not using the new clustering result as in this
                      % case we are using the GED as the main measure
                      fprintf('^%s', newPredictedGraph)
                      %remap the original data so that the known nodes match the
                      %predicted data and the missing nodes match the predicted
                      %nodes created from each cluster
                      %sigal - adjust to withAttr flag TODO original_data
                      perm_vector = 1:size(original_data,1);
                      perm_vector(missing_nodes_mapping(1,:)) = [];
                      perm_vector = [perm_vector, missing_nodes_mapping(1,:)];
                      remapped_data = original_data(perm_vector,perm_vector);
                      [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, first_united_node : size(remapped_data,1), neighbors); %changed from neighbors to perm(max_neighbors) - to be fair when there are more neighbors
                      %small_data2 = DecreaseGraphSize(newData, last_known_node+1 : size(newData,1));
                      %new_nodes_affinity_sum = sum(sum(newNodesAffinity));
                      %in case there is an empty cluster, the unrelated nodes may contain a node index that does not exist
                      %in the predicted graph (which may contain less nodes)
                      indices_to_remove(indices_to_remove > size(newPredictedGraph,2)) = [];
                      small_data2 = newData;
                      small_data2(indices_to_remove,:) = [];
                      small_data2(:,indices_to_remove) = [];
                      %small_data3 = DecreaseGraphSize(newDataWithMissingLinks, last_known_node+1 : size(newDataWithMissingLinks,1));
                      small_data3 = newDataWithMissingLinks;
                      small_data3(indices_to_remove,:) = [];
                      small_data3(:,indices_to_remove) = [];
                      small_data4 = newPredictedGraph;
                      small_data4(indices_to_remove,:) = [];
                      small_data4(:,indices_to_remove) = [];
                      %%% Sigal 27.1.13 - save reduced graphs for GED
                      if dumpSmallFlag == 1
                        saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data, 1);
                        saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data2, 2);
                      end
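                      % GraphEditDistance is commented out throughout this file; the GED is
                      % presumably computed offline from the graphs dumped by saveSmallData,
                      % and 99 is stored below as a placeholder edit-distance value.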
                      %only calculate on the first iteration to save time
                      if numLinksToAdd == 0
                        edit_distance = 99; %GraphEditDistance( small_data, small_data2, num_missing_nodes );
                        edit_distance2 = edit_distance;
                        edit_distance3 = edit_distance;
                      else
                        edit_distance2 = 99; %GraphEditDistance( small_data, small_data3, num_missing_nodes );
                        edit_distance3 = 99; %GraphEditDistance( small_data, small_data4, num_missing_nodes );
                        if dumpSmallFlag == 1
                          saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data3, 3);
                          saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data4, 4);
                        end
                      end
                    elseif withAttr == affinity_boost || withAttr == affinity_boost2 % NOT compensate_for_unknown_placeholers == 1
                      if dumpSmallFlag == 1
                        saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, best_alg, 1);
                        saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, best_alg, 2);
                      end
                    else % NOT compensate_for_unknown_placeholers == 1
                      %sigal - adjust to withAttr flag TODO original_data
                      perm_vector = 1:size(original_data,1); % set values 1:n
                      perm_vector(missing_nodes_mapping(1,:)) = []; % according to 1st line of missing nodes remove indexes from perm
                      perm_vector = [perm_vector, missing_nodes_mapping(1,:)]; % add missing nodes as last indexes
                      remapped_data = original_data(perm_vector,perm_vector); % return original graph according to perm
                      out_data_p = out_data; %(perm_vector,perm_vector);
                      graphs_out = sprintf('../output/graphed_%d.mat', iter);
                      save(graphs_out, 'out_data_p', 'out_data', 'remapped_data', 'original_data');
                      %sigal - return data only with missing nodes and their friends
                      % reduce size to improve GED calculation
                      [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, (size(remapped_data,1) - num_missing_nodes + 1) : size(remapped_data,1), []); %changed from neighbors to perm(max_neighbors) - to be fair when there are more neighbors
                      %Sigal - now according to original data remove indexes,
                      %adjust/reduce the predicted graph returned at newData
                      small_data2 = newData;
                      %sigal - adjust to withAttr flag - remove att nodes TOCHECK
                      if num_attr_nodes > 0
                        small_data2(1:num_attr_nodes,:) = [];
                        small_data2(:,1:num_attr_nodes) = [];
                      end
                      %sigal 26.11.12 - change newData to small_data2 after resizing
                      indices_to_remove(indices_to_remove > size(small_data2,2)) = [];
                      small_data2(indices_to_remove,:) = [];
                      small_data2(:,indices_to_remove) = [];
                      %fprintf('&%s', small_data2)
                      %fprintf('&&%s', small_data)
                      %%% Sigal 27.1.13 - save reduced graphs for GED
                      withAttrWeight = origWithAttrWeight+1000*(1-percent_known_placeholders)*10;
                      if dumpSmallFlag == 1
                        saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data, 1);
                        saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data2, 2);
                      end
                      %sigal 6.12.12 - *** TODO *** temporary for test only TODO
                      %edit_distance = GraphEditDistance( small_data, small_data2, num_missing_nodes );
                      edit_distance = 99;

                      % Sigal 10.3.13 - save results (without random)
                      if attWeight ~= 1 && affinity_calculation_type ~= affinity_calculation_random_clustering
                        currAlg = affinity_calculation_type*1000+withAttrWeight;
                        % sigal 21.10.13 (only on SC)
                        if kmeanType == 0
                          res_index = size(all_clustering,2) + 1;
                          all_clustering(:, res_index) = test_clustering;
                          all_clustering_alg(res_index) = currAlg;
                          if withAttr ~= 0
                            att_index = size(all_att_clustering,2) + 1;
                            all_att_clustering(:, att_index) = test_clustering;
                            all_att_clustering_alg(att_index) = currAlg;
                          end
                        end
                        % sigal 17.10.13 - find best based on kmeanType == 2
                        if kmeanType == 1
                          k_index = size(all_k1_clustering,2) + 1;
                          all_k1_clustering(:, k_index) = test_clustering;
                          all_k1_clustering_alg(k_index) = currAlg;
                          if affinity_calculation_type == affinity_calculation_common_friends
                            k_index = size(all_A2_clustering,2) + 1;
                            all_A2_clustering(:, k_index) = test_clustering;
                            all_A2_clustering_alg(k_index) = currAlg;
                          elseif affinity_calculation_type == affinity_calculation_adamic_adar
                            k_index = size(all_A4_clustering,2) + 1;
                            all_A4_clustering(:, k_index) = test_clustering;
                            all_A4_clustering_alg(k_index) = currAlg;
                          end
                        end
                        if kmeanType == 2 || kmeanType == 3
                          k_index = size(all_k3_clustering,2) + 1;
                          all_k3_clustering(:, k_index) = test_clustering;
                          all_k3_clustering_alg(k_index) = currAlg;
                        end
                      end
                    end % compensate_for_unknown_placeholers == 1

                    % calculate the purity for the actual clustering
                    fprintf('calculating purity\n');
                    try
                      %sigal - calculation done according to the definition
                      temp_purity = ClusteringPurity(true_clustering, test_clustering);
                    catch ME1
                      temp_purity = 99; %Sigal 12.8.12 - add invalid value in case of exception
                      ddddd = 1;
                    end
                    clusters_out = sprintf('../output/OUTp_%d.mat', iter);
                    save(clusters_out, 'true_clustering', 'test_clustering');

                    % save results
                    fprintf('saving results (withAttr %d, purity %.5f) \n',withAttrWeight,temp_purity);
                    %oooo = sprintf('/Users/armin/Desktop/output/OUT_%d.mat', k);
                    %save(oooo, 'withAttrWeight', 'temp_purity');
                    curr_index = size(purity,2) + 1;
                    purity(curr_index).score = temp_purity;
                    purity(curr_index).score_sq = temp_purity^2;
                    purity(curr_index).edit_distance = edit_distance;
                    if compensate_for_unknown_placeholers == 1
                      purity(curr_index).numLinksToAdd = numLinksToAdd;
                      purity(curr_index).edit_distance_missing_links = edit_distance2;
                      purity(curr_index).edit_distance_new_clustering = edit_distance3;
                      fprintf('numLinksToAdd - %d\nedit_distance - %d\nedit_distance_missing_links - %d\nedit_distance_new_clustering - %d\n', numLinksToAdd, full(edit_distance), full(edit_distance2), full(edit_distance3));
                    end
                    purity(curr_index).withAttr = withAttrWeight; % sigal 15.10.13
                    purity(curr_index).num_missing_nodes_idx = num_missing_nodes_idx;
                    purity(curr_index).num_missing_nodes = num_missing_nodes_arr(num_missing_nodes_idx);
                    purity(curr_index).affinity_calculation_type = affinity_calculation_type;
                    purity(curr_index).addMissingAtt = addMissingAtt; %sigal 8.11.13
                    purity(curr_index).cluster_only_missing_nodes = cluster_only_missing_nodes;
                    purity(curr_index).num_clusters_known = num_clusters_known;
                    purity(curr_index).num_clusters_estimated = k; % sigal 26.11.12
                    purity(curr_index).num_placeholders = num_placeholders;
                    purity(curr_index).num_placeholders_to_remove = num_placeholders_to_remove;
                    purity(curr_index).num_attr_nodes = totalAttNum; % sigal 3.1.13
                    purity(curr_index).unite_common_friends = unite_common_friends;
                    purity(curr_index).iteration = 1;
                    purity(curr_index).test_clustering = test_clustering;
                    purity(curr_index).true_clustering = true_clustering;
                    purity(curr_index).graph_size = graph_size;
                    purity(curr_index).graph_edges = graph_edges; % sigal - number of edges
                    purity(curr_index).graph_attr_edges = graph_attr_edges; % sigal - number of attribute edges
                    purity(curr_index).inverse_purity = 99; % Sigal 12.8.12 - tmp ???
                    % CalculateInversePurity(true_clustering, test_clustering);
                    purity(curr_index).NMI = 99; %CalcNormalizedMutualInformation(true_clustering, test_clustering);
                    purity(curr_index).removed_nodes = removed_nodes;
                    purity(curr_index).percent_known_placeholders = percent_known_placeholders;
                    purity(curr_index).reduce_dimensions = reduce_dimensions;
                    purity(curr_index).missing_nodes_mapping = missing_nodes_mapping;
                    purity(curr_index).compensate_for_unknown_placeholers = compensate_for_unknown_placeholers;
                    purity(curr_index).affinity_calc_time = affinity_calc_time;
                    purity(curr_index).reduce_dim_time = reduce_dim_time;
                    purity(curr_index).graph_predict_time = graph_predict_time;
                    if withAttr == 1 %% save this time only for this variation
                      purity(curr_index).att_affinity_calc_time = att_affinity_calc_time; %sigal 14.3.13
                      purity(curr_index).affinity_calc_time = affinity_calc_time+att_affinity_merge_time; %sigal 14.3.13
                    elseif withAttr == 2 || withAttr == 4 %% save this time only for this variation
                      purity(curr_index).att_affinity_calc_time = att_combine_calc_time; %sigal 13.3.13
                    elseif withAttr == 3 %% sigal 22.1.14
                      purity(curr_index).att_affinity_calc_time = phs_att_affinity_calc_time;
                    elseif withAttr == 5 %% sigal 22.1.14
                      purity(curr_index).att_affinity_calc_time = phs_img_affinity_calc_time;
                    elseif withAttr == 7 %% sigal 22.1.14
                      purity(curr_index).att_affinity_calc_time = phs_att_affinity_calc_time+phs_img_affinity_calc_time;
                    else
                      purity(curr_index).att_affinity_calc_time = 0; %sigal 12.3.13
                    end

                    if compensate_for_unknown_placeholers == 0
                      break
                    end
                  end %linksToAdd

                  % % purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
                  % %   purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity;
                  % % purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
                  % %   purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity^2;
                  %
                  LogMsg(sprintf('S8b: Size=%d,Miss=%d,PHs=%d,Affinity=%d,Att=%d,Purity=%.3f', ...
                    graph_size,purity(curr_index).num_missing_nodes,num_placeholders,affinity_calculation_type,withAttrWeight,temp_purity));
                  %fprintf('affinity_calculation_type = %d, unite_common_friends = %d\n', affinity_calculation_type, unite_common_friends);
                  fprintf('Graph size: %d, Number of missing nodes: %d, Purity: %f \n' ,graph_size, num_missing_nodes, temp_purity);
                  %fprintf('============================================\n\n\n');
                  %clear U; clear eigValues; clear eigVectors;
                end %kmeanTypes (clustering types)
              end %num_clusters_known
            end %reduce_dimensions
            clear('affinity');
            clear('phsAffinity');
          end % run loop for attWeight
        end % run over - once as original without attributes and next with attributes/images
        clear('netAffinity');
        clear('phsNetAffinity');
      end % loop over different affinity_types
      clear('attAffinity');
      clear('phsAttAffinity');
      clear('phsAttAffinity2');
    end %loop over percent_known_placeholders_vec
  end %loop over addMissingAtt
end %loop over num_missing_nodes...

end %main function


% sigal - 29.10.13
% calc only PHs affinity
function [affinity] = CalcPHsAffinity( data, affType, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt)
global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;
global affinity_calculation_AA_RCN;
global netAffNormFactor1;
global netAffNormFactor2;

firstPH = actual_graph_size-num_missing_nodes+1;
normAttWeight = attWeight / netAffNormFactor2; % only for SAMI-N
if affType == affinity_calculation_euclid
  LogMsg(sprintf('*** ERROR: MissingNodes_S8b:CalcPHsAffinity - affType %d not supported !!!',affType));
  return;
elseif affType == affinity_calculation_shortest_path
  LogMsg(sprintf('*** ERROR: MissingNodes_S8b:CalcPHsAffinity - affType %d not supported !!!',affType));
  return;
elseif affType == affinity_calculation_common_friends
  affinity = CalcPHsAffinityByRCN(data, actual_graph_size, num_missing_nodes, num_attr_nodes, normAttWeight, addMissingAtt);
elseif affType == affinity_calculation_random_clustering
  affinity = data(firstPH:end, firstPH:end); %just a placeholder...
elseif affType == affinity_calculation_adamic_adar
  affinity = CalcPHsAffinityByAA( data, actual_graph_size, num_missing_nodes, 1, num_attr_nodes, normAttWeight, addMissingAtt);
elseif affType == affinity_calculation_katz_beta_0_5
  affinity = CalcPHsAffinityByKatzBeta( data, 0.5, 3, firstPH );
elseif affType == affinity_calculation_katz_beta_0_05
  affinity = CalcPHsAffinityByKatzBeta( data, 0.05, 4, firstPH );
elseif affType == affinity_calculation_katz_beta_0_005
  affinity = CalcPHsAffinityByKatzBeta( data, 0.005, 4, firstPH );
elseif affType == affinity_calculation_AA_RCN
  affinity = CalcPHsAffinity( data, affinity_calculation_adamic_adar, actual_graph_size, num_missing_nodes, num_attr_nodes, normAttWeight, addMissingAtt);
  affinity2 = CalcPHsAffinity( data, affinity_calculation_common_friends, actual_graph_size, num_missing_nodes, num_attr_nodes, normAttWeight, addMissingAtt);
  affinity = [affinity2 affinity];
end
%sigal 14.11.13
affinity = affinity * netAffNormFactor1;
end % function CalcPHsAffinity
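
% CalcAffinity builds the full (nodes + placeholders) affinity matrix for the
% requested link-prediction measure: shortest-path based measures, common
% neighbors, Adamic/Adar, Katz (several beta values), a random-clustering
% placeholder, or the AA+RCN weighted combination.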
%sigal - adjust to withAttr flag TODO
function [affinity] = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt)
global affinity_calculation_shortest_path;
global affinity_calculation_euclid;
global affinity_calculation_common_friends;
global affinity_calculation_random_clustering;
global affinity_calculation_adamic_adar;
global affinity_calculation_katz_beta_0_5;
global affinity_calculation_katz_beta_0_05;
global affinity_calculation_katz_beta_0_005;
global affinity_calculation_AA_RCN;
global netAffNormFactor1;

% sigal 11.2.14 - backward compatibility with ASONAM 13
addMissingAtt = 0;
if affinity_calculation_type == affinity_calculation_euclid
  sp_mat = graphallshortestpaths(data);
  %remove INF values
  max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
  sp_mat_euclid = sp_mat;
  sp_mat_euclid(sp_mat == Inf) = max_value;
  affinity = CalculateAffinity(sp_mat_euclid);
  %affinity = exp(-(sp_mat.^2))/(2 * 0.3^2);
elseif affinity_calculation_type == affinity_calculation_shortest_path
  % max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
  % sp_mat_euclid = sp_mat;
  % sp_mat_euclid(sp_mat == Inf) = max_value;
  % affinity = (sp_mat_euclid + 1).^(-affinity_exp_factor);
  %affinity = spfun(affinityFunc, data);
  affinity = graphallshortestpaths(data);
  affinity = affinity .^ -2;
  affinity(affinity == Inf) = 1; %added on 05/11/11
elseif affinity_calculation_type == affinity_calculation_common_friends
  affinity = CalcAffinityByCommonNeighbors_Sparse(data, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
  %affinity = CalcAffinityByCommonNeighbors(data, actual_graph_size, num_missing_nodes);
elseif affinity_calculation_type == affinity_calculation_random_clustering
  affinity = data; %just a placeholder...
elseif affinity_calculation_type == affinity_calculation_adamic_adar
  affinity = CalculateAffinityByAdamicAdar_S3o( data, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, 1 , addMissingAtt);
  %%affinity2 = CalculateAffinityByAdamicAdar_S2( data, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, 1 , addMissingAtt);
  if nnz(affinity) < 5
    x = 8;
  end
  % diff = affinity2-affinity;
  % if nnz(diff) > 0
  %   LogMsg(sprintf('*** WARNING: affinityAA - mismatch (nnz=%d)',nnz(diff)));
  %   zz = 999;
  % end
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_5
  affinity = CalcAffinityByKatzBeta_Sparse( data, 0.5, 3, num_attr_nodes );
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
  affinity = CalcAffinityByKatzBeta_Sparse( data, 0.05, 4, num_attr_nodes );
elseif affinity_calculation_type == affinity_calculation_katz_beta_0_005
  affinity = CalcAffinityByKatzBeta_Sparse( data, 0.005, 4, num_attr_nodes );
elseif affinity_calculation_type == affinity_calculation_AA_RCN
  w2 = 0.5;
  affinity = CalcAffinity( data, affinity_calculation_adamic_adar, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
  affinity2 = CalcAffinity( data, affinity_calculation_common_friends, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
  affinity = WeightedSum(affinity, affinity2, w2, size(affinity,1));
end
%sigal 22.11.13
%affinity = affinity * 10; %%netAffNormFactor1;
end % function CalcAffinity
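
% SpectralClustering scales the affinity by the square root of its row sums, takes
% the k leading eigenvectors (falling back to a looser eigs tolerance and finally
% to a random assignment if eigs fails), row-normalizes them, and k-means clusters
% the resulting embedding; with cluster_only_missing_nodes only the placeholder
% rows are clustered.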
%sigal - review code and messages TODO
function [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinityType, cluster_only_missing_nodes)
fprintf('kmeans clustering type=0 (SP)\n');
first_unk_node = size(affinity,1) - num_placeholders + 1;
diagonal = sum(affinity, 2); %sum the rows
D = sparse(diag(diagonal)); %D is the matrix whose diagonal is the sum of the rows of A
clear('diagonal');
fprintf('calculating NL\n');
D = sqrt(D);
NL1 = D * affinity * D;
clear('D');
fprintf('calculating U - eigs\n');
fail = 0;
try
  [nEigVec,eigValues] = eigs(NL1,k);
catch ME1 % variable that gets the exception
  opts.tol = 1e-1;
  try
    fprintf('calculating U - 2nd try\n');
    [nEigVec,eigValues] = eigs(NL1,k, 'LM', opts);
  catch ME2
    fail = 1;
  end
end
% select k largest eigen vectors
if fail == 0
  U = [];
  % construct the normalized matrix U from the obtained eigen vectors
  fprintf('calculating U - construct\n');
  for i=1:size(nEigVec,1)
    n = sqrt(sum(nEigVec(i,:).^2));
    U(i,:) = nEigVec(i,:) ./ n;
  end
  num_samples = size(affinity,1) - first_unk_node + 1;
  if cluster_only_missing_nodes == 1
    U(1:first_unk_node - 1 ,:) = []; %cluster only the missing nodes
  end
  fprintf('SC: run kmeans clustering\n');
  % perform kmeans clustering on the matrix U
  test_clustering = calcKMean(U, k, num_samples, affinityType);
else %fail == 0
  disp('Failed in finding eigenvectors - using random!');
  if cluster_only_missing_nodes == 0
    num_samples = size(affinity,1);
  else
    num_samples = num_placeholders;
  end
  test_clustering = randi(k, num_samples, 1);
end
end % function SpectralClustering


% Sigal 29.7.13
% add option for clustering with Kmean instead of SP
% options: kmean_type 0=SP, 3=kmean on PH (mxm), 2=Kmean on PH+Nodes (m*(m+n))
%          1=kmean on already cut affinity
function [test_clustering] = KMeanClustering(affinity, k, num_placeholders, affinityType, kmean_type)
first_unk_node = size(affinity,1) - num_placeholders + 1;
num_samples = num_placeholders;
if kmean_type == 1
  U = affinity;
elseif kmean_type == 3 % previous kmean_type == 1
  U(1:num_placeholders,1:num_placeholders) = affinity(first_unk_node:end,first_unk_node:end);
else
  U(1:num_placeholders,:) = affinity(first_unk_node:end,:);
end
fprintf('kmeans clustering type=%d\n',kmean_type);
% perform kmeans clustering on the matrix U
test_clustering = calcKMean(U, k, num_samples, affinityType);
end % function KMeanClustering


% Sigal 29.10.13
% perform kmeans clustering on the matrix U
% use same method for both KMeanClustering and SpectralClustering
function [test_clustering] = calcKMean(U, num_clusters, num_samples, affinityType)
global affinity_calculation_random_clustering;
if num_clusters > 99
  numReplicates = 1;
else
  numReplicates = 3;
end
fprintf('calcKMean\n');
% perform kmeans clustering on the matrix U
fail = 1;
while fail > 0
  try
    currK = num_clusters;
    % OPT: 'EmptyAction','singleton' - in case of an empty cluster just drop it
    % OPT: 'Replicates',3 - repeat run/start points
    [IDX,C, SUMD, D] = kmeans(U,currK,'EmptyAction','singleton','Replicates',numReplicates);
    fail = 0;
  catch ME1
    fail = fail + 1;
    if fail < 100
      %disp('error in kmeans clustering. trying again...');
    else
      %give up on clustering and select random clusters...
      IDX = randi(currK, size(U));
      fail = 0;
    end
  end
end
test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));
%if it's random just replace everything...
if affinityType == affinity_calculation_random_clustering
  test_clustering = randi(num_clusters, size(test_clustering,1), size(test_clustering,2));
end
end % function calcKMean


% sigal - adjust to withAttr flag TOCHECK - which data to use?
% Sigal 29.7.13
% add option for clustering with Kmean instead of SP
% original implementation with SP, i.e. use kmean_type = 0
% k is the number of returned clusters
function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, kmean_type)
% sigal - backwards compatibility
if nargin < 7
  kmean_type = 0;
end
last_known_node = size(data,1) - num_placeholders;
%%first_unk_node = last_known_node + 1; % Sigal 15.10.13 remove unused warning
if kmean_type == 0
  [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
else
  [test_clustering] = KMeanClustering(affinity, k, num_placeholders, affinity_calculation_type, kmean_type);
end
%sigal - what if #clusters is different than #missing ???
newNodes = CreateNewNodesFromClusters(data, test_clustering);
%sigal - ... means continue command at next line
newData = [data(1:last_known_node,1:last_known_node), newNodes(:, 1:last_known_node)'; ...
  newNodes(:,1:last_known_node), zeros(size(newNodes,1))];
end % function PredictGraph
%sigal - estimation can be wrong ???
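
% DetermineNumberOfClusters returns the number of clusters k: when the count is
% known (type 1) it is simply num_missing_nodes; otherwise k is estimated from the
% mean degree of the known part of the graph. For example, under type 0, if 40
% placeholder nodes were added and the known nodes have a mean degree of 8, the
% estimate is k = round(40 / floor(8)) = 5.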
%sigal - adjust to withAttr flag TODO
function [k] = DetermineNumberOfClusters(num_clusters_known, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes)
%determine k - the number of clusters
if num_clusters_known == 1
  k = num_missing_nodes;
else
  numKnownNodes = actual_graph_size - num_missing_nodes;
  sumKnownEdges = sum(sum(data_untouched(1 : numKnownNodes, 1 : numKnownNodes)));
  meanKnownEdges = sumKnownEdges/numKnownNodes;
  addedEdges = num_added_nodes*2; % undirected graph
  fprintf('EstimatedK: numKnownN=%d, meanKnownE=%.3f, addedE=%d, missN=%d, meanMissE=%.3f\n', ...
    numKnownNodes,full(meanKnownEdges),num_added_nodes,num_missing_nodes,addedEdges/num_missing_nodes);
  if num_clusters_known == 0
    %k = round(num_added_nodes / meanKnownEdges);
    k = round(num_added_nodes / floor(meanKnownEdges));
    %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
  elseif num_clusters_known == 2
    %guessing upper limit
    k = 2*round(num_added_nodes / meanKnownEdges);
    %fprintf('EstimatedK: type=%d, actual=%d, guessing upper limit %d\n',num_clusters_known, num_missing_nodes, k);
  elseif num_clusters_known == 3
    % e=a*n
    a = meanKnownEdges;
    e = sumKnownEdges+num_added_nodes;
    k = round(e/a-numKnownNodes);
    %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
  elseif num_clusters_known == 4
    % e=a*n^2
    a = meanKnownEdges/numKnownNodes;
    e = sumKnownEdges+addedEdges;
    k = round(sqrt(e/a)-numKnownNodes);
    %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
  elseif num_clusters_known == 5
    % e=a*n^2
    a = meanKnownEdges/numKnownNodes;
    e = sumKnownEdges+addedEdges;
    k = ceil(sqrt(e/a)-numKnownNodes);
    %fprintf('EstimatedK: type=%d, actual=%d, rounding k to %d\n',num_clusters_known, num_missing_nodes, k);
  elseif num_clusters_known == 6
    % e=a*n
    a = meanKnownEdges;
    e = sumKnownEdges+num_added_nodes;
    k = ceil(e/a-numKnownNodes);
  elseif num_clusters_known == 7
    k = ceil(num_added_nodes / meanKnownEdges);
  elseif num_clusters_known == 8
    k = round(num_added_nodes / meanKnownEdges);
  end
  LogMsg(sprintf('EstimatedK(size,PHs,type,actual,k):\t%d\t%d\t%d\t%d\t%d', ...
    actual_graph_size,num_added_nodes,num_clusters_known, num_missing_nodes, k),'EstimateK_Log2.txt');
end
end % function DetermineNumberOfClusters
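
% NodesToKeep and ReduceDimensions shrink the affinity matrix before clustering:
% only the columns with nonzero affinity to at least one placeholder (plus the
% placeholders themselves) are kept, which shortens the eigendecomposition and
% k-means steps while keeping every node that can influence the placeholder
% clustering.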
%find nodes with some affinity to one or more missing nodes
function [nodes_to_keep] = NodesToKeep(affinity, first_unk_node, includePHs)
affinity_sum = sum(affinity(first_unk_node:size(affinity,1),:)); %the sum of the affinity of placeholders to all other nodes
nodes_to_keep = (affinity_sum > 0); %keep only nodes which have some affinity to the placeholders
if includePHs == 1
  nodes_to_keep(first_unk_node:size(affinity,1)) = 1; %keep all the placeholders even if for some reason they have a sum of zero...
end
end % function NodesToKeep


%keep only missing node rows and their friends
function [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node)
num_placeholders = size(affinity,1) - first_unk_node + 1;
%keep only nodes which have some affinity to the placeholders and all of the placeholders
nodes_to_keep = NodesToKeep(affinity, first_unk_node, 1);
affinity = affinity(nodes_to_keep, nodes_to_keep);
first_unk_node = size(affinity,1) - num_placeholders + 1;
end % function ReduceDimensions


%return the true clustering according to the saved missing_nodes_mapping
function [true_clustering] = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node)
%sigal 25.11.12 - count nonzero cells (besides the first row) - i.e. number of placeholders
numMapping = 0;
%sigal 23.1.14 - start from the third row (first row original id, second row images profile)
for i = 3 : size(missing_nodes_mapping, 1)
  nz = find(missing_nodes_mapping(i,:));
  numMapping = numMapping + size(nz,2);
end
true_clustering = zeros(numMapping,1); %sigal 25.11.12
%true_clustering = []; %zeros(size(test_clustering, 1), 1);
for i = 3 : size(missing_nodes_mapping, 1)
  for j = 1 : size(missing_nodes_mapping,2)
    if missing_nodes_mapping(i,j) ~= 0
      true_clustering(missing_nodes_mapping(i,j) - actual_graph_size + num_missing_nodes, 1) = j; % missing_nodes_mapping(1, j);
    end
  end
end
%sigal - adjust to withAttr flag TODO
if percent_known_placeholders < 1
  true_clustering(placeholders_to_remove - last_known_node) = [];
end
end % function BuildTrueClustering


function saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_type, withAttr, missNodes, small_data, i)
%%% Sigal 24.1.13 - TODO
outFile = sprintf('%s_%d_%d_%d_%d_small_data_%d', dataFileName, iter, missNodes, affinity_type, withAttr, i);
if affinity_type == 9
  % save instead a dummy size (1) and the best_alg
  SaveIntMatrixToFile(strcat(dumpSmallDataPath, outFile,'_edges.txt'), small_data, 1);
else
  SaveAsciiGraph(dumpSmallDataPath, outFile, small_data, 1); %% also save graph size
end
end % function saveSmallData


function [best_clustering, best_alg] = ChooseBestResults(clusteringResults, clusteringAlg, type)
global affinity_boost;
global affinity_boost2;
if type == affinity_boost
  best_clustering_inx = ChooseBestResults1(clusteringResults);
elseif type == affinity_boost2
  best_clustering_inx = ChooseBestResults2(clusteringResults, clusteringAlg);
else
  fprintf('*** ERROR: ChooseBestResults: invalid type %d\n', type);
end
best_clustering = clusteringResults(:,best_clustering_inx);
best_alg = clusteringAlg(best_clustering_inx);
end % function ChooseBestResults
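
% Best-result selection works by cross-purity consensus: ChooseBestResults1 scores
% every candidate clustering by the sum of its purity against all other candidates
% and picks the maximum; ChooseBestResults2 first picks the best candidate within
% each base algorithm family and then runs the same selection on those winners.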
function [best_clustering_inx] = ChooseBestResults1(clusteringResults, indices)
numResults = size(clusteringResults,2);
if nargin < 2
  indices = ones(1,numResults); % i.e. all
end
sumPurity = CalcSumPurity(clusteringResults, indices);
% return max entry
[val, inx] = max(sumPurity);
fprintf('ChooseBestResults: val %d, inx %d\n', val, inx);
best_clustering_inx = inx;
end % function ChooseBestResults1


function [best_clustering_inx] = ChooseBestResults2(clusteringResults, clusteringAlg)
max_num_level_2 = 10;
last_level_2 = 1;
numResults = size(clusteringResults,2);
indices_level_1 = zeros(1,numResults);
indices_level_2 = zeros(max_num_level_2,numResults);
alg_level_2 = zeros(1,max_num_level_2);
for i=1:numResults
  base_alg = floor(clusteringAlg(i)/10);
  %base_att = base_alg-10*floor(base_alg/10);
  var_alg = clusteringAlg(i)-base_alg*10;
  if var_alg == 0
    indices_level_1(i) = 1;
  else
    found = 0;
    for j=1:last_level_2-1
      if alg_level_2(j)==base_alg
        l2 = j;
        found = 1;
        break;
      end
    end
    if found == 0
      l2 = last_level_2;
      alg_level_2(l2) = base_alg;
      last_level_2 = l2+1;
    end
    indices_level_2(l2,i) = 1;
  end
end
for j=1:last_level_2-1
  best = ChooseBestResults1(clusteringResults, indices_level_2(j,:));
  indices_level_1(best) = 1;
end
best_clustering_inx = ChooseBestResults1(clusteringResults, indices_level_1);
end % function ChooseBestResults2


function [sumPurity] = CalcSumPurity(clusteringResults, indices)
numResults = size(clusteringResults,2);
crossPurity = zeros(numResults,numResults);
for i=1:numResults
  for j=1:numResults
    if i~=j && indices(i)==1 && indices(j)==1
      crossPurity(i,j) = ClusteringPurity(clusteringResults(:,j), clusteringResults(:,i));
    end
  end
end
sumPurity = sum(crossPurity,2);
end % function CalcSumPurity