|
|
- function [ rand_score, purity, p_triads, missing_nodes_mapping, removed_nodes] = MissingNodes_S8b(dataFilePath, dataFileName, attributes, attUpperRange, attWeightVec, addMissingAttVec, normFactorVec, affinityType, ...
- num_missing_nodes_arr, attAffinityThreshold, imagesData, numImagesProfiles, imgMissProb, imgSimType, imgSimProbDiff, percentKnownPlaceholdersVec, dumpSmallFlag, dumpSmallDataPath, iter, missingNodes )
-
- %%global g_threshold; % Sigal - 15.10.13 remove warning
- % addpath 'Spectral Clustering' % sigal 13.8.12
- % addpath 'mex'% sigal 13.8.12
-
- p_triads = [];
- k = 0
- %create a log file for this run
- %date_now = clock;
- %date_now = strcat(num2str(date_now(1)),'_',num2str(date_now(2)),'_', num2str(date_now(3)),'_', num2str(date_now(4)), num2str(date_now(5)),'_', num2str(date_now(6)));
- %dump dir for saving the reduced graphs for GED
- %dumpSmallDataPath = sprintf('%sdumpSmallData_%s/', resultsDir, date_now);
- %diary(strcat('C:\missingnodes\Code\Log\log', date_now,'.log'));
-
- %affinity calculation types
- global affinity_calculation_shortest_path;
- global affinity_calculation_euclid;
- global affinity_calculation_common_friends;
- global affinity_calculation_random_clustering;
- global affinity_calculation_adamic_adar;
- global affinity_calculation_katz_beta_0_5;
- global affinity_calculation_katz_beta_0_05;
- global affinity_calculation_katz_beta_0_005;
- global affinity_calculation_AA_RCN;
- global affinity_boost;
- global affinity_boost2;
-
- affinity_calculation_shortest_path = 0;
- affinity_calculation_euclid = 1;
- affinity_calculation_common_friends = 2;
- affinity_calculation_random_clustering = 3;
- affinity_calculation_adamic_adar = 4;
- affinity_calculation_katz_beta_0_5 = 5;
- affinity_calculation_katz_beta_0_05 = 6;
- affinity_calculation_katz_beta_0_005 = 7;
- affinity_calculation_AA_RCN = 8;
- % sigal 12.3.13 add BOOST option
- affinity_boost = 9;
- affinity_boost2 = 8;
-
- %%%%% for distance as function of num placeholders %%%%
- expectedParms = 19;
- if nargin < expectedParms
- LogMsg(sprintf('*** ERROR: MissingNodes_S8b - Inavlid # of parameters, expected %d got %d',expectedParms,nargin));
- return;
- end
-
- percent_known_placeholders_vec = percentKnownPlaceholdersVec;
-
- if nargin >= expectedParms+1
- select_random_missing_nodes = 0;
- else
- select_random_missing_nodes = 1;
- end
-
- affinity_types = affinityType;
-
- compensate_for_unknown_placeholers = 0;
- compensate_vec = [0 0.3 0.65 1 1.5];
-
- unite_common_friends = 0; %should UNK nodes be united in accordance with the "friend of my friend principle"
-
- cluster_only_missing_nodes = 1;
- % if affinity_calculation_type == affinity_calculation_shortest_path || affinity_calculation_type == affinity_calculation_euclid
- % non_neighbors_distance = Inf;
- % elseif affinity_calculation_type == affinity_calculation_common_friends
- % non_neighbors_distance = 0;
- % end
-
- non_neighbors_distance = 0;
-
- ExpNormFactorVecLen = 3;
- normFactorVecLen = size(normFactorVec,2);
- if normFactorVecLen ~= ExpNormFactorVecLen
- LogMsg(sprintf('*** ERROR: MissingNodes_S8b - invalid normFactorVec expected len=%d got %d',ExpNormFactorVecLen, normFactorVecLen));
- return;
- end
-
- global netAffNormFactor1;
- global netAffNormFactor2;
- netAffNormFactor1 = normFactorVec(1);
- netAffNormFactor2 = normFactorVec(2);
- netAffNormFactor3 = normFactorVec(3);
-
- %compare SC to Kmean, 0=SC, 3=k-mean on PH (mxm), 2=k-mean on PH+Nodes (m*(m+n))
- %sigal 1=kmean on PH (with already cut affinity)
- kmeanTypesVec = 1; %[0 1]; %1; %%2 3];
-
- %compare with/without attr, 0=MISC, 1=SAMI-A, 2=SAMI-N, 3=SAMI-AK (k-mean), 4=SAMI-NK
- % images => 5=PMI, 7=PMI+SAMI
- samiAttrVec = [0 3 5 7]; % [0 4]; % [0 1 3]; % 2];
- % Sigal - TODO - if no SAMI-N skip data_untouched_withAtt (~line 160)
- run_SAMI_N = 0;
-
- %read the full network, up to num_values links
- %disp('reading network information from file...');
- fprintf('reading network information from file %s%s ...\n', dataFilePath, dataFileName);
- data = load(strcat(dataFilePath, dataFileName), 'data');
- %use sparse data
- %a = struct2table(data);
- %LogMsg(sprintf('%s', a));
- data = cell2mat(struct2cell((data)));
- data = sparse(data); % THIS IS THE MATRIX
-
- %sigal 25.11.12
- %combine the attributes with data as first #totalAttNum cols/rows
- %[dataWithAtt, totalAttNum] = CombineDataWithAttributes(data, attributes, attUpperRange, attWeight);
- sami_ind = find(samiAttrVec == 1);
- for sami = [1 2 3 4 7]
- sami_ind = sami_ind | find(samiAttrVec == sami);
- end
- if sum(sami_ind,2) > 0
- [attData, totalAttNum] = PreProcessDataAttributes(data, attributes, attUpperRange);
- else % sigal 31.1.14 support runs without attributes, i.e. only images
- attData = 0;
- totalAttNum = 0;
- end
-
- clear('graph');
-
- %rand_score = zeros(2, 2, size(num_missing_nodes_arr,2), 6, 2); %(normalized or not, unite common friends, num missing nodes, affinity calculation, cluster only missing)
- %rand_score_sq = rand_score;
- %purity = zeros(2, size(num_missing_nodes_arr,2), 6, 2);%(unite common friends, num missing nodes, affinity calculation, cluster only missing)
- %purity_sq = purity;
- rand_score = [];
- purity = [];
-
- graph_size = size(data,1); % DATA == MATRIX!
- graph_edges = nnz(data)/2; % number of edges
- %graph_attr_edges = 0; %% sigal 3.1.13: nnz(dataWithAtt)/2 - graph_edges; % number of attributes edges
- %num_missing_nodes_arr = round(num_missing_nodes_arr .* graph_size);
-
- %initialize the data matrix (binary adjacency)
- disp('generating network graph...');
-
- original_graph_size = size(data,1);
-
- original_data = data;
- original_attData = attData;
- graph_attr_edges = nnz(attData); % number of attributes edges
- avg_attr_edges = graph_attr_edges/original_graph_size;
- %original_dataWithAtt = dataWithAtt;
-
- missing_nodes_mapping = [];
-
- for num_missing_nodes_idx = 1 : size(num_missing_nodes_arr,2)
-
- if select_random_missing_nodes
- num_missing_nodes = num_missing_nodes_arr(1, num_missing_nodes_idx);
- else
- num_missing_nodes = length(missingNodes);
- end
-
- if num_missing_nodes > numImagesProfiles
- LogMsg(sprintf('*** ERROR: MissingNodes_S8b - invalid numImagesProfiles %d vs. numMissingNodes %d.',numImagesProfiles, num_missing_nodes));
- return;
- end
- %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
- %remove random nodes %
- %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
- %randomize the missing nodes indexes and sort in descending order
- %disp('selecting random missing nodes...');
- fprintf('selecting %d random missing nodes...\n',num_missing_nodes);
-
- %sigal - 25.11.12 - remove same nodes from data and dataWithAtt
- if select_random_missing_nodes
- %[data, missing_nodes_mapping] = RemoveRandomNodes( original_data, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance );
- fprintf('testtesttes');
- fprintf('testsalam%i', num_missing_nodes);
-
- [data, attData, missing_nodes_mapping] = RemoveRandomNodesWithImages( original_data, original_attData, totalAttNum, num_missing_nodes, missing_nodes_mapping, numImagesProfiles );
- else
- %Sigal 13.10.13 - TODO - add option to pre selected missing node
- %Sigal - 23.1.14 - %%%%TODO - add option for images
- fprintf('*** ERROR: pre selected missing node is not suported for images \n');
- [data, attData, missing_nodes_mapping] = RemoveRandomNodes3( original_data, original_attData, totalAttNum, num_missing_nodes, missing_nodes_mapping, non_neighbors_distance, missingNodes);
- end
-
- data_untouched = data;
- if run_SAMI_N
- tic %att_combine_calc_time in seconds
- data_untouched_withAtt = CombineDataWithAttributes4(data, attData);
- att_combine_calc_time = toc;%att_combine_calc_time
- else
- fprintf('Combining attribtes ...==> Skip\n');
- att_combine_calc_time = 0;
- end
-
- % loop over addMissingAtt options
- for addMissingAtt = addMissingAttVec
- LogMsg(sprintf('S8b:astddMissingAtt=%.3f',addMissingAtt));
- % loop over partial data options
- for percent_known_placeholders = percent_known_placeholders_vec
- data0 = data_untouched;
-
- %sigal - TODO all reference to data0 and S ???
- % S is the uncertainty vector when we don't know the placeholders
- S = zeros(1, size(data0,2));
- num_placeholders_to_remove = 0;
- placeholders_to_remove = [];
- num_placeholders = size(data0,1) - original_graph_size + num_missing_nodes ;
- last_known_node = size(data0,1) - num_placeholders;
-
- if percent_known_placeholders < 1
- num_placeholders_to_remove = round(num_placeholders * (1 - percent_known_placeholders));
-
- while size(placeholders_to_remove, 2) < num_placeholders_to_remove
- %randomly selecting unique placeholder indexes
- placeholders_to_remove = unique([placeholders_to_remove, randi(num_placeholders, 1, num_placeholders_to_remove - size(placeholders_to_remove, 2))]);
- end
-
- %rand_vec = rand(1, num_placeholders);
- %placeholders_to_remove = find(rand_vec > percent_known_placeholders) + last_known_node;
-
- placeholders_to_remove = placeholders_to_remove + last_known_node;
-
- %S is the group of neighbors of the unknown placeholders
- S = data0(placeholders_to_remove(1), :);
- for i = placeholders_to_remove
- S = S | data0(i,:);
- end
-
- %switch from binary vector to list of indexes
- % S = find(S);
-
- %data_all_placeholders = data0;
- data0(placeholders_to_remove,:) = [];
- data0(:,placeholders_to_remove) = [];
-
- num_placeholders = num_placeholders - num_placeholders_to_remove;
- end
-
- %save the removed nodes in each iteration
- removed_nodes{num_missing_nodes_idx} = missing_nodes_mapping(1,:);
-
- %save S
- orig_S = S;
-
- %sigal - adjust to withAttr flag TOCHECK
- %sigal - 12.6.13 - move after calcAffinity so we can calculate only the reduce entries
- % tic %att_affinity_calc_time in seconds
- % fprintf('calculating attribtes affinity matrix...\n');
- % attAffinity = CalcAttributesAffinity_S3(data0, attData, last_known_node, addMissingAtt, attAffinityThreshold);
- % att_affinity_calc_time = toc;%att_affinity_calc_time
- att_affinity_merge_time = 0;
-
- all_clustering = [];
- all_clustering_alg = [];
- all_att_clustering = [];
- all_att_clustering_alg = [];
- all_k1_clustering = [];
- all_k1_clustering_alg = [];
- all_k3_clustering = [];
- all_k3_clustering_alg = [];
- all_A2_clustering = [];
- all_A2_clustering_alg = [];
- all_A4_clustering = [];
- all_A4_clustering_alg = [];
- % sigal 12.6.13 - use as flag for first time calculation
- attAffinity = 0;
- phsAttAffinity = 0;
- phsAttAffinity2 = 0;
- phsImgAffinity = 0;
-
- % loop over different affinity_types
- for affinity_calculation_type = affinity_types
- % sigal 12.11.13 - use as flag for first time calculation
- phsNetAffinity = 0;
-
- if affinity_calculation_type == affinity_boost
- withAttrVec = affinity_boost; %[affinity_boost affinity_boost2];
- elseif affinity_calculation_type == affinity_calculation_random_clustering
- withAttrVec = 0;
- else
- % sigal 12.3.13 - run several times:
- % 0=original, 1=weighted affinity, 2=weighted dataWithAttr
- withAttrVec = samiAttrVec; %Sigal - 15.10.13
- end
-
- % sigal 25.11.12
- % run loop - once as original without attributes and next with attributes
- for withAttr = withAttrVec
- netAffinity = 0; % sigal 24.10.13 - use as flag for first time calculation
- % sigal 5.3.13 - loop over weights
- if withAttr == 0
- attWeightVector = 0;
- elseif withAttr == affinity_boost || withAttr == affinity_boost2
- attWeightVector = [0.1 0.2 0.4]; % sigal - use OASCA per Affinity/kmean
- if kmeanTypesVec==0 % find(kmeanTypesVec==0) %
- attWeightVector = [0 attWeightVector];
- end
- % elseif withAttr == 1 && addMissingAtt > 0 % sigal 23.10.13
- % attWeightVector = 0.2:0.1:0.8; %0.8; %
- elseif affinity_calculation_type == affinity_calculation_common_friends
- if withAttr == 3
- attWeightVector = 0.3:0.1:0.5; %0.3:0.1:0.5;
- else
- attWeightVector = 0.2:0.1:0.4; %5; %%0.2:0.1:0.5; %0.3; %0.2:0.1:0.5; %0.3; %
- end
- elseif affinity_calculation_type == affinity_calculation_adamic_adar
- if withAttr == 4
- attWeightVector = 0.2:0.1:0.4; %0.3:0.1:0.5;
- else
- attWeightVector = 0.6:0.1:0.8; %0.4:0.1:0.6; %0.5:0.1:0.7; %8; %0.4:0.1:0.7; %0.8; %0.5:0.1:0.8;
- end
- elseif affinity_calculation_type == affinity_calculation_AA_RCN
- attWeightVector = 0.1:0.1:0.9; %0.3; %0.2:0.1:0.8; %
- elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
- attWeightVector = 0.1:0.1:0.4; %0.8; %0.2:0.1:0.8; %
- else
- attWeightVector = attWeightVec;
- end
- if find(withAttr == [3 5 7])
- attWeightVector = [0 attWeightVector 1];
- end
-
- % run loop for attWeight
- for attWeight = attWeightVector
- affinity = 0;
- phsAffinity = 0;
- withAttrWeight = (withAttr+attWeight)*10;
-
- % sigal 3.1.13 - data is the same the change is in the attAffinity
- if find(withAttr == [0 1 3 5 7 affinity_boost affinity_boost2])
- actual_graph_size = original_graph_size;
- num_attr_nodes = 0;
- data = data_untouched;
- elseif withAttr == 2 || withAttr == 4
- actual_graph_size = original_graph_size+totalAttNum;
- num_attr_nodes = totalAttNum;
- data = data_untouched_withAtt;
- else
- exception = MException(fprintf('Invalid Attribute Type %d',withAttr));
- throw(exception);
- end
-
- last_known_node = actual_graph_size - num_missing_nodes;
- first_unk_node = last_known_node + 1;
- num_added_nodes = size(data,1) - last_known_node;
-
- if withAttr == affinity_boost || withAttr == affinity_boost2
- fprintf('calculating best results for affinity matrix, type %d (withAttr=%d)...\n', affinity_calculation_type, withAttrWeight);
- % Sigal 10.3.13 - TODO calc best results
- if attWeight == 0.9
- [test_clustering, best_alg] = ChooseBestResults(all_att_clustering,all_att_clustering_alg,withAttr);
- elseif attWeight == 0.1
- [test_clustering, best_alg] = ChooseBestResults(all_k1_clustering,all_k1_clustering_alg,withAttr);
- elseif attWeight == 0.3
- [test_clustering, best_alg] = ChooseBestResults(all_k3_clustering,all_k3_clustering_alg,withAttr);
- elseif attWeight == 0.2
- [test_clustering, best_alg] = ChooseBestResults(all_A2_clustering,all_A2_clustering_alg,withAttr);
- elseif attWeight == 0.4
- [test_clustering, best_alg] = ChooseBestResults(all_A4_clustering,all_A4_clustering_alg,withAttr);
- else
- [test_clustering, best_alg] = ChooseBestResults(all_clustering,all_clustering_alg,withAttr);
- end
- % Sigal 10.3.13 - TODO sum all times
- affinity_calc_time = 0;
- graph_predict_time= 0;
- reduce_dim_time = 0;
- att_affinity_calc_time = 0;
- phs_att_affinity_calc_time = 0;
- phs_img_affinity_calc_time = 0;
-
- else
-
- % calculate the affinity / similarity matrix
- fprintf('calculating affinity matrix, type %d (withAttr=%d)...\n', affinity_calculation_type, withAttrWeight);
- if withAttr == 2
- tic %affinity_calc_time in seconds
- affinity = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, 0);
- affinity_calc_time = toc;%affinity_calc_time
- %sigal - 12.11.13 - calculate once for each type
- elseif nnz(netAffinity) == 0 && kmeanTypesVec==0 % find(kmeanTypesVec==0)%
- tic %affinity_calc_time
- %sigal - adjust to withAttr flag TOCHECK
- affinity = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
- affinity_calc_time = toc;%affinity_calc_time
- netAffinity = affinity;
- % if affinity_calculation_type ~= affinity_calculation_AA_RCN
- % diffAff = affinity(first_unk_node:end,first_unk_node:end)-phsAffinity;
- % fprintf('nnz diffAff %d \n',full(nnz(diffAff)));
- % end
- else
- affinity = netAffinity;
- affinity_calc_time = 0;
- end
- if withAttr == 4
- tic %affinity_calc_time in seconds
- phsAffinity = CalcPHsAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
- affinity_calc_time = toc;%affinity_calc_time
- %sigal - 12.11.13 - calculate once for each type
- elseif nnz(phsNetAffinity) == 0 && kmeanTypesVec==1 % find(kmeanTypesVec==1)%
- tic %affinity_calc_time in seconds
- phsNetAffinity = CalcPHsAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
- phsAffinity = phsNetAffinity;
- affinity_calc_time = toc;%affinity_calc_time
- else
- %affinity_calc_time = 0;
- phsAffinity = phsNetAffinity;
- end
-
- %sigal - 12.6.13 - move after calcAffinity so we can calculate only the reduce entries
- if withAttr == 1 && nnz(attAffinity) == 0
- nodesToKeep = NodesToKeep(affinity, first_unk_node, 1);
- fprintf('nodesToKeep %d \n',full(sum(nodesToKeep)));
- % Sigal TODO - what if addMissingAtt > 0 ??
- tic %att_affinity_calc_time in seconds
- fprintf('calculating attribtes affinity matrix...\n');
- debugCalcAttr = 0; %% Sigal 17.10.13 debugging nodesToKeep & maxAttStat
- debugAddMissingAtt = 0; %% sigal 16.12.13 backward - use 0 instead of addMissingAtt;
- if debugCalcAttr == 1
- attAffinity = CalcAttributesAffinity_S5(data0, attData, last_known_node, debugAddMissingAtt, attAffinityThreshold);
- else
- attAffinity = CalcAttributesAffinity_S5(data0, attData, last_known_node, debugAddMissingAtt, attAffinityThreshold, nodesToKeep);
- end
- att_affinity_calc_time = toc;%att_affinity_calc_time
- end
- %sigal - 12.11.13 - calculate once for each type
- if (withAttr == 3 || withAttr == 7) && nnz(phsAttAffinity) == 0
- tic %att_affinity_calc_time in seconds
- phsAttAffinity = netAffNormFactor3*CalcPHsAffinityByAttributes(data0, attData, last_known_node, addMissingAtt, attAffinityThreshold);
- phs_att_affinity_calc_time = toc;%att_affinity_calc_time
- end
- %sigal - 22.2.14 - calculate once for each type %%%%% TODO
- if (withAttr == 5 || withAttr == 7) && nnz(phsImgAffinity) == 0
- tic %img_affinity_calc_time in seconds
- phsImgAffinity = netAffNormFactor3*CalcPHsAffinityByImages(data0, imagesData, last_known_node, missing_nodes_mapping, imgMissProb, imgSimType, imgSimProbDiff);
- phs_img_affinity_calc_time = toc;%att_affinity_calc_time
- end
- if withAttr == 3 %sigal - 22.2.14 - SAMI_AK
- if attWeight == 0
- if nnz(phsAttAffinity2) == 0
- tic %att_affinity_calc_time in seconds
- phsAttAffinity2 = netAffNormFactor3*CalcPHsAffinityByAttributes(data0, attData, last_known_node, addMissingAtt, attAffinityThreshold, 1);
- phs_att_affinity_calc_time = toc;%att_affinity_calc_time
- end
- phsAffinity = [netAffNormFactor2*phsAffinity phsAttAffinity2]; % sigal 5.11.13 original order with factor 10
- else
- phsAffinity = (1-attWeight)*phsAffinity+attWeight*phsAttAffinity;
- end
- elseif withAttr == 5 %sigal - 22.2.14 - PMI %%%%% TODO
- if attWeight == 0
- phsAffinity = [netAffNormFactor2*phsAffinity phsImgAffinity]; % sigal 5.11.13 original order with factor 10
- else
- phsAffinity = (1-attWeight)*phsAffinity+attWeight*phsImgAffinity;
- end
- elseif withAttr == 7 %sigal - 22.2.14 - PMI+SAMI %%%%% TODO
- if attWeight == 0
- if nnz(phsAttAffinity2) == 0
- tic %att_affinity_calc_time in seconds
- phsAttAffinity2 = netAffNormFactor3*CalcPHsAffinityByAttributes(data0, attData, last_known_node, addMissingAtt, attAffinityThreshold, 1);
- phs_att_affinity_calc_time = toc;%att_affinity_calc_time
- end
- phsAffinity = [netAffNormFactor2*phsAffinity phsAttAffinity2 phsImgAffinity];
- else
- phsAffinity = (1-attWeight)*phsAffinity+attWeight*(phsAttAffinity+phsImgAffinity)/2;
- end
- end
-
- end
-
- % sigal 3.1.13 - weighted affinity
- if withAttr == 1
- fprintf('merge affinity matrix with attributes affinity\n');
- if debugAddMissingAtt > 0
- e = size(affinity,1);
- else
- e=last_known_node;
- end
- tic %att_affinity_calc_time
- %Sigal - 16.6.13 - use full C implementation
- affinity = WeightedSum(affinity, attAffinity, attWeight, e);
- %Sigal - 17.6.13 free memory
- if original_graph_size > 20000
- fprintf('free attAffinity memory\n');
- clear('attAffinity');
- attAffinity = 0;
- fprintf('free netAffinity memory\n');
- clear('netAffinity');
- netAffinity = 0;
- end
- % affinity(1:e, 1:e)= affinity(1:e,1:e)*(1-attWeight)+attAffinity(1:e,1:e)*attWeight;
- % nnz1 = nnz(affinity);
- % nnz2 = nnz(affinity2);
- % aaa = affinity2-affinity;
- % nnz3 = nnz(aaa);
- % fprintf('nnz affinity: nnz1=%d, nnz2=%d, nnz3=%d\n',nnz1,nnz2,nnz3);
- att_affinity_merge_time = toc;%att_affinity_merge_time
- end
-
- %TODO: extend the dimension reduction to adding missing links / reclustering
- %Sigal/ron - ToRECEK ron TODO (done?)
-
- %Sigal - run always with 1 (ron)
- for reduce_dimensions = [1] %0 must be first because it does not change the affinity matrix
-
- reduce_dim_time = 0;
- skip_reduce_dimensions = find(withAttr == [3 4 5 7 affinity_boost affinity_boost2]); %Sigal 22.1.14 %%% TOCHECK
- if reduce_dimensions == 1 && nnz(affinity) > 0 && ~skip_reduce_dimensions
- fprintf('reduce dimensions\n');
- tic %ReduceDimensions
- [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node);
- reduce_dim_time = toc; %ReduceDimensions
- fprintf('new dimensions %d\n',size(affinity,1));
- end
-
- %sigal - why each iteration? simple calculation - can be done once
- %sigal - adjust to withAttr flag TODO
- fprintf('calculating true clustering\n');
- true_clustering = BuildTrueClustering(missing_nodes_mapping, original_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node);
-
- %figure,imshow(affinity,[]), title('Affinity Matrix')
- %sigal - use 0:1 if we want to compare with unknown #missNodes
- % (type=2 wasn't tested by ron)
- for num_clusters_known = [1] %[0, 1]
-
- % sigal 29.7.13
- % Test other clustering kmean types
- if affinity_calculation_type == affinity_calculation_random_clustering || affinity_calculation_type == affinity_boost || affinity_calculation_type == affinity_boost2
- kmeanTypes = 1;
- elseif withAttr == 0
- kmeanTypes = kmeanTypesVec;
- elseif withAttr == 1 || withAttr == 2
- kmeanTypes = 0;
- elseif find(withAttr == [3 4 5 7])
- kmeanTypes = 1;
- else
- kmeanTypes = kmeanTypesVec;
- end
- % loop over added kmeanTypes (clustering types)
- for kmeanType = kmeanTypes
-
- %sigal - adjust to withAttr flag TODO - which params? data_untouched, original_graph_size
- k = DetermineNumberOfClusters(num_clusters_known, data_untouched, original_graph_size, num_missing_nodes, num_added_nodes);
-
- debugEstimateK = 0;
- if debugEstimateK == 1 && affinity_calculation_type == affinity_calculation_common_friends
- for type=[0,3,4,8]
- estK = DetermineNumberOfClusters(type, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes);
- fprintf('debugEstimateK: type=%d, estK=%d\n',type,estK);
- end
- end
-
- if num_clusters_known == 1
- withAttrC = 0;
- elseif num_clusters_known == 0
- withAttrC = 10;
- else
- withAttrC = num_clusters_known*10;
- end
-
- % sigal 15.10.13 - add kmeanType & num_clusters to alg type
- withAttrC = withAttrC+kmeanType;
- withAttrWeight = withAttrC*100+(withAttr+attWeight)*10;
-
- %sigal - first_unk_node might change after ReduceDimensions
- last_known_node = first_unk_node - 1;
-
-
- if withAttr ~= affinity_boost && withAttr ~= affinity_boost2
- %sigal - adjust to withAttr flag TOCHECK
- fprintf('predicting the graph\n');
- tic %graph_predict_time
- if kmeanType == 1
- [newData, test_clustering] = PredictGraph(phsAffinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, kmeanType);
- else
- [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, kmeanType);
- end
- graph_predict_time = toc; %graph_predict_time
- end
- out_data = newData;
- out_clusterrr = test_clustering;
-
-
-
- %sigal - when to use - only if there are unknown placeholders
- if compensate_for_unknown_placeholers == 1
- fprintf('*** running with compensate_for_unknown_placeholers mode ...\n');
- S = orig_S;
-
- if size(newData,1) > size(S,2)
- %for breakpoint
- tttt = 98;
- end
- sigma = 1/4;
- S = S(1:size(newData,1));
- S = S + randn(size(S)) * sigma;
- sorted_S = sort(S, 'descend');
-
- %sum over the columns and find the columns which
- %indicate at least one neighbor
- %neighbors_of_new_nodes = find(sum(newData(first_unk_node:size(newData,1), :)));
-
- first_united_node = size(newData,1) - k +1;
- if affinity_calculation_type == affinity_calculation_katz_beta_0_05
- newAffinity = CalcAffinityByKatzBeta_Sparse( newData, 0.05, 4 );
- elseif affinity_calculation_type == affinity_calculation_adamic_adar
- newAffinity = CalculateAffinityByAdamicAdar_Sparse(newData, size(newData,1), 0, 0);
- elseif affinity_calculation_type == affinity_calculation_common_friends
- newAffinity = CalcAffinityByCommonNeighbors_Sparse(newData, size(newData,1), 0);
- end
-
- newNodesAffinity = newAffinity(first_united_node:size(newAffinity,1), :);
- newNodesAffinity(newNodesAffinity>=1) = 0;
- newNodesAffinity = newNodesAffinity / max(max(newNodesAffinity));
- %newNodesAffinity = newNodesAffinity / 2;
-
- newNodesAffinity(newData(first_united_node:size(newAffinity,1), :) >= 1) = 0;
-
- newNodesAffinity = (newNodesAffinity / max(max(newNodesAffinity)));
- %%%%trying to take only the
- %%%%k highest affinities
- sortedNewNodesAffinity = sort(newNodesAffinity(:), 'descend');
- %affinityThreshold = sortedNewNodesAffinity(k + size(neighbors_of_new_nodes, 2));
- newNodesAffinity_orig = newNodesAffinity;
- end % compensate_for_unknown_placeholers == 1
-
- if percent_known_placeholders < 1 && compensate_for_unknown_placeholers == 1
- %calculating as a function of number of links added
-
- meanNumLinks = mean(sum(data(1:last_known_node, 1:last_known_node)));
- maxNumLinksToAdd = meanNumLinks * num_placeholders;
- maxNumLinksToAdd = min(maxNumLinksToAdd, 25);
-
- %sigal 27.6.13
- linksToAdd = round(compensate_vec*num_missing_nodes);
- else
- maxNumLinksToAdd = 0;
- %sigal 27.6.13
- linksToAdd = 0;
- end
- %%%%% for distance as
- %%%%% function of num
- %%%%% placeholders %%%%
-
- %max_neighbors = S >= sorted_S(maxNumLinksToAdd);
- %sigal 27.6.13
- %for numLinksToAdd = 0 : maxNumLinksToAdd
- origWithAttrWeight = withAttrWeight;
- for linksInx = 1:size(linksToAdd,2)
- numLinksToAdd = linksToAdd(linksInx);
- %Sigal - 13.10.13 - TODO - fix withAttr flag calculation
- withAttrXX = compensate_vec(linksInx)*100; % numLinksToAdd
- withAttrWeight = origWithAttrWeight+1000*withAttrXX;
- if compensate_for_unknown_placeholers == 1
-
- newNodesAffinity = newNodesAffinity_orig;
-
- neighbors = [];
- if numLinksToAdd > 0
- neighbors = find(S >= sorted_S(numLinksToAdd), numLinksToAdd);
- end
-
- newDataWithMissingLinks = newData; % partial graph with the clustered nodes
- newDataForClustering = data0; % partial graph with partial PHs
- for neighbor = neighbors
- [value, closest_new_node] = max(newNodesAffinity(:,neighbor));
- closest_new_node = closest_new_node(1);
- newDataWithMissingLinks(first_united_node + closest_new_node - 1, neighbor) = 1;
- newDataWithMissingLinks(neighbor, first_united_node + closest_new_node - 1) = 1;
-
- newPlaceholder = zeros(1, size(newDataForClustering,2));
- newPlaceholder(neighbor) = 1;
- newDataForClustering = [newDataForClustering, newPlaceholder'; newPlaceholder, 0];
-
- end
-
- affinityWithS = CalcAffinity( newDataForClustering, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
-
- if reduce_dimensions == 1
- [affinityWithS, num_placeholdersWithS, first_unk_node_with_s] = ReduceDimensions(affinityWithS, first_united_node);
- else
- num_placeholdersWithS = num_placeholders + length(neighbors);
- first_unk_node_with_s = first_united_node;
- end
-
- % Sigal 15.10.13 - we are not using the new clustering result, as in these
- % cases we are using the GED as the main measure
- fprintf('^%s', newPredictedGraph)
-
- %remap the original data so that the known nodes match the
- %predicted data and the missing nodes match the predicted
- % nodes created from each cluster
-
- %sigal - adjust to withAttr flag TODO original_data
- perm_vector = 1:size(original_data,1);
- perm_vector(missing_nodes_mapping(1,:)) = [];
- perm_vector = [perm_vector, missing_nodes_mapping(1,:)];
- remapped_data = original_data(perm_vector,perm_vector);
-
- [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, first_united_node : size(remapped_data,1), neighbors); %changed from neighbors to perm(max_neighbors) - to be fair when there are more nieghbors
- %small_data2 = DecreaseGraphSize(newData, last_known_node+1 : size(newData,1));
- %new_nodes_affinity_sum = sum(sum(newNodesAffinity));
-
- %in case there is an empty cluster, the unrelated nodes may contain node index that does not exist
- %in the predicted graph (which may contain less nodes)
- indices_to_remove(indices_to_remove > size(newPredictedGraph,2)) = [];
-
-
- small_data2 = newData;
- small_data2(indices_to_remove,:) = [];
- small_data2(:,indices_to_remove) = [];
- %small_data3 = DecreaseGraphSize(newDataWithMissingLinks, last_known_node+1 : size(newDataWithMissingLinks,1));
- small_data3 = newDataWithMissingLinks;
- small_data3(indices_to_remove,:) = [];
- small_data3(:,indices_to_remove) = [];
-
- small_data4 = newPredictedGraph;
- small_data4(indices_to_remove,:) = [];
- small_data4(:,indices_to_remove) = [];
-
- %%% Sigal 27.1.13 - save reduce graphs for GED
- if dumpSmallFlag == 1
- saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data, 1);
- saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data2, 2);
- end
-
- %only calculate on the first iteration to save time
- if numLinksToAdd == 0
- edit_distance = 99; %GraphEditDistance( small_data, small_data2, num_missing_nodes );
- edit_distance2 = edit_distance;
- edit_distance3 = edit_distance;
- else
- edit_distance2 = 99; %GraphEditDistance( small_data, small_data3, num_missing_nodes );
- edit_distance3 = 99; %GraphEditDistance( small_data, small_data4, num_missing_nodes );
- if dumpSmallFlag == 1
- saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data3, 3);
- saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data4, 4);
- end
- end
- elseif withAttr == affinity_boost || withAttr == affinity_boost2 % NOT compensate_for_unknown_placeholers == 1
- if dumpSmallFlag == 1
- saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, best_alg, 1);
- saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, best_alg, 2);
- end
- else % NOT compensate_for_unknown_placeholers == 1
-
- %sigal - adjust to withAttr flag TODO original_data
- perm_vector = 1:size(original_data,1); % set values 1:n
- perm_vector(missing_nodes_mapping(1,:)) = []; % according to 1st line of missing nodes remove indexes from perm
- perm_vector = [perm_vector, missing_nodes_mapping(1,:)]; % add missing node as last indexes
- remapped_data = original_data(perm_vector,perm_vector); % return original graph according to perm
- out_data_p = out_data%(perm_vector,perm_vector);
- graphs_out = sprintf('/Users/armin/Desktop/output/graphed_%d.mat', iter);
- save(graphs_out, 'out_data_p', 'out_data', 'remapped_data', 'original_data');
- %sigal - return data only with missing nodes and their friends
- % reduce size to improve GED calculation
- [small_data, indices_to_remove] = DecreaseGraphSize(remapped_data, (size(remapped_data,1) - num_missing_nodes + 1) : size(remapped_data,1), []); %changed from neighbors to perm(max_neighbors) - to be fair when there are more nieghbors
-
- %Sigal - now, according to the original data, remove indexes and
- %adjust/reduce the predicted graph returned in newData
- small_data2 = newData;
- %sigal - adjust to withAttr flag - remove att nodes TOCHECK
- if num_attr_nodes > 0
- small_data2(1:num_attr_nodes,:) = [];
- small_data2(:,1:num_attr_nodes) = [];
- end
-
- %sigal 26.11.12 - change newData to small_data2 after resizing
- indices_to_remove(indices_to_remove > size(small_data2,2)) = [];
-
- small_data2(indices_to_remove,:) = [];
- small_data2(:,indices_to_remove) = [];
- %fprintf('&%s', small_data2)
- %fprintf('&&%s', small_data)
-
- %%% Sigal 27.1.13 - save reduce graphs for GED
- withAttrWeight = origWithAttrWeight+1000*(1-percent_known_placeholders)*10;
- if dumpSmallFlag == 1
- saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data, 1);
- saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_calculation_type, withAttrWeight, num_missing_nodes, small_data2, 2);
- end
- %sigal 6.12.12 - *** TODO *** temporary for test only TODO
- %edit_distance = GraphEditDistance( small_data, small_data2, num_missing_nodes );
- edit_distance = 99;
-
- % Sigal 10.3.13 - save results (without random)
- if attWeight ~= 1 && affinity_calculation_type ~= affinity_calculation_random_clustering
- currAlg = affinity_calculation_type*1000+withAttrWeight;
- % sigal 21.10.13 (only on SC)
- if kmeanType == 0
- res_index = size(all_clustering,2) + 1;
- all_clustering(:, res_index) = test_clustering;
- all_clustering_alg(res_index) = currAlg;
- if withAttr ~= 0
- att_index = size(all_att_clustering,2) + 1;
- all_att_clustering(:, att_index) = test_clustering;
- all_att_clustering_alg(att_index) = currAlg;
- end
- end
- % sigal 17.10.13 - find best based on kmeanType == 2
- if kmeanType == 1
- k_index = size(all_k1_clustering,2) + 1;
- all_k1_clustering(:, k_index) = test_clustering;
- all_k1_clustering_alg(k_index) = currAlg;
- if affinity_calculation_type == affinity_calculation_common_friends
- k_index = size(all_A2_clustering,2) + 1;
- all_A2_clustering(:, k_index) = test_clustering;
- all_A2_clustering_alg(k_index) = currAlg;
- elseif affinity_calculation_type == affinity_calculation_adamic_adar
- k_index = size(all_A4_clustering,2) + 1;
- all_A4_clustering(:, k_index) = test_clustering;
- all_A4_clustering_alg(k_index) = currAlg;
- end
- end
- if kmeanType == 2 || kmeanType == 3
- k_index = size(all_k3_clustering,2) + 1;
- all_k3_clustering(:, k_index) = test_clustering;
- all_k3_clustering_alg(k_index) = currAlg;
- end
- end
- end % compensate_for_unknown_placeholers == 1
-
- % calculate the purity for actual clustering
- fprintf('calculating purity\n');
- try
- %sigal - calulation done accoring to definition
- temp_purity = ClusteringPurity(true_clustering, test_clustering);
- catch ME1
- temp_purity = 99; %Sigal 12.8.12 - add invalid value incase of exception
- ddddd = 1;
- end
- clusters_out = sprintf('/Users/armin/Desktop/output/OUTp_%d.mat', iter);
- save(clusters_out, 'true_clustering', 'test_clustering');
-
- % save results
- fprintf('saving results (withAttr %d, purity %.5f) \n',withAttrWeight,temp_purity);
- %oooo = sprintf('/Users/armin/Desktop/output/OUT_%d.mat', k);
- %save(oooo, 'withAttrWeight', 'temp_purity');
- curr_index = size(purity,2) + 1;
- purity(curr_index).score = temp_purity;
- purity(curr_index).score_sq = temp_purity^2;
- purity(curr_index).edit_distance = edit_distance;
- if compensate_for_unknown_placeholers == 1
- purity(curr_index).numLinksToAdd = numLinksToAdd;
- purity(curr_index).edit_distance_missing_links = edit_distance2;
- purity(curr_index).edit_distance_new_clustering = edit_distance3;
- fprintf('numLinksToAdd - %d\nedit_distance - %d\nedit_distance_missing_links - %d\nedit_distance_new_clustering - %d\n', numLinksToAdd, full(edit_distance), full(edit_distance2), full(edit_distance3));
- end
- purity(curr_index).withAttr = withAttrWeight; % sigal 15.10.13
- purity(curr_index).num_missing_nodes_idx = num_missing_nodes_idx;
- purity(curr_index).num_missing_nodes = num_missing_nodes_arr(num_missing_nodes_idx);
- purity(curr_index).affinity_calculation_type = affinity_calculation_type;
- purity(curr_index).addMissingAtt = addMissingAtt; %sigal 8.11.13
- purity(curr_index).cluster_only_missing_nodes = cluster_only_missing_nodes;
- purity(curr_index).num_clusters_known = num_clusters_known;
- purity(curr_index).num_clusters_estimated = k; % sigal 26.11.12
- purity(curr_index).num_placeholders = num_placeholders;
- purity(curr_index).num_placeholders_to_remove = num_placeholders_to_remove;
- purity(curr_index).num_attr_nodes = totalAttNum; % sigal 3.1.13
- purity(curr_index).unite_common_friends = unite_common_friends;
- purity(curr_index).iteration = 1;
- purity(curr_index).test_clustering = test_clustering;
- purity(curr_index).true_clustering = true_clustering;
- purity(curr_index).graph_size = graph_size;
- purity(curr_index).graph_edges = graph_edges; % sigal - number of edges
- purity(curr_index).graph_attr_edges = graph_attr_edges; % sigal - number of attributes edges
- purity(curr_index).inverse_purity = 99; % Sigal 12.8.12 - tmp ??? % CalculateInversePurity(true_clustering, test_clustering);
- purity(curr_index).NMI = 99; %CalcNormalizedMutualInformation(true_clustering, test_clustering);
- purity(curr_index).removed_nodes = removed_nodes;
- purity(curr_index).percent_known_placeholders = percent_known_placeholders;
- purity(curr_index).reduce_dimensions = reduce_dimensions;
- purity(curr_index).missing_nodes_mapping = missing_nodes_mapping;
- purity(curr_index).compensate_for_unknown_placeholers = compensate_for_unknown_placeholers;
- purity(curr_index).affinity_calc_time = affinity_calc_time;
- purity(curr_index).reduce_dim_time = reduce_dim_time;
- purity(curr_index).graph_predict_time = graph_predict_time;
- if withAttr == 1 %% save this time only for this variation
- purity(curr_index).att_affinity_calc_time = att_affinity_calc_time; %sigal 14.3.13
- purity(curr_index).affinity_calc_time = affinity_calc_time+att_affinity_merge_time; %sigal 14.3.13
- elseif withAttr == 2 || withAttr == 4 %% save this time only for this variation
- purity(curr_index).att_affinity_calc_time = att_combine_calc_time; %sigal 13.3.13
- elseif withAttr == 3 %% sigal 22.1.14
- purity(curr_index).att_affinity_calc_time = phs_att_affinity_calc_time;
- elseif withAttr == 5 %% sigal 22.1.14
- purity(curr_index).att_affinity_calc_time = phs_img_affinity_calc_time;
- elseif withAttr == 7 %% sigal 22.1.14
- purity(curr_index).att_affinity_calc_time = phs_att_affinity_calc_time+phs_img_affinity_calc_time;
- else
- purity(curr_index).att_affinity_calc_time = 0; %sigal 12.3.13
- end
-
- if compensate_for_unknown_placeholers == 0
- break
- end
- end %linksToAdd
- %
- % purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
- % purity(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity;
- %
- % purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) = ...
- % purity_sq(unite_common_friends+1, num_missing_nodes_idx, affinity_calculation_type+1, cluster_only_missing_nodes+1) + temp_purity^2;
- %
-
-
- LogMsg(sprintf('S8b: Size=%d,Miss=%d,PHs=%d,Affinity=%d,Att=%d,Purity=%.3f', ...
- graph_size,purity(curr_index).num_missing_nodes,num_placeholders,affinity_calculation_type,withAttrWeight,temp_purity));
-
- %fprintf('affinity_calculation_type = %d, unite_common_friends = %d\n', affinity_calculation_type, unite_common_friends);
- fprintf('Graph size: %d, Number of missing nodes: %d, Purity: %f \n' ,graph_size, num_missing_nodes, temp_purity);
- %fprintf('============================================\n\n\n');
-
- %clear U;
-
- clear eigValues;
- clear eigVectors;
- end %kmeanTypes (clustering types)
- end %num_clusters_known
- end %reduce_dimensions
- clear('affinity');
- clear('phsAffinity');
- end % run loop for attWeight
- end % run over - once as original without attributes and next with attriutes/images
- clear('netAffinity');
- clear('phsNetAffinity');
- end % loop over different affinity_types
- clear('attAffinity');
- clear('phsAttAffinity');
- clear('phsAttAffinity2');
- end %loop over percent_known_placeholders_vec
- end %loop over addMissingAtt
- end %loop over num_missing_nodes...
-
- end %main function
-
-
- % sigal - 29.10.13
- % calc only PHs affinity
- function [affinity] = CalcPHsAffinity( data, affType, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt)
- % CalcPHsAffinity  Affinity computed only for the placeholder (PH) nodes.
- %
- % Inputs:
- %   data               - graph matrix handed on to the affinity helpers
- %   affType            - one of the affinity_calculation_* global codes
- %   actual_graph_size,
- %   num_missing_nodes  - the first PH index is taken to be
- %                        actual_graph_size - num_missing_nodes + 1
- %   num_attr_nodes,
- %   addMissingAtt      - forwarded unchanged to the RCN / AA helpers
- %   attWeight          - attribute weight; divided by netAffNormFactor2
- %                        before it is handed to the helpers
- % Output:
- %   affinity           - helper result multiplied by netAffNormFactor1
- %
- % Errors:
- %   euclid and shortest-path are not supported here; they are logged through
- %   LogMsg and then raised as an error (previously the function returned with
- %   the output unassigned). Unknown codes raise an error as well.
- %
- % Note: the AA_RCN branch used to recurse into this function with the already
- % normalised weight, so netAffNormFactor2 was divided in twice and
- % netAffNormFactor1 multiplied in twice. The helpers are now called directly
- % so each factor is applied exactly once.
-     global affinity_calculation_shortest_path;
-     global affinity_calculation_euclid;
-     global affinity_calculation_common_friends;
-     global affinity_calculation_random_clustering;
-     global affinity_calculation_adamic_adar;
-     global affinity_calculation_katz_beta_0_5;
-     global affinity_calculation_katz_beta_0_05;
-     global affinity_calculation_katz_beta_0_005;
-     global affinity_calculation_AA_RCN;
-     global netAffNormFactor1;
-     global netAffNormFactor2;
-     firstPH = actual_graph_size-num_missing_nodes+1;
-     normAttWeight = attWeight / netAffNormFactor2; % only for SAMI-N
-     if affType == affinity_calculation_euclid
-         msg = sprintf('*** ERROR: MissingNodes_S8b:CalcPHsAffinity - affType %d not supported !!!',affType);
-         LogMsg(msg);
-         error('MissingNodes:CalcPHsAffinity:unsupportedType', '%s', msg);
-     elseif affType == affinity_calculation_shortest_path
-         msg = sprintf('*** ERROR: MissingNodes_S8b:CalcPHsAffinity - affType %d not supported !!!',affType);
-         LogMsg(msg);
-         error('MissingNodes:CalcPHsAffinity:unsupportedType', '%s', msg);
-     elseif affType == affinity_calculation_common_friends
-         affinity = CalcPHsAffinityByRCN(data, actual_graph_size, num_missing_nodes, num_attr_nodes, normAttWeight, addMissingAtt);
-     elseif affType == affinity_calculation_random_clustering
-         affinity = data(firstPH:end, firstPH:end); %just a placeholder...
-     elseif affType == affinity_calculation_adamic_adar
-         affinity = CalcPHsAffinityByAA( data, actual_graph_size, num_missing_nodes, 1, num_attr_nodes, normAttWeight, addMissingAtt);
-     elseif affType == affinity_calculation_katz_beta_0_5
-         affinity = CalcPHsAffinityByKatzBeta( data, 0.5, 3, firstPH );
-     elseif affType == affinity_calculation_katz_beta_0_05
-         affinity = CalcPHsAffinityByKatzBeta( data, 0.05, 4, firstPH );
-     elseif affType == affinity_calculation_katz_beta_0_005
-         affinity = CalcPHsAffinityByKatzBeta( data, 0.005, 4, firstPH );
-     elseif affType == affinity_calculation_AA_RCN
-         % Concatenate [RCN, AA]; both helpers get the weight normalised once.
-         affinityAA  = CalcPHsAffinityByAA( data, actual_graph_size, num_missing_nodes, 1, num_attr_nodes, normAttWeight, addMissingAtt);
-         affinityRCN = CalcPHsAffinityByRCN(data, actual_graph_size, num_missing_nodes, num_attr_nodes, normAttWeight, addMissingAtt);
-         affinity = [affinityRCN affinityAA];
-     else
-         error('MissingNodes:CalcPHsAffinity:unknownType', ...
-             'CalcPHsAffinity: unknown affinity type %d', affType);
-     end
-     %sigal 14.11.13 - scale the network affinity (applied once for every type)
-     affinity = affinity * netAffNormFactor1;
- end % function CalcPHsAffinity
-
-
- %sigal - adjust to withAttr flag TODO
- function [affinity] = CalcAffinity( data, affinity_calculation_type, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt)
- % CalcAffinity  Compute the full node-to-node affinity matrix for one affinity type.
- %
- % Inputs:
- %   data                       - graph matrix handed to the affinity helpers
- %   affinity_calculation_type  - one of the affinity_calculation_* global codes
- %   actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight
- %                              - forwarded to the common-neighbours / Adamic-Adar
- %                                helpers (num_attr_nodes also to the Katz helper)
- %   addMissingAtt              - accepted for interface compatibility but forced
- %                                to 0 below (see the backward-compatibility note)
- % Output:
- %   affinity                   - affinity matrix produced by the selected method
- %
- % Errors:
- %   An unknown affinity type raises MissingNodes:CalcAffinity:unknownType
- %   (previously the output was silently left unassigned).
-     global affinity_calculation_shortest_path;
-     global affinity_calculation_euclid;
-     global affinity_calculation_common_friends;
-     global affinity_calculation_random_clustering;
-     global affinity_calculation_adamic_adar;
-     global affinity_calculation_katz_beta_0_5;
-     global affinity_calculation_katz_beta_0_05;
-     global affinity_calculation_katz_beta_0_005;
-     global affinity_calculation_AA_RCN;
-     % sigal 11.2.14 - backward compatibility with ASONAM 13: the caller's value is ignored
-     addMissingAtt = 0;
-     if affinity_calculation_type == affinity_calculation_euclid
-         sp_mat = graphallshortestpaths(data);
-         % replace Inf distances (unreachable pairs) by one more than the largest finite distance
-         max_value = max(sp_mat(sp_mat ~= Inf)) + 1;
-         sp_mat_euclid = sp_mat;
-         sp_mat_euclid(sp_mat == Inf) = max_value;
-         affinity = CalculateAffinity(sp_mat_euclid);
-     elseif affinity_calculation_type == affinity_calculation_shortest_path
-         % inverse squared shortest-path distance
-         affinity = graphallshortestpaths(data);
-         affinity = affinity .^ -2;
-         % distance 0 gives 0^-2 = Inf; map those entries to 1 (added on 05/11/11)
-         affinity(affinity == Inf) = 1;
-     elseif affinity_calculation_type == affinity_calculation_common_friends
-         affinity = CalcAffinityByCommonNeighbors_Sparse(data, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
-     elseif affinity_calculation_type == affinity_calculation_random_clustering
-         affinity = data; %just a placeholder...
-     elseif affinity_calculation_type == affinity_calculation_adamic_adar
-         affinity = CalculateAffinityByAdamicAdar_S3o( data, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, 1 , addMissingAtt);
-     elseif affinity_calculation_type == affinity_calculation_katz_beta_0_5
-         affinity = CalcAffinityByKatzBeta_Sparse( data, 0.5, 3, num_attr_nodes );
-     elseif affinity_calculation_type == affinity_calculation_katz_beta_0_05
-         affinity = CalcAffinityByKatzBeta_Sparse( data, 0.05, 4, num_attr_nodes );
-     elseif affinity_calculation_type == affinity_calculation_katz_beta_0_005
-         affinity = CalcAffinityByKatzBeta_Sparse( data, 0.005, 4, num_attr_nodes );
-     elseif affinity_calculation_type == affinity_calculation_AA_RCN
-         % combine Adamic-Adar and common-neighbours affinities via WeightedSum with weight w2
-         w2 = 0.5;
-         affinity = CalcAffinity( data, affinity_calculation_adamic_adar, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
-         affinity2 = CalcAffinity( data, affinity_calculation_common_friends, actual_graph_size, num_missing_nodes, num_attr_nodes, attWeight, addMissingAtt);
-         affinity = WeightedSum(affinity, affinity2, w2, size(affinity,1));
-     else
-         error('MissingNodes:CalcAffinity:unknownType', ...
-             'CalcAffinity: unknown affinity type %d', affinity_calculation_type);
-     end
-     %sigal 22.11.13 - scaling by netAffNormFactor1 is intentionally NOT applied here
- end % function CalcAffinity
-
- %sigal - review code and messages TODO
- function [test_clustering] = SpectralClustering(affinity, k, num_placeholders, affinityType, cluster_only_missing_nodes)
- % SpectralClustering  Cluster the placeholder nodes via eigenvectors + k-means.
- %
- % Inputs:
- %   affinity                   - square affinity matrix; the placeholders are the
- %                                last num_placeholders rows/columns
- %   k                          - number of eigenvectors and of clusters
- %   num_placeholders           - number of placeholder nodes
- %   affinityType               - forwarded to calcKMean
- %   cluster_only_missing_nodes - 1: run k-means on the placeholder rows only,
- %                                otherwise on all rows
- % Output:
- %   test_clustering            - one cluster label per placeholder
- %
- % If eigs fails twice, random labels are returned. That fallback now always
- % yields num_placeholders labels, matching the success path (calcKMean keeps
- % only the last num_samples labels); it used to return one label per node
- % when cluster_only_missing_nodes == 0.
- %
- % NOTE(review): the matrix below is D^(1/2)*A*D^(1/2). The usual normalised
- % affinity uses D^(-1/2). Behaviour is kept as-is - confirm the intent.
-     fprintf('kmeans clustering type=0 (SP)\n');
-     num_nodes = size(affinity,1);
-     first_unk_node = num_nodes - num_placeholders + 1;
-     % row sums of the affinity matrix form the diagonal of D
-     row_sums = full(sum(affinity, 2));
-     fprintf('calculating NL\n');
-     % build sqrt(D) directly as a sparse diagonal (no dense n-by-n intermediate)
-     sqrtD = spdiags(sqrt(row_sums), 0, num_nodes, num_nodes);
-     NL1 = sqrtD * affinity * sqrtD;
-     clear('sqrtD', 'row_sums');
-     fprintf('calculating U - eigs\n');
-     fail = 0;
-     try
-         [nEigVec, ~] = eigs(NL1,k);
-     catch
-         % retry once with a much looser tolerance before giving up
-         opts.tol = 1e-1;
-         try
-             fprintf('calculating U - 2nd try\n');
-             [nEigVec, ~] = eigs(NL1,k, 'LM', opts);
-         catch
-             fail = 1;
-         end
-     end
-     if fail == 0
-         % normalise every row of the eigenvector matrix to unit length
-         fprintf('calculating U - construct\n');
-         row_norms = sqrt(sum(nEigVec.^2, 2));
-         U = bsxfun(@rdivide, nEigVec, row_norms);
-         if cluster_only_missing_nodes == 1
-             U(1:first_unk_node - 1 ,:) = []; %cluster only the missing nodes
-         end
-         fprintf('SC: run kmeans clustering\n');
-         % perform kmeans clustering on the matrix U
-         test_clustering = calcKMean(U, k, num_placeholders, affinityType);
-     else
-         disp('Failed in finding eigenvectors - using random!');
-         % one random label per placeholder, same shape as the success path
-         test_clustering = randi(k, num_placeholders, 1);
-     end
- end % function SpectralClustering
-
- % Sigal 29.7.13
- % add option for clustering with k-means instead of SP (spectral clustering)
- % options: kmean_type 0=SP, 3=kmean on PH (mxm), 2=Kmean on PH+Nodes (m*(m+n))
- % 1=kmean on already cut affinity
- function [test_clustering] = KMeanClustering(affinity, k, num_placeholders, affinityType, kmean_type)
- % KMeanClustering  Cluster the placeholders with plain k-means (no eigen step).
- %
- % kmean_type selects the feature matrix handed to calcKMean:
- %   1         - the affinity matrix as given
- %   3         - placeholder-by-placeholder sub-block (last num_placeholders
- %               rows and columns)
- %   otherwise - the placeholder rows with all columns
- % k and affinityType are forwarded to calcKMean; num_placeholders labels are
- % requested back.
-     firstPH = size(affinity,1) - num_placeholders + 1;
-     switch kmean_type
-         case 1
-             features = affinity;
-         case 3 % previous kmean_type == 1
-             features(1:num_placeholders,1:num_placeholders) = affinity(firstPH:end,firstPH:end);
-         otherwise
-             features(1:num_placeholders,:) = affinity(firstPH:end,:);
-     end
-     fprintf('kmeans clustering type=%d\n',kmean_type);
-     % run k-means on the selected feature matrix
-     test_clustering = calcKMean(features, k, num_placeholders, affinityType);
- end % function KMeanClustering
-
- % Sigal 29.10.13
- % perform kmeans clustering on the matrix U
- % use same method for both KMeanClustering and SpectralClustering
- function [test_clustering] = calcKMean(U, num_clusters, num_samples, affinityType)
- % calcKMean  Run k-means on the rows of U and return the labels of the last rows.
- %
- % Inputs:
- %   U            - feature matrix, one row per sample
- %   num_clusters - number of clusters requested from kmeans
- %   num_samples  - how many trailing labels to return
- %   affinityType - when equal to affinity_calculation_random_clustering the
- %                  labels are replaced by random ones
- % Output:
- %   test_clustering - labels of the last num_samples rows of U
- %
- % kmeans is retried while it throws (99 attempts, as before). If every
- % attempt fails, random labels are drawn. That fallback now produces a column
- % vector with one label per row of U; it used to create a size(U) matrix, so
- % the returned slice was a row vector instead of a column.
-     global affinity_calculation_random_clustering;
-     max_attempts = 99;
-     if num_clusters > 99
-         numReplicates = 1;
-     else
-         numReplicates = 3;
-     end
-     fprintf('calcKMean\n');
-     clustered = false;
-     for attempt = 1:max_attempts
-         try
-             % OPT: 'EmptyAction','singleton' - in case of an empty cluster just drop it
-             % OPT: 'Replicates',n - repeat run/start points
-             IDX = kmeans(U,num_clusters,'EmptyAction','singleton','Replicates',numReplicates);
-             clustered = true;
-             break;
-         catch
-             % deliberate: kmeans starts from random points, so simply try again
-         end
-     end
-     if ~clustered
-         % give up on clustering and select random clusters (one label per row)
-         IDX = randi(num_clusters, size(U,1), 1);
-     end
-     test_clustering = IDX(size(IDX,1) - num_samples + 1 : size(IDX,1));
-     %if it's random just replace everything...
-     if affinityType == affinity_calculation_random_clustering
-         test_clustering = randi(num_clusters, size(test_clustering,1), size(test_clustering,2));
-     end
- end % function calcKMean
-
-
-
- % sigal - adjust to withAttr flag TOCHECK - which data to use?
- % Sigal 29.7.13
- % add option for clustering with k-means instead of SP (spectral clustering)
- % original implementation with SP, i.e use kmean_type = 0
- % k is the number of return clusters
- function [newData, test_clustering] = PredictGraph(affinity, k, data, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes, kmean_type)
- % PredictGraph  Cluster the placeholders and rebuild the graph with one new node per cluster.
- %
- % The clustering is done by SpectralClustering when kmean_type is 0 (also the
- % default when the argument is omitted) and by KMeanClustering otherwise.
- % CreateNewNodesFromClusters (defined elsewhere) turns the clustering into rows
- % for the predicted nodes; presumably one row per cluster - confirm there.
- %
- % Outputs:
- %   newData         - [known-known block, links'; links, zeros], where the known
- %                     nodes are the first size(data,1) - num_placeholders indices
- %   test_clustering - the cluster labels returned by the clustering step
-     % sigal - backwards compatibility
-     if nargin < 7
-         kmean_type = 0;
-     end
-     numKnown = size(data,1) - num_placeholders;
-     if kmean_type ~= 0
-         test_clustering = KMeanClustering(affinity, k, num_placeholders, affinity_calculation_type, kmean_type);
-     else
-         test_clustering = SpectralClustering(affinity, k, num_placeholders, affinity_calculation_type, cluster_only_missing_nodes);
-     end
-     %sigal - what if #clusters different than #missing ???
-     newNodes = CreateNewNodesFromClusters(data, test_clustering);
-     % assemble the block matrix: known graph on top-left, new-node links on the borders
-     knownIdx = 1:numKnown;
-     linksToKnown = newNodes(:, knownIdx);
-     newData = [data(knownIdx,knownIdx), linksToKnown'; ...
-                linksToKnown,            zeros(size(newNodes,1))];
- end % function PredictGraph
-
- %sigal - estimation can be wrong ???
- %sigal - adjust to withAttr flag TODO
- function [k] = DetermineNumberOfClusters(num_clusters_known, data_untouched, actual_graph_size, num_missing_nodes, num_added_nodes)
- % DetermineNumberOfClusters  Choose k, the number of clusters to look for.
- %
- % num_clusters_known selects the rule:
- %   1 - k is the true number of missing nodes
- %   0 - round(num_added_nodes / floor(mean known degree))
- %   2 - 2 * round(num_added_nodes / mean known degree)   (upper-limit guess)
- %   3 - round(e/a - n) with a = mean degree, e = known edge sum + added nodes
- %   4 - round(sqrt(e/a) - n) with a = mean degree / n, e = known edge sum + 2*added
- %   5 - as 4 but with ceil
- %   6 - as 3 but with ceil
- %   7 - ceil(num_added_nodes / mean known degree)
- %   8 - round(num_added_nodes / mean known degree)
- % where n = actual_graph_size - num_missing_nodes and the "known edge sum" is
- % the sum of data_untouched(1:n,1:n).
- %
- % Changes versus the earlier version:
- %   * an unsupported mode raises an explicit error instead of leaving k unset;
- %   * in mode 0 the divisor floor(mean degree) is clamped to at least 1, so a
- %     mean degree below 1 no longer yields k = Inf/NaN;
- %   * the edge sum is converted with full() so no sparse scalar reaches sprintf.
-     if num_clusters_known == 1
-         k = num_missing_nodes;
-         return;
-     end
-     numKnownNodes = actual_graph_size - num_missing_nodes;
-     sumKnownEdges = full(sum(sum(data_untouched(1 : numKnownNodes, 1 : numKnownNodes))));
-     meanKnownEdges = sumKnownEdges/numKnownNodes;
-     addedEdges = num_added_nodes*2; % undirect graph
-     fprintf('EstimatedK: numKnownN=%d, meanKnownE=%.3f, addedE=%d, missN=%d, meanMissE=%.3f\n', ...
-         numKnownNodes,full(meanKnownEdges),num_added_nodes,num_missing_nodes,addedEdges/num_missing_nodes);
-     switch num_clusters_known
-         case 0
-             k = round(num_added_nodes / max(floor(meanKnownEdges), 1));
-         case 2 %guessing upper limit
-             k = 2*round(num_added_nodes / meanKnownEdges);
-         case 3 % e=a*n
-             a = meanKnownEdges;
-             e = sumKnownEdges+num_added_nodes;
-             k = round(e/a-numKnownNodes);
-         case 4 % e=a*n^2
-             a = meanKnownEdges/numKnownNodes;
-             e = sumKnownEdges+addedEdges;
-             k = round(sqrt(e/a)-numKnownNodes);
-         case 5 % e=a*n^2
-             a = meanKnownEdges/numKnownNodes;
-             e = sumKnownEdges+addedEdges;
-             k = ceil(sqrt(e/a)-numKnownNodes);
-         case 6 % e=a*n
-             a = meanKnownEdges;
-             e = sumKnownEdges+num_added_nodes;
-             k = ceil(e/a-numKnownNodes);
-         case 7
-             k = ceil(num_added_nodes / meanKnownEdges);
-         case 8
-             k = round(num_added_nodes / meanKnownEdges);
-         otherwise
-             error('MissingNodes:DetermineNumberOfClusters:unknownMode', ...
-                 'DetermineNumberOfClusters: unsupported num_clusters_known %d', num_clusters_known);
-     end
-     LogMsg(sprintf('EstimatedK(size,PHs,type,actual,k):\t%d\t%d\t%d\t%d\t%d', ...
-         actual_graph_size,num_added_nodes,num_clusters_known, num_missing_nodes, k),'EstimateK_Log2.txt');
- end % function DetermineNumberOfClusters
-
- %find nodes with some affinity to one or more missing node
- function [nodes_to_keep] = NodesToKeep(affinity, first_unk_node, includePHs)
- % NodesToKeep  Logical mask of the nodes that have some affinity to a placeholder.
- %
- % Inputs:
- %   affinity       - affinity matrix; placeholders are rows first_unk_node..end
- %   first_unk_node - index of the first placeholder
- %   includePHs     - 1: always keep the placeholders themselves
- % Output:
- %   nodes_to_keep  - 1-by-n logical row, true for every column whose summed
- %                    affinity to the placeholder rows is positive
- %
- % The sum is taken explicitly along dimension 1. Without the dimension
- % argument a single placeholder row is collapsed to a scalar, which produced a
- % wrong mask whenever there was exactly one placeholder.
-     placeholder_rows = affinity(first_unk_node:size(affinity,1),:);
-     affinity_sum = sum(placeholder_rows, 1); %affinity of the placeholders to every node
-     nodes_to_keep = (affinity_sum > 0);
-     if includePHs == 1
-         %keep all the placeholders even if for some reason they have a sum of zero...
-         nodes_to_keep(first_unk_node:size(affinity,1)) = true;
-     end
- end % function NodesToKeep
-
-
- %keep only missing node rows and their friends
- function [affinity, num_placeholders, first_unk_node] = ReduceDimensions(affinity, first_unk_node)
- % ReduceDimensions  Shrink the affinity matrix to the placeholders and their related nodes.
- %
- % The mask comes from NodesToKeep(affinity, first_unk_node, 1), i.e. nodes with
- % some affinity to a placeholder plus all placeholders. The same mask is
- % applied to rows and columns.
- %
- % Outputs:
- %   affinity         - the reduced matrix
- %   num_placeholders - size(affinity,1) - first_unk_node + 1, measured before
- %                      the reduction
- %   first_unk_node   - index of the first placeholder inside the reduced matrix
-     originalSize = size(affinity,1);
-     num_placeholders = originalSize - first_unk_node + 1;
-     keepMask = NodesToKeep(affinity, first_unk_node, 1);
-     reduced = affinity(keepMask, keepMask);
-     % the placeholders stay at the end, so recompute where they start
-     first_unk_node = size(reduced,1) - num_placeholders + 1;
-     affinity = reduced;
- end % function ReduceDimensions
-
- %return the true clustering according to the saved missing_nodes_mapping
- function [true_clustering] = BuildTrueClustering(missing_nodes_mapping, actual_graph_size, num_missing_nodes, percent_known_placeholders, placeholders_to_remove, last_known_node)
- % BuildTrueClustering  Ground-truth cluster label for every placeholder.
- %
- % Rows 3..end of missing_nodes_mapping hold placeholder ids (0 = empty cell);
- % rows 1 and 2 are skipped (per the original comments: original id and images
- % profile). A placeholder with id p found in column j gets label j at position
- % p - actual_graph_size + num_missing_nodes.
- % When percent_known_placeholders < 1, the entries at
- % placeholders_to_remove - last_known_node are deleted afterwards.
-     phCells = missing_nodes_mapping(3:end, :);
-     % one slot per non-zero cell, i.e. per placeholder
-     true_clustering = zeros(nnz(phCells), 1);
-     % find on the transpose walks the cells row by row, the same order as the
-     % nested row/column loops it replaces; the row index of the transpose is
-     % the original column, i.e. the cluster label
-     [clusterLabel, ~, phId] = find(phCells.');
-     true_clustering(phId(:) - actual_graph_size + num_missing_nodes, 1) = clusterLabel(:);
-     %sigal - adjust to withAttr flag TODO
-     if percent_known_placeholders < 1
-         true_clustering(placeholders_to_remove - last_known_node) = [];
-     end
- end % function BuildTrueClustering
-
-
- function saveSmallData(dumpSmallDataPath, dataFileName, iter, affinity_type, withAttr, missNodes, small_data, i)
- % saveSmallData  Dump one reduced graph (or the chosen algorithm id) to disk.
- %
- % The output name is built from dataFileName, iter, missNodes, affinity_type,
- % withAttr and the running index i. For affinity_type 9 the data is written
- % with SaveIntMatrixToFile to '<path><name>_edges.txt'; for every other type
- % SaveAsciiGraph is used. Both writers are defined elsewhere.
-     %%% Sigal 24.1.13 - TODO
-     baseName = sprintf('%s_%d_%d_%d_%d_small_data_%d', dataFileName, iter, missNodes, affinity_type, withAttr, i);
-     isBestAlgDump = (affinity_type == 9); % save instead a dummy size (1) and the best_alg
-     if isBestAlgDump
-         edgesFile = strcat(dumpSmallDataPath, baseName,'_edges.txt');
-         SaveIntMatrixToFile(edgesFile, small_data, 1);
-     else
-         SaveAsciiGraph(dumpSmallDataPath, baseName, small_data, 1); %% also save graph size
-     end
- end % function saveSmallData
-
- function [best_clustering, best_alg] = ChooseBestResults(clusteringResults, clusteringAlg, type)
- % ChooseBestResults  Pick one clustering out of several candidate results.
- %
- % Inputs:
- %   clusteringResults - one candidate clustering per column
- %   clusteringAlg     - algorithm id of every column
- %   type              - affinity_boost  -> ChooseBestResults1 (all candidates)
- %                       affinity_boost2 -> ChooseBestResults2 (two-level choice)
- % Outputs:
- %   best_clustering   - the selected column of clusteringResults
- %   best_alg          - the matching entry of clusteringAlg
- %
- % Any other type raises MissingNodes:ChooseBestResults:invalidType. Before,
- % a message was printed and the function then failed on an undefined variable.
-     global affinity_boost;
-     global affinity_boost2;
-     if type == affinity_boost
-         best_clustering_inx = ChooseBestResults1(clusteringResults);
-     elseif type == affinity_boost2
-         best_clustering_inx = ChooseBestResults2(clusteringResults, clusteringAlg);
-     else
-         error('MissingNodes:ChooseBestResults:invalidType', ...
-             '*** ERROR: ChooseBestResults: invalid type %d', type);
-     end
-     best_clustering = clusteringResults(:,best_clustering_inx);
-     best_alg = clusteringAlg(best_clustering_inx);
- end % function ChooseBestResults
-
- function [best_clustering_inx] = ChooseBestResults1(clusteringResults, indices)
- % ChooseBestResults1  Index of the candidate with the largest summed cross purity.
- %
- % Inputs:
- %   clusteringResults - one candidate clustering per column
- %   indices           - optional 0/1 row marking the candidates that take part
- %                       (default: all of them)
- % Output:
- %   best_clustering_inx - column index of the winner
- %
- % CalcSumPurity leaves the score of every non-participating candidate at 0.
- % With a single participant its own score is 0 as well, so a plain max()
- % returned column 1 even when column 1 was not selected. Non-participants are
- % therefore pushed to -Inf before the maximum is taken.
-     numResults = size(clusteringResults,2);
-     if nargin < 2
-         indices = ones(1,numResults); % i.e. all
-     end
-     sumPurity = CalcSumPurity(clusteringResults, indices);
-     selected = (indices == 1);
-     if any(selected)
-         sumPurity(~selected) = -Inf; % never choose a candidate outside the selection
-     end
-     % return max entry
-     [val, inx] = max(sumPurity);
-     fprintf('ChooseBestResults: val %d, inx %d\n', val, inx);
-     best_clustering_inx = inx;
- end % function ChooseBestResults1
-
- function [best_clustering_inx] = ChooseBestResults2(clusteringResults, clusteringAlg)
- % ChooseBestResults2  Two-level selection of the best candidate clustering.
- %
- % Every algorithm id is split into base = floor(id/10) and variant = id - 10*base.
- % Candidates with variant 0 enter the final round directly. All other
- % candidates are grouped by base; ChooseBestResults1 picks one winner per
- % group and that winner joins the final round. The final round is decided by
- % ChooseBestResults1 again.
- % NOTE(review): the earlier code tested "variant == 0" twice in one condition;
- % the second test may have been meant for a different value - confirm.
-     maxGroups = 10; % initial capacity only; the arrays grow if more groups appear
-     numGroups = 0;
-     numResults = size(clusteringResults,2);
-     finalRoundMask = zeros(1,numResults);
-     groupMasks = zeros(maxGroups,numResults);
-     groupKeys = zeros(1,maxGroups);
-     for r = 1:numResults
-         algId = clusteringAlg(r);
-         baseAlg = floor(algId/10);
-         variant = algId - baseAlg*10;
-         if variant == 0
-             finalRoundMask(r) = 1;
-             continue;
-         end
-         % look up the group of this base algorithm, creating it on first sight
-         g = find(groupKeys(1:numGroups) == baseAlg, 1);
-         if isempty(g)
-             numGroups = numGroups + 1;
-             g = numGroups;
-             groupKeys(g) = baseAlg;
-         end
-         groupMasks(g,r) = 1;
-     end
-     % first level: the best candidate of every group advances
-     for g = 1:numGroups
-         winner = ChooseBestResults1(clusteringResults, groupMasks(g,:));
-         finalRoundMask(winner) = 1;
-     end
-     % second level: choose among the advanced candidates
-     best_clustering_inx = ChooseBestResults1(clusteringResults, finalRoundMask);
- end % function ChooseBestResults2
-
- function [sumPurity] = CalcSumPurity(clusteringResults, indices)
- % CalcSumPurity  Per-candidate sum of the purity against every other selected candidate.
- %
- % Inputs:
- %   clusteringResults - one candidate clustering per column
- %   indices           - 0/1 vector; only pairs where both entries equal 1 are scored
- % Output:
- %   sumPurity         - column vector; entry r is the sum over c ~= r of
- %                       ClusteringPurity(results(:,c), results(:,r)) for the
- %                       selected pairs, and 0 when nothing was scored
-     numCandidates = size(clusteringResults,2);
-     pairPurity = zeros(numCandidates,numCandidates);
-     for r = 1:numCandidates
-         for c = 1:numCandidates
-             % skip the diagonal and every pair with an unselected member
-             if r == c || indices(r) ~= 1 || indices(c) ~= 1
-                 continue;
-             end
-             pairPurity(r,c) = ClusteringPurity(clusteringResults(:,c), clusteringResults(:,r));
-         end
-     end
-     sumPurity = sum(pairPurity,2);
- end % function CalcSumPurity
-
|