123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
% PrepareAttributes5  Map raw node attributes to categorical codes.
%
% Output coding: 0 for null/no value, 1 for unknown/private value (if it
% exists) and then the real values.
%
% Inputs:
%   dataFilePath     - path prefix; concatenated directly with dataFileName
%                      and debugPath (no separator is inserted).
%   dataFileName     - MAT-file holding a variable named 'attributes'.
%   numNodes         - expected number of rows of 'attributes'.
%   expectedAttrCols - expected number of columns of 'attributes'.
%   maxAttStat       - (optional, default 0.50) an attribute/country code is
%                      used only if it appears in at most this fraction of
%                      the nodes.
%   inSelectedAttr   - (optional, default all ones) per-column selection mask;
%                      columns whose entry is 0 are removed from the output.
%   debug            - (optional, default 0) 1 to save the result to disk.
%   debugPath        - (optional, default 'debug/') sub-path appended to
%                      dataFilePath for the debug output.
%
% Outputs:
%   outAttributes - mapped attribute matrix, restricted to the kept columns.
%   attUpperRange - per-column maximum of outAttributes.
%   selectedAttr  - selection mask after the statistics-based filtering.
%   attStat       - per-column fraction of nodes with a non-zero value
%                   (computed before the columns are removed).
%
% On invalid parameters or a size mismatch a message is printed and the
% function returns with the not-yet-computed outputs left empty.
function [outAttributes, attUpperRange, selectedAttr, attStat] = PrepareAttributes5(dataFilePath, dataFileName, numNodes, expectedAttrCols, maxAttStat, inSelectedAttr, debug, debugPath)

% Pre-assign every output so that the early returns below do not raise
% "output argument not assigned" in callers that request outputs.
outAttributes = [];
attUpperRange = [];
selectedAttr = [];
attStat = [];

outNoneValue = 0;
inNoneValue = -1;

countryCol = 1;
countryCompress = 1;
countryOffset = 129; % min value is -128 => real value starting from 1
%countryUnknown = -999;

if nargin < 4
    fprintf('PrepareAttributes - Invalid parameters. expecting: dataFilePath, dataFileName, numNodes, expectedAttrCols\n');
    return;
end
if nargin < 5
    maxAttStat = 0.50; % use this attribute only if it appears less than this percentage
end
if nargin < 6
    selectedAttr = ones(1,expectedAttrCols);
else
    selectedAttr = inSelectedAttr;
end
if nargin < 7
    debug = 0;
end
if nargin < 8
    debugPath = 'debug/';
end

outFullPath = strcat(dataFilePath, debugPath);
outFullName = strcat(outFullPath,dataFileName);

if debug == 1 && exist(outFullPath, 'dir') ~= 7
    mkdir(outFullPath);
end

% load raw data from file
% sigal - 12.6.13 - use mat binary file
%attributes = LoadAsciiAttributes(dataFilePath, dataFileName, numNodes, debug, debugPath);
fprintf('%s ---- %s', dataFilePath, dataFileName);
loaded = load(strcat(dataFilePath, dataFileName), 'attributes');
attributes = loaded.attributes;
m = size(attributes,1); % num nodes/lines
n = size(attributes,2); % num attributes/cols

if n ~= expectedAttrCols || m ~= numNodes
    fprintf('PrepareAttributes - Invalid size: expecting (%dx%d), got (%dx%d)\n',numNodes,expectedAttrCols,m,n);
    return;
end

% first process all attributes, then filter by selected and max statistics
% NOTE(review): converting to double avoids integer saturation when adding
% countryOffset, in case 'attributes' is stored as an integer class (e.g.
% int8) - confirm the stored class. It is a no-op for double data.
rawValues = double(attributes);

% games/groups (all columns except country): no threshold, i.e. binary value
outAttributes = ones(m,n);
outAttributes(rawValues == inNoneValue) = outNoneValue;

% country column: shift the code by countryOffset, negative results => none
if n >= countryCol
    countryCodes = rawValues(:, countryCol) + countryOffset;
    countryCodes(countryCodes < 0) = outNoneValue;
    outAttributes(:, countryCol) = countryCodes;
end

useCountry = n >= countryCol && selectedAttr(countryCol) == 1;
if useCountry
    countryCodes = outAttributes(:, countryCol);

    % calculate COUNTRY statistics and filter according to maxAttStat
    for code = 1:max(countryCodes)
        hits = (countryCodes == code);
        if sum(hits)/numNodes > maxAttStat
            countryCodes(hits) = 0;
        end
    end

    % remove empty country indexes and shift codes: remap the remaining
    % positive codes to consecutive 1..K in increasing order. A rank remap
    % also closes runs of several consecutive unused codes.
    if countryCompress
        present = countryCodes > 0;
        if any(present)
            [~, ~, rankIdx] = unique(countryCodes(present));
            countryCodes(present) = rankIdx(:);
        end
    end

    outAttributes(:, countryCol) = countryCodes;
end

% calculate statistics and filter according to zero & maxAttStat
attStat = zeros(1,n);
for a = 1:n
    attStat(a) = nnz(outAttributes(:,a))/numNodes;
    isEmptyAttr = (attStat(a) == 0);
    % don't filter country by frequency (its codes were filtered above)
    isTooCommon = selectedAttr(a) > 0 && attStat(a) > maxAttStat && a ~= countryCol;
    if isEmptyAttr || isTooCommon
        selectedAttr(a) = 0;
    end
end

% filter according to updated selectedAttr
outAttributes(:, selectedAttr(1:n) == 0) = [];

% explicit dimension: per-column maximum even when there is a single row
attUpperRange = max(outAttributes, [], 1);
%attLowRange = min(attributes);

if debug == 1
    % numThreshold was never defined in this function, so this branch used
    % to fail with an undefined-variable error. Values are binary here (no
    % threshold), so 0 is used for the file-name suffix.
    numThreshold = 0;
    outFullName = sprintf('%s.att2%d', outFullName, numThreshold);
    save(strcat(outFullName,'.mat'), 'outAttributes');
    SaveIntMatrixToFile(strcat(outFullName,'.txt'), outAttributes);
end
|