function [outAttributes, attUpperRange, selectedAttr, attStat] = PrepareAttributes5(dataFilePath, dataFileName, expectedAttrCols, maxAttStat, inSelectedAttr, debug, debugPath)
% PrepareAttributes5  Map raw attributes to categories.
%   Use 0 for null/no value, 1 for unknown/private value (if it exists) and
%   then the real values.
%
%   The country column (column 1) is shifted by a fixed offset so that codes
%   start at 1; every other column (games/groups) is mapped to a binary
%   value: 0 when the raw entry equals -1, otherwise 1.
%
% Inputs:
%   dataFilePath     - folder prefix; concatenated directly with dataFileName,
%                      so it must already end with a path separator.
%   dataFileName     - MAT-file containing a variable named 'attributes'
%                      (rows = nodes, columns = attributes).
%   expectedAttrCols - number of columns 'attributes' must have.
%   maxAttStat       - (optional, default 0.50) an attribute / country code is
%                      used only if it appears in at most this fraction of rows.
%   inSelectedAttr   - (optional, default all ones) 1 x expectedAttrCols mask
%                      of columns to keep.
%   debug            - (optional, default 0) when 1, the result is also saved
%                      under dataFilePath/debugPath.
%   debugPath        - (optional, default 'debug/') debug sub-folder.
%
% Outputs:
%   outAttributes - mapped attributes with the unselected columns removed.
%   attUpperRange - per-column maximum of outAttributes.
%   selectedAttr  - updated column mask (cleared for empty columns and for
%                   non-country columns whose statistic exceeds maxAttStat).
%   attStat       - 1 x n fraction of non-zero rows for every original column.
%
% On invalid parameters or a size mismatch a message is printed and all
% outputs are returned empty.

% Give every output a value so that the early-return paths below do not
% raise "output argument not assigned" in the caller.
outAttributes = [];
attUpperRange = [];
selectedAttr  = [];
attStat       = [];

outNoneValue    = 0;
inNoneValue     = -1;
countryCol      = 1;
countryCompress = 1;
countryOffset   = 129;    % min value is -128 => real value starting from 1
%countryUnknown = -999;

% The first three parameters are mandatory; the remaining four are optional.
if nargin < 3
    fprintf('PrepareAttributes - Invalid parameters. expecting: dataFilePath, dataFileName, expectedAttrCols\n');
    return;
end
if nargin < 4 || isempty(maxAttStat)
    maxAttStat = 0.50;    % use this attribute only if it appears less than this percentage
end
if nargin < 5 || isempty(inSelectedAttr)
    selectedAttr = ones(1, expectedAttrCols);
else
    selectedAttr = inSelectedAttr;
end
if nargin < 6 || isempty(debug)
    debug = 0;
end
if nargin < 7 || isempty(debugPath)
    debugPath = 'debug/';
end

outFullPath = strcat(dataFilePath, debugPath);
outFullName = strcat(outFullPath, dataFileName);
if debug == 1 && exist(outFullPath, 'dir') ~= 7
    mkdir(outFullPath);
end

% load raw data from file
% sigal - 12.6.13 - use mat binary file
%attributes = LoadAsciiAttributes(dataFilePath, dataFileName, numNodes, debug, debugPath);
fprintf('%s ---- %s', dataFilePath, dataFileName);
loaded = load(strcat(dataFilePath, dataFileName), 'attributes');
if ~isfield(loaded, 'attributes')
    fprintf('PrepareAttributes - variable ''attributes'' not found in %s\n', dataFileName);
    return;
end
% Work in double: if the matrix is stored in an integer class (the -128
% minimum noted above suggests int8 -- TODO confirm), adding the offset in
% that class would saturate and collapse the larger country codes.
attributes = double(loaded.attributes);

m = size(attributes, 1);    % num nodes/lines
n = size(attributes, 2);    % num attributes/cols
if n ~= expectedAttrCols
    fprintf('PrepareAttributes - Invalid size: expecting (%d), got (%dx%d)\n', expectedAttrCols, m, n);
    return;
end

% first process all attributes, then filter by selected and max statistics
outAttributes = zeros(m, n);

% country column: shift by the offset; negative results mean "no value"
shifted = attributes(:, countryCol) + countryOffset;
shifted(shifted < 0) = outNoneValue;
outAttributes(:, countryCol) = shifted;

% games/groups (all other columns): no threshold, i.e. binary value
otherCols = setdiff(1:n, countryCol);
outAttributes(:, otherCols) = (attributes(:, otherCols) ~= inNoneValue);

countrySelected = (selectedAttr(countryCol) == 1);

% calculate COUNTRY statistics and filter according to maxAttStat
if countrySelected
    codes   = outAttributes(:, countryCol);
    present = unique(codes(codes > 0));
    for k = 1:numel(present)
        hits = (codes == present(k));
        if sum(hits) / m > maxAttStat
            codes(hits) = 0;    % too common to be informative
        end
    end
    outAttributes(:, countryCol) = codes;
end

% remove empty country indexes and shift codes
if countryCompress && countrySelected
    codes = outAttributes(:, countryCol);
    used  = (codes > 0);
    % Replace every used code by its rank among the used codes. This closes
    % all gaps, including runs of several consecutive unused codes, which a
    % single shift-down per index does not.
    [~, ~, rankIdx] = unique(codes(used));
    codes(used) = rankIdx;
    outAttributes(:, countryCol) = codes;
end

% calculate statistics and filter according to zero & maxAttStat
attStat = zeros(1, n);
for a = 1:n
    attStat(a) = nnz(outAttributes(:, a)) / m;
    if attStat(a) == 0
        selectedAttr(a) = 0;
    elseif selectedAttr(a) > 0 && attStat(a) > maxAttStat && a ~= countryCol    % don't filter country
        selectedAttr(a) = 0;
    end
end

% filter according to updated selectedAttr
outAttributes(:, selectedAttr(1:n) == 0) = [];

% Explicit dimension so that a single-row matrix still yields one value per
% column instead of one scalar.
attUpperRange = max(outAttributes, [], 1);
%attLowRange = min(attributes);

if debug == 1
    % This version has no threshold parameter (binary mapping), so the
    % debug file name carries no threshold suffix.
    outFullName = sprintf('%s.att2', outFullName);
    save(strcat(outFullName, '.mat'), 'outAttributes');
    SaveIntMatrixToFile(strcat(outFullName, '.txt'), outAttributes);
end
end