You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PrepareAttributes2.m 3.9KB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  % PrepareAttributes2  Map raw node attributes to categorical codes.
  %
  % Output coding: 0 means null/no value; real values start at 1.
  %   * Country column (column 1): raw value + 129, values that are still
  %     negative become 0. Country codes held by more than maxAttStat of the
  %     nodes are reset to 0, then the remaining codes are renumbered to the
  %     dense range 1..K (order preserved).
  %   * Every other column (games/groups): 0 when the raw value is -1,
  %     otherwise 1 (binary membership, no threshold).
  % Columns that end up all-zero, that were deselected by the caller, or
  % (non-country only) whose non-zero ratio exceeds maxAttStat are removed
  % from outAttributes.
  %
  % Inputs:
  %   dataFilePath     - folder of the data file; also the parent of debugPath
  %   dataFileName     - name of the data file
  %   numNodes         - expected number of rows (nodes)
  %   expectedAttrCols - expected number of columns (attributes)
  %   maxAttStat       - (optional, default 0.50) maximum allowed frequency
  %   selectedAttr     - (optional, default all ones) 1 x expectedAttrCols
  %                      selection flags, 0 = drop the column
  %   debug            - (optional, default 0) 1 = save the result to disk
  %   debugPath        - (optional, default 'debug/') sub-folder for debug output
  %
  % Outputs:
  %   outAttributes - coded attribute matrix holding the surviving columns
  %   attUpperRange - per-column maximum of outAttributes
  %   selectedAttr  - selection flags after filtering
  %   attStat       - per-column ratio of non-zero entries, computed over all
  %                   expectedAttrCols columns before removal
  %
  % On invalid parameters or a size mismatch a message is printed and
  % outAttributes, attUpperRange and attStat are returned empty.
  function [outAttributes, attUpperRange, selectedAttr, attStat] = PrepareAttributes2(dataFilePath, dataFileName, numNodes, expectedAttrCols, maxAttStat, selectedAttr, debug, debugPath)
      outNoneValue = 0;
      inNoneValue = -1;
      countryCol = 1;
      countryCompress = 1;
      countryOffset = 129; % min value is -128 => real value starting from 1

      % Give every output a value up front so the early returns below do not
      % raise "output argument not assigned" in the caller.
      outAttributes = [];
      attUpperRange = [];
      attStat = [];

      if nargin < 4
          fprintf('PrepareAttributes - Invalid parameters. expecting: dataFilePath, dataFileName, numNodes, expectedAttrCols[, maxAttStat, selectedAttr, debug, debugPath]\n');
          selectedAttr = []; % cannot have been supplied when nargin < 4
          return;
      end
      if nargin < 5
          maxAttStat = 0.50; % use this attribute only if it appears less than this percentage
      end
      if nargin < 6
          selectedAttr = ones(1, expectedAttrCols);
      end
      if nargin < 7
          debug = 0;
      end
      if nargin < 8
          debugPath = 'debug/';
      end

      outFullPath = strcat(dataFilePath, debugPath);
      outFullName = strcat(outFullPath, dataFileName);
      if debug == 1 && exist(outFullPath, 'dir') ~= 7
          mkdir(outFullPath);
      end

      % load raw data from file
      attributes = LoadAsciiAttributes(dataFilePath, dataFileName, numNodes, debug, debugPath);
      m = size(attributes, 1); % num nodes/lines
      n = size(attributes, 2); % num attributes/cols
      if n ~= expectedAttrCols || m ~= numNodes
          fprintf('PrepareAttributes - Invalid size: expecting (%dx%d), got (%dx%d)\n', numNodes, expectedAttrCols, m, n);
          return;
      end

      % first process all attributes, then filter by selected and max statistics
      outAttributes = zeros(m, n);

      % Country column: shift by the offset. The cast keeps an integer-typed
      % input from saturating when the offset is added.
      % NOTE(review): with offset 129 a raw -1 maps to 128, not to "no value";
      % confirm which raw value marks a missing country.
      countryCodes = double(attributes(:, countryCol)) + countryOffset;
      countryCodes(countryCodes < 0) = outNoneValue;
      outAttributes(:, countryCol) = countryCodes;

      % games/groups (all other columns): no threshold, i.e. binary value
      otherCols = [1:countryCol-1, countryCol+1:n];
      outAttributes(:, otherCols) = double(attributes(:, otherCols) ~= inNoneValue);

      % calculate COUNTRY statistics and filter according to maxAttStat
      if selectedAttr(countryCol) == 1
          countryCodes = outAttributes(:, countryCol);
          for code = 1:max(countryCodes)
              holders = (countryCodes == code);
              if sum(holders) / numNodes > maxAttStat
                  countryCodes(holders) = outNoneValue;
              end
          end
          outAttributes(:, countryCol) = countryCodes;
      end

      % remove empty country indexes and shift codes
      if countryCompress && selectedAttr(countryCol) == 1
          countryCodes = outAttributes(:, countryCol);
          inUse = countryCodes > 0;
          % The third output of unique ranks each value among the sorted
          % distinct values, which closes every gap (also runs of several
          % consecutive unused codes) in one step.
          [~, ~, denseCode] = unique(countryCodes(inUse));
          countryCodes(inUse) = denseCode;
          outAttributes(:, countryCol) = countryCodes;
      end

      % calculate statistics and filter according to zero & maxAttStat
      attStat = zeros(1, n);
      for a = 1:n
          attStat(a) = nnz(outAttributes(:, a)) / numNodes;
          if attStat(a) == 0
              selectedAttr(a) = 0;
          elseif selectedAttr(a) > 0 && attStat(a) > maxAttStat && a ~= countryCol % don't filter country
              selectedAttr(a) = 0;
          end
      end

      % filter according to updated selectedAttr
      outAttributes(:, selectedAttr(1:n) == 0) = [];

      % Explicit dimension so a single-row matrix still yields one maximum
      % per column instead of one scalar.
      attUpperRange = max(outAttributes, [], 1);

      if debug == 1
          % This variant has no threshold, so the debug file name carries
          % no threshold suffix.
          outFullName = sprintf('%s.att2', outFullName);
          save(strcat(outFullName, '.mat'), 'outAttributes');
          SaveIntMatrixToFile(strcat(outFullName, '.txt'), outAttributes);
      end