You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

PrepareAttributes5.m 4.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. Map attribute to categories
  2. Use 0 for null/no value, 1 for an unknown/private value (if one exists), and the real values after that
  function [outAttributes, attUpperRange, selectedAttr, attStat] = PrepareAttributes5(dataFilePath, dataFileName, expectedAttrCols, maxAttStat, inSelectedAttr, debug, debugPath)
  %PREPAREATTRIBUTES5 Map raw node attributes to categorical codes and filter them.
  %
  %   Loads the variable 'attributes' (one row per node, one column per
  %   attribute) from the file strcat(dataFilePath, dataFileName), then:
  %     * column 1 (country): adds an offset of 129 to the raw value, zeroes
  %       negative results, zeroes every code whose frequency exceeds
  %       maxAttStat, and renumbers the remaining codes densely as 1..K;
  %     * all other columns: 0 when the raw value is -1, otherwise 1;
  %     * drops every column that is all-zero, was deselected by the caller,
  %       or (non-country columns only) is non-zero in more than maxAttStat
  %       of the rows.
  %
  %   Inputs
  %     dataFilePath     - directory prefix of the data file (concatenated as-is).
  %     dataFileName     - file name appended to dataFilePath.
  %     expectedAttrCols - number of columns 'attributes' must have.
  %     maxAttStat       - (optional, default 0.50) maximum allowed frequency.
  %     inSelectedAttr   - (optional, default all ones) 1-by-expectedAttrCols
  %                        vector; a zero entry deselects that column.
  %     debug            - (optional, default 0) when 1, the result is saved
  %                        to .mat and .txt files under the debug directory.
  %     debugPath        - (optional, default 'debug/') sub-directory appended
  %                        to dataFilePath for debug output.
  %
  %   Outputs
  %     outAttributes - processed matrix containing only the kept columns.
  %     attUpperRange - column-wise maximum of outAttributes.
  %     selectedAttr  - updated selection vector (length expectedAttrCols).
  %     attStat       - fraction of non-zero rows per original column.
  %
  %   On invalid input a message is printed and all outputs are returned empty.

  outNoneValue = 0;
  inNoneValue = -1;
  countryCol = 1;
  countryCompress = 1;
  countryOffset = 129; % min value is -128 => real value starting from 1

  % Give every output a value so that the early returns below do not raise
  % an "output argument not assigned" error in the caller.
  outAttributes = [];
  attUpperRange = [];
  selectedAttr = [];
  attStat = [];

  % The first three arguments are mandatory; each optional argument is
  % defaulted only when the caller did not supply it.
  if nargin < 3
      fprintf('PrepareAttributes - Invalid parameters. expecting: dataFilePath, dataFileName, expectedAttrCols\n');
      return;
  end
  if nargin < 4
      maxAttStat = 0.50; % use this attribute only if it appears less than this percentage
  end
  if nargin < 5
      selectedAttr = ones(1, expectedAttrCols);
  else
      selectedAttr = inSelectedAttr;
  end
  if nargin < 6
      debug = 0;
  end
  if nargin < 7
      debugPath = 'debug/';
  end

  outFullPath = strcat(dataFilePath, debugPath);
  outFullName = strcat(outFullPath, dataFileName);
  if debug == 1 && exist(outFullPath, 'dir') ~= 7
      mkdir(outFullPath);
  end

  % Load the raw data from the binary mat file. Loading into a struct keeps
  % the file's contents from silently shadowing local variables.
  fprintf('%s ---- %s', dataFilePath, dataFileName);
  loaded = load(strcat(dataFilePath, dataFileName), 'attributes');
  attributes = loaded.attributes;

  m = size(attributes, 1); % num nodes/lines
  n = size(attributes, 2); % num attributes/cols
  if n ~= expectedAttrCols
      fprintf('PrepareAttributes - Invalid size: expecting (%d), got (%dx%d)\n', expectedAttrCols, m, n);
      selectedAttr = [];
      return;
  end

  % First process all attributes, then filter by selection and statistics.
  outAttributes = zeros(m, n);

  % Country column: shift raw codes by the offset. The cast to double guards
  % against saturation in case 'attributes' is stored as a narrow integer
  % type (assumption based on the -128 minimum noted above - TODO confirm).
  countryValues = double(attributes(:, countryCol)) + countryOffset;
  countryValues(countryValues < 0) = outNoneValue;

  % Games/groups (all other columns): no threshold, i.e. binary value.
  otherCols = setdiff(1:n, countryCol);
  outAttributes(:, otherCols) = double(attributes(:, otherCols) ~= inNoneValue);

  if selectedAttr(countryCol) == 1
      % Zero out every country code that is more frequent than maxAttStat.
      codes = unique(countryValues(countryValues > 0));
      for k = 1:numel(codes)
          sameCode = (countryValues == codes(k));
          if sum(sameCode) / m > maxAttStat
              countryValues(sameCode) = outNoneValue;
          end
      end

      % Remove unused country codes by renumbering the remaining ones as
      % 1..K in ascending order. Ranking through unique() closes runs of
      % several consecutive unused codes, which a single shift-by-one pass
      % per index does not.
      if countryCompress
          hasCountry = countryValues > 0;
          if any(hasCountry)
              [~, ~, denseCode] = unique(countryValues(hasCountry));
              countryValues(hasCountry) = denseCode;
          end
      end
  end
  outAttributes(:, countryCol) = countryValues;

  % Calculate statistics and filter according to zero & maxAttStat.
  attStat = zeros(1, n);
  for a = 1:n
      attStat(a) = nnz(outAttributes(:, a)) / m;
      if attStat(a) == 0
          selectedAttr(a) = 0;
      elseif selectedAttr(a) > 0 && attStat(a) > maxAttStat && a > 1 % don't filter country
          selectedAttr(a) = 0;
      end
  end

  % Keep only the columns that are still selected.
  outAttributes = outAttributes(:, selectedAttr ~= 0);

  attUpperRange = max(outAttributes);

  if debug == 1
      % The file name no longer embeds a threshold: this function has no
      % threshold value to put there.
      outFullName = sprintf('%s.att2', outFullName);
      save(strcat(outFullName, '.mat'), 'outAttributes');
      SaveIntMatrixToFile(strcat(outFullName, '.txt'), outAttributes);
  end
  end