You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

Facebook_Crawl2.m 3.5KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  % Facebook_Crawl2
  % Crawls the anonymized Facebook social-graph file starting from a random
  % node and expands nodes in FIFO (breadth-first) order. Each expanded
  % neighbor list is appended to `graph`; whenever the number of collected
  % entries hits one of `graph_sizes`, and once more when the crawl ends, the
  % network is built with BuildFacebookData and saved under dataset_out_dir.
  % The whole crawl is repeated for several random start nodes.
  %
  % Depends on project helpers that are not defined in this file:
  %   FindFacebookNeighbors_binary(fileName, node) - neighbor lookup; the
  %       script treats a return value of -1 as "no neighbors found".
  %   BuildFacebookData(graph)                     - builds the saved data.
  %   LogMsg(msg)                                  - logging.
  clc;
  clear all;

  max_queue_size = 200000;      % the pending-node queue is truncated to this length
  max_facebook_node = 957359;   % neighbors with a larger id are discarded

  %D:\SocialNets\Facebook\Data
  fileName = 'd:\\SocialNets\\Facebook\\Data\\mhrw-socialgraph-anonymized.txt';
  dataset_out_dir = 'd:\\SocialNets\\DataSets6\\Facebook_June13\\';

  % Create the output folder if needed. exist(...,'dir') returns 7 for an
  % existing folder; it replaces isdir, which MathWorks no longer recommends.
  if exist(dataset_out_dir, 'dir') ~= 7
      mkdir(dataset_out_dir);
  end

  %graph_size = 16384;
  graph_size = 1000000;         % stop crawling once this many entries are collected
  % Intermediate sizes at which a snapshot of the network is saved.
  graph_sizes = [2000,5000,10000,15000,20000,25000,30000,35000,40000,50000,60000,70000,75000,80000,90000];
  debugFlag = 0;                % set to 1 to print graph/queue sizes per expanded node
  numIter = 6; %0;

  % NOTE(review): the loop runs numIter-1 times and the log line reports
  % iter+1, so runs are labelled "2 of 6" .. "6 of 6" - confirm this is intended.
  for iter = 1 : numIter-1
      % NOTE(review): floor(rand*max) yields 0 .. max_facebook_node-1; confirm
      % that 0 is a valid node id in the dataset.
      bfs_start_node = floor(rand(1) * max_facebook_node); %some node from the mhrw dataset
      %bfs_start_node =21632297; %most connected node in the uni dataset
      %bfs_start_node =72261595;

      % Timestamp for the log line: year_month_day_hourminute_seconds.
      % NOTE(review): hour and minute are concatenated without a separator.
      date_now = clock;
      date_now = strcat(num2str(date_now(1)),'_',num2str(date_now(2)),'_', num2str(date_now(3)),'_', num2str(date_now(4)), num2str(date_now(5)),'_', num2str(date_now(6)));
      LogMsg(sprintf('%s Start Facebook_Crawl (%d of %d) starting from node %d ...',date_now, iter+1, numIter, bfs_start_node));

      curr_node = bfs_start_node;
      queue = [];               % row vector of nodes waiting to be expanded (FIFO)
      developed_nodes = [];     % nodes that have already been expanded
      graph = [];               % becomes a cell array on the first graph{...} assignment
      curr_graph_size = 0;

      while curr_graph_size < graph_size
          %fprintf('Current Node: %d\n', curr_node);
          developed_nodes = [developed_nodes, curr_node];
          curr_neighbors = FindFacebookNeighbors_binary(fileName, curr_node);

          % For an array operand, `if` is true only when the comparison result is
          % non-empty and every element is nonzero.
          if curr_neighbors ~= -1
              curr_neighbors = curr_neighbors(curr_neighbors <= max_facebook_node);
              for j = 1 : size(curr_neighbors,1)
                  %queue = union(curr_neighbors(j,:), queue); %add the new nodes to the queue but make it exclusive (no repetitions)
                  % Append only nodes that are not already queued (no repetitions).
                  queue = [queue, setdiff(curr_neighbors(j,:),queue)];
                  %queue = [queue, curr_neighbors(j,:)]; %BFS
                  if size(queue,2) > max_queue_size
                      queue = queue(1:max_queue_size);
                  end

                  curr_graph_size = curr_graph_size+1;
                  graph{curr_graph_size} = curr_neighbors;

                  % save incremental networks
                  % if curr_graph_size == 1024 || curr_graph_size==2048 || curr_graph_size==4096 || curr_graph_size==8192
                  if any(graph_sizes == curr_graph_size)
                      data = BuildFacebookData(graph);
                      networkName = sprintf('%sfacebook_sparse_%d_%d_%d', dataset_out_dir,curr_graph_size,iter,bfs_start_node);
                      LogMsg(sprintf('Saving %s',networkName));
                      % NOTE(review): save without a variable list stores the entire
                      % workspace, not only `data` - confirm this is intended.
                      save(networkName);
                      clear data;
                  end
              end
          end

          %queue = setdiff(queue, developed_nodes); %remove nodes that were already developed from the queue - also sorts the queue
          if debugFlag == 1
              fprintf('graph size: %d, queue size: %d\n' , size(graph,2), size(queue,2));
          end

          % Drop already-expanded nodes from the front of the queue.
          while ~isempty(queue) && ismember(queue(1), developed_nodes)
              queue(1) = [];
          end

          % Stop when nothing is left to expand. isempty is required here: after
          % elements are deleted the queue is 1x0, so the former size(queue,1)==0
          % test missed it and the following queue(1) access raised an index error,
          % which also skipped the final save.
          if isempty(queue)
              fprintf('queue is empty');
              break;
          end

          % Pop the next node to expand.
          curr_node = queue(1);
          queue(1) = [];
          %curr_node = queue(ceil(rand(1) * size(queue,2))); %select a random node from the queue
      end

      clear queue;
      % pack;

      % save final network
      data = BuildFacebookData(graph);
      networkName = sprintf('%sfacebook_sparse_%d_%d_%d', dataset_out_dir,curr_graph_size,iter,bfs_start_node);
      LogMsg(sprintf('Saving %s',networkName));
      save(networkName);
      clear data;
      % pack;
  end