You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

Facebook_Crawl.m 3.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  % Facebook_Crawl
  % Script that crawls the anonymized MHRW Facebook social graph starting from
  % randomly drawn start nodes. Nodes are expanded in FIFO (breadth-first)
  % order; the collected neighbour lists are converted with BuildFacebookData
  % and saved whenever the crawl reaches one of the sizes in graph_sizes, and
  % once more when the crawl of a start node finishes.
  %
  % Depends on project functions defined elsewhere: FindFacebookNeighbors_binary,
  % BuildFacebookData and LogMsg.
  clc;
  clear all;

  % ---- configuration -------------------------------------------------------
  max_queue_size = 200000;      % the pending-node queue is truncated to this length
  max_facebook_node = 957359;   % neighbours with a larger id are dropped; also the range of random start ids
  %D:\SocialNets\Facebook\Data
  fileName = 'd:\\SocialNets\\Facebook\\Data\\mhrw-socialgraph-anonymized.txt';
  dataset_out_dir = 'd:\\SocialNets\\DataSets6\\Facebook_June13\\';
  if isdir(dataset_out_dir) == 0
      mkdir(dataset_out_dir);
  end
  %graph_size = 16384;
  graph_size = 100000;          % a crawl stops once this many entries were collected
  % Intermediate sizes at which a snapshot is saved.
  graph_sizes = [2000,5000,10000,15000,20000,25000,30000,35000,40000,50000,60000,70000,75000,80000,90000]; %,100000];
  debugFlag = 0;
  numIter = 10; %0;
  % Start nodes that must not be drawn again; every new start node is appended.
  startNodes = [779983 867168 121571 874428 605394 93381 266622 523561 916677 923744];

  bfs_start_node = floor(rand(1) * max_facebook_node); %some node from the mhrw dataset
  % NOTE(review): the loop starts at 4, presumably because iterations 0-3 were
  % completed in an earlier run - confirm before re-running from scratch.
  for iter = 4 : numIter-1
      % Redraw until the start node differs from every node in startNodes.
      while find(startNodes==bfs_start_node)
          bfs_start_node = floor(rand(1) * max_facebook_node); %some node from the mhrw dataset
      end
      startNodes = [startNodes bfs_start_node];
      %bfs_start_node =21632297; %most connected node in the uni dataset
      %bfs_start_node =72261595;

      % Timestamp for the log line, built from the components of clock.
      date_now = clock;
      date_now = strcat(num2str(date_now(1)),'_',num2str(date_now(2)),'_', num2str(date_now(3)),'_', num2str(date_now(4)), num2str(date_now(5)),'_', num2str(date_now(6)));
      LogMsg(sprintf('%s Start Facebook_Crawl (%d of %d) starting from node %d ...',date_now, iter+1, numIter, bfs_start_node));

      curr_node = bfs_start_node;
      queue = [];               % pending nodes, kept as a row vector (FIFO)
      developed_nodes = [];     % nodes whose neighbours were already requested
      graph = [];               % becomes a cell array of neighbour lists
      curr_graph_size = 0;

      while curr_graph_size < graph_size
          %fprintf('Current Node: %d\n', curr_node);
          developed_nodes = [developed_nodes, curr_node];
          curr_neighbors = FindFacebookNeighbors_binary(fileName, curr_node);
          % NOTE(review): -1 appears to be the "no data" result of
          % FindFacebookNeighbors_binary - confirm. For a non-scalar result
          % this condition holds only when no element equals -1.
          if curr_neighbors ~= -1
              curr_neighbors = curr_neighbors(curr_neighbors <= max_facebook_node);
              for j = 1 : size(curr_neighbors,1)
                  %queue = union(curr_neighbors(j,:), queue); %add the new nodes to the queue but make it exclusive (no repetitions)
                  % Append only the neighbours that are not queued yet.
                  queue = [queue, setdiff(curr_neighbors(j,:),queue)];
                  %queue = [queue, curr_neighbors(j,:)]; %BFS
                  if size(queue,2) > max_queue_size
                      queue = queue(1:max_queue_size);
                  end
                  curr_graph_size = curr_graph_size+1;
                  graph{curr_graph_size} = curr_neighbors;

                  % save incremental networks
                  % if curr_graph_size == 1024 || curr_graph_size==2048 || curr_graph_size==4096 || curr_graph_size==8192
                  if find(graph_sizes==curr_graph_size)
                      data = BuildFacebookData(graph);
                      networkName = sprintf('%sfacebook_sparse_%d_%d_%d', dataset_out_dir,curr_graph_size,iter,bfs_start_node);
                      LogMsg(sprintf('Saving %s',networkName));
                      % NOTE(review): save without a variable list writes the
                      % whole workspace, not only data - confirm this is intended.
                      save(networkName);
                      clear data;
                  end
              end
          end
          %queue = setdiff(queue, developed_nodes); %remove nodes that were already developed from the queue - also sorts the queue
          if debugFlag == 1
              fprintf('graph size: %d, queue size: %d\n' , size(graph,2), size(queue,2));
          end

          % Take nodes from the front of the queue until one is found that has
          % not been developed yet. isempty is used because a row vector that
          % was emptied element by element is 1-by-0, so its row count is 1
          % and a size(queue,1)==0 test would not detect it.
          while ismember(curr_node, developed_nodes) && ~isempty(queue)
              curr_node = queue(1);
              queue(1) = [];
          end
          if ismember(curr_node, developed_nodes)
              % The queue ran dry without yielding a new node: stop this crawl
              % and fall through to the final save below.
              fprintf('queue is empty');
              break;
          end
          %curr_node = queue(ceil(rand(1) * size(queue,2))); %select a random node from the queue
      end
      clear queue;
      % pack;

      % save final network
      data = BuildFacebookData(graph);
      networkName = sprintf('%sfacebook_sparse_%d_%d_%d', dataset_out_dir,curr_graph_size,iter,bfs_start_node);
      LogMsg(sprintf('Saving %s',networkName));
      save(networkName);
      clear data;
      LogMsg(strcat('startNodes ', sprintf(' %d ',startNodes)));
      % pack;
  end