'''
Created on Aug 18, 2010

@author: dwmclary
'''
__author__ = "D. McClary (dan.mcclary@northwestern.edu)"
from .. import hdmc
from ..hdmc import hdfs
import hadoop_config as config
import networkx as nx
import os
import sys
import string
from GraphLoader import GraphLoader

def page_rank(G, name=None, max_iterations=10):
    '''Compute PageRank in parallel for the networkx graph G using Hadoop streaming.'''

    wd = config.GraphReduce_location
    # Every node starts with an equal share of the total rank mass.
    ranks = dict(zip(map(str, G.nodes()), [1.0/len(G)]*len(G)))
    G = GraphLoader(G)
    # Write the graph as an adjacency list and stage it in HDFS as the
    # first iteration's input.
    if name:
        G.write_adjlist(name)
    else:
        G.write_adjlist("pbfs_input.adjlist")
    hdfs_handle = G.graph_handle.split("/")[-1]
    hdfs.rm(hdfs_handle+"/page_rank")
    hdfs.copyToHDFS(G.graph_handle, hdfs_handle+"/page_rank/part-00000")
    ranking = parallel_page_rank(G, hdfs_handle, ranks, 0, max_iterations)
    return ranking

def parallel_page_rank(G, hdfs_handle, old_ranks, iterations, max_iterations):
    '''Run one distributed PageRank iteration over the adjacency list in HDFS,
    then recurse until the ranks converge or max_iterations is reached.'''
    # First pass: distribute each node's rank mass along its out-edges.
    hdfs.rm("PPR")
    base_path = os.path.realpath(__file__).split("/")
    base_path = "/".join(base_path[0:-1])
    hadoop_call = hdmc.build_generic_hadoop_call(base_path+"/PageRank_mapper.py", base_path+"/PageRank_reducer.py", hdfs_handle+"/page_rank", "PPR", ["rank_mass"])
    hdmc.execute_and_wait(hadoop_call)
    # Sum the rank mass lost to dangling nodes (reducer lines tagged "#lost_mass:")
    # across all output parts and write the total to a local lost_mass file,
    # which the next Hadoop call references.
    listing = hdfs.ls("PPR/part*")["stdout"].rstrip().split("\n")
    lost_mass = 0.0
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            line = line.strip().split()
            if "#lost_mass:" in line:
                lost_mass += float(line[1])
    os.system("echo " + str(lost_mass) + " > lost_mass")

    # Replace this iteration's input with its output, then run a second pass
    # that redistributes the lost mass across all nodes.
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")
    hadoop_call = hdmc.build_generic_hadoop_call(base_path+"/LostMass_mapper.py", base_path+"/LostMass_reducer.py", hdfs_handle+"/page_rank", "PPR", ["rank_mass", "lost_mass"])
    hdmc.execute_and_wait(hadoop_call)

    # Collect the updated ranks (fields tagged "pr:") from the new output,
    # reusing the part numbers found in the first job's listing.
    rank_sum = 0.0
    ranks = {}
    for entry in listing:
        last_part = entry.split("part-")
        data = hdfs.cat("PPR/part-"+last_part[-1])["stdout"].split("\n")
        for line in data:
            pr_value = line.strip().split("pr:")
            if len(pr_value) > 1:
                rank = float(pr_value[-1])
                node = pr_value[0].split()[0]
                ranks[node] = rank
                rank_sum += rank

    # The iteration has converged when no node's rank moved by more than 1e-4.
    converged = True
    for key in ranks.keys():
        if abs(ranks[key] - old_ranks[key]) > 0.0001:
            converged = False
            break

    iterations += 1

    # Promote this iteration's output to be the next iteration's input.
    hdfs.rm(hdfs_handle+"/page_rank/part*")
    hdfs.mv("PPR/part*", hdfs_handle+"/page_rank/")
    hdfs.rm("PPR")

    if not converged and iterations < max_iterations:
        return parallel_page_rank(G, hdfs_handle, ranks, iterations, max_iterations)
    else:
        return ranks
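
# A minimal usage sketch, not part of the original module. It assumes a running
# Hadoop cluster reachable through hdmc/hdfs, a hadoop_config module defining
# GraphReduce_location, and that this file is executed in a context where the
# package-relative imports above resolve (e.g. via "python -m"). The graph and
# file names below are illustrative only.
if __name__ == "__main__":
    # Rank a small directed graph, allowing at most 20 PageRank iterations.
    demo_graph = nx.DiGraph()
    demo_graph.add_edges_from([(1, 2), (2, 3), (3, 1), (3, 2), (1, 3)])
    demo_ranks = page_rank(demo_graph, name="demo.adjlist", max_iterations=20)
    for node, rank in sorted(demo_ranks.items()):
        print("%s\t%s" % (node, rank))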