1 '''
2 Created on Aug 12, 2010
3
4 @author: dwmclary
5 '''
6
7 __author__ = "D. McClary (dan.mcclary@northwestern.edu)"
# Names exported by a star-import of this module.
# The 'connected_components' entry previously carried a leading space, which
# made it name a non-existent attribute and caused star-imports to fail.
# NOTE(review): 'shorested_path_length' looks like a misspelling of
# 'shortest_path_length'; confirm the function's actual name before changing it.
__all__ = ['out_degree', 'in_degree', 'average_degree', 'average_out_degree', 'average_in_degree',
           'connected_components', 'num_connected_components',
           'single_source_shortest_path', 'single_source_shortest_path_length',
           'shortest_path', 'shorested_path_length', 'average_shortest_path_length']
12 from .. import hdmc
13 from .. hdmc import hdfs
14 import hadoop_config as config
15 import networkx as nx
16 import os
17 import sys
18 import string
19 from GraphLoader import GraphLoader
20
22 '''Compute the connected components for a networkx graph G'''
23 paths = shortest_path(G, None, None, name, recompute)
24 components = []
25 for p in paths.keys():
26 found = False
27 for c in components:
28 if len(c.intersection(paths[p].keys())) > 0:
29 c_index = components.index(c)
30 components[c_index] = c.union(paths[p].keys())
31 found = True
32 break
33 if not found:
34 components.append(set(paths[p].keys()))
35 return map(list, components)
36
38 '''Compute the number of connected components for the networkx graph G.'''
39 components = connected_components(G, name, recompute)
40 return len(components)
41
43 '''Computer the shortest path from source to a target or all other nodes in the networkx graph G.'''
44 if not recompute:
45 distance, path = check_for_precomputed_bfs_result(G, name, source)
46 else:
47 print "at sssp, name = " + name
48 distance, path = bfs(G, source, name)
49
50 if target:
51 try:
52 target_path = path[target]
53 return target_path
54 except KeyError:
55 return None
56 else:
57 for key in path.keys():
58 if len(path[key]) == 0:
59 del path[key]
60 return path
61
63 '''Computer the shortest path length from source to a target or all other nodes in the networkx graph G.'''
64 if not recompute:
65 distance, path = check_for_precomputed_bfs_result(G, name, source)
66 else:
67 distance, path = bfs(G, source, name)
68 if target:
69 try:
70 target_distance = distance[target]
71 except KeyError:
72 return None
73 else:
74 for key in distance.keys():
75 if distance[key] == float('inf'):
76 del distance[key]
77 return distance
78
80 '''Computer the average shortest path length from source to a target or all other nodes in the networkx graph G.'''
81 sum = 0.0
82 count = 0
83 if not recompute:
84 distance, path = check_for_precomputed_bfs_result(G, name, source)
85 else:
86 distance, path = bfs(G, source, name)
87
88 for key in distance.keys():
89 if distance[key] != float('inf'):
90 sum += distance[key]
91 count += 1
92
93 return sum/count
94
def shortest_path(G, source=None, target=None, name=None, recompute=False):
    '''Compute shortest paths in the networkx graph G.

    When a source is given, the work is delegated to
    single_source_shortest_path(G, source, target, name, recompute) and its
    result is returned unchanged.  When no source is given, that function is
    called once for every node in G.nodes() and a dict mapping each node to
    its result is returned.  A target can optionally be passed to limit the
    search; name and recompute are forwarded as-is.
    '''
    # Test against None rather than truthiness so that a falsy node label
    # (for example 0) is still treated as an explicit source.
    if source is not None:
        # The result must be returned; previously it was computed and dropped,
        # so callers received None for every single-source query.
        return single_source_shortest_path(G, source, target, name, recompute)

    paths = {}
    for n in G.nodes():
        paths[n] = single_source_shortest_path(G, n, target, name, recompute)
    return paths
106
108 '''Computer the shortest path length from each node to all other nodes in the networkx graph G.
109 A source and target can optionally passed to limit the search.'''
110 if source:
111 single_source_shortest_path(G, source, target, name, recompute)
112 else:
113 distances = {}
114 for n in G.nodes():
115 this_distance = single_source_shortest_path_length(G, n, target, name, recompute)
116 distances[n] = this_distance
117 return distances
118
120 '''Computer the average shortest path length from each node to all other nodes in the networkx graph G.
121 '''
122 sum = 0.0
123 count = 0
124 for n in G.nodes():
125 sum += single_source_average_shortest_path_length(G, n, None, name, recompute)
126 count += 1
127 return sum/count
128
130 '''Compute the average out-degree for the networkx graph G.'''
131 in_d, out_d = degree(G, name)
132 average_out = float(sum(out_d.values()))/len(out_d.values())
133 return average_out
134
136 '''Compute the average in-degree for the networkx graph G.'''
137 in_d, out_d = degree(G, name)
138 average_in = float(sum(in_d.values()))/len(in_d.values())
139 return average_in
140
142 '''Compute the average degree for the networkx graph G.'''
143 in_d, out_d = degree(G, name)
144 average_out = sum(out_d.values())
145 average_in = sum(in_d.values())
146 return (average_out+average_in)/(float(len(out_d.values()))+float(len(in_d.values())))
147
149 '''Compute the out-degree for each node in the networkx graph G.'''
150 in_d, out_d = degree(G, name)
151 return out_d
152
154 '''Compute the in-degree for each node in the networkx graph G.'''
155 in_d, out_d = degree(G, name)
156 return in_d
157
171
def bfs(G, source, name=None):
    '''Conduct a parallel BFS from the source node to all other reachable nodes in G.

    Steps performed:
      1. Write str(source) followed by a newline to the local file "pbfs_source".
      2. Wrap G in a GraphLoader and write its adjacency list under name, or
         under "pbfs_input.adjlist" when name is empty or None.
      3. Take the last path component of the loader's graph_handle as the
         HDFS handle, remove and recreate <handle>/shortest_path/<source>
         there, and copy the adjacency list in as part-00000.
      4. Call parallel_bfs(source, hdfs_handle, len(G)) and return its
         (distance, path) pair.
    '''
    source = str(source)

    # Write the source label directly instead of shelling out to `echo`, so
    # labels containing shell metacharacters cannot be interpreted by a shell.
    # try/finally (rather than `with`) keeps this usable on old interpreters.
    source_file = open("pbfs_source", "w")
    try:
        source_file.write(source + "\n")
    finally:
        source_file.close()

    # Node count of the original graph, taken before it is wrapped below.
    inf_count = len(G)

    # str() keeps this diagnostic from raising TypeError when name is None.
    print("at bfs, name = " + str(name))

    loader = GraphLoader(G, name)
    if name:
        loader.write_adjlist(name)
    else:
        loader.write_adjlist("pbfs_input.adjlist")

    hdfs_handle = loader.graph_handle.split("/")[-1]
    print(loader.graph_handle)
    print(hdfs_handle)
    print("writing to " + hdfs_handle + "/" + source)

    # Start from a clean per-source directory before uploading the input.
    sp_dir = hdfs_handle + "/shortest_path/" + source
    hdfs.rm(sp_dir)
    hdfs.mkdir(sp_dir)
    hdfs.copyToHDFS(loader.graph_handle, sp_dir + "/part-00000")

    distance, path = parallel_bfs(source, hdfs_handle, inf_count)
    return distance, path
198
200 '''Compute node degree in parallel for the graph adjacency list stored in hdfs_handle.'''
201 hdfs.rm("pdegree")
202 base_path = os.path.realpath( __file__ ).split("/")
203 base_path = "/".join(base_path[0:-1])
204 hadoop_call = hdmc.build_generic_hadoop_call(base_path+"/Degree_mapper.py", base_path+"/Degree_reducer.py", hdfs_handle+"/degree", "pdegree", [])
205 hdmc.execute_and_wait(hadoop_call)
206
207 hdfs.rm(hdfs_handle+"/degree/part*")
208 hdfs.mv("pdegree/part*", hdfs_handle+"/degree/")
209 hdfs.rm("pdegree")
210 in_d, out_d = fetch_degree_from_hdfs(hdfs_handle)
211 return in_d, out_d
212
213
214
216 '''Compute shortest path from source to all nodes in parallel for the graph adjacency list stored in hdfs_handle.'''
217 hdfs.rm("PBFS-src-"+str(source))
218 base_path = os.path.realpath( __file__ ).split("/")
219 base_path = "/".join(base_path[0:-1])
220 hadoop_call = hdmc.build_generic_hadoop_call(base_path+"/PBFS_mapper.py", base_path+"/PBFS_reducer.py", hdfs_handle+"/shortest_path/"+source, "PBFS-src-"+str(source), ["pbfs_source"])
221 hdmc.execute_and_wait(hadoop_call)
222 listing = hdfs.ls("PBFS-src-"+str(source)+"/part*")["stdout"].rstrip().split("\n")
223 inf_count = 0
224 for entry in listing:
225 last_part = entry.split("part-")
226 tail = hdfs.tail("PBFS-src-"+str(source)+"/part-"+last_part[-1])["stdout"].split("\n")
227
228 for line in tail:
229 tail_entry = line.rstrip().split(":")
230 if len(tail_entry) > 0:
231 if tail_entry[0] == "#inf_count":
232 inf_count += int(tail_entry[1])
233
234
235 hdfs.rm(hdfs_handle+"/shortest_path/"+source+"/part*")
236 hdfs.mv("PBFS-src-"+str(source)+"/part*", hdfs_handle+"/shortest_path/"+source+"/")
237 hdfs.rm("PBFS-src-"+str(source))
238 if inf_count > 0 and old_inf_count > inf_count:
239 results, paths = parallel_bfs(source, hdfs_handle, inf_count)
240 else:
241 results, paths = fetch_sp_from_hdfs(hdfs_handle, source)
242 return results, paths
243
245 '''Fetch shortest path results from HDFS.'''
246 results = {}
247 paths = {}
248 output = hdfs.cat(hdfs_handle+"/shortest_path/"+source+"/part*")["stdout"].split("\n")
249 for r in output:
250 if len(r) > 0:
251 if r[0] != "#":
252 o = r.rstrip().split("d:")
253 p = r.rstrip().split("path:")
254 nodes = o[0].split()
255 results[nodes[0]] = float(o[1].split()[0])
256 paths[nodes[0]] = map(string.strip, p[-1].split(","))
257 if '' in paths[nodes[0]]:
258 paths[nodes[0]].remove('')
259 return results, paths
260
262 '''Fetch degree results from HDFS.'''
263 in_degrees = {}
264 out_degrees = {}
265 output = hdfs.cat(hdfs_handle+"/degree/part*")["stdout"].split("\n")
266 for r in output:
267 if len(r) > 0:
268 if r[0] != "#":
269 entry = r.split()
270 key = entry[0]
271 in_index = entry.index("in:")
272 out_index = entry.index("out:")
273 in_count = len(entry[in_index:out_index])
274 out_count = len(entry[out_index:])
275 in_degrees[key] = in_count
276 out_degrees[key] = out_count
277 return in_degrees, out_degrees
278
280 '''Check to see if degree has been computed for the networkx graph G.'''
281 if not name:
282 name = "pbfs_input.adjlist"
283 try:
284 listing = hdfs.ls(name+'/degree')["stdout"].split("\n")
285 in_d, out_d = fetch_degree_from_hdfs(name)
286 except AttributeError:
287 in_d= None
288 out_d = None
289 return in_d, out_d
290
291
293 '''Check to see if shortest path has been computed for the networkx graph G.'''
294
295 print "at check: " + name
296 if not name:
297 name = "pbfs_input.adjlist"
298 listing = hdfs.ls(name+'/shortest_path')["stdout"].split("\n")
299 result_exists = False
300 for line in listing:
301 entry = line.rstrip().split("/")[-1]
302 if source == entry:
303 result_exists = True
304 if result_exists:
305 distance, path = fetch_sp_from_hdfs(name, source)
306 else:
307 distance, path = bfs(G, source, name)
308 return distance, path
309