Package ziggy :: Package GraphReduce :: Module gr_path
[hide private]
[frames] | no frames]

Source Code for Module ziggy.GraphReduce.gr_path

  1  ''' 
  2  Created on Aug 12, 2010 
  3   
  4  @author: dwmclary 
  5  ''' 
  6   
  7  __author__ = "D. McClary (dan.mcclary@northwestern.edu)" 
  8  __all__ = ['out_degree', 'in_degree', 'average_degree', 'average_out_degree', 'average_in_degree',\ 
  9            ' connected_components', 'num_connected_components',\ 
 10             'single_source_shortest_path', 'single_source_shortest_path_length',\ 
 11             'shortest_path', 'shorested_path_length', 'average_shortest_path_length'] 
 12  from .. import hdmc 
 13  from .. hdmc import hdfs 
 14  import hadoop_config as config 
 15  import networkx as nx 
 16  import os 
 17  import sys 
 18  import string 
 19  from GraphLoader import GraphLoader 
 20   
def connected_components(G, name = None, recompute=False):
    '''Compute the connected components for a networkx graph G.

    The all-pairs result of shortest_path() is used as a reachability map:
    the keys of each per-source path dict are the nodes reached from that
    source.  Reachable sets that share at least one node are merged into a
    single component.

    G         -- the networkx graph
    name      -- optional graph name, forwarded to shortest_path()
    recompute -- forwarded to shortest_path(); True forces a fresh BFS

    Returns a list of components, each a list of node ids (the ids are the
    keys of the per-source path dicts).
    '''
    paths = shortest_path(G, None, None, name, recompute)
    components = []
    for reached in paths.values():
        merged = set(reached)
        kept = []
        merged_slot = None
        for component in components:
            if merged.intersection(component):
                # A reachable set can bridge several existing components, so
                # every overlapping component is absorbed, not just the first.
                merged.update(component)
                if merged_slot is None:
                    # Keep the merged set at the position of the first
                    # component it absorbed; later updates mutate it in place.
                    merged_slot = len(kept)
                    kept.append(merged)
            else:
                kept.append(component)
        if merged_slot is None:
            kept.append(merged)
        components = kept
    return [list(component) for component in components]
36
def num_connected_components(G, name=None, recompute=False):
    '''Return how many connected components the networkx graph G has.

    All arguments are passed straight through to connected_components().
    '''
    return len(connected_components(G, name, recompute))
41
def single_source_shortest_path(G, source, target=None, name=None, recompute=False):
    '''Compute the shortest path from source to a target or to all other nodes
    in the networkx graph G.

    G         -- the networkx graph
    source    -- node the search starts from
    target    -- optional node; when given only the path to it is returned
    name      -- optional graph name used for the stored adjacency list
    recompute -- True forces a new BFS; otherwise a stored result is reused
                 when check_for_precomputed_bfs_result() finds one

    Returns the path to target (None when there is no entry for it), or, when
    target is None, a dict mapping each node with a non-empty path to that
    path.
    '''
    if not recompute:
        distance, path = check_for_precomputed_bfs_result(G, name, source)
    else:
        # str() keeps this debug message from raising TypeError when name is
        # None, which is the default.
        print("at sssp, name = " + str(name))
        distance, path = bfs(G, source, name)

    # 'is not None' rather than truthiness so that a node such as 0 is
    # still treated as a real target.
    if target is not None:
        if target in path:
            return path[target]
        # Result keys are parsed from text output and are therefore strings;
        # fall back to the string form of the target.
        return path.get(str(target))

    # Iterate over a snapshot of the keys because entries are deleted.
    for key in list(path):
        if len(path[key]) == 0:
            del path[key]
    return path
61
def single_source_shortest_path_length(G, source, target=None, name=None, recompute=False):
    '''Compute the shortest path length from source to a target or to all
    other nodes in the networkx graph G.

    G         -- the networkx graph
    source    -- node the search starts from
    target    -- optional node; when given only its distance is returned
    name      -- optional graph name used for the stored adjacency list
    recompute -- True forces a new BFS; otherwise a stored result is reused
                 when check_for_precomputed_bfs_result() finds one

    Returns the distance to target (None when there is no entry for it), or,
    when target is None, a dict of node -> distance with infinite distances
    removed.
    '''
    if not recompute:
        distance, path = check_for_precomputed_bfs_result(G, name, source)
    else:
        distance, path = bfs(G, source, name)

    # 'is not None' rather than truthiness so that a node such as 0 is
    # still treated as a real target.
    if target is not None:
        # The looked-up distance is now actually returned; previously it was
        # assigned to a local and dropped.
        if target in distance:
            return distance[target]
        # Result keys are parsed from text output and are therefore strings;
        # fall back to the string form of the target.
        return distance.get(str(target))

    unreachable = float('inf')
    # Iterate over a snapshot of the keys because entries are deleted.
    for key in list(distance):
        if distance[key] == unreachable:
            del distance[key]
    return distance
78
def single_source_average_shortest_path_length(G, source, target=None, name=None, recompute=False):
    '''Return the mean of the finite shortest path lengths measured from
    source in the networkx graph G.

    The distances come from a stored BFS result when recompute is False and
    from a fresh bfs() run otherwise.  Infinite distances are left out of
    the mean.  The target argument is accepted but not used.
    '''
    if recompute:
        distance, path = bfs(G, source, name)
    else:
        distance, path = check_for_precomputed_bfs_result(G, name, source)

    unreachable = float('inf')
    finite = [d for d in distance.values() if d != unreachable]
    return sum(finite, 0.0) / len(finite)
94
def shortest_path(G, source=None, target=None, name=None, recompute=False):
    '''Compute the shortest path from each node to all other nodes in the
    networkx graph G.  A source and target can optionally be passed to limit
    the search.

    G         -- the networkx graph
    source    -- optional start node; when given only that search is run
    target    -- optional end node, forwarded to single_source_shortest_path()
    name      -- optional graph name, forwarded unchanged
    recompute -- forwarded unchanged; True forces fresh BFS runs

    Returns the result of single_source_shortest_path() when source is given,
    otherwise a dict mapping every node of G to its single-source result.
    '''
    # 'is not None' so that a node such as 0 counts as a real source.
    if source is not None:
        # The single-source result is returned; it used to be discarded.
        return single_source_shortest_path(G, source, target, name, recompute)

    paths = {}
    for n in G.nodes():
        paths[n] = single_source_shortest_path(G, n, target, name, recompute)
    return paths
106
def shortest_path_length(G, source=None, target=None, name=None, recompute=False):
    '''Compute the shortest path length from each node to all other nodes in
    the networkx graph G.  A source and target can optionally be passed to
    limit the search.

    G         -- the networkx graph
    source    -- optional start node; when given only that search is run
    target    -- optional end node, forwarded to
                 single_source_shortest_path_length()
    name      -- optional graph name, forwarded unchanged
    recompute -- forwarded unchanged; True forces fresh BFS runs

    Returns the result of single_source_shortest_path_length() when source is
    given, otherwise a dict mapping every node of G to its single-source
    result.
    '''
    # 'is not None' so that a node such as 0 counts as a real source.
    if source is not None:
        # Lengths are requested here, so the *_length helper is called (the
        # path helper was called before) and its result is returned.
        return single_source_shortest_path_length(G, source, target, name, recompute)

    distances = {}
    for n in G.nodes():
        distances[n] = single_source_shortest_path_length(G, n, target, name, recompute)
    return distances
118
def average_shortest_path_length(G,name=None, recompute=False):
    '''Return the mean, over all nodes of the networkx graph G, of each
    node's single-source average shortest path length.
    '''
    per_node = [
        single_source_average_shortest_path_length(G, n, None, name, recompute)
        for n in G.nodes()
    ]
    return sum(per_node, 0.0) / len(per_node)
128
def average_out_degree(G, name=None):
    '''Return the mean out-degree over the nodes reported by degree().'''
    _, out_d = degree(G, name)
    counts = out_d.values()
    return float(sum(counts)) / len(counts)
134
def average_in_degree(G, name=None):
    '''Return the mean in-degree over the nodes reported by degree().'''
    in_d, _ = degree(G, name)
    counts = in_d.values()
    return float(sum(counts)) / len(counts)
140
def average_degree(G, name=None):
    '''Compute the average degree for the networkx graph G.

    The value returned is the sum of all in-degrees and out-degrees divided
    by the combined number of entries in the in-degree and out-degree maps.
    '''
    in_d, out_d = degree(G, name)
    degree_total = sum(out_d.values()) + sum(in_d.values())
    entry_count = float(len(out_d.values())) + float(len(in_d.values()))
    return degree_total / entry_count
147
def out_degree(G, name=None):
    '''Return the node -> out-degree map for the networkx graph G.'''
    _, outgoing = degree(G, name)
    return outgoing
152
def in_degree(G, name=None):
    '''Return the node -> in-degree map for the networkx graph G.'''
    incoming, _ = degree(G, name)
    return incoming
157
def degree(G, name=None):
    '''Compute the in- and out-degree of every node in the networkx graph G.

    The graph is written out as an adjacency list (under name, or
    "pbfs_input.adjlist" when no name is given), copied to
    <handle>/degree/part-00000 on HDFS and handed to parallel_degree().

    Returns the tuple (in_degrees, out_degrees) from parallel_degree().
    '''
    loader = GraphLoader(G)
    print(name)
    adjlist_name = name if name else "pbfs_input.adjlist"
    loader.write_adjlist(adjlist_name)

    # The last path component of the written adjacency list doubles as the
    # HDFS directory for this graph.
    hdfs_handle = loader.graph_handle.split("/")[-1]
    degree_dir = hdfs_handle + "/degree"
    hdfs.rm(degree_dir)
    hdfs.copyToHDFS(loader.graph_handle, degree_dir + "/part-00000")

    in_counts, out_counts = parallel_degree(hdfs_handle)
    return in_counts, out_counts
171
def bfs(G, source, name=None):
    '''Conduct a parallel BFS from the source node to all other reachable
    nodes in G.

    G      -- the networkx graph
    source -- start node; converted to its string form
    name   -- optional name for the adjacency list; "pbfs_input.adjlist" is
              used when it is empty or None

    The source id is written to the local file "pbfs_source", the adjacency
    list is copied to <handle>/shortest_path/<source>/part-00000 on HDFS and
    parallel_bfs() is run with len(G) as the initial infinite-distance count.

    Returns the tuple (distance, path) produced by parallel_bfs().
    '''
    source = str(source)
    # Write the source id with plain file I/O instead of building an
    # "echo ... > pbfs_source" shell command, so ids containing shell
    # metacharacters can neither break nor inject into the command.
    with open("pbfs_source", "w") as source_file:
        source_file.write(source + "\n")

    inf_count = len(G)
    # str() keeps this debug message from raising TypeError when name is
    # None, which is the default.
    print("at bfs, name = " + str(name))
    loader = GraphLoader(G, name)
    if name:
        loader.write_adjlist(name)
    else:
        loader.write_adjlist("pbfs_input.adjlist")

    # The last path component of the written adjacency list doubles as the
    # HDFS directory for this graph.
    hdfs_handle = loader.graph_handle.split("/")[-1]
    print(loader.graph_handle)
    print(hdfs_handle)
    print("writing to " + hdfs_handle + "/" + source)

    # Start from an empty per-source directory holding only the input graph.
    source_dir = hdfs_handle + "/shortest_path/" + source
    hdfs.rm(source_dir)
    hdfs.mkdir(source_dir)
    hdfs.copyToHDFS(loader.graph_handle, source_dir + "/part-00000")

    distance, path = parallel_bfs(source, hdfs_handle, inf_count)
    return distance, path
198
def parallel_degree(hdfs_handle):
    '''Run the degree map/reduce job over the adjacency list stored under
    hdfs_handle and return (in_degrees, out_degrees).

    The job reads <hdfs_handle>/degree, writes to the scratch directory
    "pdegree", and its part files then replace the input part files.
    '''
    scratch_dir = "pdegree"
    hdfs.rm(scratch_dir)

    # Degree_mapper.py and Degree_reducer.py are looked up in the directory
    # that contains this module.
    path_parts = os.path.realpath( __file__ ).split("/")
    module_dir = "/".join(path_parts[0:-1])
    degree_dir = hdfs_handle + "/degree"
    job = hdmc.build_generic_hadoop_call(
        module_dir + "/Degree_mapper.py",
        module_dir + "/Degree_reducer.py",
        degree_dir,
        scratch_dir,
        [])
    hdmc.execute_and_wait(job)

    # Replace the job input with the job output.
    hdfs.rm(degree_dir + "/part*")
    hdfs.mv(scratch_dir + "/part*", degree_dir + "/")
    hdfs.rm(scratch_dir)

    in_d, out_d = fetch_degree_from_hdfs(hdfs_handle)
    return in_d, out_d
212 213 214
def parallel_bfs(source, hdfs_handle, old_inf_count):
    '''Run PBFS map/reduce passes from source over the adjacency list stored
    under hdfs_handle until the "#inf_count" total stops shrinking or reaches
    zero, then return (distances, paths) read back from HDFS.

    Each pass reads <hdfs_handle>/shortest_path/<source>, writes to the
    scratch directory "PBFS-src-<source>", sums the "#inf_count:<n>" lines
    found in the tail of every output part file, and moves the output back
    over the input.  old_inf_count is the total to compare the first pass
    against.
    '''
    while True:
        scratch_dir = "PBFS-src-"+str(source)
        hdfs.rm(scratch_dir)

        # PBFS_mapper.py and PBFS_reducer.py are looked up in the directory
        # that contains this module.
        path_parts = os.path.realpath( __file__ ).split("/")
        module_dir = "/".join(path_parts[0:-1])
        source_dir = hdfs_handle+"/shortest_path/"+source
        job = hdmc.build_generic_hadoop_call(
            module_dir+"/PBFS_mapper.py",
            module_dir+"/PBFS_reducer.py",
            source_dir,
            scratch_dir,
            ["pbfs_source"])
        hdmc.execute_and_wait(job)

        part_listing = hdfs.ls(scratch_dir+"/part*")["stdout"].rstrip().split("\n")
        inf_count = 0
        for listed in part_listing:
            part_suffix = listed.split("part-")[-1]
            tail_lines = hdfs.tail(scratch_dir+"/part-"+part_suffix)["stdout"].split("\n")
            for tail_line in tail_lines:
                fields = tail_line.rstrip().split(":")
                if len(fields) > 0 and fields[0] == "#inf_count":
                    inf_count += int(fields[1])

        # Replace this pass's input with its output.
        hdfs.rm(source_dir+"/part*")
        hdfs.mv(scratch_dir+"/part*", source_dir+"/")
        hdfs.rm(scratch_dir)

        if inf_count > 0 and old_inf_count > inf_count:
            # Progress was made and infinite entries remain: run another
            # pass, comparing against the count just measured.
            old_inf_count = inf_count
            continue

        results, paths = fetch_sp_from_hdfs(hdfs_handle, source)
        return results, paths
243
def fetch_sp_from_hdfs(hdfs_handle, source):
    '''Read the stored shortest path result for source back from HDFS.

    Every non-empty line of <hdfs_handle>/shortest_path/<source>/part* that
    does not start with "#" is parsed: the first token is the node id, the
    first token after "d:" is its distance and the comma separated text
    after the last "path:" is its path.

    Returns (distances, paths), both keyed by node id string.
    '''
    distances = {}
    routes = {}
    raw = hdfs.cat(hdfs_handle+"/shortest_path/"+source+"/part*")["stdout"]
    for record in raw.split("\n"):
        if len(record) == 0 or record[0] == "#":
            continue
        trimmed = record.rstrip()
        by_distance = trimmed.split("d:")
        head_tokens = by_distance[0].split()
        node_distance = float(by_distance[1].split()[0])
        node = head_tokens[0]
        distances[node] = node_distance

        hops = [hop.strip() for hop in trimmed.split("path:")[-1].split(",")]
        routes[node] = hops
        # An empty path field yields a single '' element; drop it.
        if '' in hops:
            hops.remove('')
    return distances, routes
260
def fetch_degree_from_hdfs(hdfs_handle):
    '''Fetch degree results from HDFS.

    Reads <hdfs_handle>/degree/part* and, for every non-blank line that does
    not start with "#", counts the tokens between the "in:" and "out:"
    markers (in-degree) and the tokens after "out:" (out-degree).

    NOTE(review): assumes each line has the form
    "<node> in: <neighbours...> out: <neighbours...>" -- confirm against the
    output of Degree_reducer.py.

    Returns (in_degrees, out_degrees), both keyed by the first token of the
    line.  Raises ValueError when a line lacks an "in:" or "out:" token.
    '''
    in_degrees = {}
    out_degrees = {}
    output = hdfs.cat(hdfs_handle+"/degree/part*")["stdout"].split("\n")
    for r in output:
        if len(r) == 0 or r[0] == "#":
            continue
        entry = r.split()
        if not entry:
            # Whitespace-only line: nothing to parse.
            continue
        key = entry[0]
        in_index = entry.index("in:")
        out_index = entry.index("out:")
        # Start one past each marker so the "in:" / "out:" tokens themselves
        # are not counted; counting them made every degree one too large.
        in_degrees[key] = len(entry[in_index + 1:out_index])
        out_degrees[key] = len(entry[out_index + 1:])
    return in_degrees, out_degrees
278
def check_for_precomputed_degree_result(G, name):
    '''Return a stored degree result for the networkx graph G, if any.

    Lists <name>/degree on HDFS (name defaults to "pbfs_input.adjlist") and
    reads the result back with fetch_degree_from_hdfs().  When either step
    raises AttributeError, (None, None) is returned instead.
    '''
    graph_name = name if name else "pbfs_input.adjlist"
    try:
        # The listing itself is not used; the expression is evaluated only
        # for the AttributeError it may raise.
        hdfs.ls(graph_name+'/degree')["stdout"].split("\n")
        in_d, out_d = fetch_degree_from_hdfs(graph_name)
    except AttributeError:
        in_d, out_d = None, None
    return in_d, out_d
290 291
def check_for_precomputed_bfs_result(G, name, source):
    '''Check to see if the shortest path from source has been computed for
    the networkx graph G, and compute it if not.

    G      -- the networkx graph
    name   -- graph name on HDFS; "pbfs_input.adjlist" when empty or None
    source -- start node of the search

    The entries of <name>/shortest_path on HDFS are compared with the string
    form of source.  On a match the stored result is read back with
    fetch_sp_from_hdfs(); otherwise bfs() is run.

    Returns the tuple (distance, path).
    '''
    # Apply the default before the debug message: concatenating a None name
    # raised TypeError.
    if not name:
        name = "pbfs_input.adjlist"
    print("at check: " + name)

    # Listing entries are strings, so a non-string node id could never match
    # and could not be concatenated into the HDFS path either.
    source_key = str(source)
    listing = hdfs.ls(name+'/shortest_path')["stdout"].split("\n")
    result_exists = any(
        line.rstrip().split("/")[-1] == source_key for line in listing)

    if result_exists:
        distance, path = fetch_sp_from_hdfs(name, source_key)
    else:
        distance, path = bfs(G, source, name)
    return distance, path
309