Package ziggy :: Module util

Source Code for Module ziggy.util

import os
import subprocess
from . import hdmc
from .hdmc import hdfs
from .hdmc.code_generator import CodeGenerator

tmp_directory = "/tmp"
__all__ = ["sort_numeric", "sort_ascii", "search", "histogram"]

# Generate histogram_frame.py, a streaming mapper that emits one
# "ValueHistogram:<value>:1" record per non-empty input line for Hadoop's
# aggregate reducer.
def make_histogram_frame():
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("print 'ValueHistogram:'+line+':'+'1'\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")
    frame_file = open("histogram_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx histogram_frame.py")

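# For reference, the mapper that make_histogram_frame() writes out is roughly
# the script below (a sketch reconstructed from the c.write() calls above;
# exact whitespace depends on CodeGenerator's indentation):
#
#   #! /usr/bin/env python
#
#   import sys
#
#   def read_input(file):
#       for line in file:
#           yield line.strip()
#
#   def main():
#       data = read_input(sys.stdin)
#       for line in data:
#           if len(line)>0:
#               print 'ValueHistogram:'+line+':'+'1'
#
#   if __name__ == "__main__":
#       main()
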
# Generate an in-mapper-combining variant of histogram_frame.py that tallies
# counts in a defaultdict before emitting ValueHistogram records.
def make_counting_frame():
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from collections import defaultdict\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("c = defaultdict(int)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("c[line] += 1\n")
    c.dedent()
    c.dedent()
    c.write("for key in c.keys():\n")
    c.indent()
    c.write("print 'ValueHistogram:'+key+':'+str(c[key])\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")
    frame_file = open("histogram_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx histogram_frame.py")

# Generate search_frame.py, a streaming mapper that prints every input line
# containing the pattern stored in the ziggy_search side file.
def make_grep_frame():
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file, pattern):\n")
    c.indent()
    c.write("line_count = 0\n")
    c.write("for line in file:\n")
    c.indent()
    c.write("line_count += 1\n")
    c.write("if pattern in line.strip():\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("search_pattern = open('ziggy_search').readline().strip()\n")
    c.write("data = read_input(sys.stdin, search_pattern)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("print line\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")
    frame_file = open("search_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx search_frame.py")

# Generate search_frame.py for search(): each matching line is reported as
# "filename:line_number", or as "filename:matched_text:line_number" when
# suppress_lines is False.
def make_search_frame(suppress_lines):
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file, pattern):\n")
    c.indent()
    c.write("line_count = 0\n")
    c.write("for line in file:\n")
    c.indent()
    c.write("line_count += 1\n")
    c.write("if pattern in line.strip():\n")
    c.indent()
    c.write("yield line.strip(), line_count\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("search_pattern = open('ziggy_search').readline().strip()\n")
    c.write("data = read_input(sys.stdin, search_pattern)\n")
    c.write("for filename, count in data:\n")
    c.indent()
    if suppress_lines:
        c.write("print filename.split(':')[0] + ':' + str(count)\n")
    else:
        c.write("print filename + ':' + str(count)\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")
    frame_file = open("search_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx search_frame.py")

# Generate identity_frame.py, a streaming mapper/reducer that passes non-empty
# lines through unchanged.
def make_identity_frame():
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("print line\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("identity_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx identity_frame.py")

def sort_numeric(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to sort a large set of numeric values.'''
    hdfs.copyToHDFS(input_file, input_file.split("/")[-1])
    make_identity_frame()
    # Key comparator options for the streaming sort: "n" compares keys
    # numerically (ascending); "nr" reverses the order for a descending sort.
    if ascending:
        keycomp = "n"
    else:
        keycomp = "nr"
    hadoop_call = hdmc.build_generic_hadoop_call("identity_frame.py", "identity_frame.py",
                                                 input_file.split("/")[-1], output_file, [],
                                                 num_mappers, num_reducers, keycomp)
    print hadoop_call
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()

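# A minimal usage sketch for sort_numeric, assuming a reachable Hadoop cluster
# configured for hdmc/hdfs; the file names below are hypothetical:
#
#   from ziggy import util
#   util.sort_numeric("numbers.txt", "numbers_sorted")
#
# The sorted output is copied back from HDFS into the local file
# "numbers_sorted", and cleanup() removes the generated frame script.
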
def sort_ascii(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to sort a large set of ASCII values.'''
    hdfs.copyToHDFS(input_file, input_file.split("/")[-1])
    make_identity_frame()
    if ascending:
        # The streaming shuffle already sorts keys in ascending ASCII order.
        hadoop_call = hdmc.build_generic_hadoop_call("identity_frame.py", "identity_frame.py",
                                                     input_file.split("/")[-1], output_file, [],
                                                     num_mappers, num_reducers)
    else:
        # "r" reverses the comparator for a descending sort.
        keycomp = "r"
        hadoop_call = hdmc.build_generic_hadoop_call("identity_frame.py", "identity_frame.py",
                                                     input_file.split("/")[-1], output_file, [],
                                                     num_mappers, num_reducers, keycomp)
    print hadoop_call
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()

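# A similar sketch for sort_ascii, under the same assumptions (hypothetical
# file names):
#
#   util.sort_ascii("words.txt", "words_sorted", ascending=False)
#
# With ascending=False the "r" comparator option reverses the default
# ascending ASCII ordering.
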
def text_search(input_directory, input_files, output_file, search_pattern,
                suppress_lines=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to search a collection of input files.'''
    # make an HDFS directory for the input files
    hdfs.mkdir(input_directory)
    for f in input_files:
        hdfs_location = input_directory + "/" + f.split("/")[-1]
        hdfs.copyToHDFS(f, hdfs_location)

    # write the search pattern to a side file that ships with the job
    pattern_file = open("ziggy_search", "w")
    print >> pattern_file, search_pattern
    pattern_file.close()
    # make a search frame
    make_grep_frame()
    # make an identity frame
    make_identity_frame()
    # build the hadoop call
    hadoop_call = hdmc.build_generic_hadoop_call("search_frame.py", "identity_frame.py",
                                                 input_directory, output_file, ["./ziggy_search"],
                                                 num_mappers, num_reducers)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()

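# A hedged usage sketch for text_search, assuming the listed local files exist
# and that an HDFS directory named "grep_input" can be created (all names are
# hypothetical):
#
#   util.text_search("grep_input", ["logs/a.log", "logs/b.log"],
#                    "grep_output", "ERROR")
#
# The grep frame prints matching lines verbatim, so the output does not say
# which file a match came from; use search() below when that matters.
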
def search(input_directory, input_files, output_file, search_pattern,
           suppress_lines=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to search a collection of input files.'''
    # make an HDFS directory for the input files
    hdfs.mkdir(input_directory)
    # put the files in that HDFS directory
    for f in input_files:
        # make a temp copy of the file with the filename prepended to each line
        tmpfile = open(tmp_directory + "/ziggy_search_tmp", "w")
        original_file = open(f)
        for line in original_file:
            # line already ends with a newline, so write it through unchanged
            tmpfile.write(f + ":" + line)
        tmpfile.close()
        original_file.close()
        hdfs_location = input_directory + "/" + f.split("/")[-1]
        hdfs.copyToHDFS(tmp_directory + "/ziggy_search_tmp", hdfs_location)
        os.remove(tmp_directory + "/ziggy_search_tmp")
    # write the search pattern to a side file that ships with the job
    pattern_file = open("ziggy_search", "w")
    print >> pattern_file, search_pattern
    pattern_file.close()
    # make a search frame
    make_search_frame(suppress_lines)
    # make an identity frame
    make_identity_frame()
    # build the hadoop call
    hadoop_call = hdmc.build_generic_hadoop_call("search_frame.py", "identity_frame.py",
                                                 input_directory, output_file, ["./ziggy_search"],
                                                 num_mappers, num_reducers)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()

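# A hedged usage sketch for search, under the same assumptions (hypothetical
# names):
#
#   util.search("search_input", ["logs/a.log", "logs/b.log"],
#               "search_output", "ERROR")
#
# Because each line is uploaded as "<filename>:<line text>", output records
# look like "logs/a.log:17" with suppress_lines=True, or
# "logs/a.log:<matching line text>:17" with suppress_lines=False.
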
def histogram(input_file, output_file, num_mappers=None, num_reducers=None):
    '''Use MapReduce aggregation to create a histogram report from a large input file.'''
    hdfs.copyToHDFS(input_file, input_file.split("/")[-1])
    # make the histogram mapper
    make_histogram_frame()
    # order the aggregated keys numerically
    keycomp = "n"
    hadoop_call = hdmc.build_generic_hadoop_call("histogram_frame.py", "aggregate",
                                                 input_file.split("/")[-1], output_file, [],
                                                 num_mappers, num_reducers, keycomp)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()

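# A hedged usage sketch for histogram, assuming Hadoop streaming's built-in
# "aggregate" reducer is available (file names are hypothetical):
#
#   util.histogram("values.txt", "values_histogram")
#
# Each distinct non-empty line of values.txt becomes a ValueHistogram key, and
# the aggregate reducer summarizes its occurrences in the output report.
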
def cleanup():
    # remove the generated frame scripts and the search-pattern side file
    if os.path.isfile("identity_frame.py"):
        os.remove("identity_frame.py")
    if os.path.isfile("histogram_frame.py"):
        os.remove("histogram_frame.py")
    if os.path.isfile("search_frame.py"):
        os.remove("search_frame.py")
    if os.path.isfile("ziggy_search"):
        os.remove("ziggy_search")