'''
Module for running Monte Carlo and other batch jobs on a Hadoop instance.
The module allows for the submission of scripts (and supporting files)
to a Hadoop MapReduce cluster for batch execution. Default operation runs
the submitted script for the specified number of iterations on the configured
Hadoop instance. By supplying an additional reducer script, data generated in
the batch process can be reduced/filtered/processed before it is written to HDFS
and made available to the user.

WARNING: Piped UNIX commands tend to fail when used as mappers and reducers. Instead,
write a bash or Python script.
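
Example usage (a minimal sketch; assumes this module is importable as hdmc and
that ./simulate.py is an executable script that prints one result per line --
both names are illustrative):

    import hdmc
    # Run simulate.py 1000 times across the cluster and collect the combined
    # output into ./simulation_results (blocking call).
    hdmc.submit_inline("./simulate.py", "simulation_results",
                       iterations=1000, arguments="--seed 42")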

Created on Jul 28, 2010

@author: dwmclary
'''

import hadoop_config as config
from hdfs import hdfs_access as hdfs
import shlex
import subprocess
import sys
import os
import stat
from code_generator import CodeGenerator
from glob import glob

def make_checkpointing_filter():
    '''Generates a Python script which filters checkpointing results from HDMC.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from copy import copy\n")
    c.write("import sys\n\n")
    c.write("current_key = ''\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("global current_key\n")
    c.write("global current_line\n")
    c.write("for line in file:\n")
    c.indent()
    c.write("line = line.strip()\n")
    c.write("try:\n")
    c.indent()
    c.write("key, value = line.split('==HDMC_CHECKPOINT==')\n")
    c.dedent()
    c.write("except ValueError:\n")
    c.indent()
    c.write("key=current_key+'-moredata'\n")
    c.write("value=line\n")
    c.dedent()
    c.write("current_key = key\n")
    c.write("yield key, value\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("global current_key\n")
    c.write("global current_line\n")
    c.write("seen_keys = {}\n")
    c.write("data = read_input(sys.stdin)\n")
    c.write("for key, value in data:\n")
    c.indent()
    c.write("line_data = value.split('==')\n")
    c.write("line_key = int(line_data[1])\n")
    c.write("line_value = line_data[2]\n")
    c.write("if key in seen_keys:\n")
    c.indent()
    c.write("if line_key not in seen_keys[key]:\n")
    c.indent()
    c.write("seen_keys[key][line_key] = line_value\n")
    c.dedent()
    c.dedent()
    c.write("else:\n")
    c.indent()
    c.write("seen_keys[key] = {}\n")
    c.write("seen_keys[key][line_key] = line_value\n")
    c.dedent()
    c.dedent()
    c.write("for key in seen_keys:\n")
    c.indent()
    c.write("lines = seen_keys[key].keys()\n")
    c.write("lines.sort()\n")
    c.write("for l in lines:\n")
    c.indent()
    c.write("print seen_keys[key][l]\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    # Write the generated reducer to disk and make it executable.
    frame_file = open("checkpoint_filter.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_filter.py")
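
# Records emitted by the generated checkpoint_frame.py and consumed by the
# filter above have the form (values illustrative):
#
#   <checkpoint_name>==HDMC_CHECKPOINT==LINE==<line_number>==<script output line>
#
# The filter de-duplicates records per checkpoint, sorts them by line number,
# and prints only the original script output.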


def make_checkpointing_frame(script, checkpoint_names, checkpoint_dir, arguments, files, debug):
    '''Generates a Python script which, given a list of files to be processed,
    executes the specified script over the files in parallel via MapReduce.'''

    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from glob import glob\n")
    c.write("import sys, os, subprocess, shlex, random, time, re\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("wait_counter = 1\n")
    c.write("time.sleep(random.random())\n")

    # The generated mapper repeatedly claims an unfinished checkpoint at random,
    # marks it with a <name>_start file, runs the script on it, and writes
    # <name>_end on success; it exits once every checkpoint has an _end marker.
    c.write("all_checkpoints = "+str(checkpoint_names)+"\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("while len(final_checkpoints) < len(all_checkpoints):\n")
    c.indent()
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("while this_checkpoint_end in current_checkpoints:\n")
    c.indent()
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("if len(final_checkpoints) == len(all_checkpoints):\n")
    c.indent()
    c.write("exit()\n")
    c.dedent()
    c.dedent()
    c.write("\n")
    c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    if files:
        c.write("cmd = "+cmd+"+['./'+this_checkpoint]\n")
    else:
        c.write("argf = open('./'+this_checkpoint).readlines()\n")
        c.write("for i in range(len(argf)):\n")
        c.indent()
        c.write("argf[i] = argf[i].strip()\n")
        c.dedent()
        c.write("cmd = "+cmd+"+argf\n")

    c.write("p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("output = output.strip()\n")
        c.write("if len(output) > 0:\n")
        c.indent()
        c.write("line_count = 0\n")
        c.write("newline = '\\n'\n")
        c.write("for line in output.split(newline):\n")
        c.indent()
        c.write("print this_checkpoint+'==HDMC_CHECKPOINT==LINE=='+ str(line_count) + '=='+line.strip()\n")
        c.write("line_count += 1\n")
        c.dedent()
        c.dedent()
        c.write("if len(error.strip()) > 0:\n")
        c.indent()
        c.write("print >> sys.stderr, error.strip()\n")
        c.write("os.system('rm "+checkpoint_dir+"'+'/'+this_checkpoint)\n")
        c.write("exit(1)\n")
        c.dedent()
        c.write("else:\n")
        c.indent()
        c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.dedent()
        c.write("os.system('rm "+checkpoint_dir+"/'+this_checkpoint+'_start')\n")
    else:
        c.write("print output.strip(),error.strip()\n")
        c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")

    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    # Write the generated mapper to disk and make it executable.
    frame_file = open("checkpoint_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_frame.py")
    if not files:
        os.system("cp checkpoint_frame.py checkpoint_frame_save.py")

def make_frame(script, arguments="", iterations=1, debug=False):
    '''Generates a basic Python frame for running a batch job on a MapReduce cluster.'''
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys, os, subprocess, shlex, random\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("for i in range("+str(iterations/config.num_map_tasks)+"):\n")
    c.indent()
    c.write("p = subprocess.Popen("+cmd+", stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("print output\n")
    else:
        c.write("print output,error\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx frame.py")
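
# For illustration, make_frame("./simulate.py", "--seed 42", 1000) (hypothetical
# script and arguments) generates a frame.py whose main() is roughly:
#
#   for i in range(1000 / config.num_map_tasks):  # count is baked in at generation time
#       p = subprocess.Popen(['./simulate.py', '--seed', '42'],
#                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#       output, error = p.communicate()
#       sts = p.wait()
#       print output
#
# so each of the num_map_tasks mappers performs its share of the requested iterations.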

def get_output_hdfs_name(output_data_name):
    '''Given the full path to a file or directory, returns its HDFS equivalent.'''
    output_path = output_data_name.split("/")
    return output_path[len(output_path)-1]


def build_hadoop_call(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False, num_mappers=None, num_reducers=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance. The function also generates the necessary execution frame.'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]
    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]
    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]

    hadoop_call += ['-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]

    hadoop_call += ['-mapper', "frame.py"]

    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    file_list = ["-file", script]
    file_list += ["-file", "./frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    make_frame(script, arguments, iterations, debug)
    return hadoop_call
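
# For reference, build_hadoop_call("./simulate.py", "results", iterations=1000,
# num_mappers=20) (illustrative values) returns a list along the lines of:
#
#   [config.hadoop, 'jar', config.hadoop_streaming,
#    '-D', 'mapred.map.tasks=20',
#    '-input', 'dummy', '-output', 'results',
#    '-mapper', 'frame.py', '-reducer', 'NONE',
#    '-file', './simulate.py', '-file', './frame.py']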

def build_checkpoint_call(script, output_data_name, supporting_file_list, reduction_script=None, arguments=None, num_mappers=None, num_reducers=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance, using the generated checkpoint_frame.py as the mapper.'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]
    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]

    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]

    hadoop_call += ['-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]

    hadoop_call += ['-mapper', "checkpoint_frame.py"]

    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    file_list = ["-file", script]
    file_list += ["-file", "./checkpoint_frame.py"]
    if reduction_script:
        if reduction_script != "NONE":
            file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call


def build_generic_hadoop_call(mapper, reducer, input, output, supporting_file_list=None, num_mappers=None, num_reducers=None, key_comparator=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.'''

    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]

    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]
    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]
    if key_comparator:
        hadoop_call += ["-D", "mapreduce.partition.keycomparator.options="+key_comparator]

    hadoop_call += ['-input', input, '-output', output]

    hadoop_call += ['-mapper', mapper]
    if reducer != "NONE":
        hadoop_call += ['-reducer', reducer]
    else:
        hadoop_call += ['-reducer', 'NONE']

    if reducer not in ["NONE", "aggregate"]:
        file_list = ["-file", mapper, "-file", reducer]
    else:
        file_list = ["-file", mapper]

    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call
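
# Example of running an arbitrary streaming job (mapper/reducer script names and
# HDFS paths are illustrative, not part of this module):
#
#   call = build_generic_hadoop_call("wc_map.py", "wc_reduce.py",
#                                    "books", "wordcounts", num_reducers=4)
#   execute_and_wait(call)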


def execute(hadoop_call):
    '''Nonblocking execution of the given call array.'''
    p = subprocess.Popen(hadoop_call)


def execute_and_wait(hadoop_call):
    '''Blocking execution of the given call array.'''
    p = subprocess.Popen(hadoop_call)
    sts = p.wait()
    return sts


def create_dummy_data():
    '''Creates a piece of dummy map input data in HDFS. This is necessary because
    Hadoop streaming requires input for mapping tasks.'''
    f = open("dummy", "w")
    print >> f, "dummy data"
    f.close()
    hdfs.copyToHDFS("dummy", "dummy")


def load_data_to_hdfs(input_data_file):
    '''Loads a data file to HDFS. For future use.'''
    input_path = input_data_file.split("/")
    hdfs_filename = input_path[len(input_path)-1]
    hdfs.copyToHDFS(input_data_file, hdfs_filename)

def download_hdfs_data(output_data_name):
    '''Given a full path, downloads an output directory from HDFS to the specified location.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    f = open(output_data_name, "w")
    print >> f, hdfs.cat(hdfs_filename+"/part*")["stdout"]
    f.close()

def print_hdfs_data(output_data_name):
    '''Given a full path, prints the output of all parts of an HDFS directory.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    print hdfs.cat(hdfs_filename+"/part*")["stdout"]


def set_checkpoint_directory(output_data_name):
    '''Creates a checkpoint directory for parallel file processing. This directory
    is always named hdmc_checkpoints and lives under config.shared_tmp_space/<username>.'''
    output_path = output_data_name.split("/")
    output_path.pop()
    output_dir = config.shared_tmp_space+"/"+os.getlogin()
    print output_dir
    try:
        os.mkdir(output_dir)
        os.system('chmod 777 '+ output_dir)
    except OSError:
        pass
    cwd = os.getcwd()
    os.chdir(output_dir)
    os.system("rm -rf hdmc_checkpoints")
    os.system("mkdir hdmc_checkpoints")
    os.system("chmod 777 hdmc_checkpoints")
    os.chdir(cwd)
    return output_dir+"/hdmc_checkpoints"

def get_checkpoint_names(file_list):
    '''Given a list of file or command names, produces checkpoint names by taking
    the last member of the array generated by splitting on /.'''
    checkpoints = []
    for f in file_list:
        if "/" in f:
            path = f.split("/")
            checkpoints.append(path[-1])
        else:
            checkpoints.append(f)
    return checkpoints

def make_pseudo_checkpoints(file_list):
    '''Designed to make checkpointing long lists of parameters (e.g. URLs) easier.'''
    checkpoints = []
    os.system('mkdir pseudo_checkpoints')
    cp_count = 0
    for f in file_list:
        os.system('echo '+f + "> pseudo_checkpoints/" + str(cp_count))
        checkpoints.append(str(cp_count))
        file_list[cp_count] = "pseudo_checkpoints/" + str(cp_count)
        cp_count += 1
    return checkpoints
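
# For example (illustrative parameters), make_pseudo_checkpoints(['http://a.example',
# 'http://b.example']) writes each URL to pseudo_checkpoints/0 and
# pseudo_checkpoints/1, rewrites file_list in place to point at those files,
# and returns ['0', '1'].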



def submit(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False, num_mappers=None, num_reducers=None):
    '''Submits a non-blocking script job to a MapReduce cluster and collects output
    in output_data_name. Supporting filenames can be passed
    as a list, as can a reducing/filtering script. Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug, num_mappers, num_reducers)
    execute(hadoop_call)



def submit_inline(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False, num_mappers=None, num_reducers=None):
    '''Submits a blocking script job to a MapReduce cluster and collects output
    in output_data_name. Supporting filenames can be passed
    as a list, as can a reducing/filtering script. Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug, num_mappers, num_reducers)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()
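
# Example with a user-supplied reducer (hedged sketch; simulate.py and
# histogram_reduce.py are assumed user scripts, not part of this module):
#
#   submit_inline("./simulate.py", "histogram", iterations=5000,
#                 reduction_script="histogram_reduce.py", arguments="--bins 50")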

def submit_checkpoint_inline(script, output_data_name, file_list, supporting_file_list=[], reduction_script=None, arguments="", files=True, debug=False, num_mappers=None, num_reducers=None):
    '''Submits a script to a MapReduce cluster for
    parallel operation on a number of files. An optional reducer script can be
    applied as well, but should filter the map results by splitting file output
    on ==HDMC_CHECKPOINT==. Arguments to the submitted script
    should be passed as a string. Blocking.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    if files:
        checkpoints = get_checkpoint_names(file_list)
    else:
        print "not processing files"
        checkpoints = make_pseudo_checkpoints(file_list)

    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, files, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()

    hadoop_call = build_checkpoint_call(script, output_data_name, file_list+supporting_file_list, reduction_script, arguments, num_mappers, num_reducers)

    print "executing"
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()
    return checkpoints
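
# Example (hedged; grep_errors.sh and the log files are illustrative):
#
#   submit_checkpoint_inline("./grep_errors.sh", "error_report",
#                            ["logs/day1.txt", "logs/day2.txt"])
#
# Each listed file is handed to one invocation of grep_errors.sh on the cluster,
# and the checkpoint-filtered output is downloaded to ./error_report.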

def submit_checkpoint(script, output_data_name, file_list, supporting_file_list=[], reduction_script=None, arguments="", files=True, debug=False, num_mappers=None, num_reducers=None):
    '''Submits a script to a MapReduce cluster for
    parallel operation on a number of files. An optional reducer script can be
    applied as well, but should filter the map results by splitting file output
    on ==HDMC_CHECKPOINT==. Arguments to the submitted script
    should be passed as a string. Non-blocking.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    if files:
        checkpoints = get_checkpoint_names(file_list)
    else:
        print "not processing files"
        checkpoints = make_pseudo_checkpoints(file_list)

    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, files, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()

    hadoop_call = build_checkpoint_call(script, output_data_name, file_list+supporting_file_list, reduction_script, arguments, num_mappers, num_reducers)

    print "executing"
    # Non-blocking: the job is launched and control returns immediately; results
    # must be retrieved once the job finishes (e.g. with download_hdfs_data).
    execute(hadoop_call)
    return checkpoints


def cleanup():
    '''Remove files generated by HDMC.'''
    if os.path.isfile("frame.py"):
        os.remove("frame.py")
    if os.path.isfile("checkpoint_frame.py"):
        os.remove("checkpoint_frame.py")
    if os.path.isfile("checkpoint_filter.py"):
        os.remove("checkpoint_filter.py")
    if os.path.isfile("dummy"):
        os.remove("dummy")
    if os.path.isdir("pseudo_checkpoints"):
        pcps = glob("pseudo_checkpoints/*")
        for f in pcps:
            os.remove(f)
        os.removedirs("pseudo_checkpoints")