Source Code for Module ziggy.hdmc.hdmc

'''
Module for running Monte Carlo and other batch jobs on a Hadoop instance.
The module allows for the submission of scripts (and supporting files)
to a Hadoop MapReduce cluster for batch execution.  Default operation runs
the submitted script for the specified number of iterations on the configured
Hadoop instance.  By supplying an additional reducer script, data generated in
the batch process can be reduced/filtered/processed before it is written to HDFS
and made available to the user.

WARNING: Piped UNIX commands tend to fail when used as mappers and reducers.
Instead, write a bash or Python script.

Created on Jul 28, 2010

@author: dwmclary
'''

import hadoop_config as config
from hdfs import hdfs_access as hdfs
import shlex
import subprocess
import sys
import os
import stat
from code_generator import CodeGenerator
from glob import glob
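# Hedged usage sketch (not part of the original module): a typical Monte Carlo
# submission might look like the lines below.  The script name, output file, and
# argument string are illustrative assumptions, not values defined by this module.
#
#   from ziggy.hdmc import hdmc
#   hdmc.submit_inline("./run_trial.py", "mc_results.txt",
#                      iterations=1000, arguments="--samples 500")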
def make_checkpointing_filter():
    '''Generates a python script which filters checkpointing results from HDMC.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from copy import copy\n")
    c.write("import sys\n\n")
    c.write("current_key = ''\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("global current_key\n")
    c.write("global current_line\n")
    c.write("for line in file:\n")
    c.indent()
    c.write("line = line.strip()\n")
    c.write("try:\n")
    c.indent()
    c.write("key, value = line.split('==HDMC_CHECKPOINT==')\n")
    c.dedent()
    c.write("except ValueError:\n")
    c.indent()
    c.write("key=current_key+'-moredata'\n")
    c.write("value=line\n")
    c.dedent()
    c.write("current_key = key\n")
    c.write("yield key, value\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("global current_key\n")
    c.write("global current_line\n")
    c.write("seen_keys = {}\n")
    c.write("data = read_input(sys.stdin)\n")
    c.write("for key, value in data:\n")
    c.indent()
    c.write("line_data = value.split('==')\n")
    c.write("line_key = int(line_data[1])\n")
    c.write("line_value = line_data[2]\n")
    c.write("if key in seen_keys:\n")
    c.indent()
    c.write("if line_key not in seen_keys[key]:\n")
    c.indent()
    c.write("seen_keys[key][line_key] = line_value\n")
    c.dedent()
    c.dedent()
    c.write("else:\n")
    c.indent()
    c.write("seen_keys[key] = {}\n")
    c.write("seen_keys[key][line_key] = line_value\n")
    c.dedent()
    c.dedent()
    c.write("for key in seen_keys:\n")
    c.indent()
    c.write("lines = seen_keys[key].keys()\n")
    c.write("lines.sort()\n")
    c.write("for l in lines:\n")
    c.indent()
    #c.write("print key +':'+str(l)+':'+seen_keys[key][l]\n")
    c.write("print seen_keys[key][l]\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_filter.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_filter.py")
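# Illustrative note (inferred from the generated code above, not stated elsewhere):
# the filter expects mapper output of the form
#   <checkpoint_name>==HDMC_CHECKPOINT==LINE==<line_number>==<original output line>
# and prints each checkpoint's lines exactly once, in line-number order, so only
# the de-duplicated script output reaches HDFS.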
def make_checkpointing_frame(script, checkpoint_names, checkpoint_dir, arguments="", files=True, debug=False):
    '''Generates a python script which, given a list of files to be processed,
    executes the specified script over the files in parallel via MapReduce.'''

    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from glob import glob\n")
    c.write("import sys, os, subprocess, shlex, random, time, re\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("wait_counter = 1\n")
    c.write("time.sleep(random.random())\n")
    #choose a checkpoint
    c.write("all_checkpoints = "+str(checkpoint_names)+"\n")
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("while len(final_checkpoints) < len(all_checkpoints):\n")
    c.indent()
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("while this_checkpoint_end in current_checkpoints:\n")
    c.indent()
    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("this_checkpoint_start = this_checkpoint+'_start'\n")
    c.write("this_checkpoint_end = this_checkpoint+'_end'\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.write("final_checkpoints = glob('"+checkpoint_dir+"/*_end')\n")
    c.write("for i in range(len(current_checkpoints)):\n")
    c.indent()
    c.write("current_checkpoints[i] = re.sub('"+checkpoint_dir+"/', '', current_checkpoints[i])\n")
    c.dedent()
    c.write("if len(final_checkpoints) == len(all_checkpoints):\n")
    c.indent()
    c.write("exit()\n")
    c.dedent()
    c.dedent()
    c.write("\n")
    c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_start'])\n")
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    if files:
        c.write("cmd = "+cmd+"+['./'+this_checkpoint]\n")
    else:
        c.write("argf = open('./'+this_checkpoint).readlines()\n")
        c.write("for i in range(len(argf)):\n")
        c.indent()
        c.write("argf[i] = argf[i].strip()\n")
        c.dedent()
        c.write("cmd = "+cmd+"+argf\n")

    c.write("p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("output = output.strip()\n")
        c.write("if len(output) > 0:\n")
        c.indent()
        c.write("line_count = 0\n")
        c.write("newline = '\\n'\n")
        c.write("for line in output.split(newline):\n")
        c.indent()
        c.write("print this_checkpoint+'==HDMC_CHECKPOINT==LINE=='+ str(line_count) + '=='+line.strip()\n")
        c.write("line_count += 1\n")
        c.dedent()
        c.dedent()
        c.write("if len(error.strip()) > 0:\n")
        c.indent()
        c.write("print >> sys.stderr, error.strip()\n")
        c.write("os.system('rm "+checkpoint_dir+"'+'/'+this_checkpoint)\n")
        c.write("exit(1)\n")
        c.dedent()
        c.write("else:\n")
        c.indent()
        c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.write("subprocess.call(['chmod','777','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")
        c.dedent()
        c.write("os.system('rm "+checkpoint_dir+"/'+this_checkpoint+'_start')\n")
    else:
        c.write("print output.strip(),error.strip()\n")
        c.write("subprocess.call(['touch','"+checkpoint_dir+"'+'/'+this_checkpoint+'_end'])\n")

    c.write("this_checkpoint = random.choice(all_checkpoints)\n")
    c.write("current_checkpoints = glob('"+checkpoint_dir+"/*')\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("checkpoint_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx checkpoint_frame.py")
    if not files:
        os.system("cp checkpoint_frame.py checkpoint_frame_save.py")
def make_frame(script, arguments="", iterations=1, debug=False):
    '''Generates a basic python frame for running a batch job on a MapReduce cluster.'''
    cmd = str(shlex.split("./"+script.split("/")[-1] + " " + arguments))
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys, os, subprocess, shlex, random\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("os.system('chmod a+rwx "+script.split("/")[-1]+"')\n")
    c.write("for i in range("+str(iterations/config.num_map_tasks)+"):\n")
    c.indent()
    c.write("p = subprocess.Popen("+cmd+", stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n")
    c.write("output, error = p.communicate()\n")
    c.write("sts = p.wait()\n")
    if not debug:
        c.write("print output\n")
    else:
        c.write("print output,error\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == \"__main__\":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx frame.py")
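# Rough sketch (assumption) of the frame.py emitted by
# make_frame("./run_trial.py", iterations=100) when config.num_map_tasks is 10;
# "run_trial.py" is a hypothetical script name:
#
#   #! /usr/bin/env python
#   import sys, os, subprocess, shlex, random
#
#   def main():
#       os.system('chmod a+rwx run_trial.py')
#       for i in range(10):
#           p = subprocess.Popen(['./run_trial.py'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#           output, error = p.communicate()
#           sts = p.wait()
#           print output
#
#   if __name__ == "__main__":
#       main()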
def get_output_hdfs_name(output_data_name):
    '''Given the full path to a file or directory, returns its HDFS equivalent'''
    output_path = output_data_name.split("/")
    return output_path[len(output_path)-1]
def build_hadoop_call(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments=None, debug=False, num_mappers=None, num_reducers=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.  The function also generates the necessary execution frame.'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]
    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]
    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]

    hadoop_call += ['-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]
    # mapper name
    hadoop_call += ['-mapper', "frame.py"]

    # set the reducer
    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    # build the supporting file list
    file_list = ["-file", script]
    file_list += ["-file", "./frame.py"]
    if reduction_script:
        file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    make_frame(script, arguments, iterations, debug)
    return hadoop_call
def build_checkpoint_call(script, output_data_name, supporting_file_list, reduction_script=None, arguments=None, num_mappers=None, num_reducers=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance, using the checkpointing frame (checkpoint_frame.py)
    as the mapper.'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]
    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]

    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]

    hadoop_call += ['-input', 'dummy', '-output', get_output_hdfs_name(output_data_name)]
    # mapper name
    hadoop_call += ['-mapper', "checkpoint_frame.py"]

    # set the reducer
    if reduction_script:
        hadoop_call += ['-reducer', get_output_hdfs_name(reduction_script)]
    else:
        hadoop_call += ['-reducer', 'NONE']

    # build the supporting file list
    file_list = ["-file", script]
    file_list += ["-file", "./checkpoint_frame.py"]
    if reduction_script:
        if reduction_script != "NONE":
            file_list += ["-file", reduction_script]
    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call
def build_generic_hadoop_call(mapper, reducer, input, output, supporting_file_list=None, num_mappers=None, num_reducers=None, key_comparator=None):
    '''Builds a call array suitable for subprocess.Popen which submits a streaming job to
    the configured MapReduce instance.'''
    # I/O setup
    hadoop_call = [config.hadoop, 'jar', config.hadoop_streaming]

    # process mapper, reducer, and key comparator options
    if num_mappers:
        hadoop_call += ["-D", "mapred.map.tasks="+str(num_mappers)]
    if num_reducers:
        hadoop_call += ["-D", "mapred.reduce.tasks="+str(num_reducers)]
    if key_comparator:
        hadoop_call += ["-D", "mapreduce.partition.keycomparator.options="+key_comparator]

    hadoop_call += ['-input', input, '-output', output]

    # set mapper and reducer
    hadoop_call += ['-mapper', mapper]
    if reducer != "NONE":
        hadoop_call += ['-reducer', reducer]
    else:
        hadoop_call += ['-reducer', 'NONE']

    # build the supporting file list
    if reducer not in ["NONE", "aggregate"]:
        file_list = ["-file", mapper, "-file", reducer]
    else:
        file_list = ["-file", mapper]

    if supporting_file_list:
        for f in supporting_file_list:
            file_list += ["-file", f]

    hadoop_call += file_list
    return hadoop_call
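# Hedged example (mapper/reducer names are assumptions): for
# build_generic_hadoop_call("wc_map.py", "wc_reduce.py", "books", "counts", num_reducers=4)
# the returned array resembles
#
#   [config.hadoop, 'jar', config.hadoop_streaming,
#    '-D', 'mapred.reduce.tasks=4',
#    '-input', 'books', '-output', 'counts',
#    '-mapper', 'wc_map.py', '-reducer', 'wc_reduce.py',
#    '-file', 'wc_map.py', '-file', 'wc_reduce.py']
#
# which execute() or execute_and_wait() pass directly to subprocess.Popen.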
def execute(hadoop_call):
    '''Nonblocking execution of the given call array'''
    p = subprocess.Popen(hadoop_call)
def execute_and_wait(hadoop_call):
    '''Blocking execution of the given call array'''
    p = subprocess.Popen(hadoop_call)
    sts = p.wait()
    return sts
def create_dummy_data():
    '''Creates a piece of dummy map input data in HDFS.  This is necessary because
    Hadoop streaming requires input for mapping tasks.'''
    f = open("dummy", "w")
    print >> f, "dummy data"
    f.close()
    hdfs.copyToHDFS("dummy", "dummy")
def load_data_to_hfds(input_data_file):
    '''Loads a data file to HDFS.  For future use.'''
    input_path = input_data_file.split("/")
    hdfs_filename = input_path[len(input_path)-1]
    hdfs.copyToHDFS(input_data_file, hdfs_filename)
def download_hdfs_data(output_data_name):
    '''Given a full path, downloads an output directory from HDFS to the specified location.'''
    output_path = output_data_name.split("/")
    hdfs_filename = output_path[-1]
    f = open(output_data_name, "w")
    print >> f, hdfs.cat(hdfs_filename+"/part*")["stdout"]
    f.close()
def set_checkpoint_directory(output_data_name):
    '''Creates a checkpoint directory for parallel file processing.  This directory
    is always named hdmc_checkpoints and is created in the configured shared temporary
    space under the current user's login.'''
    output_path = output_data_name.split("/")
    output_path.pop()
    output_dir = config.shared_tmp_space+"/"+os.getlogin()
    print output_dir
    try:
        os.mkdir(output_dir)
        os.system('chmod 777 '+ output_dir)
    except OSError:
        pass
    cwd = os.getcwd()
    os.chdir(output_dir)
    os.system("rm -rf hdmc_checkpoints")
    os.system("mkdir hdmc_checkpoints")
    os.system("chmod 777 hdmc_checkpoints")
    os.chdir(cwd)
    return output_dir+"/hdmc_checkpoints"
def get_checkpoint_names(file_list):
    '''Given a list of file or command names, produces checkpoint names by taking
    the last member of the array generated by splitting on "/".'''
    checkpoints = []
    for f in file_list:
        if "/" in f:
            path = f.split("/")
            checkpoints.append(path[-1])
        else:
            path = f
            checkpoints.append(f)
    return checkpoints
def make_pseudo_checkpoints(file_list):
    '''Designed to make checkpointing long lists of parameters (e.g. URLs) easier'''
    checkpoints = []
    os.system('mkdir pseudo_checkpoints')
    cp_count = 0
    for f in file_list:
        os.system('echo '+f + "> pseudo_checkpoints/" + str(cp_count))
        checkpoints.append(str(cp_count))
        file_list[cp_count] = "pseudo_checkpoints/" + str(cp_count)
        cp_count += 1
    return checkpoints
def submit(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False, num_mappers=None, num_reducers=None):
    '''Submits a script as a non-blocking job to a MapReduce cluster and collects output
    in output_data_name.  Supporting filenames can be passed
    as a list, as can a reducing/filtering script.  Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug, num_mappers, num_reducers)
    execute(hadoop_call)
def submit_inline(script, output_data_name, iterations=1, supporting_file_list=None, reduction_script=None, arguments="", debug=False, num_mappers=None, num_reducers=None):
    '''Submits a script as a blocking job to a MapReduce cluster and collects output
    in output_data_name.  Supporting filenames can be passed
    as a list, as can a reducing/filtering script.  Arguments to the submitted script
    should be passed as a string.'''
    create_dummy_data()
    hadoop_call = build_hadoop_call(script, output_data_name, iterations, supporting_file_list, reduction_script, arguments, debug, num_mappers, num_reducers)
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()
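# Hedged usage sketch (script, file, and output names are hypothetical): run
# 5000 iterations of a simulation script, post-process the mapper output with a
# reducer script, and block until the results land in ./sim_results.txt:
#
#   submit_inline("./simulate.sh", "sim_results.txt", iterations=5000,
#                 supporting_file_list=["params.cfg"],
#                 reduction_script="./summarize.py",
#                 arguments="--burn-in 100")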
def submit_checkpoint_inline(script, output_data_name, file_list, supporting_file_list=[], reduction_script=None, arguments="", files=True, debug=False, num_mappers=None, num_reducers=None):
    '''Submits a script to a MapReduce cluster for
    parallel operation on a number of files.  An optional reducer script can be
    applied as well, but should filter the map results by splitting file output
    on ==HDMC_CHECKPOINT==.  Arguments to the submitted script
    should be passed as a string.  Blocking.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    if files:
        checkpoints = get_checkpoint_names(file_list)
    else:
        print "not processing files"
        checkpoints = make_pseudo_checkpoints(file_list)

    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, files, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()
    # elif not files:
    #     reduction_script = "NONE"

    hadoop_call = build_checkpoint_call(script, output_data_name, file_list+supporting_file_list, reduction_script, arguments, num_mappers, num_reducers)

    print "executing"
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()
    return checkpoints
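# Hedged usage sketch (paths are assumptions): process a directory of data
# files in parallel, one checkpoint per file, blocking until all are finished:
#
#   data_files = glob("/shared/data/*.dat")
#   submit_checkpoint_inline("./process_one.py", "processed.txt", data_files,
#                            arguments="--normalize")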
def submit_checkpoint(script, output_data_name, file_list, supporting_file_list=[], reduction_script=None, arguments="", files=True, debug=False, num_mappers=None, num_reducers=None):
    '''Submits a script to a MapReduce cluster for
    parallel operation on a number of files.  An optional reducer script can be
    applied as well, but should filter the map results by splitting file output
    on ==HDMC_CHECKPOINT==.  Arguments to the submitted script
    should be passed as a string.  Non-blocking.'''
    create_dummy_data()
    checkpoint_dir = set_checkpoint_directory(output_data_name)
    if files:
        checkpoints = get_checkpoint_names(file_list)
    else:
        print "not processing files"
        checkpoints = make_pseudo_checkpoints(file_list)

    make_checkpointing_frame(script, checkpoints, checkpoint_dir, arguments, files, debug)
    if not reduction_script:
        reduction_script = "checkpoint_filter.py"
        make_checkpointing_filter()
    # elif not files:
    #     reduction_script = "NONE"

    hadoop_call = build_checkpoint_call(script, output_data_name, file_list+supporting_file_list, reduction_script, arguments, num_mappers, num_reducers)

    print "executing"
    execute_and_wait(hadoop_call)
    download_hdfs_data(output_data_name)
    cleanup()
    return checkpoints
def cleanup():
    '''Remove files generated by HDMC.'''
    if os.path.isfile("frame.py"):
        os.remove("frame.py")
    if os.path.isfile("checkpoint_frame.py"):
        os.remove("checkpoint_frame.py")
    if os.path.isfile("checkpoint_filter.py"):
        os.remove("checkpoint_filter.py")
    if os.path.isfile("dummy"):
        os.remove("dummy")
    if os.path.isdir("pseudo_checkpoints"):
        pcps = glob("pseudo_checkpoints/*")
        for f in pcps:
            os.remove(f)
        os.removedirs("pseudo_checkpoints")