import os
import subprocess

from . import hdmc
from .hdmc import hdfs
from .hdmc.code_generator import CodeGenerator

tmp_directory = "/tmp"
__all__ = ["sort_numeric", "sort_ascii", "search", "histogram"]


# Frame generators: each function below writes a small Hadoop streaming script
# into the current working directory and marks it executable. (The names of
# the two histogram generators are assumptions; the other names are taken from
# the call sites further down in this module.)

def make_histogram_mapper_frame():
    '''Write histogram_frame.py: a streaming mapper that emits one
    'ValueHistogram:<value>:1' record per non-empty input line.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("print 'ValueHistogram:'+line+':'+'1'\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")
    frame_file = open("histogram_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx histogram_frame.py")

def make_histogram_reducer_frame():
    '''Write histogram_frame.py (the same file name the mapper frame uses):
    a streaming script that tallies identical input lines in a defaultdict
    and emits 'ValueHistogram:<value>:<count>' records.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("from collections import defaultdict\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("c = defaultdict(int)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("c[line] += 1\n")
    c.dedent()
    c.dedent()
    c.write("for key in c.keys():\n")
    c.indent()
    c.write("print 'ValueHistogram:'+key+':'+str(c[key])\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")
    frame_file = open("histogram_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx histogram_frame.py")

def make_grep_frame():
    '''Write search_frame.py: a streaming mapper that reads the search
    pattern from the shipped 'ziggy_search' file and prints every matching
    input line.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file, pattern):\n")
    c.indent()
    c.write("line_count = 0\n")
    c.write("for line in file:\n")
    c.indent()
    c.write("line_count += 1\n")
    c.write("if pattern in line.strip():\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("search_pattern = open('ziggy_search').readline().strip()\n")
    c.write("data = read_input(sys.stdin, search_pattern)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("print line\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")
    frame_file = open("search_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx search_frame.py")

def make_search_frame(suppress_lines):
    '''Write search_frame.py: a streaming mapper that reports matches as
    'filename:line_number' (input lines are expected to be prefixed with the
    originating file name by search() below). When suppress_lines is False
    the matched text is kept in the output as well.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file, pattern):\n")
    c.indent()
    c.write("line_count = 0\n")
    c.write("for line in file:\n")
    c.indent()
    c.write("line_count += 1\n")
    c.write("if pattern in line.strip():\n")
    c.indent()
    c.write("yield line.strip(), line_count\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("search_pattern = open('ziggy_search').readline().strip()\n")
    c.write("data = read_input(sys.stdin, search_pattern)\n")
    c.write("for filename, count in data:\n")
    c.indent()
    if suppress_lines:
        c.write("print filename.split(':')[0] + ':' + str(count)\n")
    else:
        c.write("print filename + ':' + str(count)\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")
    frame_file = open("search_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx search_frame.py")

def make_identity_frame():
    '''Write identity_frame.py: a streaming script that echoes every
    non-empty input line, used as a pass-through mapper or reducer.'''
    c = CodeGenerator()
    c.begin()
    c.write("#! /usr/bin/env python\n\n")
    c.write("import sys\n\n")
    c.write("def read_input(file):\n")
    c.indent()
    c.write("for line in file:\n")
    c.indent()
    c.write("yield line.strip()\n")
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write("def main():\n")
    c.indent()
    c.write("data = read_input(sys.stdin)\n")
    c.write("for line in data:\n")
    c.indent()
    c.write("if len(line)>0:\n")
    c.indent()
    c.write("print line\n")
    c.dedent()
    c.dedent()
    c.dedent()
    c.write("\n\n")
    c.write('if __name__ == "__main__":\n')
    c.indent()
    c.write("main()\n")
    c.write("\n")

    frame_file = open("identity_frame.py", "w")
    print >> frame_file, c.end()
    frame_file.close()
    os.system("chmod a+rwx identity_frame.py")

def sort_numeric(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to sort a large set of numeric values.'''

def sort_ascii(input_file, output_file, ascending=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to sort a large set of ASCII values.'''
    hdfs.copyToHDFS(input_file, input_file.split("/")[-1])
    make_identity_frame()
    if ascending:
        hadoop_call = hdmc.build_generic_hadoop_call("identity_frame.py", "identity_frame.py", input_file.split("/")[-1], output_file, [], num_mappers, num_reducers)
    else:
        # "r" asks the key comparator to reverse the default (ascending) order.
        keycomp = "r"
        hadoop_call = hdmc.build_generic_hadoop_call("identity_frame.py", "identity_frame.py", input_file.split("/")[-1], output_file, [], num_mappers, num_reducers, keycomp)
    print hadoop_call
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()

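# A minimal usage sketch for sort_ascii, assuming a running Hadoop cluster
# reachable through hdmc/hdfs; the file names are illustrative only:
#
#   sort_ascii("lines.txt", "lines_sorted.txt", ascending=False,
#              num_mappers=4, num_reducers=1)
#
# The sorted output is copied back from HDFS to the local file "lines_sorted.txt".
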
def text_search(input_directory, input_files, output_file, search_pattern, suppress_lines=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to search a collection of input files, printing every
    line that contains search_pattern.'''
    hdfs.mkdir(input_directory)
    for f in input_files:
        hdfs_location = input_directory + "/" + f.split("/")[-1]
        hdfs.copyToHDFS(f, hdfs_location)

    os.system("echo " + search_pattern + " > ziggy_search")

    make_grep_frame()
    make_identity_frame()

    hadoop_call = hdmc.build_generic_hadoop_call("search_frame.py", "identity_frame.py", input_directory, output_file, ["./ziggy_search"], num_mappers, num_reducers)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()

def search(input_directory, input_files, output_file, search_pattern, suppress_lines=True, num_mappers=None, num_reducers=None):
    '''Use MapReduce to search a collection of input files, reporting where
    each match occurred. Every input file is copied to HDFS with each line
    prefixed by its file name so the mapper can report the source of a match;
    with suppress_lines=True only 'filename:line_number' is printed, otherwise
    the matched text is included as well.'''
    hdfs.mkdir(input_directory)

    for f in input_files:
        # Prefix every line with the originating file name before shipping
        # the copy to HDFS; strip the trailing newline so print does not emit
        # blank lines that would skew the reported line numbers.
        tmpfile = open(tmp_directory + "/ziggy_search_tmp", "w")
        original_file = open(f)
        for line in original_file:
            print >> tmpfile, f + ":" + line.rstrip("\n")
        tmpfile.close()
        original_file.close()
        hdfs_location = input_directory + "/" + f.split("/")[-1]
        hdfs.copyToHDFS(tmp_directory + "/ziggy_search_tmp", hdfs_location)
        os.remove(tmp_directory + "/ziggy_search_tmp")

    os.system("echo " + search_pattern + " > ziggy_search")

    make_search_frame(suppress_lines)
    make_identity_frame()

    hadoop_call = hdmc.build_generic_hadoop_call("search_frame.py", "identity_frame.py", input_directory, output_file, ["./ziggy_search"], num_mappers, num_reducers)
    hdmc.execute_and_wait(hadoop_call)
    hdfs.copyFromHDFS(output_file, output_file)
    cleanup()

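# A minimal usage sketch for search, assuming a running Hadoop cluster; the
# directory and file names are illustrative only:
#
#   search("search_input", ["logs/app1.log", "logs/app2.log"],
#          "matches.txt", "ERROR")
#
# With suppress_lines=True (the default), each output record has the form
# 'logs/app1.log:42', i.e. the file containing the match and the line number
# counted by the mapper.
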
def histogram(input_file, output_file, num_mappers=None, num_reducers=None):
    '''Use MapReduce to build a histogram of the values in a large input file.'''

def cleanup():
    '''Remove the generated frame scripts and the temporary search-pattern file.'''
    if os.path.isfile("identity_frame.py"):
        os.remove("identity_frame.py")
    if os.path.isfile("histogram_frame.py"):
        os.remove("histogram_frame.py")
    if os.path.isfile("search_frame.py"):
        os.remove("search_frame.py")
    if os.path.isfile("ziggy_search"):
        os.remove("ziggy_search")
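
# Minimal usage sketches for the remaining exported helpers, exercising only
# their signatures as defined above (file names are illustrative only):
#
#   sort_numeric("data/numbers.txt", "numbers_sorted.txt", ascending=False)
#   histogram("data/values.txt", "values_hist.txt", num_reducers=1)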