'''
Created on Jul 29, 2010

@author: dwmclary
'''
6 import unittest
7 import os
8 import hdmc
9 import hdfs.hdfs_access as hdfs
10 import hadoop_config as config
11 from glob import glob
12
14
19
21 self.wd = os.getcwd()
22 self.script = self.wd+"/test/numpy_random_means.py"
23 self.reducer = self.wd+"/test/numpy_mean_reduction.py"
24 self.output_file = self.wd+"/test/random_means"
25 self.checkpoint_names = map(str, range(1,20))
26 self.checkpoint_dir = config.shared_tmp_space+"/"+os.getlogin()+"/hdmc_checkpoints"
27 pass
28
29
32
33
35 hdmc.make_frame(self.script)
36 self.assertTrue(os.path.isfile(self.wd+"/frame.py"))
37
39 hdmc.make_checkpointing_frame(self.script, self.checkpoint_names, self.checkpoint_dir)
40 self.assertTrue(os.path.isfile(self.wd+"/checkpoint_frame.py"))
41
47
49 os.system('rmdir '+self.checkpoint_dir)
50 checkpoint_dir = hdmc.set_checkpoint_directory(self.output_file)
51 self.assertTrue(os.path.exists(checkpoint_dir))
52
54 hdmc.download_hdfs_data(self.wd+"/test/dummy")
55 self.assertTrue(os.path.isfile(self.wd+"/test/dummy"))
56 os.system('rm '+self.wd+'/test/dummy')
57 self.assertFalse(os.path.isfile(self.wd+"/test/dummy"))
58
60 hdfs.rm("random_means")
61 hdmc.submit_inline(self.script, self.output_file, iterations=200)
62 self.assertTrue(os.path.exists(self.wd+"/test/random_means"))
63
65 hdfs.rm("random_means")
66 hdmc.submit_inline(self.script, self.output_file, iterations=200, reduction_script = self.reducer)
67 self.assertTrue(os.path.exists(self.wd+"/test/random_means"))
68
70 hdfs.rm("line_counts")
71 file_list = glob(self.wd+"/test/gutenberg/*")
72 self.script = self.wd+"/test/line_counter.py"
73 self.output_file = self.wd+"/test/line_counts"
74 checkpoints = hdmc.submit_checkpoint_inline(self.script, self.output_file, file_list, [])
75 self.assertEqual(len(file_list), len(checkpoints))
76 self.assertTrue(os.path.exists(self.wd+"/test/line_counts"))
77 hadoop_result_file = self.wd+"/test/line_counts"
78 master_result_file = self.wd+"/test/wc_output.dat"
79 hadoop_results = {}
80 master_results = {}
81
82 for line in open(master_result_file).readlines():
83 if len(line.rstrip()) > 0:
84 entry = line.split()
85 master_results[entry[1]] = int(entry[0])
86 for line in open(hadoop_result_file).readlines():
87 if len(line.rstrip()) > 0:
88 entry = line.split()
89 hadoop_results[entry[1]] = int(entry[0])
90
91 for key in master_results.keys():
92 self.assertEqual(master_results[key], hadoop_results[key])
93
95 url_list = ["http://www.gutenberg.org/files/8713/8713-h/8713-h.htm",\
96 "http://www.gutenberg.org/files/10554/10554-h/10554-h.htm",\
97 "http://www.gutenberg.org/ebooks/8164.html.gen",\
98 "http://www.gutenberg.org/files/5200/5200-h/5200-h.htm",\
99 "http://www.gutenberg.org/ebooks/100.txt.utf8",\
100 "http://www.gutenberg.org/files/25717/25717-h/25717-h.htm",\
101 "http://www.gutenberg.org/files/221/221-h/221-h.htm"]
102 hdfs.rm("book_contents")
103 os.system("rm "+self.wd+"/test/book_contents")
104 self.script = self.wd+"/test/fetch_books.py"
105 self.output_file = self.wd+"/test/book_contents"
106 self.supporting_files = []
107 checkpoints = hdmc.submit_checkpoint_inline(self.script, self.output_file, url_list,self.supporting_files, files=False)
108 self.assertEqual(len(url_list), len(checkpoints))
109 self.assertTrue(os.path.exists(self.wd+"/test/book_contents"))
110
111
113 hdfs.rm("line_total")
114 file_list = glob(self.wd+"/test/gutenberg/*")
115 self.script = self.wd+"/test/line_counter.py"
116 self.output_file = self.wd+"/test/line_total"
117 self.reducer = self.wd+"/test/line_sum.py"
118 checkpoints = hdmc.submit_checkpoint_inline(self.script, self.output_file, file_list, reduction_script = self.reducer, arguments="")
119 self.assertEqual(len(file_list), len(checkpoints))
120 self.assertTrue(os.path.exists(self.wd+"/test/line_total"))
121 hadoop_result_file = self.wd+"/test/line_total"
122 master_result_file = self.wd+"/test/wc_total.dat"
123 hadoop_results = {}
124 master_results = {}
125 for line in open(master_result_file).readlines():
126 if len(line.rstrip()) > 0:
127 entry = line.split()
128 master_results[entry[1]] = int(entry[0])
129 for line in open(hadoop_result_file).readlines():
130 if len(line.rstrip()) > 0:
131 entry = line.split()
132 hadoop_results[entry[1]] = int(entry[0])
133 for key in master_results.keys():
134 self.assertEqual(master_results[key], hadoop_results[key])
135
if __name__ == "__main__":
    # Run the suite when this module is executed directly.
    unittest.main()
139