1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 __author__ = """Ronaldo Amaral Santos <ronaldinho.as@gmail.com>"""
32 __docformat__ = 'plaintext'
33
34 from xml.dom import minidom
35 from StringIO import StringIO
36 from GranularUtils import Grain
37
38 import os, sys, time, random, shutil
39 import PIL.Image
40 import mimetypes
41
42 from pypdf2table import ExecuteConverter
43
46
48 """
49 - Provide the grain extraction functionality for PDF documents
50 - Retrieve tables and images
51 """
52 Document = None  # class-level default; the document object is stored here on the instance
53 __pathFolder = None  # class-level default; set to the temporary folder path once the PDF has been saved
54
def __init__(self, Document):
    """
    Store the document and, when it is a PDF, save it into a fresh temporary folder.

    Document -- object providing getContentType() and getFilename(); calling the
                object itself must return something whose getvalue() yields the
                PDF data to write.

    Raises ValueError when the content type is not "application/pdf".
    """
    # NOTE(review): the ``def`` line is missing from the listing this body came
    # from; the signature was reconstructed from the body, which reads only
    # ``self`` and ``Document`` -- confirm against the real file.
    import tempfile  # local import: tempfile is not imported at file level

    self.Document = Document

    if self.Document.getContentType() != "application/pdf":
        # A real exception class is required: string exceptions
        # (``raise "message"``) are rejected by Python >= 2.6.
        raise ValueError("The file is not a PDF Document")

    # mkdtemp picks a unique name and creates the directory atomically, which
    # avoids the collisions a time-plus-random name can produce.
    self.__pathFolder = tempfile.mkdtemp()

    # NOTE(review): getFilename() is used as-is; presumably it is a bare file
    # name without path separators -- verify against the caller.
    pdfPath = os.path.join(self.__pathFolder, self.Document.getFilename())

    # Binary mode keeps the PDF bytes intact on every platform.
    filePDF = open(pdfPath, 'wb')
    try:
        filePDF.write(self.Document().getvalue())
    finally:
        # Close the handle even when the write fails.
        filePDF.close()
72
73
76
def __del__(self):
    """
    When the object is destroyed, remove the temporary folder with everything inside of it.

    Does nothing when no temporary folder was ever recorded.
    """
    # NOTE(review): the ``def`` line is missing from the listing this body came
    # from; ``__del__`` is inferred from the docstring -- confirm against the
    # real file.
    folder = self.__pathFolder

    # __pathFolder can still be None (e.g. the document was rejected before a
    # folder was created); rmtree(None) would raise inside the finalizer.
    if folder is None:
        return

    # Errors raised in a finalizer cannot reach any caller, so removal is
    # best-effort; this also tolerates a folder that is already gone.
    shutil.rmtree(folder, ignore_errors=True)
82
83
84
85
def __getImageDocumentList(self):
    """
    Retrieve the images from the PDF document.

    Runs the external ``pdfimages -j`` tool on the saved PDF, writing its output
    into the temporary folder with the root name "imagegrain", then wraps every
    file found there (except the PDF itself) in a Grain. Files ending in .ppm
    or .pbm are converted to PNG through PIL; when the conversion fails, and
    for every other extension, the file content is passed through unchanged.

    Returns a list of Grain objects (sorted by file name), or an empty list
    when pdfimages exits with status 1.
    """
    # NOTE(review): the ``def`` line is missing from the listing this body came
    # from; name and signature are reconstructed from the calls
    # ``self.__getImageDocumentList()`` elsewhere in the class.
    import subprocess  # local import: subprocess is not imported at file level

    folder = self.__pathFolder
    pdfName = self.Document.getFilename()

    # An argument list bypasses the shell, so quotes or metacharacters in the
    # document's file name cannot alter the command.
    command = ['pdfimages', '-j',
               os.path.join(folder, pdfName),
               os.path.join(folder, 'imagegrain')]
    try:
        status = subprocess.call(command)
    except OSError:
        # pdfimages could not be started. The shell-based call this replaces
        # reported that only through a non-zero status and then simply found
        # no image files, so fall through the same way.
        status = None

    # Exit status 1 is the only failure treated as "no images"; it is the
    # value os.system() reported as 256.
    if status == 1:
        return []

    def readRaw(path):
        # Return the untouched bytes of an extracted image file.
        rawFile = open(path, 'rb')
        try:
            return rawFile.read()
        finally:
            rawFile.close()

    image_list = []
    names = os.listdir(folder)
    names.sort()  # os.listdir gives no ordering guarantee
    for name in names:
        path = os.path.join(folder, name)
        # Skip the source PDF and anything that is not a regular file, e.g. the
        # "outputXMLFolder" entry the table extraction places in this folder
        # (presumably a directory -- opening it as a file would fail).
        if name == pdfName or not os.path.isfile(path):
            continue

        base, extension = os.path.splitext(name)
        content = None

        if extension.lower() in ('.ppm', '.pbm'):
            try:
                converted = StringIO()
                PIL.Image.open(path).save(converted, "PNG")
            except Exception:
                # Conversion is best-effort: keep the original file instead.
                content = None
            else:
                # Rewind so the buffer reads from the start, like the buffers
                # built directly from file content below.
                converted.seek(0)
                content = converted
                name = base + ".png"

        if content is None:
            content = StringIO(readRaw(path))

        image_list.append(Grain(id=name, content=content))

    return image_list
118
def __getTableDocumentList(self):
    """
    Extract the tables of the PDF file using pyPdf2Table.

    Returns a list of Grain objects, one per entry of the converter's table
    list, with the ids "Table1.html", "Table2.html", ... in that order.
    Any exception raised while converting is re-raised as PyPdf2TableError.
    """
    # NOTE(review): the ``def`` line is missing from the listing this body came
    # from; name and signature are reconstructed from the calls
    # ``self.__getTableDocumentList()`` elsewhere in the class.
    grains = []
    sourcePdf = os.path.join(self.__pathFolder, self.Document.getFilename())
    xmlFolder = os.path.join(self.__pathFolder, "outputXMLFolder")

    try:
        converter = ExecuteConverter.ExecuteConverter()
        converter.extractTables(sourcePdf, xmlFolder)
        extracted = converter.getTableList()
    except Exception:
        # Same effect as ``raise PyPdf2TableError, e``: an error that already
        # is a PyPdf2TableError is raised as-is, anything else gets wrapped.
        # NOTE(review): PyPdf2TableError is not defined in the visible part of
        # the file -- confirm it is in scope.
        failure = sys.exc_info()[1]
        if isinstance(failure, PyPdf2TableError):
            raise failure
        raise PyPdf2TableError(failure)

    for position, tableMarkup in enumerate(extracted):
        grain = Grain()
        grain.setId("Table" + str(position + 1) + ".html")
        grain.setContent(StringIO(tableMarkup))
        grains.append(grain)

    return grains
145
146
147
148
149
150
151
152
153
154
155
156
157
158
160 """
161 Return the document's images by invoking the private method __getImageDocumentList; returns None when no temporary folder has been set
162 """
163 if self.__pathFolder is not None:  # a temporary folder path has been recorded
164 return self.__getImageDocumentList()
165 else:
166 return None  # no folder recorded, so nothing is extracted
167
169 """
170 Return the document's tables by invoking the private method __getTableDocumentList; returns None when no temporary folder has been set
171 """
172 if self.__pathFolder is not None:  # a temporary folder path has been recorded
173 return self.__getTableDocumentList()
174 else:
175 return None  # no folder recorded, so nothing is extracted
176
178 """
179 Extract the grains from the document, returning a dictionary with the keys 'image_list' and 'table_list', or None when no temporary folder has been set
180 """
181 returnfiles = {}
182 if self.__pathFolder is not None:  # a temporary folder path has been recorded
183 returnfiles['image_list'] = self.__getImageDocumentList()  # images are extracted before tables
184 returnfiles['table_list'] = self.__getTableDocumentList()
185 return returnfiles
186
187 else:
188 return None  # no folder recorded, so nothing is extracted
189