1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 """
18 ===================
19 Text Parser Logic
20 ===================
21
22 Text Parser.
23 """
24 __author__ = u"Andr\xe9 Malo"
25 __docformat__ = "restructuredtext en"
26
27 import re as _re
28
29 from tdi._exceptions import LexerEOFError, LexerFinalizedError
30 from tdi import interfaces as _interfaces
31
32
class TextLexer(object):
    """
    Text Lexer

    The lexer is a simple state machine: ``state`` indexes the ``_LEXERS``
    dispatch table (attached to the class right after its definition), and
    each state method consumes as much of the input buffer as it can,
    emitting events on the listener and switching to the follow-up state.
    """

    def __init__(self, listener):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener
        """
        self._listener = listener

        self.state = self.TEXT
        # Resolve the state name table to bound methods once per instance.
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self.state]
            )

        self.state = self.FINAL

    def _lex(self):
        """ Parse the current buffer """
        # A state method returns True when it ran out of input (unfinished
        # state, wait for more food) and False when it consumed something.
        while self._buffer:
            if self._lexers[self.state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``[``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('[')
        if pos == 0:
            # Tag right at the start - no text to emit.
            self.state = self.MARKUP
            return False
        elif pos == -1:
            # Pure text, consume everything.
            self._buffer = ''
        else:
            # Emit the text before the bracket, keep the rest for MARKUP.
            self._buffer, data = data[pos:], data[:pos]
            self.state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``[`` character and now find out, what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            # Need the character after the bracket in order to decide.
            return True

        char = data[1]
        if char == '/':
            state = self.ENDTAG
        elif char == '#':
            state = self.COMMENT
        elif char == '?':
            state = self.PI
        elif char == ']':
            # "[]" is an escaped open bracket; handled right here.
            state = self.TEXT
            self._listener.handle_escape(data[0], data[:2])
            self._buffer = data[2:]
        else:
            state = self.STARTTAG

        self.state = state
        return False

    # Matches a complete regular start tag "[...]"; quoted sections may
    # contain brackets without terminating the tag.
    _START_MATCH = _re.compile(r'''
        \[
        (
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
        )
        \]
    ''', _re.X | _re.S).match

    # Matches a complete quoted start tag "[[...]]" (same inner grammar,
    # wrapped into a second bracket pair).
    _EMPTY_START_MATCH = _re.compile(r'''
        \[
        (
            \[
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
            \]
        )
        \]
    ''', _re.X | _re.S).match

    # Iterates over the attributes inside a start tag's attribute string.
    _ATT_ITER = _re.compile(r'''
        \s*
        (?P<name>[^\s=\]]*)             # attribute name
        \s*
        (?:
            =
            (?P<value>                  # optional value
                \s* "[^\\"]*(?:\\.[^\\"]*)*"
              | \s* '[^\\']*(?:\\.[^\\']*)*'
              | [^\\\s\]]*
            )
        )?
    ''', _re.X | _re.S).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``[tag`` and now look for the ``]``

        :Return: Unfinished State?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._EMPTY_START_MATCH(data) or self._START_MATCH(data)
        if match is None:
            # Closing bracket not in the buffer yet.
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        attrstring = match.group(1)
        quoted = attrstring.startswith('[')
        if quoted:
            # Strip the inner bracket pair of a "[[...]]" tag.
            attrstring = attrstring[1:-1]

        splitted = attrstring.split(None, 1)
        if not splitted:
            # "[]"-like empty tag content: degrade to plain text.
            self._listener.handle_text(data)
            self.state = self.TEXT
            return False
        name = splitted[0]
        if '=' in name:
            # First token is already an attribute -> anonymous tag,
            # the whole string is parsed for attributes below.
            name = ''
        elif len(splitted) == 1:
            attrstring = None
        else:
            attrstring = splitted[1]

        attr = []
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:
                    # Empty match: no attributes left.
                    break

        self.state = self.TEXT
        self._listener.handle_starttag(name, attr, quoted, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``[/``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find(']') + 1
        if pos == 0:
            # No closing bracket yet.
            return True

        self._buffer, data = data[pos:], data[:pos]
        # The name sits between "[/" and "]".
        name = data[2:-1].strip()

        self.state = self.TEXT
        self._listener.handle_endtag(name, data)
        return False

    # Finds the comment terminator "#]"
    _COMMENT_SEARCH = _re.compile(r'#\]').search

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit ``[#``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 4:
            # Minimal comment is "[##]" - wait for at least that much.
            return True

        # Start searching after the "[#" opener, so "[##]" terminates, too.
        match = self._COMMENT_SEARCH(data, 2)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_comment(data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``[?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?]', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")
330
# Wire the state machine onto TextLexer: every state name becomes an
# integer class attribute (TextLexer.FINAL, TextLexer.TEXT, ...) whose
# value indexes both _LEXERS (handler method names, resolved to bound
# methods per instance in __init__) and _STATES (readable names used in
# error messages).
_STATE_TABLE = [
    ('FINAL', '_lex_final'),
    ('TEXT', '_lex_text'),
    ('MARKUP', '_lex_markup'),
    ('STARTTAG', '_lex_start'),
    ('ENDTAG', '_lex_end'),
    ('PI', '_lex_pi'),
    ('COMMENT', '_lex_comment'),
]
for _idx, (_statename, _funcname) in enumerate(_STATE_TABLE):
    setattr(TextLexer, _statename, _idx)

TextLexer._LEXERS = tuple(_funcname for _, _funcname in _STATE_TABLE)
TextLexer._STATES = tuple(_statename for _statename, _ in _STATE_TABLE)
del _idx, _statename, _funcname, _STATE_TABLE
349
350
class TextParser(object):
    """
    Text Parser

    Sits between a `TextLexer` (acting as its listener) and a building
    listener: it forwards the lexer events and additionally keeps a stack
    of open tags, so that missing end tags can be emitted implicitly.
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, lexer=TextLexer):
        """
        Initialization

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance
        """
        self._tagstack = []
        self.listener = listener
        self._lexer = lexer(self)
        # Tag name normalizer (e.g. case folding) supplied by the builder
        self._normalize = self.listener.decoder.normalize

    #########################################################
    # ListenerInterface - events arriving from the lexer
    #########################################################

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_text(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_escape(escaped, data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_starttag(name, attrs, closed, data)
        if not closed:
            # Remember the open tag, both normalized (for matching) and in
            # its original spelling (for implicit end tag events).
            self._tagstack.append((self._normalize(name), name))

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                # "[/]" closes the innermost open tag.
                name = tagstack[-1][1]
            endtag = self._normalize(name)
            if endtag in dict(tagstack):
                # Implicitly close everything opened after the matching
                # tag. BUG FIX: compare against the *normalized* name
                # (`endtag`); the stack holds normalized names in slot 0,
                # so comparing with the raw `name` could never match and
                # pop the stack dry.
                toclose, original = tagstack.pop()
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_comment(data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_pi(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        # The text lexer never emits marked sections.
        raise AssertionError()

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        # The text lexer never emits declarations.
        raise AssertionError()

    #########################################################
    # ParserInterface
    #########################################################

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            # The RHS tuple is evaluated first: if finalize() raises, the
            # lexer reference stays intact and finalize() can be retried.
            self._lexer, _ = None, self._lexer.finalize()

        # Implicitly close any tags still open at EOF.
        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')
445