Package tdi :: Package markup :: Package text :: Module parser
[frames] | [no frames]

Source Code for Module tdi.markup.text.parser

  1  # -*- coding: ascii -*- 
  2  # 
  3  # Copyright 2012 - 2013 
  4  # Andr\xe9 Malo or his licensors, as applicable 
  5  # 
  6  # Licensed under the Apache License, Version 2.0 (the "License"); 
  7  # you may not use this file except in compliance with the License. 
  8  # You may obtain a copy of the License at 
  9  # 
 10  #     http://www.apache.org/licenses/LICENSE-2.0 
 11  # 
 12  # Unless required by applicable law or agreed to in writing, software 
 13  # distributed under the License is distributed on an "AS IS" BASIS, 
 14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 15  # See the License for the specific language governing permissions and 
 16  # limitations under the License. 
 17  """ 
 18  =================== 
 19   Text Parser Logic 
 20  =================== 
 21   
 22  Text Parser. 
 23  """ 
 24  __author__ = u"Andr\xe9 Malo" 
 25  __docformat__ = "restructuredtext en" 
 26   
 27  import re as _re 
 28   
 29  from tdi._exceptions import LexerEOFError, LexerFinalizedError 
 30  from tdi import interfaces as _interfaces 
 31   
 32   
class TextLexer(object):
    """
    Text Lexer

    Incremental (push) lexer: `feed` appends to an internal buffer and
    `_lex` dispatches on ``state``, which indexes the bound-method table
    ``_lexers``. Each ``_lex_*`` handler consumes from the buffer and/or
    switches state; it returns True when it needs more input. The state
    constants (``TEXT``, ``MARKUP``, ...) and the ``_LEXERS``/``_STATES``
    tables are attached to the class by the module-level loop below.
    """
    # pylint: disable = E1101

    def __init__(self, listener):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener receiving ``handle_*`` events
        """
        self._listener = listener

        # Start in TEXT state; resolve the state handler table once.
        self.state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            # Leftover input means we stopped mid-construct (e.g. an
            # unterminated tag); report the state we were stuck in.
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self.state]
            )

        self.state = self.FINAL

    def _lex(self):
        """ Parse the current buffer """
        # A handler returning True signals "need more input" -- stop and
        # wait for the next feed()/finalize() call.
        while self._buffer:
            if self._lexers[self.state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``[``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('[')
        if pos == 0:
            # Buffer starts with markup -- nothing to emit here.
            self.state = self.MARKUP
            return False
        elif pos == -1:
            # Pure text; consume everything.
            self._buffer = ''
        else:
            # Emit the text prefix, keep the markup tail in the buffer.
            self._buffer, data = data[pos:], data[:pos]
            self.state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``[`` character and now find out, what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            # Need at least the character after '[' to decide.
            return True

        char = data[1]
        if char == '/':
            state = self.ENDTAG
        elif char == '#':
            state = self.COMMENT
        elif char == '?':
            state = self.PI
        elif char == ']':
            # "[]" is the escape for a literal '[' -- handled right here.
            state = self.TEXT
            self._listener.handle_escape(data[0], data[:2])
            self._buffer = data[2:]
        else:
            state = self.STARTTAG

        self.state = state
        return False

    #: Regex matcher for a start tag
    #:
    #: :Type: ``callable``
    _START_MATCH = _re.compile(r'''
        \[
        (
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
        )
        \]
    ''', _re.X | _re.S).match

    #: Regex matcher for an empty start tag
    #:
    #: :Type: ``callable``
    _EMPTY_START_MATCH = _re.compile(r'''
        \[
        (
            \[
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
            \]
        )
        \]
    ''', _re.X | _re.S).match

    #: Regex iterator for extracting start tag attributes
    #:
    #: :Type: ``callable``
    _ATT_ITER = _re.compile(r'''
        \s*
        (?P<name>[^\s=\]]*)            # attribute name
        \s*
        (?:
            =
            (?P<value>                 # optional value
                \s* "[^\\"]*(?:\\.[^\\"]*)*"
              | \s* '[^\\']*(?:\\.[^\\']*)*'
              | [^\\\s\]]*
            )
        )?
    ''', _re.X | _re.S).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``[tag`` and now look for the ``]``

        :Return: Unfinished State?
        :Rtype: ``bool``
        """
        data = self._buffer
        # Try the doubled ("[[...]]") form first, then the plain one.
        match = self._EMPTY_START_MATCH(data) or self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        attrstring = match.group(1)
        quoted = attrstring.startswith('[')
        if quoted:
            # Strip the inner brackets of the "[[...]]" form.
            attrstring = attrstring[1:-1]

        # First whitespace-separated token is the tag name.
        splitted = attrstring.split(None, 1)
        if not splitted:
            # "[]"-like empty tag content -- treat as plain text.
            self._listener.handle_text(data)
            self.state = self.TEXT
            return False
        name = splitted[0]
        if '=' in name:
            # No name at all, just attributes (anonymous tag).
            name = ''
        elif len(splitted) == 1:
            attrstring = None
        else:
            attrstring = splitted[1]

        attr = []
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:  # bug protection for Python < 2.3.5 (fixed in rev 37262)
                    break

        self.state = self.TEXT
        self._listener.handle_starttag(name, attr, quoted, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``[/``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find(']') + 1
        if pos == 0:
            # No closing bracket yet.
            return True

        self._buffer, data = data[pos:], data[:pos]
        # Name sits between "[/" and "]".
        name = data[2:-1].strip()

        self.state = self.TEXT
        self._listener.handle_endtag(name, data)
        return False

    #: Regex searcher for finding the end of a comment
    #:
    #: :Type: ``callable``
    _COMMENT_SEARCH = _re.compile(r'#\]').search

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit ``[#``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 4:
            # Shortest comment is "[##]" -- four characters.
            return True

        # Search after the opening "[#" for the closing "#]".
        match = self._COMMENT_SEARCH(data, 2)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_comment(data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``[?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?]', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")


# Build the state constants (TextLexer.FINAL, .TEXT, ...) and the
# parallel handler-name/state-name tables from one ordered list.
_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
        ('FINAL', '_lex_final'),
        ('TEXT', '_lex_text'),
        ('MARKUP', '_lex_markup'),
        ('STARTTAG', '_lex_start'),
        ('ENDTAG', '_lex_end'),
        ('PI', '_lex_pi'),
        ('COMMENT', '_lex_comment'),
        ]):
    setattr(TextLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

TextLexer._LEXERS = tuple(_LEXERS)
TextLexer._STATES = tuple(_STATES)
del _idx, _statename, _funcname, _LEXERS, _STATES  # pylint: disable = W0631
class TextParser(object):
    """
    Text Parser

    Drives a lexer (`TextLexer` by default) and forwards the lexer's
    events to the building listener, maintaining a stack of open tags so
    that end tags left out by the template are closed implicitly.
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, lexer=TextLexer):
        """
        Initialization

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance
        """
        self._tagstack = []
        self.listener = listener
        self._lexer = lexer(self)
        # Tag name normalizer supplied by the listener's decoder
        self._normalize = self.listener.decoder.normalize

    #########################################################################
    ### ListenerInterface ###################################################
    #########################################################################

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_text(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_escape(escaped, data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_starttag(name, attrs, closed, data)
        if not closed:
            # Track open tags as (normalized, original) pairs
            self._tagstack.append((self._normalize(name), name))

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                # An empty end tag closes the innermost open tag
                name = tagstack[-1][1]
            endtag = self._normalize(name)
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                # BUG FIX: compare normalized against normalized. The
                # stack stores the *normalized* name in slot 0, but the
                # loop previously compared it with the unnormalized
                # `name`; whenever normalization changes the spelling,
                # the matching entry was never recognized, popping past
                # it (emitting bogus end tags) and potentially
                # exhausting the stack.
                while toclose != endtag:
                    # Implicitly close tags still open inside this one
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_comment(data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_pi(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        # pylint: disable = W0613
        # Text markup has no marked sections -- the lexer never emits this.
        raise AssertionError()

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        # pylint: disable = W0613
        # Text markup has no declarations -- the lexer never emits this.
        raise AssertionError()

    #########################################################################
    ### ParserInterface #####################################################
    #########################################################################

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            # Finalize first: if it raises, the lexer reference is kept
            # (same semantics as the original tuple-assignment trick).
            self._lexer.finalize()
            self._lexer = None

        # Close any tags still open at end of input
        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')