
Source Code for Module tdi.markup.soup.parser

# -*- coding: ascii -*-
#
# Copyright 2006 - 2013
# Andr\xe9 Malo or his licensors, as applicable
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
=====================
 Markup Parser Logic
=====================

Soup Parser
~~~~~~~~~~~

This module provides a very lenient HTML/XML lexer. The `SoupLexer` class
is initialized with a listener object, which receives all low-level events
(like starttag, endtag, text, etc.). Listeners must implement the
`ListenerInterface`.

On top of the lexer sits the `SoupParser` class, which implements the
`ListenerInterface` itself (the parser listens to the lexer). The parser
adds HTML semantics to the lexed data and passes the events on to a
building listener (`BuildingListenerInterface`). In addition to the events
sent by the lexer, the `SoupParser` class generates endtag events (with
empty data arguments) for implicitly closed elements. Furthermore it knows
about CDATA elements like ``<script>`` or ``<style>`` and modifies the
lexer state accordingly.

The actual semantics are provided by a DTD query class (implementing
`DTDInterface`).
"""
__author__ = u"Andr\xe9 Malo"
__docformat__ = "restructuredtext en"

import re as _re

from tdi._exceptions import LexerEOFError, LexerFinalizedError
from tdi.markup.soup import dtd as _dtd
from tdi import interfaces as _interfaces


class SoupLexer(object):
    """
    (X)HTML Tagsoup Lexer

    The lexer works hard to preserve the original data. In order to achieve
    this goal, it does not validate the input and recognizes its input in a
    quite lenient way.

    :Groups:
      - `Lexer states` :
        `TEXT`, `CDATA`, `MARKUP`, `STARTTAG`, `ENDTAG`, `COMMENT`,
        `MSECTION`, `DECL`, `PI`, `EMPTY`, `FINAL`
      - `Regex Matchers` :
        `_START_MATCH`, `_ATT_ITER`, `_COMMENT_SEARCH`, `_MSECTION_MATCH`,
        `_MSECTIONINVALID_MATCH`, `_MEND_SEARCH`, `_MSEND_SEARCH`,
        `_DECL_MATCH`

    :CVariables:
      `TEXT` : ``int``
        Lexer state ``TEXT`` (between tags)

      `CDATA` : ``int``
        Lexer state ``CDATA`` (between (P)CDATA tags)

      `MARKUP` : ``int``
        Lexer state ``MARKUP`` (``<``)

      `STARTTAG` : ``int``
        Lexer state ``STARTTAG`` (``<[letter]``)

      `ENDTAG` : ``int``
        Lexer state ``ENDTAG`` (``</``)

      `COMMENT` : ``int``
        Lexer state ``COMMENT`` (``<!--``)

      `MSECTION` : ``int``
        Lexer state ``MSECTION`` (``<![``)

      `DECL` : ``int``
        Lexer state ``DECL`` (``<!``)

      `PI` : ``int``
        Lexer state ``PI`` (``<?``)

      `EMPTY` : ``int``
        Lexer state ``EMPTY`` (``<>``)

      `FINAL` : ``int``
        Lexer state ``FINAL``

      `_LEXERS` : ``tuple``
        The state lexer method names (``('method', ...)``)

      `_STATES` : ``tuple``
        The state names (``('name', ...)``)

    :IVariables:
      `_state` : ``int``
        The current lexer state

      `_lexers` : ``list``
        The state lexer methods (``[method, ...]``)

      `_listener` : `ListenerInterface`
        The listener the events shall be sent to

      `_buffer` : ``str``
        Current unprocessed buffer

      `_conditional_ie_comments` : ``bool``
        Handle conditional IE comments as text?
    """
    # pylint: disable = E1101

    def __init__(self, listener, conditional_ie_comments=True):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener

          `conditional_ie_comments` : ``bool``
            Handle conditional IE comments as text?

            Conditional comments are described in full detail
            at `MSDN`_\.

            .. _MSDN: http://msdn.microsoft.com/en-us/library/
               ms537512%28v=vs.85%29.aspx
        """
        self._listener = listener
        self._normalize = None
        self._cdata_name = None

        self._state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''
        self._conditional_ie_comments = bool(conditional_ie_comments)

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self._state]
            )

        self._state = self.FINAL

    def cdata(self, normalize, name):
        """ Set CDATA state """
        if self._state != self.FINAL:
            self._state = self.CDATA
            self._normalize = normalize
            self._cdata_name = normalize(name)
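
    # Illustrative note: `SoupParser` switches the lexer into CDATA mode
    # when it sees a CDATA element start tag, e.g.
    # ``lexer.cdata(normalize, 'script')`` with a lowercasing `normalize`
    # (see `SoupParser.handle_starttag` below). From then on everything up
    # to the matching ``</script`` end tag is reported via `handle_text`
    # (see `_lex_cdata` and `_lex_end`).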

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            if self._lexers[self._state]():
                break
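
    # Note on the dispatch convention used by `_lex` above: each `_lex_*`
    # state handler below returns ``True`` when the buffer does not yet
    # hold enough data to finish the current state (so `_lex` stops and
    # waits for more input), and ``False`` after it consumed data and/or
    # switched states.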

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``<``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('<')
        if pos == 0:
            self._state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self._state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_cdata(self):
        """
        (P)CDATA lexer

        State: We are inside a text element and looking for the end tag only

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        incomplete = False
        data, pos = self._buffer, 0
        while True:
            pos = data.find('<', pos)
            if pos == -1:
                pos = len(data)
                self._buffer = ''
                break
            else:
                char = data[pos + 1:pos + 2]
                if char == '/':
                    self._state = self.ENDTAG
                    break
                elif char == '':
                    incomplete = True
                    break
                else:
                    pos += 1

        if pos > 0:
            self._buffer, data = data[pos:], data[:pos]
            self._listener.handle_text(data)

        return incomplete

    #: Regex matcher for a tagname character
    #:
    #: :Type: ``callable``
    _TAGNAME_MATCH = _re.compile(r'[a-zA-Z0-9]').match

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``<`` character and now find out what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            return True

        char = data[1]
        state = (self.ENDTAG, self.DECL, self.PI, self.EMPTY, -1)[
            "/!?>".find(char)
        ]
        if state == -1:
            if self._TAGNAME_MATCH(char):
                state = self.STARTTAG
            else:
                state = self.TEXT
                self._buffer = data[1:]
                self._listener.handle_text(data[0])

        self._state = state
        return False

    #: Regex matcher for a start tag
    #:
    #: :Type: ``callable``
    _START_MATCH = _re.compile(r'''
        <
        (?P<name>[^ \t\r\n\f/>]+)
        (?P<attr>
            [^"'>]*
            (?:
                (?:
                    "[^"]*"
                  | '[^']*'
                )
                [^"'>]*
            )*
        )
        [ \t\r\n\f]*
        >
    ''', _re.X).match

    #: Regex iterator for extracting start tag attributes
    #:
    #: :Type: ``callable``
    _ATT_ITER = _re.compile(r'''
        [ \t\r\n\f]*
        (?P<name>(?:/|[^ \t\r\n\f/=>]*))    # attribute name
        [ \t\r\n\f]*
        (?:
            =
            (?P<value>                      # optional value
                [ \t\r\n\f]*"[^"]*"
              | [ \t\r\n\f]*'[^']*'
              | [^ \t\r\n\f/>]*
            )
        )?
    ''', _re.X).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``<x`` and now look for the ``>``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        name, attrstring = match.group('name', 'attr')
        attr, closed = [], False
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key == '/' and value is None:
                    closed = True
                    continue
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:  # bug protection for Python < 2.3.5 (fixed in rev 37262)
                    break

        self._state = self.TEXT
        self._listener.handle_starttag(name, attr, closed, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``</``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('>') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

        if self._cdata_name is not None and \
                self._normalize(name) != self._cdata_name:
            self._state = self.CDATA
            self._listener.handle_text(data)
        else:
            self._cdata_name = self._normalize = None
            self._state = self.TEXT
            self._listener.handle_endtag(name, data)
        return False

    #: Regex searcher for finding the end of a comment
    #:
    #: :Type: ``callable``
    _COMMENT_SEARCH = _re.compile(r'--[ \t\r\n\f]*>').search

    #: Regex matcher for an IE conditional comment
    #:
    #: :Type: ``callable``
    _IE_COMMENT_MATCH = _re.compile(r'''
        \[[ \t\r\n\f]* (?:
            [iI][fF] | [eE][lL][sS][eE] | [eE][nN][dD][iI][fF]
        ) [^\]]+]>
    ''', _re.X).match

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit ``<!--``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 7:
            return True

        if self._conditional_ie_comments:
            match = iec = self._IE_COMMENT_MATCH(data, 4)
        else:
            match = iec = None
        if match is None:
            match = self._COMMENT_SEARCH(data, 4)
            if match is None:
                return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_comment(data)

        return False
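
    # Illustrative note: with `_conditional_ie_comments` enabled, a
    # conditional comment opener like ``<!--[if IE]>`` matches
    # `_IE_COMMENT_MATCH` and is passed through as plain text via
    # `handle_text`, so the guarded content stays visible to IE; regular
    # comments are reported via `handle_comment` instead.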

    #: List of MS-specific marked section names (lowercased)
    #:
    #: :Type: ``tuple``
    _MSSECTIONS = ('if', 'else', 'endif')

    #: Regex matcher for the start of a marked section
    #:
    #: :Type: ``callable``
    _MSECTION_MATCH = _re.compile(r'''
        <!\[[ \t\r\n\f]*(?P<name>[^\][ \t\r\n\f>]+)(?=[\][ \t\r\n\f>])
    ''', _re.X).match

    #: Regex matcher for the start of an invalid marked section
    #:
    #: :Type: ``callable``
    _MSECTIONINVALID_MATCH = _re.compile(r'<!\[[ \t\r\n\f]*[\][>]').match

    #: Regex searcher for the end of a marked section
    #:
    #: :Type: ``callable``
    _MEND_SEARCH = _re.compile(r'][ \t\r\n\f]*][ \t\r\n\f]*>').search

    #: Regex searcher for the end of an MS-specific marked section
    #:
    #: :Type: ``callable``
    _MSEND_SEARCH = _re.compile(r'][ \t\r\n\f]*(?:--)?[ \t\r\n\f]*>').search

    def _lex_msection(self):
        """
        Marked section lexer

        State: We've hit a ``<![`` and now seek the end

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._MSECTION_MATCH(data)
        if match is None:
            match = self._MSECTIONINVALID_MATCH(data)
            if match is not None:  # pass invalid msection as text
                pos = match.end()
                self._buffer = data[pos:]
                data = data[:pos]
                self._state = self.TEXT
                self._listener.handle_text(data)
                return False
            return True

        name = match.group('name')
        start = match.end()
        if self._conditional_ie_comments and name.lower() in self._MSSECTIONS:
            match = iec = self._MSEND_SEARCH(data, start)
        else:
            pos = data.find('[', start)
            if pos >= 0:
                start = pos + 1
            match = self._MEND_SEARCH(data, start)
            iec = None
        if match is None:
            return True
        pos, end = match.end(), match.start()
        value = data[start:end]
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_msection(name, value, data)
        return False

    #: Regex matcher for a complete declaration
    #:
    #: This regex seems a bit nasty, but it should catch all stuff allowed
    #: in declarations (including doctype). Some day it probably needs to
    #: be replaced by real lexer states...
    #:
    #: :Type: ``callable``
    _DECL_MATCH = _re.compile(r'''
        <!
        (?P<name>[^\][ \t\r\n\f>]*)
        (?P<value>
            [^"'<>-]*               # any non-special
            (?:
                (?:
                    "[^"]*"         # double quoted string
                  | '[^']*'         # single quoted string (valid?)
                  | <!\[            # marked section
                    [^\]]*
                    (?:
                        ](?![ \t\r\n\f]*][ \t\r\n\f]*>)
                        [^\]]*
                    )*
                    ][ \t\r\n\f]*][ \t\r\n\f]*>
                  | <(?!!\[)        # declaration
                                    # hopefully not a doctype
                                    # (but unlikely, because we are
                                    # probably already in a DT subset)
                    [^"'>-]*
                    (?:
                        (?:
                            "[^"]*"
                          | '[^']*'
                          | --      # comment
                            [^-]*
                            (?:-[^-]+)*
                            --
                          | -(?!-)  # just a hyphen
                        )
                        [^"'>-]*
                    )*
                    >
                  | --              # comment
                    [^-]*
                    (?:-[^-]+)*
                    --
                  | -(?!-)          # just a hyphen
                )
                [^"'<>-]*           # more non-specials
            )*
        )
        >
    ''', _re.X).match
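
    # Illustrative example: for the input ``<!DOCTYPE html>`` the matcher
    # yields name='DOCTYPE' and value=' html'; `_lex_decl` strips the
    # value before reporting, resulting in
    # ``handle_decl('DOCTYPE', 'html', '<!DOCTYPE html>')``.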

    def _lex_decl(self):
        """
        Declaration lexer

        State: We've hit a ``<!`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 3:
            return True

        if data.startswith('<!--'):
            self._state = self.COMMENT
            return False
        elif data.startswith('<!['):
            self._state = self.MSECTION
            return False
        elif data == '<!-':
            return True

        match = self._DECL_MATCH(data)
        if match is None:
            return True

        name, value = match.group('name', 'value')
        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_decl(name, value.strip(), data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``<?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?>', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_empty(self):
        """
        Empty tag lexer

        State: We've hit a ``<>``

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        self._buffer, data = self._buffer[2:], self._buffer[:2]

        self._state = self.TEXT
        self._listener.handle_starttag('', [], False, data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")

_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
        ('FINAL', '_lex_final'),
        ('TEXT', '_lex_text'),
        ('CDATA', '_lex_cdata'),
        ('MARKUP', '_lex_markup'),
        ('STARTTAG', '_lex_start'),
        ('ENDTAG', '_lex_end'),
        ('COMMENT', '_lex_comment'),
        ('MSECTION', '_lex_msection'),
        ('DECL', '_lex_decl'),
        ('PI', '_lex_pi'),
        ('EMPTY', '_lex_empty'),
        ]):
    setattr(SoupLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

SoupLexer._LEXERS = tuple(_LEXERS)
SoupLexer._STATES = tuple(_STATES)
del _idx, _statename, _funcname, _LEXERS, _STATES  # pylint: disable = W0631


from tdi import c
c = c.load('impl')
if c is not None:
    DEFAULT_LEXER = c.SoupLexer
else:
    DEFAULT_LEXER = SoupLexer
del c
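

# Usage sketch (illustrative, with a hypothetical `_Recorder` listener):
# feed the pure-Python lexer in arbitrary chunks and collect its
# low-level events. Real listeners implement the full
# `ListenerInterface`; only the handlers this particular input triggers
# are defined here. Note that text may be reported in several chunks.

def _lexer_example():
    """ Feed `SoupLexer` and return the recorded low-level events """
    class _Recorder(object):
        """ Minimal recording stand-in for a `ListenerInterface` """
        def __init__(self):
            self.events = []
        def handle_text(self, data):
            self.events.append(('text', data))
        def handle_starttag(self, name, attr, closed, data):
            self.events.append(('starttag', name, attr, closed, data))
        def handle_endtag(self, name, data):
            self.events.append(('endtag', name, data))

    recorder = _Recorder()
    lexer = SoupLexer(recorder)
    lexer.feed('<p class="x">Hel')  # chunk boundaries may fall anywhere
    lexer.feed('lo</p>')
    lexer.finalize()
    # recorder.events is now:
    # [('starttag', 'p', [('class', '"x"')], False, '<p class="x">'),
    #  ('text', 'Hel'), ('text', 'lo'), ('endtag', 'p', '</p>')]
    return recorder.events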


class SoupParser(object):
    """
    =========================
     (X)HTML Tag Soup Parser
    =========================

    Overview
    ~~~~~~~~

    The parser is actually a tagsoup parser by design, in order to process
    most of the "HTML" that can be found out there. Of course, well-formed
    and valid HTML is best. Only as much HTML syntax is applied as is
    necessary to parse the input. You can influence these syntax
    definitions by picking another lexer, and change the semantics by
    picking another DTD query class.

    This parser guarantees that an endtag event is generated for each
    non-self-closing starttag event (if the endtag is not actually in the
    input, the data parameter is an empty string). This also happens for
    empty tags (like ``br``). On the other hand, there may be more endtag
    events than starttag events, because of unbalanced or wrongly nested
    tags.

    Special constructs like comments, PIs, marked sections and
    declarations may occur anywhere, i.e. they do not close elements
    implicitly.

    The default lexer does not deal with NET tags (``<h1/Heading/``).
    Neither does it handle unfinished starttags by SGML rules like
    ``<map<area>``. It *does* know about empty tags (``<>`` and ``</>``).

    CDATA elements and comments are handled in a simplified way. Once the
    particular state is entered, it is only left when the accompanying end
    marker is found (``<script>...</script>``, ``<!-- ... -->``). Anything
    in between is text.

    How is it used?
    ~~~~~~~~~~~~~~~

    The parser API is "streamy" on the input side and event-based on the
    output side. So what you need first is a building listener, which will
    receive all generated parser events and process them. Such a listener
    object is expected to implement the `BuildingListenerInterface`.

    Now you create a `SoupParser` instance, passing the listener object to
    the constructor, and the parser is ready to be fed. You can feed as
    many chunks of input data as you like into the parser by using the
    `feed` method. Every feed call may generate multiple events on the
    output side. When you're done feeding, call the parser's `finalize`
    method in order to clean up. This also flushes pending events to the
    listener.

    :IVariables:
      `listener` : `BuildingListenerInterface`
        The building listener to send the events to

      `_lexer` : `SoupLexer`
        The lexer instance

      `_tagstack` : ``list``
        The current tag stack

      `_inempty` : ``bool``
        Indicates whether the last tag on the stack is an empty one

      `_lastopen` : ``str``
        Stores the last seen open tag name
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, dtd, lexer=None):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The building listener

          `dtd` : `DTDInterface`
            DTD query object

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance. If omitted or
            ``None``, the default lexer will be used (`DEFAULT_LEXER`).
        """
        self._tagstack, self._inempty, self._lastopen = [], False, ''
        self.listener = listener
        self._is_nestable = dtd.nestable
        self._is_cdata = dtd.cdata
        self._is_empty = dtd.empty
        if lexer is None:
            lexer = DEFAULT_LEXER
        self._lexer = lexer(self)
        self._normalize = listener.decoder.normalize

    @classmethod
    def html(cls, listener):
        """
        Construct a parser using the `HTMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.HTMLDTD())

    @classmethod
    def xml(cls, listener):
        """
        Construct a parser using the `XMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.XMLDTD())

    def _close_empty(self):
        """ Ensure we close the last empty tag """
        if self._inempty:
            self._inempty = False
            self.listener.handle_endtag(self._tagstack.pop()[1], '')

    #########################################################################
    ### ListenerInterface ###################################################
    #########################################################################

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_text(data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self._close_empty()

        if name == '' and not attrs:
            name = self._lastopen
        else:
            self._lastopen = name

        tagstack = self._tagstack
        nestable = self._is_nestable
        starttag = self._normalize(name)
        while tagstack and not nestable(tagstack[-1][0], starttag):
            self.listener.handle_endtag(tagstack.pop()[1], '')

        if closed:
            self.listener.handle_starttag(name, attrs, closed, data)
        else:
            if self._is_cdata(starttag):
                self._lexer.cdata(self._normalize, starttag)
            self.listener.handle_starttag(name, attrs, closed, data)
            tagstack.append((starttag, name))
            if self._is_empty(starttag):
                self._inempty = True

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                name = tagstack[-1][1]
            endtag = self._normalize(name)
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                self._inempty = False
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()

        self._close_empty()
        self.listener.handle_endtag(name, data)
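
    # Illustrative note: for input like ``<b><i>x</b>``, the ``</b>``
    # endtag finds 'b' deeper in the tag stack, so an implicit
    # ``handle_endtag('i', '')`` is emitted before the explicit one for
    # 'b'. This is why endtag events may outnumber starttag events.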

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_msection(name, value, data)

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_decl(name, value, data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_pi(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        # pylint: disable = W0613
        raise AssertionError()

    #########################################################################
    ### ParserInterface #####################################################
    #########################################################################

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            self._lexer, _ = None, self._lexer.finalize()

        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')


from tdi import c  # pylint: disable = W0404
c = c.load('impl')
if c is not None:
    DEFAULT_PARSER = c.SoupParser
else:
    DEFAULT_PARSER = SoupParser
del c