17 """
18 =====================
19 Markup Parser Logic
20 =====================
21
22 Soup Parser
23 ~~~~~~~~~~~
24
25 This module provides a very lenient HTML/XML lexer. The `SoupLexer` class is
26 initialized with a listener object, which receives all low level events
27 (like starttag, endtag, text etc). Listeners must implement the
28 `ListenerInterface`.
29
30 On top of the lexer there's `SoupParser` class, which actually implements the
31 `ListenerInterface` itself (the parser listens to the lexer). The parser adds
32 HTML semantics to the lexed data and passes the events to a building listener
33 (`BuildingListenerInterface`). In addition to the events sent by the lexer the
34 `SoupParser` class generates endtag events (with empty data arguments) for
35 implicitly closed elements. Furthermore it knows about CDATA elements like
36 ``<script>`` or ``<style>`` and modifies the lexer state accordingly.
37
38 The actual semantics are provided by a DTD query class (implementing
39 `DTDInterface`.)
40 """
41 __author__ = u"Andr\xe9 Malo"
42 __docformat__ = "restructuredtext en"
43
44 import re as _re
45
46 from tdi._exceptions import LexerEOFError, LexerFinalizedError
47 from tdi.markup.soup import dtd as _dtd
48 from tdi import interfaces as _interfaces
52 """
53 (X)HTML Tagsoup Lexer
54
55 The lexer works hard to preserve the original data. In order to achieve
56 this goal, it does not validate the input and recognizes its input in a
57 quite lenient way.
58
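    For example, feeding the lexer ``<a href=x>hi</a>`` produces (roughly)
    the events ``starttag('a', [('href', 'x')], False, '<a href=x>')``,
    ``text('hi')`` and ``endtag('a', '</a>')``.
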
    :Groups:
      - `Lexer states` :
        `TEXT`,
        `CDATA`,
        `MARKUP`,
        `STARTTAG`,
        `ENDTAG`,
        `COMMENT`,
        `MSECTION`,
        `DECL`,
        `PI`,
        `EMPTY`,
        `FINAL`
      - `Regex Matchers` :
        `_START_MATCH`,
        `_ATT_ITER`,
        `_COMMENT_SEARCH`,
        `_MSECTION_MATCH`,
        `_MSECTIONINVALID_MATCH`,
        `_MEND_SEARCH`,
        `_MSEND_SEARCH`,
        `_DECL_MATCH`

    :CVariables:
      `TEXT` : ``int``
        Lexer state ``TEXT`` (between tags)

      `CDATA` : ``int``
        Lexer state ``CDATA`` (between (P)CDATA tags)

      `MARKUP` : ``int``
        Lexer state ``MARKUP`` (``<``)

      `STARTTAG` : ``int``
        Lexer state ``STARTTAG`` (``<[letter]``)

      `ENDTAG` : ``int``
        Lexer state ``ENDTAG`` (``</``)

      `COMMENT` : ``int``
        Lexer state ``COMMENT`` (``<!--``)

      `MSECTION` : ``int``
        Lexer state ``MSECTION`` (``<![``)

      `DECL` : ``int``
        Lexer state ``DECL`` (``<!``)

      `PI` : ``int``
        Lexer state ``PI`` (``<?``)

      `EMPTY` : ``int``
        Lexer state ``EMPTY`` (``<>``)

      `FINAL` : ``int``
        Lexer state ``FINAL``

      `_LEXERS` : ``tuple``
        The state lexer method names (``('method', ...)``)

      `_STATES` : ``tuple``
        The state names (``('name', ...)``)

    :IVariables:
      `_state` : ``int``
        The current lexer state

      `_lexers` : ``list``
        The state lexer methods (``[method, ...]``)

      `_listener` : `ListenerInterface`
        The listener the events shall be sent to

      `_buffer` : ``str``
        Current unprocessed buffer

      `_conditional_ie_comments` : ``bool``
        Handle conditional IE comments as text?
    """

    def __init__(self, listener, conditional_ie_comments=True):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener

          `conditional_ie_comments` : ``bool``
            Handle conditional IE comments as text?

            Conditional comments are described in full detail
            at `MSDN`_\.

            .. _MSDN: http://msdn.microsoft.com/en-us/library/
                      ms537512%28v=vs.85%29.aspx
        """
        self._listener = listener
        self._normalize = None
        self._cdata_name = None

        self._state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''
        self._conditional_ie_comments = bool(conditional_ie_comments)

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

178 """
179 Finalize the lexer
180
181 This processes the rest buffer (if any)
182
183 :Exceptions:
184 - `LexerEOFError` : The rest buffer could not be consumed
185 """
186 self._lex()
187 if self._buffer:
188 raise LexerEOFError(
189 "Unfinished parser state %s" % self._STATES[self._state]
190 )
191
192 self._state = self.FINAL
193
    def cdata(self, normalize, name):
        """ Set CDATA state """
        if self._state != self.FINAL:
            self._state = self.CDATA
            self._normalize = normalize
            self._cdata_name = normalize(name)

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            if self._lexers[self._state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``<``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('<')
        if pos == 0:
            self._state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self._state = self.MARKUP

        self._listener.handle_text(data)
        return False

232 """
233 (PR)CDATA lexer
234
235 State: We are inside a text element and looking for the end tag only
236
237 :Return: Unfinished state?
238 :Rtype: ``bool``
239 """
240 incomplete = False
241 data, pos = self._buffer, 0
242 while True:
243 pos = data.find('<', pos)
244 if pos == -1:
245 pos = len(data)
246 self._buffer = ''
247 break
248 else:
249 char = data[pos + 1:pos + 2]
250 if char == '/':
251 self._state = self.ENDTAG
252 break
253 elif char == '':
254 incomplete = True
255 break
256 else:
257 pos += 1
258
259 if pos > 0:
260 self._buffer, data = data[pos:], data[:pos]
261 self._listener.handle_text(data)
262
263 return incomplete
264
    #: Matcher for the first character of a tag name
    _TAGNAME_MATCH = _re.compile(r'[a-zA-Z0-9]').match

271 """
272 Markup lexer
273
274 State: We've hit a ``<`` character and now find out, what it's
275 becoming
276
277 :Return: Unfinished state?
278 :Rtype: ``bool``
279 """
280 data = self._buffer
281 if len(data) < 2:
282 return True
283
284 char = data[1]
285 state = (self.ENDTAG, self.DECL, self.PI, self.EMPTY, -1)[
286 "/!?>".find(char)
287 ]
288 if state == -1:
289 if self._TAGNAME_MATCH(char):
290 state = self.STARTTAG
291 else:
292 state = self.TEXT
293 self._buffer = data[1:]
294 self._listener.handle_text(data[0])
295
296 self._state = state
297 return False
298
299
300
301
    _START_MATCH = _re.compile(r'''
        <
        (?P<name>[^ \t\r\n\f/>]+)
        (?P<attr>
            [^"'>]*
            (?:
                (?:
                    "[^"]*"
                  | '[^']*'
                )
                [^"'>]*
            )*
        )
        [ \t\r\n\f]*
        >
    ''', _re.X).match
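    # Illustration (not part of the original source): _START_MATCH applied
    # to '<a href="#">x' matches the leading '<a href="#">' with group
    # 'name' == 'a' and group 'attr' == ' href="#"'.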
    _ATT_ITER = _re.compile(r'''
        [ \t\r\n\f]*
        (?P<name>(?:/|[^ \t\r\n\f/=>]*))    # attribute name
        [ \t\r\n\f]*
        (?:
            =
            (?P<value>                      # optional value
                [ \t\r\n\f]*"[^"]*"
              | [ \t\r\n\f]*'[^']*'
              | [^ \t\r\n\f/>]*
            )
        )?
    ''', _re.X).finditer
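    # Illustration (not part of the original source): iterating
    # _ATT_ITER(' href="#" on') yields ('href', '"#"') and ('on', None)
    # as its first (name, value) group pairs. Quotes are preserved; the
    # start tag lexer strips only surrounding whitespace.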

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``<x`` and now look for the ``>``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        name, attrstring = match.group('name', 'attr')
        attr, closed = [], False
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key == '/' and value is None:
                    closed = True
                    continue
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:
                    break

        self._state = self.TEXT
        self._listener.handle_starttag(name, attr, closed, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``</``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('>') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

        if self._cdata_name is not None and \
                self._normalize(name) != self._cdata_name:
            self._state = self.CDATA
            self._listener.handle_text(data)
        else:
            self._cdata_name = self._normalize = None
            self._state = self.TEXT
            self._listener.handle_endtag(name, data)
        return False

    #: Matcher for the comment end marker
    _COMMENT_SEARCH = _re.compile(r'--[ \t\r\n\f]*>').search

    _IE_COMMENT_MATCH = _re.compile(r'''
        \[[ \t\r\n\f]* (?:
            [iI][fF] | [eE][lL][sS][eE] | [eE][nN][dD][iI][fF]
        ) [^\]]+]>
    ''', _re.X).match
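    # Illustration (not part of the original source): applied behind the
    # '<!--' prefix, this matches the '[if lt IE 8]>' head of a
    # conditional comment such as '<!--[if lt IE 8]> ... <![endif]-->',
    # so the whole construct can be passed through as plain text.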

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit a ``<!--`` and now seek the end marker

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 7:  # shortest complete comment: '<!---->'
            return True

        if self._conditional_ie_comments:
            match = iec = self._IE_COMMENT_MATCH(data, 4)
        else:
            match, iec = None, None
        if match is None:
            match = self._COMMENT_SEARCH(data, 4)
            if match is None:
                return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_comment(data)
        return False

    #: Names of the IE conditional (marked) sections
    _MSSECTIONS = ('if', 'else', 'endif')

    #: Matcher for the head of a marked section
    _MSECTION_MATCH = _re.compile(r'''
        <!\[[ \t\r\n\f]*(?P<name>[^\][ \t\r\n\f>]+)(?=[\][ \t\r\n\f>])
    ''', _re.X).match

    #: Matcher for an invalid marked section head
    _MSECTIONINVALID_MATCH = _re.compile(r'<!\[[ \t\r\n\f]*[\][>]').match

    #: Matcher for the end of a marked section
    _MEND_SEARCH = _re.compile(r'][ \t\r\n\f]*][ \t\r\n\f]*>').search

    #: Matcher for the end of an IE conditional section
    _MSEND_SEARCH = _re.compile(r'][ \t\r\n\f]*(?:--)?[ \t\r\n\f]*>').search
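    # Illustration (not part of the original source): for
    # '<![CDATA[foo]]>', _MSECTION_MATCH picks up the name 'CDATA' and
    # _MEND_SEARCH finds the closing ']]>'. For the downlevel-revealed
    # conditional '<![if !IE]>', _MSEND_SEARCH finds the ']>' instead.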

    def _lex_msection(self):
        """
        Marked section lexer

        State: We've hit a ``<![`` and now seek the end

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._MSECTION_MATCH(data)
        if match is None:
            match = self._MSECTIONINVALID_MATCH(data)
            if match is not None:  # invalid section head: pass it as text
                pos = match.end()
                self._buffer = data[pos:]
                data = data[:pos]
                self._state = self.TEXT
                self._listener.handle_text(data)
                return False
            return True

        name = match.group('name')
        start = match.end()
        if self._conditional_ie_comments and name.lower() in self._MSSECTIONS:
            match = iec = self._MSEND_SEARCH(data, start)
        else:
            pos = data.find('[', start)
            if pos >= 0:
                start = pos + 1
            match = self._MEND_SEARCH(data, start)
            iec = None
        if match is None:
            return True
        pos, end = match.end(), match.start()
        value = data[start:end]
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_msection(name, value, data)
        return False

    _DECL_MATCH = _re.compile(r'''
        <!
        (?P<name>[^\][ \t\r\n\f>]*)
        (?P<value>
            [^"'<>-]*                       # any nonspecial
            (?:
                (?:
                    "[^"]*"                 # double quoted string
                  | '[^']*'                 # single quoted string (valid?)
                  | <!\[                    # marked section
                    [^\]]*
                    (?:
                        ](?![ \t\r\n\f]*][ \t\r\n\f]*>)
                        [^\]]*
                    )*
                    ][ \t\r\n\f]*][ \t\r\n\f]*>
                  | <(?!!\[)                # declaration
                                            # hopefully not a doctype
                                            # (but unlikely, because we are
                                            # probably already in a DT subset)
                    [^"'>-]*
                    (?:
                        (?:
                            "[^"]*"
                          | '[^']*'
                          | --              # comment
                            [^-]*
                            (?:-[^-]+)*
                            --
                          | -(?!-)          # just a hyphen
                        )
                        [^"'>-]*
                    )*
                    >
                  | --                      # comment
                    [^-]*
                    (?:-[^-]+)*
                    --
                  | -(?!-)                  # just a hyphen
                )
                [^"'<>-]*                   # more non-specials
            )*
        )
        >
    ''', _re.X).match
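    # Illustration (not part of the original source):
    # _DECL_MATCH('<!DOCTYPE html>') matches the whole string with group
    # 'name' == 'DOCTYPE' and group 'value' == ' html'; the value is
    # stripped before it is passed to the listener.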
572 """
573 Declaration lexer
574
575 State: We've hit a ``<!`` and now peek inside
576
577 :Return: Unfinished state?
578 :Rtype: ``bool``
579 """
580 data = self._buffer
581 if len(data) < 3:
582 return True
583
584 if data.startswith('<!--'):
585 self._state = self.COMMENT
586 return False
587 elif data.startswith('<!['):
588 self._state = self.MSECTION
589 return False
590 elif data == '<!-':
591 return True
592
593 match = self._DECL_MATCH(data)
594 if match is None:
595 return True
596
597 name, value = match.group('name', 'value')
598 pos = match.end()
599 self._buffer, data = data[pos:], data[:pos]
600
601 self._state = self.TEXT
602 self._listener.handle_decl(name, value.strip(), data)
603 return False
604
606 """
607 Processing instruction lexer
608
609 State: We've hit a ``<?`` and now peek inside
610
611 :Return: Unfinished state?
612 :Rtype: ``bool``
613 """
614 data = self._buffer
615 pos = data.find('?>', 2)
616 if pos == -1:
617 return True
618 pos += 2
619
620 self._buffer, data = data[pos:], data[:pos]
621
622 self._state = self.TEXT
623 self._listener.handle_pi(data)
624 return False
625
627 """
628 Empty tag lexer
629
630 State: We've hit a ``<>``
631
632 :Return: Unfinished state?
633 :Rtype: ``bool``
634 """
635 self._buffer, data = self._buffer[2:], self._buffer[:2]
636
637 self._state = self.TEXT
638 self._listener.handle_starttag('', [], False, data)
639 return False
640
642 """
643 Called after the lexer was finalized
644
645 State: after all
646
647 :Exceptions:
648 - `LexerFinalizedError` : The lexer was already finalized
649 (raised always)
650 """
651 raise LexerFinalizedError("The lexer was already finalized")
652

# Populate the lexer state constants and the state -> method dispatch
# tables on the class
_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
    ('FINAL', '_lex_final'),
    ('TEXT', '_lex_text'),
    ('CDATA', '_lex_cdata'),
    ('MARKUP', '_lex_markup'),
    ('STARTTAG', '_lex_start'),
    ('ENDTAG', '_lex_end'),
    ('COMMENT', '_lex_comment'),
    ('MSECTION', '_lex_msection'),
    ('DECL', '_lex_decl'),
    ('PI', '_lex_pi'),
    ('EMPTY', '_lex_empty'),
]):
    setattr(SoupLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

SoupLexer._LEXERS = tuple(_LEXERS)
SoupLexer._STATES = tuple(_STATES)
del _idx, _statename, _funcname, _LEXERS, _STATES


from tdi import c
c = c.load('impl')
if c is not None:
    DEFAULT_LEXER = c.SoupLexer
else:
    DEFAULT_LEXER = SoupLexer
del c
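
# A minimal sketch of driving the lexer directly (illustration only;
# ``events`` and ``Collector`` are hypothetical names):
#
#     events = []
#
#     class Collector(object):
#         def __getattr__(self, name):
#             if not name.startswith('handle_'):
#                 raise AttributeError(name)
#             return lambda *args: events.append((name[7:], args))
#
#     lexer = DEFAULT_LEXER(Collector())
#     lexer.feed('<p class="x">Hello')
#     lexer.feed(' world</p>')
#     lexer.finalize()
#
# ``events`` then contains ('starttag', ...), ('text', ...) and
# ('endtag', ...) entries; note that text may arrive in several pieces,
# depending on how the input was chunked.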
687 """
688 =========================
689 (X)HTML Tag Soup Parser
690 =========================
691
692 Overview
693 ~~~~~~~~
694
695 The parser is actually a tagsoup parser by design in order to process
696 most of the "HTML" that can be found out there. Of course, if the HTML
697 is well-formed and valid, this would be the best. There is only as
698 much HTML syntax applied as necessary to parse it. You can influence
699 these syntax definitions by picking another lexer. You can change
700 the semantics by picking another dtd query class.
701
702 This parser guarantees, that for each not-self-closing starttag event also
703 an endtag event is generated (if the endtag is not actually there, the
704 data parameter is an empty string). This also happens for empty tags (like
705 ``br``). On the other hand, there may be more endtag events than starttag
706 events, because of unbalanced or wrongly nested tags.
707
708 Special constructs, which are comments, PIs, marked sections and
709 declarations may occur anywhere, i.e. they are not closing elements
710 implicitly.
711
712 The default lexer does not deal with NET tags (<h1/Heading/). Neither
713 does it handle unfinished starttags by SGML rules like ``<map<area>``.
714 It *does* know about empty tags (``<>`` and ``</>``).
715
716 CDATA elements and comments are handled in a simplified way. Once
717 the particular state is entered, it's only left, when the accompanying
718 end marker was found (``<script>...</script>``, ``<!-- ... -->``).
719 Anything in between is text.
720
721 How is it used?
722 ~~~~~~~~~~~~~~~
723
724 The parser API is "streamy" on the input side and event based on the
725 output side. So, what you need first is a building listener, which will
726 receive all generated parser events and process them. Such is listener
727 object is expected to implement the `BuildingListenerInterface`.
728
729 Now you create a `SoupParser` instance and pass the listener object to
730 the contructor and the parser is ready to be fed. You can feed as many
731 chunks of input data you like into the parser by using the `feed`
732 method. Every feed call may generate mutiple events on the output side.
733 When you're done feeding, call the parser's `finalize` method in order
734 to clean up. This also flushes pending events to the listener.
735
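    For instance (sketch only; ``listener`` stands for any object
    implementing the `BuildingListenerInterface`)::

        parser = SoupParser.html(listener)
        parser.feed('<html><body><p>Hello')
        parser.feed(' world</p></body></html>')
        parser.finalize()
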
    :IVariables:
      `listener` : `BuildingListenerInterface`
        The building listener to send the events to

      `_lexer` : `SoupLexer`
        The lexer instance

      `_tagstack` : ``list``
        The current tag stack

      `_inempty` : ``bool``
        Indicates whether the last tag on the stack is an empty one

      `_lastopen` : ``str``
        Stores the last seen open tag name
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, dtd, lexer=None):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The building listener

          `dtd` : `DTDInterface`
            DTD query object

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance. If omitted or
            ``None``, the default lexer will be used (`DEFAULT_LEXER`).
        """
        self._tagstack, self._inempty, self._lastopen = [], False, ''
        self.listener = listener
        self._is_nestable = dtd.nestable
        self._is_cdata = dtd.cdata
        self._is_empty = dtd.empty
        if lexer is None:
            lexer = DEFAULT_LEXER
        self._lexer = lexer(self)
        self._normalize = listener.decoder.normalize

    @classmethod
    def html(cls, listener):
        """
        Construct a parser using the `HTMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.HTMLDTD())

    @classmethod
    def xml(cls, listener):
        """
        Construct a parser using the `XMLDTD`

        :Parameters:
          `listener` : `ListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.XMLDTD())
809
811 """ Ensure we close last empty tag """
812 if self._inempty:
813 self._inempty = False
814 self.listener.handle_endtag(self._tagstack.pop()[1], '')
815
816
817
818
819
    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_text(data)

826 """ :See: `ListenerInterface` """
827 self._close_empty()
828
829 if name == '' and not attrs:
830 name = self._lastopen
831 else:
832 self._lastopen = name
833
834 tagstack = self._tagstack
835 nestable = self._is_nestable
836 starttag = self._normalize(name)
837 while tagstack and not nestable(tagstack[-1][0], starttag):
838 self.listener.handle_endtag(tagstack.pop()[1], '')
839
840 if closed:
841 self.listener.handle_starttag(name, attrs, closed, data)
842 else:
843 if self._is_cdata(starttag):
844 self._lexer.cdata(self._normalize, starttag)
845 self.listener.handle_starttag(name, attrs, closed, data)
846 tagstack.append((starttag, name))
847 if self._is_empty(starttag):
848 self._inempty = True
849
851 """ :See: `ListenerInterface` """
852 tagstack = self._tagstack
853 if tagstack:
854 if name == '':
855 name = tagstack[-1][1]
856 endtag = self._normalize(name)
857 if endtag in dict(tagstack):
858 toclose, original = tagstack.pop()
859 self._inempty = False
860 while toclose != endtag:
861 self.listener.handle_endtag(original, '')
862 toclose, original = tagstack.pop()
863
864 self._close_empty()
865 self.listener.handle_endtag(name, data)
866
    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_msection(name, value, data)

878 """ :See: `ListenerInterface` """
879 self._close_empty()
880 self.listener.handle_decl(name, value, data)
881
883 """ :See: `ListenerInterface` """
884 self._close_empty()
885 self.listener.handle_pi(data)
886
888 """ :See: `ListenerInterface` """
889
890 raise AssertionError()
891
892
893
894
895
    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

901 """
902 :See: `ParserInterface`
903
904 :Exceptions:
905 - `LexerEOFError` : EOF in the middle of a state
906 """
907 if self._lexer is not None:
908 self._lexer, _ = None, self._lexer.finalize()
909
910 tagstack = self._tagstack
911 while tagstack:
912 self.listener.handle_endtag(tagstack.pop()[1], '')
913

from tdi import c
c = c.load('impl')
if c is not None:
    DEFAULT_PARSER = c.SoupParser
else:
    DEFAULT_PARSER = SoupParser
del c
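

if __name__ == '__main__':
    # Self-contained usage sketch (not part of the module proper): drive
    # the parser with a throwaway listener that dumps every event to
    # stdout. ``_Decoder`` and ``_EchoListener`` are hypothetical minimal
    # stand-ins for a real `BuildingListenerInterface` implementation.
    import sys

    class _Decoder(object):
        """ Decoder stub -- the parser only uses ``decoder.normalize`` """
        @staticmethod
        def normalize(name):
            return name.lower()

    class _EchoListener(object):
        """ Listener stub which prints each event as it arrives """
        decoder = _Decoder()

        def __getattr__(self, name):
            if not name.startswith('handle_'):
                raise AttributeError(name)
            def handler(*args):
                sys.stdout.write("%s%r\n" % (name, args))
            return handler

    parser = SoupParser.html(_EchoListener())
    parser.feed('<p>Hello <br> world!')
    parser.finalize()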