1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 """
18 =====================
19 Soup Filter Classes
20 =====================
21
22 Filters for soup templates.
23 """
24 __author__ = u"Andr\xe9 Malo"
25 __docformat__ = "restructuredtext en"
26
27 import re as _re
28
29 from tdi import util as _util
30 from tdi import filters as _filters
31
32
34 """ Extract template encoding and pass it properly to the builder """
35 __slots__ = ('_normalize', '_meta')
36
42
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is reconstructed from the arguments forwarded to
# ``self.builder.handle_starttag`` - confirm against the owning class.
def handle_starttag(self, name, attr, closed, data):
    """
    Extract encoding from HTML meta element

    Here are samples for the expected formats::

      <meta charset="utf-8"> <!-- HTML5 -->

      <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

    A non-empty ``charset`` attribute takes precedence; the
    ``http-equiv``/``content`` pair is only inspected when ``charset`` is
    missing or empty. A detected encoding is reported through
    ``self.builder.handle_encoding``.

    The event is passed to the builder nevertheless.

    :Parameters:
      `name`
        Tag name; compared with ``self._meta`` after normalization
      `attr`
        Iterable of ``(key, value)`` pairs. Values may still carry their
        enclosing quotes and may be empty or ``None``
      `closed`
        Forwarded to the builder unchanged
      `data`
        Forwarded to the builder unchanged

    :See: `BuildingListenerInterface`
    """
    normalize = self._normalize

    if normalize(name) == self._meta:
        def unquote(value):
            """ Drop one pair of enclosing quotes, then strip whitespace """
            if value.startswith(('"', "'")):
                return value[1:-1].strip()
            return value

        # Keys are normalized so the lookups below are independent of the
        # spelling used in the template; for duplicate keys the last wins.
        adict = dict((normalize(key), val) for key, val in attr)

        charset = unquote(str(adict.get(normalize('charset')) or ''))
        if charset:
            self.builder.handle_encoding(charset)
        else:
            http_equiv = unquote(
                (adict.get(normalize('http-equiv')) or '').lower()
            )
            if http_equiv == 'content-type':
                ctype = adict.get(normalize('content'))
                if ctype:
                    parsed = _util.parse_content_type(unquote(ctype))
                    if parsed is not None:
                        # parsed[1] maps parameter names to value lists;
                        # only the first charset value is used.
                        encoding = parsed[1].get('charset')
                        if encoding:
                            self.builder.handle_encoding(
                                encoding[0].strip()
                            )

    self.builder.handle_starttag(name, attr, closed, data)
86
87
88
89
# Matcher for an XML declaration. ``.match`` anchors at the start of the
# input and ``$`` at its end (a single trailing newline is tolerated), so
# the declaration has to make up the whole string.
#
# Accepted shape: ``<?``, optional whitespace, ``xml`` in any letter case,
# at least one whitespace character, then the pseudo-attribute text which is
# captured in the named group ``attr``: runs free of quotes and ``?``,
# interleaved with complete single- or double-quoted strings (those may
# contain ``?``). Optional whitespace and ``?>`` finish the match.
_PI_MATCH = _re.compile(r'''
<\? \s* [xX][mM][lL] \s+ (?P<attr>
[^"'?]*
(?:
(?:
"[^"]*"
| '[^']*'
)
[^"'?]*
)*
)
\s* \?>$
''', _re.X).match
103
104
105
106
# ``finditer`` over the ``name=value`` pairs of the text captured by the
# ``attr`` group of an XML declaration match. Each match has two named
# groups:
#
# - ``name``: the characters in front of ``=`` (no whitespace, no ``=``;
#   the pattern permits an empty name)
# - ``value``: a single- or double-quoted string *including* its quotes and
#   any whitespace between ``=`` and the opening quote
#
# Text that does not fit this shape is skipped by ``finditer``.
_PI_ATT_ITER = _re.compile(r'''
\s*
(?P<name>[^\s=]*) # attribute name
\s*
=
(?P<value> # value
\s*"[^"]*"
| \s*'[^']*'
)
''', _re.X).finditer
117
# NOTE(review): the ``def`` line was absent from the reviewed text; the
# signature is reconstructed from the ``self.builder.handle_pi(data)`` call -
# confirm against the owning class.
def handle_pi(self, data):
    """
    Extract encoding from xml declaration

    Here's a sample for the expected format::

      <?xml version="1.0" encoding="ascii" ?>

    If `data` is an XML declaration, ``self.builder.handle_encoding`` is
    called with the value of its ``encoding`` pseudo-attribute, or with
    ``'utf-8'`` when that attribute is missing or empty. Other processing
    instructions do not trigger an encoding event.

    The event is passed to the builder nevertheless.

    :Parameters:
      `data`
        The processing instruction as found in the template; converted
        with ``str()`` for matching and forwarded unchanged

    :See: `BuildingListenerInterface`
    """
    declaration = self._PI_MATCH(str(data))
    if declaration:
        encoding = 'utf-8'
        for item in self._PI_ATT_ITER(declaration.group('attr')):
            if item.group('name') != 'encoding':
                continue

            # The captured value still contains its quotes and possibly
            # leading whitespace.
            value = item.group('value').strip()
            if value.startswith(('"', "'")):
                value = value[1:-1].strip()
            if value:
                encoding = value
            # NOTE(review): the flattened source does not show how deep the
            # original ``break`` was nested; here the first ``encoding``
            # attribute ends the scan even when its value is empty.
            break
        self.builder.handle_encoding(encoding)

    self.builder.handle_pi(data)
147
# Rebind ``EncodingDetectFilter`` to ``SoupEncodingDetectFilter`` from the
# object returned by ``tdi.c.load('impl')``. When ``load`` returns ``None``
# the name is left untouched. The helper names are removed again so they do
# not stay in the module namespace.
from tdi import c as _loader
_impl = _loader.load('impl')
if _impl is not None:
    EncodingDetectFilter = _impl.SoupEncodingDetectFilter
del _loader, _impl
153