1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 """
18 ============
19 HTML Tools
20 ============
21
22 HTML Tools.
23 """
24 __author__ = u"Andr\xe9 Malo"
25 __docformat__ = "restructuredtext en"
26 __all__ = [
27 'decode', 'entities', 'class_add', 'class_del', 'multiline',
28 'CommentStripFilter', 'MinifyFilter', 'minify'
29 ]
30
31 import codecs as _codecs
32 import re as _re
33 try:
34 import cStringIO as _string_io
35 except ImportError:
36 import StringIO as _string_io
37
38 from tdi import LexerError
39 from tdi import factory as _factory
40 from tdi import filters as _filters
41 from tdi import interfaces as _interfaces
42 from tdi.markup.soup import dtd as _dtd
43 from tdi.markup.soup import encoder as _encoder
44 from tdi.markup.soup import decoder as _decoder
45 from tdi.markup.soup import parser as _parser
46 from tdi.tools import css as _css
47 from tdi.tools import javascript as _javascript
48 from tdi._htmldecode import decode
49 from tdi._htmlentities import htmlentities as entities
50
51
52
53
54
55
56
57
58
# Materialize the imported entity table as a plain ``dict`` of its own, so
# the public ``entities`` name (listed in ``__all__``) is a separate object
# from the one imported from ``tdi._htmlentities``.
entities = dict(entities)
60
61
def class_add(node, *class_):
    """
    Add class(es) to a node's class attribute

    Class names already present on the node are kept; the new names are
    appended after them. If the resulting list is empty, the attribute is
    deleted instead of being set to an empty string.

    :Parameters:
      `node` : TDI node
        The node to modify

      `class_` : ``tuple``
        Class name(s) to add
    """
    try:
        old = decode(node[u'class'], node.raw.encoder.encoding).split()
    except KeyError:
        # No class attribute yet - start from an empty list
        old = []

    value = u' '.join(old + list(class_))
    if value:
        node[u'class'] = value
    else:
        del node[u'class']
83
84
def class_del(node, *class_):
    """
    Remove class(es) from node's class attribute

    If no class name is left afterwards, the attribute itself is deleted.
    A node without a class attribute is left untouched.

    :Parameters:
      `node` : TDI node
        The node to modify

      `class_` : ``tuple``
        Class name(s) to remove. It is *not* an error if a class is not
        defined before.
    """
    try:
        old = decode(node[u'class'], node.raw.encoder.encoding).split()
    except KeyError:
        # Nothing to remove from
        return

    remaining = u' '.join([item for item in old if item not in class_])
    if remaining:
        node[u'class'] = remaining
    else:
        del node[u'class']
107
108
def _make_multiline():
    """
    Make multiline encoder

    The regex helpers are compiled once and kept in the closure, so the
    returned function does not recompile them on every call.

    :Return: The `multiline` function
    :Rtype: ``callable``
    """
    ws_sub = _re.compile(u'\\s+').sub
    ws1_sub = _re.compile(u'^\\s(\\S)').sub

    def space_func(match):
        """
        Space filler

        Replace a whitespace run by alternating non-breaking and regular
        spaces, so the rendered width equals the original run length while
        the line can still be wrapped.
        """
        length, rest = divmod(len(match.group(0)), 2)
        if length == 0:
            # A single whitespace character collapses to one plain space
            return u' '
        return u' ' * rest + u'&nbsp; ' * length

    def multiline(content, encoding='ascii', tabwidth=8, xhtml=True):
        """
        Encode multiline content to HTML, assignable to ``node.raw.content``

        Markup characters are escaped, line breaks become ``<br>``
        elements, and whitespace runs are preserved with ``&nbsp;``.
        Trailing whitespace of each line is dropped.

        :Parameters:
          `content` : ``unicode``
            Content to encode

          `encoding` : ``str``
            Target encoding

          `tabwidth` : ``int``
            Tab width? Used to expand tabs. If ``None``, tabs are not
            expanded.

          `xhtml` : ``bool``
            XHTML? Only used to determine if <br> or <br /> is emitted.

        :Return: The multilined content
        :Rtype: ``str``
        """
        # The ampersand must be escaped first; otherwise the entities
        # produced by the other two replacements would be escaped again.
        content = (
            content
            .replace(u'&', u'&amp;')
            .replace(u'<', u'&lt;')
            .replace(u'>', u'&gt;')
        )

        lines = []
        for line in content.splitlines():
            line = line.rstrip()
            if not line:
                # Keep empty lines visible between two line breaks
                line = u'&nbsp;'
            else:
                if tabwidth is not None:
                    line = line.expandtabs(tabwidth)
                # A single leading whitespace would be dropped by browsers
                line = ws1_sub(u'&nbsp;\\1', line)
                line = ws_sub(space_func, line)
            lines.append(line)

        linebreak = u'<br />' if xhtml else u'<br>'
        # Characters not representable in the target encoding are emitted
        # as numeric character references.
        return linebreak.join(lines).encode(encoding, 'xmlcharrefreplace')

    return multiline

multiline = _make_multiline()
169
170
177
178
180 """
181 Strip unneeded whitespace and comments
182
183 :IVariables:
184 `_buffer` : ``list``
185 Current text buffer
186
187 `_stack` : ``list``
188 Current tag stack
189
190 `_last` : ``str``
191 Last seen endtag name (normalized) or ``None``
192
193 `_blocks` : ``dict``
194 List of block elements (in a dict for better lookup)
195 """
196
def __init__(self, builder, comment_filter=None):
    """
    Initialization

    :Parameters:
      `builder` : `BuildingListenerInterface`
        Next level builder.

      `comment_filter` : callable
        Comment filter. A function which takes the comment data and
        returns a filtered comment (which is passed through to the
        builder) or ``None`` (meaning the comment can be stripped
        completely). For example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        If omitted or ``None``, all comments are stripped.
    """
    super(MinifyFilter, self).__init__(builder)
    self._buffer = []
    self._stack = []
    self._last = None
    self._dtd = _dtd.HTMLDTD()
    self._normalize = self.builder.decoder.normalize
    if comment_filter is None:
        # Default: every comment is dropped
        comment_filter = lambda x: None
    self._comment_filter = comment_filter
    # Kept as a dict (values unused) - only the key lookup matters
    self._blocks = dict.fromkeys((
        'address',
        'article',
        'aside',
        'blockquote',
        'body',
        'caption',
        'col',
        'colgroup',
        'dd',
        'dir',
        'div',
        'dl',
        'dt',
        'fieldset',
        'figcaption',
        'figure',
        'footer',
        'form',
        'frame',
        'frameset',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'head',
        'header',
        'hgroup',
        'hr',
        'html',
        'isindex',
        'layer',
        'li',
        'listing',
        'map',
        'marquee',
        'menu',
        'multicol',
        'nav',
        'noframes',
        'ol',
        'option',
        'p',
        'script',
        'style',
        'section',
        'table',
        'tbody',
        'td',
        'title',
        'tfoot',
        'th',
        'thead',
        'tr',
        'ul',
        'xmp',
    ))
286
287
288
289
# Bound ``sub`` of a precompiled pattern matching whitespace runs; used by
# ``_flush`` to collapse each run into a single space.
_WS_SUB = _re.compile(r'\s+').sub
291
def _flush(self, endtag=False, starttag=None):
    """
    Flush the current text buffer to the builder

    Does nothing if no text is buffered. Otherwise the buffer is emptied
    and its joined content is passed on as one text event:

    - inside ``pre`` or an element the DTD reports as CDATA, line
      structure is kept and only line endings are normalized to ``\\n``
      (``pre`` lines are right-stripped, ``script``/``style`` content is
      stripped as a whole)
    - everywhere else whitespace runs collapse to a single space, and
      the text is stripped next to block level elements

    NOTE(review): the nesting below was reconstructed from a listing
    without indentation - confirm against the upstream source.

    :Parameters:
      `endtag` : ``bool``
        Endtag flush?

      `starttag` : ``str``
        Next starttag (normalized) if starttag flush
    """
    if not self._buffer:
        return

    text, self._buffer = ''.join(self._buffer), []
    stack = self._stack

    if stack and (self._dtd.cdata(stack[-1]) or stack[-1] == 'pre'):
        current = stack[-1]
        if current == 'pre':
            lines = [
                line.rstrip() for line in text.rstrip().splitlines(False)
            ]
        elif current in ('script', 'style'):
            lines = text.strip().splitlines(False)
        else:
            lines = text.splitlines(False)
        text = '\n'.join(lines)
    else:
        text = self._WS_SUB(' ', text)
        # Leading space right after a closed block element is redundant
        if self._last in self._blocks:
            text = text.lstrip()
        # So is trailing space right before a block boundary
        if (endtag and stack and stack[-1] in self._blocks) \
                or starttag in self._blocks:
            text = text.rstrip()

    self.builder.handle_text(text)
324
def finalize(self):
    """
    Flush the last chunk

    The remaining buffer is flushed as if a block level element were
    about to start, then the result of the next level builder is
    returned.

    :See: `tdi.interfaces.BuilderInterface`
    """
    # Any block element name will do here; taking it from the iterator
    # avoids building a key list and also works with dict views.
    self._flush(starttag=next(iter(self._blocks)))
    return self.builder.finalize()
333
def handle_text(self, data):
    """
    Buffer the text

    The text is not passed to the builder here; it is only appended to
    the internal buffer.

    :See: `tdi.interfaces.ListenerInterface`
    """
    self._buffer.append(data)
341
def handle_starttag(self, name, attr, closed, data):
    """
    Flush pending text and emit a normalized start tag

    The tag source is regenerated by the encoder from the normalized
    name and the normalized, sorted attributes. The builder receives
    the original ``attr`` list together with the regenerated source.

    :See: `tdi.interfaces.ListenerInterface`
    """
    norm = self._normalize
    norm_name = norm(name)
    self._flush(False, norm_name)
    if not closed:
        # Self-closed tags never become the current element
        self._stack.append(norm_name)
    newattr = sorted((norm(key), value) for key, value in attr)
    data = self.encoder.starttag(norm_name, newattr, closed)
    self.builder.handle_starttag(norm_name, attr, closed, data)
355
def handle_endtag(self, name, data):
    """
    Flush pending text and emit a normalized end tag

    If the tag closes the current element, it is popped from the stack
    and remembered as the last seen end tag. Non-empty tag source is
    regenerated by the encoder; empty source is passed on unchanged.

    :See: `tdi.interfaces.ListenerInterface`
    """
    self._flush(True)
    norm_name, stack = self._normalize(name), self._stack
    if stack and norm_name == stack[-1]:
        self._last = stack.pop()
    if data:
        data = self.encoder.endtag(norm_name)
    self.builder.handle_endtag(norm_name, data)
365
371
376
def handle_decl(self, name, value, data):
    """
    Flush pending text, then pass the declaration through unchanged

    :See: `tdi.interfaces.ListenerInterface`
    """
    self._flush()
    self.builder.handle_decl(name, value, data)
381
def handle_pi(self, data):
    """
    Flush pending text, then pass the processing instruction through

    :See: `tdi.interfaces.ListenerInterface`
    """
    self._flush()
    self.builder.handle_pi(data)
386
387
def minify(html, encoding='ascii', fail_silently=False, comment_filter=None,
           cdata_containers=False):
    """
    Minify HTML

    Enclosed <script> and <style> blocks are minified as well.

    :Parameters:
      `html` : ``basestring``
        HTML to minify

      `encoding` : ``str``
        Initially assumed encoding. Only marginally interesting.

      `fail_silently` : ``bool``
        Swallow parse errors? If true, a `LexerError` raised while
        processing is swallowed and the input html is returned
        unchanged. Otherwise (the default) the error is passed to the
        caller.

      `comment_filter` : callable
        HTML Comment filter. A function which takes the comment data and
        returns a filtered comment (which is passed through to the
        builder) or ``None`` (meaning the comment can be stripped
        completely). For example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        If omitted or ``None``, all HTML comments are stripped.

      `cdata_containers` : ``bool``
        Add CDATA containers to enclosed <script> or <style> content? If
        true, these containers are added after minimization of the
        content. Default is false.

    :Return: the minified HTML - typed as input
    :Rtype: ``basestring``

    :Exceptions:
      - `LexerError` : Parse error, and `fail_silently` is false
    """
    def js_minify(builder):
        """ Javascript minifier filter factory """
        return _javascript.MinifyFilter(builder, standalone=True)

    def js_cdata(builder):
        """ Javascript cdata container filter factory """
        return _javascript.CDATAFilter(builder, standalone=True)

    def css_minify(builder):
        """ CSS minifier filter factory """
        return _css.MinifyFilter(builder, standalone=True)

    def css_cdata(builder):
        """ CSS cdata container filter factory """
        return _css.CDATAFilter(builder, standalone=True)

    def html_minify(builder):
        """ HTML minifier filter factory """
        return MinifyFilter(builder, comment_filter=comment_filter)

    filters = [js_cdata, css_cdata] if cdata_containers else []

    # Unicode input is processed as UTF-8 bytes and decoded again on the
    # way out, so the caller gets back the type that was passed in.
    isuni = isinstance(html, unicode)
    if isuni:
        html = html.encode('utf-8')

    try:
        result = _factory.Loader(
            builder=_StringBuilder,
            parser=_parser.SoupParser.html,
            encoder=_encoder.SoupEncoder,
            decoder=_decoder.HTMLDecoder,
            eventfilters=filters + [
                js_minify,
                css_minify,
                html_minify,
            ]
        )(_string_io.StringIO(html), '<string>', encoding)
    except LexerError:
        if not fail_silently:
            raise
        # Best effort: hand back the input as it was received
        result = html

    if isuni:
        return result.decode('utf-8')
    return result
470
471
473 """ String builder """
474 __implements__ = [_interfaces.BuilderInterface,
475 _interfaces.BuildingListenerInterface]
476
477 encoding = 'ascii'
478
def __init__(self, encoder, decoder):
    """
    Initialization

    Both factories are called with the builder's ``encoding`` attribute
    and the results are stored as ``encoder`` and ``decoder``.

    :Parameters:
      `encoder` : ``callable``
        Encoder factory

      `decoder` : ``callable``
        Decoder factory
    """
    self._result = []
    self.encoder = encoder(self.encoding)
    self.decoder = decoder(self.encoding)
493
def handle_text(self, data):
    """
    Append the text to the result list

    :see: `ListenerInterface`
    """
    self._result.append(data)
497
499 """ :see: `ListenerInterface` """
500
501 self._result.append(data)
502
504 """ :see: `ListenerInterface` """
505
506 self._result.append(data)
507
509 """ :see: `ListenerInterface` """
510
511 self._result.append(data)
512
516
518 """ :see: `ListenerInterface` """
519
520 self._result.append(data)
521
523 """ :see: `ListenerInterface` """
524
525 self._result.append(data)
526
528 """ :see: `ListenerInterface` """
529 self._result.append(data)
530
542
def finalize(self):
    """
    Join all collected chunks into the final string

    :See: `tdi.interfaces.BuilderInterface`
    """
    return ''.join(self._result)
546