Package tdi :: Package tools :: Module html
[frames | no frames]

Source Code for Module tdi.tools.html

  1  # -*- coding: ascii -*- 
  2  # 
  3  # Copyright 2006 - 2013 
  4  # Andr\xe9 Malo or his licensors, as applicable 
  5  # 
  6  # Licensed under the Apache License, Version 2.0 (the "License"); 
  7  # you may not use this file except in compliance with the License. 
  8  # You may obtain a copy of the License at 
  9  # 
 10  #     http://www.apache.org/licenses/LICENSE-2.0 
 11  # 
 12  # Unless required by applicable law or agreed to in writing, software 
 13  # distributed under the License is distributed on an "AS IS" BASIS, 
 14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 15  # See the License for the specific language governing permissions and 
 16  # limitations under the License. 
 17  """ 
 18  ============ 
 19   HTML Tools 
 20  ============ 
 21   
 22  HTML Tools. 
 23  """ 
 24  __author__ = u"Andr\xe9 Malo" 
 25  __docformat__ = "restructuredtext en" 
 26  __all__ = [ 
 27      'decode', 'entities', 'class_add', 'class_del', 'multiline', 
 28      'CommentStripFilter', 'MinifyFilter', 'minify' 
 29  ] 
 30   
 31  import codecs as _codecs 
 32  import re as _re 
 33  try: 
 34      import cStringIO as _string_io 
 35  except ImportError: 
 36      import StringIO as _string_io 
 37   
 38  from tdi import LexerError 
 39  from tdi import factory as _factory 
 40  from tdi import filters as _filters 
 41  from tdi import interfaces as _interfaces 
 42  from tdi.markup.soup import dtd as _dtd 
 43  from tdi.markup.soup import encoder as _encoder 
 44  from tdi.markup.soup import decoder as _decoder 
 45  from tdi.markup.soup import parser as _parser 
 46  from tdi.tools import css as _css 
 47  from tdi.tools import javascript as _javascript 
 48  from tdi._htmldecode import decode 
 49  from tdi._htmlentities import htmlentities as entities 
 50   
 51   
 52  #: HTML named character references, generated from 
 53  #: `the HTML5 spec`_\. 
 54  #: 
 55  #: .. _the HTML5 spec: http://www.w3.org/TR/html5/ 
 56  #:    syntax.html#named-character-references 
 57  #: 
 58  #: :Type: ``dict`` 
 59  entities = dict(entities) 
 60   
 61   
def class_add(node, *class_):
    """
    Add class(es) to a node's class attribute

    Class names which are already present (either in the node's current
    class attribute or earlier in `class_`) are not added a second time, so
    calling this function repeatedly with the same names is idempotent.
    Tokens already stored in the attribute are kept as they are.

    :Parameters:
      `node` : TDI node
        The node to modify

      `class_` : ``tuple``
        Class name(s) to add
    """
    try:
        names = decode(node[u'class'], node.raw.encoder.encoding).split()
    except KeyError:
        # No class attribute yet - start from scratch
        names = []

    seen = set(names)
    for name in class_:
        if name not in seen:
            seen.add(name)
            names.append(name)

    merged = u' '.join(names)
    if merged:
        node[u'class'] = merged
    else:
        # Nothing left to store - do not leave an empty attribute behind
        del node[u'class']
83 84
def class_del(node, *class_):
    """
    Drop class(es) from a node's class attribute

    If the node carries no class attribute, nothing happens. If removing
    the names leaves the attribute empty, the attribute itself is deleted.

    :Parameters:
      `node` : TDI node
        The node to modify

      `class_` : ``tuple``
        Class name(s) to remove. It is *not* an error if a class is not
        defined before.
    """
    try:
        present = decode(node[u'class'], node.raw.encoder.encoding).split()
    except KeyError:
        # No class attribute at all, so there is nothing to remove
        return

    survivors = []
    for name in present:
        if name not in class_:
            survivors.append(name)
    remaining = u' '.join(survivors)

    if not remaining:
        del node[u'class']
        return
    node[u'class'] = remaining
107 108
def _make_multiline():
    """
    Make multiline encoder

    The regex substitutors and the whitespace filler are created once and
    captured by the returned closure.

    :Return: The ``multiline`` function
    :Rtype: ``callable``
    """
    # pylint: disable = W0621

    divmod_, len_ = divmod, len

    def space_func(match):
        """
        Space filler

        Replace a whitespace run by a sequence of the same visible width
        which is not collapsed by HTML rendering: pairs of
        ``&nbsp;`` + space, preceded by a single plain space if the run
        length is odd. A single whitespace character becomes one space.
        """
        length, rest = divmod_(len_(match.group(0)), 2)
        if length == 0:
            return u' '
        # Plain spaces alone would be collapsed by the browser; every
        # second character has to be a non-breaking space.
        return u' ' * rest + u'&nbsp; ' * length

    # The patterns are spelled as u'' literals with escaped backslashes
    # (same value as the ur'' form), because the ur prefix is a syntax
    # error on Python 3.
    ws_sub = _re.compile(u'\\s+').sub
    ws1_sub = _re.compile(u'^\\s(\\S)').sub

    def multiline(content, encoding='ascii', tabwidth=8, xhtml=True):
        """
        Encode multiline content to HTML, assignable to ``node.raw.content``

        ``&``, ``<`` and ``>`` are escaped, trailing whitespace of each line
        is dropped, empty lines become ``&nbsp;``, whitespace runs are made
        non-collapsible and the lines are joined with line break tags.
        Characters not representable in `encoding` are emitted as numeric
        character references.

        :Parameters:
          `content` : ``unicode``
            Content to encode

          `encoding` : ``str``
            Target encoding

          `tabwidth` : ``int``
            Tab width? Used to expand tabs. If ``None``, tabs are not
            expanded.

          `xhtml` : ``bool``
            XHTML? Only used to determine if <br> or <br /> is emitted.

        :Return: The multilined content
        :Rtype: ``str``
        """
        # Escape first, so the entities inserted below are not re-escaped
        content = (content
            .replace(u'&', u'&amp;')
            .replace(u'<', u'&lt;')
            .replace(u'>', u'&gt;')
        )
        lines = []
        for line in content.splitlines():
            line = line.rstrip()
            if not line:
                # Keep empty lines visible
                line = u'&nbsp;'
            else:
                if tabwidth is not None:
                    line = line.expandtabs(tabwidth)
                # A single leading whitespace would vanish at line start
                line = ws1_sub(u'&nbsp;\\1', line)
                line = ws_sub(space_func, line)
            lines.append(line)
        if xhtml:
            res = u'<br />'.join(lines)
        else:
            res = u'<br>'.join(lines)
        return res.encode(encoding, 'xmlcharrefreplace')

    return multiline

multiline = _make_multiline()
class CommentStripFilter(_filters.BaseEventFilter):
    """
    Event filter which removes comments from the event chain

    Comment events are swallowed here and never reach the next builder.
    """

    def handle_comment(self, data):
        """
        Drop the comment

        :See: `tdi.interfaces.ListenerInterface`
        """
        # Deliberately not forwarded to the builder
        return None
177 178
class MinifyFilter(_filters.BaseEventFilter):
    """
    Strip unneeded whitespace and comments

    Text events are buffered and only passed to the builder (whitespace
    reduced) when the next non-text event arrives.

    :IVariables:
      `_buffer` : ``list``
        Current text buffer

      `_stack` : ``list``
        Current tag stack

      `_last` : ``str``
        Name (normalized) of the last endtag popped from the tag stack, as
        long as neither a starttag nor non-empty text has been passed on
        since. ``None`` otherwise.

      `_blocks` : ``dict``
        List of block elements (in a dict for better lookup)
    """

    def __init__(self, builder, comment_filter=None):
        """
        Initialization

        :Parameters:
          `builder` : `BuildingListenerInterface`
            Next level builder.

          `comment_filter` : callable
            Comment filter. A function which takes the comment data and
            returns a filtered comment (which is passed through to the
            builder) or ``None`` (meaning the comment can be stripped
            completely). For example::

                def keep_ad_comments(data):
                    if 'google_ad_section' in data:
                        return data
                    return None

            If omitted or ``None``, all comments are stripped.
        """
        super(MinifyFilter, self).__init__(builder)
        self._buffer = []
        self._stack = []
        self._last = None
        self._dtd = _dtd.HTMLDTD()
        self._normalize = self.builder.decoder.normalize
        if comment_filter is None:
            comment_filter = lambda x: None
        self._comment_filter = comment_filter
        self._blocks = dict.fromkeys((
            'address',
            'article',
            'aside',
            'blockquote',
            'body',
            'caption',
            'col',
            'colgroup',
            'dd',
            'dir',
            'div',
            'dl',
            'dt',
            'fieldset',
            'figcaption',
            'figure',
            'footer',
            'form',
            'frame',
            'frameset',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'head',
            'header',
            'hgroup',
            'hr',
            'html',
            'isindex',
            'layer',
            'li',
            'listing',
            'map',
            'marquee',
            'menu',
            'multicol',
            'nav',
            'noframes',
            'ol',
            'option',
            'p',
            'script',
            'style',
            'section',
            'table',
            'tbody',
            'td',
            'title',
            'tfoot',
            'th',
            'thead',
            'tr',
            'ul',
            'xmp',
        ))

    #: Whitespace substitutor
    #:
    #: :Type: ``callable``
    _WS_SUB = _re.compile(r'\s+').sub

    def _flush(self, endtag=False, starttag=None):
        """
        Flush the current text buffer to the builder

        Inside of CDATA elements and ``pre`` only line endings are
        normalized (plus trailing whitespace stripping for ``pre`` and outer
        stripping for ``script``/``style``). Everywhere else whitespace runs
        are collapsed to one space and the text is stripped next to block
        element boundaries.

        :Parameters:
          `endtag` : ``bool``
            Endtag flush?

          `starttag` : ``str``
            Next starttag (normalized) if starttag flush
        """
        if not self._buffer:
            return

        self._buffer, buf, stack = [], ''.join(self._buffer), self._stack
        if stack and \
                (self._dtd.cdata(stack[-1]) or stack[-1] == 'pre'):
            if stack[-1] == 'pre':
                buf = [line.rstrip()
                    for line in buf.rstrip().splitlines(False)
                ]
            elif stack[-1] in ('script', 'style'):
                buf = buf.strip().splitlines(False)
            else:
                buf = buf.splitlines(False)
            buf = '\n'.join(buf)
        else:
            buf = self._WS_SUB(' ', buf)
            # Text directly following a block endtag
            if self._last in self._blocks:
                buf = buf.lstrip()
            # Text directly preceding a block endtag or block starttag
            if (endtag and stack and stack[-1] in self._blocks) \
                    or starttag in self._blocks:
                buf = buf.rstrip()
        self.builder.handle_text(buf)
        if buf:
            # Real text was emitted, so whatever is buffered next no longer
            # follows the recorded endtag directly.
            self._last = None

    def finalize(self):
        """
        Flush the last chunk

        :See: `tdi.interfaces.BuilderInterface`
        """
        # Pretend a block element follows, so trailing whitespace of the
        # document is stripped. Any member of ``_blocks`` would do.
        self._flush(starttag='html')
        return self.builder.finalize()

    def handle_text(self, data):
        """
        Buffer the text

        :See: `tdi.interfaces.ListenerInterface`
        """
        self._buffer.append(data)

    def handle_starttag(self, name, attr, closed, data):
        """
        Flush pending text and pass on the starttag, re-encoded with
        normalized name and normalized, sorted attributes

        :See: `tdi.interfaces.ListenerInterface`
        """
        norm = self._normalize
        norm_name = norm(name)
        self._flush(False, norm_name)
        # Text inside or after this element does not follow the recorded
        # endtag directly anymore; a stale value would strip significant
        # whitespace (e.g. ``</p>foo<b> bar</b>``).
        self._last = None
        if not closed:
            self._stack.append(norm_name)
        newattr = [(norm(key), value) for key, value in attr]
        newattr.sort()
        data = self.encoder.starttag(
            norm_name, newattr, closed
        )
        # NOTE(review): the original (not normalized) attribute list is
        # passed on while ``data`` is built from the normalized one -
        # confirm whether downstream listeners rely on that.
        self.builder.handle_starttag(norm_name, attr, closed, data)

    def handle_endtag(self, name, data):
        """
        Flush pending text and pass on the endtag

        The endtag is only recorded in ``_last`` (and popped from the tag
        stack) if it matches the innermost open element.

        :See: `tdi.interfaces.ListenerInterface`
        """
        self._flush(True)
        norm_name, stack = self._normalize(name), self._stack
        if stack and norm_name == stack[-1]:
            self._last = stack.pop()
        if data:
            data = self.encoder.endtag(norm_name)
        self.builder.handle_endtag(norm_name, data)

    def handle_comment(self, data):
        """
        Pass the comment through the comment filter

        :See: `tdi.interfaces.ListenerInterface`
        """
        data = self._comment_filter(data)
        if data is not None:
            # Emit buffered text first; otherwise the comment would
            # overtake the text preceding it.
            self._flush()
            self.builder.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_msection(name, value, data)

    def handle_decl(self, name, value, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_decl(name, value, data)

    def handle_pi(self, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_pi(data)
386 387
def minify(html, encoding='ascii', fail_silently=False, comment_filter=None,
           cdata_containers=False):
    """
    Minify HTML

    Enclosed <script> and <style> blocks are minified as well.

    :Parameters:
      `html` : ``basestring``
        HTML to minify. Unicode input is processed as UTF-8 internally and
        returned as unicode again.

      `encoding` : ``str``
        Initially assumed encoding. Only marginally interesting.

      `fail_silently` : ``bool``
        Swallow parse errors? If true, a parse error (`LexerError`) is
        swallowed and the input html is returned unchanged. Otherwise
        (the default) the parse error is raised.

      `comment_filter` : callable
        HTML Comment filter. A function which takes the comment data and
        returns a filtered comment (which is passed through to the
        builder) or ``None`` (meaning the comment can be stripped
        completely). For example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        If omitted or ``None``, all HTML comments are stripped.

      `cdata_containers` : ``bool``
        Add CDATA containers to enclosed <script> or <style> content? If true,
        these containers are added after minimization of the content. Default
        is false.

    :Return: the minified HTML - typed as input
    :Rtype: ``basestring``

    :Exceptions:
      - `LexerError` : The input could not be parsed and `fail_silently`
        is false
    """
    def js_minify(builder):
        """ Javascript minifier filter factory """
        return _javascript.MinifyFilter(builder, standalone=True)

    def js_cdata(builder):
        """ Javascript cdata container filter factory """
        return _javascript.CDATAFilter(builder, standalone=True)

    def css_minify(builder):
        """ CSS minifier filter factory """
        return _css.MinifyFilter(builder, standalone=True)

    def css_cdata(builder):
        """ CSS cdata container filter factory """
        return _css.CDATAFilter(builder, standalone=True)

    def html_minify(builder):
        """ HTML minifier filter factory """
        return MinifyFilter(builder, comment_filter=comment_filter)

    # The optional CDATA container filters come first in the list
    if cdata_containers:
        filters = [js_cdata, css_cdata]
    else:
        filters = []
    filters.extend([js_minify, css_minify, html_minify])

    # The loader is fed a byte stream; remember to convert back later
    isuni = isinstance(html, unicode)
    if isuni:
        html = html.encode('utf-8')
    try:
        result = _factory.Loader(
            builder=_StringBuilder,
            parser=_parser.SoupParser.html,
            encoder=_encoder.SoupEncoder,
            decoder=_decoder.HTMLDecoder,
            eventfilters=filters
        )(_string_io.StringIO(html), '<string>', encoding)
    except LexerError:
        if not fail_silently:
            raise
        # Best effort: hand back the input as it was
        result = html
    if isuni:
        return result.decode('utf-8')
    return result
class _StringBuilder(object):
    """
    Builder which simply collects the raw data of all events

    The collected chunks are concatenated on `finalize`.

    :CVariables:
      `encoding` : ``str``
        Initial encoding; replaced per instance by `handle_encoding`

    :IVariables:
      `_result` : ``list``
        Collected data chunks

      `encoder` : object
        Encoder instance, created by the encoder factory

      `decoder` : object
        Decoder instance, created by the decoder factory
    """
    __implements__ = [_interfaces.BuilderInterface,
                      _interfaces.BuildingListenerInterface]

    encoding = 'ascii'

    def __init__(self, encoder, decoder):
        """
        Initialization

        :Parameters:
          `encoder` : ``callable``
            Encoder factory

          `decoder` : ``callable``
            Decoder factory
        """
        self._result = []
        initial = self.encoding
        self.encoder = encoder(initial)
        self.decoder = decoder(initial)

    def _store(self, data):
        """ Append one raw data chunk to the result list """
        self._result.append(data)

    def handle_text(self, data):
        """ :see: `ListenerInterface` """
        self._store(data)

    def handle_escape(self, escaped, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._store(data)

    def handle_starttag(self, name, attr, closed, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._store(data)

    def handle_endtag(self, name, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._store(data)

    def handle_comment(self, data):
        """ :see: `ListenerInterface` """
        self._store(data)

    def handle_msection(self, name, value, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._store(data)

    def handle_decl(self, name, value, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = W0613
        self._store(data)

    def handle_pi(self, data):
        """ :see: `ListenerInterface` """
        self._store(data)

    def handle_encoding(self, encoding):
        """
        Switch to a newly detected encoding, if the codec is known

        :See: `tdi.interfaces.BuildingListenerInterface`
        """
        try:
            _codecs.lookup(encoding)
        except LookupError:
            # Unknown codec name: keep the current encoding
            return

        if self.encoding != encoding:
            self.encoding = encoding
            self.encoder.encoding = encoding
            self.decoder.encoding = encoding

    def finalize(self):
        """ :See: `tdi.interfaces.BuilderInterface` """
        chunks = self._result
        return ''.join(chunks)
546