Package tdi :: Module _htmldecode
[frames | no frames]

Source Code for Module tdi._htmldecode

  1  # -*- coding: ascii -*- 
  2  # 
  3  # Copyright 2006 - 2013 
  4  # Andr\xe9 Malo or his licensors, as applicable 
  5  # 
  6  # Licensed under the Apache License, Version 2.0 (the "License"); 
  7  # you may not use this file except in compliance with the License. 
  8  # You may obtain a copy of the License at 
  9  # 
 10  #     http://www.apache.org/licenses/LICENSE-2.0 
 11  # 
 12  # Unless required by applicable law or agreed to in writing, software 
 13  # distributed under the License is distributed on an "AS IS" BASIS, 
 14  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 15  # See the License for the specific language governing permissions and 
 16  # limitations under the License. 
 17  """ 
 18  ============== 
 19   HTML Decoder 
 20  ============== 
 21   
 22  HTML Decoder. 
 23  """ 
 24  __author__ = u"Andr\xe9 Malo" 
 25  __docformat__ = "restructuredtext en" 
 26   
 27  import re as _re 
 28   
 29  from tdi import _htmlentities 
 30   
 31   
def _make_decode():
    """
    Build the HTML decoder function

    If ``tdi.c.load('impl')`` yields a module, its ``htmldecode`` attribute
    is returned as-is. Otherwise a pure python decoder is assembled here
    and returned.

    :Return: The decoder function
    :Rtype: ``callable``
    """
    # pylint: disable = R0912

    from tdi import c as _loader
    impl = _loader.load('impl')
    if impl is not None:
        return impl.htmldecode

    # Everything the decoder needs is bound once here, so the closures
    # below only deal with local names.
    # The pattern text is identical to the raw form &([^& \t\n\r\f;]*);
    # It matches "&", any run of characters excluding "&", ";" and
    # whitespace, then the terminating ";".
    replace_entities = _re.compile(u'&([^& \\t\\n\\r\\f;]*);').sub
    text_type, to_char = unicode, unichr
    byte_type, to_int = str, int
    is_instance = isinstance
    html_entities = dict(_htmlentities.htmlentities)

    # pylint: disable = W0621
    def decode(value, encoding='latin-1', errors='strict', entities=None):
        """
        Resolve HTML entities and character references in a text

        :Parameters:
          `value` : ``basestring``
            The HTML encoded text

          `encoding` : ``str``
            Encoding used to turn `value` into unicode first. It is not
            used if `value` is a unicode instance already. The default is
            'latin-1'.

          `errors` : ``str``
            Error handler. It is handed to ``.decode()`` and also decides
            what happens to references which cannot be resolved (unknown
            entity name, unparsable or unrepresentable codepoint):

            ``ignore``
              The reference is emitted unchanged

            ``replace``
              U+FFFD (the replacement character) is emitted instead

            ``strict`` (and every other value)
              ``ValueError`` is raised

          `entities` : ``dict``
            Mapping of entity names to their values
            (unicode(name) -> unicode(value)). If omitted or ``None``, the
            `HTML5 entity list`_ is used.

            .. _HTML5 entity list: http://www.w3.org/TR/html5/
                syntax.html#named-character-references

        :Return: The decoded text
        :Rtype: ``unicode``

        :Exceptions:
          - `ValueError` : A reference could not be resolved and `errors`
            is neither ``ignore`` nor ``replace``
        """
        # pylint: disable = E1101
        # pylint: disable = R0912
        if is_instance(value, text_type):
            text = value
        else:
            text = byte_type(value).decode(encoding, errors)

        if entities is None:
            mapping = html_entities
        else:
            mapping = entities

        def resolve(found):
            """ Turn one matched reference into its replacement text """
            ref = found.group(1)
            if ref.startswith(u'#'):
                # Numeric reference: "#x" / "#X" selects hex, else decimal
                if ref.startswith((u'#x', u'#X')):
                    radix, digits = 16, ref[2:]
                else:
                    radix, digits = 10, ref[1:]
                try:
                    return to_char(to_int(digits, radix))
                except (ValueError, TypeError, OverflowError):
                    pass
            else:
                # Named reference
                try:
                    return mapping[ref]
                except KeyError:
                    pass

            # Only unresolved references get here
            if errors == 'ignore':
                return found.group(0)
            if errors == 'replace':
                return u'\ufffd'
            raise ValueError("Unresolved entity %r" % (found.group(0),))

        return replace_entities(resolve, text)

    return decode


decode = _make_decode()