1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 """
18 ==============
19 HTML Decoder
20 ==============
21
22 HTML Decoder.
23 """
24 __author__ = u"Andr\xe9 Malo"
25 __docformat__ = "restructuredtext en"
26
27 import re as _re
28
29 from tdi import _htmlentities
30
31
def _make_decode():
    """
    Make decoder

    If ``c.load('impl')`` returns a module, its ``htmldecode`` callable is
    returned as-is. Otherwise a pure python decoder is built as a closure
    and returned.

    :Return: The decoder function
    :Rtype: ``callable``
    """
    from tdi import c
    c = c.load('impl')
    if c is not None:
        return c.htmldecode

    # Entity reference: '&', a name without '&', whitespace or ';', then ';'.
    # The literal is spelled with escaped backslashes instead of the ``ur``
    # prefix; the pattern text is identical, but this form is also accepted
    # by python 3 parsers.
    sub = _re.compile(u'&([^& \\t\\n\\r\\f;]*);').sub

    # Builtins are bound to closure names once (presumably to save the
    # global/builtin lookups on every call).
    unicode_, unichr_, str_, int_ = unicode, unichr, str, int
    isinstance_ = isinstance

    # Private copy, so the default mapping is independent of later changes
    # to the list in ``_htmlentities``.
    default_entities = dict(_htmlentities.htmlentities)

    def decode(value, encoding='latin-1', errors='strict', entities=None):
        """
        Decode HTML encoded text

        :Parameters:
          `value` : ``basestring``
            HTML content to decode

          `encoding` : ``str``
            Unicode encoding to be applied before value is being processed
            further. If value is already a unicode instance, the encoding is
            ignored. If omitted, 'latin-1' is applied (because it can't fail
            and maps bytes 1:1 to unicode codepoints).

          `errors` : ``str``
            Error handling, passed to .decode() and evaluated for entities.
            If the entity name or character codepoint could not be found or
            not be parsed then the error handler has the following semantics:

            ``strict`` (or anything different from the other tokens below)
                A ``ValueError`` is raised.

            ``ignore``
                The original entity is passed through

            ``replace``
                The character is replaced by the replacement character
                (U+FFFD)

          `entities` : ``dict``
            Entity name mapping (unicode(name) -> unicode(value)). If
            omitted or ``None``, the `HTML5 entity list`_ is applied.

        .. _HTML5 entity list: http://www.w3.org/TR/html5/
           syntax.html#named-character-references

        :Return: The decoded content
        :Rtype: ``unicode``

        :Exceptions:
          - `ValueError` : An entity could not be resolved and `errors` is
            neither ``ignore`` nor ``replace``
        """
        if not isinstance_(value, unicode_):
            value = str_(value).decode(encoding, errors)
        if entities is None:
            entities = default_entities

        def subber(match):
            """
            Substituter

            :Parameters:
              `match` : match object
                Match of a single entity reference; group 1 is the text
                between ``&`` and ``;``

            :Return: The replacement text for the reference
            :Rtype: ``unicode``

            :Exceptions:
              - `ValueError` : The reference is unresolvable and `errors`
                is neither ``ignore`` nor ``replace``
            """
            name = match.group(1)
            if not name.startswith(u'#'):
                # Named reference: plain lookup in the entity mapping
                try:
                    return entities[name]
                except KeyError:
                    pass
            else:
                # Numeric reference: '#x'/'#X' selects hex, '#' decimal
                if name.startswith((u'#x', u'#X')):
                    base = 16
                    name = name[2:]
                else:
                    base = 10
                    name = name[1:]
                # NOTE(review): int() also accepts a leading sign and, for
                # base 16, a '0x' prefix - so '&#+65;' and '&#x0x41;' both
                # resolve. Tighten here if strict numeric references are
                # required.
                try:
                    return unichr_(int_(name, base))
                except (ValueError, TypeError, OverflowError):
                    pass

            # Unresolved reference: apply the error handler
            if errors == 'ignore':
                return match.group(0)
            elif errors == 'replace':
                return u'\ufffd'
            else:
                raise ValueError(
                    "Unresolved entity %r" % (match.group(0),)
                )

        return sub(subber, value)

    return decode


# Module level decoder: whatever the factory selected.
decode = _make_decode()
125