Coverage for phml\parser.py: 100%
124 statements
« prev ^ index » next coverage.py v6.5.0, created at 2023-04-05 15:06 -0500
« prev ^ index » next coverage.py v6.5.0, created at 2023-04-05 15:06 -0500
1"""Pythonic Hypertext Markup Language (phml) parser."""
2import re
3from copy import deepcopy
4from operator import itemgetter
6from .nodes import (
7 AST,
8 Attribute,
9 Element,
10 Literal,
11 LiteralType,
12 Parent,
13 Point,
14 Position,
15)
def strip(data: str, cur_tags: list[str]) -> str:
    """Trim surrounding whitespace from `data` unless the tag context preserves it.

    The text is returned untouched when the innermost open tag is `python`,
    `script` or `style`, or when a `pre` tag appears anywhere in the stack of
    open tags. In every other case (including an empty stack) the result is
    `data.strip()`.

    Args:
        data (str): The possibly multiline text to clean up.
        cur_tags (list[str]): Names of the currently open tags, innermost last.

    Returns:
        str: Either `data` unchanged or `data` without leading/trailing whitespace.
    """
    if cur_tags:
        innermost = cur_tags[-1]
        if innermost in ("python", "script", "style") or "pre" in cur_tags:
            return data
    return data.strip()
# Tag names the parser treats as self closing (no separate closing tag) when
# auto closing is enabled. Order is kept exactly as originally declared.
self_closing = [
    # HTML void elements
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
    # Older / obsolete elements that are also written without a closing tag
    "command", "keygen", "menuitem",
    # Capitalized names; presumably phml-specific component tags -- confirm against the docs
    "Slot", "Markdown",
]
# Main form of tokenization
class RE:
    """Namespace of the precompiled regular expressions used to tokenize markup."""

    tag_start = re.compile(
        r"(?P<comment><!--)"  # opening of a comment
        r"|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)"  # `<name`, `</name`, `<!name`
        r"|<(?P<opening2>/)?(?=\s+>|>)",  # nameless `<>` / `</>`
    )
    """Matches the start of a tag `<!name|</name|<name`, a nameless `<`/`</` that is
    directly followed by `>`, or the opening of a comment `<!--`."""

    tag_end = re.compile(r"(?P<closing>/?)>")
    """Matches the end of a tag `/>|>`."""

    comment = re.compile(r"<!--((?:.|\s)*)-->")
    """Matches all html style comments `<!--Comment-->`."""

    comment_close = re.compile(r"-->")
    """Matches the closing marker of a comment `-->`."""

    attribute = re.compile(
        r"(?P<name>[\w:\-@]+)"  # attribute name
        r"(?:=(?P<value>"  # optional `=value`, one of:
        r"\{(?P<curly>[^\}]*)\/\}"  # {value/}
        r"|\"(?P<double>[^\"]*)\""  # "value"
        r"|'(?P<single>[^']*)'"  # 'value'
        r"|(?P<open>[^>'\"\s]+)"  # bare value
        r"))?",
    )
    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"|attr={value/}`."""

    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")
    """Matches a whole string of the form `{value/}`, allowing surrounding whitespace,
    and captures the text between the brackets."""
class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    tag_stack: list[str] = []
    """Current stack of tags in order of when they are opened. Replaced with a fresh
    list at the start of every `parse` call."""
    in_pre: int = 0
    """Number of currently open `pre` elements; above zero means the current element
    context is inside a pre element."""

    def __init__(self) -> None:
        # Give every instance its own state so the mutable class-level default list
        # can never be shared between parsers.
        self.tag_stack = []
        self.in_pre = 0

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to the starting point in the
        source string.

        Returns:
            tuple[int, int]: The count of newlines in `source[:start]` and the number of
            characters that follow the last of those newlines (or the length of the
            whole slice when it contains no newline).
        """
        consumed = source[:start]
        # rfind gives -1 when there is no newline, which makes the column the full length.
        return consumed.count("\n"), len(consumed) - (consumed.rfind("\n") + 1)

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text(self, text: str, pos: Position) -> Literal | None:
        """Parse the general text found in the provided source.

        `pos` is updated in place by the number of lines/columns the raw text spans.

        Returns:
            Literal | None: A text literal holding the text (stripped according to the
            current tag stack), or None when nothing but strippable whitespace is left.
        """
        content = strip(text, self.tag_stack)
        if content == "":
            return None

        line, col = self.__calc_line_col(text, len(text))
        # NOTE(review): the start column is overwritten with the column reached at the
        # end of the text, even for single line text. Kept as is; confirm intent.
        pos.start.line += line
        pos.start.column = col

        pos.end.line += line
        pos.end.column = self.__calc_col(line, col, pos.end.column)
        return Literal(
            LiteralType.Text,
            content,
            position=Position.from_pos(pos),
            in_pre=self.in_pre > 0,
        )

    def __parse_attributes(self, attrs: str) -> dict[str, Attribute]:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Values of `yes`/`true` and attributes without a usable value become True,
        `no`/`false` become False, everything else is kept as the matched string.

        Example:
            `<name (attributes)>`
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            # NOTE(review): the `{value/}` form (group `curly`) is matched but never
            # stored, and an empty quoted value is falsy, so both end up as True.
            value = attr["double"] or attr["single"] or attr["open"]

            if value in ("yes", "true", None):
                value = True
            elif value in ("no", "false"):
                value = False

            attributes[attr["name"]] = value
        return attributes

    def __parse_tag(self, source, position: Position, match=None):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any text from the start of the source to the start of the tag.

        Args:
            source: The remaining, not yet parsed source.
            position (Position): Running position; updated in place.
            match: Optional result of `RE.tag_start.search(source)` so the caller does
                not have to pay for a second search. Searched for here when omitted.

        Returns:
            A tuple of: the source left after the tag, the tag start as
            `(index, matched text, groups)`, the parsed attributes (`{"data": ...}`
            for comments), the tag end as `(index, matched text, groups)`, and the text
            literal found before the tag (or None).

        Raises:
            Exception: When no tag start exists, a comment is never closed, or a tag is
                missing its closing `>`.
        """
        found = match if match is not None else RE.tag_start.search(source)
        if found is None:
            raise Exception("Expected the start of a tag but none was found")
        begin = (found.start(), found.group(0), found.groupdict())

        elem = None
        if begin[0] > 0:
            elem = self.__parse_text(source[: begin[0]], position)

        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]) :]

        if begin[2]["comment"] is not None:
            end = RE.comment_close.search(source)
            if end is None:
                raise Exception("Comment was not closed")
            end = (end.start(), end.group(0), end.groupdict())
            attributes: dict[str, Attribute] = {"data": source[: end[0]]}
        else:
            # A nameless tag (`<>` / `</>`) reports its slash through `opening2`.
            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]
            end = RE.tag_end.search(source)
            if end is None:
                raise Exception(
                    f"Expected tag {begin[1]} to be closed with symbol '>'. Was not closed.",
                )
            end = (end.start(), end.group(0), end.groupdict())
            if begin[2]["opening"] == "/" and "<" in source[: end[0]]:
                # Another tag starts before this closing tag ends: its `>` is missing.
                line, col = self.__calc_line_col(source, end[0] + len(end[1]))
                position.end.line = position.start.line + line
                position.end.column = position.end.column + col
                raise Exception(
                    f"Closing tag {begin[1]!r} was not closed, maybe it is missing a '>' symbol"
                )
            attributes = self.__parse_attributes(source[: end[0]])

        # NOTE(review): the column is always added to the previous end column, even when
        # the tag spans several lines (compare `__calc_col`). Kept as is; confirm intent.
        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        position.end.column = position.end.column + col

        return source[end[0] + len(end[1]) :], begin, attributes, end, elem

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if auto_closing:
            return name in self_closing
        return False  # pragma: no cover

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): Whether tags listed in `self_closing` are treated as
                closed without an explicit `/>` or closing tag.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: When tags are unbalanced, left unclosed, or malformed.
        """

        # Reset all per-parse state. `in_pre` has to be reset too, otherwise a previous
        # parse that failed inside a `pre` element would leak into this one.
        self.tag_stack = []
        self.in_pre = 0
        current = AST()
        position = Position((0, 0), (0, 0))

        while current is not None and (match := RE.tag_start.search(source)) is not None:
            source, begin, attr, end, elem = self.__parse_tag(source, position, match)

            if elem is not None:
                current.append(elem)

            if begin[2]["comment"] is not None:
                current.append(
                    Literal(
                        LiteralType.Comment,
                        str(attr["data"]),
                        position=Position.from_pos(position),
                        in_pre=self.in_pre > 0,
                    ),
                )
            else:
                name = begin[2]["name"] or ""
                if begin[2]["opening"] == "/":
                    # Closing tag: it must match the most recently opened tag.
                    if len(self.tag_stack) == 0:
                        raise Exception(
                            f"Unbalanced tags: Tag was closed without first being opened at {position}",
                        )
                    elif name != self.tag_stack[-1]:
                        raise Exception(
                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}",
                        )

                    ptag = self.tag_stack.pop()
                    if ptag == "pre":
                        self.in_pre -= 1

                    if current.position is not None:
                        current.position.end.line = position.end.line
                        current.position.end.column = position.end.column

                    # Step back out to the element that contains the one just closed.
                    current = current.parent
                elif begin[2]["opening"] == "!":
                    current.append(
                        Element(
                            "doctype",
                            {"lang": attr.get("lang", "html")},
                            position=Position.from_pos(position),
                        ),
                    )
                elif (
                    end[2]["closing"] != "/"
                    and not self.is_self_closing(name, auto_close)
                    and begin[2]["opening"] is None
                ):
                    # Opening tag that expects children: descend into the new element.
                    self.tag_stack.append(name)
                    if name == "pre":
                        self.in_pre += 1
                    current.append(
                        Element(
                            name,
                            attr,
                            [],
                            position=Position.from_pos(position),
                            in_pre=self.in_pre > 0,
                        ),
                    )
                    if len(current) > 0:
                        current = current[-1]
                else:
                    # Self closing tag: added as a leaf, the current parent stays the same.
                    current.append(
                        Element(
                            name,
                            attr,
                            position=deepcopy(position),
                            in_pre=self.in_pre > 0,
                        ),
                    )

            # The next token starts where this one ended.
            position.start = Point(position.end.line, position.end.column)

        if len(source) > 0:
            # Trailing text after the last tag.
            elem = self.__parse_text(source, position)
            if (
                current is not None
                and isinstance(current, Parent)
                and current.children is not None
                and elem is not None
            ):
                current.append(elem)

        if len(self.tag_stack) > 0:
            raise Exception(
                f"The following tags where expected to be closed: {', '.join(repr(tag) for tag in self.tag_stack)}",
            )
        return current