phml.core.formats.parse
Pythonic Hypertext Markup Language (phml) parser.
"""Pythonic Hypertext Markup Language (phml) parser."""
from copy import deepcopy
from operator import itemgetter
import re

from phml.core.nodes import (
    AST,
    Comment,
    DocType,
    Element,
    Point,
    Position,
    Root,
    Text,
    Node
)


def parse_hypertest_markup(data: str, class_name: str, auto_close: bool = True) -> AST:
    """Parse a string as a hypertext markup document.

    Args:
        data (str): The markup source to parse.
        class_name (str): Name inserted into the error message when `data` is rejected.
        auto_close (bool): Forwarded unchanged to `HypertextMarkupParser.parse`.

    Returns:
        AST: Whatever `HypertextMarkupParser.parse` produces for `data`.

    Raises:
        Exception: If `data` is not a `str`.
    """
    if not isinstance(data, str):
        raise Exception(f"Data passed to {class_name}.parse must be a str")

    return HypertextMarkupParser().parse(data, auto_close=auto_close)


def strip_blank_lines(data_lines: list[str]) -> list[str]:
    """Strip the blank lines at the start and end of a list.

    Every `"\\r\\n"` inside a line is rewritten to `"\\n"` first. A line counts as blank when
    it is empty or whitespace only. Blank lines between non-blank lines are kept.

    Args:
        data_lines (list[str]): The lines to trim. The list itself is not modified.

    Returns:
        list[str]: A new list running from the first to the last non-blank line, or an
        empty list when every line is blank.
    """
    normalized = [line.replace("\r\n", "\n") for line in data_lines]
    content = [index for index, line in enumerate(normalized) if line.strip() != ""]
    if not content:
        return []
    return normalized[content[0]: content[-1] + 1]


def strip(data: str, cur_tags: list[str]) -> str:
    """Strip leading and trailing blank lines from a possibly multiline string.

    The text is returned untouched when `"pre"` is one of the currently open tags, and also
    when it contains no newline.

    Args:
        data (str): The text to clean up.
        cur_tags (list[str]): Names of the tags that are currently open.

    Returns:
        str: The text without leading and trailing blank lines. Only a string is returned;
        no line or column information is calculated.
    """
    if "pre" in cur_tags:
        return data

    data_lines = data.split("\n")
    if len(data_lines) > 1:
        return "\n".join(strip_blank_lines(data_lines))
    return data


self_closing = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
    "command",
    "keygen",
    "menuitem",
    "Slot",
]


# Main form of tokenization
class RE:
    """Compiled regular expressions used to tokenize markup."""

    tag_start = re.compile(r"(?P<comment><!--)|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)")
    """Matches the start of a tag `<!name|</name|<name`"""

    tag_end = re.compile(r"(?P<closing>/?)>")
    """Matches the end of a tag `/>|>`."""

    # Lazy quantifier so each comment is matched on its own instead of everything between the
    # first `<!--` and the last `-->`. `[\s\S]` matches the same characters as `(?:.|\s)`
    # without the overlapping alternation that backtracks exponentially on whitespace.
    comment = re.compile(r"<!--([\s\S]*?)-->")
    """Matches a single html style comment `<!--Comment-->`."""
    comment_close = re.compile(r"-->")
    """Matches the end of a comment `-->`."""

    attribute = re.compile(r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"]+)))?")
    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""

    bracket_attributte = re.compile(r"^\s*\{([\s\S]*)\/\}\s*$")
    """Matches a value wrapped in `{ ... /}`; group 1 is the text between the brackets."""


class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    tag_stack = []
    """Current stack of tags in order of when they are opened."""
    # `parse` assigns a fresh list to the instance on every call, so this class level list is
    # only a default and is never mutated by the methods below.

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to the starting point in the
        source string.

        Returns:
            tuple[int, int]: The number of newlines in `source[:start]` and the length of
            the text after the last of those newlines.
        """
        prefix = source[:start]
        return prefix.count("\n"), len(prefix.rsplit("\n", 1)[-1])

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text_comment(self, text: str, pos: Position) -> list[Node]:
        """Parse the comments and general text found in the provided source.

        Args:
            text (str): The text to split into text and comment nodes.
            pos (Position): Running position; its start and end are updated in place.

        Returns:
            list[Node]: The `Text` and `Comment` nodes in the order they were found.
        """

        elements = []  # List of text and comment elements.

        # For each comment add it to the list of elements
        while (comment := RE.comment.search(text)) is not None:
            line_s, col_s = self.__calc_line_col(text, comment.start())
            line_e, col_e = self.__calc_line_col(comment.group(0), len(comment.group(0)))

            pos.start = Point(
                pos.start.line + line_s,
                self.__calc_col(line_s, col_s, pos.start.column)
            )
            pos.end = Point(
                pos.start.line + line_e,
                self.__calc_col(line_e, col_e, pos.start.column)
            )

            # If there is text in front of the comment then add a text element.
            # NOTE(review): this text node is given the same position as the comment.
            if comment.start() > 0:
                elements.append(Text(
                    text[:comment.span()[0]],
                    position=deepcopy(pos)
                ))

            text = text[comment.span()[1]:]
            elements.append(
                Comment(comment.group(1), position=deepcopy(pos))
            )

        # remaining text is added as a text element
        if len(text) > 0:
            line, col = self.__calc_line_col(text, len(text))
            pos.start.line += line
            pos.start.column = col

            elements.append(Text(
                text,
                position=Position(
                    deepcopy(pos.end),
                    (pos.end.line + line, self.__calc_col(line, col, pos.end.column))
                )
            ))
        return elements

    def __parse_attributes(self, attrs: str) -> dict:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Example:
            `<name (attributes)>`

        Returns:
            dict: Attribute names mapped to their values. `yes`, `true` and a missing value
            become `True`; `no` and `false` become `False`. Names of `{.../}` valued
            attributes are prefixed with `:`.
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            (
                name,
                value,
                double,
                single,
                no_bracket
            ) = itemgetter('name', 'value', 'double', 'single', 'open')(attr.groupdict())

            bracketed = RE.bracket_attributte.match(value) if value is not None else None
            if bracketed is not None:
                if not name.startswith(":"):
                    name = ":" + name
                value = bracketed.group(1)
            else:
                # An empty quoted value is falsy, so it falls through to None here and is
                # turned into True below.
                value = double or single or no_bracket

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        The caller must make sure `RE.tag_start` matches somewhere in `source`.

        Returns:
            tuple: The source left after the tag, the tag start as `(index, text, groups)`,
            the parsed attributes, the tag end as `(index, text, groups)` and the nodes
            found in front of the tag.

        Raises:
            Exception: If a comment or a tag is never closed.
        """
        begin = RE.tag_start.search(source)
        begin = (begin.start(), begin.group(0), begin.groupdict())

        elems = []
        if begin[0] > 0:
            elems = self.__parse_text_comment(source[:begin[0]], position)
        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]):]

        if begin[2]["comment"] is not None:
            end = RE.comment_close.search(source)
            if end is None:
                raise Exception("Comment was not closed")
            end = (end.start(), end.group(0), end.groupdict())
            attributes = {"data": source[:end[0]]}
        else:
            # `opening2` is set for a closing tag without a name (`</>`).
            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]

            # NOTE(review): the first `>` ends the tag, even one inside an attribute value.
            end = RE.tag_end.search(source)
            if end is None:
                raise Exception(f"Expected tag {begin} to be closed with symbol '>'. Was not closed.")
            end = (end.start(), end.group(0), end.groupdict())
            attributes = self.__parse_attributes(source[:end[0]])

        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        position.end.column = position.end.column + col

        return source[end[0] + len(end[1]):], begin, attributes, end, elems

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        return name in self_closing if auto_closing else False

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): Passed to `is_self_closing`; when True, tag names listed in
                `self_closing` never open a new nesting level.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: If a closing tag does not match the most recently opened tag, if a
                closing tag is found while no tag is open, or if a tag or comment is never
                closed.
        """

        self.tag_stack = []
        current = Root()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None:
            source, begin, attr, end, elems = self.__parse_tag(source, position)

            # Text found in front of the tag belongs to the node that is currently open.
            if len(elems) > 0:
                current.extend(elems)

            if begin[2]["comment"] is not None:
                current.append(Comment(attr["data"], position=deepcopy(position)))
            else:
                name = begin[2]["name"] or ''
                if begin[2]["opening"] == "/":
                    # Without this guard a stray closing tag fails with a bare IndexError.
                    if len(self.tag_stack) == 0:
                        raise Exception(
                            f"Unbalanced tags: {name!r} closes a tag that was never opened "
                            f"at {position}"
                        )
                    if name != self.tag_stack[-1]:
                        raise Exception(
                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}"
                        )

                    self.tag_stack.pop()
                    current.position.end.line = position.end.line
                    current.position.end.column = position.end.column

                    current = current.parent
                elif begin[2]["opening"] == "!":
                    current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
                elif (
                    end[2]["closing"] != "/"
                    and not self.is_self_closing(name, auto_close)
                    and begin[2]["opening"] is None
                ):
                    self.tag_stack.append(name)
                    current.append(Element(name, attr, position=deepcopy(position)))
                    current = current.children[-1]
                else:
                    current.append(Element(name, attr, position=deepcopy(position), startend=True))

            # The next node starts where this tag ended.
            position.start = deepcopy(position.end)

        if len(source) > 0:
            elems = self.__parse_text_comment(source, position)
            current.extend(elems)

        # NOTE(review): when tags are left open, `current` is the innermost open element
        # rather than the root - confirm whether that is intended.
        return AST(current)
def
parse_hypertest_markup( data: str, class_name: str, auto_close: bool = True) -> phml.core.nodes.AST.AST:
def parse_hypertest_markup(data: str, class_name: str, auto_close: bool = True) -> AST:
    """Turn a string of hypertext markup into a phml AST.

    Args:
        data (str): The markup source to parse.
        class_name (str): Name inserted into the error message when `data` is rejected.
        auto_close (bool): Forwarded unchanged to `HypertextMarkupParser.parse`.

    Returns:
        AST: Whatever `HypertextMarkupParser.parse` produces for `data`.

    Raises:
        Exception: If `data` is not a `str`.
    """
    if not isinstance(data, str):
        raise Exception(f"Data passed to {class_name}.parse must be a str")

    return HypertextMarkupParser().parse(data, auto_close=auto_close)
Parse a string as a hypertext markup document.
def
strip_blank_lines(data_lines: list[str]) -> list[str]:
def strip_blank_lines(data_lines: list[str]) -> list[str]:
    """Drop the blank lines that lead and trail a list of lines.

    Every `"\\r\\n"` inside a line is rewritten to `"\\n"` first. A line counts as blank when
    it is empty or whitespace only. Blank lines between non-blank lines are kept.

    Args:
        data_lines (list[str]): The lines to trim. The list itself is not modified.

    Returns:
        list[str]: A new list running from the first to the last non-blank line, or an
        empty list when every line is blank.
    """
    normalized = [line.replace("\r\n", "\n") for line in data_lines]

    # Positions of all lines that carry content; the first and last mark the slice to keep.
    content = [index for index, line in enumerate(normalized) if line.strip() != ""]
    if not content:
        return []

    return normalized[content[0]: content[-1] + 1]
Strip the blank lines at the start and end of a list.
def
strip(data: str, cur_tags: list[str]) -> tuple[str, int, int]:
def strip(data: str, cur_tags: list[str]) -> str:
    """Strip leading and trailing blank lines from a possibly multiline string.

    The text is returned untouched when `"pre"` is one of the currently open tags, and also
    when it contains no newline.

    Args:
        data (str): The text to clean up.
        cur_tags (list[str]): Names of the tags that are currently open.

    Returns:
        str: The text without leading and trailing blank lines. Only a string is returned;
        no line or column information is calculated.
    """
    if "pre" in cur_tags:
        return data

    data_lines = data.split("\n")
    # Only multiline blocks can have blank lines to trim; a single line is returned as is.
    if len(data_lines) > 1:
        return "\n".join(strip_blank_lines(data_lines))
    return data
This function takes a possibly multiline string and strips leading and trailing blank lines. Given the current position it will also calculate the line and column that the data ends at.
class
RE:
class RE:
    """Compiled regular expressions used to tokenize markup."""

    tag_start = re.compile(r"(?P<comment><!--)|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)")
    """Matches the start of a tag `<!name|</name|<name`"""

    tag_end = re.compile(r"(?P<closing>/?)>")
    """Matches the end of a tag `/>|>`."""

    # Lazy quantifier so each comment is matched on its own instead of everything between the
    # first `<!--` and the last `-->`. `[\s\S]` matches the same characters as `(?:.|\s)`
    # without the overlapping alternation that backtracks exponentially on whitespace.
    comment = re.compile(r"<!--([\s\S]*?)-->")
    """Matches a single html style comment `<!--Comment-->`."""
    comment_close = re.compile(r"-->")
    """Matches the end of a comment `-->`."""

    attribute = re.compile(r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"]+)))?")
    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""

    bracket_attributte = re.compile(r"^\s*\{([\s\S]*)\/\}\s*$")
    """Matches a value wrapped in `{ ... /}`; group 1 is the text between the brackets."""
class
HypertextMarkupParser:
class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    tag_stack = []
    """Current stack of tags in order of when they are opened."""
    # `parse` assigns a fresh list to the instance on every call, so this class level list is
    # only a default and is never mutated by the methods below.

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to the starting point in the
        source string.

        Returns:
            tuple[int, int]: The number of newlines in `source[:start]` and the length of
            the text after the last of those newlines.
        """
        prefix = source[:start]
        return prefix.count("\n"), len(prefix.rsplit("\n", 1)[-1])

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text_comment(self, text: str, pos: Position) -> list[Node]:
        """Parse the comments and general text found in the provided source.

        Args:
            text (str): The text to split into text and comment nodes.
            pos (Position): Running position; its start and end are updated in place.

        Returns:
            list[Node]: The `Text` and `Comment` nodes in the order they were found.
        """

        elements = []  # List of text and comment elements.

        # For each comment add it to the list of elements
        while (comment := RE.comment.search(text)) is not None:
            line_s, col_s = self.__calc_line_col(text, comment.start())
            line_e, col_e = self.__calc_line_col(comment.group(0), len(comment.group(0)))

            pos.start = Point(
                pos.start.line + line_s,
                self.__calc_col(line_s, col_s, pos.start.column)
            )
            pos.end = Point(
                pos.start.line + line_e,
                self.__calc_col(line_e, col_e, pos.start.column)
            )

            # If there is text in front of the comment then add a text element.
            # NOTE(review): this text node is given the same position as the comment.
            if comment.start() > 0:
                elements.append(Text(
                    text[:comment.span()[0]],
                    position=deepcopy(pos)
                ))

            text = text[comment.span()[1]:]
            elements.append(
                Comment(comment.group(1), position=deepcopy(pos))
            )

        # remaining text is added as a text element
        if len(text) > 0:
            line, col = self.__calc_line_col(text, len(text))
            pos.start.line += line
            pos.start.column = col

            elements.append(Text(
                text,
                position=Position(
                    deepcopy(pos.end),
                    (pos.end.line + line, self.__calc_col(line, col, pos.end.column))
                )
            ))
        return elements

    def __parse_attributes(self, attrs: str) -> dict:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Example:
            `<name (attributes)>`

        Returns:
            dict: Attribute names mapped to their values. `yes`, `true` and a missing value
            become `True`; `no` and `false` become `False`. Names of `{.../}` valued
            attributes are prefixed with `:`.
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            (
                name,
                value,
                double,
                single,
                no_bracket
            ) = itemgetter('name', 'value', 'double', 'single', 'open')(attr.groupdict())

            bracketed = RE.bracket_attributte.match(value) if value is not None else None
            if bracketed is not None:
                if not name.startswith(":"):
                    name = ":" + name
                value = bracketed.group(1)
            else:
                # An empty quoted value is falsy, so it falls through to None here and is
                # turned into True below.
                value = double or single or no_bracket

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        The caller must make sure `RE.tag_start` matches somewhere in `source`.

        Returns:
            tuple: The source left after the tag, the tag start as `(index, text, groups)`,
            the parsed attributes, the tag end as `(index, text, groups)` and the nodes
            found in front of the tag.

        Raises:
            Exception: If a comment or a tag is never closed.
        """
        begin = RE.tag_start.search(source)
        begin = (begin.start(), begin.group(0), begin.groupdict())

        elems = []
        if begin[0] > 0:
            elems = self.__parse_text_comment(source[:begin[0]], position)
        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]):]

        if begin[2]["comment"] is not None:
            end = RE.comment_close.search(source)
            if end is None:
                raise Exception("Comment was not closed")
            end = (end.start(), end.group(0), end.groupdict())
            attributes = {"data": source[:end[0]]}
        else:
            # `opening2` is set for a closing tag without a name (`</>`).
            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]

            # NOTE(review): the first `>` ends the tag, even one inside an attribute value.
            end = RE.tag_end.search(source)
            if end is None:
                raise Exception(f"Expected tag {begin} to be closed with symbol '>'. Was not closed.")
            end = (end.start(), end.group(0), end.groupdict())
            attributes = self.__parse_attributes(source[:end[0]])

        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        position.end.column = position.end.column + col

        return source[end[0] + len(end[1]):], begin, attributes, end, elems

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        return name in self_closing if auto_closing else False

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): Passed to `is_self_closing`; when True, tag names listed in
                `self_closing` never open a new nesting level.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: If a closing tag does not match the most recently opened tag, if a
                closing tag is found while no tag is open, or if a tag or comment is never
                closed.
        """

        self.tag_stack = []
        current = Root()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None:
            source, begin, attr, end, elems = self.__parse_tag(source, position)

            # Text found in front of the tag belongs to the node that is currently open.
            if len(elems) > 0:
                current.extend(elems)

            if begin[2]["comment"] is not None:
                current.append(Comment(attr["data"], position=deepcopy(position)))
            else:
                name = begin[2]["name"] or ''
                if begin[2]["opening"] == "/":
                    # Without this guard a stray closing tag fails with a bare IndexError.
                    if len(self.tag_stack) == 0:
                        raise Exception(
                            f"Unbalanced tags: {name!r} closes a tag that was never opened "
                            f"at {position}"
                        )
                    if name != self.tag_stack[-1]:
                        raise Exception(
                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}"
                        )

                    self.tag_stack.pop()
                    current.position.end.line = position.end.line
                    current.position.end.column = position.end.column

                    current = current.parent
                elif begin[2]["opening"] == "!":
                    current.append(DocType(attr.get("lang", "html"), position=deepcopy(position)))
                elif (
                    end[2]["closing"] != "/"
                    and not self.is_self_closing(name, auto_close)
                    and begin[2]["opening"] is None
                ):
                    self.tag_stack.append(name)
                    current.append(Element(name, attr, position=deepcopy(position)))
                    current = current.children[-1]
                else:
                    current.append(Element(name, attr, position=deepcopy(position), startend=True))

            # The next node starts where this tag ended.
            position.start = deepcopy(position.end)

        if len(source) > 0:
            elems = self.__parse_text_comment(source, position)
            current.extend(elems)

        # NOTE(review): when tags are left open, `current` is the innermost open element
        # rather than the root - confirm whether that is intended.
        return AST(current)
Parse html/xml like source code strings.
def
is_self_closing(self, name: str, auto_closing: bool) -> bool:
240 def is_self_closing(self, name: str, auto_closing: bool) -> bool: 241 """Check if the tag is self closing. Only check if auto_closing is toggled on.""" 242 243 if auto_closing: 244 return name in self_closing 245 return False
Check if the tag is self closing. Only check if auto_closing is toggled on.
247 def parse(self, source: str, auto_close: bool = True) -> Root: 248 """Parse a given html or phml string into it's corresponding phml ast. 249 250 Args: 251 source (str): The html or phml source to parse. 252 253 Returns: 254 AST: A phml AST representing the parsed code source. 255 """ 256 257 self.tag_stack = [] 258 current = Root() 259 position = Position((0, 0), (0, 0)) 260 261 while RE.tag_start.search(source) is not None: 262 source, begin, attr, end, elems = self.__parse_tag(source, position) 263 264 if len(elems) > 0: 265 current.extend(elems) 266 267 if begin[2]["comment"] is not None: 268 current.append(Comment(attr["data"], position=deepcopy(position))) 269 else: 270 name = begin[2]["name"] or '' 271 if begin[2]["opening"] == "/": 272 if name != self.tag_stack[-1]: 273 print("Tag Stack", self.tag_stack) 274 raise Exception( 275 f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}" 276 ) 277 278 self.tag_stack.pop() 279 current.position.end.line = position.end.line 280 current.position.end.column = position.end.column 281 282 current = current.parent 283 elif begin[2]["opening"] == "!": 284 current.append(DocType(attr.get("lang", "html"), position=deepcopy(position))) 285 elif ( 286 end[2]["closing"] != "/" 287 and not self.is_self_closing(name, auto_close) 288 and begin[2]["opening"] is None 289 ): 290 self.tag_stack.append(name) 291 current.append(Element(name, attr, position=deepcopy(position))) 292 current = current.children[-1] 293 else: 294 current.append(Element(name, attr, position=deepcopy(position), startend=True)) 295 296 position.start = deepcopy(position.end) 297 298 if len(source) > 0: 299 elems = self.__parse_text_comment(source, position) 300 current.extend(elems) 301 302 return AST(current)
Parse a given html or phml string into its corresponding phml ast.
Args
- source (str): The html or phml source to parse.
Returns
AST: A phml AST representing the parsed code source.