Coverage for phml\parser.py: 100%

124 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-04-05 15:06 -0500

1"""Pythonic Hypertext Markup Language (phml) parser.""" 

2import re 

3from copy import deepcopy 

4from operator import itemgetter 

5 

6from .nodes import ( 

7 AST, 

8 Attribute, 

9 Element, 

10 Literal, 

11 LiteralType, 

12 Parent, 

13 Point, 

14 Position, 

15) 

16 

17 

def strip(data: str, cur_tags: list[str]) -> str:
    """Trim surrounding whitespace from `data` unless the tag context forbids it.

    The text is returned untouched when the most recently opened tag is
    `python`, `script` or `style`, or when a `pre` tag appears anywhere in the
    stack of open tags.

    Args:
        data (str): The possibly multiline text to clean up.
        cur_tags (list[str]): The currently open tags; the last entry is the
            tag that directly contains `data`.

    Returns:
        str: `data` unchanged in a whitespace sensitive context, otherwise
            `data.strip()`.
    """
    # With no open tags there is nothing that could protect the whitespace.
    if not cur_tags:
        return data.strip()

    innermost = cur_tags[-1]
    keep_whitespace = innermost in ("python", "script", "style") or "pre" in cur_tags
    return data if keep_whitespace else data.strip()

31 

32 

# Tag names that `HypertextMarkupParser.is_self_closing` treats as self closing.
self_closing = [
    # HTML void elements.
    "area", "base", "br", "col", "embed", "hr", "img",
    "input", "link", "meta", "param", "source", "track", "wbr",
    # Older HTML elements that are also written without a closing tag.
    "command", "keygen", "menuitem",
    # Capitalized names -- presumably phml components; verify against the docs.
    "Slot", "Markdown",
]

54 

55 

56# Main form of tokenization 

class RE:
    """Compiled regular expressions used to tokenize html/phml source."""

    # The three alternatives are, in order: a comment opener, a named tag
    # start and a nameless tag start.
    tag_start = re.compile(
        r"(?P<comment><!--)"
        r"|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)"
        r"|<(?P<opening2>/)?(?=\s+>|>)",
    )
    """Matches the start of a tag `<!name|</name|<name`"""

    tag_end = re.compile(r"(?P<closing>/?)>")
    """Matches the end of a tag `/>|>`."""

    comment = re.compile(r"<!--((?:.|\s)*)-->")
    """Matches all html style comments `<!--Comment-->`."""
    comment_close = re.compile(r"-->")

    # The optional value is one of: `{.../}`, a double quoted string, a single
    # quoted string or a bare word without quotes, whitespace or `>`.
    attribute = re.compile(
        r"(?P<name>[\w:\-@]+)"
        r"(?:=(?P<value>"
        r"\{(?P<curly>[^\}]*)\/\}"
        r"|\"(?P<double>[^\"]*)\""
        r"|'(?P<single>[^']*)'"
        r"|(?P<open>[^>'\"\s]+)"
        r"))?",
    )
    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""

    # Matches a whole string of the form `{.../}` with optional surrounding
    # whitespace and captures what is between the delimiters.
    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")

76 

77 

class HypertextMarkupParser:
    """Parse html/xml like source code strings."""

    tag_stack = []
    """Current stack of tags in order of when they are opened."""
    in_pre: int = 0
    """Number of `pre` elements currently open. A value above zero means the current
    element context is inside a pre element.
    """

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to the starting point in the
        source string.

        Args:
            source (str): The text to measure.
            start (int): Index into `source`; only `source[:start]` is measured.

        Returns:
            tuple[int, int]: The number of newlines in `source[:start]` and the number of
                characters that follow the last of those newlines.
        """
        head = source[:start]
        # `rfind` gives -1 when there is no newline, which makes the column the full length.
        return head.count("\n"), len(head) - (head.rfind("\n") + 1)

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text(self, text: str, pos: Position) -> Literal | None:
        """Parse the general text found in the provided source.

        Args:
            text (str): The raw text found before a tag or after the last tag.
            pos (Position): The running position. It is updated in place when a literal
                is produced.

        Returns:
            Literal | None: A text literal, or None when the text is empty after `strip`
                has been applied for the current tag stack.
        """
        content = strip(text, self.tag_stack)
        # NOTE(review): text that strips to nothing does not advance `pos`, even if it
        # contains newlines -- confirm this is intended.
        if content == "":
            return None

        line, col = self.__calc_line_col(text, len(text))
        pos.start.line += line
        pos.start.column = col

        pos.end.line += line
        pos.end.column = self.__calc_col(line, col, pos.end.column)
        return Literal(
            LiteralType.Text,
            content,
            position=Position.from_pos(pos),
            in_pre=self.in_pre > 0,
        )

    def __parse_attributes(self, attrs: str) -> dict[str, Attribute]:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Example:
            `<name (attributes)>`

        Returns:
            dict[str, Attribute]: Attribute names mapped to their values. The values
                `yes` and `true` and attributes without a value become True, `no` and
                `false` become False and everything else is kept as a string.
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            (name, double, single, no_bracket) = itemgetter(
                "name",
                "double",
                "single",
                "open",
            )(attr.groupdict())

            # An empty quoted value is falsy, so it ends up as None and therefore True.
            # NOTE(review): a `{.../}` value is matched by the pattern but not used here,
            # so it also becomes True -- confirm this is intended.
            value = double or single or no_bracket

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any text from the start of the source to the start of the tag.

        The caller must make sure that `RE.tag_start` matches somewhere in `source`.

        Args:
            source (str): The remaining source to parse.
            position (Position): The running position. It is updated in place.

        Returns:
            tuple: The source left after the tag, the tag start as
                `(index, text, groups)`, the parsed attributes (or `{"data": ...}` for a
                comment), the tag end as `(index, text, groups)` and the text literal
                found before the tag, if any.

        Raises:
            Exception: If a comment or a tag is not closed.
        """
        begin = RE.tag_start.search(source)
        begin = (begin.start(), begin.group(0), begin.groupdict())

        elem = None
        if begin[0] > 0:
            elem = self.__parse_text(source[: begin[0]], position)

        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]) :]

        if begin[2]["comment"] is not None:
            end = RE.comment_close.search(source)
            if end is None:
                raise Exception("Comment was not closed")
            end = (end.start(), end.group(0), end.groupdict())
            attributes: dict[str, Attribute] = {"data": source[: end[0]]}
        else:
            # A nameless tag start reports its `/` in a separate group; merge the two.
            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]
            end = RE.tag_end.search(source)
            if end is None:
                raise Exception(
                    f"Expected tag {begin[1]} to be closed with symbol '>'. Was not closed.",
                )
            end = (end.start(), end.group(0), end.groupdict())
            # Another `<` before the `>` of a closing tag means the `>` found belongs to
            # a later tag.
            if begin[2]["opening"] == "/" and "<" in source[: end[0]]:
                line, col = self.__calc_line_col(source, end[0] + len(end[1]))
                position.end.line = position.start.line + line
                position.end.column = position.end.column + col
                raise Exception(
                    f"Closing tag {begin[1]!r} was not closed, maybe it is missing a '>' symbol"
                )
            attributes = self.__parse_attributes(source[: end[0]])

        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        position.end.column = position.end.column + col

        return source[end[0] + len(end[1]) :], begin, attributes, end, elem

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if auto_closing:
            return name in self_closing
        return False  # pragma: no cover

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): When True, tags whose name is listed in `self_closing`
                are treated as self closing even without a trailing `/`.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: If a comment or tag is not closed, if a closing tag does not match
                the most recently opened tag, or if tags are still open at the end of
                the source.
        """

        # Reset all per parse state before anything else. A previous parse that raised
        # while inside a `pre` element would otherwise leave `in_pre` above zero.
        self.tag_stack = []
        self.in_pre = 0

        current = AST()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None and current is not None:
            source, begin, attr, end, elem = self.__parse_tag(source, position)

            if elem is not None:
                current.append(elem)

            if begin[2]["comment"] is not None:
                current.append(
                    Literal(
                        LiteralType.Comment,
                        str(attr["data"]),
                        position=Position.from_pos(position),
                        in_pre=self.in_pre > 0,
                    ),
                )
            else:
                name = begin[2]["name"] or ""
                if begin[2]["opening"] == "/":
                    # Closing tag: it must match the most recently opened tag.
                    if len(self.tag_stack) == 0:
                        raise Exception(
                            f"Unbalanced tags: Tag was closed without first being opened at {position}",
                        )
                    elif name != self.tag_stack[-1]:
                        raise Exception(
                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}",
                        )

                    ptag = self.tag_stack.pop()
                    if ptag == "pre":
                        self.in_pre -= 1

                    # The element being closed ends where its closing tag ends.
                    if current.position is not None:
                        current.position.end.line = position.end.line
                        current.position.end.column = position.end.column

                    current = current.parent
                elif begin[2]["opening"] == "!":
                    current.append(
                        Element(
                            "doctype",
                            {"lang": attr.get("lang", "html")},
                            position=Position.from_pos(position),
                        ),
                    )
                elif (
                    end[2]["closing"] != "/"
                    and not self.is_self_closing(name, auto_close)
                    and begin[2]["opening"] is None
                ):
                    # Opening tag with children: push it and descend into the new element.
                    self.tag_stack.append(name)
                    if name == "pre":
                        self.in_pre += 1
                    current.append(
                        Element(
                            name,
                            attr,
                            [],
                            position=Position.from_pos(position),
                            in_pre=self.in_pre > 0,
                        ),
                    )
                    if len(current) > 0:
                        current = current[-1]
                else:
                    # Self closing tag: it is added without children and is not pushed.
                    current.append(
                        Element(
                            name,
                            attr,
                            position=deepcopy(position),
                            in_pre=self.in_pre > 0,
                        ),
                    )

            # The next token starts where this one ended.
            position.start = Point(position.end.line, position.end.column)

        # Whatever is left after the last tag is trailing text.
        if len(source) > 0:
            elem = self.__parse_text(source, position)
            if (
                current is not None
                and isinstance(current, Parent)
                and current.children is not None
                and elem is not None
            ):
                current.append(elem)

        if len(self.tag_stack) > 0:
            raise Exception(
                f"The following tags where expected to be closed: {', '.join(repr(tag) for tag in self.tag_stack)}",
            )
        return current