Coverage for phml\utilities\transform\sanitize\clean.py: 100%

58 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-04-06 15:05 -0500

1from re import match 

2 

3from phml.nodes import Element, Parent 

4 

5from .schema import Schema 

6 

7 

def sanatize(tree: Parent, schema: Schema = Schema()):
    """Sanitize elements and attributes in the phml tree.

    Should be used when working with data from an unknown source. It should be
    run on an AST that has already been compiled to html so that no unknown
    values are left unchecked.

    By default the sanitization schema uses the github schema and follows the
    hast sanitize utility.

    * [github schema](https://github.com/syntax-tree/hast-util-sanitize/blob/main/lib/schema.js)
    * [hast sanitize](https://github.com/syntax-tree/hast-util-sanitize)

    The passes run in this order: strip listed tags, drop elements whose tag is
    not allowed, strip again, drop elements missing a required ancestor, drop
    disallowed attributes, and finally enforce required attributes.

    Note:
        This utility will edit the tree in place.

    Args:
        tree (Parent): The root of the tree that will be sanitized.
        schema (Schema, optional): User defined schema. Defaults to github schema.
    """

    # NOTE(review): presumably imported here to avoid a circular import -- confirm.
    from phml.utilities import (  # pylint: disable=import-outside-toplevel
        is_element,
        remove_nodes,
    )

    for strip in schema.strip:
        remove_nodes(tree, ["element", {"tag": strip}])

    def check_protocols(value: str, protocols: list[str]) -> bool:
        """Return True when `value` starts with one of `protocols` followed by `:`.

        A plain prefix test is used instead of a regex alternation: in
        `a|b|c:.*` the colon binds only to the last alternative, so any value
        that merely started with an earlier protocol name was accepted.
        """
        # NOTE(review): values without a protocol (relative urls, `#anchor`)
        # are still rejected here, unlike hast-util-sanitize -- confirm intent.
        return any(value.startswith(f"{protocol}:") for protocol in protocols)

    def recurse_check_tag(node: Parent):
        """Remove every element that `is_element` does not match against `schema.tag_names`."""
        # Iterate over a copy because children are removed while looping.
        for child in list(node):
            if isinstance(child, Element) and not is_element(child, schema.tag_names):
                node.remove(child)
            elif isinstance(child, Parent):
                recurse_check_tag(child)

    def recurse_check_ancestor(node: Parent, ancestor_tags: tuple[str, ...] = ()):
        """Remove elements that lack one of their required ancestors.

        `ancestor_tags` holds the tags of every element enclosing `node`'s
        children (including `node` itself when it is an element). An element
        listed in `schema.ancestors` is kept when ANY enclosing element has an
        allowed tag, not only the direct parent, so that e.g. a `td` inside a
        `tr` inside a `table` satisfies a `td -> table` rule.
        """
        # Iterate over a copy because children are removed while looping.
        for child in list(node):
            if not isinstance(child, Element):
                continue

            if child.tag in schema.ancestors and not any(
                tag in schema.ancestors[child.tag] for tag in ancestor_tags
            ):
                node.remove(child)
            else:
                recurse_check_ancestor(child, (*ancestor_tags, child.tag))

    def build_remove_attr_list(
        properties: dict,
        attributes: dict[str, tuple[str | bool, ...]],
        valid_attributes: list,
    ):
        """Build the list of attributes to remove from a dict of attributes.

        Args:
            properties: The element's current attributes.
            attributes: Attribute name -> tuple of the only values allowed for it.
            valid_attributes: Names of every attribute allowed on the element.

        Returns:
            The names of the attributes in `properties` that must be removed.
        """
        result = []
        for attribute, value in properties.items():
            if attribute not in valid_attributes:
                result.append(attribute)
            elif (
                isinstance(value, str)
                and attribute in schema.protocols
                and not check_protocols(value, schema.protocols[attribute])
            ):
                result.append(attribute)
            # Membership test: the schema stores a tuple of allowed values, so
            # comparing the value to the tuple itself could never succeed.
            elif attribute in attributes and value not in attributes[attribute]:
                result.append(attribute)
        return result

    def recurse_check_attributes(node: Parent):
        """Strip disallowed attributes from every element below `node`.

        The allow-list of an element is its tag specific entry in
        `schema.attributes` plus the `"*"` entry. Elements without a tag
        specific entry are still filtered against the `"*"` entry instead of
        being skipped with all of their attributes intact.
        """
        wildcard = schema.attributes.get("*", [])
        for child in node:
            if not isinstance(child, Element):
                continue

            allowed = schema.attributes.get(child.tag, []) + wildcard
            pop_attrs = build_remove_attr_list(
                child.attributes,
                # Tuple entries are `(name, *allowed_values)`.
                {str(attr[0]): attr[1:] for attr in allowed if isinstance(attr, tuple)},
                [attr if isinstance(attr, str) else attr[0] for attr in allowed],
            )

            for attribute in pop_attrs:
                child.pop(attribute, None)

            recurse_check_attributes(child)

    def recurse_check_required(node: Parent):
        """Force the attributes listed in `schema.required` onto matching elements.

        A missing attribute is set to the schema value as is. A present one is
        overwritten with the lower-cased string form of a bool schema value, or
        with a str schema value when it differs.
        """
        for child in node:
            if not isinstance(child, Element):
                continue

            if child.tag in schema.required:
                for attr, value in schema.required[child.tag].items():
                    if attr not in child.attributes:
                        child[attr] = value
                    elif isinstance(value, bool):
                        child[attr] = str(value).lower()
                    elif isinstance(value, str) and child[attr] != value:
                        child[attr] = value

            # Always descend so that elements nested inside an element with
            # required attributes are checked as well.
            recurse_check_required(child)

    def recurse_strip(node):
        """Remove every element that `is_element` matches against `schema.strip`."""
        # Iterate over a copy because children are removed while looping.
        for child in list(node):
            if isinstance(child, Element) and is_element(child, schema.strip):
                node.remove(child)
            elif isinstance(child, Parent):
                recurse_strip(child)

    recurse_check_tag(tree)
    recurse_strip(tree)
    # When sanitizing a subtree rooted at an element, that element counts as
    # an ancestor of its children.
    recurse_check_ancestor(tree, (tree.tag,) if isinstance(tree, Element) else ())
    recurse_check_attributes(tree)
    recurse_check_required(tree)

143 recurse_check_required(tree)