1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
|
from __future__ import absolute_import, division, unicode_literals
from pip._vendor.six import text_type
import gettext
_ = gettext.gettext
from ..constants import voidElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
class TreeWalker(object):
def __init__(self, tree):
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False):
assert namespace is None or isinstance(namespace, text_type), type(namespace)
assert isinstance(name, text_type), type(name)
assert all((namespace is None or isinstance(namespace, text_type)) and
isinstance(name, text_type) and
isinstance(value, text_type)
for (namespace, name), value in attrs.items())
yield {"type": "EmptyTag", "name": name,
"namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error(_("Void element has children"))
def startTag(self, namespace, name, attrs):
assert namespace is None or isinstance(namespace, text_type), type(namespace)
assert isinstance(name, text_type), type(name)
assert all((namespace is None or isinstance(namespace, text_type)) and
isinstance(name, text_type) and
isinstance(value, text_type)
for (namespace, name), value in attrs.items())
return {"type": "StartTag",
"name": name,
"namespace": namespace,
"data": attrs}
def endTag(self, namespace, name):
assert namespace is None or isinstance(namespace, text_type), type(namespace)
assert isinstance(name, text_type), type(namespace)
return {"type": "EndTag",
"name": name,
"namespace": namespace,
"data": {}}
def text(self, data):
assert isinstance(data, text_type), type(data)
data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
yield {"type": "SpaceCharacters", "data": left}
data = middle
middle = data.rstrip(spaceCharacters)
right = data[len(middle):]
if middle:
yield {"type": "Characters", "data": middle}
if right:
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
assert isinstance(data, text_type), type(data)
return {"type": "Comment", "data": data}
def doctype(self, name, publicId=None, systemId=None, correct=True):
assert name is None or isinstance(name, text_type), type(name)
assert publicId is None or isinstance(publicId, text_type), type(publicId)
assert systemId is None or isinstance(systemId, text_type), type(systemId)
return {"type": "Doctype",
"name": name if name is not None else "",
"publicId": publicId,
"systemId": systemId,
"correct": correct}
def entity(self, name):
assert isinstance(name, text_type), type(name)
return {"type": "Entity", "name": name}
def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType)
class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node):
raise NotImplementedError
def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements:
for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token
else:
yield self.startTag(name, attrs)
if hasChildren:
for token in self.walkChildren(node):
yield token
yield self.endTag(name)
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node):
raise NotImplementedError
def getFirstChild(self, node):
raise NotImplementedError
def getNextSibling(self, node):
raise NotImplementedError
def getParentNode(self, node):
raise NotImplementedError
def __iter__(self):
currentNode = self.tree
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
if type == DOCTYPE:
yield self.doctype(*details)
elif type == TEXT:
for token in self.text(*details):
yield token
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
hasChildren = False
else:
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT:
hasChildren = True
else:
yield self.unknown(details[0])
if hasChildren:
firstChild = self.getFirstChild(currentNode)
else:
firstChild = None
if firstChild is not None:
currentNode = firstChild
else:
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
break
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
else:
currentNode = self.getParentNode(currentNode)
|