-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpull_parser_test.py
70 lines (60 loc) · 2.11 KB
/
pull_parser_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import re
from xml.dom.minidom import Element, Text
from xml.dom.pulldom import CHARACTERS, START_ELEMENT, parseString, END_ELEMENT
"""
XML pull parser and minidom demo
@author: Ronald Haentjens Dekker
"""
class Stack(list):
    """A ``list`` with stack-style ``push`` and ``peek`` helpers.

    The last list element is the top of the stack; ``pop`` is inherited
    from ``list`` unchanged.
    """

    def push(self, item):
        """Place *item* on top of the stack."""
        self.append(item)

    def peek(self):
        """Return the top item without removing it.

        Raises:
            IndexError: if the stack is empty.
        """
        if not self:
            # Same exception type as indexing an empty list, but with a
            # message that points at the actual misuse.
            raise IndexError("peek from empty stack")
        return self[-1]
# Compiled once at import time. A token is either a run of punctuation
# characters (. ? ! , ; :) or a run of characters that are neither
# punctuation nor whitespace; in both cases the token also takes any
# whitespace that immediately follows it.
_TOKEN_PATTERN = re.compile(r"[.?!,;:]+\s*|[^.?!,;:\s]+\s*")


def tokenize(contents):
    """Split *contents* into word and punctuation tokens.

    Each returned token keeps the whitespace that directly follows it.
    Whitespace that is not preceded by a token (for example at the very
    start of the string) is not matched and therefore not returned.

    Args:
        contents: The text to split.

    Returns:
        A list of token strings in order of appearance; empty when
        *contents* is empty or consists of whitespace only.
    """
    return _TOKEN_PATTERN.findall(contents)
# Demo driver: stream the sample <rdg> reading through the pull parser and
# rebuild it under a fresh <output> element. The <rdg> wrapper is dropped,
# <add> is flattened into a start/end pair of empty marker elements, and text
# is split into one token per line.
source = """<rdg wit="#ipa">рускаꙗ землѧ <lb/>
<add place="margin">и хто в неи почалъ пѣрвѣе кнѧжи<hi rend="sup"
>т</hi></add>·:·</rdg>"""

# Whitespace other than newlines. Written as a raw string so that ``\S`` is a
# regex escape rather than an invalid string escape sequence.
_NON_NEWLINE_WHITESPACE = re.compile(r"[^\S\n]+")

# init input
doc = parseString(source)

# init output
output = Element("output")
# The top of this stack is the element that currently receives new children.
open_elements = Stack()
open_elements.push(output)

for event, node in doc:
    # debug
    # print(event, node)
    if event == START_ELEMENT:
        # skip rdg element
        if node.localName == "rdg":
            continue
        is_add = node.localName == "add"
        if is_add:
            # in case of add deal with overlapping hierarchies: mark this
            # node as the start marker
            node.setAttribute("type", "start")
        open_elements.peek().appendChild(node)
        if not is_add:
            # Regular elements become the parent of what follows. <add> is
            # deliberately not pushed, so its content ends up as siblings of
            # the start marker instead of as its children.
            open_elements.push(node)
    elif event == END_ELEMENT:
        # skip rdg element
        if node.localName == "rdg":
            continue
        if node.localName == "add":
            # in case of add deal with overlapping hierarchies: append a
            # shallow clone of the node as the matching end marker
            clone = node.cloneNode(False)
            clone.setAttribute("type", "end")
            open_elements.peek().appendChild(clone)
        else:
            open_elements.pop()
    elif event == CHARACTERS:
        tokens = tokenize(node.data)
        if tokens:
            # one token per line
            textdata = "\n".join(tokens)
        else:
            # No tokens were found: keep only the newlines of the text.
            textdata = _NON_NEWLINE_WHITESPACE.sub("", node.data)
        t = Text()
        t.data = textdata
        open_elements.peek().appendChild(t)

print(output.toxml())