forked from edsu/alto-words
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalto_words.py
executable file
·45 lines (35 loc) · 1.15 KB
/
alto_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python
"""
alto_words.py reads an ALTO OCR XML file and prints the ratio of
dictionary words to all words in the document.

Usage:

    alto_words.py example.xml
"""
import sys
import dbm
from xml.sax.handler import ContentHandler, feature_namespaces
from xml.sax import make_parser
def main(ocr_filename):
    """Parse an ALTO OCR XML file and print its dictionary-word ratio.

    Args:
        ocr_filename: Path (or other source accepted by the SAX parser's
            ``parse``) of the ALTO XML document to analyse.

    The value printed is ``handler.ratio`` as computed by ``AltoHandler``.
    """
    handler = AltoHandler()
    try:
        parser = make_parser()
        parser.setContentHandler(handler)
        # Namespace processing is turned off so the parser reports elements
        # through startElement(), which is the callback AltoHandler implements.
        parser.setFeature(feature_namespaces, 0)
        parser.parse(ocr_filename)
    finally:
        # Release the dbm handle opened by the handler even if parsing fails.
        handler.dictionary.close()
    print(handler.ratio)
class AltoHandler(ContentHandler):
    """SAX handler that tallies dictionary words in an ALTO OCR document.

    Every ``String`` element's ``CONTENT`` attribute is lowercased and
    recorded in ``words``; it is also recorded in ``dictionary_words`` when
    the dictionary database holds a truthy value for it.
    """

    def __init__(self, dictionary_path='dictionary.db'):
        """Open the dictionary database and reset the word tallies.

        Args:
            dictionary_path: File name passed to ``dbm.open``. Defaults to
                ``'dictionary.db'``.
        """
        ContentHandler.__init__(self)
        self.dictionary = dbm.open(dictionary_path)
        self.dictionary_words = []
        self.words = []

    def startElement(self, tag, attrs):
        """Record the word carried by a ``String`` element.

        Elements with any other tag name, and ``String`` elements that have
        no ``CONTENT`` attribute, are ignored.
        """
        if tag != 'String':
            return
        content = attrs.get("CONTENT")
        if content is None:
            # Nothing to count; previously this crashed on None.lower().
            return
        word = content.lower()
        # The dictionary is looked up with bytes keys, so encode the str word.
        if self.dictionary.get(word.encode("utf-8")):
            self.dictionary_words.append(word)
        self.words.append(word)

    @property
    def ratio(self):
        """Fraction of recorded words found in the dictionary.

        Returns 0.0 when no words were recorded, instead of raising
        ZeroDivisionError.
        """
        if not self.words:
            return 0.0
        return len(self.dictionary_words) / float(len(self.words))
if __name__ == "__main__":
    # Exit with a usage message rather than an IndexError traceback when the
    # OCR file argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: alto_words.py example.xml")
    filename = sys.argv[1]
    main(filename)