-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathtreetagger2prolog.py
executable file
·78 lines (55 loc) · 2.3 KB
/
treetagger2prolog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright © 2011 University of Zürich
# Author: Rico Sennrich <[email protected]>
from __future__ import print_function, unicode_literals
import sys
import codecs
def spelling_variations(word):
    """Yield *word* plus respellings the morphology system may know instead.

    The ASCII transcriptions ``Ae/Oe/Ue/ae/oe/ue/ss`` are rewritten to
    ``Ä/Ö/Ü/ä/ö/ü/ß``.  For every transcription, each occurrence is
    replaced on its own (one occurrence per generated variant), and the
    rest of the word is kept intact.  Variants produced for one
    transcription are fed into the following ones, so replacements of
    different transcriptions combine (``Muesse`` also gives ``Müße``).

    The original word is always among the yielded values; the order of
    the yielded values is unspecified because they are collected in a set.
    """
    replacements = [('Ae', 'Ä'), ('Oe', 'Ö'), ('Ue', 'Ü'),
                    ('ae', 'ä'), ('oe', 'ö'), ('ue', 'ü'), ('ss', 'ß')]
    variants = {word}

    for old, new in replacements:
        # Scan a snapshot: the set grows while we are looking at it.
        for variant in list(variants):
            segments = variant.split(old)
            # There is one occurrence of `old` between each pair of
            # neighbouring segments.  Re-join everything left and right of
            # the chosen occurrence with `old` so that no part of the word
            # is lost when `old` occurs more than once.
            for cut in range(1, len(segments)):
                head = old.join(segments[:cut])
                tail = old.join(segments[cut:])
                variants.add(head + new + tail)

    for variant in variants:
        yield variant
def prolog_escape(word):
    """Return *word* with backslashes and single quotes backslash-escaped."""
    escaped = word
    # The backslash has to be handled first; otherwise the backslashes
    # inserted in front of quotes would themselves be doubled.
    for meta in ("\\", "'"):
        escaped = escaped.replace(meta, "\\" + meta)
    return escaped
def format_conversion(line, delimiter=None):
    """Convert one input line into a Prolog ``w(...)`` term.

    Args:
        line: either a line consisting of exactly two whitespace-separated
            fields (word and POS tag), or a bare newline, which is
            turned into an ``ENDOFSENTENCE`` term.
        delimiter: tag to put into the ``ENDOFSENTENCE`` term.  When
            omitted, the module-level ``sentdelim`` (set by the script's
            entry point from the command line) is used.

    Returns:
        A ``(word, term)`` tuple.  ``word`` is the unescaped word, or the
        empty string for a sentence boundary; ``term`` is the Prolog text.

    Raises:
        ValueError: if the line is not a bare newline and does not split
            into exactly two fields.  A message is written to stderr
            before the exception is re-raised.
    """
    if line == '\n':
        if delimiter is None:
            # Backward-compatible fallback for callers that rely on the
            # global configured by the command-line entry point.
            delimiter = sentdelim
        return '', "w('ENDOFSENTENCE','{0}',['._{0}'],'ENDOFSENTENCE').".format(delimiter)

    # Only the unpacking can legitimately fail here, and only with
    # ValueError; anything else (e.g. KeyboardInterrupt) must pass through
    # without being reported as a malformed line.
    try:
        word, pos = line.split()
    except ValueError:
        sys.stderr.write('Error: Line does not have word and POS tag: {0}\n'.format(line))
        raise

    newline = "w('{0}', '{1}', ['{0}_{1}'], '{0}').".format(prolog_escape(word), prolog_escape(pos))
    return word, newline
if __name__ == '__main__':
    # Command-line entry point: reads "word POS" lines from stdin, prints
    # one Prolog term per line to stdout, and writes every distinct word
    # form (plus spelling variants) to the file given as first argument.

    if len(sys.argv) != 3:
        sys.stderr.write('Usage: ' + sys.argv[0] + ' temporary_file_for_morphology sentence_delimiter\n')
        sys.exit(1)

    morphology_input_path = sys.argv[1]
    # Module-level on purpose: format_conversion() reads this name.
    sentdelim = sys.argv[2]

    if sys.version_info < (3, 0):
        # Python 2 byte streams have to be wrapped to read/write unicode.
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)

    # Word forms to hand to the morphology tool (a set, so each form is
    # written only once).
    to_analyze = set()

    # The file is opened before stdin is consumed, as before, but the
    # context manager now guarantees it is closed even if a malformed
    # input line aborts the conversion.
    with codecs.open(morphology_input_path, 'w', 'UTF-8') as morphology_tempfile:
        for line in sys.stdin:
            word, fact = format_conversion(line)
            print(fact)

            # Expand word forms for the query (to also include spelling variants).
            to_analyze.update(spelling_variations(word))

        print("w('ENDOFDOC','{0}',['._{0}'],'ENDOFDOC').".format(sentdelim))

        for item in to_analyze:
            morphology_tempfile.write(item + '\n')