-
Notifications
You must be signed in to change notification settings - Fork 1
/
add_pos2elan_p2.7.py
250 lines (205 loc) · 12.6 KB
/
add_pos2elan_p2.7.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# -*- coding:utf-8 -*-
import re, os, errno, cgi, json, xml
import sys, codecs, locale, getopt
import xml.etree.ElementTree as ET
from subprocess import Popen, PIPE
from operator import itemgetter
from xml.dom.minidom import parse, parseString
def main():
    """Add lemma, POS and morphology tiers to every ELAN file under ``in_dir``.

    All settings are hard-coded below ("to be adjusted as needed"):

    * ``*.eaf`` files are read from the ``example`` directory tree and the
      annotated copies are written, flat, into ``out_example_p2.7`` inside
      the current working directory (created if missing).
    * The ``lookup`` executable is located with ``which lookup``; the
      process exits via ``sys.exit()`` when it is not found.
    * For every tier whose ``LINGUISTIC_TYPE_REF`` is ``refT`` the part of
      its ``TIER_ID`` after the first ``@`` names a participant.  Each word
      on that participant's ``word@<participant>`` tier is analysed with
      ``analyser/<lang>/analyser-gt-desc.xfst`` where ``<lang>`` is the
      tier's ``LANG_REF`` (``kpv`` when the attribute is absent).
    * The analyses are stored as ``REF_ANNOTATION`` elements on the tiers
      ``lemma@``, ``pos@`` and ``morph@<participant>``, which are inserted
      directly after the word tier when they do not exist yet.
    * The ``lastUsedAnnotationId`` header property, when present, is set to
      the last annotation number handed out.
    """
    # to be adjusted as needed
    in_dir = 'example'
    out_dir = 'out_'+in_dir+'_p2.7'
    out_dir_path = os.path.join(os.getcwd(), out_dir)
    if not os.path.exists(out_dir_path):
        os.mkdir(out_dir_path)

    # parameters to be adjusted as needed
    default_lang = 'kpv'  # used only when a word tier carries no LANG_REF
    langs_dir = 'analyser/'
    rel_xfst_file = '/analyser-gt-desc.xfst'

    plup = Popen('which lookup', shell=True, stdout=PIPE, stderr=PIPE)
    olup, _ = plup.communicate()
    print("___ lookup is ",olup.decode())
    if not olup.decode():
        print('No lookup found, please install it!')
        sys.exit()
    lookup = olup.decode().strip()

    for root, _dirs, files in os.walk(in_dir):  # Walk directory tree
        print("Input dir {0} with {1} files ...".format(root, len(files)))
        for f in files:
            if not f.endswith('eaf'):
                continue
            print('... processing ', str(f))
            # join with `root`, not `in_dir`: os.walk also reports files that
            # live in sub-directories
            tree = ET.parse(os.path.join(root, f))
            f_root = tree.getroot()

            # new ids continue after the highest id already in the document
            t_counter = _highest_annotation_number(f_root)

            # loop over all participants
            for refTIER in f_root.findall('.//TIER[@LINGUISTIC_TYPE_REF="refT"]'):
                current_participant = refTIER.attrib['TIER_ID'].split('@',1)[1]
                t_counter = _annotate_participant(
                    f_root, current_participant, lookup, langs_dir,
                    rel_xfst_file, default_lang, t_counter)

            # set lastUsedAnnotationId
            print('last used annotation after processing is ' + str(t_counter))
            last_used = f_root.find('.//PROPERTY[@NAME="lastUsedAnnotationId"]')
            if last_used is not None:
                last_used.text = str(t_counter)
            tree.write(os.path.join(out_dir_path,str(f)),
                       xml_declaration=True,encoding='utf-8',
                       method="xml")
            print('DONE ', f, '\n\n')


def _highest_annotation_number(f_root):
    """Return the largest N among all ``ANNOTATION_ID="aN"`` values, or 0.

    Every element carrying an ``ANNOTATION_ID`` is considered, so ids that
    are generated by counting upwards from the result cannot repeat an id
    that is already used anywhere in the document.
    """
    highest = 0
    for element in f_root.iter():
        annotation_id = element.attrib.get('ANNOTATION_ID')
        if annotation_id is None:
            continue
        try:
            number = int(annotation_id.replace('a', ''))
        except ValueError:
            # not of the form a<N>, so it can never equal a generated id
            continue
        highest = max(highest, number)
    return highest


def _run_lookup(lookup, xfst_file, wordform):
    """Analyse one word form with ``lookup`` and return its decoded stdout.

    The word is written to the process' stdin instead of being spliced into
    a shell command line, so quotes, ``;``, ``$`` and similar characters in
    the transcription are passed through literally.
    """
    # collapse whitespace runs into single blanks so the analyser receives
    # exactly one input line per word form
    token = ' '.join(wordform.split())
    if not isinstance(token, bytes):
        token = token.encode('utf-8')
    process = Popen([lookup, xfst_file], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    out, _ = process.communicate(token + b'\n')
    return out.decode('utf-8')


def _group_analyses(lookup_output):
    """Turn raw lookup output into ``{lemma: {pos: [morph, ...]}}``.

    Each usable line has the shape ``wordform<TAB>lemma+POS+tag+tag...``.
    Lines without a TAB are skipped.  The mappings keep the order in which
    the analyser printed the readings.
    """
    from collections import OrderedDict

    grouped = OrderedDict()
    for line in lookup_output.split('\n'):
        # fix inconsistency of TAB usage in the FST output
        line = line.replace('\t+', '+')
        if '\t' not in line:
            continue
        # get rid of the word form from the FST output
        analysis = line.split('\t', 1)[1]
        lemma, plus, tags = analysis.partition('+')
        if plus:
            pos = tags.split('+', 1)[0]
        else:
            # reading without any tag: reuse the '_' placeholder
            pos = '_'
        morph = '_'
        # NOTE(review): a reading with a single tag after the POS (POS+tag)
        # keeps the '_' placeholder, exactly as before -- confirm intended.
        if tags.count('+') >= 2:
            morph = tags.split('+', 1)[1]
        by_pos = grouped.setdefault(lemma, OrderedDict())
        by_pos.setdefault(pos, []).append(morph)
    return grouped


def _get_or_insert_tier(f_root, tier_id, linguistic_type, parent_id, position):
    """Return the tier ``tier_id``, inserting a new empty one if necessary."""
    tier = f_root.find('.//TIER[@TIER_ID="' + tier_id + '"]')
    if tier is None:
        tier = ET.Element('TIER')
        tier.set('LINGUISTIC_TYPE_REF', linguistic_type)
        tier.set('PARENT_REF', parent_id)
        tier.set('TIER_ID', tier_id)
        f_root.insert(position, tier)
    return tier


def _append_ref_annotation(tier, annotation_id, parent_id, value, previous_id):
    """Append one ``ANNOTATION/REF_ANNOTATION`` holding ``value`` to ``tier``.

    ``previous_id`` is written as ``PREVIOUS_ANNOTATION`` unless it is None.
    """
    annotation = ET.SubElement(tier, 'ANNOTATION')
    reference = ET.SubElement(annotation, 'REF_ANNOTATION')
    text_node = ET.SubElement(reference, 'ANNOTATION_VALUE')
    reference.set('ANNOTATION_ID', annotation_id)
    reference.set('ANNOTATION_REF', parent_id)
    if previous_id is not None:
        reference.set('PREVIOUS_ANNOTATION', previous_id)
    text_node.text = value


def _annotate_participant(f_root, participant, lookup, langs_dir,
                          rel_xfst_file, default_lang, t_counter):
    """Analyse one participant's words and fill the three generated tiers.

    Returns the updated annotation counter (the number used for the last
    annotation that was created, or ``t_counter`` unchanged if none was).
    """
    word_tier = f_root.find('.//TIER[@TIER_ID="word@' + participant + '"]')
    if word_tier is None:
        print('___ no word tier found for participant ', participant, ' ___')
        return t_counter

    lang = word_tier.attrib.get('LANG_REF', default_lang)
    print('___ current lang is ', lang, ' ___')
    abs_xfst_file = langs_dir+lang+rel_xfst_file
    print('___ current xfst file is ', abs_xfst_file, ' ___')

    # list of (wordID, analysis) pairs, in tier order
    wlp = []
    for t in word_tier.findall('ANNOTATION/REF_ANNOTATION'):
        ref_ID = t.attrib['ANNOTATION_ID']
        current_wordform = t.findtext('ANNOTATION_VALUE')
        # if the current word form is empty add a dummy one "_DWF_" for "Default Word Form"
        if not current_wordform:
            current_wordform = "_DWF_"
        current_dict = _group_analyses(
            _run_lookup(lookup, abs_xfst_file, current_wordform))
        print(current_dict)
        wlp.append((ref_ID, current_dict))

    # The generated tiers go directly after the word tier.  All three are
    # inserted at the same index in reverse order, which leaves them in the
    # document as lemma, pos, morph.
    i_position = list(f_root).index(word_tier) + 1
    morph_tier = _get_or_insert_tier(
        f_root, 'morph@' + participant, 'morphT', 'pos@' + participant, i_position)
    pos_tier = _get_or_insert_tier(
        f_root, 'pos@' + participant, 'posT', 'lemma@' + participant, i_position)
    lemma_tier = _get_or_insert_tier(
        f_root, 'lemma@' + participant, 'lemmaT', 'word@'+participant, i_position)

    # populate all tiers
    print('==> Populating the generated tiers for participant ', participant)
    for i, (ref_ID, lemma_dict) in enumerate(wlp):
        print("populating lemma at position ", str(i), "/", str(len(wlp)))
        # Siblings below the same parent are chained with PREVIOUS_ANNOTATION;
        # the first sibling of each group carries no such attribute.
        previous_lemma = None
        for l_key, pos_dict in lemma_dict.items():
            t_counter += 1
            l_a_id = 'a'+str(t_counter)
            _append_ref_annotation(lemma_tier, l_a_id, ref_ID, l_key, previous_lemma)
            previous_lemma = l_a_id
            previous_pos = None
            for p_key, morph_list in pos_dict.items():
                t_counter += 1
                p_a_id = 'a'+str(t_counter)
                _append_ref_annotation(pos_tier, p_a_id, l_a_id, p_key, previous_pos)
                previous_pos = p_a_id
                previous_morph = None
                for morph in morph_list:
                    t_counter += 1
                    m_a_id = 'a'+str(t_counter)
                    _append_ref_annotation(morph_tier, m_a_id, p_a_id, morph, previous_morph)
                    previous_morph = m_a_id
    return t_counter
if __name__ == "__main__":
    # Script entry point: runs only when the file is executed directly.
    # Python 2 only -- `reload` is a builtin there.  The module `sys` is
    # reloaded and its default string encoding is switched to UTF-8 before
    # main() starts, so the order of the three statements matters.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    main()