-
Notifications
You must be signed in to change notification settings - Fork 1
/
add_pos2elan_p3-sje-psdp.py
331 lines (270 loc) · 17.4 KB
/
add_pos2elan_p3-sje-psdp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# -*- coding:utf-8 -*-
import re, os, errno, cgi, json, xml
import numpy
print('numpy version: '+numpy.version.version)
import sys, codecs, locale, getopt
import xml.etree.ElementTree as ET
from subprocess import Popen, PIPE
from operator import itemgetter
from xml.dom.minidom import parse, parseString
from imp import reload
from collections import defaultdict
def main():
#reload(sys)
#sys.setdefaultencoding('utf8')
# to be adjusted as needed
if len(sys.argv) != 2:
print('wrong number of arguments')
sys.exit('Error')
in_dir = sys.argv[1]
out_dir = 'temp_python'
cwd = os.getcwd()
out_dir_path = os.path.join(cwd,out_dir)
if not os.path.exists(out_dir_path):
os.mkdir(out_dir_path)
debug_fst = False
# parameters to be adjusted as needed
lang = 'sje'
plup = Popen('which lookup', shell=True, stdout=PIPE, stderr=PIPE)
olup, elup = plup.communicate()
print("___ lookup is ",olup.decode())
if not olup.decode():
print('No lookup found, please install it!')
sys.exit('Error')
lookup = olup.decode().strip()
langs_dir = '$GTHOME/langs/'
rel_xfst_file = '/src/analyser-gt-desc.xfst'
abs_xfst_file = langs_dir+lang+rel_xfst_file
abs_prep_file = '$GIELLA_CORE/scripts/preprocess' #!!!!!!!
prep = '| iconv -f UTF-8 -t UTF-8 |'+abs_prep_file
# command to analyse the input string comming from the left of the pipeline
cmd = "| iconv -f UTF-8 -t UTF-8 | " + lookup + " " + abs_xfst_file
for root, dirs, files in os.walk(in_dir): # Walk directory tree
print("Input dir {0} with {1} files ...".format(root, len(files)))
for f in files:
if f.endswith('eaf'):
print('... processing ', str(f))
tree = ET.parse(os.path.join(in_dir,f))
f_root = tree.getroot()
a_refs = f_root.findall('.//REF_ANNOTATION')
ar_ids = []
for arid in a_refs:
ar_ids.append(arid.attrib['ANNOTATION_ID'].replace('a',''))
ar_ids = sorted(ar_ids, key=int, reverse=True)
t_counter = int(ar_ids[0])
#print(t_counter)
# find the insertion positions for the generated tiers
child_list = f_root.getchildren()
child_positions = []
for child in child_list:
c_child = child.tag
if child.tag == 'TIER':
c_child += '_' + child.attrib['TIER_ID']
child_positions.append(c_child)
#print(child_positions)
p_counter = -1
participants = []
for refTIER in f_root.findall('.//TIER[@LINGUISTIC_TYPE_REF="refT"]'):
current_participant = refTIER.attrib['TIER_ID'].split('@',1)[1]
participants.append(current_participant)
insertion_positions = {}
for p in participants:
insertion_positions[p] = child_positions.index('TIER_word@'+p)
#print(insertion_positions)
# loop over all participants
for refTIER in f_root.findall('.//TIER[@LINGUISTIC_TYPE_REF="refT"]'):
current_participant = refTIER.attrib['TIER_ID'].split('@',1)[1]
p_counter += 1
#print('___ ', current_participant, ' ___')
# create empty list for [wordID, wordform, analysis_output]
wlp = []
orth = []
lang = f_root.find('.//TIER[@TIER_ID="word@'+current_participant+'"]').attrib['LANG_REF']
print('___ current lang is ', lang, ' ___')
abs_xfst_file = langs_dir+lang+rel_xfst_file
print('___ current xfst file is ', abs_xfst_file, ' ___')
# command to analyse the input string comming from the left of the pipeline
cmd = "| iconv -f UTF-8 -t UTF-8 | " + lookup + " " + abs_xfst_file
#######################
#Tokenise if not already done
if not f_root.find('.//TIER[@TIER_ID="word@'+current_participant+'"]/ANNOTATION/REF_ANNOTATION'):
print('==> Tokenising for participant ', current_participant)
word_tier = f_root.find('.//TIER[@TIER_ID="word@' + current_participant + '"]')
for t in f_root.findall('.//TIER[@TIER_ID="orth@'+current_participant+'"]/ANNOTATION/REF_ANNOTATION'):
ref_ID = t.attrib['ANNOTATION_ID']
current_orth = t[0].text
if not current_orth:
current_orth = "_DWF_"
p2 = Popen('echo \''+current_orth+'\''+prep, shell=True, stdout=PIPE, stderr=PIPE)
out, err = p2.communicate()
current_tokens = numpy.asarray(out.decode().split('\n'))
current_tokens = current_tokens[:-1]
orth.append([ref_ID ,current_tokens])
print('current token: '+str(current_tokens))
for i in range(len(orth)):
print("inserting token at position ", str(i+1), "/", str(len(orth)))
tokens = list(itemgetter(1)(orth[i]))
for t in enumerate(tokens):
t_counter += 1
tok_a_id = 'a'+str(t_counter)
tok_a = ET.SubElement(word_tier, 'ANNOTATION')
tok_r = ET.SubElement(tok_a, 'REF_ANNOTATION')
tok_v = ET.SubElement(tok_r, 'ANNOTATION_VALUE')
tok_r.set('ANNOTATION_ID', tok_a_id)
tok_r.set('ANNOTATION_REF', itemgetter(0)(orth[i]))
if t[0] > 0:
previous_token = f_root.find('.//TIER[@TIER_ID="word@'+current_participant+'"]/ANNOTATION[last()-1]/REF_ANNOTATION').attrib['ANNOTATION_ID']
tok_r.set('PREVIOUS_ANNOTATION', previous_token)
tok_v.text = t[1]
#######################
for t in f_root.findall('.//TIER[@TIER_ID="word@'+current_participant+'"]/ANNOTATION/REF_ANNOTATION'):
#//////
ref_ID = t.attrib['ANNOTATION_ID']
#print('_',ref_ID,'_')
current_wordform = t[0].text
# if the current word form is empty add a dummy one "_DWF_" for "Default Word Form"
if not current_wordform:
current_wordform = "_DWF_"
#print('... xxx ', ref_ID, ' ___ ', current_wordform)
p = Popen('echo '+current_wordform+cmd, shell=True, stdout=PIPE, stderr=PIPE)
out, err = p.communicate()
# print("|", out.decode().split('\n', 1 ),"|")
current_analysis = filter(None,out.decode().split('\n'))
# fix inconsistency of TAB usage in the FST output
current_analysis = [w.replace('\t+','+') for w in current_analysis]
# get rid of the word form from the FST output
current_analysis = [w.split('\t',1)[1] for w in current_analysis]
current_dict = {}
for item in current_analysis:
key = item.split('+',1)[0]
value = item.split('+',1)[1]
if not key in current_dict:
current_dict[key] = []
current_dict[key].append(value)
for key in current_dict:
c_val = current_dict[key]
pm_dict = {}
for v in c_val:
xval = v.split('+')
pos = v.split('+',1)[0]
morph = '_'
if len(xval) > 1:
morph = v.split('+',1)[1]
if not pos in pm_dict:
pm_dict[pos] = []
pm_dict[pos].append(morph)
current_dict[key] = pm_dict
# output of the current analysis to be added to the file
print(current_dict)
# output of the FST to compare to the current_dict
if debug_fst:
print(current_analysis)
wlp.append([ref_ID ,current_wordform, current_dict])
#///////////
i_position = insertion_positions[current_participant]+3*p_counter+1
#print('_ip_' + str(i_position) + '_ip_')
# set insertion tiers OR insert generated tiers at the specified position
if f_root.find('.//TIER[@TIER_ID="gloss@' + current_participant + '"]') == None:
gloss_tier = ET.Element('TIER')
gloss_tier.set('LINGUISTIC_TYPE_REF', 'glossT')
gloss_tier.set('PARENT_REF', 'pos@'+current_participant)
gloss_tier.set('TIER_ID', 'gloss@' + current_participant)
f_root.insert(i_position, gloss_tier)
else:
gloss_tier = f_root.find('.//TIER[@TIER_ID="gloss@' + current_participant + '"]')
if f_root.find('.//TIER[@TIER_ID="morph@' + current_participant + '"]') == None:
morph_tier = ET.Element('TIER')
morph_tier.set('LINGUISTIC_TYPE_REF', 'morphT')
morph_tier.set('PARENT_REF', 'pos@' + current_participant)
morph_tier.set('TIER_ID', 'morph@' + current_participant)
f_root.insert(i_position, morph_tier)
else:
morph_tier = f_root.find('.//TIER[@TIER_ID="morph@' + current_participant + '"]')
if f_root.find('.//TIER[@TIER_ID="pos@' + current_participant + '"]') == None:
pos_tier = ET.Element('TIER')
pos_tier.set('LINGUISTIC_TYPE_REF', 'posT')
pos_tier.set('PARENT_REF', 'lemma@' + current_participant)
pos_tier.set('TIER_ID', 'pos@' + current_participant)
f_root.insert(i_position, pos_tier)
else:
pos_tier = f_root.find('.//TIER[@TIER_ID="pos@' + current_participant + '"]')
if f_root.find('.//TIER[@TIER_ID="lemma@' + current_participant + '"]') == None:
lemma_tier = ET.Element('TIER')
lemma_tier.set('LINGUISTIC_TYPE_REF', 'lemmaT')
lemma_tier.set('PARENT_REF', 'word@'+current_participant)
lemma_tier.set('TIER_ID', 'lemma@' + current_participant)
f_root.insert(i_position, lemma_tier)
else:
lemma_tier = f_root.find('.//TIER[@TIER_ID="lemma@' + current_participant + '"]')
# populate all tiers
print('==> Populating the generated tiers for participant ', current_participant)
for i in range(len(wlp)):
print("populating lemma at position ", str(i+1), "/", str(len(wlp)))
lemma_dict = itemgetter(2)(wlp[i])
#print(wlp[i])
lemma_exists = False
if lemma_tier.find('.//ANNOTATION/REF_ANNOTATION[@ANNOTATION_REF="'+itemgetter(0)(wlp[i]) + '"]') != None:
lemma_exists = True
node = lemma_tier.find('.//ANNOTATION/REF_ANNOTATION[@ANNOTATION_REF="'+itemgetter(0)(wlp[i]) + '"]')
l_a_id = node.attrib['ANNOTATION_ID']
for l_i, l_key in enumerate(lemma_dict):
if not lemma_exists:
t_counter += 1
l_a_id = 'a'+str(t_counter)
l_a = ET.SubElement(lemma_tier, 'ANNOTATION')
l_r = ET.SubElement(l_a, 'REF_ANNOTATION')
l_v = ET.SubElement(l_r, 'ANNOTATION_VALUE')
l_r.set('ANNOTATION_ID', l_a_id)
l_r.set('ANNOTATION_REF', itemgetter(0)(wlp[i]))
if l_i > 0:
previous_lemma = f_root.find('.//TIER[@TIER_ID="lemma@'+current_participant+'"]/ANNOTATION[last()-1]/REF_ANNOTATION').attrib['ANNOTATION_ID']
l_r.set('PREVIOUS_ANNOTATION', previous_lemma)
l_v.text = l_key
pos_exists = False
if pos_tier.find('.//ANNOTATION/REF_ANNOTATION[@ANNOTATION_REF="'+l_a_id + '"]') != None:
node = pos_tier.find('.//ANNOTATION/REF_ANNOTATION[@ANNOTATION_REF="'+l_a_id + '"]')
pos_exists = True
p_a_id = node.attrib['ANNOTATION_ID']
pos_dict = lemma_dict[l_key]
for p_i, p_key in enumerate(pos_dict):
if not pos_exists:
t_counter += 1
p_a_id = 'a'+str(t_counter)
p_a = ET.SubElement(pos_tier, 'ANNOTATION')
p_r = ET.SubElement(p_a, 'REF_ANNOTATION')
p_v = ET.SubElement(p_r, 'ANNOTATION_VALUE')
p_r.set('ANNOTATION_ID', p_a_id)
p_r.set('ANNOTATION_REF', l_a_id)
if p_i > 0:
previous_pos = f_root.find('.//TIER[@TIER_ID="pos@'+current_participant+'"]/ANNOTATION[last()-1]/REF_ANNOTATION').attrib['ANNOTATION_ID']
p_r.set('PREVIOUS_ANNOTATION', previous_pos)
p_v.text = p_key
morph_list = pos_dict[p_key]
if morph_tier.find('.//ANNOTATION/REF_ANNOTATION[@ANNOTATION_REF="'+p_a_id + '"]') == None:
for m_i, m_m in enumerate(morph_list):
t_counter += 1
m_a_id = 'a'+str(t_counter)
m_a_id = 'a'+str(t_counter)
m_a = ET.SubElement(morph_tier, 'ANNOTATION')
m_r = ET.SubElement(m_a, 'REF_ANNOTATION')
m_v = ET.SubElement(m_r, 'ANNOTATION_VALUE')
m_r.set('ANNOTATION_ID', m_a_id)
m_r.set('ANNOTATION_REF', p_a_id)
if m_i > 0:
previous_morph = f_root.find('.//TIER[@TIER_ID="morph@'+current_participant+'"]/ANNOTATION[last()-1]/REF_ANNOTATION').attrib['ANNOTATION_ID']
m_r.set('PREVIOUS_ANNOTATION', previous_morph)
m_v.text = morph_list[m_i]
#set lastUsedAnnotationId
print('last used annotation after processing is ' + str(t_counter))
f_root.find('.//PROPERTY[@NAME="lastUsedAnnotationId"]').text = str(t_counter)
tree.write(os.path.join(out_dir_path,str(f)),
xml_declaration=True,encoding='utf-8',
method="xml")
print('DONE ', f, '\n\n')
# txt = ET.tostring(f_root)
# text_file = open(os.path.join(out_dir_path,str(f)), "w")
# text_file.write(xml.dom.minidom.parseString(txt).toprettyxml())
# text_file.close()
if __name__ == "__main__":
reload(sys)
main()