"""Run:
cldfbench makecldf --with-zenodo --with-cldfreadme --communities lexibank ./cldfbench_gbabvd_vv.py
cldfbench readme ./cldfbench_gbabvd_vv.py
cldfbench gbabvd_vv.gbabvdvv_analyse
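
This module builds a combined CLDF StructureDataset: Grambank structural
values are merged with ABVD lexical forms and cognate judgements, keyed by
Glottocode, for languages inside a bounding box that roughly covers the
Vanuatu archipelago (inferred from the coordinates in cmd_makecldf). The raw
Grambank and ABVD CLDF datasets are expected under ./raw/ (see cmd_download).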
"""
from pathlib import Path
from collections import defaultdict
import unicodedata

from git import Repo, GitCommandError
import pycldf
from clldutils.path import read_text
from cldfzenodo import oai_lexibank
from cldfzenodo.record import GithubRepos
from cldfbench import Dataset as BaseDataset
from cldfbench import CLDFSpec
from segments import Profile, Tokenizer


class Dataset(BaseDataset):
    dir = Path(__file__).parent
    id = "gbabvd_vv"

    def cldf_specs(self):
        return CLDFSpec(
            module='StructureDataset',
            dir=self.cldf_dir,
            metadata_fname='cldf-metadata.json',
        )
    def cmd_download(self, args):
        self.dataset_meta = {
            r["ID"]: r["URL"]
            for r in self.etc_dir.read_csv("datasets.tsv", delimiter="\t", dicts=True)
        }
        github_info = {rec.doi: rec.github_repos for rec in oai_lexibank()}
        for dataset, src in self.dataset_meta.items():
            ghinfo = github_info[src] if src in github_info else GithubRepos.from_url(src)
            args.log.info("Checking {}".format(dataset))
            dest = self.raw_dir / dataset
            # download data
            if dest.exists():
                args.log.info("... dataset already exists, fetching changes")
                for remote in Repo(str(dest)).remotes:
                    remote.fetch()
            else:
                args.log.info("... cloning {}".format(dataset))
                try:
                    Repo.clone_from(ghinfo.clone_url, str(dest))
                except GitCommandError as e:
                    args.log.error("... download failed\n{}".format(str(e)))
                    continue
            # check out the release tag (fall back to the default branch)
            repo = Repo(str(dest))
            if ghinfo.tag:
                args.log.info("... checking out tag {}".format(ghinfo.tag))
                repo.git.checkout(ghinfo.tag)
            else:
                args.log.warning("... could not determine tag to check out")
                args.log.info("... checking out main/master")
                try:
                    branch = repo.branches.main
                    branch.checkout()
                except AttributeError:
                    try:
                        branch = repo.branches.master
                        branch.checkout()
                    except AttributeError:
                        args.log.error("found neither main nor master branch")
                # merge any changes fetched above into the checked-out branch
                repo.git.merge()

    def cmd_makecldf(self, args):
        vv_island_north = (-12.829882, 166.507220)  # (-15.871027018941662, 167.2265488760759)
        vv_island_west = (-16.232479, 166.111713)  # (-16.083418602140625, 167.13775323375486)
        vv_island_east = (-16.650297, 170.477881)  # (-16.46849476280046, 167.86200219008973)
        vv_island_south = (-20.551360, 168.988437)  # (-16.624889040314066, 167.5021108694064)
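        # These four (latitude, longitude) points appear to mark the extremes
        # of the study area, roughly the Vanuatu archipelago; the commented
        # pairs look like an earlier, tighter bounding box. The language
        # filter below keeps languages whose latitude falls between the south
        # and north points and whose longitude between the west and east
        # points.
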
        # abvd_id <-> gb_glottocode
        gb_abvd_map = {}
        for r in self.etc_dir.read_csv("gb_abvd_map.tsv", delimiter="\t", dicts=True):
            gb_abvd_map[r["Grambank_ID"]] = r["ABVD_ID"]
            gb_abvd_map[r["ABVD_ID"]] = r["Grambank_ID"]
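        # Since both directions live in one dict, membership tests on
        # gb_abvd_map below work for Grambank glottocodes and ABVD doculect
        # IDs alike (assuming the two ID spaces never collide).
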
        abvd = pycldf.Dataset.from_metadata('./raw/abvd/cldf/cldf-metadata.json')
        gb = pycldf.Dataset.from_metadata('./raw/grambank/cldf/StructureDataset-metadata.json')

        abvd_gb_map = defaultdict(set)
        for lg in abvd.objects('LanguageTable'):
            abvd_gb_map[lg.cldf.glottocode].add(lg.cldf.id)
            abvd_gb_map[lg.cldf.id].add(lg.cldf.glottocode)

        gb_lgs = {}
        for lg in gb.objects('LanguageTable'):
            if lg.cldf.latitude is not None and \
                    vv_island_south[0] < lg.cldf.latitude < vv_island_north[0] and \
                    vv_island_west[1] < lg.cldf.longitude < vv_island_east[1]:
                if lg.cldf.glottocode in gb_abvd_map:
                    gb_lgs[lg.cldf.glottocode] = lg
                elif len(abvd_gb_map[lg.cldf.glottocode]) == 1:
                    gb_lgs[lg.cldf.glottocode] = lg
                    a_id = list(abvd_gb_map[lg.cldf.glottocode])[0]
                    gb_abvd_map[lg.cldf.glottocode] = a_id
                    gb_abvd_map[a_id] = lg.cldf.glottocode
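        # Languages inside the bounding box that are missing from the curated
        # mapping get matched automatically when their glottocode corresponds
        # to exactly one ABVD doculect; glottocodes linked to several ABVD
        # doculects stay unmapped and are dropped.
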
        abvd_lgs = {}
        abvd_ids = set()
        seen_gcs = set()
        for lg in abvd.objects('LanguageTable'):
            if lg.cldf.id in gb_abvd_map:
                if lg.cldf.glottocode in seen_gcs:
                    continue
                abvd_lgs[lg.cldf.id] = lg
                abvd_ids.add(lg.cldf.id)
                seen_gcs.add(lg.cldf.glottocode)

        with args.writer as ds:
            ds.cldf.add_component('ParameterTable')
            ds.cldf.add_component('LanguageTable')
            ds.cldf.add_component('FormTable')
            ds.cldf.add_component('CognateTable')
            # non-standard columns on top of the default component schemas
            ds.cldf.add_columns('LanguageTable', 'ABVD_ID')
            ds.cldf.add_columns('ParameterTable', 'Concepticon_ID')
            ds.cldf.add_columns('FormTable', 'Cognacy')
            ds.cldf.add_columns('FormTable', 'Loan')
            ds.cldf.add_columns('CognateTable', 'Doubt')
            ds.cldf.add_sources(read_text(self.etc_dir / 'sources.bib'))

            for g, lg in gb_lgs.items():
                ds.objects['LanguageTable'].append({
                    'ID': lg.cldf.id,
                    'ABVD_ID': abvd_lgs[gb_abvd_map[g]].cldf.id,
                    'Name': f'{lg.cldf.name}/{abvd_lgs[gb_abvd_map[g]].cldf.name}',
                    'Macroarea': lg.cldf.macroarea,
                    'Glottocode': g,
                    'Latitude': lg.cldf.latitude,
                    'Longitude': lg.cldf.longitude,
                })
            ds.objects['LanguageTable'].sort(key=lambda d: d['ID'])

            seen_params = set()
            for v in gb.objects('ValueTable'):
                if v.cldf.languageReference in gb_lgs:
                    if v.cldf.parameterReference not in seen_params:
                        p = gb.objects('ParameterTable')[v.cldf.parameterReference]
                        ds.objects['ParameterTable'].append({
                            'ID': p.cldf.id,
                            'Name': p.cldf.name,
                        })
                        seen_params.add(v.cldf.parameterReference)
                    ds.objects['ValueTable'].append({
                        'ID': v.cldf.id,
                        'Language_ID': v.cldf.languageReference,
                        'Parameter_ID': v.cldf.parameterReference,
                        'Value': v.cldf.value,
                        'Source': ['Skirgardetal2023'],
                    })

            seen_params = set()
            # one set suffices for both uniqueness of generated form IDs and
            # the cognate lookup below (the original kept a redundant copy)
            seen_form_ids = set()
            prf = Profile.from_file(self.etc_dir / 'orthography.tsv', form='NFC')
            tok = Tokenizer(profile=prf)
            ign = ['..']  # placeholder forms to skip
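            # Forms are segmented with the orthography profile in
            # etc/orthography.tsv: tok(frm, column='IPA') looks up each
            # grapheme in the profile and returns the values of its IPA
            # column, so Segments holds IPA rather than raw orthography.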
            for form in abvd.objects('FormTable'):
                if form.cldf.languageReference in abvd_ids:
                    if form.cldf.form in ign:
                        continue
                    if form.cldf.parameterReference not in seen_params:
                        p = abvd.objects('ParameterTable')[form.cldf.parameterReference]
                        ds.objects['ParameterTable'].append({
                            'ID': p.cldf.id,
                            'Name': p.cldf.name,
                            'Concepticon_ID': p.cldf.concepticonReference,
                        })
                        seen_params.add(form.cldf.parameterReference)
                    frm = unicodedata.normalize('NFC', form.cldf.form)
                    if frm == 'naᵐ batina':  # fix a misplaced prenasalization mark
                        frm = 'na ᵐbatina'
                    # re-key the form by glottocode; disambiguate repeated
                    # (language, parameter) pairs with a running counter
                    lid = gb_abvd_map[form.cldf.languageReference]
                    fid = f'{lid}-{form.cldf.parameterReference}'
                    fid_cnt = 1
                    while f'{fid}-{fid_cnt}' in seen_form_ids:
                        fid_cnt += 1
                    nfid = f'{fid}-{fid_cnt}'
                    seen_form_ids.add(nfid)
                    ds.objects['FormTable'].append({
                        'ID': nfid,
                        'Language_ID': lid,
                        'Parameter_ID': form.cldf.parameterReference,
                        'Value': form.cldf.value,
                        'Form': frm,
                        'Segments': tok(frm, column='IPA', form='NFC').split(' '),
                        'Source': ['Greenhilletal2008'],
                        'Cognacy': form.data['Cognacy'],
                        'Loan': form.data['Loan'],
                    })

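            # ABVD cognate judgements reference form IDs that appear to start
            # with the ABVD doculect ID. Rewriting that first component to the
            # mapped glottocode reconstructs the IDs assigned to forms above;
            # judgements whose rewritten ID was never generated (e.g. because
            # the form was skipped, or the running counters diverge) are
            # silently dropped by the seen_form_ids check.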
            for c in abvd.objects('CognateTable'):
                fids = c.data['Form_ID'].split('-')
                if fids[0] not in gb_abvd_map:
                    continue
                fids[0] = gb_abvd_map[fids[0]]
                fid = '-'.join(fids)
                if fid in seen_form_ids:
                    ds.objects['CognateTable'].append({
                        'ID': fid,
                        'Form_ID': fid,
                        'Cognateset_ID': c.data['Cognateset_ID'],
                        'Doubt': c.data['Doubt'],
                        'Source': ['Greenhilletal2008'],
                    })

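            # Sort all tables so that repeated builds produce deterministic,
            # diff-friendly output.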
            ds.objects['ParameterTable'].sort(key=lambda d: d['ID'])
            ds.objects['CognateTable'].sort(key=lambda d: d['Cognateset_ID'])
            ds.objects['ValueTable'].sort(key=lambda r: (r['Language_ID'], r['Parameter_ID']))
            ds.objects['FormTable'].sort(key=lambda r: (r['Language_ID'], r['Parameter_ID']))