-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbrieven-van-hooft-notebook.py
536 lines (438 loc) · 21.2 KB
/
brieven-van-hooft-notebook.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
import marimo
__generated_with = "0.8.11"
app = marimo.App(width="medium", app_title="Brieven van Hooft - Notebook")
@app.cell
def __(mo):
mo.md(
"""
# Brieven van Hooft - Notebook
## Introduction
This notebook provides access to the linguistic and socio-linguistic
annotations that were added to the P.C van Hooft letters in an annotation
project in 2017 by Marjo van Koppen and Marijn Schraagen.
The letters come from *"De briefwisseling van Pieter Corneliszoon Hooft, edited
by H.W van Tricht e.a.,"*, as published by the DBNL in the following three
parts:
* [Part 1](https://www.dbnl.org/tekst/hoof001hwva02_01/)
* [Part 2](https://www.dbnl.org/tekst/hoof001hwva03_01/)
* [Part 3](https://www.dbnl.org/tekst/hoof001hwva04_01/)
License information for these works can be found
[here](https://www.dbnl.org/titels/gebruiksvoorwaarden.php?id=hoof001hwva03).
We did not receive the rights to publish the editorial parts of the texts that
are not from the 17th century. They will still be available in this notebook as
they can be downloaded from DBNL directly, but republishing them is not
permitted unfortunately. This notebook's code itself is under the GNU General Public License v3.
The annotations were initially published in a combination of FoLiA XML and
other stand-off formats. In 2024, they have been re-aligned with the original
DBNL sources and published as a [STAM](https://annotation.github.io/stam) model.
You can also inspect the full [pipeline that produced this
model](https://github.com/knaw-huc/brieven-van-hooft-pipeline).
This notebook provides search and visualisation functionality on this STAM
model. We will guide you through several examples.
Note that the letters are shown exactly according to the plain text data
from DBNL. Conversion from TEI XML to plain text was done by DBNL and may
contain some conversion artefacts.
### Obtaining the data
We first obtain the data by downloading the original texts of the three books
from DBNL, and by downloading the STAM model from Zenodo. Then we will load the data into memory. All this may take a
while. Please wait until three checkmarks appear below to indicate this has been done:
"""
)
return
@app.cell
def __():
    # Central import cell: the names returned here are the ones the later
    # cells receive as parameters.

    # standard library
    import os
    import os.path
    from urllib.request import urlretrieve

    # third-party
    import marimo as mo
    import polars
    import stam
    from natsort import natsorted

    return mo, natsorted, os, polars, stam, urlretrieve
@app.cell
def __(mo, natsorted, os, polars, stam, urlretrieve):
    import hashlib

    # Download the three DBNL plain-text volumes and the STAM model (only when
    # they are not present yet), verify their SHA-256 checksums and, if all
    # match, load the model into `store`. `store` is None when verification
    # fails, and the cell is then stopped with the status report as output.

    #TODO: adapt link to Zenodo before final publication
    _sources = {
        "hoof001hwva02.txt": "https://www.dbnl.org/nieuws/text.php?id=hoof001hwva02",
        "hoof001hwva03.txt": "https://www.dbnl.org/nieuws/text.php?id=hoof001hwva03",
        "hoof001hwva04.txt": "https://www.dbnl.org/nieuws/text.php?id=hoof001hwva04",
        "hoof001hwva.output.store.stam.json": "https://download.anaproy.nl/hoof001hwva.output.store.stam.json",
    }
    for _filename, _url in _sources.items():
        if not os.path.exists(_filename):
            # Fetch under a temporary name and rename afterwards, so that an
            # interrupted download never leaves a truncated file behind that
            # the existence check above would accept on the next run.
            _partial = _filename + ".part"
            urlretrieve(_url, _partial)
            os.replace(_partial, _filename)

    # os.sync() is only available on Unix-like platforms
    if hasattr(os, "sync"):
        os.sync()

    _data_downloaded = "✅"

    _checksums = {
        "hoof001hwva02.txt":"5f0df29a5ea14e87bc66c3a8e8012ec966a8a948b709cc80504c6fb5c2e9d82b",
        "hoof001hwva03.txt":"4c0a23a238b6da382c6a0c5334a867d8e3ef4cb081aae37c5104cf612cbeb64a",
        "hoof001hwva04.txt":"6a2f9c4454f0db71a84c774418edaa9adc4ee19a5b3da00f051dd8c6b2f691df",
        "hoof001hwva.output.store.stam.json": "f56baccb3dc8ca88d1f6327f806c173a954391e42c5236e76cc5a9284e7521ec"
    }

    _data_integrity = "✅"
    _msg = ""
    for _filename, _checksum in _checksums.items():
        _m = hashlib.sha256()
        with open(_filename,'rb') as _f:
            # hash in fixed-size chunks instead of reading the whole file into
            # memory at once
            while True:
                _chunk = _f.read(1 << 20)
                if not _chunk:
                    break
                _m.update(_chunk)
        if _m.hexdigest() != _checksum:
            _data_integrity = "❌"
            # one bullet per failing file; the advice differs for source texts
            # and for the STAM model
            if _filename.endswith(".txt"):
                _msg += f"\n* Checksum for {_filename} failed! This means that the plain text data for Brieven van Hooft at DBNL has changed and that either you need to obtain the older files, or the annotation pipeline needs to be rerun! (contact [email protected] and [email protected])"
            elif _filename.endswith(".json"):
                _msg += f"\n* Checksum for {_filename} failed! This means that STAM model for Brieven van Hooft has changed and the notebook needs to adapt to the new version (contact [email protected] and [email protected])"

    if _data_integrity == "✅":
        #load the STAM model (AnnotationStore) into the variable `store`
        store = stam.AnnotationStore(file="hoof001hwva.output.store.stam.json")
        _data_loaded = "✅"
    else:
        store = None
        _data_loaded = "❌"

    _md = f"* Data download ready {_data_downloaded}\n* Data integrity check? {_data_integrity} {_msg}\n* Data loaded? {_data_loaded}\n"
    # halt here (showing the report) when there is no usable store
    mo.stop(store is None, mo.md(_md))
    mo.md(_md)
    return store
@app.cell
def __(mo, store):
mo.stop(store is None)
mo.md(
"""
## Data exploration
### Vocabularies
Before we get to the actual texts and annotations, we first want to give some
insight into the vocabularies that are used in this project. Understanding and
exploring the vocabularies is important to be able to make sensible queries
later on.
Vocabularies used by the annotations are grouped into so-called **annotation data
sets**. Within these sets, **keys** are defined. Notable keys in this project are the following:
| Set | Key | Explanation |
| --- | --- | ----------- |
| `https://w3id.org/folia/v2/` | `elementtype` | Indicates the type of FoLiA element of this annotation (e.g. `s` (sentence), `w`(word), `pos`, `lemma`) |
| `gustave-pos` | `class` | The Part-of-Speech tag, manually assigned by the annotator, according to the CGN tagset and an extension thereof. This contains the full tag along with all its features. If you only want the tag head, use key `head` instead. If you want a specific feature use the key pertaining to the feature (e.g. `gender` or `number`) |
| `gustave-lem` | `class` | The lemma, manually assigned by the annotator |
| `http://ilk.uvt.nl/folia/sets/frog-mbpos-cgn` | `class` | The Part-of-Speech tag, automatically annotated by Frog, according to the CGN tagset |
| `http://ilk.uvt.nl/folia/sets/frog-mblem-nl` | `class` | The lemma, automatically annotated by Frog |
| `https://w3id.org/folia/v2/` | `confidence` | The confidence value that was assigned to the annotation (a value between 0 and 1, occurs with automatic annotations by Frog) |
| `brieven-van-hooft-metadata` | `dbnl_id` | The full letter identifier as assigned by the DBNL. It is the primary means of identifying a particular letter. You will find this key and others in this set on annotations of letters as a whole. |
| `brieven-van-hooft-metadata` | `dated` | The date of a letter |
| `brieven-van-hooft-metadata` | `recipient` | The name of the recipient of a letter |
| `brieven-van-hooft-metadata` | `letter_id` | The letter sequence number (not necessarily entirely numerical) |
| `brieven-van-hooft-metadata` | `invididual` | `True` if the recipient is an individual, `False` if it's an organization or group |
| `brieven-van-hooft-metadata` | `gender` | The gender of the recipient: `male` or `female` (not much space for gender fluidity in the 17th century) |
| `brieven-van-hooft-metadata` | `function` | Occupation of the recipient, type of organisation of the recipient or type of personal relation to the recipient. Free value. |
| `brieven-van-hooft-metadata` | `literary` | `True` if the recipient is a literary author, `False` otherwise |
| `brieven-van-hooft-categories` | `function` | Function of the letter (closed vocabulary). You will find this key and others in this set on annotations of letters as a whole. |
| `brieven-van-hooft-categories` | `topic` | Topic of the letter (closed vocabulary) |
| `brieven-van-hooft-categories` | `business` | `True` if it's a business letter, `False` if it's a personal letter |
| `brieven-van-hooft-categories` | `accompanying` | `True` if it's an accompanying letter, `False` if it's an independent letter |
| `brieven-van-hooft-categories` | `part` | This key is found on annotations that identify *parts* of letters; values are a closed vocabulary containing `greeting`, `opening`, `narratio`, `closing`, `finalgreeting` |
"""
)
return
@app.cell
def __(mo, store):
    # Vocabulary explorer, step 1: let the user pick one of the annotation
    # datasets present in the store.
    available_datasets = [ds.id() for ds in store.datasets()]
    _options = sorted(available_datasets)
    chosen_dataset = mo.ui.dropdown(
        label="Annotation Dataset:",
        value="brieven-van-hooft-metadata",
        options=_options,
    )
    mo.md(f"""
### Exploring vocabularies
You can explore the keys and values in a vocabulary. If you select any values in the table below, they will be used to constrain the letters shown in the next section.
* {chosen_dataset}
""")
    return available_datasets, chosen_dataset
@app.cell
def __(chosen_dataset, mo, store):
    # Vocabulary explorer, step 2: offer the keys of the chosen dataset.
    dataset = store.dataset(chosen_dataset.value)
    available_keys = [k.id() for k in dataset.keys()]
    # the pre-selected key is the first one as returned by dataset.keys(),
    # which is not necessarily the first one in the sorted option list
    _default_key = available_keys[0]
    chosen_key = mo.ui.dropdown(
        options=sorted(available_keys),
        label="Datakey:",
        value=_default_key,
    )
    mo.md(f"""
* {chosen_key}
""")
    return available_keys, chosen_key, dataset
@app.cell
def __(store):
    # Look up the metadata dataset and its `dbnl_id` key once, so that later
    # cells can reuse them.
    _metadata_set_id = "brieven-van-hooft-metadata"
    dataset_metadata = store.dataset(_metadata_set_id)
    key_dbnl_id = dataset_metadata.key("dbnl_id")
    return dataset_metadata, key_dbnl_id
@app.cell
def __(chosen_key, dataset, mo, natsorted, polars):
    # Vocabulary explorer, step 3: list every value of the selected key with
    # the number of annotations that use it, in a multi-select table.
    key = dataset.key(chosen_key.value)
    _rows = natsorted([(str(v), v.annotations_len()) for v in key.data()])
    vocab_dataframe = polars.DataFrame(
        data=_rows,
        schema=["Value", "Occurrences"],
        orient="row",
    )
    vocab_selection = mo.ui.table(vocab_dataframe, selection="multi")
    # bare expression: the table is this cell's output
    vocab_selection
    return key, vocab_dataframe, vocab_selection
@app.cell
def __(
    chosen_dataset,
    chosen_key,
    key_dbnl_id,
    mo,
    polars,
    store,
    vocab_selection,
):
    # Turn the values selected in the vocabulary table into a STAMQL query and
    # collect the dbnl_id of every matching annotation that is itself a letter.

    # the selected values (first column of the table) joined with "|"
    data_values = "|".join(
        str(x[0])
        for x in vocab_selection.value.select(polars.selectors.first()).iter_rows()
    )
    data_query = f"""SELECT ANNOTATION ?a WHERE DATA "{chosen_dataset.value}" "{chosen_key.value}" = "{data_values}";"""

    matching_letters = []
    # Without a selection the query would compare against the empty string;
    # skip it so that "nothing selected" can never be reported as a match.
    if data_values:
        for _annotation in store.query(data_query):
            # NOTE: only annotations that carry a dbnl_id themselves are
            # counted; matches on annotations inside a letter are not traced
            # back to the enclosing letter.
            if _annotation["a"].test_data(key_dbnl_id):
                matching_letters.append(next(_annotation["a"].data(key_dbnl_id)))

    if matching_letters:
        _md = mo.md(f"{len(matching_letters)} matching letters were found (query was: ``{data_query}``), the selection below is constrained accordingly:" )
    elif data_values:
        _md = mo.md(f"No matching letters found (query was ``{data_query}``)")
    else:
        _md = mo.md("No constraints provided, select one or more in the above table if you want to constrain the letters shown in the next section")
    _md
    return data_query, data_values, matching_letters
@app.cell
def __(key_dbnl_id, matching_letters, mo, natsorted, polars):
    # Form for choosing which letters to visualise and which annotation layers
    # to highlight.

    # list either the letters matched by the data query, or all known letters
    if matching_letters:
        _letter_ids = matching_letters
        letter_note = "*(this selection is constrained by your data query above!)*"
    else:
        _letter_ids = key_dbnl_id.data()
        letter_note = ""
    available_letters = polars.DataFrame(
        data=natsorted(str(x) for x in _letter_ids),
        schema=["dbnl_id"],
        orient="row",
    )

    chosen_letters = mo.ui.table(available_letters, selection="multi")
    # one independent checkbox per annotation layer
    (
        show_pos_annotations,
        show_lemma_annotations,
        show_part_annotations,
        show_structure_annotations,
    ) = (mo.ui.checkbox() for _ in range(4))

    mo.md(f"""
## Visualisation of Letters and Annotations
* Select one or more letters to visualise: {letter_note} {chosen_letters}
* Show part-of-speech annotations? {show_pos_annotations}
* Show lemma annotations? {show_lemma_annotations}
* Show part annotations? {show_part_annotations}
* Show structure annotations from FoLiA? {show_structure_annotations}
""")
    return (
        available_letters,
        chosen_letters,
        letter_note,
        show_lemma_annotations,
        show_part_annotations,
        show_pos_annotations,
        show_structure_annotations,
    )
@app.cell
def __(
    chosen_letters,
    mo,
    polars,
    show_lemma_annotations,
    show_part_annotations,
    show_pos_annotations,
    show_structure_annotations,
    store,
):
    # Build the STAMQL query for the selected letters (with one optional
    # highlight subquery per ticked checkbox), render it as HTML and collect
    # the metadata of the first letter the query returns.

    # values used when no letter is selected; `letter_metadata` also keeps
    # this empty table when the query yields no result at all, so it is
    # always defined at the return statement
    _html = "(no letters selected)"
    query = "(no query provided)"
    letter_metadata = polars.DataFrame()

    if not chosen_letters.value.is_empty():
        # the selected dbnl_ids joined with "|" inside a single query value
        _chosen_letters = "|".join(chosen_letters.value.to_series())
        query = f"""SELECT ANNOTATION ?letter WHERE DATA "brieven-van-hooft-metadata" "dbnl_id" = "{_chosen_letters}";"""

        _highlights = []
        if show_pos_annotations.value:
            _highlights.append("""@VALUETAG SELECT OPTIONAL ANNOTATION ?pos WHERE RELATION ?letter EMBEDS; DATA "gustave-pos" "class";""")
        if show_lemma_annotations.value:
            _highlights.append("""@VALUETAG SELECT OPTIONAL ANNOTATION ?lemma WHERE RELATION ?letter EMBEDS; DATA "gustave-lem" "class";""")
        if show_part_annotations.value:
            _highlights.append("""@VALUETAG SELECT OPTIONAL ANNOTATION ?part WHERE RELATION ?letter EMBEDS; DATA "brieven-van-hooft-categories" "part";""")
        if show_structure_annotations.value:
            _highlights.append("""@VALUETAG SELECT OPTIONAL ANNOTATION ?w WHERE RELATION ?letter EMBEDS; DATA "https://w3id.org/folia/v2/" "elementtype" = "w";""")
            _highlights.append("""@VALUETAG SELECT OPTIONAL ANNOTATION ?p WHERE RELATION ?letter EMBEDS; DATA "https://w3id.org/folia/v2/" "elementtype" = "p";""")
            _highlights.append("""@VALUETAG SELECT OPTIONAL ANNOTATION ?s WHERE RELATION ?letter EMBEDS; DATA "https://w3id.org/folia/v2/" "elementtype" = "s";""")

        # the highlight subqueries become alternatives inside one { ... } block
        if _highlights:
            query += " { " + " | ".join(_highlights) + " }"

        print(query)
        _html = store.view(query)

        # only the first result is used for the metadata table
        for _letter in store.query(query):
            letter_metadata = polars.DataFrame(((x.dataset().id(), x.key().id(), str(x)) for x in _letter["letter"].data()), schema=["Dataset","Key", "Value"],orient="row")
            break

    mo.Html(_html)
    return letter_metadata, query
@app.cell
def __(mo, query):
    # Show the query behind the visualisation and introduce the metadata table
    _explanation = f"""
The following query was used to render the above visualisation:
* ``{query}``
The table below shows all the metadata that was associated with the first selected letter:
"""
    mo.md(_explanation)
    return
@app.cell
def __(letter_metadata):
    # Bare expression: the metadata table built by the visualisation cell is
    # this cell's output.
    letter_metadata
    return
@app.cell
def __(mo, store):
    # Custom query input: a large text area wrapped in a form.
    mo.stop(store is None)
    _label = "Enter a query. Subqueries can be used to specify highlights. Use [STAMQL syntax](https://github.com/annotation/stam/tree/master/extensions/stam-query):"
    _text_area = mo.ui.text_area(label=_label, full_width=True, rows=25)
    queryform = _text_area.form()
    mo.md(f"""
## Custom Queries
{queryform}
""")
    return queryform,
@app.cell
def __(mo, queryform, store):
    # Run the submitted custom query and show the rendered result, or a short
    # notice when nothing was submitted or nothing was found.
    if queryform.value:
        _html = store.view(queryform.value)
        # a rendered view without any <h2> element is treated as "no results"
        if "<h2>" not in _html:
            _html = "(custom query did not produce any results)"
    else:
        _html = "(no custom query submitted)"
    mo.Html(_html)
    return
@app.cell
def __(mo, store):
mo.stop(store is None)
mo.md(
r"""
## Custom Query Examples
Below are various example queries in STAMQL.
You can copy any of these to the custom query input and
run them. They serve as examples which you can adapt to your own search needs.
[A formal specification of the query language](https://github.com/annotation/stam/tree/master/extensions/stam-query) is available for in-depth documentation.
### Metadata search
Show all letters to recipients born prior to 1600:
```
SELECT ANNOTATION ?letter WHERE
DATA "http://www.w3.org/ns/anno/" "type" = "Letter";
DATA "brieven-van-hooft-metadata" "birthyear" < 1600;
```
### Display a specific letter and highlight specific Part-of-Speech tags
```
SELECT ANNOTATION ?letter WHERE
DATA "brieven-van-hooft-metadata" "dbnl_id" = "hoof001hwva02_01_0032";
{
SELECT ANNOTATION ?adj WHERE
RELATION ?letter EMBEDS;
DATA "gustave-pos" "head" = "ADJ";
|
SELECT ANNOTATION ?adv WHERE
RELATION ?letter EMBEDS;
DATA "gustave-pos" "head" = "BW";
}
```
### Search for words with a specific text
In letters, search for words with a specific text:
```
SELECT ANNOTATION ?letter WHERE
DATA "http://www.w3.org/ns/anno/" "type" = "Letter";
{
SELECT ANNOTATION ?match WHERE
RELATION ?letter EMBEDS;
DATA "https://w3id.org/folia/v2/" "elementtype" = "w";
TEXT "Blaricom";
}
```
### Search for part of speech tags with a specific text
```
SELECT ANNOTATION ?letter WHERE
DATA "http://www.w3.org/ns/anno/" "type" = "Letter";
{
SELECT ANNOTATION ?match WHERE
RELATION ?letter EMBEDS;
DATA "gustave-pos" "head" = "WW";
TEXT "vlieghen";
}
```
### Search for words with one of multiple lemmas
```
SELECT ANNOTATION ?letter WHERE
DATA "http://www.w3.org/ns/anno/" "type" = "Letter";
{
SELECT ANNOTATION ?match WHERE
RELATION ?letter EMBEDS;
DATA "gustave-lem" "class" = "vreemd|raar|merkwaardig";
}
```
### Search for words with a specific text and part-of-Speech tag
This is a more complex example that explicitly searches letters for words that have a particular PoS tag:
```
SELECT ANNOTATION ?letter WHERE
DATA "http://www.w3.org/ns/anno/" "type" = "Letter";
{
SELECT ANNOTATION ?w WHERE
RELATION ?letter EMBEDS;
DATA "https://w3id.org/folia/v2/" "elementtype" = "w";
TEXT "Naerden";
{
SELECT ANNOTATION ?pos WHERE
RELATION ?w EQUALS;
DATA "gustave-pos" "head" = "N";
}
}
```
Simpler alternative:
```
SELECT ANNOTATION ?letter WHERE
DATA "http://www.w3.org/ns/anno/" "type" = "Letter";
{
SELECT TEXT ?w WHERE
RELATION ?letter EMBEDS;
TEXT "Naerden";
DATA "https://w3id.org/folia/v2/" "elementtype" = "w";
DATA "gustave-pos" "head" = "N";
}
```
### Search for a particular sequence of PoS tags
This finds combinations of: ADJ + VZ + LID
```
SELECT ANNOTATION ?letter WHERE
DATA "http://www.w3.org/ns/anno/" "type" = "Letter";
{
SELECT ANNOTATION ?adj WHERE
RELATION ?letter EMBEDS;
DATA "gustave-pos" "head" = "ADJ";
{
SELECT ANNOTATION ?vz WHERE
RELATION ?adj PRECEDES;
DATA "gustave-pos" "head" = "VZ";
{
SELECT ANNOTATION ?lid WHERE
RELATION ?vz PRECEDES;
DATA "gustave-pos" "head" = "LID";
}
}
}
```
### Search for a specific part annotation
This returns all annotations that were categorised as 'greeting':
```
SELECT ANNOTATION ?greeting WHERE
DATA "brieven-van-hooft-categories" "part" = "greeting";
```
"""
)
return
# Run the notebook as an app when this file is executed directly as a script
if __name__ == "__main__":
    app.run()