Skip to content

Commit

Permalink
Merge pull request #1556 from UUDigitalHumanitieslab/feature/update-i…
Browse files Browse the repository at this point in the history
…analyzer-readers

Update XML corpus definitions
  • Loading branch information
lukavdplas authored Jul 18, 2024
2 parents 90030f8 + a26ca62 commit a0da9e1
Show file tree
Hide file tree
Showing 26 changed files with 709 additions and 989 deletions.
10 changes: 10 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@
}
},
{
"name": "Python: Debug Tests",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"purpose": [
"debug-test"
],
"console": "internalConsole",
"justMyCode": false
}, {
"name": "celery",
"type": "debugpy",
"request": "launch",
Expand Down
4 changes: 1 addition & 3 deletions backend/addcorpus/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@ class NewReader(CSVReader):
for f in corpus.configuration.fields.all()]

def sources(self, *args, **kwargs):
return (
(fn, {}) for fn in glob.glob(f'{self.data_directory}/**/*.csv', recursive=True)
)
return glob.glob(f'{self.data_directory}/**/*.csv', recursive=True)

return NewReader()
29 changes: 14 additions & 15 deletions backend/corpora/dbnl/dbnl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
from tqdm import tqdm
from ianalyzer_readers.xml_tag import Tag, CurrentTag, TransformTag

from django.conf import settings
from addcorpus.python_corpora.corpus import XMLCorpusDefinition, FieldDefinition
Expand All @@ -25,8 +26,8 @@ class DBNL(XMLCorpusDefinition):
languages = ['nl', 'dum', 'fr', 'la', 'fy', 'lat', 'en', 'nds', 'de', 'af']
category = 'book'

tag_toplevel = 'TEI.2'
tag_entry = { 'name': 'div', 'attrs': {'type': 'chapter'} }
tag_toplevel = Tag('TEI.2')
tag_entry = Tag('div', type='chapter')

document_context = {
'context_fields': ['title_id'],
Expand Down Expand Up @@ -261,18 +262,18 @@ def _xml_files(self):
Pass(
Backup(
XML( # get the language on chapter-level if available
CurrentTag(),
attribute='lang',
transform=lambda value: [value] if value else None,
),
XML( # look for section-level codes
{'name': 'div', 'attrs': {'type': 'section'}},
Tag('div', type='section'),
attribute='lang',
multiple=True,
),
XML( # look in the top-level metadata
'language',
Tag('language'),
toplevel=True,
recursive=True,
multiple=True,
attribute='id'
),
Expand All @@ -298,17 +299,17 @@ def _xml_files(self):
extractor=Pass(
Backup(
XML( # get the language on chapter-level if available
CurrentTag(),
attribute='lang',
),
XML( # look for section-level code
{'name': 'div', 'attrs': {'type': 'section'}},
Tag('div', type='section'),
attribute='lang'
),
XML( #otherwise, get the (first) language for the book
'language',
Tag('language'),
attribute='id',
toplevel=True,
recursive=True,
),
transform=utils.single_language_code,
),
Expand All @@ -322,13 +323,11 @@ def _xml_files(self):
display_name='Chapter',
extractor=Backup(
XML(
tag='head',
recursive=True,
Tag('head'),
flatten=True,
),
XML(
tag=utils.LINE_TAG,
recursive=True,
Tag(utils.LINE_TAG),
flatten=True,
)
),
Expand Down Expand Up @@ -359,11 +358,11 @@ def _xml_files(self):
search_field_core=True,
csv_core=True,
extractor=XML(
tag=utils.LINE_TAG,
recursive=True,
Tag(utils.LINE_TAG),
TransformTag(utils.pad_content),
multiple=True,
flatten=True,
transform_soup_func=utils.pad_content,
transform=lambda lines: '\n'.join(lines).strip() if lines else None,
),
es_mapping=main_content_mapping(token_counts=True),
visualizations=['wordcloud'],
Expand Down
14 changes: 8 additions & 6 deletions backend/corpora/dbnl/tests/test_dbnl_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,12 @@ def test_append_to_tag(xml, tag, padding, original_output, new_output):
'content': '\n'.join([
'Register der Liedekens.',
'A.',
'ACh gesalfde van den Heer. Pag. 30 ',
'Als Saul, en david den vyant in\'t velt. 41 ',
'Als ick de Son verhoogen sie. 184 ',
'Als hem de Son begeeft. 189 ',
'Als ick den Herfst aenschou. 194 ',
'Als in koelt, de nacht komt overkleeden 208 ',
'ACh gesalfde van den Heer. Pag. 30',
'Als Saul, en david den vyant in\'t velt. 41',
'Als ick de Son verhoogen sie. 184',
'Als hem de Son begeeft. 189',
'Als ick den Herfst aenschou. 194',
'Als in koelt, de nacht komt overkleeden 208',
'Als van der meer op Eng\'le-vleug\'len vloog. 232',
])
}, { # metadata-only book
Expand Down Expand Up @@ -194,6 +194,8 @@ def test_dbnl_extraction(dbnl_corpus):
for actual, expected in zip(docs, expected_docs):
# assert that actual is a superset of expected
for key in expected:
if expected[key] != actual[key]:
print(key)
assert expected[key] == actual[key]
assert expected.items() <= actual.items()

Expand Down
3 changes: 2 additions & 1 deletion backend/corpora/dbnl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,8 @@ def append_to_tag(soup, tag, padding):
def pad_content(node):
pad_cells = lambda n: append_to_tag(n, 'cell', ' ')
pad_linebreaks = lambda n: append_to_tag(n, 'lb', '\n')
return pad_cells(pad_linebreaks(node))
pad_cells(pad_linebreaks(node))
return [node]

def standardize_language_code(code):
if code:
Expand Down
10 changes: 4 additions & 6 deletions backend/corpora/dutchannualreports/dutchannualreports.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os.path as op
import logging
from datetime import datetime
from ianalyzer_readers.xml_tag import Tag

from django.conf import settings

Expand All @@ -20,7 +21,6 @@
class DutchAnnualReports(XMLCorpusDefinition):
""" Alto XML corpus of Dutch annual reports. """

# Data overrides from .common.Corpus (fields at bottom of class)
title = "Dutch Annual Reports"
description = "Annual reports of Dutch financial and non-financial institutes"
min_date = datetime(year=1957, month=1, day=1)
Expand All @@ -38,9 +38,8 @@ class DutchAnnualReports(XMLCorpusDefinition):

mimetype = 'application/pdf'

# Data overrides from .common.XMLCorpus
tag_toplevel = 'alto'
tag_entry = 'Page'
tag_toplevel = Tag('alto')
tag_entry = Tag('Page')

# New data members
non_xml_msg = 'Skipping non-XML file {}'
Expand Down Expand Up @@ -187,9 +186,8 @@ def sources(self, start=min_date, end=max_date):
description='Text content of the page.',
results_overview=True,
extractor=XML(
tag='String',
Tag('String'),
attribute='CONTENT',
recursive=True,
multiple=True,
transform=lambda x: ' '.join(x),
),
Expand Down
61 changes: 24 additions & 37 deletions backend/corpora/dutchnewspapers/dutchnewspapers_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from datetime import datetime
from os.path import join, split, splitext
import os
from ianalyzer_readers.xml_tag import Tag, SiblingTag

from django.conf import settings

Expand Down Expand Up @@ -43,8 +44,9 @@ class DutchNewspapersPublic(XMLCorpusDefinition):
def es_settings(self):
return es_settings(self.languages[:1], stopword_analysis=True, stemming_analysis=True)

tag_toplevel = 'text'
tag_entry = 'p'
tag_toplevel = Tag('text')
tag_entry = Tag('p')
external_file_tag_toplevel = Tag('DIDL')

# New data members
definition_pattern = re.compile(r'didl')
Expand Down Expand Up @@ -137,18 +139,10 @@ def fields(self):
description="Link to record on Delpher",
display_type='url',
es_mapping=keyword_mapping(),
extractor=XML(tag='identifier',
toplevel=True,
recursive=True,
multiple=False,
secondary_tag={
'tag': 'recordIdentifier',
'match': 'id'
},
external_file={
'xml_tag_toplevel': 'DIDL',
'xml_tag_entry': 'dcx'
}
extractor=XML(
lambda metadata: Tag('recordIdentifier', string=metadata['id']),
SiblingTag('identifier'),
external_file=True
)
),
FieldDefinition(
Expand Down Expand Up @@ -179,13 +173,9 @@ def fields(self):
'indicator is in this range.'
)
),
extractor=XML(tag='OCRConfidencelevel',
toplevel=True,
recursive=True,
external_file={
'xml_tag_toplevel': 'DIDL',
'xml_tag_entry': 'dcx'
},
extractor=XML(
Tag('OCRConfidencelevel'),
external_file=True,
transform=lambda x: float(x)*100
),
sortable=True
Expand Down Expand Up @@ -225,19 +215,11 @@ def fields(self):
description='Whether the item is an article, advertisment, etc.',
csv_core=True,
es_mapping={'type': 'keyword'},
extractor=XML(tag='subject',
toplevel=True,
recursive=True,
multiple=False,
secondary_tag={
'tag': 'recordIdentifier',
'match': 'id'
},
external_file={
'xml_tag_toplevel': 'DIDL',
'xml_tag_entry': 'dcx'
}
),
extractor=XML(
lambda metadata: Tag('recordIdentifier', string=metadata['id']),
SiblingTag('subject'),
external_file=True
),
search_filter=filters.MultipleChoiceFilter(
description='Accept only articles in these categories.',
option_count=2,
Expand Down Expand Up @@ -276,7 +258,7 @@ def fields(self):
description='Article title',
results_overview=True,
search_field_core=True,
extractor=XML(tag='title', flatten=True, toplevel=True)
extractor=XML(Tag('title'), flatten=True, toplevel=True)
),
FieldDefinition(
name='id',
Expand Down Expand Up @@ -320,8 +302,13 @@ def fields(self):
es_mapping=main_content_mapping(True, True, True, 'nl'),
results_overview=True,
search_field_core=True,
extractor=XML(tag='p', multiple=True,
flatten=True, toplevel=True),
extractor=XML(
Tag('p'),
multiple=True,
flatten=True,
toplevel=True,
transform='\n'.join,
),
visualizations=["wordcloud"],
language='nl',
),
Expand Down
8 changes: 4 additions & 4 deletions backend/corpora/ecco/ecco.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from datetime import datetime
import logging
import re
from ianalyzer_readers.xml_tag import Tag

from django.conf import settings

Expand Down Expand Up @@ -37,8 +38,8 @@ class Ecco(XMLCorpusDefinition):
languages = ['en', 'fr', 'la', 'grc', 'de', 'it', 'cy', 'ga', 'gd']
category = 'book'

tag_toplevel = 'pageContent'
tag_entry = 'page'
tag_toplevel = Tag('pageContent')
tag_entry = Tag('page')

meta_pattern = re.compile('^\d+\_DocMetadata\.xml$')

Expand Down Expand Up @@ -153,8 +154,7 @@ def fields(self):
description='Text content.',
results_overview=True,
search_field_core=True,
extractor=XML(tag='ocrText',
flatten=True),
extractor=XML(Tag('ocrText'), flatten=True),
visualizations=['wordcloud']
),
FieldDefinition(
Expand Down
Loading

0 comments on commit a0da9e1

Please sign in to comment.