From 0a2d421353cc7530c78f49b24d63686211a391f1 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Thu, 4 Jul 2024 15:01:46 +0200 Subject: [PATCH 1/6] Add celery debug configuration --- .vscode/launch.json | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/.vscode/launch.json b/.vscode/launch.json index 9ae029d6b..786b5f0a5 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -59,6 +59,26 @@ "/*": "*", "/./~/*": "${webRoot}/node_modules/*" } + }, + { + "name": "celery", + "type": "debugpy", + "request": "launch", + "cwd": "${workspaceFolder}/backend", + "env": { + "PYTHONPATH": "${workspaceFolder}/backend" + }, + "module": "celery", + "console": "integratedTerminal", + "args": [ + "-A", + "ianalyzer.celery", + "worker", + "--pool=solo", + "--concurrency=1", + "--events", + "--loglevel=info" + ] } ], "inputs": [ From 4f5be59cd214aae7a8114947f7c38ea83add6831 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Thu, 4 Jul 2024 16:00:22 +0200 Subject: [PATCH 2/6] Hide token columns in histogram when on documents measure Resolves #1560 --- .../barchart/histogram.component.ts | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/frontend/src/app/visualization/barchart/histogram.component.ts b/frontend/src/app/visualization/barchart/histogram.component.ts index a73f630fd..24f952f09 100644 --- a/frontend/src/app/visualization/barchart/histogram.component.ts +++ b/frontend/src/app/visualization/barchart/histogram.component.ts @@ -213,28 +213,33 @@ export class HistogramComponent formatDownload: this.formatDownloadValue, isOptional: 'relative_doc_count' !== valueKey, }, - { - key: 'match_count', - label: 'Token Frequency', - format: this.formatValue('raw'), - formatDownload: this.formatDownloadValue, - isOptional: 'match_count' !== valueKey, - }, - { - key: 'matches_by_doc_count', - label: 'Relative Frequency (documents)', - format: this.formatValue('documents'), - formatDownload: this.formatDownloadValue, - isOptional: 'matches_by_doc_count' !== valueKey, - }, - { - key: 'matches_by_token_count', - label: 'Relative Frequency (terms)', - format: this.formatValue('terms'), - formatDownload: this.formatDownloadValue, - isOptional: 'matches_by_token_count' !== valueKey, - }, ]; + if (this.frequencyMeasure == 'tokens') { + // Headers related to tokens should not be applied to document visualizations + this.tableHeaders = this.tableHeaders.concat([ + { + key: 'match_count', + label: 'Token Frequency', + format: this.formatValue('raw'), + formatDownload: this.formatDownloadValue, + isOptional: 'match_count' !== valueKey, + }, + { + key: 'matches_by_doc_count', + label: 'Relative Frequency (documents)', + format: this.formatValue('documents'), + formatDownload: this.formatDownloadValue, + isOptional: 'matches_by_doc_count' !== valueKey, + }, + { + key: 'matches_by_token_count', + label: 'Relative Frequency (terms)', + format: this.formatValue('terms'), + formatDownload: this.formatDownloadValue, + isOptional: 'matches_by_token_count' !== valueKey, + }, + ]); + } } } } From b0cc71f4677e71cc87f47c43aaed6b0a6eb00374 Mon Sep 17 00:00:00 2001 From: Luka van der Plas Date: Thu, 4 Jul 2024 18:25:20 +0200 Subject: [PATCH 3/6] fix query text in csv --- backend/download/tasks.py | 11 ++--------- backend/download/tests/test_download_views.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/backend/download/tasks.py b/backend/download/tasks.py index 875f4d5bc..ccc559065 100644 --- a/backend/download/tasks.py +++ b/backend/download/tasks.py @@ -44,19 +44,12 @@ def make_download(request_json, download_id, download_size=None): es_query = api_query_to_es_query(request_json, corpus_name) results, _total = es_download.scroll( corpus_name, es_query, download_size) + filepath = create_csv.search_results_csv( - results, request_json['fields'], query, download_id) + results, request_json['fields'], query.get_query_text(es_query), download_id) return filepath -def create_query(request_json): - """ - format the route of the search into a query string - """ - route = request_json.get('route') - return re.sub(r';|%\d+', '_', re.sub(r'\$', '', route.split('/')[2])) - - def try_download(tasks_func, download): ''' Try initialising a task chain for a download. Marks the download diff --git a/backend/download/tests/test_download_views.py b/backend/download/tests/test_download_views.py index ef1c6411a..7cbbd981c 100644 --- a/backend/download/tests/test_download_views.py +++ b/backend/download/tests/test_download_views.py @@ -246,3 +246,22 @@ def test_unauthenticated_download(db, client, basic_mock_corpus, basic_corpus_pu download_objects = Download.objects.all() assert download_objects.count() == 1 assert download_objects.first().user == None + +def test_query_text_in_csv(db, client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus): + es_query = query.set_query_text(mock_match_all_query(), 'ghost') + download_request_json = { + 'corpus': basic_mock_corpus, + 'es_query': es_query, + 'fields': ['character', 'line'], + 'route': f"/search/{basic_mock_corpus}", + 'encoding': 'utf-8' + } + response = client.post('/api/download/search_results', + download_request_json, + content_type='application/json' + ) + assert status.is_success(response.status_code) + stream = read_file_response(response, 'utf-8') + reader = csv.DictReader(stream, delimiter=';') + row = next(reader) + assert row['query'] == 'ghost' From 2b4230932fa3258888e1d5dfcc5a0233fec27e1c Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Fri, 5 Jul 2024 09:48:38 +0200 Subject: [PATCH 4/6] Sort documentation page in canonical order --- backend/addcorpus/models.py | 6 +++++- backend/addcorpus/tests/test_corpus_views.py | 7 ++++++- backend/addcorpus/views.py | 18 +++++++++++++----- backend/corpora_test/basic/license/license.md | 1 + backend/corpora_test/basic/mock_csv_corpus.py | 2 ++ 5 files changed, 27 insertions(+), 7 deletions(-) create mode 100644 backend/corpora_test/basic/license/license.md diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py index ca31d9912..91dae899d 100644 --- a/backend/addcorpus/models.py +++ b/backend/addcorpus/models.py @@ -431,11 +431,12 @@ def clean(self): e ]) + class CorpusDocumentationPage(models.Model): class PageType(models.TextChoices): GENERAL = ('general', 'General information') CITATION = ('citation', 'Citation') - LICENSE = ('license', 'Licence') + LICENSE = ('license', 'License') TERMS_OF_SERVICE = ('terms_of_service', 'Terms of service') WORDMODELS = ('wordmodels', 'Word models') @@ -455,6 +456,9 @@ class PageType(models.TextChoices): help_text='markdown contents of the documentation' ) + def __str__(self): + return f'{self.corpus_configuration.corpus.name} - {self.type}' + class Meta: constraints = [ UniqueConstraint( diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py index 8fd15290c..d0080c053 100644 --- a/backend/addcorpus/tests/test_corpus_views.py +++ b/backend/addcorpus/tests/test_corpus_views.py @@ -19,9 +19,14 @@ def test_no_corpora(db, settings, admin_client): def test_corpus_documentation_view(admin_client, basic_mock_corpus, settings): response = admin_client.get(f'/api/corpus/documentation/{basic_mock_corpus}/') assert response.status_code == 200 + pages = response.data + + # check that the pages are sorted in canonical order + page_types = [page['type'] for page in pages] + assert page_types == ['General information', 'Citation', 'License'] # should contain citation guidelines - citation_page = next(page for page in response.data if page['type'] == 'Citation') + citation_page = next(page for page in pages if page['type'] == 'Citation') # check that the page template is rendered with context content = citation_page['content'] diff --git a/backend/addcorpus/views.py b/backend/addcorpus/views.py index f495fd162..a32064011 100644 --- a/backend/addcorpus/views.py +++ b/backend/addcorpus/views.py @@ -42,22 +42,30 @@ def send_corpus_file(corpus='', subdir='', filename=''): return FileResponse(open(path, 'rb')) + class CorpusDocumentationPageViewset(viewsets.ModelViewSet): permission_classes = [IsAuthenticatedOrReadOnly, CorpusAccessPermission] serializer_class = CorpusDocumentationPageSerializer - def get_queryset(self): - corpus_name = corpus_name_from_request(self.request) - pages = CorpusDocumentationPage.objects.filter(corpus_configuration__corpus__name=corpus_name) - + @staticmethod + def get_relevant_pages(pages, corpus_name): # only include wordmodels documentation if models are present if Corpus.objects.get(name=corpus_name).has_python_definition: definition = load_corpus_definition(corpus_name) if definition.word_models_present: return pages - return pages.exclude(type=CorpusDocumentationPage.PageType.WORDMODELS) + def get_queryset(self): + corpus_name = corpus_name_from_request(self.request) + pages = CorpusDocumentationPage.objects.filter( + corpus_configuration__corpus__name=corpus_name) + relevant_pages = self.get_relevant_pages(pages, corpus_name) + canonical_order = [e.value for e in CorpusDocumentationPage.PageType] + + return sorted( + relevant_pages, key=lambda p: canonical_order.index(p.type)) + class CorpusImageView(APIView): ''' diff --git a/backend/corpora_test/basic/license/license.md b/backend/corpora_test/basic/license/license.md new file mode 100644 index 000000000..4fcb91a18 --- /dev/null +++ b/backend/corpora_test/basic/license/license.md @@ -0,0 +1 @@ +Do whatever you please. diff --git a/backend/corpora_test/basic/mock_csv_corpus.py b/backend/corpora_test/basic/mock_csv_corpus.py index 84711f8a0..3b3d38360 100644 --- a/backend/corpora_test/basic/mock_csv_corpus.py +++ b/backend/corpora_test/basic/mock_csv_corpus.py @@ -25,6 +25,8 @@ class MockCSVCorpus(CSVCorpusDefinition): max_date = datetime.datetime(year=2022, month=12, day=31) data_directory = os.path.join(here, 'source_data') citation_page = 'citation.md' + license_page = 'license.md' + description_page = 'mock-csv-corpus.md' languages = ['en'] category = 'book' From a4dde03fec194ff16f402448fbbb069e20fb815e Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Fri, 5 Jul 2024 10:24:10 +0200 Subject: [PATCH 5/6] Add missing reset$ to CreateDefinitionComponent --- .../create-definition/create-definition.component.ts | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts b/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts index deba1add0..9d7e16469 100644 --- a/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts +++ b/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts @@ -5,11 +5,12 @@ import { APIEditableCorpus, CorpusDefinition } from '../../models/corpus-definit import * as _ from 'lodash'; import { Router } from '@angular/router'; import { HttpErrorResponse } from '@angular/common/http'; +import { Subject } from 'rxjs'; @Component({ selector: 'ia-create-definition', templateUrl: './create-definition.component.html', - styleUrls: ['./create-definition.component.scss'] + styleUrls: ['./create-definition.component.scss'], }) export class CreateDefinitionComponent { actionIcons = actionIcons; @@ -19,6 +20,8 @@ export class CreateDefinitionComponent { error: Error; + reset$ = new Subject(); + constructor(private apiService: ApiService, private router: Router) { this.corpus = new CorpusDefinition(apiService); } @@ -31,12 +34,15 @@ export class CreateDefinitionComponent { this.error = undefined; this.corpus.save().subscribe( (result: APIEditableCorpus) => { - this.router.navigate(['/corpus-definitions', 'edit', result.id]); + this.router.navigate([ + '/corpus-definitions', + 'edit', + result.id, + ]); }, (err: HttpErrorResponse) => { this.error = err; } ); } - } From d5cdf8c23f093b75db32c9f174c2dd4e03f13fd9 Mon Sep 17 00:00:00 2001 From: Jelte van Boheemen Date: Fri, 5 Jul 2024 10:46:17 +0200 Subject: [PATCH 6/6] Remove documentation from media test corpus --- backend/corpora_test/media/media_mock_corpus.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/corpora_test/media/media_mock_corpus.py b/backend/corpora_test/media/media_mock_corpus.py index e965ca660..6bd7be485 100644 --- a/backend/corpora_test/media/media_mock_corpus.py +++ b/backend/corpora_test/media/media_mock_corpus.py @@ -13,6 +13,8 @@ class MediaMockCorpus(MockCSVCorpus): data_directory = os.path.join(here, 'source_data') scan_image_type = 'image/png' citation_page = None + license_page = None + description_page = None def request_media(self, document, corpus_name): field_values = document['fieldValues']