From 0a2d421353cc7530c78f49b24d63686211a391f1 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen <j.vanboheemen@uu.nl>
Date: Thu, 4 Jul 2024 15:01:46 +0200
Subject: [PATCH 1/6] Add celery debug configuration

---
 .vscode/launch.json | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 9ae029d6b..786b5f0a5 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -59,6 +59,26 @@
                 "/*": "*",
                 "/./~/*": "${webRoot}/node_modules/*"
             }
+        },
+        {
+            "name": "celery",
+            "type": "debugpy",
+            "request": "launch",
+            "cwd": "${workspaceFolder}/backend",
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}/backend"
+            },
+            "module": "celery",
+            "console": "integratedTerminal",
+            "args": [
+                "-A",
+                "ianalyzer.celery",
+                "worker",
+                "--pool=solo",
+                "--concurrency=1",
+                "--events",
+                "--loglevel=info"
+            ]
         }
     ],
     "inputs": [

From 4f5be59cd214aae7a8114947f7c38ea83add6831 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen <j.vanboheemen@uu.nl>
Date: Thu, 4 Jul 2024 16:00:22 +0200
Subject: [PATCH 2/6] Hide token columns in histogram when on documents measure

Resolves #1560
---
 .../barchart/histogram.component.ts           | 47 ++++++++++---------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/frontend/src/app/visualization/barchart/histogram.component.ts b/frontend/src/app/visualization/barchart/histogram.component.ts
index a73f630fd..24f952f09 100644
--- a/frontend/src/app/visualization/barchart/histogram.component.ts
+++ b/frontend/src/app/visualization/barchart/histogram.component.ts
@@ -213,28 +213,33 @@ export class HistogramComponent
                     formatDownload: this.formatDownloadValue,
                     isOptional: 'relative_doc_count' !== valueKey,
                 },
-                {
-                    key: 'match_count',
-                    label: 'Token Frequency',
-                    format: this.formatValue('raw'),
-                    formatDownload: this.formatDownloadValue,
-                    isOptional: 'match_count' !== valueKey,
-                },
-                {
-                    key: 'matches_by_doc_count',
-                    label: 'Relative Frequency (documents)',
-                    format: this.formatValue('documents'),
-                    formatDownload: this.formatDownloadValue,
-                    isOptional: 'matches_by_doc_count' !== valueKey,
-                },
-                {
-                    key: 'matches_by_token_count',
-                    label: 'Relative Frequency (terms)',
-                    format: this.formatValue('terms'),
-                    formatDownload: this.formatDownloadValue,
-                    isOptional: 'matches_by_token_count' !== valueKey,
-                },
             ];
+            if (this.frequencyMeasure === 'tokens') {
+                // Token-based columns only apply to the tokens measure; omit them for documents
+                this.tableHeaders = this.tableHeaders.concat([
+                    {
+                        key: 'match_count',
+                        label: 'Token Frequency',
+                        format: this.formatValue('raw'),
+                        formatDownload: this.formatDownloadValue,
+                        isOptional: 'match_count' !== valueKey,
+                    },
+                    {
+                        key: 'matches_by_doc_count',
+                        label: 'Relative Frequency (documents)',
+                        format: this.formatValue('documents'),
+                        formatDownload: this.formatDownloadValue,
+                        isOptional: 'matches_by_doc_count' !== valueKey,
+                    },
+                    {
+                        key: 'matches_by_token_count',
+                        label: 'Relative Frequency (terms)',
+                        format: this.formatValue('terms'),
+                        formatDownload: this.formatDownloadValue,
+                        isOptional: 'matches_by_token_count' !== valueKey,
+                    },
+                ]);
+            }
         }
     }
 }

From b0cc71f4677e71cc87f47c43aaed6b0a6eb00374 Mon Sep 17 00:00:00 2001
From: Luka van der Plas <lukavdplas@mailfence.com>
Date: Thu, 4 Jul 2024 18:25:20 +0200
Subject: [PATCH 3/6] Fix query text in CSV download

---
 backend/download/tasks.py                     | 11 ++---------
 backend/download/tests/test_download_views.py | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/backend/download/tasks.py b/backend/download/tasks.py
index 875f4d5bc..ccc559065 100644
--- a/backend/download/tasks.py
+++ b/backend/download/tasks.py
@@ -44,19 +44,12 @@ def make_download(request_json, download_id, download_size=None):
     es_query = api_query_to_es_query(request_json, corpus_name)
     results, _total = es_download.scroll(
         corpus_name, es_query, download_size)
+
     filepath = create_csv.search_results_csv(
-        results, request_json['fields'], query, download_id)
+        results, request_json['fields'], query.get_query_text(es_query), download_id)
     return filepath
 
 
-def create_query(request_json):
-    """
-    format the route of the search into a query string
-    """
-    route = request_json.get('route')
-    return re.sub(r';|%\d+', '_', re.sub(r'\$', '', route.split('/')[2]))
-
-
 def try_download(tasks_func, download):
     '''
     Try initialising a task chain for a download. Marks the download
diff --git a/backend/download/tests/test_download_views.py b/backend/download/tests/test_download_views.py
index ef1c6411a..7cbbd981c 100644
--- a/backend/download/tests/test_download_views.py
+++ b/backend/download/tests/test_download_views.py
@@ -246,3 +246,22 @@ def test_unauthenticated_download(db, client, basic_mock_corpus, basic_corpus_pu
     download_objects = Download.objects.all()
     assert download_objects.count() == 1
     assert download_objects.first().user == None
+
+def test_query_text_in_csv(db, client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus):
+    # the `query` column of the downloaded CSV should hold the searched text
+    search_text = 'ghost'
+    payload = {
+        'corpus': basic_mock_corpus,
+        'es_query': query.set_query_text(mock_match_all_query(), search_text),
+        'fields': ['character', 'line'],
+        'route': f"/search/{basic_mock_corpus}",
+        'encoding': 'utf-8'
+    }
+    response = client.post(
+        '/api/download/search_results', payload, content_type='application/json')
+    assert status.is_success(response.status_code)
+
+    csv_stream = read_file_response(response, 'utf-8')
+    rows = csv.DictReader(csv_stream, delimiter=';')
+    first_row = next(rows)
+    assert first_row['query'] == search_text

From 2b4230932fa3258888e1d5dfcc5a0233fec27e1c Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen <j.vanboheemen@uu.nl>
Date: Fri, 5 Jul 2024 09:48:38 +0200
Subject: [PATCH 4/6] Sort documentation page in canonical order

---
 backend/addcorpus/models.py                   |  6 +++++-
 backend/addcorpus/tests/test_corpus_views.py  |  7 ++++++-
 backend/addcorpus/views.py                    | 18 +++++++++++++-----
 backend/corpora_test/basic/license/license.md |  1 +
 backend/corpora_test/basic/mock_csv_corpus.py |  2 ++
 5 files changed, 27 insertions(+), 7 deletions(-)
 create mode 100644 backend/corpora_test/basic/license/license.md

diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py
index ca31d9912..91dae899d 100644
--- a/backend/addcorpus/models.py
+++ b/backend/addcorpus/models.py
@@ -431,11 +431,12 @@ def clean(self):
                     e
                 ])
 
+
 class CorpusDocumentationPage(models.Model):
     class PageType(models.TextChoices):
         GENERAL = ('general', 'General information')
         CITATION = ('citation', 'Citation')
-        LICENSE = ('license', 'Licence')
+        LICENSE = ('license', 'License')
         TERMS_OF_SERVICE = ('terms_of_service', 'Terms of service')
         WORDMODELS = ('wordmodels', 'Word models')
 
@@ -455,6 +456,9 @@ class PageType(models.TextChoices):
         help_text='markdown contents of the documentation'
     )
 
+    def __str__(self):  # label of the form "<corpus name> - <page type>"
+        return f'{self.corpus_configuration.corpus.name} - {self.type}'
+
     class Meta:
         constraints = [
             UniqueConstraint(
diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py
index 8fd15290c..d0080c053 100644
--- a/backend/addcorpus/tests/test_corpus_views.py
+++ b/backend/addcorpus/tests/test_corpus_views.py
@@ -19,9 +19,14 @@ def test_no_corpora(db, settings, admin_client):
 def test_corpus_documentation_view(admin_client, basic_mock_corpus, settings):
     response = admin_client.get(f'/api/corpus/documentation/{basic_mock_corpus}/')
     assert response.status_code == 200
+    pages = response.data
+
+    # check that the pages are sorted in canonical order
+    page_types = [page['type'] for page in pages]
+    assert page_types == ['General information', 'Citation', 'License']
 
     # should contain citation guidelines
-    citation_page = next(page for page in response.data if page['type'] == 'Citation')
+    citation_page = next(page for page in pages if page['type'] == 'Citation')
 
     # check that the page template is rendered with context
     content = citation_page['content']
diff --git a/backend/addcorpus/views.py b/backend/addcorpus/views.py
index f495fd162..a32064011 100644
--- a/backend/addcorpus/views.py
+++ b/backend/addcorpus/views.py
@@ -42,22 +42,30 @@ def send_corpus_file(corpus='', subdir='', filename=''):
 
     return FileResponse(open(path, 'rb'))
 
+
 class CorpusDocumentationPageViewset(viewsets.ModelViewSet):
     permission_classes = [IsAuthenticatedOrReadOnly, CorpusAccessPermission]
     serializer_class = CorpusDocumentationPageSerializer
 
-    def get_queryset(self):
-        corpus_name = corpus_name_from_request(self.request)
-        pages = CorpusDocumentationPage.objects.filter(corpus_configuration__corpus__name=corpus_name)
-
+    @staticmethod
+    def get_relevant_pages(pages, corpus_name):  # returns pages, without WORDMODELS docs unless word models are present
         # only include wordmodels documentation if models are present
         if Corpus.objects.get(name=corpus_name).has_python_definition:
             definition = load_corpus_definition(corpus_name)
             if definition.word_models_present:
                 return pages
-
         return pages.exclude(type=CorpusDocumentationPage.PageType.WORDMODELS)
 
+    def get_queryset(self):
+        """Documentation pages of the requested corpus, in `PageType` order."""
+        # Rank in SQL so the result stays a QuerySet; DRF detail routes need one.
+        from django.db.models import Case, IntegerField, Value, When
+        name = corpus_name_from_request(self.request)
+        pages = CorpusDocumentationPage.objects.filter(corpus_configuration__corpus__name=name)
+        rank = Case(*[When(type=t.value, then=Value(i)) for i, t in enumerate(
+            CorpusDocumentationPage.PageType)], output_field=IntegerField())
+        return self.get_relevant_pages(pages, name).order_by(rank)
+
 
 class CorpusImageView(APIView):
     '''
diff --git a/backend/corpora_test/basic/license/license.md b/backend/corpora_test/basic/license/license.md
new file mode 100644
index 000000000..4fcb91a18
--- /dev/null
+++ b/backend/corpora_test/basic/license/license.md
@@ -0,0 +1 @@
+Do whatever you please.
diff --git a/backend/corpora_test/basic/mock_csv_corpus.py b/backend/corpora_test/basic/mock_csv_corpus.py
index 84711f8a0..3b3d38360 100644
--- a/backend/corpora_test/basic/mock_csv_corpus.py
+++ b/backend/corpora_test/basic/mock_csv_corpus.py
@@ -25,6 +25,8 @@ class MockCSVCorpus(CSVCorpusDefinition):
     max_date = datetime.datetime(year=2022, month=12, day=31)
     data_directory = os.path.join(here, 'source_data')
     citation_page = 'citation.md'
+    license_page = 'license.md'
+    description_page = 'mock-csv-corpus.md'
 
     languages = ['en']
     category = 'book'

From a4dde03fec194ff16f402448fbbb069e20fb815e Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen <j.vanboheemen@uu.nl>
Date: Fri, 5 Jul 2024 10:24:10 +0200
Subject: [PATCH 5/6] Add missing reset$ to CreateDefinitionComponent

---
 .../create-definition/create-definition.component.ts | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts b/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts
index deba1add0..9d7e16469 100644
--- a/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts
+++ b/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts
@@ -5,11 +5,12 @@ import { APIEditableCorpus, CorpusDefinition } from '../../models/corpus-definit
 import * as _ from 'lodash';
 import { Router } from '@angular/router';
 import { HttpErrorResponse } from '@angular/common/http';
+import { Subject } from 'rxjs';
 
 @Component({
     selector: 'ia-create-definition',
     templateUrl: './create-definition.component.html',
-    styleUrls: ['./create-definition.component.scss']
+    styleUrls: ['./create-definition.component.scss'],
 })
 export class CreateDefinitionComponent {
     actionIcons = actionIcons;
@@ -19,6 +20,8 @@ export class CreateDefinitionComponent {
 
     error: Error;
 
+    reset$ = new Subject<void>();
+
     constructor(private apiService: ApiService, private router: Router) {
         this.corpus = new CorpusDefinition(apiService);
     }
@@ -31,12 +34,15 @@ export class CreateDefinitionComponent {
         this.error = undefined;
         this.corpus.save().subscribe(
             (result: APIEditableCorpus) => {
-                this.router.navigate(['/corpus-definitions', 'edit', result.id]);
+                this.router.navigate([
+                    '/corpus-definitions',
+                    'edit',
+                    result.id,
+                ]);
             },
             (err: HttpErrorResponse) => {
                 this.error = err;
             }
         );
     }
-
 }

From d5cdf8c23f093b75db32c9f174c2dd4e03f13fd9 Mon Sep 17 00:00:00 2001
From: Jelte van Boheemen <j.vanboheemen@uu.nl>
Date: Fri, 5 Jul 2024 10:46:17 +0200
Subject: [PATCH 6/6] Remove documentation from media test corpus

---
 backend/corpora_test/media/media_mock_corpus.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/corpora_test/media/media_mock_corpus.py b/backend/corpora_test/media/media_mock_corpus.py
index e965ca660..6bd7be485 100644
--- a/backend/corpora_test/media/media_mock_corpus.py
+++ b/backend/corpora_test/media/media_mock_corpus.py
@@ -13,6 +13,8 @@ class MediaMockCorpus(MockCSVCorpus):
     data_directory = os.path.join(here, 'source_data')
     scan_image_type = 'image/png'
     citation_page = None
+    license_page = None
+    description_page = None
 
     def request_media(self, document, corpus_name):
         field_values = document['fieldValues']