Merge branch 'develop' into bugfix/nn-reset

CentreForDigitalHumanities · Jul 5, 2024 · 20d25dc
2 parents e9ee10f + 6093a65
commit 20d25dc
Show file tree

Hide file tree

Showing 11 changed files with 105 additions and 40 deletions.
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -59,6 +59,26 @@
                 "/*": "*",
                 "/./~/*": "${webRoot}/node_modules/*"
             }
+        },
+        {
+            "name": "celery",
+            "type": "debugpy",
+            "request": "launch",
+            "cwd": "${workspaceFolder}/backend",
+            "env": {
+                "PYTHONPATH": "${workspaceFolder}/backend"
+            },
+            "module": "celery",
+            "console": "integratedTerminal",
+            "args": [
+                "-A",
+                "ianalyzer.celery",
+                "worker",
+                "--pool=solo",
+                "--concurrency=1",
+                "--events",
+                "--loglevel=info"
+            ]
         }
     ],
     "inputs": [

diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py
@@ -448,11 +448,12 @@ def clean(self):
                     e
                 ])
 
+
 class CorpusDocumentationPage(models.Model):
     class PageType(models.TextChoices):
         GENERAL = ('general', 'General information')
         CITATION = ('citation', 'Citation')
-        LICENSE = ('license', 'Licence')
+        LICENSE = ('license', 'License')
         TERMS_OF_SERVICE = ('terms_of_service', 'Terms of service')
         WORDMODELS = ('wordmodels', 'Word models')
 
@@ -472,6 +473,9 @@ class PageType(models.TextChoices):
         help_text='markdown contents of the documentation'
     )
 
+    def __str__(self):
+        return f'{self.corpus_configuration.corpus.name} - {self.type}'
+
     class Meta:
         constraints = [
             UniqueConstraint(

diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py
@@ -19,9 +19,14 @@ def test_no_corpora(db, settings, admin_client):
 def test_corpus_documentation_view(admin_client, basic_mock_corpus, settings):
     response = admin_client.get(f'/api/corpus/documentation/{basic_mock_corpus}/')
     assert response.status_code == 200
+    pages = response.data
+
+    # check that the pages are sorted in canonical order
+    page_types = [page['type'] for page in pages]
+    assert page_types == ['General information', 'Citation', 'License']
 
     # should contain citation guidelines
-    citation_page = next(page for page in response.data if page['type'] == 'Citation')
+    citation_page = next(page for page in pages if page['type'] == 'Citation')
 
     # check that the page template is rendered with context
     content = citation_page['content']

diff --git a/backend/addcorpus/views.py b/backend/addcorpus/views.py
@@ -42,22 +42,30 @@ def send_corpus_file(corpus='', subdir='', filename=''):
 
     return FileResponse(open(path, 'rb'))
 
+
 class CorpusDocumentationPageViewset(viewsets.ModelViewSet):
     permission_classes = [IsAuthenticatedOrReadOnly, CorpusAccessPermission]
     serializer_class = CorpusDocumentationPageSerializer
 
-    def get_queryset(self):
-        corpus_name = corpus_name_from_request(self.request)
-        pages = CorpusDocumentationPage.objects.filter(corpus_configuration__corpus__name=corpus_name)
-
+    @staticmethod
+    def get_relevant_pages(pages, corpus_name):
         # only include wordmodels documentation if models are present
         if Corpus.objects.get(name=corpus_name).has_python_definition:
             definition = load_corpus_definition(corpus_name)
             if definition.word_models_present:
                 return pages
-
         return pages.exclude(type=CorpusDocumentationPage.PageType.WORDMODELS)
 
+    def get_queryset(self):
+        corpus_name = corpus_name_from_request(self.request)
+        pages = CorpusDocumentationPage.objects.filter(
+            corpus_configuration__corpus__name=corpus_name)
+        relevant_pages = self.get_relevant_pages(pages, corpus_name)
+        canonical_order = [e.value for e in CorpusDocumentationPage.PageType]
+
+        return sorted(
+            relevant_pages, key=lambda p: canonical_order.index(p.type))
+
 
 class CorpusImageView(APIView):
     '''

diff --git a/backend/corpora_test/basic/license/license.md b/backend/corpora_test/basic/license/license.md
@@ -0,0 +1 @@
+Do whatever you please.
diff --git a/backend/corpora_test/basic/mock_csv_corpus.py b/backend/corpora_test/basic/mock_csv_corpus.py
@@ -25,6 +25,8 @@ class MockCSVCorpus(CSVCorpusDefinition):
     max_date = datetime.datetime(year=2022, month=12, day=31)
     data_directory = os.path.join(here, 'source_data')
     citation_page = 'citation.md'
+    license_page = 'license.md'
+    description_page = 'mock-csv-corpus.md'
 
     languages = ['en']
     category = 'book'

diff --git a/backend/corpora_test/media/media_mock_corpus.py b/backend/corpora_test/media/media_mock_corpus.py
@@ -13,6 +13,8 @@ class MediaMockCorpus(MockCSVCorpus):
     data_directory = os.path.join(here, 'source_data')
     scan_image_type = 'image/png'
     citation_page = None
+    license_page = None
+    description_page = None
 
     def request_media(self, document, corpus_name):
         field_values = document['fieldValues']

diff --git a/backend/download/tasks.py b/backend/download/tasks.py
@@ -44,19 +44,12 @@ def make_download(request_json, download_id, download_size=None):
     es_query = api_query_to_es_query(request_json, corpus_name)
     results, _total = es_download.scroll(
         corpus_name, es_query, download_size)
+
     filepath = create_csv.search_results_csv(
-        results, request_json['fields'], query, download_id)
+        results, request_json['fields'], query.get_query_text(es_query), download_id)
     return filepath
 
 
-def create_query(request_json):
-    """
-    format the route of the search into a query string
-    """
-    route = request_json.get('route')
-    return re.sub(r';|%\d+', '_', re.sub(r'\$', '', route.split('/')[2]))
-
-
 def try_download(tasks_func, download):
     '''
     Try initialising a task chain for a download. Marks the download

diff --git a/backend/download/tests/test_download_views.py b/backend/download/tests/test_download_views.py
@@ -246,3 +246,22 @@ def test_unauthenticated_download(db, client, basic_mock_corpus, basic_corpus_pu
     download_objects = Download.objects.all()
     assert download_objects.count() == 1
     assert download_objects.first().user == None
+
+def test_query_text_in_csv(db, client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus):
+    es_query = query.set_query_text(mock_match_all_query(), 'ghost')
+    download_request_json = {
+        'corpus': basic_mock_corpus,
+        'es_query': es_query,
+        'fields': ['character', 'line'],
+        'route': f"/search/{basic_mock_corpus}",
+        'encoding': 'utf-8'
+    }
+    response = client.post('/api/download/search_results',
+                           download_request_json,
+                           content_type='application/json'
+                           )
+    assert status.is_success(response.status_code)
+    stream = read_file_response(response, 'utf-8')
+    reader = csv.DictReader(stream, delimiter=';')
+    row = next(reader)
+    assert row['query'] == 'ghost'
diff --git a/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts b/frontend/src/app/corpus-definitions/create-definition/create-definition.component.ts
@@ -5,11 +5,12 @@ import { APIEditableCorpus, CorpusDefinition } from '../../models/corpus-definit
 import * as _ from 'lodash';
 import { Router } from '@angular/router';
 import { HttpErrorResponse } from '@angular/common/http';
+import { Subject } from 'rxjs';
 
 @Component({
     selector: 'ia-create-definition',
     templateUrl: './create-definition.component.html',
-    styleUrls: ['./create-definition.component.scss']
+    styleUrls: ['./create-definition.component.scss'],
 })
 export class CreateDefinitionComponent {
     actionIcons = actionIcons;
@@ -19,6 +20,8 @@ export class CreateDefinitionComponent {
 
     error: Error;
 
+    reset$ = new Subject<void>();
+
     constructor(private apiService: ApiService, private router: Router) {
         this.corpus = new CorpusDefinition(apiService);
     }
@@ -31,12 +34,15 @@ export class CreateDefinitionComponent {
         this.error = undefined;
         this.corpus.save().subscribe(
             (result: APIEditableCorpus) => {
-                this.router.navigate(['/corpus-definitions', 'edit', result.id]);
+                this.router.navigate([
+                    '/corpus-definitions',
+                    'edit',
+                    result.id,
+                ]);
             },
             (err: HttpErrorResponse) => {
                 this.error = err;
             }
         );
     }
-
 }
diff --git a/frontend/src/app/visualization/barchart/histogram.component.ts b/frontend/src/app/visualization/barchart/histogram.component.ts
@@ -213,28 +213,33 @@ export class HistogramComponent
                     formatDownload: this.formatDownloadValue,
                     isOptional: 'relative_doc_count' !== valueKey,
                 },
-                {
-                    key: 'match_count',
-                    label: 'Token Frequency',
-                    format: this.formatValue('raw'),
-                    formatDownload: this.formatDownloadValue,
-                    isOptional: 'match_count' !== valueKey,
-                },
-                {
-                    key: 'matches_by_doc_count',
-                    label: 'Relative Frequency (documents)',
-                    format: this.formatValue('documents'),
-                    formatDownload: this.formatDownloadValue,
-                    isOptional: 'matches_by_doc_count' !== valueKey,
-                },
-                {
-                    key: 'matches_by_token_count',
-                    label: 'Relative Frequency (terms)',
-                    format: this.formatValue('terms'),
-                    formatDownload: this.formatDownloadValue,
-                    isOptional: 'matches_by_token_count' !== valueKey,
-                },
             ];
+            if (this.frequencyMeasure == 'tokens') {
+                // Headers related to tokens should not be applied to document visualizations
+                this.tableHeaders = this.tableHeaders.concat([
+                    {
+                        key: 'match_count',
+                        label: 'Token Frequency',
+                        format: this.formatValue('raw'),
+                        formatDownload: this.formatDownloadValue,
+                        isOptional: 'match_count' !== valueKey,
+                    },
+                    {
+                        key: 'matches_by_doc_count',
+                        label: 'Relative Frequency (documents)',
+                        format: this.formatValue('documents'),
+                        formatDownload: this.formatDownloadValue,
+                        isOptional: 'matches_by_doc_count' !== valueKey,
+                    },
+                    {
+                        key: 'matches_by_token_count',
+                        label: 'Relative Frequency (terms)',
+                        format: this.formatValue('terms'),
+                        formatDownload: this.formatDownloadValue,
+                        isOptional: 'matches_by_token_count' !== valueKey,
+                    },
+                ]);
+            }
         }
     }
 }