Merge branch 'develop' into feature/corpus-doc-order

CentreForDigitalHumanities · Jul 5, 2024 · 945201c · 945201c
2 parents d5cdf8c + 70b3f8c
commit 945201c
Show file tree

Hide file tree

Showing 63 changed files with 787 additions and 300 deletions.
diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -3,7 +3,7 @@ FROM docker.io/library/python:3.9
 # Setting this means stdout and stderr streams are sent to terminal in real time
 ENV PYTHONUNBUFFERED 1
 # Get required libraries for xmlsec
-RUN apt-get -y update
+RUN apt-get -y update && apt-get -y upgrade
 RUN apt-get install -y pkg-config libxml2-dev libxmlsec1-dev libxmlsec1-openssl default-libmysqlclient-dev
 
 RUN pip install --upgrade pip

diff --git a/backend/addcorpus/models.py b/backend/addcorpus/models.py
@@ -4,8 +4,8 @@
 from addcorpus.validation.creation import (
     validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
     validate_mimetype,
-    validate_name_is_not_a_route_parameter, validate_search_filter,
-    validate_search_filter_with_mapping,
+    validate_name_is_not_a_route_parameter, validate_name_has_no_ner_suffix,
+    validate_search_filter, validate_search_filter_with_mapping,
     validate_searchable_field_has_full_text_search,
     validate_sort_configuration, validate_visualizations_with_mapping,
     validate_source_data_directory,
@@ -21,6 +21,8 @@
 from django.db import models
 from django.db.models.constraints import UniqueConstraint
 
+from ianalyzer.elasticsearch import elasticsearch
+
 MAX_LENGTH_NAME = 126
 MAX_LENGTH_DESCRIPTION = 254
 MAX_LENGTH_TITLE = 256
@@ -260,6 +262,20 @@ def clean(self):
                     e
                 ])
 
+    @property
+    def has_named_entities(self):
+        '''Whether the Elasticsearch mapping of `es_index` has a field whose name ends in `:ner`.'''
+        # NOTE(review): es/views.py passes a corpus name to `elasticsearch()`; confirm es_index is valid here
+        client = elasticsearch(self.es_index)
+        try:
+            mapping = client.indices.get_mapping(index=self.es_index)
+            properties = mapping[self.es_index].get('mappings', {}).get('properties', {})
+            return any(name.endswith(':ner') for name in properties.keys())
+        except Exception:
+            # best effort: a failed lookup counts as "no named entities"; unlike a
+            # bare `except:`, this lets KeyboardInterrupt and SystemExit propagate
+            return False
+
 
 FIELD_DISPLAY_TYPES = [
     ('text_content', 'text content'),
@@ -293,7 +309,8 @@ def clean(self):
 class Field(models.Model):
     name = models.SlugField(
         max_length=MAX_LENGTH_NAME,
-        validators=[validate_name_is_not_a_route_parameter],
+        validators=[validate_name_is_not_a_route_parameter,
+                    validate_name_has_no_ner_suffix],
         help_text='internal name for the field',
     )
     corpus_configuration = models.ForeignKey(

diff --git a/backend/addcorpus/serializers.py b/backend/addcorpus/serializers.py
@@ -70,6 +70,7 @@ class CorpusConfigurationSerializer(serializers.ModelSerializer):
     languages = serializers.ListField(child=LanguageField())
     category = PrettyChoiceField(choices=CATEGORIES)
     default_sort = NonEmptyJSONField()
+    has_named_entities = serializers.ReadOnlyField()
 
     class Meta:
         model = CorpusConfiguration
@@ -89,6 +90,7 @@ class Meta:
             'default_sort',
             'language_field',
             'fields',
+            'has_named_entities',
         ]
 
 

diff --git a/backend/addcorpus/validation/creation.py b/backend/addcorpus/validation/creation.py
@@ -122,6 +122,13 @@ def validate_name_is_not_a_route_parameter(value):
             f'{value} cannot be used as a field name, because it is also a route parameter'
         )
 
+
+def validate_name_has_no_ner_suffix(value):
+    '''Raise a ValidationError when the field name `value` ends in `:ner`.'''
+    reserved_suffix_used = value.endswith(':ner')
+    if reserved_suffix_used:
+        raise ValidationError(f'{value} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields')
+
 def mapping_can_be_searched(es_mapping):
     '''
     Verify if a mapping is appropriate for searching

diff --git a/backend/corpora/jewishmigration/jewishmigration.py b/backend/corpora/jewishmigration/jewishmigration.py
@@ -48,9 +48,9 @@ class JewishMigration(PeacePortal, JSONCorpusDefinition):
     min_date = datetime(year=1, month=1, day=1)
     max_date = datetime(year=1800, month=12, day=31)
 
-    data_directory = getattr(settings, 'JMIG_DATA')
-    data_url = getattr(settings, 'JMIG_DATA_URL',
-                       'localhost:8100/api/records/')
+    data_directory = settings.JMIG_DATA_DIR
+    data_filepath = getattr(settings, 'JMIG_DATA', None)
+    data_url = getattr(settings, 'JMIG_DATA_URL', None)
 
     es_index = getattr(settings, 'JMIG_INDEX', 'jewishmigration')
     image = 'jewish_inscriptions.jpg'
@@ -62,12 +62,12 @@ def sources(self, start, end):
         if self.data_url:
             response = requests.get(self.data_url)
             list_of_sources = response.json()
-        elif self.data_directory:
-            with open(self.data_directory, 'r') as f:
+        elif self.data_filepath:
+            with open(self.data_filepath, 'r') as f:
                 list_of_sources = json.load(f)
         else:
             logging.getLogger('indexing').warning(
-                'No data directory or URL provided.')
+                'No data filepath or URL provided.')
         for source in list_of_sources:
             yield source
 

diff --git a/backend/corpora/jewishmigration/test_jewishmigration.py b/backend/corpora/jewishmigration/test_jewishmigration.py
@@ -136,6 +136,7 @@ def jm_corpus_settings(settings):
     settings.CORPORA = {
         'jewishmigration': os.path.join(here, 'jewishmigration.py')
     }
+    settings.JMIG_DATA_DIR = '/corpora'
     settings.JMIG_DATA = None
     settings.JMIG_DATA_URL = 'http://www.example.com'
     settings.JMIG_INDEX = 'test-jewishmigration'

diff --git a/backend/es/conftest.py b/backend/es/conftest.py
@@ -1,4 +1,6 @@
 import pytest
+from time import sleep
+
 from django.contrib.auth.models import Group
 
 from addcorpus.python_corpora.load_corpus import load_corpus_definition
@@ -17,6 +19,27 @@ def corpus_definition(mock_corpus):
     yield corpus
 
 
+@pytest.fixture()
+def es_ner_search_client(es_client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus):
+    """
+    Add an annotated `content:ner` field and one annotated document to the
+    index of the basic mock corpus, then yield the Elasticsearch client.
+    """
+    corpus = Corpus.objects.get(name=basic_mock_corpus)
+    index = corpus.configuration.es_index
+    es_client.indices.put_mapping(
+        index=index, properties={"content:ner": {"type": "annotated_text"}})
+
+    es_client.index(index=index, document={
+        'id': 'my_identifier',
+        'content': 'Guybrush Threepwood is looking for treasure on Monkey Island',
+        'content:ner': '[Guybrush Threepwood](PER) is looking for treasure on [Monkey Island](LOC)'})
+
+    # ES is "near real time": refresh explicitly so the new document is searchable right away
+    es_client.indices.refresh(index=index)
+    yield es_client
+
+
 @pytest.fixture()
 def es_index_client(es_client, mock_corpus):
     """

diff --git a/backend/es/tests/test_named_entity_search.py b/backend/es/tests/test_named_entity_search.py
@@ -0,0 +1,48 @@
+from es.views import NamedEntitySearchView
+
+
+def test_ner_search_view(es_ner_search_client, client):
+    '''The named-entity endpoint answers with HTTP 200 for an indexed document.'''
+    url = '/api/es/mock-csv-corpus/my_identifier/named_entities'
+    assert client.get(url, content_type='application/json').status_code == 200
+
+
+def test_construct_ner_query():
+    '''The query requires the document id and an entity label in each given field.'''
+    id_clause = {
+        "term": {"id": "my_identifier"}
+    }
+    entity_clause = {
+        "terms": {
+            "content:ner": ["LOC", "PER", "ORG", "MISC"]
+        }
+    }
+    expected = {
+        "bool": {
+            "must": [id_clause, entity_clause]
+        }
+    }
+    view = NamedEntitySearchView()
+    query = view.construct_named_entity_query(
+        ['content:ner'],
+        'my_identifier',
+    )
+    assert query == expected
+
+
+def test_find_named_entity_fields(es_ner_search_client):
+    '''Only the annotated `content:ner` field of the test index is reported.'''
+    view = NamedEntitySearchView()
+    ner_fields = view.find_named_entity_fields(
+        es_ner_search_client, 'test-basic-corpus')
+    assert ner_fields == ['content:ner']
+
+
+def test_find_entities():
+    '''Annotated text is split into labelled entities and unlabelled ("flat") text.'''
+    annotated = '[Guybrush Threepwood](PER) is looking for treasure on [Monkey Island](LOC)'
+    expected = [
+        {'entity': 'person', 'text': 'Guybrush Threepwood'},
+        {'entity': 'flat', 'text': ' is looking for treasure on '},
+        {'entity': 'location', 'text': 'Monkey Island'}]
+    assert NamedEntitySearchView().find_entities(annotated) == expected
diff --git a/backend/es/urls.py b/backend/es/urls.py
@@ -1,6 +1,8 @@
 from django.urls import path
-from es.views import *
+from es.views import ForwardSearchView, NamedEntitySearchView
 
 urlpatterns = [
     path('<str:corpus>/_search', ForwardSearchView.as_view()),
+    path('<str:corpus>/<str:id>/named_entities',
+         NamedEntitySearchView.as_view())
 ]
diff --git a/backend/es/views.py b/backend/es/views.py
@@ -1,17 +1,19 @@
+import logging
+import re
+
 from django.utils import timezone
 from rest_framework.views import APIView
 from rest_framework.response import Response
-from ianalyzer.elasticsearch import elasticsearch
-from es.search import get_index, total_hits, hits
-import logging
-from rest_framework.permissions import IsAuthenticated
 from rest_framework.exceptions import APIException
+
 from addcorpus.permissions import CorpusAccessPermission
-from tag.permissions import CanSearchTags
 from api.save_query import should_save_query
 from addcorpus.models import Corpus
 from api.models import Query
 from api.api_query import api_query_to_es_query
+from es.search import get_index, total_hits, hits
+from ianalyzer.elasticsearch import elasticsearch
+from tag.permissions import CanSearchTags
 
 logger = logging.getLogger(__name__)
 
@@ -98,3 +100,80 @@ def _save_query_done(self, query, results):
         query.total_results = total_hits(results)
         query.transferred = len(hits(results))
         query.save()
+
+
+class NamedEntitySearchView(APIView):
+    ''' Construct a terms query for named entities, combined with a term query of the id
+        Perform search via Elasticsearch and reformat the output
+    '''
+    entity_dict = {
+        'PER': 'person',
+        'LOC': 'location',
+        'ORG': 'organization',
+        'MISC': 'miscellaneous'
+    }
+    # matches annotations of format "[Wally](PER)"; group 1 is the text, group 2 the label
+    annotation_pattern = re.compile(r'\[([^]]+)\]\(([A-Z]+)\)')
+
+    permission_classes = [CorpusAccessPermission]
+
+    def get(self, request, *args, **kwargs):
+        '''Respond with the annotated segments of one document, keyed by field name without `:ner`.'''
+        corpus_name = kwargs.get('corpus')
+        document_id = kwargs.get('id')
+        client = elasticsearch(corpus_name)
+        index = get_index(corpus_name)
+        fields = self.find_named_entity_fields(client, index)
+        query = self.construct_named_entity_query(fields, document_id)
+        results = hits(client.search(index=index, query=query, fields=fields))
+        annotations = {}
+        if results:
+            source = results[0]['_source']
+            for field in fields:
+                # strip only the trailing `:ner`; `replace` would also remove it inside the name
+                # a missing or null field value is treated as empty text instead of raising
+                annotations[field.removesuffix(':ner')] = self.find_entities(
+                    source.get(field) or '')
+        return Response(annotations)
+
+    def find_named_entity_fields(self, client, index: str) -> list[str]:
+        '''List the fields in the mapping of `index` named `*:ner` with type `annotated_text`.'''
+        mapping = client.indices.get_mapping(index=index)
+        fields = mapping[index]['mappings']['properties']
+        return [
+            name for name, definition in fields.items()
+            if name.endswith(':ner') and definition.get('type') == 'annotated_text'
+        ]
+
+    def construct_named_entity_query(self, fields: list[str], document_id: str) -> dict:
+        '''Build a bool query that must match `document_id` and an entity label in every field.'''
+        id_clause = {"term": {"id": document_id}}
+        return {
+            "bool": {
+                "must": [id_clause, *self.add_terms(fields)]
+            }
+        }
+
+    def add_terms(self, fields: list[str]) -> list[dict]:
+        '''Build one `terms` clause per field, matching any of the four entity labels.'''
+        return [
+            {
+                "terms": {
+                    field: ["LOC", "PER", "ORG", "MISC"]
+                }
+            } for field in fields
+        ]
+
+    def find_entities(self, input_text: str) -> list[dict]:
+        '''Split annotated text into `{'entity', 'text'}` segments; plain text gets entity `flat`.'''
+        output = []
+        position = 0
+        for match in self.annotation_pattern.finditer(input_text):
+            if match.start() > position:
+                output.append({'entity': 'flat', 'text': input_text[position:match.start()]})
+            # labels that are not in entity_dict yield entity None
+            output.append({'entity': self.entity_dict.get(match.group(2)), 'text': match.group(1)})
+            position = match.end()
+        if position < len(input_text):
+            output.append({'entity': 'flat', 'text': input_text[position:]})
+        return output
diff --git a/backend/wordmodels/views.py b/backend/wordmodels/views.py
@@ -1,7 +1,5 @@
-from django.shortcuts import render
 from rest_framework.views import APIView
 from rest_framework.response import Response
-from rest_framework.permissions import IsAuthenticated
 from addcorpus.permissions import CorpusAccessPermission, corpus_name_from_request
 from wordmodels import utils, visualisations
 from rest_framework.exceptions import APIException
@@ -11,7 +9,7 @@ class RelatedWordsView(APIView):
     Get words with the highest similarity to the query term
     '''
 
-    permission_classes = [IsAuthenticated, CorpusAccessPermission]
+    permission_classes = [CorpusAccessPermission]
 
     def post(self, request, *args, **kwargs):
         corpus = corpus_name_from_request(request)
@@ -35,7 +33,7 @@ class SimilarityView(APIView):
     Get similarity between two query terms
     '''
 
-    permission_classes = [IsAuthenticated, CorpusAccessPermission]
+    permission_classes = [CorpusAccessPermission]
 
     def get(self, request, *args, **kwargs):
         corpus = corpus_name_from_request(request)
@@ -55,7 +53,7 @@ class WordInModelView(APIView):
     Check if a word has a vector in the model for a corpus
     '''
 
-    permission_classes = [IsAuthenticated, CorpusAccessPermission]
+    permission_classes = [CorpusAccessPermission]
 
     def get(self, request, *args, **kwargs):
         corpus = corpus_name_from_request(request)

diff --git a/frontend/src/_utilities.scss b/frontend/src/_utilities.scss
@@ -30,6 +30,11 @@ $section-padding:	3rem 1.5rem;
 $boxShadow: 0 2px 3px rgba(10, 10, 10, 0.1), 0 0 0 1px rgba(10, 10, 10, 0.1);
 $boxShadowHover: 0px 5px 3px rgba(10, 10, 10, 0.1), 0 0 0 1px $primary;
 
+$entity-person:  #303F9F;
+$entity-location: #4e8f2d;
+$entity-organization: #efb71d;
+$entity-miscellaneous: #ee5986;
+
 @import "bulma/sass/utilities/_all";
 
 // based on the Bulma loader