Skip to content

Commit

Permalink
Merge branch 'develop' into feature/corpus-doc-order
Browse files Browse the repository at this point in the history
  • Loading branch information
JeltevanBoheemen authored Jul 5, 2024
2 parents d5cdf8c + 70b3f8c commit 945201c
Show file tree
Hide file tree
Showing 63 changed files with 787 additions and 300 deletions.
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM docker.io/library/python:3.9
# Setting this means stdout and stderr streams are sent to terminal in real time
ENV PYTHONUNBUFFERED 1
# Get required libraries for xmlsec
RUN apt-get -y update
RUN apt-get -y update && apt-get -y upgrade
RUN apt-get install -y pkg-config libxml2-dev libxmlsec1-dev libxmlsec1-openssl default-libmysqlclient-dev

RUN pip install --upgrade pip
Expand Down
23 changes: 20 additions & 3 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from addcorpus.validation.creation import (
validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
validate_mimetype,
validate_name_is_not_a_route_parameter, validate_search_filter,
validate_search_filter_with_mapping,
validate_name_is_not_a_route_parameter, validate_name_has_no_ner_suffix,
validate_search_filter, validate_search_filter_with_mapping,
validate_searchable_field_has_full_text_search,
validate_sort_configuration, validate_visualizations_with_mapping,
validate_source_data_directory,
Expand All @@ -21,6 +21,8 @@
from django.db import models
from django.db.models.constraints import UniqueConstraint

from ianalyzer.elasticsearch import elasticsearch

MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
MAX_LENGTH_TITLE = 256
Expand Down Expand Up @@ -260,6 +262,20 @@ def clean(self):
e
])

@property
def has_named_entities(self):
    '''
    Whether the Elasticsearch index of this configuration has named entity fields.

    Looks up the mapping of `self.es_index` and reports whether any top-level
    property name ends with the `:ner` suffix.

    This is a best-effort check: if the mapping cannot be retrieved or does
    not have the expected shape, the result is `False` rather than an error.
    '''
    client = elasticsearch(self.es_index)
    try:
        mapping = client.indices.get_mapping(index=self.es_index)
        properties = mapping[self.es_index].get(
            'mappings', {}).get('properties', {})
    except Exception:
        # Deliberately broad (the index may be missing, unreachable, or the
        # response may be keyed differently), but no longer a bare `except:`,
        # which would also swallow KeyboardInterrupt and SystemExit.
        return False
    return any(name.endswith(':ner') for name in properties)


FIELD_DISPLAY_TYPES = [
('text_content', 'text content'),
Expand Down Expand Up @@ -293,7 +309,8 @@ def clean(self):
class Field(models.Model):
name = models.SlugField(
max_length=MAX_LENGTH_NAME,
validators=[validate_name_is_not_a_route_parameter],
validators=[validate_name_is_not_a_route_parameter,
validate_name_has_no_ner_suffix],
help_text='internal name for the field',
)
corpus_configuration = models.ForeignKey(
Expand Down
2 changes: 2 additions & 0 deletions backend/addcorpus/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class CorpusConfigurationSerializer(serializers.ModelSerializer):
languages = serializers.ListField(child=LanguageField())
category = PrettyChoiceField(choices=CATEGORIES)
default_sort = NonEmptyJSONField()
has_named_entities = serializers.ReadOnlyField()

class Meta:
model = CorpusConfiguration
Expand All @@ -89,6 +90,7 @@ class Meta:
'default_sort',
'language_field',
'fields',
'has_named_entities',
]


Expand Down
7 changes: 7 additions & 0 deletions backend/addcorpus/validation/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,13 @@ def validate_name_is_not_a_route_parameter(value):
f'{value} cannot be used as a field name, because it is also a route parameter'
)


def validate_name_has_no_ner_suffix(value):
    '''
    Reject field names that end with the reserved `:ner` suffix.

    Raises a ValidationError for such names; returns None otherwise.
    '''
    uses_reserved_suffix = value.endswith(':ner')
    if not uses_reserved_suffix:
        return None

    raise ValidationError(
        f'{value} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields'
    )

def mapping_can_be_searched(es_mapping):
'''
Verify if a mapping is appropriate for searching
Expand Down
12 changes: 6 additions & 6 deletions backend/corpora/jewishmigration/jewishmigration.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ class JewishMigration(PeacePortal, JSONCorpusDefinition):
min_date = datetime(year=1, month=1, day=1)
max_date = datetime(year=1800, month=12, day=31)

data_directory = getattr(settings, 'JMIG_DATA')
data_url = getattr(settings, 'JMIG_DATA_URL',
'localhost:8100/api/records/')
data_directory = settings.JMIG_DATA_DIR
data_filepath = getattr(settings, 'JMIG_DATA', None)
data_url = getattr(settings, 'JMIG_DATA_URL', None)

es_index = getattr(settings, 'JMIG_INDEX', 'jewishmigration')
image = 'jewish_inscriptions.jpg'
Expand All @@ -62,12 +62,12 @@ def sources(self, start, end):
if self.data_url:
response = requests.get(self.data_url)
list_of_sources = response.json()
elif self.data_directory:
with open(self.data_directory, 'r') as f:
elif self.data_filepath:
with open(self.data_filepath, 'r') as f:
list_of_sources = json.load(f)
else:
logging.getLogger('indexing').warning(
'No data directory or URL provided.')
'No data filepath or URL provided.')
for source in list_of_sources:
yield source

Expand Down
1 change: 1 addition & 0 deletions backend/corpora/jewishmigration/test_jewishmigration.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def jm_corpus_settings(settings):
settings.CORPORA = {
'jewishmigration': os.path.join(here, 'jewishmigration.py')
}
settings.JMIG_DATA_DIR = '/corpora'
settings.JMIG_DATA = None
settings.JMIG_DATA_URL = 'http://www.example.com'
settings.JMIG_INDEX = 'test-jewishmigration'
Expand Down
23 changes: 23 additions & 0 deletions backend/es/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import pytest
from time import sleep

from django.contrib.auth.models import Group

from addcorpus.python_corpora.load_corpus import load_corpus_definition
Expand All @@ -17,6 +19,27 @@ def corpus_definition(mock_corpus):
yield corpus


@pytest.fixture()
def es_ner_search_client(es_client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus):
    """
    Extend the index of the basic mock corpus with named entity data.

    Adds a `content:ner` field of type `annotated_text` to the mapping of the
    corpus index and indexes one document with annotated content.
    Yields the Elasticsearch client that was used to do so.
    """
    corpus = Corpus.objects.get(name=basic_mock_corpus)
    index = corpus.configuration.es_index

    es_client.indices.put_mapping(index=index, properties={
        "content:ner": {"type": "annotated_text"}})

    # add an annotated document on top of the data from the mock corpus
    es_client.index(index=index, document={
        'id': 'my_identifier',
        'content': 'Guybrush Threepwood is looking for treasure on Monkey Island',
        'content:ner': '[Guybrush Threepwood](PER) is looking for treasure on [Monkey Island](LOC)'})

    # ES is "near real time": newly indexed documents only become searchable
    # after a refresh. Refresh explicitly instead of sleeping for a fixed
    # time, so the tests are deterministic and do not wait needlessly.
    es_client.indices.refresh(index=index)
    yield es_client


@pytest.fixture()
def es_index_client(es_client, mock_corpus):
"""
Expand Down
48 changes: 48 additions & 0 deletions backend/es/tests/test_named_entity_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from es.views import NamedEntitySearchView


def test_ner_search_view(es_ner_search_client, client):
    '''A GET request to the named entities route of an indexed document succeeds.'''
    response = client.get(
        '/api/es/mock-csv-corpus/my_identifier/named_entities',
        content_type='application/json',
    )
    assert response.status_code == 200


def test_construct_ner_query():
    '''The query combines a term clause on the id with a terms clause per NER field.'''
    id_clause = {"term": {"id": "my_identifier"}}
    entity_clause = {"terms": {"content:ner": ["LOC", "PER", "ORG", "MISC"]}}
    expected = {"bool": {"must": [id_clause, entity_clause]}}

    view = NamedEntitySearchView()
    query = view.construct_named_entity_query(['content:ner'], 'my_identifier')

    assert query == expected


def test_find_named_entity_fields(es_ner_search_client):
    '''Only the annotated `content:ner` field is found in the test index.'''
    index_name = 'test-basic-corpus'
    view = NamedEntitySearchView()
    ner_fields = view.find_named_entity_fields(es_ner_search_client, index_name)
    assert len(ner_fields) == 1
    assert ner_fields[0] == 'content:ner'


def test_find_entities():
    '''Annotated text is split into entity fragments and plain ("flat") fragments.'''
    annotated = '[Guybrush Threepwood](PER) is looking for treasure on [Monkey Island](LOC)'
    expected = [
        {'entity': 'person', 'text': 'Guybrush Threepwood'},
        {'entity': 'flat', 'text': ' is looking for treasure on '},
        {'entity': 'location', 'text': 'Monkey Island'},
    ]
    assert NamedEntitySearchView().find_entities(annotated) == expected
4 changes: 3 additions & 1 deletion backend/es/urls.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from django.urls import path
from es.views import *
from es.views import ForwardSearchView, NamedEntitySearchView

# Routes of the es app; each path starts with the name of the corpus.
urlpatterns = [
# `_search` route of a corpus, handled by ForwardSearchView
path('<str:corpus>/_search', ForwardSearchView.as_view()),
# named entity annotations for one document of a corpus, identified by `id`
path('<str:corpus>/<str:id>/named_entities',
NamedEntitySearchView.as_view())
]
89 changes: 84 additions & 5 deletions backend/es/views.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import logging
import re

from django.utils import timezone
from rest_framework.views import APIView
from rest_framework.response import Response
from ianalyzer.elasticsearch import elasticsearch
from es.search import get_index, total_hits, hits
import logging
from rest_framework.permissions import IsAuthenticated
from rest_framework.exceptions import APIException

from addcorpus.permissions import CorpusAccessPermission
from tag.permissions import CanSearchTags
from api.save_query import should_save_query
from addcorpus.models import Corpus
from api.models import Query
from api.api_query import api_query_to_es_query
from es.search import get_index, total_hits, hits
from ianalyzer.elasticsearch import elasticsearch
from tag.permissions import CanSearchTags

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -98,3 +100,80 @@ def _save_query_done(self, query, results):
query.total_results = total_hits(results)
query.transferred = len(hits(results))
query.save()


class NamedEntitySearchView(APIView):
    ''' Construct a terms query for named entities, combined with a term query of the id
    Perform search via Elasticsearch and reformat the output

    The response maps each annotated field (without its `:ner` suffix) to a
    list of `{'entity': ..., 'text': ...}` fragments, see `find_entities`.
    '''
    # maps the entity tags used in the annotated text to the names in the output
    entity_dict = {
        'PER': 'person',
        'LOC': 'location',
        'ORG': 'organization',
        'MISC': 'miscellaneous'
    }

    # suffix of the fields that contain the annotated version of a text field
    _ner_suffix = ':ner'

    # Matches annotations of format "[Wally](PER)" and captures the bracketed
    # text and the parenthesised tag as two groups. Compiled once, as a raw
    # string so the backslashes are not treated as (invalid) string escapes.
    _annotation_pattern = re.compile(r'(\[[^]]+\])(\([A-Z]+\))')

    permission_classes = [CorpusAccessPermission]

    def get(self, request, *args, **kwargs):
        '''
        Return the named entity annotations of one document.

        Reads the `corpus` and `id` keyword arguments from the route, searches
        the index of the corpus for the document, and responds with a dict of
        parsed annotations per field. The dict is empty if nothing was found.
        '''
        corpus_name = kwargs.get('corpus')
        document_id = kwargs.get('id')
        client = elasticsearch(corpus_name)
        index = get_index(corpus_name)
        fields = self.find_named_entity_fields(client, index)
        query = self.construct_named_entity_query(fields, document_id)
        response = client.search(index=index, query=query, fields=fields)
        results = hits(response)
        annotations = {}
        if results:
            source = results[0]['_source']
            for field in fields:
                text_with_entities = source.get(field)
                if text_with_entities is None:
                    # the document has no value for this field: nothing to parse
                    continue
                # only strip the trailing suffix, not other occurrences of it
                field_name = field.removesuffix(self._ner_suffix)
                annotations[field_name] = self.find_entities(
                    text_with_entities)
        return Response(annotations)

    def find_named_entity_fields(self, client, index: str) -> list[str]:
        '''
        List the names of the named entity fields in the mapping of `index`.

        A field qualifies if its name ends with `:ner` and its mapping type is
        `annotated_text`. Returns an empty list if the index has no properties.
        '''
        mapping = client.indices.get_mapping(index=index)
        properties = mapping[index].get('mappings', {}).get('properties', {})
        return [
            name for name, definition in properties.items()
            if name.endswith(self._ner_suffix)
            and definition.get('type') == 'annotated_text'
        ]

    def construct_named_entity_query(self, fields: list[str], document_id: str) -> dict:
        '''
        Build a bool query that must match the document id and, for each of
        `fields`, one of the known entity tags.
        '''
        return {
            "bool": {
                "must": [
                    {
                        "term": {
                            "id": document_id
                        }
                    }, *self.add_terms(fields)
                ]
            }
        }

    def add_terms(self, fields: list[str]) -> list[dict]:
        '''Build one terms clause on the entity tags for each field in `fields`.'''
        return [
            {
                "terms": {
                    field: ["LOC", "PER", "ORG", "MISC"]
                }
            } for field in fields
        ]

    def find_entities(self, input_text: str) -> list[dict]:
        '''
        Split annotated text into a list of fragments.

        Each annotation of format "[Wally](PER)" becomes
        `{'entity': <name from entity_dict>, 'text': 'Wally'}`; the entity is
        `None` for a tag that is not in `entity_dict`. Non-empty text between
        annotations becomes `{'entity': 'flat', 'text': ...}`.

        Fragments are located by match position rather than by their first
        character, so plain text that happens to start with `(` or `[` is
        kept intact as flat text.
        '''
        output = []
        cursor = 0
        for match in self._annotation_pattern.finditer(input_text):
            if match.start() > cursor:
                output.append(
                    {'entity': 'flat', 'text': input_text[cursor:match.start()]})
            bracketed_text, tag = match.groups()
            # strip the surrounding brackets / parentheses
            output.append(
                {'entity': self.entity_dict.get(tag[1:-1]), 'text': bracketed_text[1:-1]})
            cursor = match.end()
        if cursor < len(input_text):
            output.append({'entity': 'flat', 'text': input_text[cursor:]})
        return output
8 changes: 3 additions & 5 deletions backend/wordmodels/views.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from django.shortcuts import render
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.permissions import IsAuthenticated
from addcorpus.permissions import CorpusAccessPermission, corpus_name_from_request
from wordmodels import utils, visualisations
from rest_framework.exceptions import APIException
Expand All @@ -11,7 +9,7 @@ class RelatedWordsView(APIView):
Get words with the highest similarity to the query term
'''

permission_classes = [IsAuthenticated, CorpusAccessPermission]
permission_classes = [CorpusAccessPermission]

def post(self, request, *args, **kwargs):
corpus = corpus_name_from_request(request)
Expand All @@ -35,7 +33,7 @@ class SimilarityView(APIView):
Get similarity between two query terms
'''

permission_classes = [IsAuthenticated, CorpusAccessPermission]
permission_classes = [CorpusAccessPermission]

def get(self, request, *args, **kwargs):
corpus = corpus_name_from_request(request)
Expand All @@ -55,7 +53,7 @@ class WordInModelView(APIView):
Check if a word has a vector in the model for a corpus
'''

permission_classes = [IsAuthenticated, CorpusAccessPermission]
permission_classes = [CorpusAccessPermission]

def get(self, request, *args, **kwargs):
corpus = corpus_name_from_request(request)
Expand Down
5 changes: 5 additions & 0 deletions frontend/src/_utilities.scss
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ $section-padding: 3rem 1.5rem;
$boxShadow: 0 2px 3px rgba(10, 10, 10, 0.1), 0 0 0 1px rgba(10, 10, 10, 0.1);
$boxShadowHover: 0px 5px 3px rgba(10, 10, 10, 0.1), 0 0 0 1px $primary;

$entity-person: #303F9F;
$entity-location: #4e8f2d;
$entity-organization: #efb71d;
$entity-miscellaneous: #ee5986;

@import "bulma/sass/utilities/_all";

// based on the Bulma loader
Expand Down
Loading

0 comments on commit 945201c

Please sign in to comment.