Skip to content

Commit

Permalink
Merge branch 'develop' into bugfix/query-in-csv
Browse files Browse the repository at this point in the history
  • Loading branch information
lukavdplas authored Jul 5, 2024
2 parents b0cc71f + 70b3f8c commit 7353420
Show file tree
Hide file tree
Showing 57 changed files with 775 additions and 285 deletions.
23 changes: 20 additions & 3 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from addcorpus.validation.creation import (
validate_es_mapping, validate_field_language, validate_implication, validate_language_code,
validate_mimetype,
validate_name_is_not_a_route_parameter, validate_search_filter,
validate_search_filter_with_mapping,
validate_name_is_not_a_route_parameter, validate_name_has_no_ner_suffix,
validate_search_filter, validate_search_filter_with_mapping,
validate_searchable_field_has_full_text_search,
validate_sort_configuration, validate_visualizations_with_mapping,
validate_source_data_directory,
Expand All @@ -21,6 +21,8 @@
from django.db import models
from django.db.models.constraints import UniqueConstraint

from ianalyzer.elasticsearch import elasticsearch

MAX_LENGTH_NAME = 126
MAX_LENGTH_DESCRIPTION = 254
MAX_LENGTH_TITLE = 256
Expand Down Expand Up @@ -260,6 +262,20 @@ def clean(self):
e
])

@property
def has_named_entities(self):
    '''
    Whether the Elasticsearch index of this corpus contains named-entity fields.

    Fetches the mapping of `self.es_index` and checks whether any top-level
    property name ends with the `:ner` suffix. If the mapping cannot be
    fetched or read, this is reported as `False`.
    '''
    # NOTE(review): the client is requested with the index name here - confirm
    # that `elasticsearch()` accepts this rather than a corpus name.
    client = elasticsearch(self.es_index)
    try:
        mapping = client.indices.get_mapping(index=self.es_index)
        properties = mapping[self.es_index].get(
            'mappings', {}).get('properties', {})
    except Exception:
        # Best effort: a missing index or unreachable server means there are
        # no named entities to offer. `Exception` (rather than a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        return False
    return any(name.endswith(':ner') for name in properties)


FIELD_DISPLAY_TYPES = [
('text_content', 'text content'),
Expand Down Expand Up @@ -293,7 +309,8 @@ def clean(self):
class Field(models.Model):
name = models.SlugField(
max_length=MAX_LENGTH_NAME,
validators=[validate_name_is_not_a_route_parameter],
validators=[validate_name_is_not_a_route_parameter,
validate_name_has_no_ner_suffix],
help_text='internal name for the field',
)
corpus_configuration = models.ForeignKey(
Expand Down
2 changes: 2 additions & 0 deletions backend/addcorpus/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class CorpusConfigurationSerializer(serializers.ModelSerializer):
languages = serializers.ListField(child=LanguageField())
category = PrettyChoiceField(choices=CATEGORIES)
default_sort = NonEmptyJSONField()
has_named_entities = serializers.ReadOnlyField()

class Meta:
model = CorpusConfiguration
Expand All @@ -89,6 +90,7 @@ class Meta:
'default_sort',
'language_field',
'fields',
'has_named_entities',
]


Expand Down
7 changes: 7 additions & 0 deletions backend/addcorpus/validation/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,13 @@ def validate_name_is_not_a_route_parameter(value):
f'{value} cannot be used as a field name, because it is also a route parameter'
)


def validate_name_has_no_ner_suffix(value):
    '''
    Reject field names that end with the `:ner` suffix.

    Raises a ValidationError for such names; returns nothing otherwise.
    '''
    if not value.endswith(':ner'):
        return
    message = f'{value} cannot be used as a field name: the suffix `:ner` is reserved for annotated_text fields'
    raise ValidationError(message)

def mapping_can_be_searched(es_mapping):
'''
Verify if a mapping is appropriate for searching
Expand Down
23 changes: 23 additions & 0 deletions backend/es/conftest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import pytest
from time import sleep

from django.contrib.auth.models import Group

from addcorpus.python_corpora.load_corpus import load_corpus_definition
Expand All @@ -17,6 +19,27 @@ def corpus_definition(mock_corpus):
yield corpus


@pytest.fixture()
def es_ner_search_client(es_client, basic_mock_corpus, basic_corpus_public, index_basic_mock_corpus):
    """
    Add named-entity data to the index of the basic mock corpus.

    Extends the mapping of the corpus index with an `annotated_text` field
    `content:ner`, and indexes one document with the id `my_identifier` that
    has both a plain and an annotated version of its content.
    Yields the Elasticsearch client.
    """
    corpus = Corpus.objects.get(name=basic_mock_corpus)
    index = corpus.configuration.es_index

    # register the field that holds the annotated version of `content`
    es_client.indices.put_mapping(index=index, properties={
        "content:ner": {"type": "annotated_text"}})

    es_client.index(index=index, document={
        'id': 'my_identifier',
        'content': 'Guybrush Threepwood is looking for treasure on Monkey Island',
        'content:ner': '[Guybrush Threepwood](PER) is looking for treasure on [Monkey Island](LOC)'})

    # ES is "near real time": refresh the index explicitly so the new document
    # is searchable right away, instead of sleeping and hoping it is visible
    es_client.indices.refresh(index=index)
    yield es_client


@pytest.fixture()
def es_index_client(es_client, mock_corpus):
"""
Expand Down
48 changes: 48 additions & 0 deletions backend/es/tests/test_named_entity_search.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from es.views import NamedEntitySearchView


def test_ner_search_view(es_ner_search_client, client):
    '''A GET request for the named entities of an indexed document returns status 200.'''
    url = '/api/es/mock-csv-corpus/my_identifier/named_entities'
    result = client.get(url, content_type='application/json')
    assert result.status_code == 200


def test_construct_ner_query():
    '''The query combines a term clause on the id with a terms clause per named-entity field.'''
    id_clause = {"term": {"id": "my_identifier"}}
    entity_clause = {
        "terms": {"content:ner": ["LOC", "PER", "ORG", "MISC"]}
    }
    view = NamedEntitySearchView()
    actual = view.construct_named_entity_query(
        ['content:ner'], 'my_identifier')
    assert actual == {"bool": {"must": [id_clause, entity_clause]}}


def test_find_named_entity_fields(es_ner_search_client):
    '''Only the `content:ner` field of the test index is reported as a named-entity field.'''
    view = NamedEntitySearchView()
    ner_fields = view.find_named_entity_fields(
        es_ner_search_client, 'test-basic-corpus')
    assert ner_fields == ['content:ner']


def test_find_entities():
    '''Annotated text is split into entity segments and the flat text between them.'''
    annotated = '[Guybrush Threepwood](PER) is looking for treasure on [Monkey Island](LOC)'
    segments = NamedEntitySearchView().find_entities(annotated)
    assert segments == [
        {'entity': 'person', 'text': 'Guybrush Threepwood'},
        {'entity': 'flat', 'text': ' is looking for treasure on '},
        {'entity': 'location', 'text': 'Monkey Island'},
    ]
4 changes: 3 additions & 1 deletion backend/es/urls.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from django.urls import path
from es.views import *
from es.views import ForwardSearchView, NamedEntitySearchView

# Routes of the es app. The `corpus` and `id` path segments are passed to the
# views as keyword arguments.
urlpatterns = [
# search within a corpus (presumably forwarded to Elasticsearch - see ForwardSearchView)
path('<str:corpus>/_search', ForwardSearchView.as_view()),
# named-entity annotations of a single document in a corpus
path('<str:corpus>/<str:id>/named_entities',
NamedEntitySearchView.as_view())
]
89 changes: 84 additions & 5 deletions backend/es/views.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import logging
import re

from django.utils import timezone
from rest_framework.views import APIView
from rest_framework.response import Response
from ianalyzer.elasticsearch import elasticsearch
from es.search import get_index, total_hits, hits
import logging
from rest_framework.permissions import IsAuthenticated
from rest_framework.exceptions import APIException

from addcorpus.permissions import CorpusAccessPermission
from tag.permissions import CanSearchTags
from api.save_query import should_save_query
from addcorpus.models import Corpus
from api.models import Query
from api.api_query import api_query_to_es_query
from es.search import get_index, total_hits, hits
from ianalyzer.elasticsearch import elasticsearch
from tag.permissions import CanSearchTags

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -98,3 +100,80 @@ def _save_query_done(self, query, results):
query.total_results = total_hits(results)
query.transferred = len(hits(results))
query.save()


class NamedEntitySearchView(APIView):
    '''
    Look up the named entities that are annotated in a single document.

    Constructs a terms query for named entities, combined with a term query
    of the document id, performs the search via Elasticsearch and reformats
    the annotated text of each named-entity field into a list of segments.
    '''
    # maps the labels used in the annotated text to the names used in the output
    entity_dict = {
        'PER': 'person',
        'LOC': 'location',
        'ORG': 'organization',
        'MISC': 'miscellaneous'
    }

    permission_classes = [CorpusAccessPermission]

    # matches annotations of the format "[Wally](PER)"; compiled once for the class
    annotation_pattern = re.compile(
        r'\[(?P<text>[^]]+)\]\((?P<label>[A-Z]+)\)')

    def get(self, request, *args, **kwargs):
        '''
        Respond with the annotated segments of each named-entity field.

        Reads the `corpus` and `id` keyword arguments of the route. The
        response maps each field name (without its `:ner` suffix) to the
        output of `find_entities`. It is empty if the search has no hits.
        '''
        corpus_name = kwargs.get('corpus')
        document_id = kwargs.get('id')
        client = elasticsearch(corpus_name)
        index = get_index(corpus_name)
        fields = self.find_named_entity_fields(client, index)
        query = self.construct_named_entity_query(fields, document_id)
        response = client.search(index=index, query=query, fields=fields)
        results = hits(response)
        annotations = {}
        if results:
            source = results[0]['_source']
            # only strip the trailing suffix: `:ner` elsewhere in a name is kept
            annotations = {
                field.removesuffix(':ner'): self.find_entities(source.get(field))
                for field in fields
            }
        return Response(annotations)

    def find_named_entity_fields(self, client, index: str) -> list[str]:
        '''
        List the fields of the index that hold named-entity annotations.

        These are the top-level properties in the mapping of `index` with a
        name that ends in `:ner` and the type `annotated_text`. An index
        without mapped properties gives an empty list.
        '''
        mapping = client.indices.get_mapping(index=index)
        properties = mapping[index].get('mappings', {}).get('properties', {})
        return [
            name for name, definition in properties.items()
            if name.endswith(':ner') and definition.get('type') == 'annotated_text'
        ]

    def construct_named_entity_query(self, fields: list[str], document_id: str) -> dict:
        '''
        Build a bool query that requires a match on the document id, and an
        entity label in each of the named-entity fields.
        '''
        return {
            "bool": {
                "must": [
                    {
                        "term": {
                            "id": document_id
                        }
                    }, *self.add_terms(fields)
                ]
            }
        }

    def add_terms(self, fields: list[str]) -> list[dict]:
        '''
        Build one terms clause per field, matching any of the entity labels.
        '''
        return [
            {
                "terms": {
                    field: ["LOC", "PER", "ORG", "MISC"]
                }
            } for field in fields
        ]

    def find_entities(self, input_text: str) -> list[dict]:
        '''
        Split annotated text into entity segments and flat text segments.

        Annotations have the format "[Wally](PER)". Each annotation becomes
        `{'entity': <name from entity_dict>, 'text': <annotated text>}`; the
        entity is `None` if the label is not in `entity_dict`. Non-empty text
        between annotations becomes `{'entity': 'flat', 'text': <text>}`.
        Empty or missing input gives an empty list.
        '''
        if not input_text:
            return []
        output = []
        position = 0
        # Walk the matches by position, rather than inspecting the first
        # character of each segment: flat text may itself start with "(" or "["
        for match in self.annotation_pattern.finditer(input_text):
            if match.start() > position:
                output.append(
                    {'entity': 'flat', 'text': input_text[position:match.start()]})
            output.append(
                {'entity': self.entity_dict.get(match['label']), 'text': match['text']})
            position = match.end()
        if position < len(input_text):
            output.append({'entity': 'flat', 'text': input_text[position:]})
        return output
5 changes: 5 additions & 0 deletions frontend/src/_utilities.scss
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ $section-padding: 3rem 1.5rem;
$boxShadow: 0 2px 3px rgba(10, 10, 10, 0.1), 0 0 0 1px rgba(10, 10, 10, 0.1);
$boxShadowHover: 0px 5px 3px rgba(10, 10, 10, 0.1), 0 0 0 1px $primary;

$entity-person: #303F9F;
$entity-location: #4e8f2d;
$entity-organization: #efb71d;
$entity-miscellaneous: #ee5986;

@import "bulma/sass/utilities/_all";

// based on the Bulma loader
Expand Down
11 changes: 10 additions & 1 deletion frontend/src/app/common-test-bed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { TestBed } from '@angular/core/testing';
import { ElementRef } from '@angular/core';
import { RouterTestingModule } from '@angular/router/testing';
import { HttpClientModule } from '@angular/common/http';
import {FontAwesomeTestingModule} from '@fortawesome/angular-fontawesome/testing';

import { appRoutes, declarations, imports, providers } from './app.module';

Expand All @@ -10,9 +11,12 @@ import { AuthServiceMock } from '../mock-data/auth';
import { CorpusServiceMock } from '../mock-data/corpus';
import { DialogServiceMock } from '../mock-data/dialog';
import { ElasticSearchServiceMock } from '../mock-data/elastic-search';
import { EntityServiceMock } from '../mock-data/entity';
import { MockCorpusResponse } from '../mock-data/corpus-response';
import { SearchServiceMock } from '../mock-data/search';
import { ApiService, AuthService, CorpusService, DialogService, ElasticSearchService, SearchService } from './services';
import { ApiService, AuthService, CorpusService, DialogService, SearchService } from './services';
import { ElasticSearchService } from './services/elastic-search.service';
import { EntityService } from './services/entity.service';
import { WordmodelsService } from './services/wordmodels.service';
import { WordmodelsServiceMock } from '../mock-data/wordmodels';
import { VisualizationService } from './services/visualization.service';
Expand All @@ -25,6 +29,7 @@ import { SimpleStore } from './store/simple-store';
export const commonTestBed = () => {
const filteredImports = imports.filter(value => !(value in [HttpClientModule]));
filteredImports.push(RouterTestingModule.withRoutes(appRoutes));
filteredImports.push(FontAwesomeTestingModule)
const filteredProviders = providers.filter(provider => !(
provider in [ApiService, CorpusService, DialogService, ElasticSearchService, SearchService]));
filteredProviders.push(
Expand All @@ -49,6 +54,10 @@ export const commonTestBed = () => {
provide: ElasticSearchService,
useValue: new ElasticSearchServiceMock(),
},
{
provide: EntityService,
useValue: new EntityServiceMock(),
},
{
provide: ElementRef,
useClass: MockElementRef,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import { Subject } from 'rxjs';
import { CorpusDefinition } from '../../models/corpus-definition';
import { ApiService } from '../../services';
import { ActivatedRoute } from '@angular/router';
import { filter, take } from 'rxjs/operators';
import * as _ from 'lodash';
import { HttpErrorResponse } from '@angular/common/http';

Expand Down
5 changes: 3 additions & 2 deletions frontend/src/app/document-page/document-page.component.html
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
<ia-corpus-header [corpus]="corpus" currentPage="document"></ia-corpus-header>
<div class="section">
<div class="container is-readable">
<div *ngIf="!documentNotFound; else notFound" [ngClass]="{'is-loading': !document}">
<div *ngIf="documentFound; else notFound" [ngClass]="{'is-loading': !document}">
<em *ngIf="showNEROption" iaBalloon="Show named entities for this document">Show named entities<ia-toggle (toggled)="toggleNER($event)"></ia-toggle></em>
<div class="level">
<div class="level-left"></div>
<div class="level-right">
Expand All @@ -14,7 +15,7 @@
</div>
</div>
<div class="block">
<ia-document-view *ngIf="document" [document]="document" [corpus]="corpus"></ia-document-view>
<ia-document-view *ngIf="document" [document]="document" [corpus]="corpus" [showEntities]="showNamedEntities"></ia-document-view>
</div>
</div>
</div>
Expand Down
13 changes: 11 additions & 2 deletions frontend/src/app/document-page/document-page.component.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@ export class DocumentPageComponent implements OnInit {
documentId: string;
document: FoundDocument;

documentNotFound: boolean;
documentFound: boolean;

documentIcons = documentIcons;

showNEROption: boolean;

showNamedEntities = false;

constructor(
private corpusService: CorpusService,
private elasticSearchService: ElasticSearchService,
Expand Down Expand Up @@ -55,6 +59,7 @@ export class DocumentPageComponent implements OnInit {
]).subscribe(([params, corpus]) => {
this.corpus = corpus;
this.documentId = params['id'];
this.showNEROption = this.corpus.hasNamedEntities;
this.getDocument(this.documentId);
this.title.setTitle(pageTitle(`Document in ${corpus.title}`));
});
Expand All @@ -63,9 +68,13 @@ export class DocumentPageComponent implements OnInit {
/**
 * Request the document with the given id from the current corpus, and store
 * the result in `this.document` once the request resolves.
 */
getDocument(id: string) {
this.elasticSearchService.getDocumentById(id, this.corpus).then(document => {
this.document = document;
// NOTE(review): the next two lines set opposite flags from the same check;
// this looks like the old and new line of one change - confirm which is kept
this.documentNotFound = _.isUndefined(this.document);
this.documentFound = !_.isUndefined(this.document);
});
}

/**
 * Store whether named entities should be shown for this document.
 * Called from the template with the value emitted by the toggle.
 */
toggleNER(active: boolean): void {
this.showNamedEntities = active;
}


}
Loading

0 comments on commit 7353420

Please sign in to comment.