diff --git a/CITATION.cff b/CITATION.cff index d2bf48475..83bfbb734 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -8,9 +8,13 @@ message: >- metadata from this file. type: software authors: - - name: Research Software Lab - email: digitalhumanties@uu.nl - affiliation: 'Centre for Digital Humanities, Utrecht University' + - name: 'Research Software Lab, Centre for Digital Humanities, Utrecht University' + website: 'https://cdh.uu.nl/centre-for-digital-humanities/research-software-lab/' + city: Utrecht + country: NL +identifiers: + - type: doi + value: 10.5281/zenodo.8064133 repository-code: 'https://github.com/UUDigitalHumanitieslab/I-analyzer' url: 'https://ianalyzer.hum.uu.nl' abstract: >- @@ -31,6 +35,6 @@ keywords: - elasticsearch - natural language processing license: MIT -commit: 96b9585 -version: 4.0.2 +commit: fb80497 +version: 4.0.3 date-released: '2023-06-21' diff --git a/README.md b/README.md index 14e7b61c9..63f0b9900 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,14 @@ +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.8064133.svg)](https://doi.org/10.5281/zenodo.8064133) [![Actions Status](https://github.com/UUDigitalHumanitiesLab/I-analyzer/workflows/Unit%20tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/I-analyzer/actions) + # I-analyzer The text mining tool that obviates all others. I-analyzer is a web application that allows users to search through large text corpora, requiring no experience in text mining or technical know-how. -## Directory structure - -The I-analyzer backend (`/backend`) is a python/Django app that provides the following functionality: - -- A 'users' module that defines user accounts. - -- A 'corpora' module containing corpus definitions and metadata of the currently implemented corpora. For each corpus added in I-analyzer, this module defines how to extract document contents from its source files and sets parameters for displaying the corpus in the interface, such as sorting options. 
- -- An 'addcorpus' module which manages the functionality to extract data from corpus source files (given the definition) and save this in an elasticsearch index. Source files can be XML or HTML format (which are parsed with `beautifulsoup4` + `lxml`) or CSV. This module also provides the basic data structure for corpora. - -- An 'es' module which handles the communication with elasticsearch. The data is passed through to the index using the `elasticsearch` package for Python (note that `elasticsearch-dsl` is not used, since its [documentation](https://elasticsearch-dsl.readthedocs.io/en/latest) at the time seemed less immediately accessible than the [low-level](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html) version). - -- An 'api' module that that enables users to search through an ElasticSearch index of a text corpus and stream search results into a CSV file. The module also performs more complex analysis of search results for visualisations. - -- A 'visualizations' module that does the analysis for several types of text-based visualisations. - -- A 'downloads' module that collects results into csv files. - -- A 'wordmodels' module that handles functionality related to word embeddings. - -`ianalyzer/frontend` is an [Angular 13](https://angular.io/) web interface. - -See the documentation for [a more extensive overview](./documentation/Overview.md) +See the documentation for [an overview of the repository](./documentation/Overview.md) ## Prerequisites @@ -38,8 +18,6 @@ See the documentation for [a more extensive overview](./documentation/Overview.m * [Redis](https://www.redis.io/) (used by [Celery](http://www.celeryproject.org/)). Recommended installation is [installing from source](https://redis.io/docs/getting-started/installation/install-redis-from-source/) * Yarn -If you wish to have email functionality, also make sure you have an email server set up, such as [maildev](https://maildev.github.io/maildev/). 
- The documentation includes a [recipe for installing the prerequisites on Debian 10](./documentation/Local-Debian-I-Analyzer-setup.md) ## First-time setup @@ -77,40 +55,16 @@ yarn postinstall The backend readme provides more details on these steps. 8. Set up the database and migrations by running `yarn django migrate`. 9. Make a superuser account with `yarn django createsuperuser` -10. In `frontend/src/environments`, create a file `environment.private.ts` with the following settings: -``` -privateEnvironment = { - appName: I-Analyzer, - aboutPage: ianalyzer -} -``` ## Adding corpora To include corpora on your environment, you need to index them from their source files. The source files are not included in this directory; ask another developer about their availability. If you have (a sample of) the source files for a corpus, you can add it your our environment as follows: -_Note:_ these instructions are for adding a corpus that already has a corpus definition. For adding new corpus definitions, see [How to add a new corpus to I-analyzer](./documentation/How-to-add-a-new-corpus-to-Ianalyzer.md). +_Note:_ these instructions are for indexing a corpus that already has a corpus definition. For adding new corpus definitions, see [How to add a new corpus to I-analyzer](./documentation/How-to-add-a-new-corpus-to-Ianalyzer.md). 1. Add the corpus to the `CORPORA` dictionary in your local settings file. The key should match the class name of the corpus definition. This match is not case-sensitive, and your key may include extra non-alphabetic characters (they will be ignored when matching). The value should be the absolute path the corpus definition file (e.g. `.../backend/corpora/times/times.py`). -2. Set configurations for your corpus. Check the definition file to see which variables it expects to find in the configuration. 
Some of these may already be set in settings.py, but you will at least need to define the name of the elasticsearch index and the (absolute) path to your source files. -3. Activate your python virtual environment. Create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports -s 1785-01-01 -e 2010-12-31`, for indexing the Dutch Annual Reports corpus starting in 1785 and ending in 2010. The dates are optional, and default to specified minimum and maximum dates of the corpus. (Note that new indices are created with `number_of_replicas` set to 0 (this is to make index creation easier/lighter). In production, you can automatically update this setting after index creation by adding the `--prod` flag (e.g. `yarn django index goodreads --prod`). Note though, that the -`--prod` flag creates a _versioned_ index name, which needs an alias to actually work as `name_of_index_without_version` (see below for more details). - -#### Flags of indexing script -- --prod / -p Whether or not to create a versioned index name -- --mappings_only / -m Whether to only create an index with mappings and settings, without adding data to it (useful before reindexing from another index or another server) -- --add / -a Add documents to an existing index (skip index creation) -- --update / -u Add or change fields in the documents. This requires an `update_body` or `update_script` to be set in the corpus definition, see [example for update_body in dutchnewspapers](backend/corpora/dutchnewspapers/dutchnewspapers_all.py) and [example for update_script in goodreads](backend/corpora/goodreads/goodreads.py). -- --delete / -d Delete an existing index with the `corpus.es_index` name. Note that in production, `corpus.es_index` will usually be an *alias*, and you would use the `yarn django es alias -c corpus-name --clean` to achieve the same thing. -- --rollover / -r Only applies in production: rollover a versioned index to the newest version. 
This *will not* delete the old index (so you have a chance to check the new index and roll back, if necessary) - -#### Production - -On the servers, we work with aliases. Indices created with the `--prod` flag will have a version number (e.g. `indexname-1`), and as such will not be recognized by the corpus definition (which is looking for `indexname`). Create an alias for that using the `alias` command: `yarn django alias -c corpusname`. That script ensures that an alias is present for the index with the highest version numbers, and not for all others (i.e. older versions). The advantage of this approach is that an old version of the index can be kept in place as long as is needed, for example while a new version of the index is created. Note that removing an alias does not remove the index itself. - -Once you have an alias in place, you might want to remove any old versions of the index. The `alias` command can be used for this. If you call `yarn django alias -c corpusname --clean` any versions of the index that are not the newest version will be removed. Note that removing an index also removes any existing aliases for it. You might want to perform this as a separate operation (i.e. after completing step 14) so that the old index stays in place for a bit while you check that all is fine. - -See the documentation for more information about [indexing on the server](./documentation/Indexing-on-server.md). +2. Set configurations for your corpus. Check the definition file to see which variables it expects to find in the configuration. Some of these may already be set in settings.py, but you will at least need to define the (absolute) path to your source files. +3. Activate your python virtual environment. Create an ElasticSearch index from the source files by running, e.g., `yarn django index dutchannualreports`, for indexing the Dutch Annual Reports corpus in a development environment. See [Indexing](documentation/Indexing-corpora.md) for more information. 
## Running a dev environment @@ -118,8 +72,7 @@ See the documentation for more information about [indexing on the server](./docu 2. Activate your python environment. Start the backend server with `yarn start-back`. This creates an instance of the Django server at `127.0.0.1:8000`. 3. (optional) If you want to use celery, start your local redis server by running `redis-server` in a separate terminal. 4. (optional) If you want to use celery, activate your python environment. Run `yarn celery worker`. Celery is used for long downloads and the word cloud and ngrams visualisations. -5. (optional) If you want to use email functionality, start your local email server. -6. Start the frontend by running `yarn start-front`. +5. Start the frontend by running `yarn start-front`. ## Notes for development diff --git a/backend/.gitignore b/backend/.gitignore index ef5f1e392..56da2bfd0 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -37,9 +37,14 @@ flask_sql_data/ # Local settings file ianalyzer/settings_local.py +# legacy config +ianalyzer/config.py + # csv downloads download/csv_files/ # word models corpora/*/wm/* !corpora/*/wm/documentation.md + + diff --git a/backend/addcorpus/conftest.py b/backend/addcorpus/conftest.py index 190a5c2bf..d28034323 100644 --- a/backend/addcorpus/conftest.py +++ b/backend/addcorpus/conftest.py @@ -1,11 +1,10 @@ import pytest import os from django.contrib.auth.models import Group -from addcorpus.load_corpus import load_all_corpora from addcorpus.models import Corpus @pytest.fixture() -def group_with_access(db, mock_corpus, mock_corpora_in_db): +def group_with_access(db, mock_corpus): '''Create a group with access to the mock corpus''' group = Group.objects.create(name='nice-users') corpus = Corpus.objects.get(name=mock_corpus) @@ -17,11 +16,5 @@ def group_with_access(db, mock_corpus, mock_corpora_in_db): here = os.path.abspath(os.path.dirname(__file__)) @pytest.fixture() -def mock_corpus(db): +def mock_corpus(): return 
'mock-csv-corpus' - -@pytest.fixture() -def mock_corpus_user(auth_user, group_with_access): - auth_user.groups.add(group_with_access) - auth_user.save() - return auth_user diff --git a/backend/addcorpus/constants.py b/backend/addcorpus/constants.py new file mode 100644 index 000000000..86b9f9cae --- /dev/null +++ b/backend/addcorpus/constants.py @@ -0,0 +1,14 @@ +CATEGORIES = [ + ('newspaper', 'Newspapers'), + ('parliament', 'Parliamentary debates'), + ('periodical', 'Periodicals'), + ('finance', 'Financial reports'), + ('ruling', 'Court rulings'), + ('review', 'Online reviews'), + ('inscription', 'Funerary inscriptions'), + ('oration', 'Orations'), + ('book', 'Books'), +] +''' +Types of data +''' diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py index b38e0005a..1c8cf55b9 100644 --- a/backend/addcorpus/corpus.py +++ b/backend/addcorpus/corpus.py @@ -10,11 +10,13 @@ import bs4 import csv import sys -from datetime import datetime, timedelta +from datetime import datetime +from langcodes import Language from os.path import isdir import logging logger = logging.getLogger('indexing') -import os +from addcorpus.constants import CATEGORIES + class Corpus(object): ''' @@ -61,6 +63,25 @@ def max_date(self): ''' raise NotImplementedError() + @property + def languages(self): + ''' + Language(s) used in the corpus + + Should be a list of strings. Each language should + correspond to an ISO-639 code. 
+ ''' + return [''] + + @property + def category(self): + ''' + Type of documents in the corpus + + See addcorpus.constants.CATEGORIES for options + ''' + raise NotImplementedError() + @property def es_index(self): ''' @@ -249,6 +270,13 @@ def serialize(self): for field in self.fields: field_list.append(field.serialize()) corpus_dict[ca[0]] = field_list + elif ca[0] == 'languages': + corpus_dict[ca[0]] = [ + Language.make(language).display_name() + for language in ca[1] + ] + elif ca[0] == 'category': + corpus_dict[ca[0]] = self._format_option(ca[1], CATEGORIES) elif type(ca[1]) == datetime: timedict = {'year': ca[1].year, 'month': ca[1].month, @@ -260,6 +288,16 @@ def serialize(self): corpus_dict[ca[0]] = ca[1] return corpus_dict + def _format_option(self, value, options): + ''' + For serialisation: format language or category based on list of options + ''' + return next( + nice_string + for code, nice_string in options + if value == code + ) + def sources(self, start=datetime.min, end=datetime.max): ''' Obtain source files for the corpus, relevant to the given timespan. @@ -293,6 +331,15 @@ def documents(self, sources=None): ) ) + def _reject_extractors(self, *inapplicable_extractors): + ''' + Raise errors if any fields use extractors that are not applicable + for the corpus. + ''' + for field in self.fields: + if isinstance(field.extractor, inapplicable_extractors): + raise RuntimeError( + "Specified extractor method cannot be used with this type of data") class XMLCorpus(Corpus): ''' @@ -302,13 +349,25 @@ class XMLCorpus(Corpus): @property def tag_toplevel(self): ''' - The top-level tag in the source documents. Either a string or a function that maps metadata to a string. + The top-level tag in the source documents. + + Can be: + - None + - A string with the name of the tag + - A dictionary that gives the named arguments to soup.find_all() + - A bound method that takes the metadata of the document as input and outputs one of the above. 
''' @property def tag_entry(self): ''' - The tag that corresponds to a single document entry. Either a string or a function that maps metadata to a string. + The tag that corresponds to a single document entry. + + Can be: + - None + - A string with the name of the tag + - A dictionary that gives the named arguments to soup.find_all() + - A bound method that takes the metadata of the document as input and outputs one of the above. ''' def source2dicts(self, source): @@ -317,18 +376,8 @@ def source2dicts(self, source): default implementation for XML layouts; may be subclassed if more ''' # Make sure that extractors are sensible - for field in self.fields: - if not isinstance(field.extractor, ( - extract.Choice, - extract.Combined, - extract.XML, - extract.Metadata, - extract.Constant, - extract.ExternalFile, - extract.Backup, - )): - raise RuntimeError( - "Specified extractor method cannot be used with an XML corpus") + self._reject_extractors(extract.HTML, extract.CSV) + # extract information from external xml files first, if applicable metadata = {} if isinstance(source, str): @@ -357,17 +406,18 @@ def source2dicts(self, source): required_fields = [ field.name for field in self.fields if field.required] # Extract fields from the soup - tag = self.get_entry_tag(metadata) + tag = self.get_tag_requirements(self.tag_entry, metadata) bowl = self.bowl_from_soup(soup, metadata=metadata) if bowl: - spoonfuls = bowl.find_all(tag) if tag else [bowl] - for spoon in spoonfuls: + spoonfuls = bowl.find_all(**tag) if tag else [bowl] + for i, spoon in enumerate(spoonfuls): regular_field_dict = {field.name: field.extractor.apply( # The extractor is put to work by simply throwing at it # any and all information it might need soup_top=bowl, soup_entry=spoon, - metadata=metadata + metadata=metadata, + index=i, ) for field in regular_fields if field.indexed} external_dict = {} if external_fields: @@ -387,19 +437,32 @@ def source2dicts(self, source): logger.warning( 'Top-level tag not 
found in `{}`'.format(filename)) - def get_entry_tag(self, metadata): - if type(self.tag_entry) == str: - return self.tag_entry - elif self.tag_entry is None: - return None + def get_tag_requirements(self, specification, metadata): + ''' + Get the requirements for a tag given the specification. + + The specification can be: + - None + - A string with the name of the tag + - A dict with the named arguments to soup.find() / soup.find_all() + - A callable that takes the document metadata as input and outputs one of the above. + + Output is either None or a dict with the arguments for soup.find() / soup.find_all() + ''' + + if callable(specification): + condition = specification(metadata) else: - return self.tag_entry(metadata) + condition = specification - def get_toplevel_tag(self, metadata): - if type(self.tag_toplevel) == str: - return self.tag_toplevel + if condition is None: + return None + elif type(condition) == str: + return {'name': condition} + elif type(condition) == dict: + return condition else: - return self.tag_toplevel(metadata) + raise TypeError('Tag must be a string or dict') def external_source2dict(self, soup, external_fields, metadata): ''' @@ -453,11 +516,9 @@ def bowl_from_soup(self, soup, toplevel_tag=None, entry_tag=None, metadata = {}) If no such tag is present, it contains the entire soup. 
''' if toplevel_tag == None: - toplevel_tag = self.get_toplevel_tag(metadata) - if entry_tag == None: - entry_tag = self.get_entry_tag(metadata) + toplevel_tag = self.get_tag_requirements(self.tag_toplevel, metadata) - return soup.find(toplevel_tag) if toplevel_tag else soup + return soup.find(**toplevel_tag) if toplevel_tag else soup def metadata_from_xml(self, filename, tags): ''' @@ -520,18 +581,7 @@ def source2dicts(self, source): ''' (filename, metadata) = source - # Make sure that extractors are sensible - for field in self.fields: - if not isinstance(field.extractor, ( - extract.Choice, - extract.Combined, - extract.HTML, - extract.Metadata, - extract.Constant, - extract.Backup, - )): - raise RuntimeError( - "Specified extractor method cannot be used with an HTML corpus") + self._reject_extractors(extract.XML, extract.CSV) # Loading HTML logger.info('Reading HTML file {} ...'.format(filename)) @@ -550,7 +600,7 @@ def source2dicts(self, source): # if there is a entry level tag, with html this is not always the case if bowl and tag: # Note that this is non-recursive: will only find direct descendants of the top-level tag - for spoon in bowl.find_all(tag): + for i, spoon in enumerate(bowl.find_all(tag)): # yield yield { field.name: field.extractor.apply( @@ -558,7 +608,8 @@ def source2dicts(self, source): # any and all information it might need soup_top=bowl, soup_entry=spoon, - metadata=metadata + metadata=metadata, + index=i ) for field in self.fields if field.indexed } else: @@ -569,7 +620,7 @@ def source2dicts(self, source): # any and all information it might need soup_top='', soup_entry=soup, - metadata=metadata + metadata=metadata, ) for field in self.fields if field.indexed } @@ -601,20 +652,17 @@ def delimiter(self): ''' return ',' + @property + def skip_lines(self): + ''' + Number of lines to skip before reading the header + ''' + return 0 + def source2dicts(self, source): # make sure the field size is as big as the system permits 
csv.field_size_limit(sys.maxsize) - for field in self.fields: - if not isinstance(field.extractor, ( - extract.Choice, - extract.Combined, - extract.CSV, - extract.Constant, - extract.Backup, - extract.Metadata, - )): - raise RuntimeError( - "Specified extractor method cannot be used with a CSV corpus") + self._reject_extractors(extract.XML, extract.HTML) if isinstance(source, str): filename = source @@ -626,9 +674,15 @@ def source2dicts(self, source): with open(filename, 'r') as f: logger.info('Reading CSV file {}...'.format(filename)) + + # skip first n lines + for _ in range(self.skip_lines): + next(f) + reader = csv.DictReader(f, delimiter=self.delimiter) document_id = None rows = [] + index = 0 for row in reader: is_new_document = True @@ -644,19 +698,20 @@ def source2dicts(self, source): document_id = identifier if is_new_document and rows: - yield self.document_from_rows(rows, metadata) + yield self.document_from_rows(rows, metadata, index) rows = [row] + index += 1 else: rows.append(row) - yield self.document_from_rows(rows, metadata) + yield self.document_from_rows(rows, metadata, index) - def document_from_rows(self, rows, metadata): + def document_from_rows(self, rows, metadata, row_index): doc = { field.name: field.extractor.apply( # The extractor is put to work by simply throwing at it # any and all information it might need - rows=rows, metadata = metadata + rows=rows, metadata = metadata, index=row_index ) for field in self.fields if field.indexed } diff --git a/backend/addcorpus/es_mappings.py b/backend/addcorpus/es_mappings.py index 735325ea8..b2a350edb 100644 --- a/backend/addcorpus/es_mappings.py +++ b/backend/addcorpus/es_mappings.py @@ -67,3 +67,11 @@ def date_mapping(format='yyyy-MM-dd'): 'type': 'date', 'format': format } + +def int_mapping(): + return { + 'type': 'integer' + } + +def bool_mapping(): + return {'type': 'boolean'} diff --git a/backend/addcorpus/es_settings.py b/backend/addcorpus/es_settings.py index 664aa8213..21a0cc2fe 100644 
--- a/backend/addcorpus/es_settings.py +++ b/backend/addcorpus/es_settings.py @@ -1,5 +1,6 @@ import nltk import os +from langcodes import Language HERE = os.path.abspath(os.path.dirname(__file__)) NLTK_DATA_PATH = os.path.join(HERE, 'nltk_data') @@ -29,10 +30,20 @@ } } -def get_nltk_stopwords(language): +def get_language_key(language_code): + ''' + Get the nltk stopwords file / elasticsearch stemmer name for a language code + + E.g. 'en' -> 'english' + ''' + + return Language.make(language_code).display_name().lower() + +def get_nltk_stopwords(language_code): nltk.download('stopwords', NLTK_DATA_PATH) stopwords_dir = os.path.join(NLTK_DATA_PATH, 'corpora', 'stopwords') languages = os.listdir(stopwords_dir) + language = get_language_key(language_code) if language in languages: filepath = os.path.join(stopwords_dir, language) @@ -46,7 +57,7 @@ def get_nltk_stopwords(language): def es_settings(language = None, stopword_analyzer = False, stemming_analyzer = False): ''' Make elasticsearch settings json for a corpus index. Options: - - `language`: string with the language of the corpus. Must be specified if you want to use stopword or stemming analysers. + - `language`: string with the language code. See addcorpus.constants for options, and which languages support stopwords/stemming - `stopword_analyzer`: define an analyser that removes stopwords. - `stemming_analyzer`: define an analyser that removes stopwords and performs stemming. 
''' @@ -92,9 +103,10 @@ def make_stopword_analyzer(): } def make_stemmer_filter(language): + stemmer_language = get_language_key(language) return { "type": "stemmer", - "language": language + "language": stemmer_language } def make_stemmed_analyzer(): diff --git a/backend/addcorpus/extract.py b/backend/addcorpus/extract.py index ca51db37f..fba462923 100644 --- a/backend/addcorpus/extract.py +++ b/backend/addcorpus/extract.py @@ -125,6 +125,23 @@ def __init__(self, key, *nargs, **kwargs): def _apply(self, metadata, *nargs, **kwargs): return metadata.get(self.key) +class Pass(Extractor): + ''' + An extractor that just passes the value of another extractor. + + Useful if you want to stack multiple `transform` arguments + ''' + + def __init__(self, extractor, *nargs, **kwargs): + self.extractor = extractor + super().__init__(**kwargs) + + def _apply(self, *nargs, **kwargs): + return self.extractor.apply(*nargs, **kwargs) + +class Order(Extractor): + def _apply(self, index=None, *nargs, **kwargs): + return index class XML(Extractor): ''' @@ -235,7 +252,10 @@ def _apply(self, soup_top, soup_entry, *nargs, **kwargs): else: soup = self._select(soup_top if self.toplevel else soup_entry) if self.transform_soup_func: - soup = self.transform_soup_func(soup) + if type(soup) == bs4.element.ResultSet: + soup = [self.transform_soup_func(bowl) for bowl in soup] + else: + soup = self.transform_soup_func(soup) if not soup: return None diff --git a/backend/addcorpus/tests/mock_csv_corpus.py b/backend/addcorpus/tests/mock_csv_corpus.py index 2d22a3190..5beaaf337 100644 --- a/backend/addcorpus/tests/mock_csv_corpus.py +++ b/backend/addcorpus/tests/mock_csv_corpus.py @@ -17,6 +17,9 @@ class MockCSVCorpus(CSVCorpus): data_directory = os.path.join(here, 'csv_example') field_entry = 'character' + languages = ['en'] + category = 'book' + def sources(self, start, end): for filename in os.listdir(self.data_directory): full_path = os.path.join(self.data_directory, filename) diff --git 
a/backend/addcorpus/tests/test_corpus_access.py b/backend/addcorpus/tests/test_corpus_access.py index 808b12b1f..4765f1b71 100644 --- a/backend/addcorpus/tests/test_corpus_access.py +++ b/backend/addcorpus/tests/test_corpus_access.py @@ -1,19 +1,19 @@ from users.models import CustomUser -def test_access_through_group(mock_corpus, group_with_access): +def test_access_through_group(db, mock_corpus, group_with_access): user = CustomUser.objects.create(username='nice-user', password='secret') user.groups.add(group_with_access) user.save() assert user.has_access(mock_corpus) -def test_superuser_access(mock_corpus, admin_user, mock_corpora_in_db): +def test_superuser_access(mock_corpus, admin_user): assert admin_user.has_access(mock_corpus) -def test_no_corpus_access(mock_corpus): +def test_no_corpus_access(db, mock_corpus): user = CustomUser.objects.create(username='bad-user', password='secret') assert not user.has_access(mock_corpus) -def test_api_access(mock_corpus, group_with_access, auth_client, auth_user): +def test_api_access(db, mock_corpus, group_with_access, auth_client, auth_user): # default: no access response = auth_client.get('/api/corpus/') assert len(response.data) == 0 @@ -25,7 +25,7 @@ def test_api_access(mock_corpus, group_with_access, auth_client, auth_user): assert len(response.data) == 1 assert response.data[0].get('name') == mock_corpus -def test_superuser_api_access(admin_client, mock_corpus, mock_corpora_in_db): +def test_superuser_api_access(admin_client, mock_corpus): response = admin_client.get('/api/corpus/') assert response.status_code == 200 assert any(corpus['name'] == mock_corpus for corpus in response.data) diff --git a/backend/addcorpus/tests/test_corpus_views.py b/backend/addcorpus/tests/test_corpus_views.py index e79bccf0e..354faa3fa 100644 --- a/backend/addcorpus/tests/test_corpus_views.py +++ b/backend/addcorpus/tests/test_corpus_views.py @@ -1,33 +1,35 @@ -from rest_framework.test import APIClient from rest_framework import status 
from users.models import CustomUser +from addcorpus.tests.mock_csv_corpus import MockCSVCorpus - -def test_no_corpora(db, settings, auth_client): +def test_no_corpora(db, settings, admin_client): settings.CORPORA = {} - response = auth_client.get('/api/corpus/') + response = admin_client.get('/api/corpus/') assert status.is_success(response.status_code) assert response.data == [] -def test_corpus_documentation_view(client, mock_corpus, mock_corpus_user): - client.force_login(mock_corpus_user) - response = client.get(f'/api/corpus/documentation/{mock_corpus}/mock-csv-corpus.md') +def test_corpus_documentation_view(admin_client, mock_corpus): + response = admin_client.get(f'/api/corpus/documentation/{mock_corpus}/mock-csv-corpus.md') assert response.status_code == 200 -def test_nonexistent_corpus(client, mock_corpus, mock_corpus_user): - client.force_login(mock_corpus_user) - response = client.get(f'/api/corpus/documentation/unknown-corpus/mock-csv-corpus.md') +def test_nonexistent_corpus(admin_client): + response = admin_client.get(f'/api/corpus/documentation/unknown-corpus/mock-csv-corpus.md') assert response.status_code == 404 -def test_no_corpus_access(client, mock_corpus, mock_corpus_user): +def test_no_corpus_access(db, client, mock_corpus): '''Test a request from a user that should not have access to the corpus''' - # mock_corpus_user makes sure the corpus is properly saved in the database. 
- # now make a new user that does not have access - user = CustomUser.objects.create(username='bad-user', password='secret') client.force_login(user) response = client.get(f'/api/corpus/documentation/{mock_corpus}/mock-csv-corpus.md') assert response.status_code == 403 + +def test_corpus_serialization(admin_client, mock_corpus): + response = admin_client.get('/api/corpus/') + corpus = next(c for c in response.data if c['title'] == MockCSVCorpus.title) + assert corpus + assert corpus['languages'] == ['English'] + assert corpus['category'] == 'Books' + assert len(corpus['fields']) == 2 diff --git a/backend/addcorpus/tests/test_corpusimport.py b/backend/addcorpus/tests/test_corpusimport.py index 620a9afbd..b55d1a13b 100644 --- a/backend/addcorpus/tests/test_corpusimport.py +++ b/backend/addcorpus/tests/test_corpusimport.py @@ -19,16 +19,16 @@ def test_import_error(db, settings): - in case the file path in config.CORPORA is faulty ''' - settings.CORPORA = {'times': '/somewhere/times/times.py'} + settings.CORPORA = {'times2': '/somewhere/times/times.py'} with pytest.raises(FileNotFoundError) as e: - load_corpus.load_corpus('times') + load_corpus.load_corpus('times2') # corpus should not be included when # loading all corpora corpora = load_corpus.load_all_corpora() - assert 'times' not in corpora - assert not Corpus.objects.filter(name='times') + assert 'times2' not in corpora + assert not Corpus.objects.filter(name='times2') mock_corpus_definition = ''' class Times(): diff --git a/backend/addcorpus/tests/test_models.py b/backend/addcorpus/tests/test_models.py index 05ef2cb29..00c248b14 100644 --- a/backend/addcorpus/tests/test_models.py +++ b/backend/addcorpus/tests/test_models.py @@ -1,11 +1,11 @@ from addcorpus.models import Corpus def test_corpus_model(db): - corpus = Corpus(name = 'test_corpus', description = 'test.md') + corpus = Corpus(name = 'test_corpus', description = 'test') corpus.save() - assert len(Corpus.objects.all()) == 1 + assert 
Corpus.objects.filter(name = corpus.name) corpus.delete() - assert len(Corpus.objects.all()) == 0 + assert not Corpus.objects.filter(name = corpus) diff --git a/backend/api/es_query_to_query_model.py b/backend/api/es_query_to_query_model.py new file mode 100644 index 000000000..b22df9fdc --- /dev/null +++ b/backend/api/es_query_to_query_model.py @@ -0,0 +1,102 @@ +from visualization import query +from urllib.parse import quote + +def es_query_to_query_model(es_query): + model = dict() + + transformations = [ + include_query_text, + include_search_fields, + include_filters, + include_sort, + include_highlight + ] + + for transform in transformations: + transform(model, es_query) + + return model + +def include_query_text(model, es_query): + query_text = query.get_query_text(es_query) + model['queryText'] = query_text + return model + +def include_search_fields(model, es_query): + search_fields = query.get_search_fields(es_query) + if search_fields: + model['fields'] = search_fields + return model + +def include_sort(model, es_query): + if 'sort' in es_query: + sort = es_query['sort'][0] + field = list(sort.keys())[0] + direction = sort[field] + ascending = direction == 'asc' + model['sortBy'] = field + model['sortAscending'] = ascending + + return model + +def include_highlight(model, es_query): + if 'highlight' in es_query: + higlight = es_query['highlight'] + size = higlight['fragment_size'] + model['highlight'] = size + return model + +def include_filters(model, es_query): + filters = query.get_filters(es_query) + model['filters'] = list(map(format_filter_for_query_model, filters)) + return model + +def format_filter_for_query_model(es_filter): + type = list(es_filter.keys())[0] + condition = es_filter[type] + field = list(condition.keys())[0] + + data_formatters = { + 'range': format_range_data, + 'terms': format_terms_data, + 'term': format_term_data, + } + + current_data = data_formatters[type](condition[field]) + + return { + 'fieldName': field, + 
'description': '', + 'useAsFilter': True, + 'currentData': current_data, + } + +def format_range_data(data): + min = data['gte'] + max = data['lte'] + + if data.get('format', None): + return { + 'filterType': 'DateFilter', + 'min': min, + 'max': max, + } + + return { + 'filterType': 'RangeFilter', + 'min': min, + 'max': max, + } + +def format_term_data(data): + return { + 'filterType': 'BooleanFilter', + 'checked': data + } + +def format_terms_data(data): + selected = list(map(quote, data)) + return { + 'filterType': 'MultipleChoiceFilter', + 'selected': selected + } diff --git a/backend/api/migrations/0003_convert_to_es_query.py b/backend/api/migrations/0003_convert_to_es_query.py new file mode 100644 index 000000000..4a45aacf7 --- /dev/null +++ b/backend/api/migrations/0003_convert_to_es_query.py @@ -0,0 +1,32 @@ +# Generated by Django 4.1.5 on 2023-03-13 15:46 + +from django.db import migrations +from api.query_model_to_es_query import query_model_to_es_query +from api.es_query_to_query_model import es_query_to_query_model + +def convert_query_format_to_es_query(apps, schema_editor): + Query = apps.get_model('api', 'Query') + queries = Query.objects.all() + for query in queries: + query.query_json = query_model_to_es_query(query.query_json) + query.save() + +def convert_query_format_to_query_model(apps, schema_editor): + Query = apps.get_model('api', 'Query') + queries = Query.objects.all() + for query in queries: + query.query_json = es_query_to_query_model(query.query_json) + query.save() + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0002_alter_query_started'), + ] + + operations = [ + migrations.RunPython( + code=convert_query_format_to_es_query, + reverse_code=convert_query_format_to_query_model, + ) + ] diff --git a/backend/api/query_model_to_es_query.py b/backend/api/query_model_to_es_query.py new file mode 100644 index 000000000..a1c96695d --- /dev/null +++ b/backend/api/query_model_to_es_query.py @@ -0,0 +1,101 @@ +# converts 
json of the frontend 'QueryModel' to an elasticsearch query. + +from visualization import query +from urllib.parse import unquote + +def query_model_to_es_query(query_model): + es_query = query.set_query_text(query.MATCH_ALL, get_query_text(query_model)) + + filters = get_filters(query_model) + for filter in filters: + es_query = query.add_filter(es_query, filter) + + search_fields = get_search_fields(query_model) + if search_fields: + es_query = query.set_search_fields(es_query, search_fields) + + sort_by, sort_direction = get_sort(query_model) + if sort_by: + es_query = query.set_sort(es_query, sort_by, sort_direction) + + highlight = get_highlight(query_model) + if highlight: + es_query = query.set_highlight(es_query, highlight) + + return es_query + +def get_query_text(query_model): + return query_model.get('queryText', None) + +def get_search_fields(query_model): + return query_model.get('fields', None) + +def get_sort(query_model): + sort_by = query_model.get('sortBy', None) + direction = 'asc' if query_model.get('sortAscending', True) else 'desc' + return sort_by, direction + +def get_highlight(query_model): + return query_model.get('highlight', None) + +def get_filters(query_model): + return [ + convert_filter(filter) + for filter in query_model.get('filters', []) + if filter.get('useAsFilter', False) + ] + +def convert_filter(filter): + field = filter['fieldName'] + type = filter['currentData']['filterType'] + + type_converters = { + 'DateFilter': convert_date_filter, + 'RangeFilter': convert_range_filter, + 'MultipleChoiceFilter': convert_terms_filter, + 'BooleanFilter': convert_boolean_filter, + } + + return type_converters[type](field, filter['currentData']) + +def convert_date_filter(field, data): + min = data['min'] + max = data['max'] + + return { + 'range': { + field: { + 'gte': min, + 'lte': max, + 'format':'yyyy-MM-dd', + } + } + } + +def convert_range_filter(field, data): + min = data['min'] + max = data['max'] + + return { + 'range': { + field: 
{ + 'gte': min, + 'lte': max, + } + } + } + +def convert_terms_filter(field, data): + selected = data['selected'] + decoded = list(map(unquote, selected)) + + return { + 'terms': {field: decoded} + } + +def convert_boolean_filter(field, data): + checked = data['checked'] + + return { + 'term': {field: checked} + } diff --git a/backend/api/tests/test_query_model_to_es_query.py b/backend/api/tests/test_query_model_to_es_query.py new file mode 100644 index 000000000..86dafd8a6 --- /dev/null +++ b/backend/api/tests/test_query_model_to_es_query.py @@ -0,0 +1,81 @@ +import pytest +from api.query_model_to_es_query import query_model_to_es_query +from api.es_query_to_query_model import es_query_to_query_model +from copy import deepcopy + +cases = [ + ( + 'blank search', + {'queryText': None, 'filters': [], 'sortAscending': True}, + {'query': {'bool': {'must': {'match_all': {}}, 'filter': []}}} + ), ( + 'query text, no filters', + {'queryText':'test','filters':[],'sortAscending':True}, + {'query':{'bool':{'must':{'simple_query_string':{'query':'test','lenient':True,'default_operator':'or'}},'filter':[]}}} + ), ( + 'search fields', + {'queryText':'test','filters':[],'sortAscending':True,'fields':['content']}, + {'query':{'bool':{'must':{'simple_query_string':{'query':'test','lenient':True,'default_operator':'or','fields':['content']}},'filter':[]}}} + ), ( + 'date filter', + {'queryText':None,'filters':[{'fieldName':'date','description':'Search only within this time range.','useAsFilter':True,'defaultData':{'filterType':'DateFilter','min':'1815-01-01','max':'2022-12-31'},'currentData':{'filterType':'DateFilter','min':'1900-01-01','max':'2000-12-31'}}]}, + {'query':{'bool':{'must':{'match_all':{}},'filter':[{'range':{'date':{'gte':'1900-01-01','lte':'2000-12-31','format':'yyyy-MM-dd'}}}]}}}, + ), ( + 'range filter', + {'queryText':None,'filters':[{'fieldName':'year','description':'Restrict the years from which search results will be 
returned.','useAsFilter':True,'defaultData':{'filterType':'RangeFilter','min':1957,'max':2008},'currentData':{'filterType':'RangeFilter','min':1967,'max':1989}}],'sortAscending':True}, + {'query':{'bool':{'must':{'match_all':{}},'filter':[{'range':{'year':{'gte':1967,'lte':1989}}}]}}} + ), ( + 'terms filter', + {'queryText':None,'filters':[{'fieldName':'chamber','description':'Search only in debates from the selected chamber(s)','useAsFilter':True,'defaultData':{'filterType':'MultipleChoiceFilter','optionCount':2,'selected':[]},'currentData':{'filterType':'MultipleChoiceFilter','selected':['Eerste%20Kamer']}}]}, + {'query':{'bool':{'must':{'match_all':{}},'filter':[{'terms':{'chamber':['Eerste Kamer']}}]}}}, + ), ( + 'boolean filter', + {'queryText':None,'filters':[{'fieldName':'has_content','description':'Accept only articles that have available text content.','useAsFilter':True,'defaultData':{'filterType':'BooleanFilter','checked':False},'currentData':{'filterType':'BooleanFilter','checked':True}}],'sortBy':'date','sortAscending':True}, + {'query':{'bool':{'must':{'match_all':{}},'filter':[{'term':{'has_content':True}}]}},'sort':[{'date':'asc'}]} + ),( + 'sort by field', + {'queryText':None,'filters':[],'sortBy':'date','sortAscending':True}, + {'query':{'bool':{'must':{'match_all':{}},'filter':[]}},'sort':[{'date':'asc'}]}, + ), ( + 'highlight', + {'queryText':'test','filters':[],'sortAscending':True,'highlight':10}, + {'query':{'bool':{'must':{'simple_query_string':{'query':'test','lenient':True,'default_operator':'or'}},'filter':[]}},'highlight':{'fragment_size':10}} + ) +] + +def get_name(case): return case[0] + +@pytest.mark.parametrize('name,query_model,es_query', cases, ids=map(get_name, cases)) +def test_query_model_to_es_query(name, query_model, es_query): + result = query_model_to_es_query(query_model) + assert result == es_query + +extra_cases_for_reverse_op = [ + ( + 'es query without filters / boolean logic', + {'queryText': None, 'filters': [], 
'sortAscending': True}, + {'query': {'match_all': {}}} + # should be able to parse an es query without bool structure + # this structur is not generated by the query model -> es query conversion + # but IS sometimes generated by the frontend + ), +] + +reverse_cases = cases + extra_cases_for_reverse_op + +@pytest.mark.parametrize('name,query_model,es_query', reverse_cases, ids=map(get_name, reverse_cases)) +def test_es_query_to_query_model(name, query_model, es_query): + result = es_query_to_query_model(es_query) + + # clean up the model to remove some data that is irrelevant for querying + # and thus not represented in es_query + # it's not relevant for the search history either, so we don't need it + + model_copy = deepcopy(query_model) + if 'sortBy' not in model_copy and 'sortAscending' in model_copy: + del model_copy['sortAscending'] + for filter in model_copy['filters']: + filter['description'] = '' + del filter['defaultData'] + + assert result == model_copy diff --git a/backend/conftest.py b/backend/conftest.py index 20f158275..c59dc44ed 100644 --- a/backend/conftest.py +++ b/backend/conftest.py @@ -80,7 +80,8 @@ def es_client(): # mock corpora -@pytest.fixture() -def mock_corpora_in_db(db): - '''Make sure the mock corpora are included in the database''' - load_all_corpora() +@pytest.fixture(scope='function') +def django_db_setup(django_db_setup, django_db_blocker): + #add mock corpora to the database at the start of each test + with django_db_blocker.unblock(): + load_all_corpora() diff --git a/backend/corpora/dbnl/dbnl.py b/backend/corpora/dbnl/dbnl.py new file mode 100644 index 000000000..5ea9c1f9a --- /dev/null +++ b/backend/corpora/dbnl/dbnl.py @@ -0,0 +1,418 @@ +from datetime import datetime +import os +import re +from tqdm import tqdm + +from django.conf import settings +from addcorpus.corpus import XMLCorpus, Field +from addcorpus.extract import Metadata, XML, Pass, Order, Backup, Combined +import corpora.dbnl.utils as utils +from 
addcorpus.es_mappings import * +from addcorpus.filters import RangeFilter, MultipleChoiceFilter, BooleanFilter +from corpora.dbnl.dbnl_metadata import DBNLMetadata + +class DBNL(XMLCorpus): + title = 'DBNL' + description = 'Digital Library for Dutch Literature' + data_directory = settings.DBNL_DATA + min_date = datetime(year=1200, month=1, day=1) + max_date = datetime(year=1890, month=12, day=31) + es_index = getattr(settings, 'DBNL_ES_INDEX', 'dbnl') + image = 'dbnl.jpg' + description_page = 'dbnl.md' + + languages = ['nl', 'dum', 'fr', 'la', 'fy', 'lat', 'en', 'nds', 'de', 'af'] + category = 'book' + + tag_toplevel = 'TEI.2' + tag_entry = { 'name': 'div', 'attrs': {'type': 'chapter'} } + + document_context = { + 'context_fields': ['title_id'], + 'sort_field': 'chapter_index', + 'context_display_name': 'book' + } + + def sources(self, start = None, end = None): + metadata_corpus = DBNLMetadata() + all_metadata = utils.index_by_id(metadata_corpus.documents()) + + print('Extracting XML files...') + for id, path in tqdm(list(self._xml_files())): + metadata_id, *_ = re.split(r'_(?=\d+$)', id) + csv_metadata = all_metadata.pop(metadata_id) + metadata = { + 'id': id, + 'has_xml': True, + **csv_metadata + } + + year = int(metadata['year']) + if utils.between_years(year, start, end): + yield path, metadata + + # we popped metadata while going through the XMLs + # now add data for the remaining records (without text) + + print('Extracting metadata-only records...') + with utils.BlankXML(self.data_directory) as blank_file: + for id in tqdm(all_metadata): + csv_metadata = all_metadata[id] + metadata = { + 'id': id, + 'has_xml': False, + **csv_metadata + } + year = int(metadata['year']) + if utils.between_years(year, start, end): + yield blank_file, metadata + + def _xml_files(self): + xml_dir = os.path.join(self.data_directory, 'xml_pd') + for filename in os.listdir(xml_dir): + if filename.endswith('.xml'): + id, _ = os.path.splitext(filename) + path = os.path.join(xml_dir, 
filename) + yield id, path + + title_field = Field( + name='title', + display_name='Title', + description='Title of the book', + results_overview=True, + search_field_core=True, + csv_core=True, + extractor=Metadata('title'), + es_mapping=text_mapping(), + visualizations=['wordcloud'] + ) + + title_id = Field( + name='title_id', + display_name='Title ID', + description='ID of the book', + extractor = Metadata('id'), + es_mapping=keyword_mapping() + ) + + id = Field( + name='id', + es_mapping=keyword_mapping(), + extractor=Combined( + Metadata('id'), + Order(transform=lambda i: str(i).zfill(4)), + transform='_'.join, + ) + ) + + volumes = Field( + name='volumes', + display_name='Volumes', + description='Number of volumes in which this book was published', + extractor=Metadata('volumes'), + es_mapping=text_mapping(), + ) + + # text version of the year, can include things like 'ca. 1500', '14e eeuw' + year_full = Field( + name='year_full', + display_name='Publication year', + description='Year of publication in text format. May describe a range.', + results_overview=True, + csv_core=True, + extractor=Metadata('year_full'), + es_mapping=text_mapping(), + ) + + # version of the year that is always a number + year_int = Field( + name='year', + display_name='Publication year (est.)', + description='Year of publication as a number. 
May not be an estimate.', + extractor=Metadata('year'), + es_mapping=int_mapping(), + search_filter=RangeFilter( + description='Select books by publication year', + lower=1200, upper=1890 + ), + visualizations=['resultscount', 'termfrequency'], + sortable=True, + visualization_sort='key', + ) + + edition = Field( + name='edition', + display_name='Edition', + description='Edition of the book', + extractor=Metadata('edition'), + es_mapping=text_mapping(), + ) + + periodical = Field( + name='periodical', + display_name='Periodical', + description='Periodical in which the text appeared', + extractor=Metadata('periodical'), + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Select texts from periodicals', + ), + visualizations=['resultscount', 'termfrequency'], + ) + + author = Field( + name='author', + display_name='Author', + description='Name(s) of the author(s)', + results_overview=True, + search_field_core=True, + csv_core=True, + extractor=Metadata('author_name'), + es_mapping=keyword_mapping(enable_full_text_search=True), + visualizations=['resultscount', 'termfrequency'], + ) + + author_id = Field( + name='author_id', + display_name='Author ID', + description='ID(s) of the author(s)', + extractor=Metadata('author_id'), + es_mapping=keyword_mapping(), + ) + + author_year_of_birth = Field( + name='author_year_of_birth', + display_name='Author year of birth', + description='Year in which the author(s) was(/were) born', + extractor=Metadata('author_year_of_birth'), + es_mapping=text_mapping(), + ) + + author_year_of_death = Field( + name='author_year_of_death', + display_name='Author year of death', + description='Year in which the author(s) died', + extractor=Metadata('author_year_of_death'), + es_mapping=text_mapping(), + ) + + # the above fields are also given as proper dates in geb_datum / overl_datum + # but implementing them as date fields requires support for multiple values + + author_place_of_birth = Field( + 
name='author_place_of_birth', + display_name='Author place of birth', + description='Place the author(s) was(/were) born', + extractor=Metadata('author_place_of_birth'), + es_mapping=keyword_mapping(), + ) + + author_place_of_death = Field( + name='author_place_of_death', + display_name='Author place of death', + description='Place where the author(s) died', + extractor=Metadata('author_place_of_death'), + es_mapping=keyword_mapping(), + ) + + author_gender = Field( + name='author_gender', + display_name='Author gender', + description='Gender of the author(s)', + extractor=Metadata('author_gender'), + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Select books based on the gender of the author(s)', + ), + visualizations=['resultscount', 'termfrequency'], + ) + + url = Field( + name='url', + display_name='View on DBNL', + description='Link to the book\'s page in DBNL', + extractor=Metadata('url'), + es_mapping=keyword_mapping(), + ) + + genre = Field( + name='genre', + display_name='Genre', + description='Genre of the book', + extractor=Metadata('genre'), + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Select books in these genres', + ), + visualizations=['resultscount', 'termfrequency'], + ) + + language = Field( + name='language', + display_name='Language', + description='Language in which the book is written', + # this extractor is similar to language_code below, + # but designed to accept multiple values in case of uncertainty + extractor=Pass( + Pass( + Backup( + XML( # get the language on chapter-level if available + attribute='lang', + transform=lambda value: [value] if value else None, + ), + XML( # look for section-level codes + {'name': 'div', 'attrs': {'type': 'section'}}, + attribute='lang', + multiple=True, + ), + XML( # look in the top-level metadata + 'language', + toplevel=True, + recursive=True, + multiple=True, + attribute='id' + ), + transform = lambda codes: 
map(utils.language_name, codes) if codes else None, + ), + transform=utils.sorted_and_unique, + ), + transform=utils.join_values, + ), + es_mapping=keyword_mapping(), + search_filter=MultipleChoiceFilter( + description='Select books in these languages', + option_count=20, + ), + visualizations=['resultscount', 'termfrequency'], + ) + + language_code = Field( + name='language_code', + display_name='Language code', + description='ISO code of the text\'s language', + # as this may be used to set the HTML lang attribute, it forces a single value + extractor=Pass( + Backup( + XML( # get the language on chapter-level if available + attribute='lang', + ), + XML( # look for section-level code + {'name': 'div', 'attrs': {'type': 'section'}}, + attribute='lang' + ), + XML( #otherwise, get the (first) language for the book + 'language', + attribute='id', + toplevel=True, + recursive=True, + ), + transform=utils.single_language_code, + ), + transform=utils.standardize_language_code, + ), + es_mapping=keyword_mapping(), + ) + + chapter_title = Field( + name='chapter_title', + display_name='Chapter', + extractor=Backup( + XML( + tag='head', + recursive=True, + flatten=True, + ), + XML( + tag=utils.LINE_TAG, + recursive=True, + flatten=True, + ) + ), + results_overview=True, + search_field_core=True, + csv_core=True, + visualizations=['wordcloud'], + ) + + chapter_index = Field( + name='chapter_index', + display_name='Chapter index', + description='Order of this chapter within the book', + extractor=Order( + transform=lambda x : x + 1, + applicable=lambda metadata: metadata['has_xml'] + ), + es_mapping=int_mapping(), + sortable=True, + ) + + content = Field( + name='content', + display_name='Content', + description='Text in this chapter', + display_type='text_content', + results_overview=True, + search_field_core=True, + csv_core=True, + extractor=XML( + tag=utils.LINE_TAG, + recursive=True, + multiple=True, + flatten=True, + transform_soup_func=utils.pad_content, + ), + 
es_mapping=main_content_mapping(token_counts=True), + visualizations=['wordcloud', 'ngram'], + ) + + has_content = Field( + name='has_content', + display_name='Content available', + description='Whether the contents of this book are available on I-analyzer', + extractor=Metadata('has_xml'), + es_mapping=bool_mapping(), + search_filter=BooleanFilter( + description='Select books with text available on I-analyzer, or metadata-only books', + true='Content available', + false='Metadata only' + ), + ) + + is_primary = Field( + name='is_primary', + display_name='Primary', + description='Whether this is the primary document for this book - each book has only one primary document', + extractor=Order(transform = lambda index : index == 0), + search_filter=BooleanFilter( + true='Primary', + false='Other', + description='Select only primary documents - i.e. only one result per book', + ) + ) + + fields = [ + title_field, + title_id, + id, + volumes, + edition, + periodical, + year_full, + year_int, + author, + author_id, + author_year_of_birth, + author_place_of_birth, + author_year_of_death, + author_place_of_death, + author_gender, + url, + genre, + language, + language_code, + chapter_title, + chapter_index, + content, + has_content, + is_primary, + ] diff --git a/backend/corpora/dbnl/dbnl_metadata.py b/backend/corpora/dbnl/dbnl_metadata.py new file mode 100644 index 000000000..37af17db7 --- /dev/null +++ b/backend/corpora/dbnl/dbnl_metadata.py @@ -0,0 +1,93 @@ +import os +from django.conf import settings +from addcorpus.corpus import CSVCorpus, Field +from addcorpus.extract import CSV, Combined, Pass +import corpora.dbnl.utils as utils + +class DBNLMetadata(CSVCorpus): + '''Helper corpus for extracting the DBNL metadata. 
+ + Used by the DBNL corpus for CSV extraction utilities - + not intended as a standalone corpus.''' + + data_directory = settings.DBNL_DATA + + field_entry = 'ti_id' + delimiter = '|' + skip_lines = 1 + + def sources(self, start=None, end=None): + csv_path = os.path.join(self.data_directory, 'titels_pd.csv') + yield csv_path, {} + + # fields that have a singular value + _singular_fields = [ + ('title_id', 'ti_id'), + ('title', 'titel'), + ('volumes', 'vols'), + ('year', '_jaar'), + ('year_full', 'jaar'), + ('edition', 'druk'), + ('url', 'url'), + ] + + # fields that should be extracted for each author + # but have otherwise straightfoward extraction + _simple_author_fields = [ + ('id', 'pers_id'), + ('year_of_birth', 'jaar_geboren'), + ('place_of_birth', 'geb_plaats'), + ('year_of_death', 'jaar_overlijden'), + ('place_of_death', 'overl_plaats') + ] + + fields = [ + Field(name=name, extractor=CSV(column)) + for name, column in _singular_fields + ] + [ + Field( + name='genre', + extractor=Pass( + utils.filter_by( + CSV('genre', multiple=True), + CSV('genre', multiple=True, transform=utils.which_unique) + ), + transform=utils.join_values, + ) + ), + Field( + name='periodical', + extractor=CSV('achternaam', multiple=True, + transform=utils.get_periodical + ) + ) + ] + [ + Field( + 'author_' + name, + extractor=utils.by_author( + CSV(column, multiple=True), + ) + ) + for name, column in _simple_author_fields + ] + [ + Field( + 'author_name', + extractor=utils.by_author( + Combined( + CSV('voornaam', multiple=True), + CSV('voorvoegsel', multiple=True), + CSV('achternaam', multiple=True), + transform=lambda names: map(utils.format_name, zip(*names)) + ), + ) + ), + Field( + 'author_gender', + extractor=utils.by_author( + CSV('vrouw', multiple=True, + transform=lambda values: map(utils.format_gender, values) + ) + ) + ) + ] + diff --git a/backend/corpora/dbnl/description/dbnl.md b/backend/corpora/dbnl/description/dbnl.md new file mode 100644 index 000000000..583046679 --- 
/dev/null +++ b/backend/corpora/dbnl/description/dbnl.md @@ -0,0 +1,15 @@ +### About DBNL + +The Digital Library of Dutch Literature ([DBNL](https://www.dbnl.org/)) is a digital collection of texts from Dutch literature, linguistics, and cultural history, from the earliest period to the present. The collection represents the whole of the Dutch language area. DBNL is a collaboration between the [Taalunie](https://taalunie.org/), the [Vlaamse Erfgoedbibliotheken](https://vlaamse-erfgoedbibliotheken.be/), and the [KB, the Dutch Royal Library](https://www.kb.nl/). + +### What can you find in the DBNL dataset? + +The DBNL dataset can be used for research into Dutch and Flemish linguistics and literature, from the middle ages to the present. Limburghish, Frisian, Surinam, and South African literature are represented. + +The dataset contains digitised texts, which have been manually corrected, with metadata. It includes medieval literature as well as classic novels. In addition, the dataset contains magazines from Dutch language and literary studies, such as De Gids and De Revisor. + +### Availability + +The I-analyzer corpus contains the publicly available portion of the [DBNL-dataset](https://www.kb.nl/onderzoeken-vinden/datasets/dbnl-dataset). These texts are in the public domain. (You can also download the full dataset through this link.) + +For some books, the public dataset provides metadata but not the full text. In documents with metadata only, the full text is usually available on the DBNL interface. 
diff --git a/backend/corpora/dbnl/images/dbnl.jpg b/backend/corpora/dbnl/images/dbnl.jpg new file mode 100644 index 000000000..4b31eff17 Binary files /dev/null and b/backend/corpora/dbnl/images/dbnl.jpg differ diff --git a/backend/corpora/dbnl/readme.md b/backend/corpora/dbnl/readme.md new file mode 100644 index 000000000..1a52048f4 --- /dev/null +++ b/backend/corpora/dbnl/readme.md @@ -0,0 +1,11 @@ +# DBNL Corpus + +## Data + +The public domain can be downloaded from the [DBNL interface](https://www.dbnl.org/letterkunde/pd/index.php). You will need the .csv metadata file, and the XML files. + +Your source directory should consist of the following: +- `'titels_pd.csv'`: the metadata file +- `xml_pd`: a directory that contains all the (unzipped) XML files. + +If you want to use a sample for development, you can include fewer XML files. Any metadata not found as XML will still be added as a metadata-only records, but this is much faster than parsing and indexing the content. diff --git a/backend/corpora/dbnl/tests/data/titels_pd.csv b/backend/corpora/dbnl/tests/data/titels_pd.csv new file mode 100644 index 000000000..035a37c96 --- /dev/null +++ b/backend/corpora/dbnl/tests/data/titels_pd.csv @@ -0,0 +1,18 @@ +sep=| +ti_id|titel|vols|jaar|druk|ppn_o|bibliotheek|categorie|_jaar|pers_id|voornaam|voorvoegsel|achternaam|jaar_geboren|jaar_overlijden|geb_datum|overl_datum|geb_plaats|overl_plaats|geb_plaats_code|geb_land_code|overl_plaats_code|overl_land_code|vrouw|url|text_url|maand|genre| +"maer002alex01"|"Alexanders geesten"||"13de eeuw"|"handschrift"|||"1"|"1200"|"maer002"|"Jacob"|"van"|"Maerlant"|"ca. 1230"|"ca. 1300"||||"Damme"|||"damme001"||"0"|"https://dbnl.org/tekst/maer002alex01_01"|||"poëzie"| +"maer002spie00"|"Spiegel historiael (5 delen)"||"ca. 1283-1325"|"handschrift"|||"1"|"1283"|"maer002"|"Jacob"|"van"|"Maerlant"|"ca. 1230"|"ca. 
1300"||||"Damme"|||"damme001"||"0"|"https://dbnl.org/tekst/maer002spie00_01"|||"poëzie"| +"maer002spie00"|"Spiegel historiael (5 delen)"||"ca. 1283-1325"|"handschrift"|||"1"|"1283"|"uten001"|"Philip"||"Utenbroecke"|"?(13de eeuw)"|"?(14de eeuw)"|||||||||"0"|"https://dbnl.org/tekst/maer002spie00_01"|||"poëzie"| +"maer002spie00"|"Spiegel historiael (5 delen)"||"ca. 1283-1325"|"handschrift"|||"1"|"1283"|"velt003"|"Lodewijk"|"van"|"Velthem"|"ca. 1270"|"na 1326"||||"Waldenrath"|||"walde001"||"0"|"https://dbnl.org/tekst/maer002spie00_01"|||"poëzie"| +"maer002spie02"|"Spiegel historiael. Eerste partie"||"ca. 1283-1296"|"handschrift"|||"1"|"1283"|"maer002"|"Jacob"|"van"|"Maerlant"|"ca. 1230"|"ca. 1300"||||"Damme"|||"damme001"||"0"|"https://dbnl.org/tekst/maer002spie02_01"|||"poëzie"| +"maer002spie05"|"Spiegel historiael. Derde partie"||"ca. 1283-1296"|"handschrift"|||"1"|"1283"|"maer002"|"Jacob"|"van"|"Maerlant"|"ca. 1230"|"ca. 1300"||||"Damme"|||"damme001"||"0"|"https://dbnl.org/tekst/maer002spie05_01"|||"poëzie"| +"maer002spie06"|"Spiegel historiael. Vierde partie"||"ca. 1283-1296"|"handschrift"|||"1"|"1283"|"maer002"|"Jacob"|"van"|"Maerlant"|"ca. 1230"|"ca. 
1300"||||"Damme"|||"damme001"||"0"|"https://dbnl.org/tekst/maer002spie06_01"|||"poëzie"| +"maer005sing01"|"Het singende nachtegaeltje"||"1671"|"1ste druk"|"393478793"|"denha004koni01"|"1"|"1671"|"maer005"|"Cornelis"||"Maertsz."|"?"|"na 1671"|||"Wervershoof"||"werve001"||||"0"|"https://dbnl.org/tekst/maer005sing01_01"|"https://dbnl.org/nieuws/text.php?id=maer005sing01"|"2012_10 "|"poëzie"| +"will028belg00"|"Belgisch museum voor de Nederduitsche tael- en letterkunde en de geschiedenis des vaderlands"||"1837-1846"|"1ste druk"|"394987047"||"1"|"1837"|"will028"|"J.F."||"Willems"|"1793"|"1846"|"11 maart"|"24 juni"|"Boechout"|"Gent"|"boech001"||"gent_001"||"0"|"https://dbnl.org/tekst/will028belg00_01"|||"proza"| +"will028belg00"|"Belgisch museum voor de Nederduitsche tael- en letterkunde en de geschiedenis des vaderlands"||"1837-1846"|"1ste druk"|"394987047"||"1"|"1837"|"_bel001"|||"[tijdschrift] Belgisch Museum"|||||||||||"0"|"https://dbnl.org/tekst/will028belg00_01"|||"proza"| +"_ale002alex01"|"Die historie dat leven ende dat regiment des alre grootsten ende machtichsten coninc alexanders die heer was ende prince alle der werelt"||"1477"|"1ste druk"||"berli004staa01"|"1"|"1477"|"_ale002"|"anoniem"||"Die hystorie vanden grooten Coninck Alexander"|||||||||||"0"|"https://dbnl.org/tekst/_ale002alex01_01"|"https://dbnl.org/nieuws/text.php?id=_ale002alex01"|"2020_05 "|"proza"| +"_ale002alex01"|"Die historie dat leven ende dat regiment des alre grootsten ende machtichsten coninc alexanders die heer was ende prince alle der werelt"||"1477"|"1ste druk"||"berli004staa01"|"1"|"1477"|"_ale002"|"anoniem"||"Die hystorie vanden grooten Coninck Alexander"|||||||||||"0"|"https://dbnl.org/tekst/_ale002alex01_01"|"https://dbnl.org/nieuws/text.php?id=_ale002alex01"|"2020_05 "|"non-fictie"| +"_gid001184801"|"De Gids. 
Jaargang 12"||"1848"|"1ste druk"||"leide001univ01"|"1"|"1848"|"_gid001"|||"[tijdschrift] Gids, De"|||||||||||"0"|"https://dbnl.org/tekst/_gid001184801_01"|"https://dbnl.org/nieuws/text.php?id=_gid001184801"|"2008_03 "|"proza"| +"_gid001184801"|"De Gids. Jaargang 12"||"1848"|"1ste druk"||"leide001univ01"|"1"|"1848"|"_gid001"|||"[tijdschrift] Gids, De"|||||||||||"0"|"https://dbnl.org/tekst/_gid001184801_01"|"https://dbnl.org/nieuws/text.php?id=_gid001184801"|"2008_03 "|"poëzie"| +"_gid001184801"|"De Gids. Jaargang 12"||"1848"|"1ste druk"||"leide001univ01"|"1"|"1848"|"_gid001"|||"[tijdschrift] Gids, De"|||||||||||"0"|"https://dbnl.org/tekst/_gid001184801_01"|"https://dbnl.org/nieuws/text.php?id=_gid001184801"|"2008_03 "|"sec - letterkunde"| +"_gid001184801"|"De Gids. Jaargang 12"||"1848"|"1ste druk"||"leide001univ01"|"1"|"1848"|"_gid001"|||"[tijdschrift] Gids, De"|||||||||||"0"|"https://dbnl.org/tekst/_gid001184801_01"|"https://dbnl.org/nieuws/text.php?id=_gid001184801"|"2008_03 "|"sec - taalkunde"| diff --git a/backend/corpora/dbnl/tests/data/xml_pd/maer005sing01_01.xml b/backend/corpora/dbnl/tests/data/xml_pd/maer005sing01_01.xml new file mode 100644 index 000000000..d9ada5c61 --- /dev/null +++ b/backend/corpora/dbnl/tests/data/xml_pd/maer005sing01_01.xml @@ -0,0 +1,225 @@ + + + + + + + Het singende nachtegaeltje + Cornelis Maertsz. + + + + Dit bestand biedt, behoudens een aantal hierna te noemen ingrepen, een diplomatische weergave van Het singende nachtegaeltje van Cornelis Maertsz. in de eerste druk uit 1671. + + + p. 9: 4 → 5: ‘5 Wel aen, ghy schoone, en ghy nette ieught.’ + p. 28: david → David: ‘Heeft David in sijn tooren.’ + p. 41: Veriogen → Verioegen: ‘Verioegen, en sloegen' + p. 42: 4. → 3.: ‘3. Weest wellekom Vorsten van Iacobs geslacht.’ + p. 43: thiemael → thienmael: ‘g' Hebt thiemael, den Philisteen meer ontbloot.’ + p. 48: leveu → leven: En geeft niemant in't leven.’ + p. 62: vulcht → vlucht: ‘Soo dacht hy met een snelle vlucht.’ + p. 
63: Hemles → Hemels: ‘Soo comt des Hemels Heer.’ + p. 67: Danneer → Wanneer: ‘Wanneer als hy op rijst in de kimmen.’ + p. 69: Buyttn → Buyten: ‘Buyten 't spoor, en buyten 't padt.’ + p. 69: 3 → 2.: ‘2. 't Geselschap dat u hier toe-lacht.’ + p. 72: in het origineel is een gedeelte van de tekst slecht leesbaar. De redactie heeft de tekst tussen vierkante haken aangevuld: ‘De [y]delhe[y]t der Rijckdommen.’ + p. 72: in het origineel is een gedeelte van de tekst slecht leesbaar. De redactie heeft de tekst tussen vierkante haken aangevuld: 'Stem: Doe ic lest wandeld[e] over de Helder.’ + p. 75: sonde → soude: ‘Dus heerlijck soude werden opgetoyt.’ + p. 86: bruyseu → bruysen: ‘Haer gollefjes doet bruysen.’ + p. 86: 5. → 4.: ‘4. Die van laveeren dwers, en langhs.’ + p. 94: aenschowen → aenschouwen: ‘3. Daer sulje aenschouwen.’ + p. 96: FN → EN: ‘EN roemt niet van u Landt, noch Stadt.’ + p. 97: 8. → 3.: ‘3. Den Noor-man hackt gestaegh in 't Wout.’ + p. 108: koop mans goedt → koop-mans goedt: ‘4. Dan sal al des koop-mans goedt.’ + p. 110: 3. → 7.: ‘7. Maer soud' ick nu singen uyt.’ + p. 118: lnsten → lusten: ‘5. Wie soud' lusten nu te schieten.’ + p. 121: in het origineel is een gedeelte van de tekst onleesbaar. In deze digitale editie is ‘[...]’ geplaatst: Wanneer hy flau[...]r en naer.’ + p. 123: verbldt → verblijdt: ‘'t Zy men is verblijdt.’ + p. 127: vlas → vals: ‘Geen vals geruchte smeet.’, 'En haet vals sweeren.’ + p. 127: nytsuyght → uytsuyght: ‘Sijn naesten niet uytsuyght.’ + p. 127: nytstorten → uytstorten: ‘4. Die sijn gemoedt voor Gode kan uytstorten.’ + p. 138: fruy → fury: ‘10. Den satan valt in fury aen.’ + p. 139: flaverny → slaverny: ‘Van des satans slaverny.’ + p. 141: voleck → vloeck: ‘Sonde, Satan, Wet en vloeck.’ + p. 144: Steeren → Sterren: ‘Bij de Sterren.’ + p. 148: wreeet → wreet: ‘Als hy verstont dit wreet bedrijf.’ + p. 148: Eu → En: ‘5. Eu stapte mee een rassche voet.’ + p. 151: wrecken → wercken: ‘Het aerdtsche lichaem, en sijn wercken.’ + p. 
159: 4. → 5.: ‘5. Wel wieje dan zijt, en laet u hert.’ + p. 161: 4. → 3.: ‘3. Komt Ierusalems Vier-schaer.’ + p. 164: Ie → In: ‘In't eerst veel goedts in t' lest veel straf.’ + p. 165: deu → den: ‘2. In den Eersten quammer voort.’ + p. 165: duysterhendt → duysterheydt: ‘Inde duysterheydt.’ + p. 171: Vermeedert → Vermeerdert: ‘Vermeerdert alle daegh.’ + p. 187: Het foutieve paginanummer 107 is verbeterd in 187. + p. 192: Nn → Nu: ‘Nu noodige vruchten al rijp, en gans.’ + p. 194: Massouw → Nassouw: ‘Stem: Treurt edel huys Nassouw.’ + p. 196: gemeeen → gemeen: ‘Dat is dan in 't gemeen.’ + p. 196: in het origineel is een gedeelte van de tekst slecht leesbaar. De redactie heeft de tekst tussen vierkante haken aangevuld: 'D[u]s aengeprickelt, soude.’ + p. 222: minmermeer → nimmermeer: ‘En nimmermeer.’ + p. 223: bijjven → blijven: ‘Hoe kan met Godt ons Vni blijven.’ + p. 224: vreuhtbare → vreuchtbare: ‘Veldt-Sangh, Op 't vreuchtbare ghewas in Iulius 1654.’ + p. 230: ougeluck → ongeluck: ‘My een ongeluck aentreft.’ + p. 233: schnnent → schijnent: ‘Als dan alleen, wanneer door 't schijnent' licht.’ + p. 234: d'outucht → d'ontucht: ‘Wanneer d'ontucht haer swang're wangen lost.’ + p. 239: 1645 → 1654: ‘Wellekomst, Aen Iuffr. Geertruyde Maria Bailly. Gekomen tot Wervershoof den 5. September 1654.’ + p. 240: Vrouwet → Vrouwe,: ‘WElkom, welkom waerde Vrouwe.’ + p. 244: nn → nu: ‘Doe nu ons Vaderlandt.’ + p. 246: lnst → lust: ‘Ghy die aen lust, en weeldt.’ + + + + + + + + + + + + + A1v + + + + + + maer005sing01_01 + DBNL-TEI 1 + +

2012 dbnl

+
+
+ + DSOLmetadata:yes + + + + + exemplaar universiteitsbibliotheek Leiden, signatuur: 1197 H 21 + +

Cornelis Maertsz., Het singende nachtegaeltje. Michiel de Groot, Amsterdam 1671.

+

 

+
+
+ + +

Wijze van coderen: standaard

+

+
+
+ + + Nederlands + + + + + Het singende nachtegaeltje + Cornelis Maertsz. + + + + + + + + + + + + +

+

+

+
+ + + Het singende nachtegaeltje + Cornelis Maertsz. + + + + + + + + + + + + +

+

+

+
+ + + 2012-09-17 + + SW + + colofon toegevoegd + + +
+
+ + +

 

+

Het singende Nachtegaeltje

+

Quelende soetelijck, tot stichtelijck vermaeck voor de Christelijck Ieught.

+

Door.

+

Cornelis Maertsz. tot Wervers hoof.

+

't Amsterdam Voor Michiel de Groot, Boek-Verkooper op den Nieuwen Dijck, 1671.

+
+ + + + + +Op De vermakelijke en stightelijke Liedekens van Cornelis Maarts +SOo wort de schrand're Rey der vloeiende Poëten +Door u, o waerde Vriend! vervult, +Soo wort uw' Naam met Eer vergult, +En door de Lof-bazuin roem rughtigh uitgekreten, + +De Dight-kunst scheen wel eer in Amstel silte Plassen +Alleen te sitten op haer throon, +Maar ghy stelt in uw' Dight ten toon +Dat in ons Wervershoof nogh eed'ler vrughten wassen. +Want't baat niet dat men kan een yd'le Pen beswang'ren +Met wonderlijck Gedight, +Indienmen niet en stight, +Maer met een Heydensch rot vervult de Mond der Sang'ren. +Ghy soeckt de Af-breuk van het Rijck des Helschen lagers. + +Dies ghy een Heiligh Ooghwit raakt, +En onse Ieught sticht en vermaakt +Soo volght ghy 't saligh Spoor des Ioodschen Harpe-Slagers. +Treet voort dien Eerbaan in, en laat geen Aardsch gewemel +V hind'ren in soo eed'len Saak, +Soo streckt uw 'Lands-lien tot een Baak +En voert ons dart'le Ieughd al singende ten Hemel. +H. Vander Meer. +
+
+ +Register der Liedekens. +
+A. + + ACh gesalfde van den Heer.Pag. 30 + Als Saul, en david den vyant in't velt.41 + Als ick de Son verhoogen sie.184 + Als hem de Son begeeft.189 + Als ick den Herfst aenschou.194 + Als in koelt, de nacht komt overkleeden208 + Als van der meer op Eng'le-vleug'len vloog.232 +
+
+
+ +
+
diff --git a/backend/corpora/dbnl/tests/test_dbnl_extraction.py b/backend/corpora/dbnl/tests/test_dbnl_extraction.py new file mode 100644 index 000000000..33a55981f --- /dev/null +++ b/backend/corpora/dbnl/tests/test_dbnl_extraction.py @@ -0,0 +1,200 @@ +import pytest +import os +from bs4 import BeautifulSoup + +from addcorpus.load_corpus import load_corpus +from addcorpus.extract import XML +from corpora.dbnl.utils import append_to_tag, index_by_id, which_unique, language_name + +here = os.path.abspath(os.path.dirname(__file__)) + +@pytest.fixture +def dbnl_corpus(settings): + settings.DBNL_DATA = os.path.join(here, 'data') + # for testing purposes, also add the metadata helper corpus + settings.CORPORA = { + 'dbnl': os.path.join(here, '..', 'dbnl.py'), + 'dbnl_metadata': os.path.join(here, '..', 'dbnl_metadata.py'), + } + return 'dbnl' + +language_name_testcases = [ + ('nl', 'Dutch'), + ('la', 'Latin'), + ('lat', 'Latin'), + ('rus', 'Russian') +] + +@pytest.mark.parametrize(['code', 'name'], language_name_testcases) +def test_language_names(code, name): + assert language_name(code) == name + +which_unique_testcases = [ + (['_ale002', '_ale002'], [True, False]), + (['proza', 'poëzie', 'proza'], [True, True, False]) +] + +@pytest.mark.parametrize(['items', 'uniquenesses'], which_unique_testcases) +def test_which_unique(items, uniquenesses): + result = list(which_unique(items)) + assert result == uniquenesses + +def test_metadata_extraction(dbnl_corpus): + corpus = load_corpus('dbnl_metadata') + data = index_by_id(corpus.documents()) + assert len(data) == 9 + + item = data['maer005sing01'] + assert item['title'] == 'Het singende nachtegaeltje' + assert item['author_name'] == 'Cornelis Maertsz.' 
+ + multiple_authors = data['maer002spie00'] + assert multiple_authors['title'] == 'Spiegel historiael (5 delen)' + assert multiple_authors['author_name'] == 'Jacob van Maerlant, Philip Utenbroecke, Lodewijk van Velthem' + +append_testcases = [ + ( + 'Vraeghje wie het meeste goedt.107', + 'cell', + ' ', + 'Vraeghje wie het meeste goedt.107', + 'Vraeghje wie het meeste goedt. 107', + ), + ( + 'Nu lokken schone Prenten\nHun beider vrolijke ogen', + 'lb', + '\n', + 'Nu lokken schone Prenten Hun beider vrolijke ogen', + 'Nu lokken schone Prenten\nHun beider vrolijke ogen', + ), +] + +@pytest.mark.parametrize(['xml', 'tag', 'padding', 'original_output', 'new_output'], append_testcases) +def test_append_to_tag(xml, tag, padding, original_output, new_output): + soup = BeautifulSoup(xml, 'lxml-xml') + extractor = XML(flatten=True) + assert extractor._flatten(soup) == original_output + + edited_soup = append_to_tag(soup, tag, padding) + + assert extractor._flatten(edited_soup) == new_output + +expected_docs = [ + { + 'title_id': 'maer005sing01_01', + 'title': 'Het singende nachtegaeltje', + 'id': 'maer005sing01_01_0000', + 'volumes': None, + 'edition': '1ste druk', + 'periodical': None, + 'author_id': 'maer005', + 'author': 'Cornelis Maertsz.', + 'author_year_of_birth': None, + 'author_place_of_birth': 'Wervershoof', + 'author_year_of_death': 'na 1671', + 'author_place_of_death': None, + 'author_gender': 'man/unknown', + 'url': 'https://dbnl.org/tekst/maer005sing01_01', + 'year': '1671', + 'year_full': '1671', + 'genre': 'poëzie', + 'language': 'Dutch', + 'language_code': 'nl', + 'content': '\n'.join([ + 'Het singende Nachtegaeltje', + 'Quelende soetelijck, tot stichtelijck vermaeck voor de Christelijck Ieught.', + 'Door.', + 'Cornelis Maertsz. 
tot Wervers hoof.', + '\'t Amsterdam Voor Michiel de Groot, Boek-Verkooper op den Nieuwen Dijck, 1671.', + ]), + 'chapter_title': None, + 'chapter_index': 1, + 'has_content': True, + 'is_primary': True, + }, + { + 'content': '\n'.join([ + 'Op De vermakelijke en stightelijke Liedekens van Cornelis Maarts', + 'SOo wort de schrand\'re Rey der vloeiende Poëten', + 'Door u, o waerde Vriend! vervult,', + 'Soo wort uw\' Naam met Eer vergult,', + 'En door de Lof-bazuin roem rughtigh uitgekreten,', + 'De Dight-kunst scheen wel eer in Amstel silte Plassen', + 'Alleen te sitten op haer throon,', + 'Maar ghy stelt in uw\' Dight ten toon', + 'Dat in ons Wervershoof nogh eed\'ler vrughten wassen.', + 'Want\'t baat niet dat men kan een yd\'le Pen beswang\'ren', + 'Met wonderlijck Gedight,', + 'Indienmen niet en stight,', + 'Maer met een Heydensch rot vervult de Mond der Sang\'ren.', + 'Ghy soeckt de Af-breuk van het Rijck des Helschen lagers.', + 'Dies ghy een Heiligh Ooghwit raakt,', + 'En onse Ieught sticht en vermaakt', + 'Soo volght ghy \'t saligh Spoor des Ioodschen Harpe-Slagers.', + 'Treet voort dien Eerbaan in, en laat geen Aardsch gewemel', + 'V hind\'ren in soo eed\'len Saak,', + 'Soo streckt uw \'Lands-lien tot een Baak', + 'En voert ons dart\'le Ieughd al singende ten Hemel.', + 'H. Vander Meer.', + ]), + 'chapter_title': 'Op De vermakelijke en stightelijke Liedekens van Cornelis Maarts', + 'chapter_index': 2, + 'is_primary': False, + }, { + 'chapter_title': 'Register der Liedekens.', + 'content': '\n'.join([ + 'Register der Liedekens.', + 'A.', + 'ACh gesalfde van den Heer. Pag. 30 ', + 'Als Saul, en david den vyant in\'t velt. 41 ', + 'Als ick de Son verhoogen sie. 184 ', + 'Als hem de Son begeeft. 189 ', + 'Als ick den Herfst aenschou. 194 ', + 'Als in koelt, de nacht komt overkleeden 208 ', + 'Als van der meer op Eng\'le-vleug\'len vloog. 
232', + ]) + }, { # metadata-only book + 'title_id': 'maer002alex01', + 'title': 'Alexanders geesten', + 'year_full': '13de eeuw', + 'year': '1200', + 'author_id': 'maer002', + 'author': 'Jacob van Maerlant', + 'url': 'https://dbnl.org/tekst/maer002alex01_01', + 'content': None, + 'has_content': False, + 'is_primary': True, + }, { # book with multiple authors + 'title_id': 'maer002spie00', + 'author_id': 'maer002, uten001, velt003', + 'author': 'Jacob van Maerlant, Philip Utenbroecke, Lodewijk van Velthem', + 'author_year_of_birth': 'ca. 1230, ?(13de eeuw), ca. 1270', + 'author_place_of_birth': None, + } +] + [{}] * 3 + [ + { # periodical + 'title_id': 'will028belg00', + 'author_id': 'will028', + 'author': 'J.F. Willems', + 'periodical': 'Belgisch Museum', + }, { #anonymous author + 'author': 'anoniem [Die hystorie vanden grooten Coninck Alexander]' + }, { # periodical with multiple genres + 'author': None, + 'periodical': 'Gids, De' + } +] + +def test_dbnl_extraction(dbnl_corpus): + corpus = load_corpus(dbnl_corpus) + docs = list(corpus.documents()) + + assert len(docs) == 3 + 8 # 3 chapters + 7 metadata-only books + + for actual, expected in zip(docs, expected_docs): + # assert that actual is a superset of expected + for key in expected: + assert expected[key] == actual[key] + assert expected.items() <= actual.items() + + diff --git a/backend/corpora/dbnl/utils.py b/backend/corpora/dbnl/utils.py new file mode 100644 index 000000000..7ffeab826 --- /dev/null +++ b/backend/corpora/dbnl/utils.py @@ -0,0 +1,206 @@ +import re +from bs4 import BeautifulSoup +import os +from langcodes import standardize_tag, Language + +from addcorpus.extract import Pass, Combined, CSV + +# === METADATA EXTRACTION === + +def index_by_id(data): + return { + row['title_id']: row + for row in data + } + +PERIODIAL_PREFIX = '[tijdschrift]' + +def is_periodical(name): + return name.startswith(PERIODIAL_PREFIX) + +def sorted_and_unique(items): + if items: + return list(sorted(set(items))) + 
+def get_periodical(names): + periodicals = list(filter(is_periodical, names)) + format = lambda name: name[len(PERIODIAL_PREFIX):].strip() + if periodicals: + return ', '.join(sorted_and_unique(map(format, periodicals))) + +def which_are_people(names): + ''' + returns which names are NOT names of periodicals + ''' + return map(lambda name: name and not is_periodical(name), names) + +def which_unique(items): + ''' + Which items in a list should be included to ensure uniqueness + + returns a list of booleans the same length as items, where `result[n] == True` + iff `items[n]` is the first occurrence of that value. + ''' + + is_first = lambda n: n == 0 or items[n] not in items[:n] + return map(is_first, range(len(items))) + +def filter_values_by(values, which): + if not values: + return None + + return [ + value + for value, include in zip(values, which) + if include + ] + +def filter_by(values_extractor, condition_extractor): + ''' + Takes an extractor that returns a list of values and one that returns a list + of booleans, of the same length. + + Extracts each value of the first extractor, where the corresponding value + of the second extractor is truthy. + ''' + + return Combined( + values_extractor, condition_extractor, + transform=lambda data : filter_values_by(*data) + ) + +def format_gender(value): + ''' + Format gender into a string for clarity + + Gender is coded as a binary value (∈ ['1', '0']). + 0 is used for men, unknown/anonymous authors, and institutions, + 1 is used for women. 
+ ''' + + return {'0': 'man/unknown', '1': 'woman'}.get(value, None) + +def by_author(extractor): + # extractor of which rows represent unique authors + _which_are_authors = Combined( + CSV( + 'pers_id', + multiple=True, + transform=which_unique, + ), + CSV( + 'achternaam', + multiple=True, + transform=which_are_people, + ), + transform=lambda values: map(all, zip(*values)) + ) + + return Pass( + filter_by( + extractor, + _which_are_authors, + ), + transform=join_values, + ) + +# === METADATA-ONLY RECORDS === + +class BlankXML: + def __init__(self, data_directory): + self.filename = os.path.join(data_directory, '_.xml') + + def __enter__(self): + # create an xml that will generate one "spoonful", i.e. one document + # but no actual content + soup = BeautifulSoup('
', 'lxml-xml') + with open(self.filename, 'w') as file: + file.write(soup.prettify()) + + return self.filename + + def __exit__(self, exc_type, exc_value, traceback): + os.remove(self.filename) + +# === UTILITY FUNCTIONS === + +def join_values(values): + ''' + Join extracted values into a string with proper handling of None values. + + Input should be an iterable of strings or None. + + - If all values are '', None, or '?', return None + - If some values are non-empty, convert empty values to '?' and join + them into a single string. + ''' + + if values: + formatted = [value or '?' for value in values] + if any(value != '?' for value in formatted): + return ', '.join(formatted) + +def between_years(year, start_date, end_date): + if start_date and year < start_date.year: + return False + + if end_date and year > end_date.year: + return False + + return True + +def format_name(parts): + '''Format a person's name''' + + #exception for anonymous authors + if parts[0] == 'anoniem': + work = parts[-1] + return f'anoniem [{work}]' + + return ' '.join(filter(None, parts)) + +LINE_TAG = re.compile('^(p|l|head|row|item)$') +''' +Describes the tags for a single line in the content. Can be: + +-

paragraphs +- headers +- line (used for poems/songs) +- table rows (used for poems/songs) +- list items +''' + +def append_to_tag(soup, tag, padding): + ''' + Insert a string at the end of each instance of a tag. + ''' + + for tag in soup.find_all(tag): + tag.append(padding) + + return soup + +def pad_content(node): + pad_cells = lambda n: append_to_tag(n, 'cell', ' ') + pad_linebreaks = lambda n: append_to_tag(n, 'lb', '\n') + return pad_cells(pad_linebreaks(node)) + +def standardize_language_code(code): + if code: + return standardize_tag(code) + +def single_language_code(code): + if code and '-' in code: + primary, *rest = code.split('-') + return primary + return code + +def language_name(code): + if not code: + return None + codes = code.split('-') + names = set(map( + lambda code: Language.make(language=standardize_tag(code)).display_name(), + codes + )) + return ', '.join(names) diff --git a/backend/corpora/dutchannualreports/dutchannualreports.py b/backend/corpora/dutchannualreports/dutchannualreports.py index 8e3569593..fb882c044 100644 --- a/backend/corpora/dutchannualreports/dutchannualreports.py +++ b/backend/corpora/dutchannualreports/dutchannualreports.py @@ -30,6 +30,9 @@ class DutchAnnualReports(XMLCorpus): allow_image_download = getattr(settings, 'DUTCHANNUALREPORTS_ALLOW_IMAGE_DOWNLOAD', True) word_model_path = getattr(settings, 'DUTCHANNUALREPORTS_WM', None) + languages = ['nl'] + category = 'finance' + mimetype = 'application/pdf' # Data overrides from .common.XMLCorpus diff --git a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py index 41b98674d..d2a39923a 100644 --- a/backend/corpora/dutchnewspapers/dutchnewspapers_public.py +++ b/backend/corpora/dutchnewspapers/dutchnewspapers_public.py @@ -36,6 +36,8 @@ class DutchNewspapersPublic(XMLCorpus): data_directory = settings.DUTCHNEWSPAPERS_DATA es_index = getattr(settings, 'DUTCHNEWSPAPERS_ES_INDEX', 'dutchnewspapers-public') 
image = 'dutchnewspapers.jpg' + languages = ['nl'] + category = 'newspaper' tag_toplevel = 'text' tag_entry = 'p' diff --git a/backend/corpora/ecco/ecco.py b/backend/corpora/ecco/ecco.py index 3366d08d0..1bff611a3 100644 --- a/backend/corpora/ecco/ecco.py +++ b/backend/corpora/ecco/ecco.py @@ -31,6 +31,8 @@ class Ecco(XMLCorpus): image = 'ecco.jpg' scan_image_type = getattr(settings, 'ECCO_SCAN_IMAGE_TYPE', 'application/pdf') es_settings = None + languages = ['en', 'cy', 'ga', 'gd'] # according to gale's documentation + category = 'book' tag_toplevel = 'pageContent' tag_entry = 'page' diff --git a/backend/corpora/goodreads/goodreads.py b/backend/corpora/goodreads/goodreads.py index 3df78abe2..6e865b4b8 100644 --- a/backend/corpora/goodreads/goodreads.py +++ b/backend/corpora/goodreads/goodreads.py @@ -31,6 +31,8 @@ class GoodReads(CSVCorpus): image = 'DioptraL.png' description_page = 'goodreads.md' visualize = [] + languages = ['en', 'es', 'it', 'pt', 'fr', 'nl', 'de', 'ar', 'af', 'sv', ''] # languages with > 1000 docs + category = 'review' # New data members non_xml_msg = 'Skipping non-XML file {}' diff --git a/backend/corpora/guardianobserver/guardianobserver.py b/backend/corpora/guardianobserver/guardianobserver.py index dce3444c8..669391a3a 100644 --- a/backend/corpora/guardianobserver/guardianobserver.py +++ b/backend/corpora/guardianobserver/guardianobserver.py @@ -37,6 +37,8 @@ class GuardianObserver(XMLCorpus): es_index = getattr(settings, 'GO_ES_INDEX', 'guardianobserver') image = 'guardianobserver.jpg' scan_image_type = getattr(settings, 'GO_SCAN_IMAGE_TYPE', 'application/pdf') + languages = ['en'] + category = 'newspaper' tag_toplevel = 'Record' diff --git a/backend/corpora/jewishinscriptions/jewishinscriptions.py b/backend/corpora/jewishinscriptions/jewishinscriptions.py index 204deb90b..b0684f1c4 100644 --- a/backend/corpora/jewishinscriptions/jewishinscriptions.py +++ b/backend/corpora/jewishinscriptions/jewishinscriptions.py @@ -23,6 +23,8 @@ class 
JewishInscriptions(XMLCorpus): es_index = getattr(settings, 'JEWISH_INSCRIPTIONS_ES_INDEX', 'jewishinscriptions') image = 'jewish_inscriptions.jpg' visualize = [] + languages = ['heb', 'lat'] + category = 'inscription' # Data overrides from .common.XMLCorpus tag_toplevel = '' diff --git a/backend/corpora/parliament/canada.py b/backend/corpora/parliament/canada.py index 5b78dd21b..b8b0f35ee 100644 --- a/backend/corpora/parliament/canada.py +++ b/backend/corpora/parliament/canada.py @@ -19,7 +19,7 @@ class ParliamentCanada(Parliament, CSVCorpus): data_directory = settings.PP_CANADA_DATA es_index = getattr(settings, 'PP_CANADA_INDEX', 'parliament-canada') image = 'canada.jpeg' - language = 'english' + languages = ['en'] description_page = 'canada.md' field_entry = 'speech_id' required_field = 'content' diff --git a/backend/corpora/parliament/denmark-new.py b/backend/corpora/parliament/denmark-new.py index 9b9c880d3..98b8d05c2 100644 --- a/backend/corpora/parliament/denmark-new.py +++ b/backend/corpora/parliament/denmark-new.py @@ -41,7 +41,7 @@ class ParliamentDenmarkNew(Parliament, CSVCorpus): es_index = getattr(settings, 'PP_DENMARK_NEW_INDEX', 'parliament-denmark-new') image = 'denmark.jpg' description_page = 'denmark-new.md' - language = 'danish' + languages = ['da'] delimiter = '\t' document_context = constants.document_context() document_context['context_fields'] = ['date'] diff --git a/backend/corpora/parliament/denmark.py b/backend/corpora/parliament/denmark.py index 97e38ccb6..30dc14521 100644 --- a/backend/corpora/parliament/denmark.py +++ b/backend/corpora/parliament/denmark.py @@ -41,7 +41,7 @@ class ParliamentDenmark(Parliament, CSVCorpus): image = 'denmark.jpg' description_page = 'denmark.md' - language = 'danish' + languages = ['da'] required_field = 'text' diff --git a/backend/corpora/parliament/finland.py b/backend/corpora/parliament/finland.py index 7225a84a5..2cc610e6a 100644 --- a/backend/corpora/parliament/finland.py +++ 
b/backend/corpora/parliament/finland.py @@ -62,7 +62,7 @@ def sources(self, start, end): yield xml_file, metadata - language = 'finnish' + languages = ['fi'] description_page = 'finland.md' image = 'finland.jpg' diff --git a/backend/corpora/parliament/france.py b/backend/corpora/parliament/france.py index f32ee403e..7a26652ab 100644 --- a/backend/corpora/parliament/france.py +++ b/backend/corpora/parliament/france.py @@ -18,7 +18,7 @@ class ParliamentFrance(Parliament, CSVCorpus): data_directory = settings.PP_FR_DATA es_index = getattr(settings, 'PP_FR_INDEX', 'parliament-france') image = 'france.jpeg' - language = 'french' + languages = ['fr'] description_page = 'france.md' word_model_path = getattr(settings, 'PP_FR_WM', None) diff --git a/backend/corpora/parliament/germany-new.py b/backend/corpora/parliament/germany-new.py index 2bcf33cbe..3571bd831 100644 --- a/backend/corpora/parliament/germany-new.py +++ b/backend/corpora/parliament/germany-new.py @@ -19,7 +19,7 @@ class ParliamentGermanyNew(Parliament, CSVCorpus): data_directory = settings.PP_GERMANY_NEW_DATA es_index = getattr(settings, 'PP_GERMANY_NEW_INDEX', 'parliament-germany-new') image = 'germany-new.jpeg' - language = 'german' + languages = ['de'] word_model_path = getattr(settings, 'PP_DE_WM', None) field_entry = 'id' diff --git a/backend/corpora/parliament/germany-old.py b/backend/corpora/parliament/germany-old.py index 198164bc2..47c52badd 100644 --- a/backend/corpora/parliament/germany-old.py +++ b/backend/corpora/parliament/germany-old.py @@ -20,7 +20,7 @@ class ParliamentGermanyOld(Parliament, CSVCorpus): data_directory = settings.PP_GERMANY_OLD_DATA es_index = getattr(settings, 'PP_GERMANY_OLD_INDEX', 'parliament-germany-old') image = 'germany-old.jpeg' - language = 'german' + languages = ['de'] word_model_path = getattr(settings, 'PP_DE_WM', None) description_page = 'germany-old.md' diff --git a/backend/corpora/parliament/ireland.py b/backend/corpora/parliament/ireland.py index 
7462e9ea2..98bfa99ad 100644 --- a/backend/corpora/parliament/ireland.py +++ b/backend/corpora/parliament/ireland.py @@ -449,8 +449,8 @@ class ParliamentIreland(Parliament, Corpus): es_index = getattr(settings, 'PP_IRELAND_INDEX', 'parliament-ireland') image = 'ireland.png' description_page = 'ireland.md' - language = None # corpus uses multiple languages, so we will not be using language-specific analyzers es_settings = {'index': {'number_of_replicas': 0}} # do not include analyzers in es_settings + languages = ['en', 'ga'] @property def subcorpora(self): diff --git a/backend/corpora/parliament/netherlands.py b/backend/corpora/parliament/netherlands.py index 472c31d79..6a0703f76 100644 --- a/backend/corpora/parliament/netherlands.py +++ b/backend/corpora/parliament/netherlands.py @@ -134,7 +134,9 @@ class ParliamentNetherlands(Parliament, XMLCorpus): description_page = 'netherlands.md' tag_toplevel = lambda _, metadata: 'root' if is_old(metadata) else 'TEI' tag_entry = lambda _, metadata: 'speech' if is_old(metadata) else 'u' - language = 'dutch' + languages = ['nl'] + + category = 'parliament' def sources(self, start, end): logger = logging.getLogger(__name__) diff --git a/backend/corpora/parliament/norway-new.py b/backend/corpora/parliament/norway-new.py index 3fef94d03..fd64ac17a 100644 --- a/backend/corpora/parliament/norway-new.py +++ b/backend/corpora/parliament/norway-new.py @@ -53,7 +53,7 @@ class ParliamentNorwayNew(Parliament, CSVCorpus): data_directory = settings.PP_NORWAY_NEW_DATA es_index = getattr(settings, 'PP_NORWAY_NEW_INDEX', 'parliament-norway-new') image = 'norway.JPG' - language = 'norwegian' + languages = ['no'] description_page = 'norway-new.md' document_context = document_context() diff --git a/backend/corpora/parliament/norway.py b/backend/corpora/parliament/norway.py index be04e2071..224784d26 100644 --- a/backend/corpora/parliament/norway.py +++ b/backend/corpora/parliament/norway.py @@ -27,7 +27,7 @@ class ParliamentNorway(Parliament, 
CSVCorpus): data_directory = settings.PP_NORWAY_DATA es_index = getattr(settings, 'PP_NORWAY_INDEX','parliament-norway') image = 'norway.JPG' - language = 'norwegian' + languages = ['no'] description_page = 'norway.md' document_context = document_context( context_fields=['book_id'], diff --git a/backend/corpora/parliament/parliament.py b/backend/corpora/parliament/parliament.py index 7aca87c33..6a979752c 100644 --- a/backend/corpora/parliament/parliament.py +++ b/backend/corpora/parliament/parliament.py @@ -34,11 +34,11 @@ class Parliament(Corpus): image = 'parliament.jpeg' data_directory = 'bogus' - language = 'english' + category = 'parliament' @property def es_settings(self): - return es_settings(self.language, stopword_analyzer=True, stemming_analyzer=True) + return es_settings(self.languages[0], stopword_analyzer=True, stemming_analyzer=True) # overwrite below in child class if you need to extract the (converted) transcription diff --git a/backend/corpora/parliament/sweden-old.py b/backend/corpora/parliament/sweden-old.py index b065d9068..5c34d9a8f 100644 --- a/backend/corpora/parliament/sweden-old.py +++ b/backend/corpora/parliament/sweden-old.py @@ -48,7 +48,7 @@ def sources(self, start, end): yield csv_file, {} - language = 'swedish' + languages = ['sv'] description_page = 'sweden-old.md' image = 'sweden-old.jpg' diff --git a/backend/corpora/parliament/sweden.py b/backend/corpora/parliament/sweden.py index 2269db149..2be223160 100644 --- a/backend/corpora/parliament/sweden.py +++ b/backend/corpora/parliament/sweden.py @@ -52,7 +52,7 @@ def sources(self, start, end): yield csv_file, {} - language = 'swedish' + languages = ['sv'] description_page = 'sweden.md' image = 'sweden.jpg' diff --git a/backend/corpora/parliament/tests/test_es_settings.py b/backend/corpora/parliament/tests/test_es_settings.py index 19ceb5012..8c228d85e 100644 --- a/backend/corpora/parliament/tests/test_es_settings.py +++ b/backend/corpora/parliament/tests/test_es_settings.py @@ -11,35 
+11,35 @@ def test_stopwords(clean_nltk_data_directory): cases = [ { - 'language': 'english', + 'language': 'en', 'stopwords': ['the', 'i', 'have'] }, { - 'language': 'dutch', + 'language': 'nl', 'stopwords': ['ik'] }, { - 'language': 'german', + 'language': 'de', 'stopwords': ['ich'] }, { - 'language': 'french', + 'language': 'fr', 'stopwords': ['je'] }, { - 'language': 'danish', + 'language': 'da', 'stopwords': ['jeg'] }, { - 'language': 'norwegian', + 'language': 'no', 'stopwords': ['jeg'] }, { - 'language': 'swedish', + 'language': 'sv', 'stopwords': ['jag'] }, { - 'language': 'finnish', + 'language': 'fi', 'stopwords': ['minä'] } ] diff --git a/backend/corpora/parliament/uk.py b/backend/corpora/parliament/uk.py index 66bc7963a..4d781abfc 100644 --- a/backend/corpora/parliament/uk.py +++ b/backend/corpora/parliament/uk.py @@ -40,7 +40,7 @@ class ParliamentUK(Parliament, CSVCorpus): es_index = getattr(settings, 'PP_UK_INDEX', 'parliament-uk') image = 'uk.jpeg' word_model_path = getattr(settings, 'PP_UK_WM', None) - language = 'english' + languages = ['en'] description_page = 'uk.md' field_entry = 'speech_id' document_context = document_context() diff --git a/backend/corpora/periodicals/periodicals.py b/backend/corpora/periodicals/periodicals.py index 1572ebf13..d946e372f 100644 --- a/backend/corpora/periodicals/periodicals.py +++ b/backend/corpora/periodicals/periodicals.py @@ -33,6 +33,8 @@ class Periodicals(XMLCorpus): image = 'Fleet_Street.jpg' scan_image_type = getattr(settings, 'PERIODICALS_SCAN_IMAGE_TYPE', 'image/jpeg') description_page = '19thCenturyUKPeriodicals.md' + languages = ['en'] + category = 'periodical' tag_toplevel = 'articles' tag_entry = 'artInfo' diff --git a/backend/corpora/rechtspraak/description/rechtspraak.md b/backend/corpora/rechtspraak/description/rechtspraak.md new file mode 100644 index 000000000..2cd00a143 --- /dev/null +++ b/backend/corpora/rechtspraak/description/rechtspraak.md @@ -0,0 +1,3 @@ +Court rulings published to 
[uitspraken.rechtspraak.nl](https://uitspraken.rechtspraak.nl). This corpus contains a portion of all court rulings in the Netherlands; rechtspraak.nl contains [detailed information about the selection criteria](https://www.rechtspraak.nl/Uitspraken/Paginas/Selectiecriteria.aspx). Rulings are anonymised. + +Be aware that while the rulings on rechtspraak.nl are updated regularly, the rulings on I-analyzer are not, and may be out of date. The latest version of the corpus was retrieved on 04-10-2022. We are working on this! diff --git a/backend/corpora/rechtspraak/rechtspraak.py b/backend/corpora/rechtspraak/rechtspraak.py index 4e829c4e0..37762905e 100644 --- a/backend/corpora/rechtspraak/rechtspraak.py +++ b/backend/corpora/rechtspraak/rechtspraak.py @@ -36,7 +36,10 @@ class Rechtspraak(XMLCorpus): data_directory = settings.RECHTSPRAAK_DATA es_index = getattr(settings, 'RECHTSPRAAK_ES_INDEX', 'rechtspraak') image = 'rechtszaal.jpeg' + description_page = 'rechtspraak.md' toplevel_zip_file = 'OpenDataUitspraken.zip' + languages = ['nl'] + category = 'ruling' tag_toplevel = 'open-rechtspraak' diff --git a/backend/corpora/times/times.py b/backend/corpora/times/times.py index 1218e3db5..44ff5ce28 100644 --- a/backend/corpora/times/times.py +++ b/backend/corpora/times/times.py @@ -32,6 +32,8 @@ class Times(XMLCorpus): image = 'times.jpg' scan_image_type = getattr(settings, 'TIMES_SCAN_IMAGE_TYPE', 'image/png') description_page = 'times.md' + languages = ['en'] + category = 'newspaper' tag_toplevel = 'issue' tag_entry = 'article' diff --git a/backend/corpora/troonredes/description/troonredes.md b/backend/corpora/troonredes/description/troonredes.md new file mode 100644 index 000000000..fe1f39154 --- /dev/null +++ b/backend/corpora/troonredes/description/troonredes.md @@ -0,0 +1,5 @@ +Troonredes (throne speeches) are the speeches from the throne that formally mark the opening of the parliamentary year, known in the Netherlands as “Prinsjesdag”, taking place on the third
Tuesday of September each year. During a joint session of the Dutch senate and the house of representatives, the queen or king reads a speech that has been prepared by the Dutch government which outlines the state of affairs and plans for the coming parliamentary year. This corpus contains the “troonredes” from 1814 until 2018, as well as some inaugural addresses, which are speeches given during a coronation. + +Missing years: in 1940-1944 no speech was written. + +The transcripts are provided by [troonredes.nl](https://troonredes.nl). diff --git a/backend/corpora/troonredes/troonredes.py b/backend/corpora/troonredes/troonredes.py index 6ea96ebbb..67c52b860 100644 --- a/backend/corpora/troonredes/troonredes.py +++ b/backend/corpora/troonredes/troonredes.py @@ -35,6 +35,9 @@ class Troonredes(XMLCorpus): es_index = getattr(settings, 'TROONREDES_ES_INDEX', 'troonredes') image = 'troon.jpg' word_model_path = getattr(settings, 'TROONREDES_WM', None) + languages = ['nl'] + category = 'oration' + description_page = 'troonredes.md' tag_toplevel = 'doc' tag_entry = 'entry' diff --git a/backend/download/conftest.py b/backend/download/conftest.py index 2e93ed4fb..46bea5a4a 100644 --- a/backend/download/conftest.py +++ b/backend/download/conftest.py @@ -1,11 +1,9 @@ -from users.models import CustomUser import pytest import os -from visualization.tests.mock_corpora.small_mock_corpus import SPECS as SMALL_MOCK_CORPUS_SPECS -from visualization.tests.mock_corpora.large_mock_corpus import SPECS as LARGE_MOCK_CORPUS_SPECS from download.tests.mock_corpora.multilingual_mock_corpus import SPECS as ML_MOCK_CORPUS_SPECS -from visualization.conftest import index_mock_corpus, select_small_mock_corpus, select_large_mock_corpus -from addcorpus.load_corpus import load_all_corpora +from visualization.conftest import small_mock_corpus, large_mock_corpus, index_small_mock_corpus, \ + index_large_mock_corpus, small_mock_corpus_specs, large_mock_corpus_specs, index_test_corpus, \ + clear_test_corpus from
visualization.query import MATCH_ALL from download import tasks @@ -17,39 +15,42 @@ def csv_directory(settings, tmpdir): settings.CSV_FILES_PATH = str(dir) return settings.CSV_FILES_PATH +@pytest.fixture(scope='session') +def ml_mock_corpus(): + return 'multilingual-mock-corpus' -@pytest.fixture(params=['small-mock-corpus', 'large-mock-corpus', 'multilingual-mock-corpus'], scope='module') +@pytest.fixture(params=['small-mock-corpus', 'large-mock-corpus', 'multilingual-mock-corpus'], scope='session') def mock_corpus(request): - '''Return the name of a mock corpus''' + 'parametrised version of the mock corpus fixtures: runs with all' return request.param - @pytest.fixture() -def select_multilingual_mock_corpus(mock_corpus): - '''Only run test with the large mock corpus - skip otherwise.''' - - if mock_corpus != 'multilingual-mock-corpus': - pytest.skip() - - return mock_corpus - +def ml_mock_corpus_specs(): + return ML_MOCK_CORPUS_SPECS @pytest.fixture() -def mock_corpus_specs(mock_corpus): +def mock_corpus_specs(mock_corpus, small_mock_corpus, large_mock_corpus, ml_mock_corpus, small_mock_corpus_specs, large_mock_corpus_specs, ml_mock_corpus_specs): '''Return various specifications for the mock corpus (number of documents etc.)''' specs = { - 'small-mock-corpus': SMALL_MOCK_CORPUS_SPECS, - 'large-mock-corpus': LARGE_MOCK_CORPUS_SPECS, - 'multilingual-mock-corpus': ML_MOCK_CORPUS_SPECS + small_mock_corpus: small_mock_corpus_specs, + large_mock_corpus: large_mock_corpus_specs, + ml_mock_corpus: ml_mock_corpus_specs, } return specs[mock_corpus] -@pytest.fixture() -def all_results_csv(mock_corpus, mock_corpus_specs, index_mock_corpus, csv_directory): - '''generate a results csv for the mock corpus corpus based on a match_all query''' +@pytest.fixture(scope='session') +def index_ml_mock_corpus(es_client, ml_mock_corpus): + index_test_corpus(es_client, ml_mock_corpus) + yield ml_mock_corpus + clear_test_corpus(es_client, ml_mock_corpus) + +@pytest.fixture(scope='session') 
+def index_mock_corpus(es_client, mock_corpus, index_small_mock_corpus, index_large_mock_corpus, index_ml_mock_corpus): + yield mock_corpus +def save_all_results_csv(mock_corpus, mock_corpus_specs): fields = mock_corpus_specs['fields'] query = mock_corpus_specs['example_query'] @@ -63,3 +64,25 @@ def all_results_csv(mock_corpus, mock_corpus_specs, index_mock_corpus, csv_direc filename = tasks.make_csv(results, request_json) return filename + +@pytest.fixture() +def small_mock_corpus_results_csv(small_mock_corpus, small_mock_corpus_specs, index_small_mock_corpus, csv_directory): + return save_all_results_csv(small_mock_corpus, small_mock_corpus_specs) + +@pytest.fixture() +def large_mock_corpus_results_csv(large_mock_corpus, large_mock_corpus_specs, index_large_mock_corpus, csv_directory): + return save_all_results_csv(large_mock_corpus, large_mock_corpus_specs) + +@pytest.fixture() +def ml_mock_corpus_results_csv(ml_mock_corpus, ml_mock_corpus_specs, index_ml_mock_corpus, csv_directory): + return save_all_results_csv(ml_mock_corpus, ml_mock_corpus_specs) + +@pytest.fixture() +def mock_corpus_results_csv(mock_corpus, small_mock_corpus, large_mock_corpus, ml_mock_corpus, small_mock_corpus_results_csv, large_mock_corpus_results_csv, ml_mock_corpus_results_csv): + files = { + small_mock_corpus: small_mock_corpus_results_csv, + large_mock_corpus: large_mock_corpus_results_csv, + ml_mock_corpus: ml_mock_corpus_results_csv, + } + + return files[mock_corpus] diff --git a/backend/download/mail.py b/backend/download/mail.py index 3b9197a7e..5dc840ec1 100644 --- a/backend/download/mail.py +++ b/backend/download/mail.py @@ -32,6 +32,7 @@ def send_csv_email(user_email, username, download_id): 'link_text': 'Download .csv file', 'logo_link': settings.LOGO_LINK, 'url_i_analyzer': settings.BASE_URL, + 'organization': 'the Research Software Lab at the Centre for Digital Humanities (Utrecht University)' } html_message = render_to_string('download_mail.html', context) diff --git 
a/backend/download/templates/download_mail.html b/backend/download/templates/download_mail.html index 3bad6d45f..2dd2c8f96 100644 --- a/backend/download/templates/download_mail.html +++ b/backend/download/templates/download_mail.html @@ -103,7 +103,7 @@ -

+
@@ -115,7 +115,7 @@

Hello {{ username }},

{{ message }}

{% if login %} -

Login name: {{ username }}

+

Login name: {{ username }}

{% endif %}

{{ prompt }}

@@ -148,7 +148,7 @@
- I-analyzer is a product of Digital Humanities Lab + I-analyzer is a product of {{organization}}
@@ -162,4 +162,4 @@ - \ No newline at end of file + diff --git a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py b/backend/download/tests/mock_corpora/multilingual_mock_corpus.py index 97cf935cf..507e76235 100644 --- a/backend/download/tests/mock_corpora/multilingual_mock_corpus.py +++ b/backend/download/tests/mock_corpora/multilingual_mock_corpus.py @@ -16,6 +16,8 @@ class MultilingualMockCorpus(CSVCorpus): es_index = 'ianalyzer-mixed-language-mock-corpus' image = 'test.jpeg' data_directory = 'bogus' + languages = ['sv', 'de'] + category = 'book' def sources(self, start=min_date, end=max_date): for csv_file in os.listdir(os.path.join(here, 'sources_mixed_language')): diff --git a/backend/download/tests/test_convert_csv.py b/backend/download/tests/test_convert_csv.py index beb08c50e..f210e9f09 100644 --- a/backend/download/tests/test_convert_csv.py +++ b/backend/download/tests/test_convert_csv.py @@ -18,10 +18,10 @@ def assert_content_matches(file_1, encoding_1, file_2, encoding_2): assert contents_1 == contents_2 -def test_encoding_conversion_results(csv_directory, mock_corpus, select_multilingual_mock_corpus, all_results_csv, file_encoding): - converted = convert_csv.convert_csv(csv_directory, all_results_csv, 'search_results', encoding = file_encoding) +def test_encoding_conversion_results(csv_directory, ml_mock_corpus, ml_mock_corpus_results_csv, file_encoding): + converted = convert_csv.convert_csv(csv_directory, ml_mock_corpus_results_csv, 'search_results', encoding = file_encoding) converted_path = os.path.join(csv_directory, converted) - assert_content_matches(all_results_csv, 'utf-8', converted_path, file_encoding) + assert_content_matches(ml_mock_corpus_results_csv, 'utf-8', converted_path, file_encoding) def test_conversion_with_highlights(csv_directory, result_csv_with_highlights, file_encoding): converted = convert_csv.convert_csv(csv_directory, result_csv_with_highlights, 'search_results', encoding = file_encoding) diff --git 
a/backend/download/tests/test_csv_results.py b/backend/download/tests/test_csv_results.py index 191d7abde..b6ff8b0da 100644 --- a/backend/download/tests/test_csv_results.py +++ b/backend/download/tests/test_csv_results.py @@ -58,8 +58,8 @@ def test_create_csv(result_csv_with_highlights): assert 'speech' in row assert counter == 1 -def test_csv_fieldnames(all_results_csv, mock_corpus_specs): - with open(all_results_csv) as csv_file: +def test_csv_fieldnames(mock_corpus_results_csv, mock_corpus_specs): + with open(mock_corpus_results_csv) as csv_file: reader = csv.DictReader(csv_file, delimiter=';') assert set(reader.fieldnames) == set(mock_corpus_specs['fields'] + ['query']) @@ -77,13 +77,13 @@ def assert_result_csv_expectations(csv_path, expectations, delimiter=','): for item in expected_row: assert rows[i][item] == expected_row[item] -def test_csv_contents(mock_corpus, all_results_csv): +def test_csv_contents(mock_corpus, small_mock_corpus, large_mock_corpus, ml_mock_corpus, mock_corpus_results_csv): '''Check the contents of the results csv for the basic mock corpus. Also includes the multilingual corpus, which includes some special characters, making sure that none of the steps in the download make encoding issues.''' - if mock_corpus == 'small-mock-corpus': + if mock_corpus == small_mock_corpus: expected = [{ 'date': '1818-01-01', 'genre': "Science fiction", @@ -92,7 +92,7 @@ def test_csv_contents(mock_corpus, all_results_csv): }, { 'content': 'It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.', }] - elif mock_corpus == 'multilingual-mock-corpus': + elif mock_corpus == ml_mock_corpus: expected = [{ 'language': 'Swedish', 'content': 'Svenska är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk, men även som det ena nationalspråket i Finland och som enda officiella språk på Åland. 
I övriga Finland talas det som modersmål framförallt i de finlandssvenska kustområdena i Österbotten, Åboland och Nyland. En liten minoritet svenskspråkiga finns även i Estland. Svenska är nära besläktat och i hög grad ömsesidigt begripligt med danska och norska. De andra nordiska språken, isländska och färöiska, är mindre ömsesidigt begripliga med svenska. Liksom de övriga nordiska språken härstammar svenskan från en gren av fornnordiska, vilket var det språk som talades av de germanska folken i Skandinavien.' @@ -101,14 +101,14 @@ def test_csv_contents(mock_corpus, all_results_csv): 'content': 'Das Deutsche ist eine plurizentrische Sprache, enthält also mehrere Standardvarietäten in verschiedenen Regionen. Ihr Sprachgebiet umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist Deutsch eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika sowie Nationalsprache im afrikanischen Namibia. Deutsch ist die meistgesprochene Muttersprache in der Europäischen Union (EU).' 
}] else: - pytest.skip() + expected = [] - assert_result_csv_expectations(all_results_csv, expected, delimiter=';') + assert_result_csv_expectations(mock_corpus_results_csv, expected, delimiter=';') -def test_csv_encoding(select_multilingual_mock_corpus, all_results_csv): +def test_csv_encoding(ml_mock_corpus_results_csv): '''Assert that the results csv file matches utf-8 encoding''' - with open(all_results_csv, 'rb') as f: + with open(ml_mock_corpus_results_csv, 'rb') as f: binary_contents = f.read() expected_sentence = 'Svenska är ett östnordiskt språk som talas av ungefär tio miljoner personer främst i Sverige där språket har en dominant ställning som huvudspråk' @@ -189,7 +189,7 @@ def test_csv_encoding(select_multilingual_mock_corpus, all_results_csv): ] @pytest.fixture() -def term_frequency_file(mock_corpus, select_small_mock_corpus, index_mock_corpus, csv_directory): +def term_frequency_file(index_small_mock_corpus, csv_directory): filename = create_csv.term_frequency_csv(mock_queries, mock_timeline_result, 'date', unit = 'year') return filename diff --git a/backend/download/tests/test_download_limit.py b/backend/download/tests/test_download_limit.py index 535c430b4..0ede44a15 100644 --- a/backend/download/tests/test_download_limit.py +++ b/backend/download/tests/test_download_limit.py @@ -5,7 +5,7 @@ "match_all": {} } } -def test_no_donwnload_limit(mock_corpus, index_mock_corpus, mock_corpus_specs): +def test_no_download_limit(mock_corpus, index_mock_corpus, mock_corpus_specs): results, total = es_download.scroll(mock_corpus, match_all) docs_in_corpus = mock_corpus_specs['total_docs'] assert total == docs_in_corpus diff --git a/backend/download/tests/test_download_records.py b/backend/download/tests/test_download_records.py index b1e405f2d..9f68b24e2 100644 --- a/backend/download/tests/test_download_records.py +++ b/backend/download/tests/test_download_records.py @@ -7,7 +7,7 @@ } } -def test_download_records(admin_user, mock_corpus, mock_corpora_in_db): 
+def test_download_records(admin_user, mock_corpus): assert list(admin_user.downloads.all()) == [] parameters = { diff --git a/backend/download/tests/test_download_views.py b/backend/download/tests/test_download_views.py index 84551e8d1..360fd85b5 100644 --- a/backend/download/tests/test_download_views.py +++ b/backend/download/tests/test_download_views.py @@ -7,7 +7,7 @@ from addcorpus.models import Corpus import io -def test_direct_download_view(admin_client, mock_corpus, index_mock_corpus, csv_directory, mock_corpora_in_db): +def test_direct_download_view(admin_client, mock_corpus, index_mock_corpus, csv_directory): request_json = { "corpus": mock_corpus, "es_query": {"query":{"bool":{"must":{"match_all":{}},"filter":[]}}}, @@ -23,13 +23,13 @@ def test_direct_download_view(admin_client, mock_corpus, index_mock_corpus, csv_ ) assert status.is_success(response.status_code) -def test_schedule_download_view(transactional_db, admin_client, mock_corpus, select_small_mock_corpus, - index_mock_corpus, celery_worker, csv_directory, mock_corpora_in_db): +def test_schedule_download_view(transactional_db, admin_client, small_mock_corpus, + index_small_mock_corpus, celery_worker, csv_directory): request_json = { - "corpus": mock_corpus, + "corpus": small_mock_corpus, "es_query": {"query":{"bool":{"must":{"match_all":{}},"filter":[]}}}, "fields": ['date','content'], - "route": f"/search/{mock_corpus}", + "route": f"/search/{small_mock_corpus}", "encoding":"utf-8" } response = admin_client.post( @@ -39,7 +39,6 @@ def test_schedule_download_view(transactional_db, admin_client, mock_corpus, sel ) assert status.is_success(response.status_code) -@pytest.fixture() def term_frequency_parameters(mock_corpus, mock_corpus_specs): min_year = mock_corpus_specs['min_date'].year max_year = mock_corpus_specs['max_date'].year @@ -76,13 +75,14 @@ def term_frequency_parameters(mock_corpus, mock_corpus_specs): 'unit': 'year', } -def test_full_data_download_view(transactional_db, admin_client, 
mock_corpus, term_frequency_parameters, - select_small_mock_corpus, index_mock_corpus, celery_worker, - csv_directory, mock_corpora_in_db): +def test_full_data_download_view(transactional_db, admin_client, small_mock_corpus, + index_small_mock_corpus, small_mock_corpus_specs, celery_worker, + csv_directory): + parameters = term_frequency_parameters(small_mock_corpus, small_mock_corpus_specs) request_json = { 'visualization': 'date_term_frequency', - 'parameters': [term_frequency_parameters], - 'corpus': mock_corpus + 'parameters': [parameters], + 'corpus': small_mock_corpus } response = admin_client.post( '/api/download/full_data', @@ -100,9 +100,9 @@ def test_empty_download_history_view(admin_client): assert response.data == [] @pytest.fixture() -def finished_download(admin_user, csv_directory, mock_corpus, select_small_mock_corpus, mock_corpora_in_db): - filepath = os.path.join(csv_directory, mock_corpus + '.csv') - corpus = Corpus.objects.get(name=mock_corpus) +def finished_download(admin_user, csv_directory, small_mock_corpus): + filepath = os.path.join(csv_directory, small_mock_corpus + '.csv') + corpus = Corpus.objects.get(name=small_mock_corpus) download = Download.objects.create(download_type='search_results', corpus=corpus, parameters={}, user=admin_user) with open(filepath, 'w') as outfile: @@ -114,7 +114,7 @@ def finished_download(admin_user, csv_directory, mock_corpus, select_small_mock_ 'content': "You will rejoice to hear...", 'date': '1818-01-01', 'genre': 'Science fiction', - 'query': mock_corpus, + 'query': small_mock_corpus, 'title': 'Frankenstein, or, the Modern Prometheus' }) @@ -122,7 +122,7 @@ def finished_download(admin_user, csv_directory, mock_corpus, select_small_mock_ download.complete(filename) return download.id -def test_download_history_view(admin_client, finished_download, mock_corpus): +def test_download_history_view(admin_client, finished_download, small_mock_corpus): response = admin_client.get( '/api/download/' ) @@ -130,7 +130,7 
@@ def test_download_history_view(admin_client, finished_download, mock_corpus): assert status.is_success(response.status_code) assert len(response.data) == 1 download = next(d for d in response.data) - assert download['corpus'] == mock_corpus + assert download['corpus'] == small_mock_corpus assert download['status'] == 'done' def test_csv_download_view(admin_client, finished_download): diff --git a/backend/download/tests/test_full_data.py b/backend/download/tests/test_full_data.py index 061895fd3..385fb701b 100644 --- a/backend/download/tests/test_full_data.py +++ b/backend/download/tests/test_full_data.py @@ -3,13 +3,13 @@ from download import tasks import pytest -def test_timeline_full_data(mock_corpus, select_small_mock_corpus, index_mock_corpus, mock_corpus_specs): - min_year = mock_corpus_specs['min_date'].year - max_year = mock_corpus_specs['max_date'].year - search_fields = [mock_corpus_specs['content_field']] +def test_timeline_full_data(small_mock_corpus, index_small_mock_corpus, small_mock_corpus_specs): + min_year = small_mock_corpus_specs['min_date'].year + max_year = small_mock_corpus_specs['max_date'].year + search_fields = [small_mock_corpus_specs['content_field']] full_data_parameters = [{ 'es_query': make_query(query_text = 'the', search_in_fields=search_fields), - 'corpus_name': mock_corpus, + 'corpus_name': small_mock_corpus, 'field_name': 'date', 'bins': [ { @@ -32,7 +32,7 @@ def test_timeline_full_data(mock_corpus, select_small_mock_corpus, index_mock_co expected_frequency = 2 total_expectations = { - 'Total documents': mock_corpus_specs['total_docs'], + 'Total documents': small_mock_corpus_specs['total_docs'], 'Term frequency': expected_frequency, # 2 hits per document 'Relative term frequency (by # documents)': expected_frequency } diff --git a/backend/download/tests/test_mail.py b/backend/download/tests/test_mail.py index 7f0748f8b..a16311d4c 100644 --- a/backend/download/tests/test_mail.py +++ b/backend/download/tests/test_mail.py @@ -7,7 
+7,7 @@ from download.mail import send_csv_email @pytest.fixture() -def finished_download(admin_user, mock_corpus, csv_directory, mock_corpora_in_db): +def finished_download(admin_user, mock_corpus, csv_directory): corpus = Corpus.objects.get(name=mock_corpus) download = Download.objects.create( download_type='search_results', diff --git a/backend/ianalyzer/common_settings.py b/backend/ianalyzer/common_settings.py index de63f8347..986a11b91 100644 --- a/backend/ianalyzer/common_settings.py +++ b/backend/ianalyzer/common_settings.py @@ -36,6 +36,7 @@ 'download', 'wordmodels', 'media', + 'tag', ] SITE_ID = 1 @@ -129,4 +130,4 @@ 'REGISTER_SERIALIZER': 'users.serializers.CustomRegisterSerializer', } -LOGO_LINK = 'http://dhstatic.hum.uu.nl/logo-lab/png/dighum-logo.png' +LOGO_LINK = 'https://dhstatic.hum.uu.nl/logo-cdh/png/UU_CDH_logo_EN_whiteFC.png' diff --git a/backend/ianalyzer/flask_data_transfer.py b/backend/ianalyzer/flask_data_transfer.py index 85c4645a0..e33093580 100644 --- a/backend/ianalyzer/flask_data_transfer.py +++ b/backend/ianalyzer/flask_data_transfer.py @@ -12,6 +12,7 @@ from django.conf import settings import warnings from allauth.account.models import EmailAddress +from api.query_model_to_es_query import query_model_to_es_query def adapt_password_encoding(flask_encoded): @@ -163,9 +164,11 @@ def save_flask_query(row): # some queries refer to corpus names that no longer exist return + query_model = load_json_value(row['query']) + es_query = query_model_to_es_query(query_model) query = Query( id=row['id'], - query_json=load_json_value(row['query']), + query_json=es_query, corpus=Corpus.objects.get(name=corpus_name), user=CustomUser.objects.get(id=user_id), completed=null_to_none(row['completed']), diff --git a/backend/ianalyzer/settings_test.py b/backend/ianalyzer/settings_test.py index fd29c0508..88b610d2b 100644 --- a/backend/ianalyzer/settings_test.py +++ b/backend/ianalyzer/settings_test.py @@ -1,15 +1,19 @@ from ianalyzer.settings import * + def 
path_in_testdir(app, *path_from_app_tests): return os.path.join(BASE_DIR, app, 'tests', *path_from_app_tests) + CORPORA = { 'small-mock-corpus': path_in_testdir('visualization', 'mock_corpora', 'small_mock_corpus.py'), 'large-mock-corpus': path_in_testdir('visualization', 'mock_corpora', 'large_mock_corpus.py'), 'multilingual-mock-corpus': path_in_testdir('download', 'mock_corpora', 'multilingual_mock_corpus.py'), 'times': os.path.join(BASE_DIR, 'corpora', 'times', 'times.py'), 'media-mock-corpus': path_in_testdir('media', 'media_mock_corpus.py'), - 'mock-csv-corpus': path_in_testdir('addcorpus', 'mock_csv_corpus.py') + 'mock-csv-corpus': path_in_testdir('addcorpus', 'mock_csv_corpus.py'), + 'wordmodels-mock-corpus': path_in_testdir('wordmodels', 'mock-corpus', 'mock_corpus.py'), + 'tagging-mock-corpus': path_in_testdir('tag', 'tag_mock_corpus.py'), } TIMES_DATA = path_in_testdir('addcorpus', '') diff --git a/backend/ianalyzer/tests/test_flask_data_transfer.py b/backend/ianalyzer/tests/test_flask_data_transfer.py index 0618e65ef..034a799fd 100644 --- a/backend/ianalyzer/tests/test_flask_data_transfer.py +++ b/backend/ianalyzer/tests/test_flask_data_transfer.py @@ -70,7 +70,7 @@ def test_save_legacy_user(db): users = CustomUser.objects.all() assert len(users) == 4 - admin = users[0] + admin = CustomUser.objects.get(username='admin') assert admin.username == 'admin' assert admin.email == 'admin@ianalyzer.nl' assert admin.is_superuser @@ -119,7 +119,10 @@ def test_save_queries(db): query = Query.objects.get(id='507') assert query.query_json == { - "queryText": "", "filters": [], "sortBy": "date", "sortAscending": False} + "sort": [{"date": "desc"}], + "query": {"bool": {"must": {"match_all": {}}, "filter": []}} + } + assert dates_match(query.started, datetime(year=2022, month=12, day=7, hour=14, minute=18, second=6)) diff --git a/backend/media/conftest.py b/backend/media/conftest.py index 8947755f1..71006e2ce 100644 --- a/backend/media/conftest.py +++ 
b/backend/media/conftest.py @@ -1,12 +1,6 @@ import pytest -from users.models import CustomUser from addcorpus.load_corpus import load_all_corpora @pytest.fixture() def mock_corpus(): return 'media-mock-corpus' - -@pytest.fixture() -def mock_corpus_user(db, mock_corpora_in_db): - user = CustomUser.objects.create(username='mock-user', password='secret', is_superuser=True) - return user diff --git a/backend/media/tests/test_media.py b/backend/media/tests/test_media.py index bf47cd450..a8034ad6e 100644 --- a/backend/media/tests/test_media.py +++ b/backend/media/tests/test_media.py @@ -9,9 +9,8 @@ expected_url = f'/api/get_media?corpus=media-mock-corpus&image_path=images%2Fhamlet.png' -def test_media_views(client, mock_corpus, mock_corpus_user): - client.force_login(mock_corpus_user) - response = client.post( +def test_media_views(client, mock_corpus, admin_client): + response = admin_client.post( '/api/request_media', {'corpus': mock_corpus, 'document': example_document}, content_type='application/json' diff --git a/backend/requirements.in b/backend/requirements.in index 0356e66d8..ab5812765 100644 --- a/backend/requirements.in +++ b/backend/requirements.in @@ -23,3 +23,6 @@ celery Redis pypdf2 openpyxl +tqdm +langcodes +language_data diff --git a/backend/requirements.txt b/backend/requirements.txt index 8d3f58517..5bcacbe73 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -12,6 +12,8 @@ async-timeout==4.0.2 # via redis attrs==22.2.0 # via pytest +backports-zoneinfo==0.2.1 + # via django beautifulsoup4==4.11.1 # via # -r requirements.in @@ -84,6 +86,8 @@ elementpath==4.1.1 # via xmlschema et-xmlfile==1.1.0 # via openpyxl +exceptiongroup==1.1.1 + # via pytest execnet==1.9.0 # via pytest-xdist fst-pso==1.8.1 @@ -94,6 +98,8 @@ gensim==4.3.0 # via -r requirements.in idna==3.4 # via requests +importlib-resources==5.12.0 + # via pysaml2 iniconfig==2.0.0 # via pytest joblib==1.2.0 @@ -102,8 +108,14 @@ joblib==1.2.0 # scikit-learn kombu==5.2.4 # via 
celery +langcodes==3.3.0 + # via -r requirements.in +language-data==1.1 + # via -r requirements.in lxml==4.9.1 # via -r requirements.in +marisa-trie==0.7.8 + # via language-data miniful==0.0.6 # via fst-pso nltk==3.8.1 @@ -208,10 +220,16 @@ textdistance==4.5.0 # via -r requirements.in threadpoolctl==3.1.0 # via scikit-learn +tomli==2.0.1 + # via pytest tornado==6.3.2 # via django-livereload-server tqdm==4.64.1 - # via nltk + # via + # -r requirements.in + # nltk +typing-extensions==4.6.3 + # via pypdf2 urllib3==1.26.13 # via # django-revproxy @@ -226,3 +244,8 @@ wcwidth==0.2.6 # via prompt-toolkit xmlschema==2.2.3 # via pysaml2 +zipp==3.15.0 + # via importlib-resources + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/backend/tag/__init__.py b/backend/tag/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/tag/admin.py b/backend/tag/admin.py new file mode 100644 index 000000000..8c38f3f3d --- /dev/null +++ b/backend/tag/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. 
diff --git a/backend/tag/apps.py b/backend/tag/apps.py new file mode 100644 index 000000000..72862e535 --- /dev/null +++ b/backend/tag/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class TagConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'tag' diff --git a/backend/tag/conftest.py b/backend/tag/conftest.py new file mode 100644 index 000000000..780f0787b --- /dev/null +++ b/backend/tag/conftest.py @@ -0,0 +1,32 @@ +import pytest +from addcorpus.load_corpus import load_all_corpora +from tag.models import Tag, TagInstance +from addcorpus.models import Corpus + +@pytest.fixture() +def mock_corpus(db): + load_all_corpora() + return 'tagging-mock-corpus' + +@pytest.fixture() +def auth_user_tag(db, auth_user): + tag = Tag.objects.create( + name='fascinating', + description='some very interesting documents', + user=auth_user + ) + + return tag + +@pytest.fixture() +def tagged_documents(auth_user_tag, mock_corpus): + corpus = Corpus.objects.get(name=mock_corpus) + docs = ['1', '2', '3'] + + tagged = TagInstance.objects.create( + tag=auth_user_tag, + corpus=corpus, + document_ids=docs, + ) + + return tagged, docs diff --git a/backend/tag/migrations/0001_initial.py b/backend/tag/migrations/0001_initial.py new file mode 100644 index 000000000..6ff68a4ec --- /dev/null +++ b/backend/tag/migrations/0001_initial.py @@ -0,0 +1,41 @@ +# Generated by Django 4.1.9 on 2023-06-13 10:20 + +from django.conf import settings +import django.contrib.postgres.fields +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('addcorpus', '0002_alter_corpus_options'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='Tag', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', 
models.CharField(max_length=512)), + ('description', models.TextField(blank=True)), + ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tags', to=settings.AUTH_USER_MODEL)), + ], + ), + migrations.CreateModel( + name='TagInstance', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('document_ids', django.contrib.postgres.fields.ArrayField(base_field=models.CharField(max_length=512), default=list, size=500)), + ('corpus', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='tag_instances', to='addcorpus.corpus', to_field='name')), + ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='instances', to='tag.tag')), + ], + ), + migrations.AddConstraint( + model_name='tag', + constraint=models.UniqueConstraint(fields=('user', 'name'), name='unique_name_for_user'), + ), + ] diff --git a/backend/tag/migrations/__init__.py b/backend/tag/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/tag/models.py b/backend/tag/models.py new file mode 100644 index 000000000..51bd9784f --- /dev/null +++ b/backend/tag/models.py @@ -0,0 +1,47 @@ +from django.db import models +from django.db.models.constraints import UniqueConstraint +from django.contrib.postgres.fields import ArrayField + +from addcorpus.models import Corpus +from django.conf import settings + +DOCS_PER_TAG_LIMIT = 500 + +class Tag(models.Model): + name = models.CharField(blank=False, null=False, max_length=512) + description = models.TextField(blank=True, null=False) + user = models.ForeignKey( + to=settings.AUTH_USER_MODEL, + related_name='tags', + on_delete=models.CASCADE, + null=False, + ) + + class Meta: + constraints = [ + UniqueConstraint(fields=['user', 'name'], name='unique_name_for_user') + ] + +class TagInstance(models.Model): + tag = models.ForeignKey( + to=Tag, + related_name='instances', + 
on_delete=models.CASCADE, + null=False + ) + corpus = models.ForeignKey( + to=Corpus, + on_delete=models.CASCADE, + to_field='name', + related_name='tag_instances', + ) + document_ids = ArrayField( + models.CharField( + blank=False, + null=False, + max_length=512, + ), + default=list, + null=False, + size=DOCS_PER_TAG_LIMIT, + ) diff --git a/backend/tag/tests/data/test_data.csv b/backend/tag/tests/data/test_data.csv new file mode 100644 index 000000000..620276085 --- /dev/null +++ b/backend/tag/tests/data/test_data.csv @@ -0,0 +1,7 @@ +id,content +1,"Text: the final frontier..." +2,"There are the voyages of the starship I-Analyzer." +3,"Its continuing mission:" +4,"To explore strange new documents," +5,"To seek out new texts, and new corpora," +6,"To boldly go where no text-mining application has gone before!" diff --git a/backend/tag/tests/tag_mock_corpus.py b/backend/tag/tests/tag_mock_corpus.py new file mode 100644 index 000000000..3a92e6b78 --- /dev/null +++ b/backend/tag/tests/tag_mock_corpus.py @@ -0,0 +1,33 @@ +import os +import datetime + +from addcorpus.corpus import CSVCorpus, Field +from addcorpus.extract import CSV +from addcorpus.es_mappings import keyword_mapping, main_content_mapping + +here = os.path.abspath(os.path.dirname(__file__)) + +class TaggingMockCorpus(CSVCorpus): + title = 'Tagging Mock Corpus' + description = 'Mock corpus for tagging' + es_index = 'tagging-mock-corpus' + min_date = datetime.datetime(year=1, month=1, day=1) + max_date = datetime.datetime(year=2022, month=12, day=31) + image = 'nothing.jpeg' + data_directory = os.path.join(here, 'data') + languages = ['en'] + category = 'book' + + + fields = [ + Field( + name='id', + extractor=CSV('id'), + es_mapping=keyword_mapping() + ), + Field( + name='content', + extractor=CSV('content'), + es_mapping=main_content_mapping(), + ) + ] diff --git a/backend/tag/tests/test_tag_models.py b/backend/tag/tests/test_tag_models.py new file mode 100644 index 000000000..dfc776aa9 --- 
/dev/null +++ b/backend/tag/tests/test_tag_models.py @@ -0,0 +1,36 @@ +from addcorpus.models import Corpus +from tag.models import TagInstance, DOCS_PER_TAG_LIMIT +import pytest +from django.core.exceptions import ValidationError + +def test_tag_models(db, auth_user, auth_user_tag, tagged_documents): + assert len(auth_user.tags.all()) == 1 + + instance, docs = tagged_documents + + assert len(auth_user_tag.instances.all()) == 1 + assert len(instance.document_ids) == len(docs) + +def test_tag_lookup(mock_corpus, tagged_documents): + instance, docs = tagged_documents + corpus = Corpus.objects.get(name=mock_corpus) + + for doc in docs: + assert TagInstance.objects.filter(corpus=corpus, document_ids__contains=[doc]) + + assert not TagInstance.objects.filter(corpus=corpus, document_ids__contains=['not_tagged']) + +def test_max_length(db, mock_corpus, auth_user_tag): + corpus = Corpus.objects.get(name=mock_corpus) + instance = TagInstance.objects.create(tag=auth_user_tag, corpus=corpus) + + for i in range(DOCS_PER_TAG_LIMIT): + instance.document_ids.append(str(i)) + instance.save() + instance.full_clean() # should validate without error + + instance.document_ids.append('too_much') + instance.save() + + with pytest.raises(ValidationError): + instance.full_clean() diff --git a/backend/tag/views.py b/backend/tag/views.py new file mode 100644 index 000000000..91ea44a21 --- /dev/null +++ b/backend/tag/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. 
diff --git a/backend/visualization/conftest.py b/backend/visualization/conftest.py index 1dd9b48a1..e1ee46212 100644 --- a/backend/visualization/conftest.py +++ b/backend/visualization/conftest.py @@ -1,47 +1,45 @@ -from users.models import CustomUser import pytest import os -from ianalyzer.elasticsearch import elasticsearch from es import es_index as index -from addcorpus.load_corpus import load_corpus, load_all_corpora +from addcorpus.load_corpus import load_corpus from time import sleep from visualization.tests.mock_corpora.small_mock_corpus import SPECS as SMALL_MOCK_CORPUS_SPECS from visualization.tests.mock_corpora.large_mock_corpus import SPECS as LARGE_MOCK_CORPUS_SPECS -from redis import Redis here = os.path.abspath(os.path.dirname(__file__)) -@pytest.fixture(params=['small-mock-corpus', 'large-mock-corpus'], scope='module') +@pytest.fixture(scope='session') +def small_mock_corpus(): + return 'small-mock-corpus' + +@pytest.fixture(scope='session') +def large_mock_corpus(): + return 'large-mock-corpus' + +@pytest.fixture(params=['small-mock-corpus', 'large-mock-corpus'], scope='session') def mock_corpus(request): - '''Return the name of a mock corpus''' + 'parametrised version of the mock corpus fixtures: runs with both' return request.param @pytest.fixture() -def select_small_mock_corpus(mock_corpus): - '''Only run test with the small mock corpus - skip otherwise''' - - if mock_corpus != 'small-mock-corpus': - pytest.skip() - - return mock_corpus +def small_mock_corpus_specs(): + '''Return various specifications for the mock corpus (number of documents etc.)''' + return SMALL_MOCK_CORPUS_SPECS @pytest.fixture() -def select_large_mock_corpus(mock_corpus): - '''Only run test with the large mock corpus - skip otherwise.''' - - if mock_corpus != 'large-mock-corpus': - pytest.skip() - - return mock_corpus +def large_mock_corpus_specs(): + '''Return various specifications for the mock corpus (number of documents etc.)''' + return 
LARGE_MOCK_CORPUS_SPECS @pytest.fixture() -def mock_corpus_specs(mock_corpus): +def mock_corpus_specs(mock_corpus, small_mock_corpus, large_mock_corpus, + small_mock_corpus_specs, large_mock_corpus_specs): '''Return various specifications for the mock corpus (number of documents etc.)''' specs = { - 'small-mock-corpus': SMALL_MOCK_CORPUS_SPECS, - 'large-mock-corpus': LARGE_MOCK_CORPUS_SPECS, + small_mock_corpus: small_mock_corpus_specs, + large_mock_corpus: large_mock_corpus_specs, } return specs[mock_corpus] @@ -56,30 +54,31 @@ def index_test_corpus(es_client, corpus_name): def clear_test_corpus(es_client, corpus_name): corpus = load_corpus(corpus_name) index = corpus.es_index - es_client.indices.delete(index = index) + # check existence in case teardown is executed more than once + if es_client.indices.exists(index = index): + es_client.indices.delete(index = index) -@pytest.fixture(scope='module') -def index_mock_corpus(mock_corpus, es_client): - '''Create and populate an index for the mock corpus.''' +@pytest.fixture(scope='session') +def index_small_mock_corpus(small_mock_corpus, es_client): + '''Create and populate an index for the small mock corpus.''' - index_test_corpus(es_client, mock_corpus) - yield mock_corpus - clear_test_corpus(es_client, mock_corpus) + index_test_corpus(es_client, small_mock_corpus) + yield small_mock_corpus + clear_test_corpus(es_client, small_mock_corpus) -@pytest.fixture() -def corpus_user(transactional_db, mock_corpus): # use transactional_db instead of db for async task support - '''Make a user with access to the mock corpus''' +@pytest.fixture(scope='session') +def index_large_mock_corpus(large_mock_corpus, es_client): + '''Create and populate an index for the large mock corpus''' - username = 'mock-user' - password = 'secret' - user = CustomUser.objects.create(username=username, password=password, is_superuser=True) - load_all_corpora() - return user + index_test_corpus(es_client, large_mock_corpus) + yield large_mock_corpus + 
clear_test_corpus(es_client, large_mock_corpus) -@pytest.fixture() -def authenticated_client(client, corpus_user): - client.force_login(corpus_user) - return client +@pytest.fixture(scope='module') +def index_mock_corpus(mock_corpus, index_small_mock_corpus, index_large_mock_corpus): + '''Create and populate an index for the mock corpus.''' + + yield mock_corpus @pytest.fixture def basic_query(): diff --git a/backend/visualization/field_stats.py b/backend/visualization/field_stats.py new file mode 100644 index 000000000..5ac27c680 --- /dev/null +++ b/backend/visualization/field_stats.py @@ -0,0 +1,52 @@ +from ianalyzer.elasticsearch import elasticsearch +from es.search import total_hits, search +from addcorpus.load_corpus import load_corpus +from visualization.query import MATCH_ALL + +def count_field(es_client, corpus_name, fieldname): + ''' + The absolute of documents that has a value for this field + ''' + + body = {'query': {'exists': {'field': fieldname}}} + result = search( + corpus=corpus_name, + query_model=body, + client=es_client, + size=0, + track_total_hits=True, + ) + + return total_hits(result) + + +def count_total(es_client, corpus_name): + ''' + The total number of documents in the corpus + ''' + + result = search( + corpus=corpus_name, + client=es_client, + query_model=MATCH_ALL, + size=0, + track_total_hits=True, + ) + return total_hits(result) + +def report_coverage(corpus_name): + ''' + Returns a dict with the ratio of documents that have a value for each field in the corpus + ''' + + es_client = elasticsearch(corpus_name) + corpus = load_corpus(corpus_name) + + total = count_total(es_client, corpus_name) + + return { + field.name: count_field(es_client, corpus_name, field.name) / total + for field in corpus.fields + } + + diff --git a/backend/visualization/query.py b/backend/visualization/query.py index af0b940eb..aa7d2e306 100644 --- a/backend/visualization/query.py +++ b/backend/visualization/query.py @@ -18,31 +18,35 @@ def 
set_query_text(query, text): if get_query_text(query): new_query['query']['bool']['must']['simple_query_string']['query'] = text - elif query['query']['bool']['must']: - new_query['query']['bool']['must'] = { - "simple_query_string": { - "query": text, - "lenient": True, - "default_operator": "or" - } - } + elif 'bool' in query['query'] and query['query']['bool']['must']: + new_query['query']['bool']['must'] = format_query_text(text) else: new_query['query'] ={ "bool": { - "must": { - "simple_query_string": { - "query": text, - "lenient": True, - "default_operator": "or" - } - }, + "must": format_query_text(text), "filter": [] } } return new_query +def format_query_text(query_text = None): + '''Render the portion of the query that specifies the query text. Either simple_query_string, + or match_all if the query text is None.''' + + if query_text: + return {'simple_query_string': + { + 'query': query_text, + 'lenient': True, + 'default_operator':'or' + } + } + else: + return {'match_all': {}} + + def get_search_fields(query): """Get the search fields specified in the query.""" try: @@ -52,12 +56,21 @@ def get_search_fields(query): return fields +def set_search_fields(query, fields): + '''Set the search fields for a query''' + + if get_query_text(query) == None: + return query + else: + query['query']['bool']['must']['simple_query_string']['fields'] = fields + return query + def get_filters(query): - """Get the list of filters in a query, or `None` if there are none.""" + """Get the list of filters in a query. Returns an empty list if there are none.""" try: filters = query['query']['bool']['filter'] except KeyError: - filters = None + filters = [] return filters @@ -121,6 +134,22 @@ def make_term_filter(field, value): } } +def set_sort(query, sort_by, sort_direction): + '''sets the 'sort' specification for a query. 
+ Parameters: + - `query`: elasticsearch query + - `sort_by`: string; the name of the field by which you want to sort + - `direction`: either `'asc'` or `'desc'` + ''' + specification = [{sort_by:sort_direction}] + query['sort'] = specification + return query + +def set_highlight(query, fragment_size): + specification = { 'fragment_size': fragment_size } + query['highlight'] = specification + return query + def remove_query(query): """ Remove the query part of the query object diff --git a/backend/visualization/tests/mock_corpora/large_mock_corpus.py b/backend/visualization/tests/mock_corpora/large_mock_corpus.py index 1e3795ee7..c6e7a0236 100644 --- a/backend/visualization/tests/mock_corpora/large_mock_corpus.py +++ b/backend/visualization/tests/mock_corpora/large_mock_corpus.py @@ -34,6 +34,8 @@ class LargeMockCorpus(Corpus): es_index = 'large-mock-corpus' image = 'test.jpeg' data_directory = 'bogus' + languages = ['en'] + category = 'book' def sources(self, start=min_date, end=max_date): return range(TOTAL_DOCUMENTS) diff --git a/backend/visualization/tests/mock_corpora/small_mock_corpus.py b/backend/visualization/tests/mock_corpora/small_mock_corpus.py index eed44cc9d..92c6cbdc3 100644 --- a/backend/visualization/tests/mock_corpora/small_mock_corpus.py +++ b/backend/visualization/tests/mock_corpora/small_mock_corpus.py @@ -16,6 +16,8 @@ class SmallMockCorpus(CSVCorpus): es_index = 'ianalyzer-mock-corpus' image = 'test.jpeg' data_directory = 'bogus' + languages = ['en'] + category = 'book' def sources(self, start=min_date, end=max_date): for csv_file in os.listdir(os.path.join(here, 'source_files')): diff --git a/backend/visualization/tests/test_field_stats.py b/backend/visualization/tests/test_field_stats.py new file mode 100644 index 000000000..46c4321f1 --- /dev/null +++ b/backend/visualization/tests/test_field_stats.py @@ -0,0 +1,22 @@ +from visualization.field_stats import count_field, count_total, report_coverage + + +def test_count(small_mock_corpus, 
es_client, index_small_mock_corpus, small_mock_corpus_specs): + total_docs = small_mock_corpus_specs['total_docs'] + + for field in small_mock_corpus_specs['fields']: + count = count_field(es_client, small_mock_corpus, field) + assert count == total_docs + + assert count_total(es_client, small_mock_corpus) == total_docs + + +def test_report(small_mock_corpus, es_client,index_small_mock_corpus, small_mock_corpus_specs): + report = report_coverage(small_mock_corpus) + + assert report == { + 'date': 1.0, + 'title': 1.0, + 'content': 1.0, + 'genre': 1.0, + } diff --git a/backend/visualization/tests/test_ngrams.py b/backend/visualization/tests/test_ngrams.py index 3efa3f2ee..86bc2e1e0 100644 --- a/backend/visualization/tests/test_ngrams.py +++ b/backend/visualization/tests/test_ngrams.py @@ -21,11 +21,11 @@ (1890, 1894), (1895, 1899) ] -def test_total_time_interval_no_filter(basic_query, mock_corpus, select_small_mock_corpus, mock_corpus_specs): +def test_total_time_interval_no_filter(basic_query, small_mock_corpus, small_mock_corpus_specs): # no date filter: should use corpus min_date/max_date - min_date, max_date = ngram.get_total_time_interval(basic_query, mock_corpus) - assert min_date == mock_corpus_specs['min_date'] - assert max_date == mock_corpus_specs['max_date'] + min_date, max_date = ngram.get_total_time_interval(basic_query, small_mock_corpus) + assert min_date == small_mock_corpus_specs['min_date'] + assert max_date == small_mock_corpus_specs['max_date'] def test_total_time_interval_with_filter(mock_corpus, basic_query): datefilter = query.make_date_filter(FILTER_MIN_DATE, FILTER_MAX_DATE) @@ -36,16 +36,16 @@ def test_total_time_interval_with_filter(mock_corpus, basic_query): assert min_date == FILTER_MIN_DATE assert max_date == FILTER_MAX_DATE -def test_time_bins(mock_corpus, select_small_mock_corpus, basic_query): +def test_time_bins(small_mock_corpus, basic_query): # 100 year interval - bins = ngram.get_time_bins(basic_query, mock_corpus) + bins = 
ngram.get_time_bins(basic_query, small_mock_corpus) target_bins = CENTURY_BINS assert bins == target_bins # 10 year interval datefilter = query.make_date_filter(FILTER_MIN_DATE, FILTER_MAX_DATE) query_with_date_filter = query.add_filter(basic_query, datefilter) - bins = ngram.get_time_bins(query_with_date_filter, mock_corpus) + bins = ngram.get_time_bins(query_with_date_filter, small_mock_corpus) target_bins = [ (1850, 1850), (1851, 1851), (1852, 1852), (1853, 1853), @@ -55,13 +55,13 @@ def test_time_bins(mock_corpus, select_small_mock_corpus, basic_query): ] assert bins == target_bins -def test_short_interval(mock_corpus, select_small_mock_corpus, basic_query): +def test_short_interval(small_mock_corpus, basic_query): start_date = datetime(year=1850, month=1, day=1) end_date = datetime(year=1850, month=12, day=31) date_filter = query.make_date_filter(start_date, end_date) short_query = query.add_filter(basic_query, date_filter) - bins = ngram.get_time_bins(short_query, mock_corpus) + bins = ngram.get_time_bins(short_query, small_mock_corpus) assert bins == [(1850, 1850)] @@ -114,7 +114,7 @@ def test_top_10_ngrams(): -def test_absolute_bigrams(mock_corpus, select_small_mock_corpus, index_mock_corpus, basic_query): +def test_absolute_bigrams(small_mock_corpus, index_small_mock_corpus, basic_query): # search for a word that occurs a few times frequent_query = query.set_query_text(basic_query, 'to') @@ -160,7 +160,7 @@ def test_absolute_bigrams(mock_corpus, select_small_mock_corpus, index_mock_corp } ] - result = ngram.get_ngrams(frequent_query, mock_corpus, 'content', freq_compensation=False) + result = ngram.get_ngrams(frequent_query, small_mock_corpus, 'content', freq_compensation=False) assert result['time_points'] == ['{}-{}'.format(start, end) for start, end in CENTURY_BINS] @@ -174,7 +174,7 @@ def test_absolute_bigrams(mock_corpus, select_small_mock_corpus, index_mock_corp else: assert freq == 0 -def test_bigrams_with_quote(mock_corpus, 
select_small_mock_corpus, index_mock_corpus, basic_query): +def test_bigrams_with_quote(small_mock_corpus, index_small_mock_corpus, basic_query): cases = [ { 'query': '"to hear"', @@ -204,7 +204,7 @@ def test_bigrams_with_quote(mock_corpus, select_small_mock_corpus, index_mock_co # search for a word that occurs a few times case_query = query.set_query_text(basic_query, case['query']) - result = ngram.get_ngrams(case_query, mock_corpus, 'content', freq_compensation=False) + result = ngram.get_ngrams(case_query, small_mock_corpus, 'content', freq_compensation=False) ngrams = case['ngrams'] @@ -253,14 +253,14 @@ def test_bigrams_with_quote(mock_corpus, select_small_mock_corpus, index_mock_co } ] -def test_number_of_ngrams(mock_corpus, select_small_mock_corpus, index_mock_corpus, basic_query): +def test_number_of_ngrams(small_mock_corpus, index_small_mock_corpus, basic_query): # search for a word that occurs a few times frequent_query = query.set_query_text(basic_query, 'to') max_frequency = 6 for size in range(1, max_frequency + 2): - result = ngram.get_ngrams(frequent_query, mock_corpus, 'content', number_of_ngrams= size) + result = ngram.get_ngrams(frequent_query, small_mock_corpus, 'content', number_of_ngrams= size) series = result['words'] assert len(series) == min(max_frequency, size) diff --git a/backend/visualization/tests/test_query.py b/backend/visualization/tests/test_query.py index 96ac6519c..12afe6235 100644 --- a/backend/visualization/tests/test_query.py +++ b/backend/visualization/tests/test_query.py @@ -36,13 +36,13 @@ def test_date_manipulation(basic_query): assert query_min_date == min_date assert query_max_date == max_date -def test_search(mock_corpus, es_client, select_small_mock_corpus, index_mock_corpus, basic_query): +def test_search(small_mock_corpus, es_client, index_small_mock_corpus, basic_query): """ Test some search requests based on queries manipulated in the query module """ query_no_text = query.remove_query(basic_query) result = search( 
- corpus = mock_corpus, + corpus = small_mock_corpus, query_model=query_no_text, client=es_client, ) @@ -54,7 +54,7 @@ def test_search(mock_corpus, es_client, select_small_mock_corpus, index_mock_cor query_no_text = query.add_filter(query_no_text, date_filter) result = search( - corpus = mock_corpus, + corpus = small_mock_corpus, query_model = query_no_text, client=es_client ) diff --git a/backend/visualization/tests/test_term_frequency.py b/backend/visualization/tests/test_term_frequency.py index c57274022..a3fcddef1 100644 --- a/backend/visualization/tests/test_term_frequency.py +++ b/backend/visualization/tests/test_term_frequency.py @@ -2,9 +2,9 @@ import csv -def test_extract_data_for_term_frequency(mock_corpus, select_small_mock_corpus): +def test_extract_data_for_term_frequency(small_mock_corpus): es_query = make_query('test', ['content', 'title']) - search_fields, aggregators = term_frequency.extract_data_for_term_frequency(mock_corpus, es_query) + search_fields, aggregators = term_frequency.extract_data_for_term_frequency(small_mock_corpus, es_query) # fieldnames should look at specified fields target_fields = ['content', 'title'] @@ -16,7 +16,7 @@ def test_extract_data_for_term_frequency(mock_corpus, select_small_mock_corpus): # restrict the search field to one with token counts fields_with_token_counts = ['content'] es_query = make_query('test', fields_with_token_counts) - fieldnames, aggregators = term_frequency.extract_data_for_term_frequency(mock_corpus, es_query) + fieldnames, aggregators = term_frequency.extract_data_for_term_frequency(small_mock_corpus, es_query) # fieldnames should be restricted as well assert set(fields_with_token_counts) == set(fieldnames) @@ -31,7 +31,7 @@ def test_extract_data_for_term_frequency(mock_corpus, select_small_mock_corpus): } assert aggregators == aggregators_target -def test_match_count(mock_corpus, es_client, select_small_mock_corpus, index_mock_corpus): +def test_match_count(small_mock_corpus, es_client, 
index_small_mock_corpus): """Test counting matches of the search term""" frequencies = [ @@ -52,8 +52,8 @@ def test_match_count(mock_corpus, es_client, select_small_mock_corpus, index_moc for text, freq in frequencies: query = make_query(query_text=text) - fieldnames, aggregators = term_frequency.extract_data_for_term_frequency(mock_corpus, query) - match_count = term_frequency.get_match_count(es_client, query, mock_corpus, 100, fieldnames) + fieldnames, aggregators = term_frequency.extract_data_for_term_frequency(small_mock_corpus, query) + match_count = term_frequency.get_match_count(es_client, query, small_mock_corpus, 100, fieldnames) assert match_count == freq def test_total_docs_and_tokens(es_client, mock_corpus, index_mock_corpus, mock_corpus_specs): @@ -67,25 +67,25 @@ def test_total_docs_and_tokens(es_client, mock_corpus, index_mock_corpus, mock_c assert total_doc_count == mock_corpus_specs['total_docs'] assert token_count == (mock_corpus_specs['total_words'] if mock_corpus_specs['has_token_counts'] else None) -def test_term_frequency(mock_corpus, select_small_mock_corpus, index_mock_corpus, mock_corpus_specs,): +def test_term_frequency(small_mock_corpus, index_small_mock_corpus, small_mock_corpus_specs,): ## search in all fields query = make_query(query_text='Alice') - match_count, total_doc_count, token_count = term_frequency.get_term_frequency(query, mock_corpus, 100) + match_count, total_doc_count, token_count = term_frequency.get_term_frequency(query, small_mock_corpus, 100) assert match_count == 2 - assert total_doc_count == mock_corpus_specs['total_docs'] + assert total_doc_count == small_mock_corpus_specs['total_docs'] assert token_count == None ## search in content (includes token count) query = make_query(query_text='Alice', search_in_fields=['content']) - match_count, total_doc_count, token_count = term_frequency.get_term_frequency(query, mock_corpus, 100) + match_count, total_doc_count, token_count = term_frequency.get_term_frequency(query, 
small_mock_corpus, 100) assert match_count == 1 - assert total_doc_count == mock_corpus_specs['total_docs'] - assert token_count == mock_corpus_specs['total_words'] + assert total_doc_count == small_mock_corpus_specs['total_docs'] + assert token_count == small_mock_corpus_specs['total_words'] -def test_histogram_term_frequency(mock_corpus, select_small_mock_corpus, index_mock_corpus): +def test_histogram_term_frequency(small_mock_corpus, index_small_mock_corpus): cases = [ { @@ -105,7 +105,7 @@ def test_histogram_term_frequency(mock_corpus, select_small_mock_corpus, index_m for case in cases: query = make_query(query_text='of', search_in_fields=['content']) - result = term_frequency.get_aggregate_term_frequency(query, mock_corpus, 'genre', case['genre']) + result = term_frequency.get_aggregate_term_frequency(query, small_mock_corpus, 'genre', case['genre']) assert result == { 'key': case['genre'], @@ -114,7 +114,7 @@ def test_histogram_term_frequency(mock_corpus, select_small_mock_corpus, index_m 'token_count': case['tokens'] } -def test_timeline_term_frequency(mock_corpus, select_small_mock_corpus, index_mock_corpus): +def test_timeline_term_frequency(small_mock_corpus, index_small_mock_corpus): cases = [ { @@ -128,7 +128,7 @@ def test_timeline_term_frequency(mock_corpus, select_small_mock_corpus, index_mo for case in cases: query = make_query(query_text='of', search_in_fields=['content']) - result = term_frequency.get_date_term_frequency(query, mock_corpus, 'date', case['min_date'], case['max_date']) + result = term_frequency.get_date_term_frequency(query, small_mock_corpus, 'date', case['min_date'], case['max_date']) assert result == { 'key': case['min_date'], diff --git a/backend/visualization/tests/test_termvectors.py b/backend/visualization/tests/test_termvectors.py index 8433d0ead..215ba6ad5 100644 --- a/backend/visualization/tests/test_termvectors.py +++ b/backend/visualization/tests/test_termvectors.py @@ -24,7 +24,7 @@ def test_tokens(termvectors_result): 
assert token['term'] == word assert token['ttf'] == 1 # title has no duplicate words -def test_find_matches(es_client, termvectors_result): +def test_find_matches(es_client, termvectors_result, small_mock_corpus): title_terms = termvectors.get_terms(termvectors_result, 'title') title_tokens = termvectors.get_tokens(title_terms, sort = True) @@ -43,8 +43,9 @@ def test_find_matches(es_client, termvectors_result): ('fronkenstien~2 modern', 2), ] + corpus = load_corpus(small_mock_corpus) for query_text, expected_matches in cases: - matches = list(termvectors.token_matches(title_tokens, query_text, 'ianalyzer-mock-corpus', 'title', es_client)) + matches = list(termvectors.token_matches(title_tokens, query_text, corpus.es_index, 'title', es_client)) assert len(matches) == expected_matches QUERY_ANALYSIS_CASES = [ @@ -89,8 +90,8 @@ def test_query_components(): assert sorted(components) == sorted(case['components']) # ignore order -def test_query_analysis(es_client, mock_corpus, index_mock_corpus, select_small_mock_corpus): - corpus = load_corpus(mock_corpus) +def test_query_analysis(es_client, small_mock_corpus, index_small_mock_corpus): + corpus = load_corpus(small_mock_corpus) es_index = corpus.es_index for case in QUERY_ANALYSIS_CASES: @@ -99,8 +100,8 @@ def test_query_analysis(es_client, mock_corpus, index_mock_corpus, select_small_ @pytest.fixture -def termvectors_result(es_client, mock_corpus, index_mock_corpus, select_small_mock_corpus): - corpus = load_corpus(mock_corpus) +def termvectors_result(es_client, small_mock_corpus, index_small_mock_corpus): + corpus = load_corpus(small_mock_corpus) es_index = corpus.es_index frankenstein_query = { @@ -110,7 +111,7 @@ def termvectors_result(es_client, mock_corpus, index_mock_corpus, select_small_m } } } - result = search.search(mock_corpus, frankenstein_query, es_client) + result = search.search(small_mock_corpus, frankenstein_query, es_client) hit = search.hits(result)[0] id = hit['_id'] diff --git 
a/backend/visualization/tests/test_visualization_views.py b/backend/visualization/tests/test_visualization_views.py index 9d82cb2cf..c80df21db 100644 --- a/backend/visualization/tests/test_visualization_views.py +++ b/backend/visualization/tests/test_visualization_views.py @@ -11,8 +11,8 @@ def wordcloud_body(mock_corpus): 'size': 1000, } -def test_wordcloud_view(authenticated_client, mock_corpus, index_mock_corpus, wordcloud_body): - response = authenticated_client.post( +def test_wordcloud_view(admin_client, mock_corpus, index_mock_corpus, wordcloud_body): + response =admin_client.post( '/api/visualization/wordcloud', wordcloud_body, content_type='application/json' @@ -20,10 +20,10 @@ def test_wordcloud_view(authenticated_client, mock_corpus, index_mock_corpus, wo assert status.is_success(response.status_code) @pytest.fixture -def date_term_frequency_body(basic_query, mock_corpus, select_small_mock_corpus): +def date_term_frequency_body(basic_query, small_mock_corpus): return { 'es_query': basic_query, - 'corpus_name': mock_corpus, + 'corpus_name': small_mock_corpus, 'field_name': 'date', 'bins': [ { @@ -41,10 +41,10 @@ def date_term_frequency_body(basic_query, mock_corpus, select_small_mock_corpus) } @pytest.fixture -def aggregate_term_frequency_body(basic_query, mock_corpus, select_small_mock_corpus): +def aggregate_term_frequency_body(basic_query, small_mock_corpus): return { 'es_query': basic_query, - 'corpus_name': mock_corpus, + 'corpus_name': small_mock_corpus, 'field_name': 'genre', 'bins': [ { 'field_value': 'Romance', 'size': 10 }, @@ -54,10 +54,10 @@ def aggregate_term_frequency_body(basic_query, mock_corpus, select_small_mock_co } @pytest.fixture -def ngram_body(basic_query, mock_corpus, select_small_mock_corpus): +def ngram_body(basic_query, small_mock_corpus): return { 'es_query': basic_query, - 'corpus_name': mock_corpus, + 'corpus_name': small_mock_corpus, 'field': 'content', 'ngram_size': 2, 'term_position': [0, 1], @@ -68,21 +68,20 @@ def 
ngram_body(basic_query, mock_corpus, select_small_mock_corpus): 'date_field': 'date', } - -def test_ngrams(authenticated_client, ngram_body, index_mock_corpus, celery_worker): - post_response = authenticated_client.post('/api/visualization/ngram', ngram_body, content_type='application/json') +def test_ngrams(transactional_db, admin_client, ngram_body, index_small_mock_corpus, celery_worker): + post_response = admin_client.post('/api/visualization/ngram', ngram_body, content_type='application/json') assert post_response.status_code == 200 -def test_aggregate_term_frequency(authenticated_client, aggregate_term_frequency_body, index_mock_corpus, celery_worker): - post_response = authenticated_client.post('/api/visualization/aggregate_term_frequency', aggregate_term_frequency_body, content_type='application/json') +def test_aggregate_term_frequency(transactional_db, admin_client, aggregate_term_frequency_body, index_small_mock_corpus, celery_worker): + post_response = admin_client.post('/api/visualization/aggregate_term_frequency', aggregate_term_frequency_body, content_type='application/json') assert post_response.status_code == 200 del aggregate_term_frequency_body['es_query'] - post_response = authenticated_client.post('/api/visualization/aggregate_term_frequency', aggregate_term_frequency_body, content_type='application/json') + post_response = admin_client.post('/api/visualization/aggregate_term_frequency', aggregate_term_frequency_body, content_type='application/json') assert post_response.status_code == 400 -def test_date_term_frequency(authenticated_client, date_term_frequency_body, index_mock_corpus, celery_worker): - post_response = authenticated_client.post('/api/visualization/date_term_frequency', date_term_frequency_body, content_type='application/json') +def test_date_term_frequency(transactional_db, admin_client, date_term_frequency_body, index_small_mock_corpus, celery_worker): + post_response = 
admin_client.post('/api/visualization/date_term_frequency', date_term_frequency_body, content_type='application/json') assert post_response.status_code == 200 del date_term_frequency_body['es_query'] - post_response = authenticated_client.post('/api/visualization/date_term_frequency', date_term_frequency_body, content_type='application/json') + post_response = admin_client.post('/api/visualization/date_term_frequency', date_term_frequency_body, content_type='application/json') assert post_response.status_code == 400 diff --git a/backend/visualization/tests/test_wordcloud.py b/backend/visualization/tests/test_wordcloud.py index d8c680f18..5f887467c 100644 --- a/backend/visualization/tests/test_wordcloud.py +++ b/backend/visualization/tests/test_wordcloud.py @@ -16,7 +16,7 @@ def make_filtered_query(): return query.add_filter(empty_query, datefilter) -def test_wordcloud(mock_corpus, select_small_mock_corpus, index_mock_corpus): +def test_wordcloud(small_mock_corpus, index_small_mock_corpus): query = { "query": { "match_all": {} @@ -24,7 +24,7 @@ def test_wordcloud(mock_corpus, select_small_mock_corpus, index_mock_corpus): } result = search.search( - corpus = mock_corpus, + corpus = small_mock_corpus, query_model = query, size = 10 ) @@ -76,7 +76,7 @@ def test_wordcloud(mock_corpus, select_small_mock_corpus, index_mock_corpus): { 'key': 'accompanied', 'doc_count': 1 } ] - output = wordcloud.make_wordcloud_data(documents, 'content', mock_corpus) + output = wordcloud.make_wordcloud_data(documents, 'content', small_mock_corpus) for item in target_unfiltered: term = item['key'] doc_count = item['doc_count'] @@ -84,7 +84,7 @@ def test_wordcloud(mock_corpus, select_small_mock_corpus, index_mock_corpus): assert match assert doc_count == match['doc_count'] -def test_wordcloud_filtered(mock_corpus, select_small_mock_corpus, es_client, index_mock_corpus): +def test_wordcloud_filtered(small_mock_corpus, es_client, index_small_mock_corpus): """Test the word cloud on a query with 
date filter""" filtered_query = make_filtered_query() @@ -121,14 +121,14 @@ def test_wordcloud_filtered(mock_corpus, select_small_mock_corpus, es_client, in ] result = search.search( - corpus = mock_corpus, + corpus = small_mock_corpus, query_model = filtered_query, size = 10, client = es_client ) documents = search.hits(result) - output = wordcloud.make_wordcloud_data(documents, 'content', mock_corpus) + output = wordcloud.make_wordcloud_data(documents, 'content', small_mock_corpus) for item in target_filtered: term = item['key'] diff --git a/backend/visualization/urls.py b/backend/visualization/urls.py index b9ea66a93..7c943cdaf 100644 --- a/backend/visualization/urls.py +++ b/backend/visualization/urls.py @@ -7,4 +7,5 @@ path('ngram', NgramView.as_view()), path('date_term_frequency', DateTermFrequencyView.as_view()), path('aggregate_term_frequency', AggregateTermFrequencyView.as_view()), + path('coverage/', FieldCoverageView.as_view()) ] diff --git a/backend/visualization/views.py b/backend/visualization/views.py index ebbe517b1..47b4b3610 100644 --- a/backend/visualization/views.py +++ b/backend/visualization/views.py @@ -7,10 +7,13 @@ from django.conf import settings from rest_framework.permissions import IsAuthenticated from addcorpus.permissions import CorpusAccessPermission +from visualization.field_stats import report_coverage +from addcorpus.permissions import corpus_name_from_request from api.utils import check_json_keys logger = logging.getLogger() + class WordcloudView(APIView): ''' Most frequent terms for a small batch of results @@ -22,15 +25,18 @@ def post(self, request, *args, **kwargs): check_json_keys(request, ['corpus', 'es_query', 'field', 'size']) wordcloud_limit = settings.WORDCLOUD_LIMIT if request.data['size'] > wordcloud_limit: - raise ParseError(detail=f'size exceeds {wordcloud_limit} documents') + raise ParseError( + detail=f'size exceeds {wordcloud_limit} documents') try: - word_counts = tasks.get_wordcloud_data(request.data) # no need 
to run async: we will use the result directly + # no need to run async: we will use the result directly + word_counts = tasks.get_wordcloud_data(request.data) return Response(word_counts) except Exception as e: logger.error(e) raise APIException(detail='could not generate word cloud data') + class WordcloudTaskView(APIView): ''' Schedule a task to retrieve the word cloud @@ -49,6 +55,7 @@ def post(self, request, *args, **kwargs): except: raise APIException('Could not set up word cloud generation') + class NgramView(APIView): ''' Schedule a task to retrieve ngrams containing the search term @@ -72,6 +79,7 @@ def post(self, request, *args, **kwargs): logger.error(e) raise APIException(detail='Could not set up ngram generation.') + class DateTermFrequencyView(APIView): ''' Schedule a task to retrieve term frequency @@ -81,15 +89,18 @@ class DateTermFrequencyView(APIView): permission_classes = [IsAuthenticated, CorpusAccessPermission] def post(self, request, *args, **kwargs): - check_json_keys(request, ['es_query', 'corpus_name', 'field_name', 'bins']) + check_json_keys( + request, ['es_query', 'corpus_name', 'field_name', 'bins']) for bin in request.data['bins']: for key in ['start_date', 'end_date', 'size']: if not key in bin: - raise ParseError(detail=f'key {key} is not present for all bins in request data') + raise ParseError( + detail=f'key {key} is not present for all bins in request data') try: - group = tasks.timeline_term_frequency_tasks(request.data).apply_async() + group = tasks.timeline_term_frequency_tasks( + request.data).apply_async() subtasks = group.children return Response({'task_ids': [task.id for task in subtasks]}) except Exception as e: @@ -104,17 +115,33 @@ class AggregateTermFrequencyView(APIView): ''' def post(self, request, *args, **kwargs): - check_json_keys(request, ['es_query', 'corpus_name', 'field_name', 'bins']) + check_json_keys( + request, ['es_query', 'corpus_name', 'field_name', 'bins']) for bin in request.data['bins']: for key in 
['field_value', 'size']: if not key in bin: - raise ParseError(detail=f'key {key} is not present for all bins in request data') + raise ParseError( + detail=f'key {key} is not present for all bins in request data') try: - group = tasks.histogram_term_frequency_tasks(request.data).apply_async() + group = tasks.histogram_term_frequency_tasks( + request.data).apply_async() subtasks = group.children return Response({'task_ids': [task.id for task in subtasks]}) except Exception as e: logger.error(e) raise APIException('Could not set up term frequency generation.') + + +class FieldCoverageView(APIView): + ''' + Get the coverage of each field in a corpus + ''' + + permission_classes = [IsAuthenticated, CorpusAccessPermission] + + def get(self, request, *args, **kwargs): + corpus = corpus_name_from_request(request) + report = report_coverage(corpus) + return Response(report) diff --git a/backend/wordmodels/conftest.py b/backend/wordmodels/conftest.py index 9e2211884..73042a427 100644 --- a/backend/wordmodels/conftest.py +++ b/backend/wordmodels/conftest.py @@ -1,9 +1,5 @@ import pytest import os -from django.conf import settings -from users.models import CustomUser -from addcorpus.load_corpus import load_all_corpora -from addcorpus.models import Corpus here = os.path.abspath(os.path.dirname(__file__)) @@ -13,28 +9,5 @@ @pytest.fixture() -def mock_corpus_settings(settings): - settings.CORPORA = { - 'mock-corpus': os.path.join(here, 'tests', 'mock-corpus', 'mock_corpus.py'), - } - return settings - -@pytest.fixture() -def mock_corpus(mock_corpus_settings): - ''' return the first key of the CORPORA dict ''' - return next(iter(settings.CORPORA.keys())) - -@pytest.fixture() -def corpus_user(db, mock_corpus): - '''Make a user with access to the mock corpus''' - - username = 'mock-user' - password = 'secret' - user = CustomUser.objects.create(username=username, password=password, is_superuser = True) - load_all_corpora() - return user - -@pytest.fixture() -def 
authenticated_client(client, corpus_user): - client.force_login(corpus_user) - return client +def mock_corpus(): + return 'wordmodels-mock-corpus' diff --git a/backend/wordmodels/similarity.py b/backend/wordmodels/similarity.py index 04664b5f5..3a9d45602 100644 --- a/backend/wordmodels/similarity.py +++ b/backend/wordmodels/similarity.py @@ -1,53 +1,40 @@ import numpy as np -from wordmodels.utils import transform_query, index_to_term - +from wordmodels.utils import transform_query def term_similarity(wm, term1, term2): - matrix = wm['matrix'] + vectors = wm['vectors'] transformed1 = transform_query(term1) transformed2 = transform_query(term2) - vocab = wm['vocab'] + vocab = vectors.index_to_key if transformed1 in vocab and transformed2 in vocab: - similarity = matrix.similarity(transformed1, transformed2) + similarity = vectors.similarity(transformed1, transformed2) return float(similarity) def find_n_most_similar(wm, query_term, n): - """given a matrix of svd_ppmi or word2vec values + """given vectors of svd_ppmi or word2vec values with its vocabulary and analyzer, determine which n terms match the given query term best """ - vocab = wm['vocab'] transformed_query = transform_query(query_term) - matrix = wm['matrix'] - results = most_similar_items(matrix, vocab, transformed_query, n) + vectors = wm['vectors'] + results = most_similar_items(vectors, transformed_query, n) return [{ 'key': result[0], 'similarity': result[1] } for result in results] -def most_similar_items(matrix, vocab, term, n, missing_terms = 0): +def most_similar_items(vectors, term, n): ''' Find the n most similar terms in a keyed vectors matrix, while filtering on the vocabulary. parameters: - - `matrix`: the KeyedVectors matrix - - `vocab`: the vocabulary for the model. This may be a subst of the keys in `matrix`, so - results will be filtered on vocab. - - `term`: the term for which to find the nearest neighbours. Should already have been - passed through the model's analyzer. 
+ - `vectors`: the KeyedVectors + - `term`: the term for which to find the nearest neighbours, transformed with `transform_query` - `n`: number of neighbours to return - - `missing_terms`: used for recursion. indicates that of the `n` nearest vectors, `missing_terms` vectors - are not actually included in `vocab`, hence we should request `n + missing_terms` vectors ''' - + vocab = vectors.index_to_key if term in vocab: - results = matrix.most_similar(term, topn=n + missing_terms) - filtered_results = [(key, score) for key, score in results if key in vocab] - results_complete = len(filtered_results) == min(n, len(vocab) - 1) - if results_complete: - return filtered_results - else: - delta = n - len(filtered_results) - return most_similar_items(matrix, vocab, term, n, missing_terms=delta + missing_terms) + results = vectors.most_similar(term, topn=n) + return results return [] diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839.w2v b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839.w2v deleted file mode 100644 index 5dcde1905..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839.w2v and /dev/null differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839.wv b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839.wv new file mode 100644 index 000000000..076f7e549 Binary files /dev/null and b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839.wv differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839_vocab.pkl b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839_vocab.pkl deleted file mode 100644 index d2b3fd88d..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1839_vocab.pkl and /dev/null differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full.w2v 
b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full.w2v deleted file mode 100644 index 4d9404b39..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full.w2v and /dev/null differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full_analyzer.pkl b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full_analyzer.pkl deleted file mode 100644 index a5fcea044..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full_analyzer.pkl and /dev/null differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full_vocab.pkl b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full_vocab.pkl deleted file mode 100644 index eaed24b20..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1810_1899_full_vocab.pkl and /dev/null differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869.w2v b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869.w2v deleted file mode 100644 index 6c93f0815..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869.w2v and /dev/null differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869.wv b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869.wv new file mode 100644 index 000000000..8e6f2e4b6 Binary files /dev/null and b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869.wv differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869_vocab.pkl b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869_vocab.pkl deleted file mode 100644 index 211e18fc0..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1840_1869_vocab.pkl and /dev/null differ diff --git 
a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899.w2v b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899.w2v deleted file mode 100644 index 46b3f1406..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899.w2v and /dev/null differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899.wv b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899.wv new file mode 100644 index 000000000..a317cc3ef Binary files /dev/null and b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899.wv differ diff --git a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899_vocab.pkl b/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899_vocab.pkl deleted file mode 100644 index 9fce8fe89..000000000 Binary files a/backend/wordmodels/tests/mock-corpus/mock-word-models/model_1870_1899_vocab.pkl and /dev/null differ diff --git a/backend/wordmodels/tests/mock-corpus/mock_corpus.py b/backend/wordmodels/tests/mock-corpus/mock_corpus.py index e45f24a9d..caa5eaaac 100644 --- a/backend/wordmodels/tests/mock-corpus/mock_corpus.py +++ b/backend/wordmodels/tests/mock-corpus/mock_corpus.py @@ -5,8 +5,8 @@ here = abspath(dirname(__file__)) -class MockCorpus(Corpus): - title = "Mock corpus with SVD_PPMI models" +class WordmodelsMockCorpus(Corpus): + title = "Mock corpus with word models represented as Keyed Vectors" description = "Mock corpus for testing word models, saved as gensim Keyed Vectors" es_index = 'nothing' min_date = datetime.datetime(year=1810, month=1, day=1) @@ -19,3 +19,6 @@ class MockCorpus(Corpus): name = 'content', ) ] + languages = ['en'] + category = 'book' + diff --git a/backend/wordmodels/tests/test_related_words.py b/backend/wordmodels/tests/test_related_words.py index 05a827f2d..831b438e4 100644 --- a/backend/wordmodels/tests/test_related_words.py +++ b/backend/wordmodels/tests/test_related_words.py @@ -4,7 
+4,7 @@ from wordmodels.visualisations import get_diachronic_contexts from wordmodels.conftest import TEST_BINS -def assert_similarity_format(item, must_specify_time = True): +def assert_similarity_format(item, must_specify_time=True): assert 'key' in item and type(item['key']) == str assert 'similarity' in item @@ -24,7 +24,7 @@ def test_context_time_interval(mock_corpus): } term = case.get('term') - _, _, times, results = get_diachronic_contexts(term, mock_corpus, 5) + _, times, results = get_diachronic_contexts(term, mock_corpus, 5) bin_without_match = case.get('bin_without_match') if bin_without_match: @@ -40,7 +40,7 @@ def test_context_time_interval(mock_corpus): # check format for item in context: - assert_similarity_format(item, must_specify_time = False) + assert_similarity_format(item, must_specify_time=False) # check common-sense nearest neighbours @@ -52,11 +52,7 @@ def test_context_time_interval(mock_corpus): assert most_similar_term == case.get('similar1') def test_diachronic_context(mock_corpus): - word_list, word_data, times, _ = get_diachronic_contexts('she', mock_corpus) - # test format - - for item in word_list: - assert_similarity_format(item, must_specify_time=False) + word_data, times, _ = get_diachronic_contexts('she', mock_corpus) for item in word_data: assert_similarity_format(item) diff --git a/backend/wordmodels/tests/test_similarity.py b/backend/wordmodels/tests/test_similarity.py index 6bddd253d..95bb3fa7e 100644 --- a/backend/wordmodels/tests/test_similarity.py +++ b/backend/wordmodels/tests/test_similarity.py @@ -1,5 +1,7 @@ +from copy import deepcopy import numpy as np import pytest +from gensim.models import KeyedVectors from addcorpus.load_corpus import load_corpus import wordmodels.similarity as similarity @@ -14,7 +16,7 @@ def test_term_similarity(mock_corpus): 'uppercase_term': 'She' } corpus = load_corpus(mock_corpus) - binned_models = load_word_models(corpus, True) + binned_models = load_word_models(corpus) model = 
binned_models[0] similarity1 = similarity.term_similarity(model, case['term'], case['similar_term']) @@ -32,7 +34,7 @@ def test_n_nearest_neighbours_amount(mock_corpus): for n in range(1, 16, 5): term = 'elizabeth' corpus = load_corpus(mock_corpus) - binned_models = load_word_models(corpus, True) + binned_models = load_word_models(corpus) model = binned_models[0] result = similarity.find_n_most_similar(model, term, n) @@ -41,21 +43,25 @@ def test_n_nearest_neighbours_amount(mock_corpus): @pytest.fixture def model_with_term_removed(mock_corpus): corpus = load_corpus(mock_corpus) - binned_models = load_word_models(corpus, True) + binned_models = load_word_models(corpus) original_model = binned_models[0] - model = copy(original_model) term = 'darcy' - model['vocab'] = list(model['vocab']) # convert from np.array if needed - model['vocab'].remove(term) + vocab = deepcopy(original_model['vectors'].index_to_key) + vocab.remove(term) - return corpus, model, original_model, term + new_vectors = KeyedVectors(original_model['vectors'].vector_size) + new_vectors.add_vectors( + vocab, [original_model['vectors'][word] for word in vocab]) + new_model = {'vectors': new_vectors} + + return corpus, new_model, original_model, term, vocab def test_vocab_is_subset_of_model(model_with_term_removed): '''Test cases where the vocab array is a subset of terms with vectors.''' - corpus, model, original_model, missing_term = model_with_term_removed - assert missing_term not in model['vocab'] + corpus, model, original_model, missing_term, vocab = model_with_term_removed + assert missing_term not in vocab other_term = 'elizabeth' @@ -77,28 +83,3 @@ def test_vocab_is_subset_of_model(model_with_term_removed): neighbours = similarity.find_n_most_similar(model, similar_term, 10) assert not any([neighbour['key'] == missing_term for neighbour in neighbours]) assert len(neighbours) == 10 - - -class TestMatrix: - """Mock of keyed vectors for term similarity. 
Implements - a most_similar function.""" - - def __init__(self, terms): - self.terms = terms - - def most_similar(self, term, topn=10): - n = min(topn, len(self.terms) - 1) - filtered_terms = list(filter(lambda t: t != term, self.terms)) - index_to_score = lambda index: 1 / (1 + index) - return ((term, index_to_score(i)) for i, term in enumerate(filtered_terms[:n])) - -def test_most_similar_recursion(): - """Test that the recursion in the most_similar_items - function works correctly""" - - terms = [str(i) for i in range(0, 100)] - vocab = [str(i) for i in range(0, 100, 2)] - matrix = TestMatrix(terms) - - neighbours = similarity.most_similar_items(matrix, vocab, '50', 10) - assert len(neighbours) == 10 diff --git a/backend/wordmodels/tests/test_wm_import.py b/backend/wordmodels/tests/test_wm_import.py index 0641e0b6e..e6a36b467 100644 --- a/backend/wordmodels/tests/test_wm_import.py +++ b/backend/wordmodels/tests/test_wm_import.py @@ -3,24 +3,13 @@ import pytest from addcorpus.load_corpus import load_corpus -from wordmodels.utils import load_word_models, word_in_model, transform_query +from wordmodels.utils import load_word_models, word_in_models, transform_query from wordmodels.conftest import TEST_VOCAB_SIZE, TEST_DIMENSIONS, TEST_BINS from wordmodels.utils import load_wm_documentation -def test_complete_import(mock_corpus): +def test_import(mock_corpus): corpus = load_corpus(mock_corpus) - model = load_word_models(corpus) - assert model - - weights = model['matrix'] - assert weights.vector_size == TEST_DIMENSIONS - vocab = model['vocab'] - assert len(vocab) == TEST_VOCAB_SIZE - - -def test_binned_import(mock_corpus): - corpus = load_corpus(mock_corpus) - models = load_word_models(corpus, binned=True) + models = load_word_models(corpus) assert len(models) == len(TEST_BINS) for model, t_bin in zip(models, TEST_BINS): @@ -29,13 +18,13 @@ def test_binned_import(mock_corpus): assert model['start_year'] == start_year assert model['end_year'] == end_year - weights = 
model['matrix'] + weights = model['vectors'] assert weights.vector_size == TEST_DIMENSIONS - vocab = model['vocab'] + vocab = weights.index_to_key assert len(vocab) == TEST_VOCAB_SIZE -def test_word_in_model(mock_corpus): +def test_word_in_models(mock_corpus): cases = [ { 'term': 'she', @@ -57,7 +46,7 @@ def test_word_in_model(mock_corpus): for case in cases: corpus = load_corpus(mock_corpus) - result = word_in_model(case['term'], corpus, 1) + result = word_in_models(case['term'], corpus, 1) assert result == case['expected'] def test_description_import(mock_corpus): diff --git a/backend/wordmodels/tests/test_wordmodels_views.py b/backend/wordmodels/tests/test_wordmodels_views.py index a2be98ae8..8e95187e1 100644 --- a/backend/wordmodels/tests/test_wordmodels_views.py +++ b/backend/wordmodels/tests/test_wordmodels_views.py @@ -1,30 +1,33 @@ import pytest -def test_related_words_view(authenticated_client, mock_corpus): + +def test_related_words_view(admin_client, mock_corpus): query_json = { 'query_term': 'alice', 'corpus_name': mock_corpus, 'neighbours': 5 } - response = authenticated_client.post( + response = admin_client.post( '/api/wordmodels/related_words', query_json, content_type='application/json' ) assert response.status_code == 200 -def test_word_similarity_view(authenticated_client, mock_corpus): + +def test_word_similarity_view(admin_client, mock_corpus): term_1 = 'test' term_2 = 'testing' - response = authenticated_client.get( + response = admin_client.get( f'/api/wordmodels/similarity_over_time?term_1={term_1}&term_2={term_2}&corpus_name={mock_corpus}', content_type='application/json' ) assert response.status_code == 200 -def test_wm_documentation_view(authenticated_client, mock_corpus): - response = authenticated_client.get( + +def test_wm_documentation_view(admin_client, mock_corpus): + response = admin_client.get( f'/api/wordmodels/documentation?corpus={mock_corpus}', content_type='application/json' ) @@ -34,16 +37,18 @@ def 
test_wm_documentation_view(authenticated_client, mock_corpus): assert 'documentation' in data assert data['documentation'] == 'Description for testing.\n' -word_in_model_test_cases = [ + +word_in_models_test_cases = [ ('alice', True), ('Alice', True), ('aalice', False), ] -@pytest.mark.parametrize('term,in_model', word_in_model_test_cases) -def test_word_in_model_view(term, in_model, authenticated_client, mock_corpus): - response = authenticated_client.get( - f'/api/wordmodels/word_in_model?query_term={term}&corpus_name={mock_corpus}', + +@pytest.mark.parametrize('term,in_model', word_in_models_test_cases) +def test_word_in_models_view(term, in_model, admin_client, mock_corpus): + response = admin_client.get( + f'/api/wordmodels/word_in_models?query_term={term}&corpus_name={mock_corpus}', content_type='application/json' ) assert response.status_code == 200 diff --git a/backend/wordmodels/urls.py b/backend/wordmodels/urls.py index 3fa6d1db0..876df2d80 100644 --- a/backend/wordmodels/urls.py +++ b/backend/wordmodels/urls.py @@ -5,5 +5,5 @@ path('related_words', RelatedWordsView.as_view()), path('similarity_over_time', SimilarityView.as_view()), path('documentation', DocumentationView.as_view()), - path('word_in_model', WordInModelView.as_view()), + path('word_in_models', WordInModelView.as_view()), ] diff --git a/backend/wordmodels/utils.py b/backend/wordmodels/utils.py index 56b9092d3..32ee61895 100644 --- a/backend/wordmodels/utils.py +++ b/backend/wordmodels/utils.py @@ -10,59 +10,39 @@ from glob import glob -def load_word_models(corpus, binned=False): +def load_word_models(corpus): if type(corpus)==str: corpus = load_corpus(corpus) - w2v_list = glob('{}/*.w2v'.format(corpus.word_model_path)) - full_model = next((item for item in w2v_list if item.endswith('full.w2v')), None) - try: - w2v_list.remove(full_model) - except: - raise(Exception("No full word model found for this corpus.")) - if binned: - w2v_list.sort() - wm = [ - { - "start_year": get_year(wm_file, 1), 
- "end_year": get_year(wm_file, 2), - "matrix": KeyedVectors.load_word2vec_format(wm_file, binary=True), - "vocab": get_vocab(wm_file) - } - for wm_file in w2v_list - ] - else: - model = KeyedVectors.load_word2vec_format(full_model, binary=True) - wm = { - "start_year": get_year(full_model, 1), - "end_year": get_year(full_model, 2), - "matrix": model, - "vocab": get_vocab(full_model) - } + wv_list = glob('{}/*.wv'.format(corpus.word_model_path)) + wv_list.sort() + wm = [ + { + "start_year": get_year(wm_file, 1), + "end_year": get_year(wm_file, 2), + "vectors": KeyedVectors.load(wm_file), + } + for wm_file in wv_list + ] return wm -def get_vocab(kv_filename): - vocab_name = '{}_vocab.pkl'.format(splitext(kv_filename)[0]) - with open(vocab_name, 'rb') as f: - return pickle.load(f) - def get_year(kv_filename, position): return int(splitext(basename(kv_filename))[0].split('_')[position]) -def word_in_model(query_term, corpus, max_distance = 2): - model = load_word_models(corpus) - vocab = model['vocab'] +def word_in_models(query_term, corpus, max_distance=2): + models = load_word_models(corpus) transformed_query = transform_query(query_term) - - if transformed_query in model['vocab']: + vocab = set() + for model in models: + vocab.update(model['vectors'].index_to_key) + if transformed_query in list(vocab): return { 'exists': True } - else: - is_similar = lambda term : damerau_levenshtein(query_term, term) <= max_distance - similar_keys = [term for term in vocab if is_similar(term)] - - return { - 'exists': False, - 'similar_keys': similar_keys - } + # if word is not in vocab, search for close matches + is_similar = lambda term : damerau_levenshtein(query_term, term) <= max_distance + similar_keys = [term for term in list(vocab) if is_similar(term)] + return { + 'exists': False, + 'similar_keys': similar_keys + } def load_wm_documentation(corpus_string): diff --git a/backend/wordmodels/views.py b/backend/wordmodels/views.py index 693c2b6b4..c564f7561 100644 --- 
a/backend/wordmodels/views.py +++ b/backend/wordmodels/views.py @@ -25,10 +25,9 @@ def post(self, request, *args, **kwargs): raise APIException(detail=results) else: return Response({ - 'total_similarities': results[0], - 'similarities_over_time': results[1], - 'similarities_over_time_local_top_n': results[3], - 'time_points': results[2] + 'similarities_over_time': results[0], + 'similarities_over_time_local_top_n': results[2], + 'time_points': results[1] }) class SimilarityView(APIView): @@ -77,7 +76,7 @@ def get(self, request, *args, **kwargs): corpus = corpus_name_from_request(request) query_term = request.query_params.get('query_term') - results = utils.word_in_model(query_term, corpus) + results = utils.word_in_models(query_term, corpus) if isinstance(results, str): # the method returned an error string diff --git a/backend/wordmodels/visualisations.py b/backend/wordmodels/visualisations.py index cd07a4272..4eb865f80 100644 --- a/backend/wordmodels/visualisations.py +++ b/backend/wordmodels/visualisations.py @@ -1,3 +1,7 @@ +from functools import reduce +from operator import concat +import pandas as pd + from addcorpus.load_corpus import load_corpus from wordmodels.similarity import find_n_most_similar, term_similarity from wordmodels.utils import load_word_models @@ -7,16 +11,16 @@ def get_similarity_over_time(query_term, comparison_term, corpus_string): corpus = load_corpus(corpus_string) - binned = load_word_models(corpus, True) + wm_list = load_word_models(corpus) data = [ term_similarity( time_bin, query_term, comparison_term ) - for time_bin in binned + for time_bin in wm_list ] - time_labels = get_time_labels(binned) + time_labels = get_time_labels(wm_list) similarities = [ { @@ -38,17 +42,20 @@ def get_time_labels(binned_model): def get_diachronic_contexts(query_term, corpus_string, number_similar=NUMBER_SIMILAR): corpus = load_corpus(corpus_string) - complete = load_word_models(corpus) - binned = load_word_models(corpus, binned=True) - word_list = 
find_n_most_similar( - complete, - query_term, - number_similar) - if not word_list: - return "The query term is not in the word models' vocabulary. \ - Is your query field empty, does it contain multiple words, or did you search for a stop word?" - times = get_time_labels(binned) - words = [word['key'] for word in word_list] + wm_list = load_word_models(corpus) + times = get_time_labels(wm_list) + data_per_timeframe = [ + find_n_most_similar(time_bin, query_term, number_similar) + for time_bin in wm_list + ] + flattened_data = reduce(concat, data_per_timeframe) + all_words = list(set([item.get('key') for item in flattened_data])) + frequencies = {word: [] for word in all_words} + for item in flattened_data: + frequencies[item['key']].append(item['similarity']) + max_similarities = pd.DataFrame({'word': all_words, 'max': [max(f) for f in frequencies.values()]}) + words = max_similarities.nlargest(number_similar, 'max')['word'] + get_similarity = lambda word, time_bin: term_similarity( time_bin, query_term, @@ -61,11 +68,6 @@ def get_diachronic_contexts(query_term, corpus_string, number_similar=NUMBER_SIM 'similarity': get_similarity(word, time_bin), 'time': time_label } - for (time_label, time_bin) in zip(times, binned) for word in words] - - data_per_timeframe = [ - find_n_most_similar(time_bin, query_term, number_similar) - for time_bin in binned - ] + for (time_label, time_bin) in zip(times, wm_list) for word in words] - return word_list, word_data, times, data_per_timeframe + return word_data, times, data_per_timeframe diff --git a/documentation/Adding-word-models.md b/documentation/Adding-word-models.md index 213490b5a..8d420c70c 100644 --- a/documentation/Adding-word-models.md +++ b/documentation/Adding-word-models.md @@ -4,11 +4,9 @@ Corpora have the option to include word vectors. 
I-analyzer visualisations are b ## Expected file format Word embeddings are expected to come with the following files: -- `_full.w2v` (contains gensim KeyedVectors for a model trained on the whole time period) -- `_full_vocab.pkl` (contains a list of terms present in the word vectors of the whole time period) +- `_full.wv` (contains gensim KeyedVectors for a model trained on the whole time period) For each time bin, it expects files of the format -- `_{startYear}_{endYear}.w2v` (contains gensim KeyedVectors for a model trained on the time bin) -- `_{startYear}_{endYear}_vocab.pkl` (contains a list of terms present in the word vectors of the time bin) +- `_{startYear}_{endYear}.wv` (contains gensim KeyedVectors for a model trained on the time bin) ## Documentation Please include documentation on the method and settings used to train a model. This documentation is expected to be located in `wm/documentation.md`, next to the corpus definition that includes word models. diff --git a/documentation/Defining-corpus-fields.md b/documentation/Defining-corpus-fields.md index 0d2e722fd..b53d8dab2 100644 --- a/documentation/Defining-corpus-fields.md +++ b/documentation/Defining-corpus-fields.md @@ -9,7 +9,8 @@ Various classes are defined in `backend/addcorpus/extract.py`. - The extractors `XML`, `HTML` and `CSV` are intended to extract values from the document type of your corpus. Naturally, `XML` is only available for `XMLCorpus`, et cetera. All other extractors are available for all corpora. - The `Metadata` extractor is used to collect any information that you passed on during file discovery, such as information based on the file path. - The `Constant` extractor can be used to define a constant value. -- The `Choice` and `Combined`, and `Backup` extractors can be used to combine multiple extractors. +- The `Order` extractor gives you the index of that document within the file. 
+- The `Choice` and `Combined`, `Backup`, and `Pass` extractors can be used to combine multiple extractors. A field can have the property `required = True`, which means the document will not be added to the index if the extracted value for this field is falsy. @@ -46,9 +47,9 @@ The following properties determine how a field appears in the interface. `search_filter` can be set if the interface should include a search filter widget for the field. I-analyzer includes date filters, multiplechoice filters (used for keyword data), range filters, and boolean filters. See [filters.py](../backend/addcorpus/filters.py). -`visualizations` optionally specifies a list of visualisations that apply for the field. Generally speaking, this is based on the type of data. For date fields and categorical/ordinal fields (usually keyword type), you can use `['resultcount', 'termfrequency']`. For text fields, you can use `['wordcloud', 'ngram']`. +`visualizations` optionally specifies a list of visualisations that apply for the field. Generally speaking, this is based on the type of data. For date fields and categorical/ordinal fields (usually keyword type), you can use `['resultscount', 'termfrequency']`. For text fields, you can use `['wordcloud', 'ngram']`. -If a field includes the `'resultcount' and/or `'termfrequency'` visualisations and it is not a date field, you can also specify `visualisation_sort`, which determines how to sort the x-axis of the graph. Default is `'value'`, where categories are sorted based on the y-axis value (i.e., frequency). You may specify that they should be sorted on `'key'`, so that categories are sorted alphabetically (for keywords) or small-to-large (for numbers). +If a field includes the `'resultscount'` and/or `'termfrequency'` visualisations and it is not a date field, you can also specify `visualisation_sort`, which determines how to sort the x-axis of the graph. 
Default is `'value'`, where categories are sorted based on the y-axis value (i.e., frequency). You may specify that they should be sorted on `'key'`, so that categories are sorted alphabetically (for keywords) or small-to-large (for numbers). `search_field_core` determines if a field is listed by default when selecting specific fields to search in. If it is not set to `True`, the user would have to click on "show all fields" to see it. diff --git a/documentation/Email.md new file mode 100644 index 000000000..3abf52af4 --- /dev/null +++ b/documentation/Email.md @@ -0,0 +1,13 @@ +## Email + +The backend sends emails about account administration (verifying emails, resetting passwords), and downloads. + +By default, the backend will use the django console backend for emails, so any outgoing mail will be displayed on your console. + +If you want to use a server like [maildev](https://maildev.github.io/maildev/), you can configure a different email backend in your local settings, for example: + +```python +EMAIL_BACKEND = 'django.core.mail.backends.smtp.EmailBackend' +EMAIL_HOST = '0.0.0.0' +EMAIL_PORT = '1025' +``` diff --git a/documentation/How-to-add-a-new-corpus-to-Ianalyzer.md index 760aa2e1c..ad96f1553 100644 --- a/documentation/How-to-add-a-new-corpus-to-Ianalyzer.md +++ b/documentation/How-to-add-a-new-corpus-to-Ianalyzer.md @@ -14,6 +14,8 @@ The corpus class should define the following properties: - `es_index`: the name of the index in elasticsearch. - `image`: a path or url to the image used for the corpus in the interface. - `fields`: a list of `Field` objects. See [defining corpus fields](./Defining-corpus-fields.md). +- `languages`: a list of ISO 639 codes of the languages used in your corpus. Corpus languages are intended as a way for users to select interesting datasets, so only include languages for which your corpus contains a meaningful amount of data. 
The list should go from most to least frequent. +- `category`: the type of data in the corpus. The list of options is in `backend/addcorpus/constants`. The following properties are optional: - `es_alias`: an alias for the index in elasticsearch. diff --git a/documentation/Indexing-corpora.md b/documentation/Indexing-corpora.md new file mode 100644 index 000000000..fdbadbf22 --- /dev/null +++ b/documentation/Indexing-corpora.md @@ -0,0 +1,37 @@ +# Indexing corpora + +Indexing is the step to load corpus data into elasticsearch, which makes the data available through the I-analyzer interface. + +You can start indexing once you have +- A python source file for the corpus +- A directory with source data +- Added the necessary properties to django settings + +The basic indexing command is: + +```bash +yarn django index my-corpus +``` + +Use `yarn django index --help` to see all possible flags. Some useful options are highlighted below. + +## Development + +For development environments, we usually maintain a single index per corpus, rather than creating versioned indices. New indices are also created with `number_of_replicas` set to 0 (this is to make index creation easier/lighter). + +Some options that may be useful for development: + +### Delete index before starting + +`--delete` / `-d` deletes an existing index of this name, if there is one. Without this flag, you will add your data to the existing index. + +### Date selection + +`--start` / `-s` and `--end` / `-e` respectively give a start and end date to select source files. Note that this only works if the `sources` function in your corpus definition makes use of these options; not all corpora have this defined. (It is not always possible to infer exact dates from source file metadata.) + +The filtering of source files may not be exact (e.g. only take the year into account). These flags do *not* filter documents based on their contents. 
+ + +## Production + +See [Indexing on server](documentation/Indexing-on-server.md) for more information about production-specific settings. diff --git a/documentation/Indexing-on-server.md b/documentation/Indexing-on-server.md index 95fe96137..4493bfc68 100644 --- a/documentation/Indexing-on-server.md +++ b/documentation/Indexing-on-server.md @@ -1,5 +1,8 @@ # Indexing corpora on the server +For production environments, we use *versioned* index names (e.g. `times-1`, `times-2`), and use an alias (e.g. `times`) to point to the correct version. The advantage of this approach is that an old version of the index can be kept in place as long as is needed, for example while a new version of the index is created. + + ## Moving data to server On the server, move data to a location in the `/its` share. @@ -14,14 +17,20 @@ Start a screen with a descriptive name (e.g., `screen -S index-superb-corpus`). Call the flask command for indexing, e.g., `yarn django index superb-corpus -p`. The production flag indicates that we have a *versioned* index after this: `superb-corpus-1`. You can also choose to add the `--rollover` (`-r`) flag: this is equivalent with automaticaly calling `yarn django alias` after `yarn django index`. As it's advisable to double-check a new index before setting / rolling over the alias, this flag should be used with caution. ## Additional indexing flags + It is also possible to only add settings and mappings by providing the `--mappings-only` or `-m` flag. This is useful, for instance, before a `REINDEX` via Kibana from one Elasticsearch cluster to another (which is often faster than reindexing from source). +`--update` / `-u` can be used to run an update script for the corpus. This requires an `update_body` or `update_script` to be set in the corpus definition, see [example for update_body in dutchnewspapers](backend/corpora/dutchnewspapers/dutchnewspapers_all.py) and [example for update_script in goodreads](backend/corpora/goodreads/goodreads.py). 
+ + ## Alias Either: - create an alias `superb-corpus` on Kibana manually: `PUT suberb-corpus-1/_alias/superb-corpus`. After this, the corpus will be reachable under the alias. - or: run `yarn django alias superb-corpus-name`. This will set an alias with the name defined by `es_alias` or (fallback) `es_index`. If you additionally provide the `--clean` flag, this will also remove the index with the lower version number. Naturally, this should only be used if the new index version has the expected number of documents, fields, etc., and the old index version is fully dispensable. +Note that removing an alias does not remove the index itself, but removing an index also removes any existing aliases for it. + ## Indexing from multiple corpus definitions If you have separate datasets for different parts of a corpus, you may combine them by setting the `ES_INDEX` variable in the corpus definitions to the same `overarching-corpus` index name. diff --git a/documentation/Migration-from-flask.md b/documentation/Migration-from-flask.md index a4f60fda5..2f4be1d4a 100644 --- a/documentation/Migration-from-flask.md +++ b/documentation/Migration-from-flask.md @@ -79,19 +79,7 @@ Regarding the directory: - In production, the location of the flask migration is stored in the django settings. Use `directory = settings.FLASK_MIGRATION_DATA` - If `directory` does not exist or does not contain relevant files, the script will not import anything. -The script expects to run on an **empty** database, as it will also copy object IDs. This means that if the script fails halfway through, you will need to reset the database before you can re-attempt. 
You can use the following script in the shell to do so: - -``` -from django.contrib.auth.models import Group -from users.models import CustomUser -from addcorpus.models import Corpus - -Group.objects.all().delete() -CustomUser.objects.all().delete() -Corpus.objects.all().delete() -``` - -Objects in other tables (such as the search history) will be deleted through cascade. +The script expects to run on an **empty** database, as it will also copy object IDs. This means that if the script fails halfway through, you will need to reset the database before you can re-attempt. You can do this from the command line with `yarn django flush`. ### Update object IDs @@ -108,10 +96,12 @@ python manage.py sqlsequencereset download | python manage.py dbshell In `backend/ianalyzer`, make a file `settings_local.py`. Transfer relevant local settings you had configured in your `config.py` file for Flask. +Note that the new `settings_local` does not need all the information you had provided in `config`. For a development environment, it is probably sufficient to simply specify the `CORPORA`, and the locations of corpus source data and word model files. + ## Transfer downloads In the flask backend, the default storage location for CSV files was `/backend/api/csv_files/`. In a development environment, the new default location is `/backend/download/csv_files/`. (This can be configured in settings.) You will have to move the contents of your CSV directory here if you want to keep your download history. -For a production environment, the csv files need to be moved from the old flask server to the new django server. Check the deployment settings for the new location of the downloads. (This should be outside of the repository.) +For a production environment, the csv files need to be moved from the old flask server to the new django server, if you are also moving servers. Check the deployment settings for the new location of the downloads. (This should be outside of the repository.) 
diff --git a/documentation/Overview.md index b4964d10a..f8118bd2b 100644 --- a/documentation/Overview.md +++ b/documentation/Overview.md @@ -2,6 +2,28 @@ The application consists of a backend, implemented in [Django](https://www.djangoproject.com/) and a frontend implemented in [Angular](https://angular.io/). +## Directory structure + +The I-analyzer backend (`/backend`) is a python/Django app that provides the following functionality: + +- A 'users' module that defines user accounts. + +- A 'corpora' module containing corpus definitions and metadata of the currently implemented corpora. For each corpus added in I-analyzer, this module defines how to extract document contents from its source files and sets parameters for displaying the corpus in the interface, such as sorting options. + +- An 'addcorpus' module which manages the functionality to extract data from corpus source files (given the definition) and save this in an elasticsearch index. Source files can be XML or HTML format (which are parsed with `beautifulsoup4` + `lxml`) or CSV. This module also provides the basic data structure for corpora. + +- An 'es' module which handles the communication with elasticsearch. The data is passed through to the index using the `elasticsearch` package for Python (note that `elasticsearch-dsl` is not used, since its [documentation](https://elasticsearch-dsl.readthedocs.io/en/latest) at the time seemed less immediately accessible than the [low-level](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html) version). + +- An 'api' module that enables users to search through an ElasticSearch index of a text corpus and stream search results into a CSV file. The module also performs more complex analysis of search results for visualisations. + +- A 'visualizations' module that does the analysis for several types of text-based visualisations. + +- A 'downloads' module that collects results into csv files. 
+ +- A 'wordmodels' module that handles functionality related to word embeddings. + +`ianalyzer/frontend` is an [Angular 13](https://angular.io/) web interface. + # Backend The backend has three responsibilities: diff --git a/frontend/src/_utilities.scss b/frontend/src/_utilities.scss index fa284f0df..225735d40 100644 --- a/frontend/src/_utilities.scss +++ b/frontend/src/_utilities.scss @@ -9,6 +9,7 @@ $primary: #303F9F; $contrast-primary-color: #3F51B5; +$contrast-primary-accent-color: #495cc9; $text-primary-color: #FFFFFF; $highlight-color: #D6D6D6; diff --git a/frontend/src/app/app.module.ts b/frontend/src/app/app.module.ts index 9a43978aa..0fd77585b 100644 --- a/frontend/src/app/app.module.ts +++ b/frontend/src/app/app.module.ts @@ -27,7 +27,7 @@ import { PdfViewerModule } from 'ng2-pdf-viewer'; import { CookieService } from 'ngx-cookie-service'; import { ApiService, ApiRetryService, CorpusService, DialogService, DownloadService, - ElasticSearchService, ParamService, HighlightService, NotificationService, SearchService, SessionService, + ElasticSearchService, HighlightService, NotificationService, SearchService, SessionService, UserService, QueryService } from './services/index'; import { AppComponent } from './app.component'; @@ -88,6 +88,11 @@ import { DownloadOptionsComponent } from './download/download-options/download-o import { JoyplotComponent } from './visualization/ngram/joyplot/joyplot.component'; import { VerifyEmailComponent } from './login/verify-email/verify-email.component'; import { DocumentPageComponent } from './document-page/document-page.component'; +import { CorpusSelectorComponent } from './corpus-selection/corpus-selector/corpus-selector.component'; +import { CorpusFilterComponent } from './corpus-selection/corpus-filter/corpus-filter.component'; +import { DatePickerComponent } from './corpus-selection/corpus-filter/date-picker/date-picker.component'; +import { CorpusInfoComponent } from './corpus-info/corpus-info.component'; +import { 
FieldInfoComponent } from './corpus-info/field-info/field-info.component'; export const appRoutes: Routes = [ @@ -101,6 +106,11 @@ export const appRoutes: Routes = [ component: WordModelsComponent, canActivate: [CorpusGuard, LoggedOnGuard], }, + { + path: 'info/:corpus', + component: CorpusInfoComponent, + canActivate: [CorpusGuard, LoggedOnGuard] + }, { path: 'document/:corpus/:id', component: DocumentPageComponent, @@ -169,8 +179,12 @@ export const declarations: any[] = [ BalloonDirective, BarchartOptionsComponent, BooleanFilterComponent, + CorpusFilterComponent, CorpusHeaderComponent, + CorpusInfoComponent, CorpusSelectionComponent, + CorpusSelectorComponent, + DatePickerComponent, DateFilterComponent, DialogComponent, DocumentPageComponent, @@ -181,6 +195,7 @@ export const declarations: any[] = [ DropdownComponent, ErrorComponent, FilterManagerComponent, + FieldInfoComponent, FooterComponent, FreqtableComponent, FullDataButtonComponent, @@ -280,7 +295,6 @@ export const providers: any[] = [ ElasticSearchService, HighlightService, NotificationService, - ParamService, QueryService, SearchService, SessionService, diff --git a/frontend/src/app/corpus-header/corpus-header.component.html b/frontend/src/app/corpus-header/corpus-header.component.html index 7a3463acd..6902541c1 100644 --- a/frontend/src/app/corpus-header/corpus-header.component.html +++ b/frontend/src/app/corpus-header/corpus-header.component.html @@ -7,27 +7,12 @@

Search Word models of Document in + About “{{corpus.title}}”

-
- - -
-
+
diff --git a/frontend/src/app/corpus-header/corpus-header.component.ts b/frontend/src/app/corpus-header/corpus-header.component.ts index b2f8630a7..1cc8fe988 100644 --- a/frontend/src/app/corpus-header/corpus-header.component.ts +++ b/frontend/src/app/corpus-header/corpus-header.component.ts @@ -1,8 +1,6 @@ import { Component, Input, OnChanges, OnInit, SimpleChanges } from '@angular/core'; -import { ActivatedRoute } from '@angular/router'; -import { faDiagramProject, faInfoCircle, faMagnifyingGlass } from '@fortawesome/free-solid-svg-icons'; +import { faDiagramProject, faInfo, faMagnifyingGlass } from '@fortawesome/free-solid-svg-icons'; import { Corpus } from '../models'; -import { DialogService } from '../services'; @Component({ selector: 'ia-corpus-header', @@ -11,17 +9,15 @@ import { DialogService } from '../services'; }) export class CorpusHeaderComponent implements OnChanges, OnInit { @Input() corpus: Corpus; - @Input() currentPage: 'search'|'word-models'|'document'; - @Input() modelDocumentation: string; + @Input() currentPage: 'search'|'word-models'|'document'|'info'; searchIcon = faMagnifyingGlass; wordModelsIcon = faDiagramProject; + infoIcon = faInfo; wordModelsPresent: boolean; - faInfo = faInfoCircle; - - constructor(private dialogService: DialogService) { + constructor() { } ngOnInit() { @@ -33,17 +29,4 @@ export class CorpusHeaderComponent implements OnChanges, OnInit { this.wordModelsPresent = this.corpus.word_models_present; } } - - public showCorpusInfo(corpus: Corpus) { - this.dialogService.showDescriptionPage(corpus); - } - - public showModelInfo() { - this.dialogService.showDocumentation( - this.corpus.name + '_wm', - `Word models of ${this.corpus.title}`, - this.modelDocumentation, - ); - } - } diff --git a/frontend/src/app/corpus-info/corpus-info.component.html b/frontend/src/app/corpus-info/corpus-info.component.html new file mode 100644 index 000000000..d97f0a875 --- /dev/null +++ b/frontend/src/app/corpus-info/corpus-info.component.html @@ 
-0,0 +1,48 @@ + + +
+
+
+
+
+

{{corpus.description}}

+
+ +
+
+ {{corpus.title}} +
+
+
+
+ +
+
+ + + +
+ +
+
+ +
+
+ +
+
+
+
diff --git a/frontend/src/app/corpus-info/corpus-info.component.scss b/frontend/src/app/corpus-info/corpus-info.component.scss new file mode 100644 index 000000000..72d1d95c3 --- /dev/null +++ b/frontend/src/app/corpus-info/corpus-info.component.scss @@ -0,0 +1,13 @@ +.heading { + font-size: 14px; +} + +.metadata { + letter-spacing: 1px; + text-transform: uppercase; + font-size: small; + + p { + margin-bottom: 0.25rem; + } +} diff --git a/frontend/src/app/corpus-info/corpus-info.component.spec.ts b/frontend/src/app/corpus-info/corpus-info.component.spec.ts new file mode 100644 index 000000000..7ec500d61 --- /dev/null +++ b/frontend/src/app/corpus-info/corpus-info.component.spec.ts @@ -0,0 +1,24 @@ +import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; + +import { CorpusInfoComponent } from './corpus-info.component'; +import { commonTestBed } from '../common-test-bed'; + +describe('CorpusInfoComponent', () => { + let component: CorpusInfoComponent; + let fixture: ComponentFixture; + + beforeEach(waitForAsync(() => { + commonTestBed().testingModule.compileComponents(); + })); + + + beforeEach(() => { + fixture = TestBed.createComponent(CorpusInfoComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/corpus-info/corpus-info.component.ts b/frontend/src/app/corpus-info/corpus-info.component.ts new file mode 100644 index 000000000..a1fe9b065 --- /dev/null +++ b/frontend/src/app/corpus-info/corpus-info.component.ts @@ -0,0 +1,65 @@ +import { Component, OnInit } from '@angular/core'; +import { ApiService, CorpusService, WordmodelsService } from '../services'; +import { Corpus, FieldCoverage } from '../models'; +import { marked } from 'marked'; +import { BehaviorSubject } from 'rxjs'; + +@Component({ + selector: 'ia-corpus-info', + templateUrl: './corpus-info.component.html', + styleUrls: 
['./corpus-info.component.scss'] +}) +export class CorpusInfoComponent implements OnInit { + corpus: Corpus; + + description: string; + wordModelDocumentation: string; + fieldCoverage: FieldCoverage; + + tabs = [ + { + name: 'general', + title: 'General information', + property: 'descriptionpage', + }, { + name: 'fields', + title: 'Fields', + property: 'fields', + }, { + name: 'models', + title: 'Word models', + property: 'word_models_present', + } + ]; + + currentTab = new BehaviorSubject<'general'|'fields'|'models'>( + 'general' + ); + + constructor(private corpusService: CorpusService, private apiService: ApiService, private wordModelsService: WordmodelsService) { } + + ngOnInit(): void { + this.corpusService.currentCorpus.subscribe(this.setCorpus.bind(this)); + } + + setCorpus(corpus: Corpus) { + this.corpus = corpus; + if (corpus.descriptionpage) { + this.apiService.corpusdescription({filename: corpus.descriptionpage, corpus: corpus.name}) + .then(marked.parse) + .then(doc => this.description = doc); + } else { + this.currentTab.next('fields'); + } + this.apiService.fieldCoverage(corpus.name).then( + result => this.fieldCoverage = result + ); + if (this.corpus.word_models_present) { + this.wordModelsService.wordModelsDocumentationRequest({corpus_name: this.corpus.name}) + .then(result => marked.parse(result.documentation)) + .then(doc => this.wordModelDocumentation = doc); + } + } + + +} diff --git a/frontend/src/app/corpus-info/field-info/field-info.component.html b/frontend/src/app/corpus-info/field-info/field-info.component.html new file mode 100644 index 000000000..c50735816 --- /dev/null +++ b/frontend/src/app/corpus-info/field-info/field-info.component.html @@ -0,0 +1,44 @@ +
+ +
+

{{field.displayName}}

+

{{field.description}}

+
+
+ +
+ +
+

Type of data: {{mappingNames[field.mappingType]}}

+ +
    +
  • This field {{field.searchable ? 'can' : 'cannot'}} be searched
  • +
  • This field {{field.searchFilter ? 'has' : 'does not have'}} a search filter
  • +
+ + +

+ This field supports the following options for text analysis: +

+
    +
  • + Counting the total number of words, and calculating term frequencies relative to the total word count. +
  • +
  • + Removing stopwords +
  • +
  • + Stemming +
  • +
+
+ +

+ {{coveragePercentage}}% of the documents in this corpus have a value for this field +

+ +

+ Loading coverage data... +

+
+
diff --git a/frontend/src/app/corpus-info/field-info/field-info.component.scss b/frontend/src/app/corpus-info/field-info/field-info.component.scss new file mode 100644 index 000000000..77d11fcec --- /dev/null +++ b/frontend/src/app/corpus-info/field-info/field-info.component.scss @@ -0,0 +1,9 @@ +.foldable-header { + display: inline-block; + vertical-align: text-top; + margin-left: 0.5rem; +} + +summary { + cursor: pointer; +} diff --git a/frontend/src/app/corpus-info/field-info/field-info.component.spec.ts b/frontend/src/app/corpus-info/field-info/field-info.component.spec.ts new file mode 100644 index 000000000..0431bb47d --- /dev/null +++ b/frontend/src/app/corpus-info/field-info/field-info.component.spec.ts @@ -0,0 +1,23 @@ +import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; + +import { FieldInfoComponent } from './field-info.component'; +import { commonTestBed } from '../../common-test-bed'; + +describe('FieldInfoComponent', () => { + let component: FieldInfoComponent; + let fixture: ComponentFixture; + + beforeEach(waitForAsync(() => { + commonTestBed().testingModule.compileComponents(); + })); + + beforeEach(() => { + fixture = TestBed.createComponent(FieldInfoComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/corpus-info/field-info/field-info.component.ts b/frontend/src/app/corpus-info/field-info/field-info.component.ts new file mode 100644 index 000000000..d7add40e7 --- /dev/null +++ b/frontend/src/app/corpus-info/field-info/field-info.component.ts @@ -0,0 +1,36 @@ +import { Component, Input, OnInit } from '@angular/core'; +import { CorpusField } from '../../models'; +import * as _ from 'lodash'; + +@Component({ + selector: 'ia-field-info', + templateUrl: './field-info.component.html', + styleUrls: ['./field-info.component.scss'] +}) +export class FieldInfoComponent implements OnInit { + 
@Input() field: CorpusField; + @Input() coverage: number; + + mappingNames = { + text: 'text', + keyword: 'categorical', + integer: 'numeric', + float: 'numeric', + date: 'date', + boolean: 'binary' + }; + + constructor() { } + + get coveragePercentage() { + if (this.coverage) { + return (this.coverage * 100).toPrecision(3); + } else { + return this.coverage; // return undefined or 0 as-is + } + } + + ngOnInit(): void { + } + +} diff --git a/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.html b/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.html new file mode 100644 index 000000000..c5f973119 --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.html @@ -0,0 +1,46 @@ +
+
+
+ +
+ +
+
+
+ +
+ +
+
+
+ +
+
+
+ +
+
+ - +
+
+ +
+
+
+
+
+
+ +
+
diff --git a/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.scss b/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.scss new file mode 100644 index 000000000..a5a02a474 --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.scss @@ -0,0 +1,8 @@ +@import "../../../_utilities"; + +::ng-deep .p-datepicker { + margin: 1px 0; + .p-highlight { + background-color: $primary !important; + }; +} diff --git a/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.spec.ts b/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.spec.ts new file mode 100644 index 000000000..43added3b --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.spec.ts @@ -0,0 +1,59 @@ +import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; + +import { CorpusFilterComponent } from './corpus-filter.component'; +import { commonTestBed } from '../../common-test-bed'; +import { mockCorpus, mockCorpus2 } from '../../../mock-data/corpus'; +import { Corpus } from '../../models'; + +describe('CorpusFilterComponent', () => { + let component: CorpusFilterComponent; + let fixture: ComponentFixture; + + beforeEach(waitForAsync(() => { + commonTestBed().testingModule.compileComponents(); + })); + + beforeEach(() => { + fixture = TestBed.createComponent(CorpusFilterComponent); + component = fixture.componentInstance; + component.corpora = [mockCorpus, mockCorpus2]; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); + + it('should filter corpora', async () => { + let result: Corpus[]; + component.filtered.subscribe(data => result = data); + + const filterResult = async (language: string, category: string, minDate: Date, maxDate: Date) => { + component.selectedLanguage.next(language); + component.selectedCategory.next(category); + component.selectedMinDate.next(minDate); + 
component.selectedMaxDate.next(maxDate); + await fixture.whenStable(); + return result; + }; + + expect(await filterResult('English', undefined, undefined, undefined)) + .toEqual([mockCorpus, mockCorpus2]); + expect(await filterResult('French', undefined, undefined, undefined)) + .toEqual([mockCorpus2]); + expect(await filterResult(undefined, undefined, undefined, undefined)) + .toEqual([mockCorpus, mockCorpus2]); + expect(await filterResult(undefined, 'Tests', undefined, undefined)) + .toEqual([mockCorpus]); + expect(await filterResult('French', 'Different tests', undefined, undefined)) + .toEqual([mockCorpus2]); + expect(await filterResult('French', 'Tests', undefined, undefined)) + .toEqual([]); + expect(await filterResult(undefined, undefined, new Date('1920-01-01'), undefined)) + .toEqual([mockCorpus2]); + expect(await filterResult(undefined, undefined, new Date('1820-01-01'), undefined)) + .toEqual([mockCorpus, mockCorpus2]); + expect(await filterResult(undefined, undefined, undefined, new Date('1830-01-01'))) + .toEqual([mockCorpus]); + }); +}); diff --git a/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.ts b/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.ts new file mode 100644 index 000000000..c9fb4068a --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-filter/corpus-filter.component.ts @@ -0,0 +1,99 @@ +import { Component, Input, OnInit, Output } from '@angular/core'; +import { Corpus } from '../../models'; +import { BehaviorSubject, Subject, combineLatest } from 'rxjs'; +import * as _ from 'lodash'; +import { faTimes } from '@fortawesome/free-solid-svg-icons'; +import { map } from 'rxjs/operators'; +import { Observable } from 'rxjs-compat'; + +@Component({ + selector: 'ia-corpus-filter', + templateUrl: './corpus-filter.component.html', + styleUrls: ['./corpus-filter.component.scss'] +}) +export class CorpusFilterComponent implements OnInit { + @Input() corpora: Corpus[]; + @Output() filtered = 
new Subject(); + + selectedLanguage = new BehaviorSubject(undefined); + selectedCategory = new BehaviorSubject(undefined); + selectedMinDate = new BehaviorSubject(undefined); + selectedMaxDate = new BehaviorSubject(undefined); + + selection: [BehaviorSubject, BehaviorSubject, BehaviorSubject, BehaviorSubject] + = [this.selectedLanguage, this.selectedCategory, this.selectedMinDate, this.selectedMaxDate]; + + canReset: Observable = combineLatest(this.selection).pipe( + map(values => _.some(values, value => !_.isUndefined(value))) + ); + + resetIcon = faTimes; + + constructor() { } + + get minDate(): Date { + if (this.corpora) { + const dates = this.corpora.map(corpus => corpus.minDate); + return _.min(dates); + } + } + + get maxDate(): Date { + if (this.corpora) { + const dates = this.corpora.map(corpus => corpus.maxDate); + return _.max(dates); + } + } + + + get languages(): string[] { + return this.collectOptions('languages'); + } + + get categories(): string[] { + return this.collectOptions('category'); + } + + ngOnInit(): void { + combineLatest(this.selection).subscribe(values => this.filterCorpora(...values)); + } + + collectOptions(property): string[] { + const values = _.flatMap( + this.corpora || [], + property + ) as string[]; + return _.uniq(values).sort(); + } + + filterCorpora(language?: string, category?: string, minDate?: Date, maxDate?: Date): void { + if (this.corpora) { + const filter = this.corpusFilter(language, category, minDate, maxDate); + const filtered = this.corpora.filter(filter); + this.filtered.next(filtered); + } + } + + corpusFilter(language?: string, category?: string, minDate?: Date, maxDate?: Date): ((a: Corpus) => boolean) { + return (corpus) => { + if (language && !corpus.languages.includes(language)) { + return false; + } + if (category && corpus.category !== category) { + return false; + } + if (minDate && corpus.maxDate < minDate) { + return false; + } + if (maxDate && corpus.minDate > maxDate) { + return false; + } + return 
true; + }; + } + + reset() { + this.selection.forEach(subject => subject.next(undefined)); + } + +} diff --git a/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.html b/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.html new file mode 100644 index 000000000..14744d792 --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.html @@ -0,0 +1,6 @@ + + diff --git a/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.scss b/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.scss new file mode 100644 index 000000000..e69de29bb diff --git a/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.spec.ts b/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.spec.ts new file mode 100644 index 000000000..6e8a47d30 --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.spec.ts @@ -0,0 +1,24 @@ +import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; + +import { DatePickerComponent } from './date-picker.component'; +import { commonTestBed } from '../../../common-test-bed'; + +describe('DatePickerComponent', () => { + let component: DatePickerComponent; + let fixture: ComponentFixture; + + beforeEach(waitForAsync(() => { + commonTestBed().testingModule.compileComponents(); + })); + + + beforeEach(() => { + fixture = TestBed.createComponent(DatePickerComponent); + component = fixture.componentInstance; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.ts b/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.ts new file mode 100644 index 000000000..a02688465 --- /dev/null +++ 
b/frontend/src/app/corpus-selection/corpus-filter/date-picker/date-picker.component.ts @@ -0,0 +1,45 @@ +import { Component, Input, Output } from '@angular/core'; +import * as _ from 'lodash'; +import * as moment from 'moment'; +import { BehaviorSubject } from 'rxjs'; + +@Component({ + selector: 'ia-date-picker', + templateUrl: './date-picker.component.html', + styleUrls: ['./date-picker.component.scss'] +}) +export class DatePickerComponent { + @Input() @Output() subject: BehaviorSubject = new BehaviorSubject(undefined); + @Input() minDate: Date; + @Input() maxDate: Date; + @Input() default: Date; + @Input() unit: 'year'|'date' = 'year'; + + constructor() { } + + get dateFormat(): string { + return this.unit === 'year' ? 'yy' : 'dd-mm-yy'; + } + + formatInput(value: string|Date): Date { + let valueAsDate: Date; + if (typeof(value) == 'string') { + const format = this.unit === 'year' ? 'YYYY' : 'DD-MM-YYYY'; + const m = moment(value, format); + if (m.isValid()) { + valueAsDate = m.toDate(); + } + } else { + valueAsDate = value; + } + + return valueAsDate; + } + + set(value: string|Date) { + const valueAsDate = this.formatInput(value); + const checkedValue = _.min([_.max([valueAsDate, this.minDate]), this.maxDate]); + this.subject.next(checkedValue); + } + +} diff --git a/frontend/src/app/corpus-selection/corpus-selection.component.html b/frontend/src/app/corpus-selection/corpus-selection.component.html index 1b3d777af..cc90ff34a 100644 --- a/frontend/src/app/corpus-selection/corpus-selection.component.html +++ b/frontend/src/app/corpus-selection/corpus-selection.component.html @@ -7,37 +7,15 @@

Select a corpus to search through


- + +
+ +
+ + +
+ +
+ diff --git a/frontend/src/app/corpus-selection/corpus-selection.component.scss b/frontend/src/app/corpus-selection/corpus-selection.component.scss index 6c5380af9..832d7c30c 100644 --- a/frontend/src/app/corpus-selection/corpus-selection.component.scss +++ b/frontend/src/app/corpus-selection/corpus-selection.component.scss @@ -1,46 +1,7 @@ -@import "../../_utilities"; - -.card { - cursor: pointer; -} - -.card-content { - padding: 1.5rem; - color: $text-primary-color; - text-decoration: none; - background-color: $primary; -} - -.card-info-icon { - color: white; - font-size: medium; -} - -.card-footer { - border-top: 1px solid $contrast-primary-color; -} - -.card-footer-item { - border-top: 1px solid $contrast-primary-color; - color: $text-primary-color; - cursor: pointer; - text-decoration: none; - background-color: $contrast-primary-color; - - &:hover{ - color: $text-primary-color; - } -} - -.title-content { - font-size: 1.5em; -} - .subtitle { margin-top: 1.5em !important; } -.moreInfoLink { - background: none; - border: none; +.year-input { + width: 6em; } diff --git a/frontend/src/app/corpus-selection/corpus-selection.component.ts b/frontend/src/app/corpus-selection/corpus-selection.component.ts index 63b570b58..6bcdac5cb 100644 --- a/frontend/src/app/corpus-selection/corpus-selection.component.ts +++ b/frontend/src/app/corpus-selection/corpus-selection.component.ts @@ -1,9 +1,7 @@ import { Component, Input, OnInit } from '@angular/core'; -import { Router } from '@angular/router'; import { Corpus } from '../models/corpus'; +import * as _ from 'lodash'; -import { DomSanitizer, SafeHtml } from '@angular/platform-browser'; -import { DialogService } from '../services/dialog.service'; @Component({ selector: 'ia-corpus-selection', @@ -14,18 +12,18 @@ export class CorpusSelectionComponent implements OnInit { @Input() public items: Corpus[]; - constructor(private router: Router, private domSanitizer: DomSanitizer, private dialogService: DialogService) { } + 
filteredItems: Corpus[]; - ngOnInit() { - } + constructor() { } - showMoreInfo(corpus: Corpus): void { - this.dialogService.showDescriptionPage(corpus); + get displayItems(): Corpus[] { + if (_.isUndefined(this.filteredItems)) { + return this.items; + } else { + return this.filteredItems; + } } - navigateToCorpus(event: any, corpusName: string): void { - if (!event.target.classList.contains('moreInfoLink')) { - this.router.navigate(['/search', corpusName]); - } + ngOnInit() { } } diff --git a/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.html b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.html new file mode 100644 index 000000000..6007bdcc8 --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.html @@ -0,0 +1,46 @@ +
+
+
+ {{corpus.title}} +
+
+
+
+

+ {{corpus.title}} +

+
+ + +
+
+ +
+ +
+ {{corpus.description}} +
+ +
+
+
+

Language: {{corpus.displayLanguages}}

+

Type: {{corpus.category}}

+

Period: {{corpus.minYear}}-{{corpus.maxYear}}

+
+
+ +
+
+
diff --git a/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.scss b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.scss new file mode 100644 index 000000000..22dbd5ffb --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.scss @@ -0,0 +1,79 @@ +@import "../../../_utilities"; + +.corpus-container { + padding: 0; + color: $text-primary-color; + text-decoration: none; + background-color: $primary; + + > .column { + padding: 1rem; // slightly more generous padding + } +} + + +.image-column { + position: relative; + overflow-y: hidden; +} + +.corpus-image { + height: 100%; + width: 100%; + position: absolute; + top: 0; + left: 0; + + img { + height: 100%; + width: 100%; + object-fit: cover; + object-position: center; + } +} + + +.info-icon { + color: white; +} + + +.corpus-action { + border-color: $highlight-color; + color: $text-primary-color; + background-color: $contrast-primary-color; + + &:hover { + border-color: $text-primary-color; + background-color: $contrast-primary-accent-color; + } +} + +.title-row { + margin-bottom: 0px; +} + +.title-divider { + margin-top: 0px; + margin-bottom: 1rem; +} + +.title-content { + font-size: 1.5em; + cursor: pointer; + color: white !important; +} + +.moreInfoLink { + background: none; + border: none; + font-size: 1.25em; +} + +.columns .align-bottom { + align-items: end; +} + +strong { + color: inherit; +} diff --git a/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.spec.ts b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.spec.ts new file mode 100644 index 000000000..c2815f06e --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.spec.ts @@ -0,0 +1,25 @@ +import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; + +import { CorpusSelectorComponent } from './corpus-selector.component'; +import { commonTestBed } 
from '../../common-test-bed'; +import { mockCorpus } from '../../../mock-data/corpus'; + +describe('CorpusSelectorComponent', () => { + let component: CorpusSelectorComponent; + let fixture: ComponentFixture; + + beforeEach(waitForAsync(() => { + commonTestBed().testingModule.compileComponents(); + })); + + beforeEach(() => { + fixture = TestBed.createComponent(CorpusSelectorComponent); + component = fixture.componentInstance; + component.corpus = mockCorpus; + fixture.detectChanges(); + }); + + it('should create', () => { + expect(component).toBeTruthy(); + }); +}); diff --git a/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.ts b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.ts new file mode 100644 index 000000000..95d4ac5a8 --- /dev/null +++ b/frontend/src/app/corpus-selection/corpus-selector/corpus-selector.component.ts @@ -0,0 +1,34 @@ +import { Component, Input, OnInit } from '@angular/core'; +import { Corpus } from '../../models'; +import { DialogService } from '../../services'; +import { Router } from '@angular/router'; +import { faInfoCircle, faSearch } from '@fortawesome/free-solid-svg-icons'; +import * as _ from 'lodash'; + +@Component({ + selector: 'ia-corpus-selector', + templateUrl: './corpus-selector.component.html', + styleUrls: ['./corpus-selector.component.scss'] +}) +export class CorpusSelectorComponent implements OnInit { + @Input() corpus: Corpus; + + infoIcon = faInfoCircle; + searchIcon = faSearch; + + constructor(private dialogService: DialogService, private router: Router) { } + + ngOnInit(): void { + } + + showMoreInfo(): void { + this.dialogService.showDescriptionPage(this.corpus); + } + + navigateToCorpus(event: any): void { + if (!event.target.classList.contains('moreInfoLink')) { + this.router.navigate(['/search', this.corpus.name]); + } + } + +} diff --git a/frontend/src/app/dialog/dialog.component.html b/frontend/src/app/dialog/dialog.component.html index 8009e39e0..88c2ccdb4 
100644 --- a/frontend/src/app/dialog/dialog.component.html +++ b/frontend/src/app/dialog/dialog.component.html @@ -1,8 +1,8 @@ {{title}} -
+
- + diff --git a/frontend/src/app/dialog/dialog.component.ts b/frontend/src/app/dialog/dialog.component.ts index 7730761ee..9ef1a9f0f 100644 --- a/frontend/src/app/dialog/dialog.component.ts +++ b/frontend/src/app/dialog/dialog.component.ts @@ -57,8 +57,4 @@ export class DialogComponent implements OnDestroy, OnInit { ngOnDestroy(): void { this.dialogEventSubscription.unsubscribe(); } - - navigate(): void { - this.router.navigate(this.footerRouterLink); - } } diff --git a/frontend/src/app/document-view/document-view.component.spec.ts b/frontend/src/app/document-view/document-view.component.spec.ts index cd43d313b..5e7d9e458 100644 --- a/frontend/src/app/document-view/document-view.component.spec.ts +++ b/frontend/src/app/document-view/document-view.component.spec.ts @@ -1,5 +1,7 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; import { By } from '@angular/platform-browser'; +import * as _ from 'lodash'; +import { mockCorpus, mockField } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; @@ -16,26 +18,14 @@ describe('DocumentViewComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(DocumentViewComponent); component = fixture.componentInstance; - component.corpus = { - scan_image_type: 'farout_image_type' - }; - component.fields = [{ - name: 'test', - displayName: 'Test', - displayType: 'text', - description: 'Description', - hidden: false, - sortable: false, - primarySort: false, - searchable: false, - searchFilter: null, - downloadable: true, - mappingType: 'text' - }]; + component.corpus = _.merge({ + scan_image_type: 'farout_image_type', + fields: [mockField] + }, mockCorpus); component.document = { id: 'test', relevance: 0.5, - fieldValues: { test: 'Hello world!' } + fieldValues: { great_field: 'Hello world!' 
} }; fixture.detectChanges(); }); @@ -47,6 +37,8 @@ describe('DocumentViewComponent', () => { it('should render fields', async () => { await fixture.whenStable(); + expect(component.propertyFields).toEqual([mockField]); + const debug = fixture.debugElement.queryAll(By.css('[data-test-field-value]')); expect(debug.length).toEqual(1); // number of fields const element = debug[0].nativeElement; diff --git a/frontend/src/app/document-view/document-view.component.ts b/frontend/src/app/document-view/document-view.component.ts index 05d16fb53..6018e8cac 100644 --- a/frontend/src/app/document-view/document-view.component.ts +++ b/frontend/src/app/document-view/document-view.component.ts @@ -11,16 +11,13 @@ import { CorpusField, FoundDocument, Corpus } from '../models/index'; export class DocumentViewComponent implements OnChanges { public get contentFields() { - return this.fields.filter(field => !field.hidden && field.displayType === 'text_content'); + return this.corpus.fields.filter(field => !field.hidden && field.displayType === 'text_content'); } public get propertyFields() { - return this.fields.filter(field => !field.hidden && field.displayType !== 'text_content'); + return this.corpus.fields.filter(field => !field.hidden && field.displayType !== 'text_content'); } - @Input() - public fields: CorpusField[] = []; - @Input() public document: FoundDocument; diff --git a/frontend/src/app/download/download.component.html b/frontend/src/app/download/download.component.html index 463fbe73b..9dfde3a82 100644 --- a/frontend/src/app/download/download.component.html +++ b/frontend/src/app/download/download.component.html @@ -11,8 +11,8 @@

- +

diff --git a/frontend/src/app/download/download.component.spec.ts b/frontend/src/app/download/download.component.spec.ts index f31f74923..fcdfd2b3a 100644 --- a/frontend/src/app/download/download.component.spec.ts +++ b/frontend/src/app/download/download.component.spec.ts @@ -1,12 +1,12 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; import { commonTestBed } from '../common-test-bed'; -import { CorpusField } from "../models"; -import { mockCorpus, mockField, mockField2 } from "../../mock-data/corpus"; +import { CorpusField } from '../models'; +import { mockCorpus, mockField, mockField2 } from '../../mock-data/corpus'; -import { DownloadComponent } from "./download.component"; +import { DownloadComponent } from './download.component'; -describe("DownloadComponent", () => { +describe('DownloadComponent', () => { let component: DownloadComponent; let fixture: ComponentFixture; @@ -21,22 +21,22 @@ describe("DownloadComponent", () => { fixture.detectChanges(); }); - it("should create", () => { + it('should create', () => { expect(component).toBeTruthy(); }); - it("should respond to field selection", () => { + it('should respond to field selection', () => { // Start with a single field - expect(component["getCsvFields"]()).toEqual([mockField]); + expect(component['getCsvFields']()).toEqual(mockCorpus.fields); // Deselect all component.selectCsvFields([]); - expect(component["getCsvFields"]()).toEqual([]); + expect(component['getCsvFields']()).toEqual([]); // Select two component.selectCsvFields([mockField, mockField2]); const expected_fields = [mockField, mockField2]; - expect(component["getCsvFields"]()).toEqual(expected_fields); + expect(component['getCsvFields']()).toEqual(expected_fields); expect(component.selectedCsvFields).toEqual(expected_fields); }); }); diff --git a/frontend/src/app/download/download.component.ts b/frontend/src/app/download/download.component.ts index 5775eaf05..c79182a66 100644 --- 
a/frontend/src/app/download/download.component.ts +++ b/frontend/src/app/download/download.component.ts @@ -38,7 +38,7 @@ export class DownloadComponent implements OnChanges { ngOnChanges() { this.availableCsvFields = Object.values(this.corpus.fields).filter(field => field.downloadable); - const highlight = this.queryModel.highlight; + const highlight = this.queryModel.highlightSize; // 'Query in context' becomes an extra option if any field in the corpus has been marked as highlightable if (highlight !== undefined) { this.availableCsvFields.push({ @@ -52,9 +52,9 @@ export class DownloadComponent implements OnChanges { primarySort: false, searchable: false, downloadable: true, - searchFilter: null, + filterOptions: null, mappingType: null, - }); + } as unknown as CorpusField) ; } } diff --git a/frontend/src/app/filter/ad-hoc-filter.component.html b/frontend/src/app/filter/ad-hoc-filter.component.html index 25048bd00..ecab52e8a 100644 --- a/frontend/src/app/filter/ad-hoc-filter.component.html +++ b/frontend/src/app/filter/ad-hoc-filter.component.html @@ -1 +1 @@ -

{{data.value}}

+

{{data}}

diff --git a/frontend/src/app/filter/ad-hoc-filter.component.ts b/frontend/src/app/filter/ad-hoc-filter.component.ts index bf17213d9..06384741c 100644 --- a/frontend/src/app/filter/ad-hoc-filter.component.ts +++ b/frontend/src/app/filter/ad-hoc-filter.component.ts @@ -1,5 +1,5 @@ -import { Component, Input, OnInit } from '@angular/core'; -import { SearchFilter, SearchFilterData } from '../models'; +import { Component } from '@angular/core'; +import { AdHocFilter, } from '../models'; import { BaseFilterComponent } from './base-filter.component'; @Component({ @@ -7,34 +7,5 @@ import { BaseFilterComponent } from './base-filter.component'; templateUrl: './ad-hoc-filter.component.html', styleUrls: ['./ad-hoc-filter.component.scss'] }) -export class AdHocFilterComponent extends BaseFilterComponent implements OnInit { - data: { value: any} = { value: undefined }; - - ngOnInit() { - if (this.filter) { - this.data = this.getDisplayData(this.filter); - } - } - - getValue(data: SearchFilterData) { - switch (data.filterType) { - case 'BooleanFilter': - return data.checked; - case 'DateFilter': - return data.min; // can return either: min == max for ad hoc filters - case 'MultipleChoiceFilter': - return data.selected[0]; // only one value for ad hoc filters - case 'RangeFilter': - return data.min; - } - } - - getDisplayData(filter: SearchFilter) { - return { value: this.getValue( filter.currentData) }; - } - - getFilterData(): SearchFilter { - return undefined; - } - +export class AdHocFilterComponent extends BaseFilterComponent { } diff --git a/frontend/src/app/filter/base-filter.component.ts b/frontend/src/app/filter/base-filter.component.ts index b6fef9f18..50ecca998 100644 --- a/frontend/src/app/filter/base-filter.component.ts +++ b/frontend/src/app/filter/base-filter.component.ts @@ -1,7 +1,8 @@ -import { Component, EventEmitter, Input, Output } from '@angular/core'; -import { Subject } from 'rxjs'; +import { Component, Input, OnChanges, SimpleChanges } from 
'@angular/core'; +import * as _ from 'lodash'; -import { SearchFilter, SearchFilterData } from '../models/index'; +import { QueryModel, SearchFilter } from '../models/index'; +import { Subscription } from 'rxjs'; /** * Filter component receives the corpus fields containing search filters as input @@ -10,51 +11,43 @@ import { SearchFilter, SearchFilterData } from '../models/index'; @Component({ template: '' }) -export abstract class BaseFilterComponent { - @Input() inputChanged: Subject; +export abstract class BaseFilterComponent implements OnChanges { + @Input() filter: SearchFilter; + @Input() queryModel: QueryModel; - @Input() - public filter: SearchFilter; + private queryModelSubscription: Subscription; - @Input() - public grayedOut: boolean; + constructor() { } - @Output('update') public updateEmitter = new EventEmitter>(); - - /** - * The data of the applied filter transformed to use as input for the value editors. - */ - public data: any; // holds the user data - - public useAsFilter = false; - - constructor() { + get data(): FilterData { + return this.filter?.currentData; } - provideFilterData() { - if (this.filter) { - this.data = this.getDisplayData(this.filter); - this.useAsFilter = this.filter.useAsFilter; + ngOnChanges(changes: SimpleChanges): void { + if (changes.filter) { + this.onFilterSet(this.filter); } - } - - abstract getDisplayData(filter: SearchFilter); - /** - * Create a new version of the filter data from the user input. - */ - abstract getFilterData(): SearchFilter; + if (changes.queryModel) { + if (this.queryModelSubscription) { + this.queryModelSubscription.unsubscribe(); + } + this.queryModelSubscription = this.queryModel.update.subscribe(() => + this.onQueryModelUpdate() + ); + this.onQueryModelUpdate(); // run update immediately + } + } /** * Trigger a change event. 
*/ - update() { - if (this.data.selected && this.data.selected.length === 0) { - this.useAsFilter = false; - } else { - this.useAsFilter = true; // update called through user input - } - this.filter.useAsFilter = this.useAsFilter; - this.updateEmitter.emit(this.getFilterData()); + update(data: FilterData) { + this.filter.set(data); } + + /** possible administration when the filter is set, e.g. setting data limits */ + onFilterSet(filter): void {}; + + onQueryModelUpdate() {} } diff --git a/frontend/src/app/filter/boolean-filter.component.html b/frontend/src/app/filter/boolean-filter.component.html index 643de4523..7e4c0a6b7 100644 --- a/frontend/src/app/filter/boolean-filter.component.html +++ b/frontend/src/app/filter/boolean-filter.component.html @@ -1,4 +1,4 @@ -
- - {{data.checked | json | titlecase }} +
+ + {{data | json | titlecase }}
diff --git a/frontend/src/app/filter/boolean-filter.component.spec.ts b/frontend/src/app/filter/boolean-filter.component.spec.ts index 352d7d5d7..95ab41254 100644 --- a/frontend/src/app/filter/boolean-filter.component.spec.ts +++ b/frontend/src/app/filter/boolean-filter.component.spec.ts @@ -1,6 +1,8 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; +import { mockCorpus3, mockField } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; +import { QueryModel } from '../models'; import { BooleanFilterComponent } from './boolean-filter.component'; @@ -15,19 +17,8 @@ describe('BooleanFilterComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(BooleanFilterComponent); component = fixture.componentInstance; - component.filter = { - fieldName: 'A yes/no question', - description: 'What is the average speed of a swallow?', - useAsFilter: false, - defaultData: { - filterType: 'BooleanFilter', - checked: false - }, - currentData: { - filterType: 'BooleanFilter', - checked: true - } - }; + component.queryModel = new QueryModel(mockCorpus3); + component.filter = component.queryModel.filterForField(mockField); fixture.detectChanges(); }); diff --git a/frontend/src/app/filter/boolean-filter.component.ts b/frontend/src/app/filter/boolean-filter.component.ts index ed83081dd..3467da463 100644 --- a/frontend/src/app/filter/boolean-filter.component.ts +++ b/frontend/src/app/filter/boolean-filter.component.ts @@ -1,37 +1,12 @@ -import { Component, DoCheck, OnInit } from '@angular/core'; +import { Component } from '@angular/core'; import { BaseFilterComponent } from './base-filter.component'; -import { SearchFilter, BooleanFilterData } from '../models'; +import { BooleanFilter } from '../models'; @Component({ selector: 'ia-boolean-filter', templateUrl: './boolean-filter.component.html', styleUrls: ['./boolean-filter.component.scss'] }) -export class BooleanFilterComponent extends BaseFilterComponent 
implements DoCheck, OnInit { - - ngOnInit() { - this.provideFilterData(); - } - - ngDoCheck() { - if (this.filter.reset) { - this.filter.reset = false; - this.provideFilterData(); - } - } - - getDisplayData(filter: SearchFilter) { - const data = filter.currentData; - return { checked: data.checked }; - } - - getFilterData(): SearchFilter { - this.filter.currentData = { - filterType: 'BooleanFilter', - checked: this.data.checked - }; - return this.filter; - } - +export class BooleanFilterComponent extends BaseFilterComponent { } diff --git a/frontend/src/app/filter/date-filter.component.html b/frontend/src/app/filter/date-filter.component.html index b73288f17..865a52224 100644 --- a/frontend/src/app/filter/date-filter.component.html +++ b/frontend/src/app/filter/date-filter.component.html @@ -1,6 +1,8 @@ -
- - +
+ +
diff --git a/frontend/src/app/filter/date-filter.component.spec.ts b/frontend/src/app/filter/date-filter.component.spec.ts index 1ccabf96f..c801cf112 100644 --- a/frontend/src/app/filter/date-filter.component.spec.ts +++ b/frontend/src/app/filter/date-filter.component.spec.ts @@ -1,6 +1,8 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; +import { mockCorpus3, mockFieldDate } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; +import { QueryModel } from '../models'; import { DateFilterComponent } from './date-filter.component'; @@ -15,25 +17,12 @@ describe('DateFilterComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(DateFilterComponent); component = fixture.componentInstance; - component.filter = { - fieldName: 'Publication date', - description: 'When this book was published', - useAsFilter: false, - defaultData: { - filterType: 'DateFilter', - min: '1099-01-01', - max: '1300-12-31' - }, - currentData: { - filterType: 'DateFilter', - min: '1111-01-01', - max: '1299-12-31' - } - }; - component.data = { - minYear: 1099, - maxYear: 1300 - }; + component.queryModel = new QueryModel(mockCorpus3); + component.filter = component.queryModel.filterForField(mockFieldDate); + component.filter.set({ + min: new Date('Jan 1 1810'), + max: new Date('Dec 31 1820') + }); fixture.detectChanges(); }); diff --git a/frontend/src/app/filter/date-filter.component.ts b/frontend/src/app/filter/date-filter.component.ts index 762f5054f..857d81669 100644 --- a/frontend/src/app/filter/date-filter.component.ts +++ b/frontend/src/app/filter/date-filter.component.ts @@ -1,64 +1,32 @@ -import { Component, DoCheck, OnChanges, OnInit } from '@angular/core'; +import { Component } from '@angular/core'; +import * as _ from 'lodash'; -import * as moment from 'moment'; - -import { SearchFilter, DateFilterData } from '../models'; +import { DateFilterData, DateFilter } from '../models'; import { BaseFilterComponent } 
from './base-filter.component'; +import { BehaviorSubject, Observable, combineLatest } from 'rxjs'; +import { map, tap } from 'rxjs/operators'; @Component({ selector: 'ia-date-filter', templateUrl: './date-filter.component.html', styleUrls: ['./date-filter.component.scss'] }) -export class DateFilterComponent extends BaseFilterComponent implements DoCheck, OnInit { +export class DateFilterComponent extends BaseFilterComponent { public minDate: Date; public maxDate: Date; - public minYear: number; - public maxYear: number; - - ngOnInit() { - this.provideFilterData(); - this.minDate = new Date(this.filter.defaultData.min); - this.maxDate = new Date(this.filter.defaultData.max); - this.minYear = this.minDate.getFullYear(); - this.maxYear = this.maxDate.getFullYear(); - } - - ngDoCheck() { - if (this.filter.reset) { - this.filter.reset = false; - this.provideFilterData(); - } - } + public selectedMinDate: BehaviorSubject; + public selectedMaxDate: BehaviorSubject; - getDisplayData(filter: SearchFilter) { - const data = filter.currentData; - return { - min: new Date(data.min), - max: new Date(data.max), - minYear: this.minYear, - maxYear: this.maxYear - }; - } + onFilterSet(filter: DateFilter): void { + this.minDate = filter.defaultData.min; + this.maxDate = filter.defaultData.max; - /** - * Create a new version of the filter data from the user input. - */ - getFilterData(): SearchFilter { - this.filter.currentData = { - filterType: 'DateFilter', - min: this.formatDate(this.data.min), - max: this.formatDate(this.data.max) - }; - return this.filter; - } + this.selectedMinDate = new BehaviorSubject(this.minDate); + this.selectedMaxDate = new BehaviorSubject(this.maxDate); - /** - * Return a string of the form 0123-04-25. 
- */ - formatDate(date: Date): string { - return moment(date).format().slice(0, 10); + combineLatest([this.selectedMinDate, this.selectedMaxDate]).subscribe(([min, max]) => + this.update({min, max}) + ); } - } diff --git a/frontend/src/app/filter/filter-manager.component.html b/frontend/src/app/filter/filter-manager.component.html index 372c2332d..e03cafafe 100644 --- a/frontend/src/app/filter/filter-manager.component.html +++ b/frontend/src/app/filter/filter-manager.component.html @@ -4,49 +4,50 @@

Filters

- -

- -
+ +
-
-

- -

- - + + - - - - - + + + + +
diff --git a/frontend/src/app/filter/filter-manager.component.spec.ts b/frontend/src/app/filter/filter-manager.component.spec.ts index da42a32c4..3604f7472 100644 --- a/frontend/src/app/filter/filter-manager.component.spec.ts +++ b/frontend/src/app/filter/filter-manager.component.spec.ts @@ -3,9 +3,8 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; import { commonTestBed } from '../common-test-bed'; import { FilterManagerComponent } from './filter-manager.component'; -import { mockCorpus, mockCorpus2, mockFilter } from '../../mock-data/corpus'; -import { convertToParamMap } from '@angular/router'; -import { findByName } from '../utils/utils'; +import { mockCorpus, mockCorpus2 } from '../../mock-data/corpus'; +import { QueryModel } from '../models'; describe('FilterManagerComponent', () => { let component: FilterManagerComponent; @@ -18,34 +17,37 @@ describe('FilterManagerComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(FilterManagerComponent); component = fixture.componentInstance; - component.corpus = mockCorpus; + const corpus = mockCorpus; + component.queryModel = new QueryModel(mockCorpus); fixture.detectChanges(); }); it('should create', () => { expect(component).toBeTruthy(); - expect(component.searchFilters.length).toEqual(1); + expect(component.filters.length).toEqual(2); }); it('resets filters when corpus changes', () => { - component.corpus = mockCorpus2; - component.initialize(); - expect(component.searchFilters.length).toEqual(0); - component.corpus = mockCorpus; - component.initialize(); - expect(component.searchFilters.length).toEqual(1); - }); + component.queryModel = new QueryModel(mockCorpus2); + fixture.detectChanges(); + expect(component.filters.length).toEqual(1); + expect(component.filters[0].adHoc).toBeTrue(); - it('parses parameters to filters', () => { - expect(component.activeFilters.length).toEqual(0); - const params = convertToParamMap({great_field: 'checked'}); - 
component.setStateFromParams(params); - expect(component.activeFilters.length).toEqual(1); - }); + component.queryModel = new QueryModel(mockCorpus); + fixture.detectChanges(); + expect(component.filters.length).toEqual(2); + expect(component.filters[0].adHoc).toBeFalse(); + +}); - it('toggles filters on and off', async() => { - findByName(component.corpusFields, 'great_field').searchFilter.useAsFilter = true; - const params = component.filtersChanged(); - expect(Object.keys(params)).toContain('great_field'); + it('toggles filters on and off', async () => { + const filter = component.filters.find(f => f.corpusField.name === 'great_field'); + expect(component.queryModel.activeFilters.length).toBe(0); + filter.set(['test']); + expect(component.queryModel.activeFilters.length).toBe(1); + filter.toggle(); + expect(component.queryModel.activeFilters.length).toBe(0); + filter.toggle(); + expect(component.queryModel.activeFilters.length).toBe(1); }); }); diff --git a/frontend/src/app/filter/filter-manager.component.ts b/frontend/src/app/filter/filter-manager.component.ts index 50453148f..0121fdec8 100644 --- a/frontend/src/app/filter/filter-manager.component.ts +++ b/frontend/src/app/filter/filter-manager.component.ts @@ -1,178 +1,64 @@ -import { Component, Input, OnChanges } from '@angular/core'; -import { ActivatedRoute, ParamMap, Router } from '@angular/router'; +/* eslint-disable @typescript-eslint/member-ordering */ +import { Component, Input } from '@angular/core'; import * as _ from 'lodash'; -import { Subject } from 'rxjs'; - -import { AggregateData, Corpus, MultipleChoiceFilterData, SearchFilter, - SearchFilterData, CorpusField } from '../models/index'; -import { SearchService } from '../services'; -import { ParamDirective } from '../param/param-directive'; -import { ParamService } from '../services/param.service'; -import { findByName } from '../utils/utils'; -import { filtersFromParams, paramForFieldName, searchFilterDataToParam } from '../utils/params'; 
+import { combineLatest, Observable } from 'rxjs'; +import { map } from 'rxjs/operators'; +import { SearchFilter, QueryModel } from '../models/index'; @Component({ selector: 'ia-filter-manager', templateUrl: './filter-manager.component.html', styleUrls: ['./filter-manager.component.scss'] }) -export class FilterManagerComponent extends ParamDirective implements OnChanges { - @Input() public corpus: Corpus; - - inputChanged = new Subject(); - - public corpusFields: CorpusField[]; - public searchFilters: SearchFilter [] = []; - public activeFilters: SearchFilter [] = []; - - public showFilters: boolean; - public grayOutFilters: boolean; - - public multipleChoiceData: Object = {}; - - constructor( - private paramService: ParamService, - private searchService: SearchService, - route: ActivatedRoute, - router: Router) { - super(route, router); - } +export class FilterManagerComponent { + @Input() queryModel: QueryModel; - initialize() { - this.corpusFields = _.cloneDeep(this.corpus.fields); - this.searchFilters = this.corpusFields.filter(field => field.searchFilter).map(field => field.searchFilter); + constructor() { } - ngOnChanges() { - this.initialize(); - this.inputChanged.next(); + get activeFilters(): SearchFilter[] { + return this.queryModel?.activeFilters; } - setStateFromParams(params: ParamMap) { - this.activeFilters = filtersFromParams( - params, this.corpusFields - ); - this.aggregateSearchForMultipleChoiceFilters(params); - - } - - teardown() { - const params = {}; - this.activeFilters.forEach(filter => { - const paramName = paramForFieldName(filter.fieldName); - params[paramName] = null; - }); - this.setParams(params); + get filters(): SearchFilter[] { + return this.queryModel?.filters; } - /** - * For all multiple choice filters, get the bins and counts - * Exclude the filter itself from the aggregate search - * Save results in multipleChoiceData, which is structured as follows: - * fieldName1: [{key: option1, doc_count: 42}, {key: option2, doc_count: 
3}], - * fieldName2: [etc] - */ - private aggregateSearchForMultipleChoiceFilters(params) { - const multipleChoiceFilters = this.searchFilters.filter(f => !f.adHoc && f.currentData.filterType === 'MultipleChoiceFilter'); - - const aggregateResultPromises = multipleChoiceFilters.map(filter => this.getMultipleChoiceFilterOptions(filter, params)); - Promise.all(aggregateResultPromises).then(results => { - results.forEach( r => - this.multipleChoiceData[Object.keys(r)[0]] = Object.values(r)[0] + get anyActiveFilters$(): Observable { + if (this.filters) { + const statuses = this.filters.map(filter => filter.active); + return combineLatest(statuses).pipe( + map(values => _.some(values)), ); - // if multipleChoiceData is empty, gray out all filters - if (multipleChoiceFilters && multipleChoiceFilters.length != 0) {this.grayOutFilters = this.multipleChoiceData[multipleChoiceFilters[0].fieldName].length === 0;} - }); - } - - async getMultipleChoiceFilterOptions(filter: SearchFilter, params: ParamMap): Promise { - let filters = _.cloneDeep(this.searchFilters.filter(f => f.useAsFilter === true)); - // get the filter's choices, based on all other filters' choices, but not this filter's choices - if (filters.length > 0) { - const index = filters.findIndex(f => f.fieldName === filter.fieldName); - if (index >= 0) { - filters.splice(index, 1); - } - } else { - filters = null; } - const defaultData = filter.defaultData as MultipleChoiceFilterData; - const aggregator = {name: filter.fieldName, size: defaultData.optionCount}; - const queryModel = this.paramService.queryModelFromParams(params, this.corpusFields); - return this.searchService.aggregateSearch(this.corpus, queryModel, [aggregator]).then(results => { - return results.aggregations; - }, error => { - console.trace(error, aggregator); - return {}; - }); - } - - toggleFilter(filter: SearchFilter) { - filter.useAsFilter = !filter.useAsFilter; - this.updateFilterData(filter); - } - - resetFilter(filter: SearchFilter) { - 
filter.useAsFilter = false; - filter.currentData = filter.defaultData; - filter.reset = true; - this.updateFilterData(filter); } - /** - * Event triggered from filter components - * - * @param filterData - */ - public updateFilterData(filter: SearchFilter) { - findByName(this.corpusFields, filter.fieldName).searchFilter = filter; - this.filtersChanged(); + get anyNonDefaultFilters$(): Observable { + if (this.filters) { + const statuses = this.filters.map(filter => filter.isDefault$); + return combineLatest(statuses).pipe( + map(values => !_.every(values)), + ); + } } public toggleActiveFilters() { if (this.activeFilters.length) { - this.activeFilters.forEach(filter => filter.useAsFilter = false); + this.filters.forEach(filter => filter.deactivate()); } else { // if we don't have active filters, set all filters to active which don't use default data - let filtersWithSettings = this.corpusFields.filter( - field => field.searchFilter && field.searchFilter.currentData != field.searchFilter.defaultData - ).map( field => field.searchFilter ); - filtersWithSettings.forEach( field => field.useAsFilter = true); + const filtersWithSettings = this.filters.filter(filter => + !_.isEqual(filter.currentData, filter.defaultData)); + filtersWithSettings.forEach(filter => filter.toggle()); } - this.filtersChanged(); } public resetAllFilters() { - this.activeFilters.forEach(filter => { - filter.currentData = filter.defaultData; - filter.reset = true; - }); - this.toggleActiveFilters(); - } - - public filtersChanged(): Object { - const newFilters = this.corpusFields.filter(field => field.searchFilter?.useAsFilter).map(f => f.searchFilter); - let params = {}; - this.activeFilters.forEach(filter => { - // set any params for previously active filters to null - if (!newFilters.map(f => f.fieldName).find(name => name === filter.fieldName)) { - const paramName = paramForFieldName(filter.fieldName); - params[paramName] = null; - if (filter.adHoc) { - // also set sort null in case of an adHoc 
filter - params['sort'] = null; - } - } - }); - newFilters.forEach(filter => { - const paramName = paramForFieldName(filter.fieldName); - const value = filter.useAsFilter? searchFilterDataToParam(filter) : null; - params[paramName] = value; + this.filters.forEach(filter => { + filter.reset(); }); - this.setParams(params); - return params; } } diff --git a/frontend/src/app/filter/multiple-choice-filter.component.html b/frontend/src/app/filter/multiple-choice-filter.component.html index a2986c768..7fb112711 100644 --- a/frontend/src/app/filter/multiple-choice-filter.component.html +++ b/frontend/src/app/filter/multiple-choice-filter.component.html @@ -1,8 +1,9 @@
- +
{{item.label}}
-
{{item.doc_count}}
+
{{item.doc_count}}
diff --git a/frontend/src/app/filter/multiple-choice-filter.component.spec.ts b/frontend/src/app/filter/multiple-choice-filter.component.spec.ts index 24593bf9a..b8f22213f 100644 --- a/frontend/src/app/filter/multiple-choice-filter.component.spec.ts +++ b/frontend/src/app/filter/multiple-choice-filter.component.spec.ts @@ -1,8 +1,11 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; +import { mockCorpus, mockFieldMultipleChoice } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; +import { QueryModel } from '../models'; import { MultipleChoiceFilterComponent } from './multiple-choice-filter.component'; +import * as _ from 'lodash'; describe('MultipleChoiceFilterComponent', () => { let component: MultipleChoiceFilterComponent; @@ -15,10 +18,10 @@ describe('MultipleChoiceFilterComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(MultipleChoiceFilterComponent); component = fixture.componentInstance; - component.optionsAndCounts = [{key: 'Andy', doc_count: 2}, {key: 'Lou', doc_count: 3}]; - component.data = { - options: ['Andy', 'Lou'] - }; + const corpus = _.cloneDeep(mockCorpus); + corpus.fields.push(mockFieldMultipleChoice); + component.queryModel = new QueryModel(corpus); + component.filter = component.queryModel.filterForField(mockFieldMultipleChoice); fixture.detectChanges(); }); diff --git a/frontend/src/app/filter/multiple-choice-filter.component.ts b/frontend/src/app/filter/multiple-choice-filter.component.ts index 812301e67..da2ac3200 100644 --- a/frontend/src/app/filter/multiple-choice-filter.component.ts +++ b/frontend/src/app/filter/multiple-choice-filter.component.ts @@ -1,43 +1,45 @@ -import { Component, Input, OnInit, OnChanges } from '@angular/core'; +import { Component } from '@angular/core'; import * as _ from 'lodash'; import { BaseFilterComponent } from './base-filter.component'; -import { SearchFilter, MultipleChoiceFilterData, AggregateResult } from '../models'; 
+import { MultipleChoiceFilterOptions } from '../models'; +import { SearchService } from '../services'; @Component({ selector: 'ia-multiple-choice-filter', templateUrl: './multiple-choice-filter.component.html', styleUrls: ['./multiple-choice-filter.component.scss'] }) -export class MultipleChoiceFilterComponent extends BaseFilterComponent implements OnChanges { - @Input() public optionsAndCounts: AggregateResult[]; +export class MultipleChoiceFilterComponent extends BaseFilterComponent { + options: { label: string; value: string; doc_count: number }[] = []; - ngOnChanges() { - this.provideFilterData(); + constructor(private searchService: SearchService) { + super(); } - getDisplayData(filter: SearchFilter) { - this.data = filter.currentData; - let options = []; - if (this.optionsAndCounts) { - options = _.sortBy( - this.optionsAndCounts.map(x => ({ label: x.key, value: encodeURIComponent(x.key), doc_count: x.doc_count })), - o => o.label - ); - } else { -options = [1, 2, 3]; -} // dummy array to make sure the component loads - return { options, selected: this.data.selected }; + onFilterSet(): void { + this.getOptions(); } - getFilterData(): SearchFilter { - this.filter.currentData = { - filterType: 'MultipleChoiceFilter', - selected: this.data.selected - }; - return this.filter; + onQueryModelUpdate(): void { + this.getOptions(); } - + private async getOptions(): Promise { + if (this.filter && this.queryModel) { + const optionCount = (this.filter.corpusField.filterOptions as MultipleChoiceFilterOptions).option_count; + const aggregator = {name: this.filter.corpusField.name, size: optionCount}; + const queryModel = this.queryModel.clone(); + queryModel.filterForField(this.filter.corpusField).deactivate(); + this.searchService.aggregateSearch(queryModel.corpus, queryModel, [aggregator]).then( + response => response.aggregations[this.filter.corpusField.name]).then(aggregations => + this.options = _.sortBy( + aggregations.map(x => ({ label: x.key, value: x.key, 
doc_count: x.doc_count })), + o => o.label + ) + ).catch(() => this.options = []); + + } + } } diff --git a/frontend/src/app/filter/range-filter.component.html b/frontend/src/app/filter/range-filter.component.html index 9f82ac618..ba498e929 100644 --- a/frontend/src/app/filter/range-filter.component.html +++ b/frontend/src/app/filter/range-filter.component.html @@ -1,4 +1,5 @@
- {{data[0]}} - {{data[1]}} - -
\ No newline at end of file + {{data.min}} - {{data.max}} + +
diff --git a/frontend/src/app/filter/range-filter.component.spec.ts b/frontend/src/app/filter/range-filter.component.spec.ts index 000885e7b..95f948f06 100644 --- a/frontend/src/app/filter/range-filter.component.spec.ts +++ b/frontend/src/app/filter/range-filter.component.spec.ts @@ -3,7 +3,8 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; import { commonTestBed } from '../common-test-bed'; import { RangeFilterComponent } from './range-filter.component'; -import { RangeFilterData } from '../models'; +import { QueryModel } from '../models'; +import { mockCorpus3, mockField3 } from '../../mock-data/corpus'; describe('RangeFilterComponent', () => { let component: RangeFilterComponent; @@ -16,20 +17,9 @@ describe('RangeFilterComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(RangeFilterComponent); component = fixture.componentInstance; - const mockRangeData = { - filterType: 'RangeFilter', - min: 1984, - max: 1984 - } as RangeFilterData; - const data = { - fieldName: 'year', - description: 'Orwellian', - useAsFilter: false, - defaultData: mockRangeData, - currentData: mockRangeData - }; - component.filter = data; - component.data = data; + component.queryModel = new QueryModel(mockCorpus3); + component.filter = component.queryModel.filterForField(mockField3); + component.filter.set({min: 1984, max: 1984}); fixture.detectChanges(); }); diff --git a/frontend/src/app/filter/range-filter.component.ts b/frontend/src/app/filter/range-filter.component.ts index 86d294b38..162c2ab7f 100644 --- a/frontend/src/app/filter/range-filter.component.ts +++ b/frontend/src/app/filter/range-filter.component.ts @@ -1,6 +1,6 @@ -import { Component, DoCheck, OnInit } from '@angular/core'; +import { Component } from '@angular/core'; -import { SearchFilter, RangeFilterData } from '../models'; +import { RangeFilterData, RangeFilter } from '../models'; import { BaseFilterComponent } from './base-filter.component'; @Component({ @@ -8,30 
+8,24 @@ import { BaseFilterComponent } from './base-filter.component'; templateUrl: './range-filter.component.html', styleUrls: ['./range-filter.component.scss'] }) -export class RangeFilterComponent extends BaseFilterComponent implements DoCheck, OnInit { - ngOnInit() { - this.provideFilterData(); - } +export class RangeFilterComponent extends BaseFilterComponent { + min: number; + max: number; - ngDoCheck() { - if (this.filter.reset) { - this.filter.reset = false; - this.provideFilterData(); - } + onFilterSet(filter: RangeFilter): void { + this.min = filter.defaultData.min; + this.max = filter.defaultData.max; } - getDisplayData(filter: SearchFilter) { - this.data = filter.currentData; - return [this.data.min, this.data.max]; + getDisplayData(filterData: RangeFilterData): [number, number] { + return [filterData.min, filterData.max]; } - getFilterData(): SearchFilter { - this.filter.currentData = { - filterType: 'RangeFilter', - min: this.data[0], - max: this.data[1] + getFilterData(value: [number, number]): RangeFilterData { + return { + min: value[0], + max: value[1], }; - return this.filter; } } diff --git a/frontend/src/app/footer/footer.component.html b/frontend/src/app/footer/footer.component.html index d44e20643..84fd7a699 100644 --- a/frontend/src/app/footer/footer.component.html +++ b/frontend/src/app/footer/footer.component.html @@ -2,17 +2,17 @@
-

+

With support from -

+

diff --git a/frontend/src/app/history/download-history/download-history.component.ts b/frontend/src/app/history/download-history/download-history.component.ts index 7979094e4..24b495aab 100644 --- a/frontend/src/app/history/download-history/download-history.component.ts +++ b/frontend/src/app/history/download-history/download-history.component.ts @@ -1,8 +1,9 @@ import { Component, OnInit } from '@angular/core'; import { faDownload } from '@fortawesome/free-solid-svg-icons'; import * as _ from 'lodash'; -import { Corpus, Download, DownloadOptions, DownloadParameters, DownloadType, QueryModel } from '../../models'; -import { ApiService, CorpusService, DownloadService, ElasticSearchService, EsQuery, NotificationService } from '../../services'; +import { esQueryToQueryModel } from '../../utils/es-query'; +import { Download, DownloadOptions, DownloadParameters, DownloadType, QueryModel } from '../../models'; +import { ApiService, CorpusService, DownloadService, NotificationService } from '../../services'; import { HistoryDirective } from '../history.directive'; import { findByName } from '../../utils/utils'; @@ -22,7 +23,6 @@ export class DownloadHistoryComponent extends HistoryDirective implements OnInit private downloadService: DownloadService, private apiService: ApiService, corpusService: CorpusService, - private elasticSearchService: ElasticSearchService, private notificationService: NotificationService ) { super(corpusService); @@ -55,7 +55,7 @@ export class DownloadHistoryComponent extends HistoryDirective implements OnInit const esQueries = 'es_query' in download.parameters ? 
[download.parameters.es_query] : download.parameters.map(p => p.es_query); const corpus = findByName(this.corpora, download.corpus); - return esQueries.map(esQuery => this.elasticSearchService.esQueryToQueryModel(esQuery, corpus)); + return esQueries.map(esQuery => esQueryToQueryModel(esQuery, corpus)); } diff --git a/frontend/src/app/history/history.directive.ts b/frontend/src/app/history/history.directive.ts index a9c5b76d2..deae409f2 100644 --- a/frontend/src/app/history/history.directive.ts +++ b/frontend/src/app/history/history.directive.ts @@ -2,6 +2,7 @@ import { Directive } from '@angular/core'; import { MenuItem } from 'primeng/api'; import { Corpus, Download, QueryDb } from '../models'; import { CorpusService } from '../services'; +import { findByName } from '../utils/utils'; @Directive({ selector: '[iaHistory]' @@ -29,6 +30,6 @@ export class HistoryDirective { corpusTitle(corpusName: string): string { - return this.corpora.find(corpus => corpus.name === corpusName).title || corpusName; + return findByName(this.corpora, corpusName).title || corpusName; } } diff --git a/frontend/src/app/history/search-history/query-filters.component.spec.ts b/frontend/src/app/history/search-history/query-filters.component.spec.ts index 0886cde6c..5acccd124 100644 --- a/frontend/src/app/history/search-history/query-filters.component.spec.ts +++ b/frontend/src/app/history/search-history/query-filters.component.spec.ts @@ -1,4 +1,6 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; +import { QueryModel } from '../../models'; +import { mockCorpus } from '../../../mock-data/corpus'; import { commonTestBed } from '../../common-test-bed'; import { QueryFiltersComponent } from './query-filters.component'; @@ -14,10 +16,8 @@ describe('QueryFiltersComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(QueryFiltersComponent); component = fixture.componentInstance; - component.queryModel = { - queryText: 'testing', - filters: [] - }; 
+ component.queryModel = new QueryModel(mockCorpus); + component.queryModel.setQueryText('testing'); fixture.detectChanges(); }); diff --git a/frontend/src/app/history/search-history/query-filters.component.ts b/frontend/src/app/history/search-history/query-filters.component.ts index 475cf5fef..b382ada5c 100644 --- a/frontend/src/app/history/search-history/query-filters.component.ts +++ b/frontend/src/app/history/search-history/query-filters.component.ts @@ -1,8 +1,6 @@ import { Component, Input, OnInit } from '@angular/core'; -import { ParamService } from '../../services'; import { QueryModel } from '../../models/index'; -import { searchFilterDataToParam } from '../../utils/params'; @Component({ selector: '[ia-query-filters]', @@ -15,13 +13,14 @@ export class QueryFiltersComponent implements OnInit { name: string; formattedData: string | string[]; }[]; - constructor(private paramService: ParamService) { } + constructor() { } ngOnInit() { if (this.queryModel.filters?.length>0) { - this.formattedFilters = this.queryModel.filters.map(filter => { - return {name: filter.fieldName, formattedData: searchFilterDataToParam(filter)} - }); + this.formattedFilters = this.queryModel.activeFilters.map(filter => ({ + name: filter.corpusField.name, + formattedData: filter.dataToString(filter.currentData) + })); } } diff --git a/frontend/src/app/history/search-history/search-history.component.html b/frontend/src/app/history/search-history/search-history.component.html index e6a731dd5..85efdf7f4 100644 --- a/frontend/src/app/history/search-history/search-history.component.html +++ b/frontend/src/app/history/search-history/search-history.component.html @@ -27,8 +27,8 @@ {{query.started | date:'medium'}} - {{query.query_json | formatQueryText }} - + {{query.queryModel | formatQueryText }} + {{query.total_results}} {{corpusTitle(query.corpus)}} diff --git a/frontend/src/app/history/search-history/search-history.component.ts 
b/frontend/src/app/history/search-history/search-history.component.ts index e9eb43398..7e6addc15 100644 --- a/frontend/src/app/history/search-history/search-history.component.ts +++ b/frontend/src/app/history/search-history/search-history.component.ts @@ -1,9 +1,11 @@ import { Component, OnInit } from '@angular/core'; import { Router } from '@angular/router'; import * as _ from 'lodash'; +import { esQueryToQueryModel } from '../../utils/es-query'; import { QueryDb } from '../../models/index'; -import { CorpusService, SearchService, QueryService, ParamService } from '../../services/index'; +import { CorpusService, QueryService } from '../../services/index'; import { HistoryDirective } from '../history.directive'; +import { findByName } from '../../utils/utils'; @Component({ selector: 'search-history', @@ -14,10 +16,9 @@ export class SearchHistoryComponent extends HistoryDirective implements OnInit { public queries: QueryDb[]; public displayCorpora = false; constructor( - private paramService: ParamService, corpusService: CorpusService, private queryService: QueryService, - private router: Router + private router: Router, ) { super(corpusService); } @@ -28,16 +29,21 @@ export class SearchHistoryComponent extends HistoryDirective implements OnInit { searchHistory => { const sortedQueries = this.sortByDate(searchHistory); // not using _.sortedUniqBy as sorting and filtering takes place w/ different aspects - this.queries = _.uniqBy(sortedQueries, query => query.query_json); + this.queries = _.uniqBy(sortedQueries, query => query.query_json).map(this.addQueryModel.bind(this)); }); } + addQueryModel(query?: QueryDb) { + const corpus = findByName(this.corpora, query.corpus); + query.queryModel = esQueryToQueryModel(query.query_json, corpus); + return query; + } + returnToSavedQuery(query: QueryDb) { - const route = this.paramService.queryModelToRoute(query.query_json); - this.router.navigate(['/search', query.corpus, route]); + this.router.navigate(['/search', 
query.corpus], + {queryParams: query.queryModel.toQueryParams()}); if (window) { window.scrollTo(0, 0); } } - } diff --git a/frontend/src/app/models/corpus.ts b/frontend/src/app/models/corpus.ts index 3c3c361dd..9dfb5c52c 100644 --- a/frontend/src/app/models/corpus.ts +++ b/frontend/src/app/models/corpus.ts @@ -1,4 +1,6 @@ -import { SearchFilter, SearchFilterData } from './search-filter'; +import * as _ from 'lodash'; +import { AdHocFilter, BooleanFilter, DateFilter, MultipleChoiceFilter, RangeFilter, SearchFilter } from './search-filter'; +import { FilterOptions } from './search-filter-options'; // corresponds to the corpus definition on the backend. export class Corpus implements ElasticSearchIndex { @@ -24,10 +26,23 @@ export class Corpus implements ElasticSearchIndex { public scan_image_type: string, public allow_image_download: boolean, public word_models_present: boolean, + public languages: string[], + public category: string, public descriptionpage?: string, public documentContext?: DocumentContext, ) { } + get minYear(): number { + return this.minDate.getFullYear(); + } + + get maxYear(): number { + return this.maxDate.getFullYear(); + } + + get displayLanguages(): string { + return this.languages.join(', '); // may have to truncate long lists? 
+ } } export interface ElasticSearchIndex { @@ -43,14 +58,38 @@ export interface DocumentContext { displayName: string; } -export interface CorpusField { +export type FieldDisplayType = 'text_content' | 'px' | 'keyword' | 'integer' | 'text' | 'date' | 'boolean'; + +/** Corpus field info as sent by the backend api */ +export interface ApiCorpusField { + name: string; + display_name: string; + display_type: FieldDisplayType; + description: string; + search_filter: FilterOptions | null; + results_overview: boolean; + csv_core: boolean; + search_field_core: boolean; + visualizations: string[]; + visualization_sort: string | null; + es_mapping: any; + indexed: boolean; + hidden: boolean; + required: boolean; + sortable: boolean; + primary_sort: boolean; + searchable: boolean; + downloadable: boolean; +} + +export class CorpusField { description: string; displayName: string; /** * How the field value should be displayed. * text_content: Main text content of the document */ - displayType: 'text_content' | 'px' | 'keyword' | 'integer' | 'text' | 'date' | 'boolean'; + displayType: FieldDisplayType; resultsOverview?: boolean; csvCore?: boolean; searchFieldCore?: boolean; @@ -63,6 +102,44 @@ export interface CorpusField { searchable: boolean; downloadable: boolean; name: string; - searchFilter: SearchFilter | null; + filterOptions: FilterOptions; mappingType: 'text' | 'keyword' | 'boolean' | 'date' | 'integer' | null; + + constructor(data: ApiCorpusField) { + this.description = data.description; + this.displayName = data.display_name || data.name; + this.displayType = data.display_type || data['es_mapping']?.type; + this.resultsOverview = data.results_overview; + this.csvCore = data.csv_core; + this.searchFieldCore = data.search_field_core; + this.visualizations = data.visualizations; + this.visualizationSort = data.visualization_sort; + this.multiFields = data['es_mapping']?.fields + ? 
Object.keys(data['es_mapping'].fields) + : undefined; + this.hidden = data.hidden; + this.sortable = data.sortable; + this.primarySort = data.primary_sort; + this.searchable = data.searchable; + this.downloadable = data.downloadable; + this.name = data.name; + this.filterOptions = data['search_filter']; + this.mappingType = data.es_mapping?.type; + } + + /** make a SearchFilter for this field */ + makeSearchFilter(): SearchFilter { + const filterClasses = { + DateFilter, + MultipleChoiceFilter, + BooleanFilter, + RangeFilter, + }; + const Filter = _.get( + filterClasses, + this.filterOptions?.name, + AdHocFilter + ); + return new Filter(this); + } } diff --git a/frontend/src/app/models/elasticsearch.ts b/frontend/src/app/models/elasticsearch.ts index c1c94b9cf..697fb6060 100644 --- a/frontend/src/app/models/elasticsearch.ts +++ b/frontend/src/app/models/elasticsearch.ts @@ -45,7 +45,7 @@ export interface BooleanQuery { } export interface MatchAll { - match_all: {}; + match_all: Record; } export interface SimpleQueryString { diff --git a/frontend/src/app/models/index.ts b/frontend/src/app/models/index.ts index 378c9acca..764f71ba5 100644 --- a/frontend/src/app/models/index.ts +++ b/frontend/src/app/models/index.ts @@ -2,8 +2,9 @@ export * from './corpus'; export * from './found-document'; export * from './query'; export * from './search-filter'; +export * from './search-filter-options'; export * from './search-results'; -export * from './sort-event'; +export * from './sort'; export * from './user'; export * from './user-role'; export * from './visualization'; diff --git a/frontend/src/app/models/query.spec.ts b/frontend/src/app/models/query.spec.ts new file mode 100644 index 000000000..390f81b3f --- /dev/null +++ b/frontend/src/app/models/query.spec.ts @@ -0,0 +1,192 @@ +import { mockField2, mockFieldDate, mockFieldMultipleChoice } from '../../mock-data/corpus'; +import { Corpus, } from './corpus'; +import { QueryModel } from './query'; +import { DateFilter, 
MultipleChoiceFilter, SearchFilter } from './search-filter'; +import { convertToParamMap } from '@angular/router'; + +const corpus: Corpus = { + name: 'mock-corpus', + title: 'Mock Corpus', + serverName: 'default', + description: '', + index: 'mock-corpus', + minDate: new Date('1800-01-01'), + maxDate: new Date('1900-01-01'), + image: '', + scan_image_type: null, + allow_image_download: true, + word_models_present: false, + fields: [ + mockField2, + mockFieldDate, + mockFieldMultipleChoice, + ], + languages: ['English'], + category: 'Tests', +} as Corpus; + +describe('QueryModel', () => { + let query: QueryModel; + let filter: SearchFilter; + let filter2: SearchFilter; + + const someDate = new Date('Jan 1 1850'); + const someSelection = ['hooray!']; + + beforeEach(() => { + query = new QueryModel(corpus); + + filter = query.filterForField(mockFieldDate); + filter2 = query.filterForField(mockFieldMultipleChoice); + }); + + it('should create', () => { + expect(query).toBeTruthy(); + }); + + it('should signal updates', () => { + let updates = 0; + query.update.subscribe(() => updates += 1); + + query.setQueryText('test'); + expect(updates).toBe(1); + + filter.setToValue(someDate); + expect(updates).toBe(2); + + filter.deactivate(); + expect(updates).toBe(3); + }); + + it('should remove filters', () => { + let updates = 0; + query.update.subscribe(() => updates += 1); + + filter.setToValue(someDate); + filter2.setToValue(someSelection); + + expect(query.activeFilters.length).toBe(2); + expect(updates).toBe(2); + + filter.setToValue(new Date('Jan 1 1860')); + + expect(query.activeFilters.length).toBe(2); + expect(updates).toBe(3); + + filter.deactivate(); + + expect(query.activeFilters.length).toBe(1); + expect(updates).toBe(4); + + filter.setToValue(new Date('Jan 1 1870')); + + expect(query.activeFilters.length).toBe(2); + expect(updates).toBe(5); + }); + + it('should convert to an elasticsearch query', () => { + expect(query.toEsQuery()).toEqual({ + query: { + 
match_all: {} + } + }); + + query.setQueryText('test'); + + expect(query.toEsQuery()).toEqual({ + query: { + bool: { + must: { + simple_query_string: { + query: 'test', + lenient: true, + default_operator: 'or', + } + }, + filter: [] + } + } + }); + }); + + it('should formulate parameters', () => { + expect(query.toRouteParam()).toEqual({ + query: null, + fields: null, + speech: null, + date: null, + greater_field: null, + sort: null, + highlight: null + }); + + query.setQueryText('test'); + + expect(query.toRouteParam()).toEqual({ + query: 'test', + fields: null, + speech: null, + date: null, + greater_field: null, + sort: null, + highlight: null, + }); + + filter.setToValue(someDate); + + expect(query.toRouteParam()).toEqual({ + query: 'test', + fields: null, + speech: null, + date: '1850-01-01:1850-01-01', + greater_field: null, + sort: null, + highlight: null, + }); + + query.setQueryText(''); + filter.deactivate(); + + expect(query.toRouteParam()).toEqual({ + query: null, + fields: null, + speech: null, + date: null, + greater_field: null, + sort: null, + highlight: null + }); + }); + + it('should set from parameters', () => { + const params = convertToParamMap({ + query: 'test', + date: '1850-01-01:1850-01-01', + }); + + const newQuery = new QueryModel(corpus, params); + expect(newQuery.queryText).toEqual('test'); + expect(newQuery.activeFilters.length).toBe(1); + }); + + it('should formulate a link', () => { + query.setQueryText('test'); + filter.setToValue(someDate); + + expect(query.toQueryParams()).toEqual({ query: 'test', date: '1850-01-01:1850-01-01' }); + }); + + it('should clone', () => { + query.setQueryText('test'); + filter.setToValue(someDate); + + const clone = query.clone(); + + query.setQueryText('different test'); + expect(clone.queryText).toEqual('test'); + + filter.setToValue(new Date('Jan 2 1850')); + expect(query.filterForField(mockFieldDate).currentData.min).toEqual(new Date('Jan 2 1850')); + 
expect(clone.filterForField(mockFieldDate).currentData.min).toEqual(new Date('Jan 1 1850')); + }); +}); diff --git a/frontend/src/app/models/query.ts b/frontend/src/app/models/query.ts index a1323c953..361ad1bfe 100644 --- a/frontend/src/app/models/query.ts +++ b/frontend/src/app/models/query.ts @@ -1,10 +1,19 @@ -import {SearchFilter } from '../models/index'; -import { SearchFilterData } from './search-filter'; +import { convertToParamMap, ParamMap } from '@angular/router'; +import * as _ from 'lodash'; +import { combineLatest, Subject } from 'rxjs'; +import { Corpus, CorpusField, EsFilter, SortBy, SortConfiguration, SortDirection, } from '../models/index'; +import { EsQuery } from '../services'; +import { combineSearchClauseAndFilters, makeHighlightSpecification } from '../utils/es-query'; +import { + filtersFromParams, highlightFromParams, omitNullParameters, queryFiltersToParams, + queryFromParams, searchFieldsFromParams +} from '../utils/params'; +import { SearchFilter } from './search-filter'; /** This is the query object as it is saved in the database.*/ export class QueryDb { constructor( - query: QueryModel, + esQuery: EsQuery, /** * Name of the corpus for which the query was performed. */ @@ -14,7 +23,7 @@ export class QueryDb { * User that performed this query. */ public user: number) { - this.query_json = query; + this.query_json = esQuery; } /** @@ -25,7 +34,8 @@ export class QueryDb { /** * JSON string representing the query model (i.e., query text and filters, see below). */ - public query_json: QueryModel; + public query_json: EsQuery; + queryModel?: QueryModel; /** * Time the first document was sent. 
@@ -53,18 +63,147 @@ export class QueryDb { public total_results: number; } -/** This is the client's representation of the query by the user, shared between components */ -export interface QueryModel { - queryText: string; - fields?: string[]; - filters?: SearchFilter[]; - sortBy?: string; - sortAscending?: boolean; - highlight?: number; -} - /** These are the from / size parameters emitted by the pagination component */ export interface SearchParameters { from: number; size: number; } + + +export class QueryModel { + corpus: Corpus; + queryText: string; + searchFields: CorpusField[]; + filters: SearchFilter[]; + sort: SortConfiguration; + highlightSize: number; + + update = new Subject(); + + constructor(corpus: Corpus, params?: ParamMap) { + this.corpus = corpus; + this.filters = this.corpus.fields.map(field => field.makeSearchFilter()); + this.sort = new SortConfiguration(this.corpus); + if (params) { + this.setFromParams(params); + } + this.subscribeToFilterUpdates(); + } + + get activeFilters() { + return this.filters.filter(f => f.active.value); + } + + setQueryText(text?: string) { + this.queryText = text || undefined; + this.update.next(); + } + + addFilter(filter: SearchFilter) { + this.filterForField(filter.corpusField).set(filter.currentData); + } + + + setSortBy(value: SortBy) { + this.sort.setSortBy(value); + this.update.next(); + } + + setSortDirection(value: SortDirection) { + this.sort.setSortDirection(value); + this.update.next(); + } + + + removeFilter(filter: SearchFilter) { + this.deactivateFiltersForField(filter.corpusField); + } + + /** get an active search filter on this query for the field (undefined if none exists) */ + filterForField(field: CorpusField): SearchFilter { + return this.filters.find(filter => filter.corpusField.name === field.name); + } + + /** remove all filters that apply to a corpus field */ + deactivateFiltersForField(field: CorpusField) { + this.filters.filter(filter => + filter.corpusField.name === field.name + 
).forEach(filter => + filter.deactivate() + ); + } + + setHighlight(size?: number) { + this.highlightSize = size; + this.update.next(); + } + + /** + * make a clone of the current query. + */ + clone() { + return new QueryModel(this.corpus, convertToParamMap(this.toQueryParams())); + } + + /** + * convert the query to a parameter map + * + * All query-related params are explicity listed; + * empty parameters have value null. + */ + toRouteParam(): {[param: string]: string|null} { + const queryTextParams = { query: this.queryText || null }; + const searchFieldsParams = { fields: this.searchFields?.map(f => f.name).join(',') || null}; + const sortParams = this.sort.toRouteParam(); + const highlightParams = { highlight: this.highlightSize || null }; + const filterParams = queryFiltersToParams(this); + + return { + ...queryTextParams, + ...searchFieldsParams, + ...filterParams, + ...sortParams, + ...highlightParams, + }; + } + + /** + * convert the query to a a parameter map, only + * including properties that should actually be explicated + * in the route. Same as query.toRouteParam() but + * without null values. 
+ */ + toQueryParams(): {[param: string]: string} { + return omitNullParameters(this.toRouteParam()); + } + + /** convert the query to an elasticsearch query */ + toEsQuery(): EsQuery { + const filters = this.activeFilters.map(filter => filter.toEsFilter()) as EsFilter[]; + const query = combineSearchClauseAndFilters(this.queryText, filters, this.searchFields); + + const sort = this.sort.toEsQuerySort(); + const highlight = makeHighlightSpecification(this.corpus, this.queryText, this.highlightSize); + + return { + ...query, ...sort, ...highlight + }; + } + + /** set the query values from a parameter map */ + private setFromParams(params: ParamMap) { + this.queryText = queryFromParams(params); + this.searchFields = searchFieldsFromParams(params, this.corpus); + filtersFromParams(params, this.corpus).forEach(filter => { + this.filterForField(filter.corpusField).set(filter.data.value); + }); + this.sort = new SortConfiguration(this.corpus, params); + this.highlightSize = highlightFromParams(params); + } + + private subscribeToFilterUpdates() { + this.filters.forEach(filter => { + filter.update.subscribe(() => this.update.next()); + }); + } +} diff --git a/frontend/src/app/models/search-filter-options.ts b/frontend/src/app/models/search-filter-options.ts new file mode 100644 index 000000000..5cc4a4343 --- /dev/null +++ b/frontend/src/app/models/search-filter-options.ts @@ -0,0 +1,35 @@ +// Types for serialised filter options for a corpus by the API + +export type SearchFilterType = 'DateFilter' | 'MultipleChoiceFilter' | 'RangeFilter' | 'BooleanFilter'; + +export interface HasDescription { + description: string; +} + +export type DateFilterOptions = { + name: 'DateFilter'; + lower: string; + upper: string; +} & HasDescription; + +export type MultipleChoiceFilterOptions = { + name: 'MultipleChoiceFilter'; + option_count: number; +} & HasDescription; + +export type RangeFilterOptions = { + name: 'RangeFilter'; + lower: number; + upper: number; +} & HasDescription; + 
+export type BooleanFilterOptions = { + name: 'BooleanFilter'; + checked: boolean; +} & HasDescription; + +export type FilterOptions = + DateFilterOptions | + MultipleChoiceFilterOptions | + RangeFilterOptions | + BooleanFilterOptions; diff --git a/frontend/src/app/models/search-filter.spec.ts b/frontend/src/app/models/search-filter.spec.ts new file mode 100644 index 000000000..714cc3172 --- /dev/null +++ b/frontend/src/app/models/search-filter.spec.ts @@ -0,0 +1,196 @@ +import { convertToParamMap } from '@angular/router'; +import { mockFieldMultipleChoice, mockFieldDate } from '../../mock-data/corpus'; +import { EsDateFilter, EsTermsFilter } from './elasticsearch'; +import { DateFilter, DateFilterData, MultipleChoiceFilter } from './search-filter'; +import { of } from 'rxjs'; +import { distinct } from 'rxjs/operators'; + +describe('SearchFilter', () => { + // while these tests are ran on the DateFilter, + // they test logic implemented in the abstract + // SearchFilter class + + const field = mockFieldDate; + let filter: DateFilter; + const exampleData: DateFilterData = { + min: new Date(Date.parse('Jan 01 1850')), + max: new Date(Date.parse('Dec 31 1860')) + }; + const isActive = () => filter.active.value; + + beforeEach(() => { + filter = new DateFilter(field); + }); + + it('should toggle', () => { + expect(isActive()).toBeFalse(); + + filter.toggle(); + expect(isActive()).toBeTrue(); + filter.toggle(); + expect(isActive()).toBeFalse(); + + filter.deactivate(); + expect(isActive()).toBeFalse(); + filter.activate(); + expect(isActive()).toBeTrue(); + filter.deactivate(); + expect(isActive()).toBeFalse(); + }); + + it('should activate when value is set to non-default', () => { + expect(isActive()).toBeFalse(); + + filter.set(filter.defaultData); + expect(isActive()).toBeFalse(); + + filter.set(exampleData); + expect(isActive()).toBeTrue(); + }); + + it('should deactivate when reset', () => { + filter.set(exampleData); + expect(isActive()).toBeTrue(); + + 
filter.reset(); + expect(isActive()).toBeFalse(); + }); + + it('should set from parameters', () => { + filter.setFromParams(convertToParamMap({ + date: '1850-01-01:1860-01-01' + })); + + expect(filter.active.value).toBeTrue(); + + filter.setFromParams(convertToParamMap({ + query: 'test' + })); + + expect(filter.active.value).toBeFalse(); + }); + + it('should signal updates', () => { + let updates = 0; + filter.update.subscribe(() => updates += 1); + + filter.set(exampleData); + expect(updates).toBe(1); + + filter.deactivate(); + expect(updates).toBe(2); + + filter.reset(); // this does not affect anything since the filter is inactive + expect(updates).toBe(2); + + }); +}); + +describe('DateFilter', () => { + const field = mockFieldDate; + let filter: DateFilter; + const exampleData = { + min: new Date(Date.parse('Jan 01 1850')), + max: new Date(Date.parse('Dec 31 1860')) + }; + + beforeEach(() => { + filter = new DateFilter(field); + }); + + it('should create', () => { + expect(filter).toBeTruthy(); + expect(filter.currentData).toEqual(filter.defaultData); + expect(filter.currentData).toEqual({ + min: new Date(Date.parse('Jan 01 1800')), + max: new Date(Date.parse('Dec 31 1899')) + }); + }); + + it('should convert to string', () => { + expect(filter.dataFromString(filter.dataToString(filter.currentData))) + .toEqual(filter.currentData); + }); + + it('should set data from a value', () => { + const date = new Date(Date.parse('Jan 01 1850')); + filter.setToValue(date); + expect(filter.currentData).toEqual({ + min: date, + max: date, + }); + }); + + it('should convert to an elasticsearch filter', () => { + filter.set(exampleData); + const esFilter = filter.toEsFilter(); + expect(esFilter).toEqual({ + range: { + date: { + gte: '1850-01-01', + lte: '1860-12-31', + format: 'yyyy-MM-dd' + } + } + }); + }); + + it('should parse an elasticsearch filter', () => { + filter.set(exampleData); + const esFilter = filter.toEsFilter(); + 
expect(filter.dataFromEsFilter(esFilter)).toEqual(filter.currentData); + }); +}); + +describe('MultipleChoiceFilter', () => { + const field = mockFieldMultipleChoice; + let filter: MultipleChoiceFilter; + const exampleData = ['test']; + + beforeEach(() => { + filter = new MultipleChoiceFilter(field); + }); + + it('should create', () => { + expect(filter).toBeTruthy(); + expect(filter.currentData).toEqual(filter.defaultData); + expect(filter.currentData).toEqual([]); + }); + + it('should convert to a string', () => { + expect(filter.dataFromString(filter.dataToString(filter.currentData))) + .toEqual(filter.currentData); + + // non-empty value + filter.set(['a', 'b', 'value with spaces']); + expect(filter.dataFromString(filter.dataToString(filter.currentData))) + .toEqual(filter.currentData); + }); + + it('should convert values to valid URI components', () => { + filter.set(['a long value']); + expect(filter.dataToString(filter.currentData)).not.toContain(' '); + }); + + it('should set data from a value', () => { + const value = 'a great value'; + filter.setToValue(value); + expect(filter.currentData).toEqual([value]); + }); + + it('should convert to an elasticsearch filter', () => { + filter.set(['wow!', 'a great selection!']); + const esFilter = filter.toEsFilter(); + expect(esFilter).toEqual({ + terms: { + greater_field: ['wow!', 'a great selection!'] + } + }); + }); + + it('should parse an elasticsearch filter', () => { + filter.set(exampleData); + const esFilter = filter.toEsFilter(); + expect(filter.dataFromEsFilter(esFilter)).toEqual(filter.currentData); + }); +}); diff --git a/frontend/src/app/models/search-filter.ts b/frontend/src/app/models/search-filter.ts index ce57a82a1..79c5c579b 100644 --- a/frontend/src/app/models/search-filter.ts +++ b/frontend/src/app/models/search-filter.ts @@ -1,79 +1,348 @@ +import * as _ from 'lodash'; +import * as moment from 'moment'; +import { BehaviorSubject, Observable, Subject, combineLatest } from 'rxjs'; +import { 
distinct, map } from 'rxjs/operators'; import { CorpusField } from './corpus'; +import { EsBooleanFilter, EsDateFilter, EsFilter, EsTermsFilter, EsRangeFilter, EsTermFilter } from './elasticsearch'; +import { BooleanFilterOptions, DateFilterOptions, FilterOptions, MultipleChoiceFilterOptions, + RangeFilterOptions } from './search-filter-options'; +import { ParamMap } from '@angular/router'; -export interface SearchFilter { - fieldName: string; - description: string; - useAsFilter: boolean; - reset?: boolean; - grayedOut?: boolean; - adHoc?: boolean; - defaultData?: T; - currentData: T; -}; +abstract class AbstractSearchFilter { + corpusField: CorpusField; + defaultData: FilterData; + data: BehaviorSubject; + active: BehaviorSubject; + + update = new Subject(); + + constructor(corpusField: CorpusField) { + this.corpusField = corpusField; + this.defaultData = this.makeDefaultData(corpusField.filterOptions); + this.data = new BehaviorSubject(this.defaultData); + this.active = new BehaviorSubject(false); + } + + get filterType() { + return this.corpusField.filterOptions?.name; + } + + get currentData() { + return this.data?.value; + } + + get isDefault$(): Observable { + return this.data.asObservable().pipe( + map(data => _.isEqual(data, this.defaultData)) + ); + } + + + get adHoc() { + return !(this.corpusField.filterOptions); + } + + get description() { + if (this.corpusField?.filterOptions?.description) { + return this.corpusField.filterOptions.description; + } else { + return `Filter results based on ${this.corpusField.displayName}`; + } + } + + set(data: FilterData) { + if (!_.isEqual(data, this.currentData)) { + this.data.next(data); + + const active = this.active.value; + const toDefault = _.isEqual(data, this.defaultData); + const deactivate = active && toDefault; + const activate = !active && !toDefault; + + if (deactivate || activate) { + this.toggle(); + } else if (active) { + this.update.next(); + } + } + } + + reset() { + this.set(this.defaultData); + } + 
+ /** + * set value based on route parameter + */ + setFromParams(params: ParamMap): void { + const value = params.get(this.corpusField.name); + if (value) { + this.set(this.dataFromString(value)); + } else { + this.reset(); + } + } + + /** + * filter for one specific value (used to find documents from + * the same day, page, publication, etc. as a specific document) + */ + setToValue(value: any) { + this.set(this.dataFromValue(value)); + } + + toRouteParam(): {[param: string]: any} { + const value = this.active.value ? this.dataToString(this.currentData) : undefined; + return { + [this.corpusField.name]: value || null + }; + } + + toEsFilter(): EsFilterType { + if (this.active.value) { + return this.dataToEsFilter(); + } + } + + public activate() { + if (!this.active.value) { + this.toggle(); + } + } -export type SearchFilterData = BooleanFilterData | MultipleChoiceFilterData | RangeFilterData | DateFilterData; + public deactivate() { + if (this.active.value) { + this.toggle(); + } + } + + public toggle() { + this.active.next(!this.active.value); + this.update.next(); + } + + abstract makeDefaultData(filterOptions: FilterOptions): FilterData; + + abstract dataFromValue(value: any): FilterData; + + abstract dataFromString(value: string): FilterData; + + abstract dataToString(data: FilterData): string; + + /** + * export data as filter specification in elasticsearch query language + */ + abstract dataToEsFilter(): EsFilterType; + + abstract dataFromEsFilter(esFilter: EsFilterType): FilterData; -export interface BooleanFilterData { - filterType: 'BooleanFilter'; - checked: boolean; } -export interface MultipleChoiceFilterData { - filterType: 'MultipleChoiceFilter'; - optionCount?: number; - selected: string[]; + +export interface DateFilterData { + min: Date; + max: Date; +} + +export class DateFilter extends AbstractSearchFilter { + makeDefaultData(filterOptions: DateFilterOptions) { + return { + min: this.parseDate(filterOptions.lower), + max: 
this.parseDate(filterOptions.upper) + }; + } + + dataFromValue(value: Date) { + return { + min: value, + max: value, + }; + } + + dataFromString(value: string) { + const [minString, maxString] = parseMinMax(value.split(',')); + return { + min: this.parseDate(minString), + max: this.parseDate(maxString), + }; + } + + dataToString(data: DateFilterData) { + const min = this.formatDate(data.min); + const max = this.formatDate(data.max); + return `${min}:${max}`; + } + + dataToEsFilter(): EsDateFilter { + return { + range: { + [this.corpusField.name]: { + gte: this.formatDate(this.currentData.min), + lte: this.formatDate(this.currentData.max), + format: 'yyyy-MM-dd' + } + } + }; + } + + dataFromEsFilter(esFilter: EsDateFilter): DateFilterData { + const data = _.first(_.values(esFilter.range)); + const min = this.parseDate(data.gte); + const max = this.parseDate(data.lte); + return { min, max }; + } + + private formatDate(date: Date): string { + return moment(date).format('YYYY-MM-DD'); + } + + private parseDate(dateString: string): Date { + return moment(dateString, 'YYYY-MM-DD').toDate(); + } } + +export class BooleanFilter extends AbstractSearchFilter { + + makeDefaultData(filterOptions: BooleanFilterOptions) { + return false; + } + + dataFromValue(value: any): boolean { + return value as boolean; + } + + dataFromString(value: string): boolean { + return value === 'true'; + } + + dataToString(data: boolean): string { + return data.toString(); + } + + dataToEsFilter(): EsBooleanFilter { + return { + term: { + [this.corpusField.name]: this.currentData + } + }; + } + + dataFromEsFilter(esFilter: EsBooleanFilter): boolean { + const data = _.first(_.values(esFilter.term)); + return data; + } +} + +type MultipleChoiceFilterData = string[]; + +export class MultipleChoiceFilter extends AbstractSearchFilter { + makeDefaultData(filterOptions: MultipleChoiceFilterOptions): MultipleChoiceFilterData { + return []; + } + + dataFromValue(value: any): MultipleChoiceFilterData { + 
return [value.toString()]; + } + + dataFromString(value: string): MultipleChoiceFilterData { + if (value.length) { + return value.split(',').map(decodeURIComponent); + } + return []; + } + + dataToString(data: MultipleChoiceFilterData): string { + return data.map(encodeURIComponent).join(','); + } + + dataToEsFilter(): EsTermsFilter { + return { + terms: { + [this.corpusField.name]: this.currentData + } + }; + } + + dataFromEsFilter(esFilter: EsTermsFilter): MultipleChoiceFilterData { + return _.first(_.values(esFilter.terms)); + } +} + export interface RangeFilterData { - filterType: 'RangeFilter'; min: number; max: number; } -export interface DateFilterData { - filterType: 'DateFilter'; - /** minimum of date range, format: yyyy-MM-dd */ - min: string; - /** maximum of date range, format: yyyy-MM-dd */ - max: string; + +export class RangeFilter extends AbstractSearchFilter { + makeDefaultData(filterOptions: RangeFilterOptions): RangeFilterData { + return { + min: filterOptions.lower, + max: filterOptions.upper + }; + } + + dataFromValue(value: number): RangeFilterData { + return { min: value, max: value }; + } + + dataFromString(value: string): RangeFilterData { + const [minString, maxString] = parseMinMax(value.split(',')); + return { + min: parseFloat(minString), + max: parseFloat(maxString) + }; + } + + dataToString(data: RangeFilterData): string { + return `${data.min},${data.max}`; + } + + dataToEsFilter(): EsRangeFilter { + return { + range: { + [this.corpusField.name]: { + gte: this.currentData.min, + lte: this.currentData.max, + } + } + }; + } + + dataFromEsFilter(esFilter: EsRangeFilter): RangeFilterData { + const data = _.first(_.values(esFilter.range)); + const min = data.gte; + const max = data.lte; + return { min, max }; + } } -export type SearchFilterType = SearchFilterData['filterType']; - -export function searchFilterDataFromSettings(filterType: SearchFilterType|undefined, value: string[], field: CorpusField): SearchFilterData { - switch 
(filterType) { - case 'BooleanFilter': - return { filterType, checked: value[0] === 'true' }; - case 'MultipleChoiceFilter': - return { filterType, selected: value }; - case 'RangeFilter': { - const [min, max] = parseMinMax(value); - return { filterType, min: parseFloat(min), max: parseFloat(max) }; - } - case 'DateFilter': { - const [min, max] = parseMinMax(value); - return { filterType, min, max }; - } - case undefined: { - return searchFilterDataFromField(field, value); - } +export class AdHocFilter extends AbstractSearchFilter { + makeDefaultData(filterOptions: FilterOptions) {} + + dataFromValue(value: any) { + return value; } -}; -export const searchFilterDataFromField = (field: CorpusField, value: string[]): SearchFilterData => { - switch (field.mappingType) { - case 'boolean': - return { filterType: 'BooleanFilter', checked: value[0] === 'true' }; - case 'date': { - const [min, max] = parseMinMax(value); - return { filterType: 'DateFilter', min, max }; - } - case 'integer': { - const [min, max] = parseMinMax(value); - return { filterType: 'RangeFilter', min: parseFloat(min), max: parseFloat(max) }; - } - case 'keyword': { - return { filterType: 'MultipleChoiceFilter', selected: value.map(encodeURIComponent) }; - } + dataFromString(value: string) { + return value; + } + + dataToString(data: any): string { + return data.toString(); + } + + dataToEsFilter(): EsTermFilter { + return { + term: { + [this.corpusField.name]: this.currentData + } + }; } -}; + + dataFromEsFilter(esFilter: EsTermFilter): string { + const data = _.first(_.values(esFilter.term)); + return data; + } +} const parseMinMax = (value: string[]): [string, string] => { const term = value[0]; @@ -86,13 +355,4 @@ const parseMinMax = (value: string[]): [string, string] => { } }; -export function contextFilterFromField(field: CorpusField, value?: string): SearchFilter { - const currentValue = value ? 
searchFilterDataFromField(field, [value]) : undefined; - return { - fieldName: field.name, - description: `Search only within this ${field.displayName}`, - useAsFilter: true, - adHoc: true, - currentData: currentValue - }; -} +export type SearchFilter = DateFilter | MultipleChoiceFilter | RangeFilter | BooleanFilter | AdHocFilter; diff --git a/frontend/src/app/models/search-results.ts b/frontend/src/app/models/search-results.ts index 06dbbe224..1b61b2fe7 100644 --- a/frontend/src/app/models/search-results.ts +++ b/frontend/src/app/models/search-results.ts @@ -56,7 +56,6 @@ export interface WordSimilarity { }; export interface RelatedWordsResults { - total_similarities: WordSimilarity[]; similarities_over_time: WordSimilarity[]; time_points: string[]; similarities_over_time_local_top_n: WordSimilarity[][]; diff --git a/frontend/src/app/models/sort-event.ts b/frontend/src/app/models/sort-event.ts deleted file mode 100644 index af3a5be7c..000000000 --- a/frontend/src/app/models/sort-event.ts +++ /dev/null @@ -1,6 +0,0 @@ -import { CorpusField } from './corpus'; - -export interface SortEvent { - ascending: boolean; - field: CorpusField | undefined; -} diff --git a/frontend/src/app/models/sort.spec.ts b/frontend/src/app/models/sort.spec.ts new file mode 100644 index 000000000..9f3c13143 --- /dev/null +++ b/frontend/src/app/models/sort.spec.ts @@ -0,0 +1,30 @@ +import { convertToParamMap } from '@angular/router'; +import { mockCorpus3, mockField3 } from '../../mock-data/corpus'; +import { SortConfiguration } from './sort'; + +describe('SortConfiguration', () => { + let sort: SortConfiguration; + + beforeEach(() => { + sort = new SortConfiguration(mockCorpus3); + }); + + it('should set the default state', () => { + expect(sort.sortBy.value).toBe(undefined); + expect(sort.sortDirection.value).toBe('desc'); + expect(sort.isDefault).toBe(true); + }); + + it('should convert to parameters', () => { + sort.setSortBy(mockField3); + sort.setSortDirection('asc'); + + const param = 
sort.toRouteParam(); + + // now initialise from the parameter + sort = new SortConfiguration(mockCorpus3, convertToParamMap(param)); + + expect(sort.sortBy.value).toEqual(mockField3); + expect(sort.sortDirection.value).toBe('asc'); + }); +}); diff --git a/frontend/src/app/models/sort.ts b/frontend/src/app/models/sort.ts new file mode 100644 index 000000000..c0362cee5 --- /dev/null +++ b/frontend/src/app/models/sort.ts @@ -0,0 +1,80 @@ +import { ParamMap } from '@angular/router'; +import { BehaviorSubject, combineLatest } from 'rxjs'; +import { makeSortSpecification } from '../utils/es-query'; +import { sortSettingsToParams } from '../utils/params'; +import { Corpus, CorpusField } from './corpus'; +import * as _ from 'lodash'; +import { findByName } from '../utils/utils'; + +export type SortBy = CorpusField | undefined; +export type SortDirection = 'asc'|'desc'; + +export class SortConfiguration { + sortBy = new BehaviorSubject(undefined); + sortDirection = new BehaviorSubject('desc'); + + configuration$ = combineLatest([this.sortBy, this.sortDirection]); + + private defaultSortBy: SortBy; + private defaultSortDirection: SortDirection = 'desc'; + + constructor(private corpus: Corpus, params?: ParamMap) { + this.defaultSortBy = this.corpus.fields.find(field => field.primarySort); + this.sortBy.next(this.defaultSortBy); + if (params) { + this.setFromParams(params); + } + } + + /** + * Whether the current state is the default sorting state + */ + get isDefault(): boolean { + return _.isEqual(this.sortBy.value, this.defaultSortBy) && this.sortDirection.value === this.defaultSortDirection; + } + + setSortBy(value: SortBy) { + this.sortBy.next(value); + + // sorting by relevance is always descending + if (!value) { + this.sortDirection.next('desc'); + } + } + + setSortDirection(value: SortDirection) { + this.sortDirection.next(value); + } + + reset() { + this.sortBy.next(this.defaultSortBy); + this.sortDirection.next(this.defaultSortDirection); + } + + toRouteParam(): 
{sort: string|null} { + if (this.isDefault) { + return {sort: null}; + } + return sortSettingsToParams(this.sortBy.value, this.sortDirection.value); + } + + /** convert this configuration to the 'sort' part of an elasticsearch query */ + toEsQuerySort(): { sort?: any } { + return makeSortSpecification(this.sortBy.value, this.sortDirection.value); + } + + private setFromParams(params: ParamMap) { + if (params.has('sort')) { + const [sortParam, ascParam] = params.get('sort').split(','); + if ( sortParam === 'relevance' ) { + this.sortBy.next(undefined); + } else { + const field = findByName(this.corpus.fields, sortParam); + this.sortBy.next(field); + } + this.setSortDirection(ascParam as 'asc'|'desc'); + } else { + this.reset(); + } + } +} diff --git a/frontend/src/app/models/visualization.ts b/frontend/src/app/models/visualization.ts index f353a97f9..753d481dd 100644 --- a/frontend/src/app/models/visualization.ts +++ b/frontend/src/app/models/visualization.ts @@ -124,3 +124,7 @@ export const barChartSetNull: Object = { normalize: null, visualizeTerm: null }; + +export interface FieldCoverage { + [field: string]: number; +}; diff --git a/frontend/src/app/search/highlight-selector.component.spec.ts b/frontend/src/app/search/highlight-selector.component.spec.ts index 15b9bf28b..82fd70d69 100644 --- a/frontend/src/app/search/highlight-selector.component.spec.ts +++ b/frontend/src/app/search/highlight-selector.component.spec.ts @@ -1,5 +1,7 @@ import { ComponentFixture, TestBed } from '@angular/core/testing'; +import { mockCorpus2 } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; +import { QueryModel } from '../models'; import { HighlightSelectorComponent } from './highlight-selector.component'; @@ -14,6 +16,7 @@ describe('HighlightSelectorComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(HighlightSelectorComponent); component = fixture.componentInstance; + component.queryModel = new QueryModel(mockCorpus2); 
fixture.detectChanges(); }); diff --git a/frontend/src/app/search/highlight-selector.component.ts b/frontend/src/app/search/highlight-selector.component.ts index 31ccb901b..260bf62e1 100644 --- a/frontend/src/app/search/highlight-selector.component.ts +++ b/frontend/src/app/search/highlight-selector.component.ts @@ -1,8 +1,6 @@ -import { Component } from '@angular/core'; -import { ActivatedRoute, ParamMap, Router } from '@angular/router'; +import { Component, Input, OnChanges, OnDestroy, SimpleChanges } from '@angular/core'; +import { QueryModel } from '../models'; -import { ParamDirective } from '../param/param-directive'; -import { highlightFromParams } from '../utils/params'; const HIGHLIGHT = 200; @@ -11,29 +9,32 @@ const HIGHLIGHT = 200; templateUrl: './highlight-selector.component.html', styleUrls: ['./highlight-selector.component.scss'] }) -export class HighlightSelectorComponent extends ParamDirective { +export class HighlightSelectorComponent implements OnChanges, OnDestroy { + @Input() queryModel: QueryModel; public highlight: number = HIGHLIGHT; - constructor(route: ActivatedRoute, router: Router) { - super(route, router); + constructor() { } - initialize() { - + ngOnChanges(changes: SimpleChanges): void { + if (changes.queryModel) { + this.setStateFromQueryModel(); + this.queryModel.update.subscribe(this.setStateFromQueryModel.bind(this)); + } } - teardown() { - this.setParams({ highlight: null }); + ngOnDestroy(): void { + this.queryModel.setHighlight(undefined); } - setStateFromParams(params: ParamMap) { - this.highlight = highlightFromParams(params); + setStateFromQueryModel() { + this.highlight = this.queryModel.highlightSize; } updateHighlightSize(event) { const highlightSize = event.target.value; - this.setParams({ highlight: highlightSize !== "0" ? 
highlightSize : null }); + this.queryModel.setHighlight(highlightSize); } } diff --git a/frontend/src/app/search/search-results.component.html b/frontend/src/app/search/search-results.component.html index 7e0e2614c..2a42ad4c4 100644 --- a/frontend/src/app/search/search-results.component.html +++ b/frontend/src/app/search/search-results.component.html @@ -23,8 +23,7 @@

Sort By

- - +
@@ -81,27 +80,30 @@

- +
+ iaBalloon="view this document on its own page" autofocus + tabindex="0"> Link   - + @@ -111,7 +113,8 @@

diff --git a/frontend/src/app/search/search-results.component.spec.ts b/frontend/src/app/search/search-results.component.spec.ts index a3c7d8481..d66457861 100644 --- a/frontend/src/app/search/search-results.component.spec.ts +++ b/frontend/src/app/search/search-results.component.spec.ts @@ -1,7 +1,9 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; +import * as _ from 'lodash'; +import { mockCorpus, mockField } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; -import { CorpusField } from '../models/index'; +import { CorpusField, QueryModel } from '../models/index'; import { SearchResultsComponent } from './search-results.component'; @@ -38,38 +40,28 @@ describe('Search Results Component', () => { relation: 'gte' } }; - component.corpus = { - fields - }; + component.corpus = _.merge(mockCorpus, fields); component.fromIndex = 0; component.resultsPerPage = 20; + const query = new QueryModel(component.corpus); + query.setQueryText('wally'); + query.setHighlight(10); + component.queryModel = query; fixture.detectChanges(); }); - function createField(name: string): CorpusField { - return { - name, - displayName: name, - description: 'Description', - displayType: 'text', - searchFilter: null, - hidden: false, - sortable: true, - primarySort: false, - searchable: false, - downloadable: true, - mappingType: 'keyword', - }; - } + const createField = (name: string): CorpusField => { + const field = _.cloneDeep(mockField); + field.name = name; + return field; + }; - function createDocument( + const createDocument = ( fieldValues: { [name: string]: string }, id: string, relevance: number, highlight?: {[fieldName: string]: string[]} - ) { - return { id, relevance, fieldValues, highlight }; - } + ) => ({ id, relevance, fieldValues, highlight }); it('should be created', () => { expect(component).toBeTruthy(); diff --git a/frontend/src/app/search/search-results.component.ts 
b/frontend/src/app/search/search-results.component.ts index 0e272f8a0..65cf78dfd 100644 --- a/frontend/src/app/search/search-results.component.ts +++ b/frontend/src/app/search/search-results.component.ts @@ -1,3 +1,4 @@ +/* eslint-disable @typescript-eslint/member-ordering */ import { Component, ElementRef, EventEmitter, HostListener, Input, OnChanges, Output, ViewChild } from '@angular/core'; import { User, Corpus, SearchParameters, SearchResults, FoundDocument, QueryModel, ResultOverview } from '../models/index'; @@ -5,6 +6,7 @@ import { SearchService } from '../services'; import { ShowError } from '../error/error.component'; import * as _ from 'lodash'; import { faBookOpen, faArrowLeft, faArrowRight, faLink } from '@fortawesome/free-solid-svg-icons'; +import { makeContextParams } from '../utils/document-context'; @Component({ selector: 'ia-search-results', @@ -36,9 +38,6 @@ export class SearchResultsComponent implements OnChanges { @Output('searched') public searchedEvent = new EventEmitter(); - @Output('viewContext') - public contextEvent = new EventEmitter(); - public isLoading = false; public isScrolledDown: boolean; @@ -50,8 +49,6 @@ export class SearchResultsComponent implements OnChanges { public fromIndex = 0; - public queryText: string; - public imgSrc: Uint8Array; /** @@ -77,11 +74,11 @@ export class SearchResultsComponent implements OnChanges { constructor(private searchService: SearchService) { } ngOnChanges() { - if (this.queryModel !== null) { - this.queryText = this.queryModel.queryText; + if (this.queryModel) { this.fromIndex = 0; this.maximumDisplayed = this.user.downloadLimit ? 
this.user.downloadLimit : 10000; this.search(); + this.queryModel.update.subscribe(() => this.search()); } } @@ -96,10 +93,7 @@ export class SearchResultsComponent implements OnChanges { private search() { this.isLoading = true; - this.searchService.search( - this.queryModel, - this.corpus - ).then(results => { + this.searchService.search(this.queryModel).then(results => { this.results = results; this.results.documents.map((d, i) => d.position = i + 1); this.searched(this.queryModel.queryText, this.results.total.value); @@ -120,7 +114,7 @@ export class SearchResultsComponent implements OnChanges { this.isLoading = true; this.fromIndex = searchParameters.from; this.resultsPerPage = searchParameters.size; - this.results = await this.searchService.loadResults(this.corpus, this.queryModel, searchParameters.from, searchParameters.size); + this.results = await this.searchService.loadResults(this.queryModel, searchParameters.from, searchParameters.size); this.results.documents.map( (d, i) => d.position = i + searchParameters.from + 1 ); this.isLoading = false; } @@ -143,11 +137,6 @@ export class SearchResultsComponent implements OnChanges { this.documentTabIndex = 0; } - public goToContext(document: FoundDocument) { - this.showDocument = false; - this.contextEvent.emit(document); - } - get contextDisplayName(): string { if (this.corpus && this.corpus.documentContext) { return this.corpus.documentContext.displayName; @@ -201,4 +190,8 @@ export class SearchResultsComponent implements OnChanges { } return false; } + + contextParams(document: FoundDocument) { + return makeContextParams(document, this.corpus); + } } diff --git a/frontend/src/app/search/search-sorting.component.spec.ts b/frontend/src/app/search/search-sorting.component.spec.ts index 5be0bc405..ae1ebdd95 100644 --- a/frontend/src/app/search/search-sorting.component.spec.ts +++ b/frontend/src/app/search/search-sorting.component.spec.ts @@ -1,6 +1,7 @@ import { ComponentFixture, TestBed, waitForAsync } from 
'@angular/core/testing'; -import { mockField3 } from '../../mock-data/corpus'; +import { mockCorpus } from '../../mock-data/corpus'; import { commonTestBed } from '../common-test-bed'; +import { QueryModel } from '../models'; import { SearchSortingComponent } from './search-sorting.component'; @@ -16,7 +17,7 @@ describe('Search Sorting Component', () => { beforeEach(() => { fixture = TestBed.createComponent(SearchSortingComponent); component = fixture.componentInstance; - component.fields = [mockField3]; + component.queryModel = new QueryModel(mockCorpus); fixture.detectChanges(); }); diff --git a/frontend/src/app/search/search-sorting.component.ts b/frontend/src/app/search/search-sorting.component.ts index cfd0954c2..42a6beaf6 100644 --- a/frontend/src/app/search/search-sorting.component.ts +++ b/frontend/src/app/search/search-sorting.component.ts @@ -1,9 +1,5 @@ -import { Component, Input } from '@angular/core'; -import { ActivatedRoute, ParamMap, Router } from '@angular/router'; -import { CorpusField } from '../models'; -import { ParamDirective } from '../param/param-directive'; -import { sortSettingsFromParams, sortSettingsToParams } from '../utils/params'; -import { sortDirectionFromBoolean } from '../utils/sort'; +import { Component, Input, OnChanges, OnDestroy, SimpleChanges } from '@angular/core'; +import { CorpusField, QueryModel, SortConfiguration } from '../models'; const defaultValueType = 'alpha'; @Component({ @@ -12,46 +8,51 @@ const defaultValueType = 'alpha'; styleUrls: ['./search-sorting.component.scss'], host: { class: 'field has-addons' } }) -export class SearchSortingComponent extends ParamDirective { - @Input() - public set fields(fields: CorpusField[]) { - this.sortableFields = fields.filter(field => field.sortable); - } +export class SearchSortingComponent implements OnChanges, OnDestroy { + @Input() queryModel: QueryModel; - private sortData: { - field: CorpusField - ascending: boolean - } public ascending = true; - public primarySort: 
CorpusField; public sortField: CorpusField; public valueType: 'alpha' | 'numeric' = defaultValueType; public sortableFields: CorpusField[]; public showFields = false; + constructor() {} + + get sortConfiguration(): SortConfiguration { + return this.queryModel.sort; + } + public get sortType(): SortType { return `${this.valueType}${this.ascending ? 'Asc' : 'Desc'}` as SortType; } - initialize() { - this.primarySort = this.sortableFields.find(field => field.primarySort); - this.sortField = this.primarySort; + + ngOnChanges(changes: SimpleChanges): void { + if (changes.queryModel) { + this.setSortableFields(); + this.queryModel.update.subscribe(this.setStateFromQueryModel.bind(this)); + } + } + + ngOnDestroy(): void { + this.sortConfiguration.reset(); } - teardown() { - this.setParams({ sort: null }); + setSortableFields() { + this.sortableFields = this.queryModel.corpus.fields.filter(field => field.sortable); + this.setStateFromQueryModel(); } - setStateFromParams(params: ParamMap) { - this.sortData = sortSettingsFromParams(params, this.sortableFields); - this.sortField = this.sortData.field; - this.ascending = this.sortData.ascending; + setStateFromQueryModel() { + this.sortField = this.sortConfiguration.sortBy.value; + this.ascending = this.sortConfiguration.sortDirection.value === 'asc'; } public toggleSortType() { - this.ascending = !this.ascending; - this.updateSort(); + const direction = this.ascending ? 'desc' : 'asc'; + this.queryModel.setSortDirection(direction); } public toggleShowFields() { @@ -61,17 +62,10 @@ export class SearchSortingComponent extends ParamDirective { public changeField(field: CorpusField | undefined) { if (field === undefined) { this.valueType = defaultValueType; - this.ascending = false; } else { this.valueType = ['integer', 'date', 'boolean'].indexOf(field.displayType) >= 0 ? 
'numeric' : 'alpha'; } - this.sortField = field; - this.updateSort(); - } - - private updateSort() { - const setting = sortSettingsToParams(this.sortField, sortDirectionFromBoolean(this.ascending)); - this.setParams(setting); + this.queryModel.setSortBy(field || undefined); } } diff --git a/frontend/src/app/search/search.component.html b/frontend/src/app/search/search.component.html index 64727bec7..afdb2cf16 100644 --- a/frontend/src/app/search/search.component.html +++ b/frontend/src/app/search/search.component.html @@ -30,7 +30,7 @@

- +
@@ -49,20 +49,22 @@
- +
- - - + + + - +
diff --git a/frontend/src/app/search/search.component.ts b/frontend/src/app/search/search.component.ts index b220b4479..f88137533 100644 --- a/frontend/src/app/search/search.component.ts +++ b/frontend/src/app/search/search.component.ts @@ -1,15 +1,13 @@ -import {Subscription } from 'rxjs'; +import { Subscription } from 'rxjs'; import { Component, ElementRef, ViewChild, HostListener } from '@angular/core'; import { ActivatedRoute, Router, ParamMap } from '@angular/router'; -import * as _ from 'lodash'; -import { Corpus, CorpusField, ResultOverview, QueryModel, User, contextFilterFromField, SearchFilterData, - SearchFilter, - FoundDocument} from '../models/index'; -import { CorpusService, DialogService, ParamService, SearchService } from '../services/index'; +import { Corpus, CorpusField, ResultOverview, QueryModel, User } from '../models/index'; +import { CorpusService, DialogService, } from '../services/index'; import { ParamDirective } from '../param/param-directive'; -import { makeContextParams } from '../utils/document-context'; import { AuthService } from '../services/auth.service'; +import * as _ from 'lodash'; +import { paramsHaveChanged } from '../utils/params'; @Component({ selector: 'ia-search', @@ -33,10 +31,7 @@ export class SearchComponent extends ParamDirective { * Whether the total number of hits exceeds the download limit. */ public hasLimitedResults = false; - /** - * Hide the filters by default, unless an existing search is opened containing filters. 
- */ - public showFilters = true; + public user: User; protected corpusSubscription: Subscription; @@ -56,7 +51,6 @@ export class SearchComponent extends ParamDirective { constructor( private authService: AuthService, private corpusService: CorpusService, - private paramService: ParamService, private dialogService: DialogService, route: ActivatedRoute, router: Router @@ -76,17 +70,14 @@ export class SearchComponent extends ParamDirective { teardown() { this.user = undefined; this.corpusSubscription.unsubscribe(); - this.setParams( {query: null }); } setStateFromParams(params: ParamMap) { - this.queryText = params.get('query'); - const queryModel = this.paramService.queryModelFromParams(params, this.corpus.fields); - if (!_.isEqual(this.queryModel, queryModel)) { - this.queryModel = queryModel; - } this.tabIndex = params.has('visualize') ? 1 : 0; this.showVisualization = params.has('visualize') ? true : false; + if (paramsHaveChanged(this.queryModel, params)) { + this.setQueryModel(false); + } } @HostListener('window:scroll', []) @@ -115,21 +106,12 @@ export class SearchComponent extends ParamDirective { this.dialogService.showManualPage('query'); } - public showCorpusInfo(corpus: Corpus) { - this.dialogService.showDescriptionPage(corpus); - } - public switchTabs(index: number) { this.tabIndex = index; } public search() { - this.setParams({ query: this.queryText }); - } - - public goToContext(document: FoundDocument) { - const params = makeContextParams(document, this.corpus); - this.setParams(params); + this.queryModel.setQueryText(this.queryText); } /** @@ -138,10 +120,20 @@ export class SearchComponent extends ParamDirective { private setCorpus(corpus: Corpus) { if (!this.corpus || this.corpus.name !== corpus.name) { + const reset = !_.isUndefined(this.corpus); this.corpus = corpus; - this.filterFields = this.corpus.fields.filter(field => field.searchFilter); - this.queryModel = {queryText: ''}; + this.setQueryModel(reset); } } + private setQueryModel(reset: 
boolean) { + const params = reset ? undefined : this.route.snapshot.queryParamMap; + const queryModel = new QueryModel(this.corpus, params); + this.queryModel = queryModel; + this.queryText = queryModel.queryText; + this.queryModel.update.subscribe(() => { + this.queryText = this.queryModel.queryText; + this.setParams(this.queryModel.toRouteParam()); + }); + } } diff --git a/frontend/src/app/select-field/select-field.component.html b/frontend/src/app/select-field/select-field.component.html index 7ce497284..078bf63d5 100644 --- a/frontend/src/app/select-field/select-field.component.html +++ b/frontend/src/app/select-field/select-field.component.html @@ -1,4 +1,11 @@ - + {{allVisible? 'Show default fields' : 'Show all fields'}} diff --git a/frontend/src/app/select-field/select-field.component.ts b/frontend/src/app/select-field/select-field.component.ts index 5500cb1ca..5548c7366 100644 --- a/frontend/src/app/select-field/select-field.component.ts +++ b/frontend/src/app/select-field/select-field.component.ts @@ -1,40 +1,37 @@ +/* eslint-disable @typescript-eslint/member-ordering */ import * as _ from 'lodash'; import { Component, EventEmitter, Input, OnChanges, Output } from '@angular/core'; -import { ActivatedRoute, ParamMap, Router } from '@angular/router'; - -import { CorpusField } from '../models/index'; -import { ParamDirective } from '../param/param-directive'; -import { searchFieldsFromParams } from '../utils/params'; +import { CorpusField, QueryModel } from '../models/index'; @Component({ selector: 'ia-select-field', templateUrl: './select-field.component.html', styleUrls: ['./select-field.component.scss'], }) -export class SelectFieldComponent extends ParamDirective implements OnChanges { - @Input() public filterCriterion: string; +export class SelectFieldComponent implements OnChanges { + @Input() queryModel: QueryModel; + @Input() public filterCriterion: 'searchable'|'downloadable'; @Input() public corpusFields: CorpusField[]; - @Output() public 
updatedCorpusFields = new EventEmitter(); + @Output() selection = new EventEmitter(); - // all fields which are searchable + // all fields which are searchable/downloadable private availableFields: CorpusField[]; // the options displayed at any moment in the dropdown element public optionFields: CorpusField[]; // user selection - public selectedFields: CorpusField[]; - // string representation of user selection - public uiSelected: string[]; + selectedFields: CorpusField[]; // whether to display all field options, or just the core ones - public allVisible: boolean = false; + public allVisible = false; - constructor( - route: ActivatedRoute, - router: Router) { - super(route, router); - } + constructor() {} initialize() { - this.availableFields = this.getAvailableSearchFields(this.corpusFields); + if (this.queryModel) { + this.setStateFromQueryModel(this.queryModel); + } else { + this.selectedFields = this.filterCoreFields(); + } + this.availableFields = this.getAvailableFields(this.corpusFields); this.optionFields = this.filterCoreFields(); } @@ -42,24 +39,21 @@ export class SelectFieldComponent extends ParamDirective implements OnChanges { this.initialize(); } - teardown() { - this.setParams({ fields: null }); - } - - setStateFromParams(params: ParamMap) { - const queryFields = searchFieldsFromParams(params); - if (!queryFields) { - this.selectedFields = this.filterCoreFields(); + setStateFromQueryModel(queryModel: QueryModel) { + if (queryModel.searchFields) { + this.selectedFields = _.clone(queryModel.searchFields); } else { - this.selectedFields = this.optionFields.filter( field => queryFields.find(name => field.name === name) ); + this.selectedFields = []; } - this.updatedCorpusFields.emit(this.selectedFields); } - private getAvailableSearchFields(corpusFields: CorpusField[]): CorpusField[] { - const searchableFields = corpusFields.filter(field => field.searchable); - const allSearchFields = _.flatMap(searchableFields, this.searchableMultiFields.bind(this)) as 
CorpusField[]; - return allSearchFields; + private getAvailableFields(corpusFields: CorpusField[]): CorpusField[] { + const availableFields = corpusFields.filter(field => field[this.filterCriterion]); + if (this.filterCriterion === 'searchable') { + return _.flatMap(availableFields, this.searchableMultiFields.bind(this)) as CorpusField[]; + } else { + return availableFields; + } } private searchableMultiFields(field: CorpusField): CorpusField[] { @@ -94,40 +88,30 @@ export class SelectFieldComponent extends ParamDirective implements OnChanges { public toggleAllFields() { if (this.allVisible) { this.optionFields = this.filterCoreFields(); - } - else { + } else { // show all options, with core options first, the rest alphabetically sorted - let coreFields = this.filterCoreFields(); - let noCoreOptions = _.without(this.availableFields, ... coreFields); + const coreFields = this.filterCoreFields(); + const noCoreOptions = _.without(this.availableFields, ... coreFields); this.optionFields = coreFields.concat(_.sortBy(noCoreOptions,['displayName'])); } this.allVisible = !this.allVisible; - this.updatedCorpusFields.emit(this.selectedFields); + this.onUpdate(); } - public toggleField() { - if ( !this.selectedFields.length ) { - this.updatedCorpusFields.emit([]); - if (this.filterCriterion === 'csv') return; - this.setParams({ fields: null }); - } - else { - this.updatedCorpusFields.emit(this.selectedFields); - this.uiSelected = this.selectedFields.map(field => field.name); - const fields = this.uiSelected.join(','); - if (this.filterCriterion === 'csv') return; - this.setParams({ fields: fields }); + public onUpdate() { + this.selection.emit(this.selectedFields); + if (this.queryModel) { + this.queryModel.searchFields = this.selectedFields; + this.queryModel.update.next(); } } private filterCoreFields() { - if (this.filterCriterion === 'csv') { + if (this.filterCriterion === 'downloadable') { return this.corpusFields.filter(field => field.csvCore); - } - else if 
(this.filterCriterion === 'searchField') { + } else if (this.filterCriterion === 'searchable') { return this.corpusFields.filter(field => field.searchFieldCore); - } - else { + } else { return this.availableFields; } } diff --git a/frontend/src/app/services/api.service.ts b/frontend/src/app/services/api.service.ts index e57d097fe..a7f27ace3 100644 --- a/frontend/src/app/services/api.service.ts +++ b/frontend/src/app/services/api.service.ts @@ -24,6 +24,7 @@ import { DateTermFrequencyParameters, Download, DownloadOptions, + FieldCoverage, FoundDocument, LimitedResultsDownloadParameters, QueryDb, @@ -229,9 +230,15 @@ export class ApiService extends Resource { }) public corpusdescription: ResourceMethod< { filename: string; corpus: string }, - any + string >; + fieldCoverage(corpusName: string): Promise { + return this.http.get( + `/api/visualization/coverage/${corpusName}`, + ).toPromise(); + } + $getUrl(actionOptions: IResourceAction): string | Promise { const urlPromise = super.$getUrl(actionOptions); if (!this.apiUrl) { diff --git a/frontend/src/app/services/corpus.service.spec.ts b/frontend/src/app/services/corpus.service.spec.ts index 73272f7cf..8e7db5d37 100644 --- a/frontend/src/app/services/corpus.service.spec.ts +++ b/frontend/src/app/services/corpus.service.spec.ts @@ -1,4 +1,4 @@ -import { TestBed, inject, fakeAsync } from '@angular/core/testing'; +import { TestBed, inject } from '@angular/core/testing'; import { ApiServiceMock } from '../../mock-data/api'; import { ApiService } from './api.service'; @@ -8,10 +8,8 @@ import { UserService } from './user.service'; import { UserServiceMock } from '../../mock-data/user'; import { SessionService } from './session.service'; -import { Corpus } from '../models/corpus'; -import { CorpusField, SearchFilterData } from '../models/index'; import { RouterTestingModule } from '@angular/router/testing'; -import { Router } from '@angular/router'; +import * as _ from 'lodash'; describe('CorpusService', () => { let service: 
CorpusService; @@ -85,7 +83,7 @@ describe('CorpusService', () => { expect(items.map((item) => item.name)).toEqual(['test1', 'test2']); }); - it('should parse filters', () => { + it('should parse fields', () => { apiServiceMock.fakeResult['corpus'] = [ { name: 'times', @@ -93,6 +91,8 @@ describe('CorpusService', () => { title: 'Times', description: 'This is a description.', es_index: 'times', + languages: ['English'], + category: 'Tests', fields: [ { description: @@ -200,17 +200,10 @@ describe('CorpusService', () => { ]; return service.get().then((items) => { - const mockMultipleChoiceData: SearchFilterData = { - filterType: 'MultipleChoiceFilter', - optionCount: 42, - selected: [], - }; - const mockRangeData: SearchFilterData = { - filterType: 'RangeFilter', - min: 1785, - max: 2010, - }; - const allFields: CorpusField[] = [ + expect(items.length).toBe(1); + const corpus = _.first(items); + + const fieldData = [ { description: 'Banking concern to which the report belongs.', displayName: 'Bank', @@ -227,12 +220,10 @@ describe('CorpusService', () => { searchable: true, downloadable: false, name: 'bank', - searchFilter: { + filterOptions: { + name: 'MultipleChoiceFilter', description: 'Search only within these banks.', - fieldName: 'bank', - useAsFilter: false, - defaultData: mockMultipleChoiceData, - currentData: mockMultipleChoiceData, + option_count: 42, }, mappingType: 'keyword', }, @@ -252,13 +243,13 @@ describe('CorpusService', () => { visualizations: ['resultscount', 'termfrequency'], visualizationSort: 'key', multiFields: undefined, - searchFilter: { + filterOptions: { description: 'Restrict the years from which search results will be returned.', - fieldName: 'year', - useAsFilter: false, - defaultData: mockRangeData, - currentData: mockRangeData, + name: 'RangeFilter', + lower: 1785, + upper: 2010, + }, mappingType: 'integer', }, @@ -277,27 +268,17 @@ describe('CorpusService', () => { visualizations: ['wordcloud', 'ngram'], visualizationSort: null, 
multiFields: ['clean', 'stemmed', 'length'], - searchFilter: null, + filterOptions: null, searchFieldCore: true, mappingType: 'text', }, ]; - expect(items).toEqual([ - new Corpus( - 'default', - 'times', - 'Times', - 'This is a description.', - 'times', - allFields, - new Date(1785, 0, 1, 0, 0), - new Date(2010, 11, 31, 0, 0), - '/static/no-image.jpg', - 'png', - false, - true - ), - ]); + + _.zip(corpus.fields, fieldData).map(([result, expected]) => { + _.mapKeys(expected, key => { + expect(result[key]).toEqual(expected[key]); + }); + }); }); }); }); diff --git a/frontend/src/app/services/corpus.service.ts b/frontend/src/app/services/corpus.service.ts index 46ec83410..97323a6a2 100644 --- a/frontend/src/app/services/corpus.service.ts +++ b/frontend/src/app/services/corpus.service.ts @@ -2,14 +2,11 @@ import { Injectable } from '@angular/core'; import { BehaviorSubject } from 'rxjs'; -import * as moment from 'moment'; import { Corpus, CorpusField, - DocumentContext, - SearchFilter, - SearchFilterData, + DocumentContext } from '../models/index'; import { ApiRetryService } from './api-retry.service'; import { AuthService } from './auth.service'; @@ -93,77 +90,14 @@ export class CorpusService { data.scan_image_type, data.allow_image_download, data.word_models_present, + data.languages, + data.category, data.description_page, - this.parseDocumentContext(data.document_context, allFields) + this.parseDocumentContext(data.document_context, allFields), ); }; - private parseField = (data: any): CorpusField => ({ - description: data.description, - displayName: data.display_name || data.name, - displayType: data.display_type || data['es_mapping']?.type, - resultsOverview: data.results_overview, - csvCore: data.csv_core, - searchFieldCore: data.search_field_core, - visualizations: data.visualizations, - visualizationSort: data.visualization_sort, - multiFields: data['es_mapping']?.fields - ? 
Object.keys(data['es_mapping'].fields) - : undefined, - hidden: data.hidden, - sortable: data.sortable, - primarySort: data.primary_sort, - searchable: data.searchable, - downloadable: data.downloadable, - name: data.name, - searchFilter: data['search_filter'] - ? this.parseSearchFilter(data['search_filter'], data['name']) - : null, - mappingType: data.es_mapping?.type, - }); - - private parseSearchFilter( - filter: any, - fieldName: string - ): SearchFilter { - let defaultData: any; - switch (filter.name) { - case 'BooleanFilter': - defaultData = { - filterType: filter.name, - checked: false, - }; - break; - case 'MultipleChoiceFilter': - defaultData = { - filterType: filter.name, - optionCount: filter.option_count, - selected: [], - }; - break; - case 'RangeFilter': - defaultData = { - filterType: filter.name, - min: filter.lower, - max: filter.upper, - }; - break; - case 'DateFilter': - defaultData = { - filterType: filter.name, - min: this.formatDate(new Date(filter.lower)), - max: this.formatDate(new Date(filter.upper)), - }; - break; - } - return { - fieldName, - description: filter.description, - useAsFilter: false, - defaultData, - currentData: defaultData, - }; - } + private parseField = (data: any): CorpusField => new CorpusField(data); private parseDate(date: any): Date { // months are zero-based! @@ -176,13 +110,6 @@ export class CorpusService { ); } - /** - * Return a string of the form 0123-04-25. 
- */ - private formatDate(date: Date): string { - return moment(date).format().slice(0, 10); - } - private parseDocumentContext( data: { context_fields: string[] | null; diff --git a/frontend/src/app/services/download.service.ts b/frontend/src/app/services/download.service.ts index c313eabf7..27a8d0eaa 100644 --- a/frontend/src/app/services/download.service.ts +++ b/frontend/src/app/services/download.service.ts @@ -24,8 +24,7 @@ export class DownloadService { requestedResults: number, route: string, highlightFragmentSize: number, fileOptions: DownloadOptions ): Promise { - const esQuery = this.elasticSearchService.makeEsQuery( - queryModel, corpus.fields); // to create elastic search query + const esQuery = queryModel.toEsQuery(); // to create elastic search query const parameters = _.merge( { corpus: corpus.name, @@ -58,8 +57,7 @@ export class DownloadService { * @param queryModel QueryModel for which download is requested. * @param fields The fields to appear as columns in the csv. */ - const esQuery = this.elasticSearchService.makeEsQuery( - queryModel, corpus.fields); // to create elastic search query + const esQuery = queryModel.toEsQuery(); // to create elastic search query return this.apiService.downloadTask({corpus: corpus.name, es_query: esQuery, fields: fields.map( field => field.name ), route }) .then(result => result) .catch( error => { diff --git a/frontend/src/app/services/elastic-search.service.spec.ts b/frontend/src/app/services/elastic-search.service.spec.ts index bc08f7064..3a54c3114 100644 --- a/frontend/src/app/services/elastic-search.service.spec.ts +++ b/frontend/src/app/services/elastic-search.service.spec.ts @@ -1,69 +1,6 @@ -import { TestBed, inject } from '@angular/core/testing'; +import { TestBed } from '@angular/core/testing'; import { HttpClientTestingModule } from '@angular/common/http/testing'; import { ElasticSearchService } from './elastic-search.service'; -import { Corpus, DateFilterData, QueryModel, SearchFilter } from 
'../models'; -import * as _ from 'lodash'; -import { mockField, mockField2 } from '../../mock-data/corpus'; - - -const dateFilter: SearchFilter = { - fieldName: 'date', - description: '', - useAsFilter: true, - defaultData: { - filterType: 'DateFilter', - min: '1099-01-01', - max: '1300-12-31' - }, - currentData: { - filterType: 'DateFilter', - min: '1111-01-01', - max: '1299-12-31' - } -}; - -const mockCorpus: Corpus = { - serverName: '', - name: 'mock-corpus', - title: 'Mock Corpus', - description: '', - index: 'mock-corpus', - minDate: new Date('1800-01-01'), - maxDate: new Date('1900-01-01'), - image: 'image.jpeg', - scan_image_type: undefined, - allow_image_download: true, - word_models_present: false, - fields: [ - { - name: 'content', - displayName: 'Content', - description: '', - displayType: 'text_content', - hidden: false, - sortable: false, - primarySort: false, - searchable: true, - downloadable: true, - searchFilter: undefined, - mappingType: 'text', - }, - { - name: 'date', - displayName: 'Date', - description: '', - displayType: 'date', - hidden: false, - sortable: true, - primarySort: false, - searchable: false, - downloadable: true, - searchFilter: dateFilter, - mappingType: 'date' - } - ], -}; - describe('ElasticSearchService', () => { let service: ElasticSearchService; @@ -80,33 +17,4 @@ describe('ElasticSearchService', () => { it('should be created',() => { expect(service).toBeTruthy(); }); - - it('should select search fields', () => { - const queryModel: QueryModel = { - queryText: 'test', - fields: ['great_field', 'speech'] - }; - - const esQuery = service.makeEsQuery(queryModel, [mockField, mockField2]); - expect(_.get(esQuery, 'query.simple_query_string.fields')).toEqual(['great_field', 'speech']); - }); - - it('should convert between EsQuery and QueryModel types', () => { - - const querymodels: QueryModel[] = [ - { - queryText: 'test' - }, - { - queryText: 'test', - filters: [ dateFilter ] - } - ]; - - querymodels.forEach(queryModel => { - 
const esQuery = service.makeEsQuery(queryModel, mockCorpus.fields); - const restoredQueryModel = service.esQueryToQueryModel(esQuery, mockCorpus); - expect(restoredQueryModel).toEqual(queryModel); - }); - }); }); diff --git a/frontend/src/app/services/elastic-search.service.ts b/frontend/src/app/services/elastic-search.service.ts index b7e40089e..fb71eb6e6 100644 --- a/frontend/src/app/services/elastic-search.service.ts +++ b/frontend/src/app/services/elastic-search.service.ts @@ -1,15 +1,13 @@ /* eslint-disable @typescript-eslint/member-ordering */ +/* eslint-disable @typescript-eslint/member-ordering */ import { Injectable } from '@angular/core'; import { HttpClient, HttpParams } from '@angular/common/http'; - -import { FoundDocument, Corpus, CorpusField, QueryModel, SearchResults, - AggregateQueryFeedback, SearchFilter, SearchFilterData, searchFilterDataFromField, - EsFilter, EsSearchClause, BooleanQuery } from '../models/index'; - - +import { + FoundDocument, Corpus, QueryModel, SearchResults, + AggregateQueryFeedback, EsSearchClause, BooleanQuery, + EsFilter +} from '../models/index'; import * as _ from 'lodash'; -import { findByName } from '../utils/utils'; -import { makeBooleanQuery, makeEsSearchClause, makeHighlightSpecification, makeSortSpecification, } from '../utils/es-query'; @Injectable() @@ -21,41 +19,6 @@ export class ElasticSearchService { this.client = new Client(this.http); } - public makeEsQuery(queryModel: QueryModel, corpusFields: CorpusField[]): EsQuery | EsQuerySorted { - const searchFields = queryModel.fields?.map(fieldName => findByName(corpusFields, fieldName)); - const clause: EsSearchClause = makeEsSearchClause(queryModel.queryText, searchFields); - - let query: EsQuery | EsQuerySorted; - if (queryModel.filters) { - query = { - query: makeBooleanQuery(clause, this.mapFilters(queryModel.filters)) - }; - } else { - query = { - query: clause - }; - } - - const sort = makeSortSpecification(queryModel.sortBy, queryModel.sortAscending); - 
_.merge(query, sort); - - const highlight = makeHighlightSpecification(corpusFields, queryModel.queryText, queryModel.highlight); - _.merge(query, highlight); - - return query; - } - - public esQueryToQueryModel(query: EsQuery, corpus: Corpus): QueryModel { - const queryText = this.queryTextFromEsSearchClause(query.query); - const filters = this.filtersFromEsQuery(query, corpus); - - if (filters.length) { - return { queryText, filters }; - } else { - return { queryText }; - } - } - getDocumentById(id: string, corpus: Corpus): Promise { const query = { body: { @@ -78,47 +41,6 @@ export class ElasticSearchService { } } - private queryTextFromEsSearchClause(query: EsSearchClause | BooleanQuery | EsFilter): string { - const clause = 'bool' in query ? query.bool.must : query; - - if ('simple_query_string' in clause) { - return clause.simple_query_string.query; - } - } - - private filtersFromEsQuery(query: EsQuery, corpus: Corpus): SearchFilter[] { - if ('bool' in query.query) { - const filters = query.query.bool.filter; - return filters.map(filter => this.esFilterToSearchFilter(filter, corpus)); - } - return []; - } - - private esFilterToSearchFilter(filter: EsFilter, corpus: Corpus): SearchFilter { - let fieldName: string; - let value: any; - - if ('term' in filter) { // boolean filter - fieldName = _.keys(filter.term)[0]; - value = filter.term[fieldName]; - } else if ('terms' in filter) { // multiple choice filter - fieldName = _.keys(filter.terms)[0]; - value = filter.terms[fieldName]; - } else { // range or date filter - fieldName = _.keys(filter.range)[0]; - value = [filter.range[fieldName].gte.toString(), filter.range[fieldName].lte.toString()]; - } - const field: CorpusField = findByName(corpus.fields, fieldName); - const filterData = searchFilterDataFromField(field, value); - return { - fieldName: field.name, - description: field.searchFilter.description, - useAsFilter: true, - currentData: filterData, - defaultData: field.searchFilter.defaultData, - }; - } - 
/** * Construct the aggregator, based on kind of field * Date fields are aggregated in year intervals @@ -162,7 +84,7 @@ export class ElasticSearchService { aggregators.forEach(d => { aggregations[d.name] = this.makeAggregation(d.name, d.size, 1); }); - const esQuery = this.makeEsQuery(queryModel, corpusDefinition.fields); + const esQuery = queryModel.toEsQuery(); const aggregationModel = Object.assign({ aggs: aggregations }, esQuery); const result = await this.executeAggregate(corpusDefinition, aggregationModel); const aggregateData = {}; @@ -188,7 +110,7 @@ export class ElasticSearchService { } } }; - const esQuery = this.makeEsQuery(queryModel, corpusDefinition.fields); + const esQuery = queryModel.toEsQuery(); const aggregationModel = Object.assign({ aggs: agg }, esQuery); const result = await this.executeAggregate(corpusDefinition, aggregationModel); const aggregateData = {}; @@ -204,13 +126,13 @@ export class ElasticSearchService { public async search( - corpusDefinition: Corpus, queryModel: QueryModel, size?: number, ): Promise { - const esQuery = this.makeEsQuery(queryModel, corpusDefinition.fields); + const esQuery = queryModel.toEsQuery(); + // Perform the search - const response = await this.execute(corpusDefinition, esQuery, size || this.resultsPerPage); + const response = await this.execute(queryModel.corpus, esQuery, size || this.resultsPerPage); return this.parseResponse(response); } @@ -219,12 +141,11 @@ export class ElasticSearchService { * Load results for requested page */ public async loadResults( - corpusDefinition: Corpus, queryModel: QueryModel, from: number, size: number): Promise { - const esQuery = this.makeEsQuery(queryModel, corpusDefinition.fields); + const esQuery = queryModel.toEsQuery(); // Perform the search - const response = await this.execute(corpusDefinition, esQuery, size || this.resultsPerPage, from); + const response = await this.execute(queryModel.corpus, esQuery, size || this.resultsPerPage, from); return 
this.parseResponse(response); } @@ -247,7 +168,7 @@ export class ElasticSearchService { /** * return the id, relevance and field values of a given document */ - private hitToDocument(hit: SearchHit, maxScore: number): FoundDocument { + private hitToDocument(hit: SearchHit, maxScore: number) { return { id: hit._id, relevance: hit._score / maxScore, @@ -255,36 +176,6 @@ export class ElasticSearchService { highlight: hit.highlight, } as FoundDocument; } - - /** - * Convert filters from query model into elasticsearch form - */ - private mapFilters(filters: SearchFilter[]): EsFilter[] { - return filters.map(filter => { - switch (filter.currentData.filterType) { - case 'BooleanFilter': - return { term: { [filter.fieldName]: filter.currentData.checked } }; - case 'MultipleChoiceFilter': - return { - terms: { - [filter.fieldName]: _.map(filter.currentData.selected, f => decodeURIComponent(f)) - } - }; - case 'RangeFilter': - return { - range: { - [filter.fieldName]: { gte: filter.currentData.min, lte: filter.currentData.max } - } - }; - case 'DateFilter': - return { - range: { - [filter.fieldName]: { gte: filter.currentData.min, lte: filter.currentData.max, format: 'yyyy-MM-dd' } - } - }; - } - }); - } } interface Connection { diff --git a/frontend/src/app/services/index.ts b/frontend/src/app/services/index.ts index f79844125..e931e2158 100644 --- a/frontend/src/app/services/index.ts +++ b/frontend/src/app/services/index.ts @@ -5,7 +5,6 @@ export * from './corpus.service'; export * from './dialog.service'; export * from './download.service'; export * from './elastic-search.service'; -export * from './param.service'; export * from './highlight.service'; export * from './notification.service'; export * from './query.service'; diff --git a/frontend/src/app/services/param.service.spec.ts b/frontend/src/app/services/param.service.spec.ts deleted file mode 100644 index d21fde640..000000000 --- a/frontend/src/app/services/param.service.spec.ts +++ /dev/null @@ -1,25 +0,0 @@ 
-import { inject, TestBed } from '@angular/core/testing'; - -import { ParamService } from './param.service'; -import { SearchService } from './search.service'; -import { SearchServiceMock } from '../../mock-data/search'; -import { convertToParamMap } from '@angular/router'; - -describe('ParamService', () => { - let service: ParamService; - - beforeEach(() => { - TestBed.configureTestingModule({ - providers: [ - ParamService, - { provide: SearchService, useValue: new SearchServiceMock() } - ] - }); - service = TestBed.inject(ParamService); - }); - - it('should be created', inject([ParamService], (service: ParamService) => { - expect(service).toBeTruthy(); - })); - -}); diff --git a/frontend/src/app/services/param.service.ts b/frontend/src/app/services/param.service.ts deleted file mode 100644 index 259df597d..000000000 --- a/frontend/src/app/services/param.service.ts +++ /dev/null @@ -1,66 +0,0 @@ -import * as _ from 'lodash'; - -import { Injectable } from '@angular/core'; -import { ParamMap } from '@angular/router'; - -import { CorpusField, QueryModel } from '../models'; -import { SearchService } from './search.service'; -import { - filtersFromParams, highlightFromParams, paramForFieldName, queryFromParams, searchFieldsFromParams, - searchFilterDataToParam, sortSettingsFromParams -} from '../utils/params'; - -@Injectable() -export class ParamService { - - constructor(private searchService: SearchService) { } - - public queryModelFromParams(params: ParamMap, corpusFields: CorpusField[]) { - // copy fields so the state in components is isolated - const fields = _.cloneDeep(corpusFields); - const activeFilters = filtersFromParams(params, fields); - const highlight = highlightFromParams(params); - const query = queryFromParams(params); - const queryFields = searchFieldsFromParams(params); - const sortSettings = sortSettingsFromParams(params, fields); - return this.searchService.createQueryModel( - query, queryFields, activeFilters, sortSettings.field, 
sortSettings.ascending, highlight); - } - - public queryModelToRoute(queryModel: QueryModel, usingDefaultSortField = false, nullableParams = []): any { - const route = { - query: queryModel.queryText || '' - }; - - if (queryModel.fields) { - route['fields'] = queryModel.fields.join(','); - } else { - route['fields'] = null; - } - - for (const filter of queryModel.filters.map(data => ({ - param: paramForFieldName(data.fieldName), - value: searchFilterDataToParam(data) - }))) { - route[filter.param] = filter.value; - } - - if (!usingDefaultSortField && queryModel.sortBy) { - route['sort'] = `${queryModel.sortBy},${queryModel.sortAscending ? 'asc' : 'desc'}`; - } else { - route['sort'] = null; - } - - if (queryModel.highlight) { - route['highlight'] = `${queryModel.highlight}`; - } else { - route['highlight'] = null; - } - - if (nullableParams.length) { - nullableParams.forEach( param => route[param] = null); - } - return route; - } - -} diff --git a/frontend/src/app/services/search.service.ts b/frontend/src/app/services/search.service.ts index 9c6a9443c..e5b2f5720 100644 --- a/frontend/src/app/services/search.service.ts +++ b/frontend/src/app/services/search.service.ts @@ -4,14 +4,8 @@ import { ApiService } from './api.service'; import { ElasticSearchService } from './elastic-search.service'; import { QueryService } from './query.service'; import { - Corpus, - CorpusField, - QueryModel, - SearchFilter, - SearchResults, - AggregateQueryFeedback, - SearchFilterData, - QueryDb, + Corpus, QueryModel, SearchResults, + AggregateQueryFeedback, QueryDb } from '../models/index'; import { AuthService } from './auth.service'; @@ -31,60 +25,26 @@ export class SearchService { * Load results for requested page */ public async loadResults( - corpus: Corpus, queryModel: QueryModel, from: number, size: number ): Promise { const results = await this.elasticSearchService.loadResults( - corpus, queryModel, from, size ); - results.fields = corpus.fields.filter((field) => 
field.resultsOverview); + results.fields = queryModel.corpus.fields.filter((field) => field.resultsOverview); return results; } - /** - * Construct a dictionary representing an ES query. - * - * @param queryString Read as the `simple_query_string` DSL of standard ElasticSearch. - * @param fields Optional list of fields to restrict the queryString to. - * @param filters A list of dictionaries representing the ES DSL. - */ - public createQueryModel( - queryText: string = '', - fields: string[] | null = null, - filters: SearchFilter[] = [], - sortField: CorpusField = null, - sortAscending = false, - highlight: number = null - ): QueryModel { - const model: QueryModel = { - queryText, - filters, - sortBy: sortField ? sortField.name : undefined, - sortAscending, - }; - if (fields) { - model.fields = fields; - } - if (highlight) { - model.highlight = highlight; - } - return model; - } - - public async search( - queryModel: QueryModel, - corpus: Corpus + public async search(queryModel: QueryModel ): Promise { const user = await this.authService.getCurrentUserPromise(); - const query = new QueryDb(queryModel, corpus.name, user.id); + const esQuery = queryModel.toEsQuery(); + const query = new QueryDb(esQuery, queryModel.corpus.name, user.id); query.started = new Date(Date.now()); const results = await this.elasticSearchService.search( - corpus, queryModel ); query.total_results = results.total.value; @@ -92,7 +52,7 @@ export class SearchService { this.queryService.save(query); return { - fields: corpus.fields.filter((field) => field.resultsOverview), + fields: queryModel.corpus.fields.filter((field) => field.resultsOverview), total: results.total, documents: results.documents, } as SearchResults; diff --git a/frontend/src/app/services/visualization.service.ts b/frontend/src/app/services/visualization.service.ts index 0033927e6..3c484f55c 100644 --- a/frontend/src/app/services/visualization.service.ts +++ b/frontend/src/app/services/visualization.service.ts @@ -18,28 +18,27 
@@ import { ElasticSearchService } from './elastic-search.service'; export class VisualizationService { constructor( - private apiService: ApiService, - private elasticSearchService: ElasticSearchService) { + private apiService: ApiService) { window['apiService'] = this.apiService; } public async getWordcloudData(fieldName: string, queryModel: QueryModel, corpus: Corpus, size: number): Promise { - const esQuery = this.elasticSearchService.makeEsQuery(queryModel, corpus.fields); + const esQuery = queryModel.toEsQuery(); return this.apiService.wordcloud({es_query: esQuery, corpus: corpus.name, field: fieldName, size}); } - public async getWordcloudTasks(fieldName: string, queryModel: QueryModel, corpus: Corpus): Promise { - const esQuery = this.elasticSearchService.makeEsQuery(queryModel, corpus.fields); - return this.apiService.wordcloudTasks({es_query: esQuery, corpus: corpus.name, field: fieldName}) + public async getWordcloudTasks(fieldName: string, queryModel: QueryModel, corpus: string): Promise { + const esQuery = queryModel.toEsQuery(); + return this.apiService.wordcloudTasks({es_query: esQuery, corpus, field: fieldName}) .then(result =>result['task_ids']); } public makeAggregateTermFrequencyParameters( corpus: Corpus, queryModel: QueryModel, fieldName: string, bins: {fieldValue: string|number; size: number}[], ): AggregateTermFrequencyParameters { - const esQuery = this.elasticSearchService.makeEsQuery(queryModel, corpus.fields); + const esQuery = queryModel.toEsQuery(); return { corpus_name: corpus.name, es_query: esQuery, @@ -59,7 +58,7 @@ export class VisualizationService { corpus: Corpus, queryModel: QueryModel, fieldName: string, bins: {size: number; start_date: Date; end_date?: Date}[], unit: TimeCategory, ): DateTermFrequencyParameters { - const esQuery = this.elasticSearchService.makeEsQuery(queryModel, corpus.fields); + const esQuery = queryModel.toEsQuery(); return { corpus_name: corpus.name, es_query: esQuery, @@ -82,7 +81,7 @@ export class 
VisualizationService { } getNgramTasks(queryModel: QueryModel, corpus: Corpus, field: string, params: NgramParameters): Promise { - const esQuery = this.elasticSearchService.makeEsQuery(queryModel, corpus.fields); + const esQuery = queryModel.toEsQuery(); return this.apiService.ngramTasks({ es_query: esQuery, corpus_name: corpus.name, diff --git a/frontend/src/app/services/wordmodels.service.ts b/frontend/src/app/services/wordmodels.service.ts index c8a430033..e6918707e 100644 --- a/frontend/src/app/services/wordmodels.service.ts +++ b/frontend/src/app/services/wordmodels.service.ts @@ -37,7 +37,7 @@ export class WordmodelsService extends Resource { @ResourceAction({ method: ResourceRequestMethod.Get, - path: '/word_in_model' + path: '/word_in_models' }) public wordInModelRequest: ResourceMethod< { query_term: string; corpus_name: string }, diff --git a/frontend/src/app/utils/document-context.spec.ts b/frontend/src/app/utils/document-context.spec.ts index 39464b447..565ffc9bb 100644 --- a/frontend/src/app/utils/document-context.spec.ts +++ b/frontend/src/app/utils/document-context.spec.ts @@ -1,56 +1,9 @@ -import { mockField, mockField2, mockField3 } from '../../mock-data/corpus'; -import { Corpus, CorpusField, FoundDocument } from '../models'; +import { mockCorpus3 } from '../../mock-data/corpus'; +import { FoundDocument } from '../models'; import { makeContextParams } from './document-context'; describe('document context utils', () => { - const dateField: CorpusField = { - name: 'date', - displayName: 'Date', - displayType: 'date', - mappingType: 'date', - description: '', - searchable: false, - sortable: true, - hidden: false, - downloadable: true, - primarySort: false, - searchFilter: { - fieldName: 'date', - description: '', - useAsFilter: false, - currentData: { - filterType: 'DateFilter', - min: '1800-01-01', - max: '1900-01-01' - } - }, - }; - - const corpus: Corpus = { - name: 'mock-corpus', - title: 'Mock corpus', - serverName: 'default', - description: 
'', - index: 'mock-corpus', - minDate: new Date('1800-01-01'), - maxDate: new Date('1900-01-01'), - image: '', - scan_image_type: '', - allow_image_download: true, - word_models_present: false, - documentContext: { - contextFields: [dateField], - displayName: 'edition', - sortField: mockField3, - sortDirection: 'asc', - }, - fields: [ - mockField, - mockField2, - mockField3, - dateField, - ] - }; + const corpus = mockCorpus3; const document: FoundDocument = { id: '1', @@ -65,7 +18,6 @@ describe('document context utils', () => { it('should create a document context link', () => { const params = makeContextParams(document, corpus); - expect(params).toEqual({ date: '1900-01-01:1900-01-01', sort: 'ordering,asc' diff --git a/frontend/src/app/utils/document-context.ts b/frontend/src/app/utils/document-context.ts index e843c3e48..160cbc88f 100644 --- a/frontend/src/app/utils/document-context.ts +++ b/frontend/src/app/utils/document-context.ts @@ -1,23 +1,24 @@ -import { contextFilterFromField, Corpus, FoundDocument } from '../models'; -import { omitNullParameters, searchFiltersToParams, sortSettingsToParams } from './params'; +import { Corpus, FoundDocument, QueryModel } from '../models'; -export const makeContextParams = (document: FoundDocument, corpus: Corpus): any => { - const contextSpec = corpus.documentContext; - - const queryText = null; +const documentContextQuery = (corpus: Corpus, document: FoundDocument): QueryModel => { + const queryModel = new QueryModel(corpus); - const contextFields = contextSpec.contextFields; + const spec = corpus.documentContext; - contextFields.forEach(field => { - field.searchFilter = contextFilterFromField(field, document.fieldValues[field.name]); + spec.contextFields.forEach(field => { + const filter = field.makeSearchFilter(); + filter.setToValue(document.fieldValues[field.name]); + queryModel.addFilter(filter); }); - const filterParams = searchFiltersToParams(contextFields); - const sortParams = sortSettingsToParams( - 
contextSpec.sortField, - contextSpec.sortDirection - ); + queryModel.sort.sortBy.next(spec.sortField); + queryModel.sort.sortDirection.next(spec.sortDirection); + + return queryModel; +}; - const params = { query: queryText, ...filterParams, ...sortParams }; - return omitNullParameters(params); +export const makeContextParams = (document: FoundDocument, corpus: Corpus): any => { + const queryModel = documentContextQuery(corpus, document); + return queryModel.toQueryParams(); }; + diff --git a/frontend/src/app/utils/es-query.spec.ts b/frontend/src/app/utils/es-query.spec.ts index eec32041e..1511f9e49 100644 --- a/frontend/src/app/utils/es-query.spec.ts +++ b/frontend/src/app/utils/es-query.spec.ts @@ -1,6 +1,6 @@ /* eslint-disable @typescript-eslint/naming-convention */ import * as _ from 'lodash'; -import { mockField, mockField2, mockField3 } from '../../mock-data/corpus'; +import { mockField, mockField2, mockCorpus3 } from '../../mock-data/corpus'; import { makeEsSearchClause, makeHighlightSpecification, makeSimpleQueryString, makeSortSpecification } from './es-query'; describe('es-query utils', () => { @@ -30,17 +30,16 @@ describe('es-query utils', () => { }); it('should make a sort specification', () => { - expect(makeSortSpecification(undefined, true)).toEqual({}); - expect(makeSortSpecification('great_field', false)).toEqual({ + expect(makeSortSpecification(undefined, 'asc')).toEqual({}); + expect(makeSortSpecification(mockField, 'desc')).toEqual({ sort: [{ great_field: 'desc' }] }); }); it('should make a highlight specification', () => { - const fields = [mockField2, mockField3]; - expect(makeHighlightSpecification(fields, 'test', undefined)).toEqual({}); + expect(makeHighlightSpecification(mockCorpus3, 'test', undefined)).toEqual({}); - expect(makeHighlightSpecification(fields, 'test', 100)).toEqual({ + expect(makeHighlightSpecification(mockCorpus3, 'test', 100)).toEqual({ highlight: { fragment_size: 100, pre_tags: [''], diff --git 
a/frontend/src/app/utils/es-query.ts b/frontend/src/app/utils/es-query.ts index 3f873a035..e4177ef4e 100644 --- a/frontend/src/app/utils/es-query.ts +++ b/frontend/src/app/utils/es-query.ts @@ -1,9 +1,12 @@ /* eslint-disable @typescript-eslint/naming-convention */ import * as _ from 'lodash'; -import { BooleanQuery, Corpus, CorpusField, EsFilter, EsSearchClause, MatchAll, SimpleQueryString } from '../models'; -import { sortDirectionFromBoolean } from './sort'; - +import { BooleanQuery, Corpus, CorpusField, EsFilter, EsSearchClause, MatchAll, + QueryModel, + SimpleQueryString, SortBy, SortDirection } from '../models'; +import { EsQuery } from '../services'; +import { findByName } from './utils'; +import { SearchFilter } from '../models/search-filter'; // conversion from query model -> elasticsearch query language @@ -48,13 +51,23 @@ export const makeBooleanQuery = (query: EsSearchClause, filters: EsFilter[]): Bo } }); +export const combineSearchClauseAndFilters = (queryText: string, filters: EsFilter[], searchFields?: CorpusField[]): EsQuery => { + let query: MatchAll | BooleanQuery; + if (queryText || filters.length) { + const searchClause = makeEsSearchClause(queryText, searchFields); + query = makeBooleanQuery(searchClause, filters); + } else { + query = matchAll; + } + return { query }; +}; -export const makeSortSpecification = (sortBy: string, sortAscending: boolean) => { +export const makeSortSpecification = (sortBy: SortBy, sortDirection: SortDirection) => { if (!sortBy) { return {}; } else { const sortByField = { - [sortBy]: sortDirectionFromBoolean(sortAscending) + [sortBy.name]: sortDirection }; return { sort: [sortByField] @@ -62,11 +75,11 @@ export const makeSortSpecification = (sortBy: string, sortAscending: boolean) => } }; -export const makeHighlightSpecification = (corpusFields: CorpusField[], queryText?: string, highlightSize?: number) => { +export const makeHighlightSpecification = (corpus: Corpus, queryText?: string, highlightSize?: number) => { 
if (!queryText || !highlightSize) { return {}; } - const highlightFields = corpusFields.filter(field => field.searchable); + const highlightFields = corpus.fields.filter(field => field.searchable); return { highlight: { fragment_size: highlightSize, @@ -80,3 +93,37 @@ export const makeHighlightSpecification = (corpusFields: CorpusField[], queryTex }; }; +// conversion from elasticsearch query language -> query model + +export const esQueryToQueryModel = (query: EsQuery, corpus: Corpus): QueryModel => { + const model = new QueryModel(corpus); + model.setQueryText(queryTextFromEsSearchClause(query.query)); + const filters = filtersFromEsQuery(query, corpus); + filters.forEach(filter => model.addFilter(filter)); + return model; +}; + +const queryTextFromEsSearchClause = (query: EsSearchClause | BooleanQuery | EsFilter): string => { + const clause = 'bool' in query ? query.bool.must : query; + + if ('simple_query_string' in clause) { + return clause.simple_query_string.query; + } +}; + +const filtersFromEsQuery = (query: EsQuery, corpus: Corpus): SearchFilter[] => { + if ('bool' in query.query) { + const filters = query.query.bool.filter; + return filters.map(filter => esFilterToSearchFilter(filter, corpus)); + } + return []; +}; + +const esFilterToSearchFilter = (esFilter: EsFilter, corpus: Corpus): SearchFilter => { + const filterType = _.first(_.keys(esFilter)) as 'term'|'terms'|'range'; + const fieldName = _.first(_.keys(esFilter[filterType])); + const field = findByName(corpus.fields, fieldName); + const filter = field.makeSearchFilter(); + filter.data.next(filter.dataFromEsFilter(esFilter as any)); // we know that the esFilter is of the correct type + return filter; +}; diff --git a/frontend/src/app/utils/params.spec.ts b/frontend/src/app/utils/params.spec.ts index 2d1e0ece9..d5b519622 100644 --- a/frontend/src/app/utils/params.spec.ts +++ b/frontend/src/app/utils/params.spec.ts @@ -1,12 +1,15 @@ import { convertToParamMap } from '@angular/router'; -import { 
highlightFromParams, omitNullParameters, searchFieldsFromParams } from './params'; +import { highlightFromParams, omitNullParameters, paramsHaveChanged, searchFieldsFromParams } from './params'; +import { mockCorpus, mockCorpus3, mockField2, mockField } from '../../mock-data/corpus'; +import { MultipleChoiceFilter, QueryModel } from '../models'; describe('searchFieldsFromParams', () => { it('should parse field parameters', () => { - const params = convertToParamMap({fields: 'speech,speaker'}); - const fields = searchFieldsFromParams(params); + const params = convertToParamMap({fields: 'speech,great_field'}); + const corpus = mockCorpus3; + const fields = searchFieldsFromParams(params, corpus); expect(fields.length).toEqual(2); - expect(fields).toContain('speech'); + expect(fields).toContain(mockField2); }); }); @@ -20,10 +23,51 @@ describe('highlightFromParams', () => { describe('omitNullParameters', () => { it('should omit null parameters', () => { - const p = { a: null, b: 1, c: 'test' }; + const p = { a: null, b: '1', c: 'test' }; expect(omitNullParameters(p)).toEqual( - { b: 1, c: 'test' } + { b: '1', c: 'test' } ); }); }); + +describe('omitNullParameters', () => { + it('should omit null parameters', () => { + const p = { a: null, b: '1', c: 'test' }; + + expect(omitNullParameters(p)).toEqual( + { b: '1', c: 'test' } + ); + }); +}); + +describe('paramsHaveChanged', () => { + const corpus = mockCorpus; + let queryModel: QueryModel; + + beforeEach(() => { + queryModel = new QueryModel(corpus); + }); + + it('should detect changes in parameters', () => { + const params1 = convertToParamMap({}); + const params2 = convertToParamMap({query: 'test'}); + + expect(paramsHaveChanged(queryModel, params1)).toBeFalse(); + expect(paramsHaveChanged(queryModel, params2)).toBeTrue(); + + queryModel = new QueryModel(corpus, params2); + + expect(paramsHaveChanged(queryModel, params2)).toBeFalse(); + expect(paramsHaveChanged(queryModel, params1)).toBeTrue(); + + }); + + it('should 
detect new filters', () => { + const filter = mockField.makeSearchFilter() as MultipleChoiceFilter; + filter.set(['test']); + const params = convertToParamMap(filter.toRouteParam()); + + expect(paramsHaveChanged(queryModel, params)).toBeTrue(); + }); +}); diff --git a/frontend/src/app/utils/params.ts b/frontend/src/app/utils/params.ts index 25c2a2ac1..5e2491ecf 100644 --- a/frontend/src/app/utils/params.ts +++ b/frontend/src/app/utils/params.ts @@ -1,7 +1,6 @@ import { ParamMap } from '@angular/router'; import * as _ from 'lodash'; -import { contextFilterFromField, CorpusField, SearchFilter, SearchFilterData, searchFilterDataFromSettings } from '../models'; -import { findByName } from './utils'; +import { Corpus, CorpusField, QueryModel, SearchFilter, SortBy, SortDirection } from '../models'; /** omit keys that mapp to null */ export const omitNullParameters = (params: {[key: string]: any}): {[key: string]: any} => { @@ -12,10 +11,10 @@ export const omitNullParameters = (params: {[key: string]: any}): {[key: string] export const queryFromParams = (params: ParamMap): string => params.get('query'); -export const searchFieldsFromParams = (params: ParamMap): string[] | null => { +export const searchFieldsFromParams = (params: ParamMap, corpus: Corpus): CorpusField[] => { if (params.has('fields')) { - const selectedSearchFields = params.get('fields').split(','); - return selectedSearchFields; + const fieldNames = params.get('fields').split(','); + return corpus.fields.filter(field => fieldNames.includes(field.name)); } }; @@ -24,115 +23,51 @@ export const highlightFromParams = (params: ParamMap): number => // sort -export const sortSettingsToParams = (sortBy: CorpusField, direction: string): {sort: string} => { - const fieldName = sortBy !== undefined ? 
sortBy.name : 'relevance'; - return {sort:`${fieldName},${direction}`}; -}; - -export const sortSettingsFromParams = (params: ParamMap, corpusFields: CorpusField[]): {field: CorpusField; ascending: boolean} => { - let sortField: CorpusField; - let sortAscending = true; - if (params.has('sort')) { - const [sortParam, ascParam] = params.get('sort').split(','); - sortAscending = ascParam === 'asc'; - if ( sortParam === 'relevance' ) { - return { - field: undefined, - ascending: sortAscending - }; - } - sortField = findByName(corpusFields, sortParam); +export const sortSettingsToParams = (sortBy: SortBy, direction: SortDirection): {sort: string|null} => { + let sortByName: string; + if (!sortBy) { + sortByName = 'relevance'; } else { - sortField = corpusFields.find(field => field.primarySort); + sortByName = sortBy.name; } - return { - field: sortField, - ascending: sortAscending - }; + return { sort: `${sortByName},${direction}` }; }; +// filters -interface SearchFilterSettings { - [fieldName: string]: SearchFilterData; -} - -/** - * Set the filter data from the query parameters and return whether any filters were actually set. - */ -export const filtersFromParams = (params: ParamMap, corpusFields: CorpusField[]): SearchFilter[] => { - const filterSettings = filterSettingsFromParams(params, corpusFields); - return applyFilterSettings(filterSettings, corpusFields); -}; - -const filterSettingsFromParams = (params: ParamMap, corpusFields: CorpusField[]): SearchFilterSettings => { - const settings = {}; - corpusFields.forEach(field => { - const param = paramForFieldName(field.name); - if (params.has(param)) { - let filterSettings = params.get(param).split(','); - if (filterSettings[0] === '') { - filterSettings = []; - } - const filterType = field.searchFilter ? 
field.searchFilter.currentData.filterType : undefined; - const data = searchFilterDataFromSettings(filterType, filterSettings, field); - settings[field.name] = data; - } +export const filtersFromParams = (params: ParamMap, corpus: Corpus): SearchFilter[] => { + const specifiedFields = corpus.fields.filter(field => params.has(field.name)); + return specifiedFields.map(field => { + const filter = field.makeSearchFilter(); + const data = filter.dataFromString(params.get(field.name)); + filter.set(data); + return filter; }); - - return settings; }; -const applyFilterSettings = (filterSettings: SearchFilterSettings, corpusFields: CorpusField[]) => { - corpusFields.forEach(field => { - if (_.has(filterSettings, field.name)) { - const searchFilter = field.searchFilter || contextFilterFromField(field); - const data = filterSettings[field.name]; - searchFilter.currentData = data; - searchFilter.useAsFilter = true; - field.searchFilter = searchFilter; - } else { - if (field.searchFilter) { - field.searchFilter.useAsFilter = false; - if (field.searchFilter.adHoc) { - field.searchFilter = null; - } - } - } - }); - - return corpusFields.filter( field => field.searchFilter && field.searchFilter.useAsFilter ).map( field => field.searchFilter ); +const filterParamForField = (queryModel: QueryModel, field: CorpusField) => { + const filter = queryModel.filterForField(field); + if (filter) { + return filter.toRouteParam(); + } else { + return { [field.name]: null }; + } }; -/*** - * Convert field name to string - */ -export const paramForFieldName = (fieldName: string) => - `${fieldName}`; - - -// --- set params from filters --- // - -export const searchFiltersToParams = (fields: CorpusField[]) => { - const params = {}; - fields.forEach( field => { - const paramName = paramForFieldName(field.name); - const value = field.searchFilter.useAsFilter? 
searchFilterDataToParam(field.searchFilter) : null; - params[paramName] = value; - }); - - return params; +export const queryFiltersToParams = (queryModel: QueryModel) => { + const filterParamsPerField = queryModel.corpus.fields.map( + field => filterParamForField(queryModel, field)); + return _.reduce( + filterParamsPerField, + _.merge, + {} + ); }; -export const searchFilterDataToParam = (filter: SearchFilter): string => { - switch (filter.currentData.filterType) { - case 'BooleanFilter': - return `${filter.currentData.checked}`; - case 'MultipleChoiceFilter': - return filter.currentData.selected.join(','); - case 'RangeFilter': - return `${filter.currentData.min}:${filter.currentData.max}`; - case 'DateFilter': - return `${filter.currentData.min}:${filter.currentData.max}`; - } -}; +export const paramsHaveChanged = (queryModel: QueryModel, newParams: ParamMap) => { + const currentParams = queryModel.toRouteParam(); + return _.some( _.keys(currentParams), key => + newParams.get(key) !== currentParams[key] + ); +}; diff --git a/frontend/src/app/utils/sort.ts b/frontend/src/app/utils/sort.ts deleted file mode 100644 index e4150ac1a..000000000 --- a/frontend/src/app/utils/sort.ts +++ /dev/null @@ -1,2 +0,0 @@ -export const sortDirectionFromBoolean = (sortAscending: boolean): 'asc'|'desc' => - sortAscending ? 
'asc' : 'desc'; diff --git a/frontend/src/app/visualization/barchart/barchart.directive.ts b/frontend/src/app/visualization/barchart/barchart.directive.ts index f2dc8a744..9d071c2c8 100644 --- a/frontend/src/app/visualization/barchart/barchart.directive.ts +++ b/frontend/src/app/visualization/barchart/barchart.directive.ts @@ -1,22 +1,25 @@ /* eslint-disable @typescript-eslint/member-ordering */ -import { Directive, EventEmitter, Input, OnChanges, OnInit, Output, SimpleChanges } from '@angular/core'; +import { Directive, EventEmitter, Input, OnChanges, OnDestroy, OnInit, Output, SimpleChanges } from '@angular/core'; import * as _ from 'lodash'; import { ApiService, NotificationService, SearchService } from '../../services/index'; -import { Chart, ChartOptions, ChartType } from 'chart.js'; -import { AggregateResult, BarchartResult, Corpus, FreqTableHeaders, QueryModel, CorpusField, TaskResult, - BarchartSeries, AggregateQueryFeedback, TimelineDataPoint, HistogramDataPoint, TermFrequencyResult, ChartParameters } from '../../models'; +import { Chart, ChartOptions } from 'chart.js'; +import { + AggregateResult, Corpus, FreqTableHeaders, QueryModel, CorpusField, TaskResult, + BarchartSeries, AggregateQueryFeedback, TimelineDataPoint, HistogramDataPoint, TermFrequencyResult, ChartParameters +} from '../../models'; import Zoom from 'chartjs-plugin-zoom'; -import { BehaviorSubject } from 'rxjs'; +import { BehaviorSubject, Subject } from 'rxjs'; import { selectColor } from '../../utils/select-color'; import { VisualizationService } from '../../services/visualization.service'; import { findByName, showLoading } from '../../utils/utils'; +import { takeUntil } from 'rxjs/operators'; const hintSeenSessionStorageKey = 'hasSeenTimelineZoomingHint'; const hintHidingMinDelay = 500; // milliseconds const hintHidingDebounceTime = 1000; // milliseconds - +const barchartID = 'barchart'; @Directive({ selector: 'ia-barchart', @@ -26,7 +29,7 @@ const hintHidingDebounceTime = 1000; // 
milliseconds * histogram and timeline components. It does not function as a stand-alone component. */ export abstract class BarchartDirective - implements OnChanges, OnInit { + implements OnChanges, OnInit, OnDestroy { public showHint: boolean; // rawData: a list of series @@ -77,6 +80,8 @@ export abstract class BarchartDirective @Output() isLoading = new BehaviorSubject(false); @Output() error = new EventEmitter(); + destroy$ = new Subject(); + basicChartOptions: ChartOptions = { // chart options not suitable for Chart.defaults.global scales: { xAxis: { @@ -143,6 +148,11 @@ export abstract class BarchartDirective } ngOnChanges(changes: SimpleChanges) { + if (changes.queryModel) { + this.queryModel.update.pipe( + takeUntil(this.destroy$) + ).subscribe(this.refreshChart.bind(this)); + } // new doc counts should be requested if query has changed if (this.changesRequireRefresh(changes)) { this.refreshChart(); @@ -151,6 +161,10 @@ export abstract class BarchartDirective } } + ngOnDestroy(): void { + this.destroy$.next(); + } + /** check whether input changes should force reloading the data */ changesRequireRefresh(changes: SimpleChanges): boolean { const relevantChanges = [changes.corpus, changes.queryModel, changes.visualizedField, changes.frequencyMeasure] @@ -276,9 +290,8 @@ export abstract class BarchartDirective } else { const mainContentFields = this.corpus.fields.filter(field => field.searchable && (field.displayType === 'text_content')); - const queryModelCopy = _.cloneDeep(queryModel); - queryModelCopy.fields = mainContentFields.map(field => field.name); - + const queryModelCopy = queryModel.clone(); + queryModelCopy.searchFields = mainContentFields; return queryModelCopy; } } @@ -466,7 +479,7 @@ export abstract class BarchartDirective const datasets = this.getDatasets(); const options = this.chartOptions(datasets); - this.chart = new Chart('barchart', + this.chart = new Chart(barchartID, { type: this.chartType, data: { @@ -564,7 +577,7 @@ export abstract 
class BarchartDirective /** return a copy of a query model with the query text set to the given value */ setQueryText(query: QueryModel, queryText: string): QueryModel { - const queryModelCopy = _.cloneDeep(query); + const queryModelCopy = query.clone(); queryModelCopy.queryText = queryText; return queryModelCopy; } @@ -609,12 +622,9 @@ export abstract class BarchartDirective get searchFields(): string { if (this.corpus && this.queryModel) { - const searchFields = this.selectSearchFields(this.queryModel).fields; + const searchFields = this.selectSearchFields(this.queryModel).searchFields; - const displayNames = searchFields.map(fieldName => { - const field = findByName(this.corpus.fields, fieldName); - return field.displayName; - }); + const displayNames = searchFields.map(field => field.displayName); return displayNames.join(', '); } diff --git a/frontend/src/app/visualization/barchart/histogram.component.spec.ts b/frontend/src/app/visualization/barchart/histogram.component.spec.ts index 679c2caa1..46494cd8d 100644 --- a/frontend/src/app/visualization/barchart/histogram.component.spec.ts +++ b/frontend/src/app/visualization/barchart/histogram.component.spec.ts @@ -1,63 +1,10 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; -import { Corpus } from '../../models'; +import { QueryModel } from '../../models'; import { commonTestBed } from '../../common-test-bed'; import { HistogramComponent } from './histogram.component'; - -const MOCK_CORPUS: Corpus = { - name: 'mock-corpus', - serverName: 'bogus', - title: 'Mock Corpus', - description: 'corpus for testing', - index: 'mock-corpus', - image: 'nothing', - minDate: new Date('1-1-1800'), - maxDate: new Date('1-1-2000'), - scan_image_type: 'nothing', - allow_image_download: false, - word_models_present: false, - fields: [ - { - name: 'content', - description: 'main content field', - displayName: 'Content', - displayType: 'text_content', - mappingType: 'text', - searchable: true, - 
downloadable: true, - searchFilter: undefined, - primarySort: false, - sortable: false, - hidden: false, - }, { - name: 'keyword-1', - description: 'a keyword field', - displayName: 'Keyword 1', - displayType: 'keyword', - mappingType: 'keyword', - searchable: true, - downloadable: true, - searchFilter: undefined, - primarySort: false, - sortable: false, - hidden: false, - }, { - name: 'text', - description: 'a text field', - displayName: 'Text', - displayType: 'text', - mappingType: 'text', - searchable: true, - downloadable: true, - searchFilter: undefined, - primarySort: false, - sortable: false, - hidden: false, - } - ] -}; - +import { mockCorpus3, mockField, mockField2 } from '../../../mock-data/corpus'; describe('HistogramCompoment', () => { let component: HistogramComponent; @@ -79,32 +26,36 @@ describe('HistogramCompoment', () => { it('should filter text fields', () => { - component.corpus = MOCK_CORPUS; + component.corpus = mockCorpus3; component.frequencyMeasure = 'documents'; + const query1 = new QueryModel(mockCorpus3); + query1.setQueryText('test'); + + const query2 = new QueryModel(mockCorpus3); + query2.setQueryText('test'); + query2.searchFields = [mockField, mockField2]; + const cases = [ { - query: { - queryText: 'test' - } + query: query1, + searchFields: undefined, }, { - query: { - queryText: 'test', - fields: ['content', 'text'], - } + query: query2, + searchFields: [mockField, mockField2] } ]; cases.forEach(testCase => { const newQuery = component.selectSearchFields(testCase.query); - expect(newQuery.fields).toEqual(testCase.query.fields); + expect(newQuery.searchFields).toEqual(testCase.query.searchFields); }); component.frequencyMeasure = 'tokens'; cases.forEach(testCase => { const newQuery = component.selectSearchFields(testCase.query); - expect(newQuery.fields).toEqual(['content']); + expect(newQuery.searchFields).toEqual([mockField2]); }); }); diff --git a/frontend/src/app/visualization/barchart/histogram.component.ts 
b/frontend/src/app/visualization/barchart/histogram.component.ts index 583489feb..6847288c9 100644 --- a/frontend/src/app/visualization/barchart/histogram.component.ts +++ b/frontend/src/app/visualization/barchart/histogram.component.ts @@ -1,11 +1,13 @@ import { Component, OnChanges, OnInit, SimpleChanges, } from '@angular/core'; import * as _ from 'lodash'; -import { AggregateResult, MultipleChoiceFilterData, RangeFilterData, +import { AggregateResult, HistogramSeries, QueryModel, HistogramDataPoint, - TermFrequencyResult} from '../../models/index'; + TermFrequencyResult, + MultipleChoiceFilterOptions, + RangeFilterOptions} from '../../models/index'; import { BarchartDirective } from './barchart.directive'; import { selectColor } from '../../utils/select-color'; @@ -31,15 +33,15 @@ export class HistogramComponent extends BarchartDirective im */ getAggregator() { let size = 0; - if (!this.visualizedField.searchFilter) { + if (!this.visualizedField.filterOptions) { return {name: this.visualizedField.name, size: 100}; } - const defaultData = this.visualizedField.searchFilter.defaultData; - if (defaultData.filterType === 'MultipleChoiceFilter') { - size = (defaultData as MultipleChoiceFilterData).optionCount; - } else if (defaultData.filterType === 'RangeFilter') { - size = (defaultData as RangeFilterData).max - (defaultData as RangeFilterData).min; + const filterOptions = this.visualizedField.filterOptions; + if (filterOptions.name === 'MultipleChoiceFilter') { + size = (filterOptions as MultipleChoiceFilterOptions).option_count; + } else if (filterOptions.name === 'RangeFilter') { + size = (filterOptions as RangeFilterOptions).upper - (filterOptions as RangeFilterOptions).lower; } return {name: this.visualizedField.name, size}; } diff --git a/frontend/src/app/visualization/barchart/timeline.component.ts b/frontend/src/app/visualization/barchart/timeline.component.ts index 5afcc2505..60292d2e1 100644 --- 
a/frontend/src/app/visualization/barchart/timeline.component.ts +++ b/frontend/src/app/visualization/barchart/timeline.component.ts @@ -3,8 +3,9 @@ import { Component, OnChanges, OnInit } from '@angular/core'; import * as d3TimeFormat from 'd3-time-format'; import * as _ from 'lodash'; -import { QueryModel, AggregateResult, TimelineSeries, DateFilterData, TimelineDataPoint, TermFrequencyResult, - TimeCategory } from '../../models/index'; +import { QueryModel, AggregateResult, TimelineSeries, TimelineDataPoint, TermFrequencyResult, + TimeCategory, + DateFilterData} from '../../models/index'; import { BarchartDirective } from './barchart.directive'; import * as moment from 'moment'; import 'chartjs-adapter-moment'; @@ -36,7 +37,9 @@ export class TimelineComponent extends BarchartDirective impl /** get min/max date for the entire graph and set domain and time category */ setTimeDomain() { - const currentDomain = this.visualizedField.searchFilter.currentData as DateFilterData; + const filter = this.queryModel.filters.find(f => f.corpusField.name === this.visualizedField.name) + || this.visualizedField.makeSearchFilter(); + const currentDomain = filter.currentData as DateFilterData; const min = new Date(currentDomain.min); const max = new Date(currentDomain.max); this.xDomain = [min, max]; @@ -265,12 +268,12 @@ export class TimelineComponent extends BarchartDirective impl /** * Add a date filter to a query model restricting it to the provided min and max values. 
*/ - addQueryDateFilter(query: QueryModel, min, max): QueryModel { - const queryModelCopy = _.cloneDeep(query); + addQueryDateFilter(query: QueryModel, min: Date, max: Date): QueryModel { + const queryModelCopy = query.clone(); // download zoomed in results - const filter = this.visualizedField.searchFilter; - filter.currentData = { filterType: 'DateFilter', min: this.timeFormat(min), max: this.timeFormat(max) }; - queryModelCopy.filters.push(filter); + const filter = this.visualizedField.makeSearchFilter(); + filter.set({ min, max }); + queryModelCopy.addFilter(filter); return queryModelCopy; } diff --git a/frontend/src/app/visualization/ngram/ngram.component.spec.ts b/frontend/src/app/visualization/ngram/ngram.component.spec.ts index c4e291c12..5f1521637 100644 --- a/frontend/src/app/visualization/ngram/ngram.component.spec.ts +++ b/frontend/src/app/visualization/ngram/ngram.component.spec.ts @@ -1,5 +1,7 @@ import { ComponentFixture, TestBed, waitForAsync } from '@angular/core/testing'; -import { convertToParamMap, Params } from '@angular/router'; +import { convertToParamMap } from '@angular/router'; +import { QueryModel } from '../../models'; +import { mockCorpus } from '../../../mock-data/corpus'; import { MockCorpusResponse } from '../../../mock-data/corpus-response'; import { commonTestBed } from '../../common-test-bed'; import { NgramComponent } from './ngram.component'; @@ -15,10 +17,9 @@ describe('NgramComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(NgramComponent); component = fixture.componentInstance; - component.queryModel = { - queryText: 'testing', - filters: [] - }; + const queryModel = new QueryModel(mockCorpus); + queryModel.setQueryText('testing'); + component.queryModel = queryModel; component.corpus = MockCorpusResponse[0] as any; component.visualizedField = {name: 'speech'} as any; component.asTable = false; diff --git a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts 
b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts index e8937d79e..df4f9caec 100644 --- a/frontend/src/app/visualization/wordcloud/wordcloud.component.ts +++ b/frontend/src/app/visualization/wordcloud/wordcloud.component.ts @@ -1,13 +1,16 @@ -import { Component, ElementRef, EventEmitter, Input, OnChanges, OnDestroy, OnInit, Output, SimpleChanges, - ViewChild, ViewEncapsulation } from '@angular/core'; +import { + Component, ElementRef, EventEmitter, Input, OnChanges, OnDestroy, OnInit, Output, SimpleChanges, + ViewChild, ViewEncapsulation +} from '@angular/core'; import * as cloud from 'd3-cloud'; import * as d3 from 'd3'; import { AggregateResult, CorpusField, QueryModel, Corpus, FreqTableHeaders } from '../../models/index'; -import { DialogService, SearchService, ApiService } from '../../services/index'; -import { BehaviorSubject, Observable } from 'rxjs'; +import { ApiService } from '../../services/index'; +import { BehaviorSubject } from 'rxjs'; import { VisualizationService } from '../../services/visualization.service'; +import { showLoading } from '../../utils/utils'; @Component({ selector: 'ia-wordcloud', @@ -33,7 +36,6 @@ export class WordcloudComponent implements OnChanges, OnInit, OnDestroy { { key: 'doc_count', label: 'Frequency' } ]; - public significantText: AggregateResult[]; public disableLoadMore = false; private tasksToCancel: string[] = []; @@ -49,6 +51,10 @@ export class WordcloudComponent implements OnChanges, OnInit, OnDestroy { constructor(private visualizationService: VisualizationService, private apiService: ApiService) { } + get readyToLoad() { + return (this.corpus && this.visualizedField && this.queryModel && this.palette); + } + ngOnInit() { if (this.resultsCount > 0) { this.disableLoadMore = this.resultsCount < this.batchSize; @@ -60,35 +66,37 @@ export class WordcloudComponent implements OnChanges, OnInit, OnDestroy { } ngOnChanges(changes: SimpleChanges) { - if ((this.corpus && this.visualizedField && this.queryModel && 
this.batchSize && this.palette) && - (changes.corpus || changes.visualizedField || changes.queryModel || changes.batchSize)) { - this.loadData(this.batchSize); + if (this.readyToLoad && + (changes.corpus || changes.visualizedField || changes.queryModel)) { + if (changes.queryModel) { + this.queryModel.update.subscribe(this.loadData.bind(this)); + } + this.loadData(); } else { - this.onDataLoaded(); + this.makeChart(); } } - loadData(size: number = null) { - this.isLoading.next(true); - this.visualizationService.getWordcloudData(this.visualizedField.name, this.queryModel, this.corpus, size).then(result => { - this.significantText = result; - this.onDataLoaded(); - }) - .catch(this.emitError.bind(this)); + loadData() { + showLoading( + this.isLoading, + this.visualizationService.getWordcloudData( + this.visualizedField.name, this.queryModel, this.corpus, this.batchSize + ).then(this.onDataLoaded.bind(this)).catch(this.emitError.bind(this)) + ); } loadMoreData() { - this.isLoading.next(true); - const queryModel = this.queryModel; - if (queryModel) { - this.visualizationService.getWordcloudTasks(this.visualizedField.name, queryModel, this.corpus).then(response => { - this.tasksToCancel = response; - this.apiService.pollTasks(response).then( outcome => { - const result = outcome[0]; - this.significantText = result; - this.onDataLoaded(); - }); - }).catch(this.emitError.bind(this)); + if (this.readyToLoad) { + showLoading( + this.isLoading, + this.visualizationService.getWordcloudTasks(this.visualizedField.name, this.queryModel, this.corpus.name).then(response => { + this.tasksToCancel = response; + return this.apiService.pollTasks(response).then( outcome => + this.onDataLoaded(outcome[0]) + ); + }).catch(this.emitError.bind(this)) + ); } } @@ -96,8 +104,12 @@ export class WordcloudComponent implements OnChanges, OnInit, OnDestroy { this.error.emit(error.message); } - onDataLoaded() { - this.isLoading.next(false); + onDataLoaded(result: AggregateResult[]) { + 
this.significantText = result; + this.makeChart(); + } + + makeChart() { this.chartElement = this.chartContainer.nativeElement; d3.select('svg.wordcloud').remove(); const inputRange = d3.extent(this.significantText.map(d => d.doc_count)) as number[]; diff --git a/frontend/src/app/word-models/related-words/related-words.component.ts b/frontend/src/app/word-models/related-words/related-words.component.ts index 828b5f9e5..45324068a 100644 --- a/frontend/src/app/word-models/related-words/related-words.component.ts +++ b/frontend/src/app/word-models/related-words/related-words.component.ts @@ -23,7 +23,6 @@ export class RelatedWordsComponent implements OnChanges { neighbours = 5; timeIntervals: string[] = []; - totalSimilarities: WordSimilarity[]; // similarities over all time periods totalData: WordSimilarity[]; // similarities of overall nearest neighbours per time period zoomedInData: WordSimilarity[][]; // data when focusing on a single time interval: shows nearest neighbours from that period @@ -44,7 +43,6 @@ export class RelatedWordsComponent implements OnChanges { getTotalData(): Promise { return this.wordModelsService.getRelatedWords(this.queryText, this.corpus.name, this.neighbours) .then(results => { - this.totalSimilarities = results.total_similarities; this.totalData = results.similarities_over_time; this.timeIntervals = results.time_points; this.zoomedInData = results.similarities_over_time_local_top_n; diff --git a/frontend/src/app/word-models/similarity-chart/similarity-chart.component.spec.ts b/frontend/src/app/word-models/similarity-chart/similarity-chart.component.spec.ts index cd893d774..869bcae4a 100644 --- a/frontend/src/app/word-models/similarity-chart/similarity-chart.component.spec.ts +++ b/frontend/src/app/word-models/similarity-chart/similarity-chart.component.spec.ts @@ -36,6 +36,7 @@ describe('SimilarityChartComponent', () => { beforeEach(() => { fixture = TestBed.createComponent(SimilarityChartComponent); component = 
fixture.componentInstance; + component.timeIntervals = EXAMPLE_DATA.labels; fixture.detectChanges(); }); diff --git a/frontend/src/app/word-models/similarity-chart/similarity-chart.component.ts b/frontend/src/app/word-models/similarity-chart/similarity-chart.component.ts index 693a1a1dd..2b434c6f3 100644 --- a/frontend/src/app/word-models/similarity-chart/similarity-chart.component.ts +++ b/frontend/src/app/word-models/similarity-chart/similarity-chart.component.ts @@ -1,5 +1,5 @@ import { Component, Input, OnChanges, OnDestroy, OnInit, SimpleChanges } from '@angular/core'; -import { Chart, ChartData, ChartOptions, Filler } from 'chart.js'; +import { Chart, ChartData, ChartOptions, ChartType, Filler, TooltipItem } from 'chart.js'; import Zoom from 'chartjs-plugin-zoom'; import * as _ from 'lodash'; import { BehaviorSubject } from 'rxjs'; @@ -32,6 +32,8 @@ export class SimilarityChartComponent implements OnInit, OnChanges, OnDestroy { tableHeaders: FreqTableHeaders; tableData: WordSimilarity[]; + averages: Number[]; + graphStyle = new BehaviorSubject<'line'|'bar'>('line'); currentTimeIndex = undefined; @@ -106,12 +108,33 @@ export class SimilarityChartComponent implements OnInit, OnChanges, OnDestroy { } } + formatLabel(value: number): string { + const index = this.averages.indexOf(value) + return this.timeIntervals[index]; + } + + getAverageTime(time: string): number { + const times = time.split('-').map(t => parseInt(t)); + const avg = Math.round(_.mean(times)); + return avg; + } + + formatDataPoint(point: any, style: string): {x: number, y: number} | number { + if (style == 'line') { + return {x: this.getAverageTime(point.time), y: point.similarity} + } + else { + return point.similarity + } + } + /** convert array of word similarities to a chartData object */ makeChartData(data: WordSimilarity[], style: 'line'|'bar'): ChartData { + this.averages = this.timeIntervals.map(t => this.getAverageTime(t)); const allSeries = _.groupBy(data, point => point.key); const 
datasets = _.values(allSeries).map((series, datasetIndex) => { const label = series[0].key; - const similarities = series.map(point => point.similarity); + const similarities = series.map((point) => this.formatDataPoint(point, style)); const colour = selectColor(this.palette, datasetIndex); return { label, @@ -166,7 +189,16 @@ export class SimilarityChartComponent implements OnInit, OnChanges, OnDestroy { }, }, scales: { - x: {}, + x: { + type: 'linear', + ticks: { + stepSize: 1, + autoSkip: true, + callback: (value: number): string | undefined => this.averages.includes( + value) ? this.formatLabel(value) : undefined, + minRotation: 30 + } + }, y: { title: { display: true, @@ -182,6 +214,9 @@ export class SimilarityChartComponent implements OnInit, OnChanges, OnDestroy { tooltip: { displayColors: true, callbacks: { + title: (tooltipItems: TooltipItem[]): string => { + return this.formatLabel(tooltipItems[0].parsed.x) + }, labelColor: (tooltipItem: any): any => { const color = tooltipItem.dataset.borderColor; return { diff --git a/frontend/src/app/word-models/word-models.component.html b/frontend/src/app/word-models/word-models.component.html index 75582b796..99cf47453 100644 --- a/frontend/src/app/word-models/word-models.component.html +++ b/frontend/src/app/word-models/word-models.component.html @@ -1,4 +1,4 @@ - +
diff --git a/frontend/src/app/word-models/word-models.component.ts b/frontend/src/app/word-models/word-models.component.ts index 9deba1b3f..57c10ff01 100644 --- a/frontend/src/app/word-models/word-models.component.ts +++ b/frontend/src/app/word-models/word-models.component.ts @@ -18,7 +18,7 @@ export class WordModelsComponent implements DoCheck, OnInit { user: User; corpus: Corpus; - modelDocumentation: any; + queryText: string; asTable = false; @@ -78,17 +78,9 @@ export class WordModelsComponent implements DoCheck, OnInit { if (!this.corpus.word_models_present) { this.router.navigate(['search', this.corpus.name]); } - this.getDocumentation(); } } - getDocumentation() { - this.wordModelsService - .wordModelsDocumentationRequest({ corpus_name: this.corpus.name }) - .then((result) => { - this.modelDocumentation = result.documentation; - }); - } submitQuery(): void { this.errorMessage = undefined; diff --git a/frontend/src/app/word-models/word-similarity/word-similarity.component.ts b/frontend/src/app/word-models/word-similarity/word-similarity.component.ts index e780d8d8b..b90bcf6f4 100644 --- a/frontend/src/app/word-models/word-similarity/word-similarity.component.ts +++ b/frontend/src/app/word-models/word-similarity/word-similarity.component.ts @@ -1,5 +1,4 @@ import { Component, EventEmitter, Input, OnChanges, Output, SimpleChanges } from '@angular/core'; -import { Chart, ChartData } from 'chart.js'; import * as _ from 'lodash'; import { BehaviorSubject } from 'rxjs'; import { showLoading } from '../../utils/utils'; diff --git a/frontend/src/assets/UU-CDH_logo_EN_def_UU_CDH_logo_EN_yellowwhite.jpg b/frontend/src/assets/UU-CDH_logo_EN_def_UU_CDH_logo_EN_yellowwhite.jpg new file mode 100644 index 000000000..1115cebbd Binary files /dev/null and b/frontend/src/assets/UU-CDH_logo_EN_def_UU_CDH_logo_EN_yellowwhite.jpg differ diff --git a/frontend/src/assets/dhlab.png b/frontend/src/assets/dhlab.png deleted file mode 100644 index 40064be4d..000000000 Binary files 
a/frontend/src/assets/dhlab.png and /dev/null differ diff --git a/frontend/src/assets/uu-dhlab.png b/frontend/src/assets/uu-dhlab.png deleted file mode 100644 index f81bc9a5b..000000000 Binary files a/frontend/src/assets/uu-dhlab.png and /dev/null differ diff --git a/frontend/src/mock-data/api.ts b/frontend/src/mock-data/api.ts index b67f3d7b3..30e8b4d09 100644 --- a/frontend/src/mock-data/api.ts +++ b/frontend/src/mock-data/api.ts @@ -58,4 +58,8 @@ export class ApiServiceMock { public keyInfo() { return of({ username: 'Thomas', email: 'thomas@cromwell.com' }); } + + public fieldCoverage() { + return Promise.resolve({}); + } } diff --git a/frontend/src/mock-data/corpus.ts b/frontend/src/mock-data/corpus.ts index 48f62d670..d20579a00 100644 --- a/frontend/src/mock-data/corpus.ts +++ b/frontend/src/mock-data/corpus.ts @@ -1,62 +1,139 @@ +/* eslint-disable @typescript-eslint/naming-convention */ import { BehaviorSubject } from 'rxjs'; import { findByName } from '../app/utils/utils'; -import { BooleanFilterData, Corpus, CorpusField, SearchFilter } from '../app/models'; +import { BooleanFilterOptions } from '../app/models/search-filter-options'; +import { Corpus, CorpusField } from '../app/models'; -const mockFilterData: BooleanFilterData = { +const mockFilterOptions: BooleanFilterOptions = { checked: false, - filterType: 'BooleanFilter', -}; - -export const mockFilter: SearchFilter = { - fieldName: 'great_field', + name: 'BooleanFilter', description: 'Use this filter to decide whether or not this field is great', - currentData: mockFilterData, - defaultData: mockFilterData, - useAsFilter: true, }; -export const mockField: CorpusField = { - name: "great_field", - description: "A really wonderful field", - displayName: "Greatest field", - displayType: "keyword", - mappingType: "keyword", +/** a keyword field with a boolean filter */ +export const mockField = new CorpusField({ + name: 'great_field', + description: 'A really wonderful field', + display_name: 'Greatest 
field', + display_type: 'keyword', + es_mapping: {type: 'keyword'}, hidden: false, sortable: false, - primarySort: false, + primary_sort: false, searchable: false, downloadable: false, - searchFilter: mockFilter, - csvCore: true, -}; + search_filter: mockFilterOptions, + results_overview: true, + search_field_core: true, + csv_core: true, + visualizations: [], + visualization_sort: null, + indexed: true, + required: false, +}); + +/* a keyword field with a multiple choice filter */ +export const mockFieldMultipleChoice = new CorpusField({ + name: 'greater_field', + description: 'A even more wonderful field', + display_name: 'Greater field', + display_type: 'keyword', + es_mapping: {type: 'keyword'}, + hidden: false, + sortable: false, + primary_sort: false, + searchable: false, + downloadable: false, + search_filter: { + name: 'MultipleChoiceFilter', + option_count: 10, + description: 'Select your favourite values!' + }, + results_overview: true, + search_field_core: true, + csv_core: true, + visualizations: [], + visualization_sort: null, + indexed: true, + required: false, +}); -export const mockField2: CorpusField = { +/** a text content field */ +export const mockField2 = new CorpusField({ name: 'speech', description: 'A content field', - displayName: 'Speechiness', - displayType: 'text', - mappingType: 'text', + display_name: 'Speechiness', + display_type: 'text_content', + es_mapping: {type: 'text'}, hidden: false, sortable: false, - primarySort: false, + primary_sort: false, searchable: true, downloadable: true, - searchFilter: null -}; + search_filter: null, + results_overview: true, + search_field_core: true, + csv_core: true, + visualizations: [], + visualization_sort: null, + indexed: true, + required: false, +}); -export const mockField3: CorpusField = { +/** a keyword field with sorting option */ +export const mockField3 = new CorpusField({ name: 'ordering', description: 'A field which can be sorted on', - displayName: 'Sort me', - displayType: 
'integer', - mappingType: 'keyword', + display_name: 'Sort me', + display_type: 'integer', + es_mapping: {type: 'keyword'}, hidden: false, sortable: true, - primarySort: false, + primary_sort: false, searchable: false, downloadable: true, - searchFilter: null -}; + results_overview: true, + search_filter: { + name: 'RangeFilter', + description: 'Filter me', + lower: 0, + upper: 100, + }, + search_field_core: false, + csv_core: true, + visualizations: [], + visualization_sort: null, + indexed: true, + required: false, +}); + +/** a date field */ +export const mockFieldDate = new CorpusField({ + name: 'date', + display_name: 'Date', + description: '', + display_type: 'date', + hidden: false, + sortable: true, + primary_sort: false, + searchable: false, + downloadable: true, + search_filter: { + name: 'DateFilter', + lower: '1800-01-01', + upper: '1899-12-31', + description: '' + }, + es_mapping: {type: 'date'}, + results_overview: true, + search_field_core: false, + csv_core: true, + visualizations: [], + visualization_sort: null, + indexed: true, + required: false, +}); + export const mockCorpus: Corpus = { name: 'test1', @@ -64,29 +141,56 @@ export const mockCorpus: Corpus = { index: 'test1', title: 'Test corpus', description: 'This corpus is for mocking', - minDate: new Date(), - maxDate: new Date(), - image: "test.jpg", - scan_image_type: "pdf", + minDate: new Date('1800-01-01'), + maxDate: new Date('1900-01-01'), + image: 'test.jpg', + scan_image_type: 'pdf', allow_image_download: false, word_models_present: false, fields: [mockField, mockField2], -}; + languages: ['English'], + category: 'Tests' +} as Corpus; -export const mockCorpus2: Corpus = { +export const mockCorpus2 = { name: 'test2', serverName: 'default', index: 'test2', title: 'Test corpus 2', description: 'This corpus is for mocking', + minDate: new Date('1850-01-01'), + maxDate: new Date('2000-01-01'), + image: 'test.jpg', + scan_image_type: 'pdf', + allow_image_download: false, + 
word_models_present: false, + fields: [mockField2], + languages: ['English', 'French'], + category: 'Different tests' +} as Corpus; + +export const mockCorpus3: Corpus = { + name: 'test3', + serverName: 'default', + index: 'test3', + title: 'Test corpus 3', + description: 'This corpus is for mocking', minDate: new Date(), maxDate: new Date(), image: 'test.jpg', scan_image_type: 'pdf', allow_image_download: false, word_models_present: false, - fields: [mockField2] -}; + fields: [mockField, mockField2, mockField3, mockFieldDate, mockFieldMultipleChoice], + languages: ['English'], + category: 'Tests', + documentContext: { + contextFields: [mockFieldDate], + displayName: 'day', + sortField: mockField3, + sortDirection: 'asc' + } +} as Corpus; export class CorpusServiceMock { private currentCorpusSubject = new BehaviorSubject(mockCorpus); diff --git a/frontend/src/mock-data/elastic-search.ts b/frontend/src/mock-data/elastic-search.ts index 26b2cef33..942f2fa3e 100644 --- a/frontend/src/mock-data/elastic-search.ts +++ b/frontend/src/mock-data/elastic-search.ts @@ -8,12 +8,6 @@ export class ElasticSearchServiceMock { public clearScroll() { } - esQueryToQueryModel(query: EsQuery, corpus: Corpus): QueryModel { - return { - queryText: '' - }; - } - getDocumentById(): Promise { return Promise.resolve({ id: '0', diff --git a/frontend/src/mock-data/query-model.ts b/frontend/src/mock-data/query-model.ts deleted file mode 100644 index 67fead3ab..000000000 --- a/frontend/src/mock-data/query-model.ts +++ /dev/null @@ -1,6 +0,0 @@ -import { QueryModel } from '../app/models'; - -export const mockQueryModel: QueryModel = { - queryText: 'Gouda', - filters: [] -}; diff --git a/frontend/src/mock-data/search.ts b/frontend/src/mock-data/search.ts index 8e88c9cff..c01a86239 100644 --- a/frontend/src/mock-data/search.ts +++ b/frontend/src/mock-data/search.ts @@ -1,11 +1,13 @@ -import { AggregateQueryFeedback, Corpus, CorpusField, QueryModel, SearchFilter, SearchFilterData } from 
'../app/models/index'; +import { SearchFilter } from '../app/models/search-filter'; +import { AggregateQueryFeedback, Corpus, CorpusField, QueryModel } from '../app/models/index'; export class SearchServiceMock { - public async aggregateSearch(corpus: Corpus, queryModel: QueryModel, aggregator: string): Promise { + public async aggregateSearch(corpus: Corpus, queryModel: QueryModel, aggregator: [{name: string}]): Promise { + const name = aggregator[0].name; return { completed: false, aggregations: { - aggregator: [{ + [name]: [{ key: '1999', doc_count: 200 }, { @@ -18,24 +20,26 @@ export class SearchServiceMock { } }; } + public async getRelatedWords() {} createQueryModel( - queryText: string = '', fields: string[] | null = null, filters: SearchFilter[] = [], + corpus: Corpus, + queryText: string = '', fields: CorpusField[] | null = null, filters: SearchFilter[] = [], sortField: CorpusField = null, sortAscending = false, highlight: number = null ): QueryModel { - const model: QueryModel = { - queryText, - filters, - sortBy: sortField ? sortField.name : undefined, - sortAscending - }; - if (fields) { - model.fields = fields; - } - if (highlight) { - model.highlight = highlight; + const model = new QueryModel(corpus); + model.setQueryText(queryText); + model.searchFields = fields; + filters.forEach(model.addFilter); + + if (sortField) { + model.setSortBy(sortField); + model.setSortDirection(sortAscending ? 'asc' : 'desc'); } + + model.highlightSize = highlight; + return model; } } diff --git a/package.json b/package.json index edc5eeb90..782a5ed47 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "i-analyzer", - "version": "4.0.2", + "version": "4.1.0", "license": "MIT", "scripts": { "postinstall": "yarn install-back && yarn install-front",