From 2c5c5016a802687389befbb91c4bf466eee1d9f0 Mon Sep 17 00:00:00 2001 From: "Joao Victor G. Rodrigues" Date: Thu, 17 Oct 2024 10:10:26 -0400 Subject: [PATCH] Feat/return-filename-with-text-snippets (#7373) * index filename with fulltext object * return filename on elasticsearch query * add migrations to apply reindex --- .../index.ts | 17 +++++++++ ...sist_filename_with_fullText_object.spec.ts | 37 +++++++++++++++++++ .../specs/fixtures.ts | 13 +++++++ .../types.ts | 13 +++++++ app/api/search.v2/buildQuery.ts | 4 +- app/api/search.v2/searchResponse.ts | 5 ++- .../search.v2/specs/snippetsSearch.spec.ts | 12 ++++-- app/api/search/elasticTypes.ts | 2 +- app/api/search/entitiesIndex.js | 1 + 9 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/index.ts create mode 100644 app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/specs/169-reindex_persist_filename_with_fullText_object.spec.ts create mode 100644 app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/specs/fixtures.ts create mode 100644 app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/types.ts diff --git a/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/index.ts b/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/index.ts new file mode 100644 index 0000000000..2d46a71aac --- /dev/null +++ b/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/index.ts @@ -0,0 +1,17 @@ +import { Db } from 'mongodb'; + +export default { + delta: 169, + + name: 'reindex_persist_filename_with_fullText_object', + + description: + "We're now indexing document.filename within the fullText object on elasticsearch, this is usefull because on search/v2 endpoint we need to return which filename each text snippet belongs to.", + + reindex: true, + + // 
eslint-disable-next-line @typescript-eslint/no-unused-vars + async up(db: Db) { + process.stdout.write(`${this.name}...\r\n`); + }, +}; diff --git a/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/specs/169-reindex_persist_filename_with_fullText_object.spec.ts b/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/specs/169-reindex_persist_filename_with_fullText_object.spec.ts new file mode 100644 index 0000000000..30a56fadf0 --- /dev/null +++ b/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/specs/169-reindex_persist_filename_with_fullText_object.spec.ts @@ -0,0 +1,37 @@ +import { Db } from 'mongodb'; + +import testingDB from 'api/utils/testing_db'; +import migration from '../index'; +import { Fixture } from '../types'; +import { fixtures } from './fixtures'; + +let db: Db | null; + +const initTest = async (fixture: Fixture) => { + await testingDB.setupFixturesAndContext(fixture); + db = testingDB.mongodb!; + await migration.up(db); +}; + +beforeAll(async () => { + // eslint-disable-next-line @typescript-eslint/no-unused-vars + jest.spyOn(process.stdout, 'write').mockImplementation((str: string | Uint8Array) => true); +}); + +afterAll(async () => { + await testingDB.tearDown(); +}); + +describe('migration test', () => { + beforeAll(async () => { + await initTest(fixtures); + }); + + it('should have a delta number', () => { + expect(migration.delta).toBe(169); + }); + + it('should check if a reindex is needed', async () => { + expect(migration.reindex).toBe(true); + }); +}); diff --git a/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/specs/fixtures.ts b/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/specs/fixtures.ts new file mode 100644 index 0000000000..e0e2c34cd0 --- /dev/null +++ b/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/specs/fixtures.ts @@ -0,0 +1,13 @@ +import { 
ObjectId } from 'mongodb'; +import { Fixture } from '../types'; + +const fixtures: Fixture = { + entities: [ + { + _id: new ObjectId(), + title: 'test_doc', + }, + ], +}; + +export { fixtures }; diff --git a/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/types.ts b/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/types.ts new file mode 100644 index 0000000000..f98b5fcc36 --- /dev/null +++ b/app/api/migrations/migrations/169-reindex_persist_filename_with_fullText_object/types.ts @@ -0,0 +1,13 @@ +import { ObjectId } from 'mongodb'; + +interface Entity { + _id: ObjectId; + title: string; + [k: string]: unknown | undefined; +} + +interface Fixture { + entities: Entity[]; +} + +export type { Entity, Fixture }; diff --git a/app/api/search.v2/buildQuery.ts b/app/api/search.v2/buildQuery.ts index dbbd0a02ac..3a5b2b936a 100644 --- a/app/api/search.v2/buildQuery.ts +++ b/app/api/search.v2/buildQuery.ts @@ -53,7 +53,9 @@ const fullTextSearch = ( type: 'fullText', score_mode: 'max', inner_hits: { - _source: false, + _source: { + excludes: ['fullText*'], + }, ...snippetsHighlight(query, [{ 'fullText_*': {} }]), }, query: { diff --git a/app/api/search.v2/searchResponse.ts b/app/api/search.v2/searchResponse.ts index c39aa5f377..bbf62a2911 100644 --- a/app/api/search.v2/searchResponse.ts +++ b/app/api/search.v2/searchResponse.ts @@ -12,10 +12,10 @@ function getSnippetsForNonFullText(hit: ElasticHit) { } function extractFullTextSnippets(hit: ElasticHit) { - const fullTextSnippets: { text: string; page: number }[] = []; + const fullTextSnippets: { text: string; page: number; filename: string }[] = []; if (hit.inner_hits && hit.inner_hits.fullText.hits.hits[0]?.highlight) { - const { highlight } = hit.inner_hits.fullText.hits.hits[0]; + const { highlight, _source } = hit.inner_hits.fullText.hits.hits[0]; const regex = /\[{2}(\d+)]{2}/g; Object.values(highlight).forEach(snippets => { @@ -24,6 +24,7 @@ function 
extractFullTextSnippets(hit: ElasticHit) { fullTextSnippets.push({ text: snippet.replace(regex, ''), page: matches ? Number(matches[1]) : 0, + filename: _source.filename, }); }); }); diff --git a/app/api/search.v2/specs/snippetsSearch.spec.ts b/app/api/search.v2/specs/snippetsSearch.spec.ts index b7f77c9aeb..4527f874a1 100644 --- a/app/api/search.v2/specs/snippetsSearch.spec.ts +++ b/app/api/search.v2/specs/snippetsSearch.spec.ts @@ -42,8 +42,8 @@ describe('searchSnippets', () => { count: 2, metadata: [], fullText: [ - { page: 2, text: matches }, - { page: 4, text: matches }, + { page: 2, text: matches, filename: 'entity1SharedId.pdf' }, + { page: 4, text: matches, filename: 'entity1SharedId.pdf' }, ], }, }), @@ -97,7 +97,13 @@ describe('searchSnippets', () => { snippets: { count: 1, metadata: [], - fullText: [{ page: 2, text: expect.stringContaining('searched:term') }], + fullText: [ + { + page: 2, + text: expect.stringContaining('searched:term'), + filename: 'entity4SharedId.pdf', + }, + ], }, }), ]; diff --git a/app/api/search/elasticTypes.ts b/app/api/search/elasticTypes.ts index a3fd4ecf4e..417ee4bc74 100644 --- a/app/api/search/elasticTypes.ts +++ b/app/api/search/elasticTypes.ts @@ -26,7 +26,7 @@ export interface ElasticHit { fields?: any; highlight?: any; // eslint-disable-next-line camelcase - inner_hits?: { fullText: { hits: { hits: [{ highlight: {} }] } } }; + inner_hits?: { fullText: { hits: { hits: [{ highlight: {}; _source: Record }] } } }; // eslint-disable-next-line camelcase matched_queries?: string[]; sort?: string[]; diff --git a/app/api/search/entitiesIndex.js b/app/api/search/entitiesIndex.js index be50c67a7c..3a76cb0642 100644 --- a/app/api/search/entitiesIndex.js +++ b/app/api/search/entitiesIndex.js @@ -54,6 +54,7 @@ function setFullTextSettings(defaultDocument, id, body, doc) { } const fullTextObject = { [`fullText_${language}`]: fullText, + filename: defaultDocument.filename, fullText: { name: 'fullText', parent: id }, }; 
body.push(fullTextObject);