From 5460bdd27c180aeb0542b5fb66d060b6816a68ee Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 12 Dec 2024 15:27:10 -0500 Subject: [PATCH 1/3] account for cvoc multiples --- src/main/java/edu/harvard/iq/dataverse/api/Index.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java index c30a77acb58..0b3e3ac52d1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java @@ -44,6 +44,7 @@ import java.lang.reflect.Field; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; @@ -451,11 +452,11 @@ public Response clearOrphans(@QueryParam("sync") String sync) { public String getSolrSchema() { StringBuilder sb = new StringBuilder(); - - for (DatasetFieldType datasetField : datasetFieldService.findAllOrderedByName()) { + Map cvocTermUriMap = datasetFieldSvc.getCVocConf(true); + for (DatasetFieldType datasetFieldType : datasetFieldService.findAllOrderedByName()) { //ToDo - getSolrField() creates/returns a new object - just get it once and re-use - String nameSearchable = datasetField.getSolrField().getNameSearchable(); - SolrField.SolrType solrType = datasetField.getSolrField().getSolrType(); + String nameSearchable = datasetFieldType.getSolrField().getNameSearchable(); + SolrField.SolrType solrType = datasetFieldType.getSolrField().getSolrType(); String type = solrType.getType(); if (solrType.equals(SolrField.SolrType.EMAIL)) { /** @@ -474,7 +475,7 @@ public String getSolrSchema() { */ logger.info("email type detected (" + nameSearchable + ") See also https://github.com/IQSS/dataverse/issues/759"); } - String multivalued = datasetField.getSolrField().isAllowedToBeMultivalued().toString(); + String multivalued = 
Boolean.toString(datasetFieldType.getSolrField().isAllowedToBeMultivalued()|| cvocTermUriMap.containsKey(datasetFieldType.getId())); // sb.append(" \n"); } From f85c26c31e629a6ce9321f445b87d8234ac57b44 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Thu, 12 Dec 2024 16:50:24 -0500 Subject: [PATCH 2/3] release note, docs --- doc/release-notes/11095-fix-extcvoc-indexing.md | 7 +++++++ doc/sphinx-guides/source/admin/metadatacustomization.rst | 6 ++++-- doc/sphinx-guides/source/installation/config.rst | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 doc/release-notes/11095-fix-extcvoc-indexing.md diff --git a/doc/release-notes/11095-fix-extcvoc-indexing.md b/doc/release-notes/11095-fix-extcvoc-indexing.md new file mode 100644 index 00000000000..1459b88c122 --- /dev/null +++ b/doc/release-notes/11095-fix-extcvoc-indexing.md @@ -0,0 +1,7 @@ +Some External Controlled Vocabulary scripts/configurations, when used on a metadata field that is single valued could result +in indexing failure for the dataset (e.g. when the the script tried to index both the identifier and name of the identified entity for indexing). +Dataverse has been updated to correctly indicate the need for a multi-valued solr field in these cases in the call to /api/admin/index/solr/schema. +Configuring the Solr schema and the update-fields.sh script as usually recommended when using custom metadata blocks will resolve the issue. + +The overall release notes should include a solr update (which hopefully is required by an update to 9.7.0 anyway) and our standard instructions +should change to recommending use of the udpate-fields.sh script when using custom metadatablocks *and/or external vocabulary scripts*. 
diff --git a/doc/sphinx-guides/source/admin/metadatacustomization.rst b/doc/sphinx-guides/source/admin/metadatacustomization.rst index e5326efebef..eee83260804 100644 --- a/doc/sphinx-guides/source/admin/metadatacustomization.rst +++ b/doc/sphinx-guides/source/admin/metadatacustomization.rst @@ -559,8 +559,7 @@ Using External Vocabulary Services The Dataverse software has a mechanism to associate specific fields defined in metadata blocks with a vocabulary(ies) managed by external services. The mechanism relies on trusted third-party Javascripts. The mapping from field type to external vocabulary(ies) is managed via the :ref:`:CVocConf <:CVocConf>` setting. -*This functionality is considered 'experimental'. It may require significant effort to configure and is likely to evolve in subsequent Dataverse software releases.* - +*This functionality may require significant effort to configure and is likely to evolve in subsequent Dataverse software releases.* The effect of configuring this mechanism is similar to that of defining a field in a metadata block with 'allowControlledVocabulary=true': @@ -585,6 +584,9 @@ Configuration involves specifying which fields are to be mapped, to which Solr f These are all defined in the :ref:`:CVocConf <:CVocConf>` setting as a JSON array. Details about the required elements as well as example JSON arrays are available at https://github.com/gdcc/dataverse-external-vocab-support, along with an example metadata block that can be used for testing. The scripts required can be hosted locally or retrieved dynamically from https://gdcc.github.io/ (similar to how dataverse-previewers work). +Since external vocabulary scripts can change how fields are indexed (storing an identifier and name and/or values in different languages), +updating the solr schema as described in :ref:`update-solr-schema` should be done after adding new scripts to your configuration. 
+ Please note that in addition to the :ref:`:CVocConf` described above, an alternative is the :ref:`:ControlledVocabularyCustomJavaScript` setting. Protecting MetadataBlocks diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 30a36da9499..a310e9f96f8 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -4653,6 +4653,9 @@ The commands below should give you an idea of how to load the configuration, but ``curl -X PUT --upload-file cvoc-conf.json http://localhost:8080/api/admin/settings/:CVocConf`` +Since external vocabulary scripts can change how fields are indexed (storing an identifier and name and/or values in different languages), +updating the solr schema as described in :ref:`update-solr-schema` should be done after adding new scripts to your configuration. + .. _:ControlledVocabularyCustomJavaScript: :ControlledVocabularyCustomJavaScript From 968dfdaf9372065f885efb4a94d81b40ce6ec06f Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 19 Dec 2024 12:41:00 -0500 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Philip Durbin --- doc/release-notes/11095-fix-extcvoc-indexing.md | 10 +++++----- .../source/admin/metadatacustomization.rst | 2 +- doc/sphinx-guides/source/installation/config.rst | 2 +- src/main/java/edu/harvard/iq/dataverse/api/Index.java | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/release-notes/11095-fix-extcvoc-indexing.md b/doc/release-notes/11095-fix-extcvoc-indexing.md index 1459b88c122..f4931d81263 100644 --- a/doc/release-notes/11095-fix-extcvoc-indexing.md +++ b/doc/release-notes/11095-fix-extcvoc-indexing.md @@ -1,7 +1,7 @@ -Some External Controlled Vocabulary scripts/configurations, when used on a metadata field that is single valued could result -in indexing failure for the dataset (e.g. 
when the the script tried to index both the identifier and name of the identified entity for indexing). -Dataverse has been updated to correctly indicate the need for a multi-valued solr field in these cases in the call to /api/admin/index/solr/schema. +Some External Controlled Vocabulary scripts/configurations, when used on a metadata field that is single-valued, could result +in indexing failure for the dataset (e.g. when the script tried to index both the identifier and name of the identified entity). +Dataverse has been updated to correctly indicate the need for a multi-valued Solr field in these cases in the call to /api/admin/index/solr/schema. Configuring the Solr schema and the update-fields.sh script as usually recommended when using custom metadata blocks will resolve the issue. -The overall release notes should include a solr update (which hopefully is required by an update to 9.7.0 anyway) and our standard instructions -should change to recommending use of the udpate-fields.sh script when using custom metadatablocks *and/or external vocabulary scripts*. +The overall release notes should include a Solr update (which hopefully is required by an update to 9.7.0 anyway) and our standard instructions +should change to recommending use of the update-fields.sh script when using custom metadata blocks *and/or external vocabulary scripts*. diff --git a/doc/sphinx-guides/source/admin/metadatacustomization.rst b/doc/sphinx-guides/source/admin/metadatacustomization.rst index eee83260804..2a104354af9 100644 --- a/doc/sphinx-guides/source/admin/metadatacustomization.rst +++ b/doc/sphinx-guides/source/admin/metadatacustomization.rst @@ -585,7 +585,7 @@ These are all defined in the :ref:`:CVocConf <:CVocConf>` setting as a JSON arra The scripts required can be hosted locally or retrieved dynamically from https://gdcc.github.io/ (similar to how dataverse-previewers work).
Since external vocabulary scripts can change how fields are indexed (storing an identifier and name and/or values in different languages), -updating the solr schema as described in :ref:`update-solr-schema` should be done after adding new scripts to your configuration. +updating the Solr schema as described in :ref:`update-solr-schema` should be done after adding new scripts to your configuration. Please note that in addition to the :ref:`:CVocConf` described above, an alternative is the :ref:`:ControlledVocabularyCustomJavaScript` setting. diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index a310e9f96f8..a653a100c89 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -4654,7 +4654,7 @@ The commands below should give you an idea of how to load the configuration, but ``curl -X PUT --upload-file cvoc-conf.json http://localhost:8080/api/admin/settings/:CVocConf`` Since external vocabulary scripts can change how fields are indexed (storing an identifier and name and/or values in different languages), -updating the solr schema as described in :ref:`update-solr-schema` should be done after adding new scripts to your configuration. +updating the Solr schema as described in :ref:`update-solr-schema` should be done after adding new scripts to your configuration. .. 
_:ControlledVocabularyCustomJavaScript: diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java index 0b3e3ac52d1..bc9a8ae692b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java @@ -475,7 +475,7 @@ public String getSolrSchema() { */ logger.info("email type detected (" + nameSearchable + ") See also https://github.com/IQSS/dataverse/issues/759"); } - String multivalued = Boolean.toString(datasetFieldType.getSolrField().isAllowedToBeMultivalued()|| cvocTermUriMap.containsKey(datasetFieldType.getId())); + String multivalued = Boolean.toString(datasetFieldType.getSolrField().isAllowedToBeMultivalued() || cvocTermUriMap.containsKey(datasetFieldType.getId())); // sb.append(" \n"); }