From facb30e46b1863728efd1aec722cbd20c8f8e43d Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Fri, 4 Oct 2024 17:08:35 +0100 Subject: [PATCH] PR 13757 follow-up: add missing with-discountOverlaps Similarity constructor variants, CHANGES.txt entries (#13845) (cherry picked from commit dab731175c86cc25741f37845136d38a69a9d165) (cherry picked from commit cbd8b5218addb7712452893cdfe2a3795dc9e05a) Resolved Conflicts: lucene/CHANGES.txt --- lucene/CHANGES.txt | 5 +++++ .../lucene/search/similarities/Axiomatic.java | 15 ++++++++++++- .../search/similarities/DFISimilarity.java | 14 +++++++++++- .../search/similarities/DFRSimilarity.java | 4 ++-- .../search/similarities/IBSimilarity.java | 22 ++++++++++++++++++- .../IndriDirichletSimilarity.java | 7 ++++++ .../similarities/LMDirichletSimilarity.java | 8 ++++++- .../LMJelinekMercerSimilarity.java | 8 ++++++- .../search/similarities/LMSimilarity.java | 6 +++++ 9 files changed, 82 insertions(+), 7 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 13882514d358..52f0bf65d6ae 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -127,6 +127,8 @@ API Changes unifies Byte/FloatVectorValues incorporating RandomAccess* API and introduces DocIndexIterator for iterative access in place of direct inheritance from DISI. (Michael Sokolov) +* GITHUB#13845: Add missing with-discountOverlaps Similarity constructor variants. (Pierre Salagnac, Christine Poerschke, Robert Muir) + New Features --------------------- @@ -323,6 +325,9 @@ API Changes * GITHUB#13568, GITHUB#13750: Add DrillSideways#search method that supports any CollectorManagers for drill-sideways dimensions or drill-down. (Egor Potemkin) +* GITHUB#13757: For similarities, provide default computeNorm implementation and remove remaining discountOverlaps setters. 
+ (Christine Poerschke, Adrien Grand, Robert Muir) + New Features --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java index 77f71782e315..b4546946acfd 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Axiomatic.java @@ -44,13 +44,26 @@ public abstract class Axiomatic extends SimilarityBase { protected final int queryLen; /** - * Constructor setting all Axiomatic hyperparameters + * Constructor setting all Axiomatic hyperparameters and using default discountOverlaps value. * * @param s hyperparam for the growth function * @param queryLen the query length * @param k hyperparam for the primitive weighting function */ public Axiomatic(float s, int queryLen, float k) { + this(true, s, queryLen, k); + } + + /** + * Constructor setting all Axiomatic hyperparameters + * + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. 
+ * @param s hyperparam for the growth function + * @param queryLen the query length + * @param k hyperparam for the primitive weighting function + */ + public Axiomatic(boolean discountOverlaps, float s, int queryLen, float k) { + super(discountOverlaps); if (Float.isFinite(s) == false || Float.isNaN(s) || s < 0 || s > 1) { throw new IllegalArgumentException("illegal s value: " + s + ", must be between 0 and 1"); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java index b9c651008ccd..34d619ea69f3 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFISimilarity.java @@ -46,11 +46,23 @@ public class DFISimilarity extends SimilarityBase { private final Independence independence; /** - * Create DFI with the specified divergence from independence measure + * Create DFI with the specified divergence from independence measure and using default + * discountOverlaps value * * @param independenceMeasure measure of divergence from independence */ public DFISimilarity(Independence independenceMeasure) { + this(independenceMeasure, true); + } + + /** + * Create DFI with the specified parameters + * + * @param independenceMeasure measure of divergence from independence + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. 
+ */ + public DFISimilarity(Independence independenceMeasure, boolean discountOverlaps) { + super(discountOverlaps); this.independence = independenceMeasure; } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java index 0b3c1a5e7f02..08e424b32303 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -83,7 +83,7 @@ public class DFRSimilarity extends SimilarityBase { protected final Normalization normalization; /** - * Creates DFRSimilarity from the three components. + * Creates DFRSimilarity from the three components and using default discountOverlaps value. * *

<p>Note that <code>null</code> values are not allowed: if you want no normalization, instead * pass {@link NoNormalization}. @@ -98,7 +98,7 @@ public DFRSimilarity( } /** - * Creates DFRSimilarity from the three components. + * Creates DFRSimilarity from the three components and with the specified discountOverlaps value. * *

<p>Note that <code>null</code> values are not allowed: if you want no normalization, instead * pass {@link NoNormalization}. diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java index 5b0e93571b12..d2325d200335 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IBSimilarity.java @@ -76,7 +76,7 @@ public class IBSimilarity extends SimilarityBase { protected final Normalization normalization; /** - * Creates IBSimilarity from the three components. + * Creates IBSimilarity from the three components and using default discountOverlaps value. * *

<p>Note that <code>null</code> values are not allowed: if you want no normalization, instead * pass {@link NoNormalization}. @@ -86,6 +86,26 @@ public class IBSimilarity extends SimilarityBase { * @param normalization term frequency normalization */ public IBSimilarity(Distribution distribution, Lambda lambda, Normalization normalization) { + this(distribution, lambda, normalization, true); + } + + /** + * Creates IBSimilarity from the three components and with the specified discountOverlaps value. + * + *

<p>Note that <code>null</code> values are not allowed: if you want no normalization, instead + * pass {@link NoNormalization}. + * + * @param distribution probabilistic distribution modeling term occurrence + * @param lambda distribution's λ<sub>w</sub> parameter + * @param normalization term frequency normalization + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + */ + public IBSimilarity( + Distribution distribution, + Lambda lambda, + Normalization normalization, + boolean discountOverlaps) { + super(discountOverlaps); this.distribution = distribution; this.lambda = lambda; this.normalization = normalization; diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java index 9f708362bb5f..b3994c5dc46e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java @@ -37,6 +37,13 @@ public class IndriDirichletSimilarity extends LMSimilarity { /** The μ parameter. */ private final float mu; + /** Instantiates the similarity with the provided parameters. */ + public IndriDirichletSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float mu) { + super(collectionModel, discountOverlaps); + this.mu = mu; + } + /** Instantiates the similarity with the provided μ parameter.
*/ public IndriDirichletSimilarity(CollectionModel collectionModel, float mu) { super(collectionModel); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java index 51b1604aef1c..ab80d0d337e5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMDirichletSimilarity.java @@ -39,7 +39,13 @@ public class LMDirichletSimilarity extends LMSimilarity { /** Instantiates the similarity with the provided μ parameter. */ public LMDirichletSimilarity(CollectionModel collectionModel, float mu) { - super(collectionModel); + this(collectionModel, true, mu); + } + + /** Instantiates the similarity with the provided parameters. */ + public LMDirichletSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float mu) { + super(collectionModel, discountOverlaps); if (Float.isFinite(mu) == false || mu < 0) { throw new IllegalArgumentException( "illegal mu value: " + mu + ", must be a non-negative finite value"); diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java index e1990f34b0b6..7029fa8e133c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMJelinekMercerSimilarity.java @@ -43,7 +43,13 @@ public class LMJelinekMercerSimilarity extends LMSimilarity { /** Instantiates with the specified collectionModel and λ parameter. */ public LMJelinekMercerSimilarity(CollectionModel collectionModel, float lambda) { - super(collectionModel); + this(collectionModel, true, lambda); + } + + /** Instantiates with the specified collectionModel and parameters. 
*/ + public LMJelinekMercerSimilarity( + CollectionModel collectionModel, boolean discountOverlaps, float lambda) { + super(collectionModel, discountOverlaps); if (Float.isNaN(lambda) || lambda <= 0 || lambda > 1) { throw new IllegalArgumentException("lambda must be in the range (0 .. 1]"); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java index e1536db268fd..5bd48f37a34e 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/LMSimilarity.java @@ -43,6 +43,12 @@ public abstract class LMSimilarity extends SimilarityBase { /** Creates a new instance with the specified collection language model. */ public LMSimilarity(CollectionModel collectionModel) { + this(collectionModel, true); + } + + /** Creates a new instance with the specified collection language model and discountOverlaps. */ + public LMSimilarity(CollectionModel collectionModel, boolean discountOverlaps) { + super(discountOverlaps); this.collectionModel = collectionModel; }