Skip to content

Commit

Permalink
Issue DmitryKey#175: Offered improvement to reconstruction of unstored fields with no position information
Browse files Browse the repository at this point in the history
  • Loading branch information
Chris Bamford committed Nov 25, 2019
1 parent 13f7243 commit 75c0d1a
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 28 deletions.
64 changes: 36 additions & 28 deletions src/main/java/org/getopt/luke/DocReconstructor.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

Expand Down Expand Up @@ -155,38 +156,45 @@ public Reconstructed reconstruct(int docNum) throws Exception {

DocsAndPositionsEnum newDpe = te.docsAndPositions(live, dpe, 0);

if (newDpe == null) { // no position info for this field
// re-construct without positions
GrowableStringArray gsa = (GrowableStringArray)
res.getReconstructedFields().get(fld);
if (newDpe != null) {
// we have positions for the field, process them accordingly
dpe = newDpe;

int num = dpe.advance(docNum);
if (num != docNum) { // either greater than or NO_MORE_DOCS
continue; // no data for this term in this doc
}

// we have computed the value earlier, using the bytesRef data structure
docTerm = te.term().utf8ToString();

GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
gsa.append(0, "|", docTerm);
// we are done. Move to the next field
break;
}

// we should have positions as well for the field, process them accordingly
dpe = newDpe;

int num = dpe.advance(docNum);
if (num != docNum) { // either greater than or NO_MORE_DOCS
continue; // no data for this term in this doc
}

// we have computed the value earlier, using the bytesRef data structure
docTerm = te.term().utf8ToString();

GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
for (int k = 0; k < dpe.freq(); k++) {
int pos = dpe.nextPosition();
gsa.append(pos, "|", docTerm);
for (int k = 0; k < dpe.freq(); k++) {
int pos = dpe.nextPosition();
gsa.append(pos, "|", docTerm);
}
} else {
// Reconstruct without positions (cross-reference via DocsEnum).
// NB if there are multiple terms they will all be added to the array at position 0
// (concatenated together, pipe-delimited)
DocsEnum docsEnum = te.docs(null, null);
if (docsEnum != null) {
int termDoc;
while ((termDoc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (termDoc == docNum) {
GrowableStringArray gsa = res.getReconstructedFields().get(fld);
if (gsa == null) {
gsa = new GrowableStringArray();
res.getReconstructedFields().put(fld, gsa);
}
gsa.append(0, "|", docTerm);
}
}
}
}
}
}
Expand Down
102 changes: 102 additions & 0 deletions src/test/java/org.apache.lucene.index/IndexTester2.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package org.apache.lucene.index;

import junit.framework.TestCase;
import org.apache.lucene.analysis.standard.UAX29URLEmailAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Version;
import org.getopt.luke.DocReconstructor;
import org.getopt.luke.IndexInfo;

import java.io.File;

/**
* Created by cbamford on 19/11/2019.
* Tests that unstored fields with no position info are reconstructed correctly.
* For completeness it also checks the 3 other field types.
*/
public class IndexTester2 extends TestCase {

    /** Location of the on-disk index that {@link #populate()} recreates before every test. */
    private static final String INDEX_PATH = "src/test/indices/lukeindex2";

    /** Directory backing the test index; opened in {@link #setUp()}, closed in {@link #tearDown()}. */
    private Directory directory;

    /** Reader over the freshly written index; kept as a field so it can be closed after the test. */
    private IndexReader reader;

    /** Reconstructor under test, built on top of {@link #reader}. */
    private DocReconstructor recon;

    /**
     * Opens the index directory and fills it with the two test documents.
     *
     * @throws Exception if the directory cannot be opened or the index cannot be written/read
     */
    @Override
    protected void setUp() throws Exception {
        super.setUp();
        directory = NIOFSDirectory.open(new File(INDEX_PATH));

        // JUnit 3 does not invoke tearDown() when setUp() throws, so release
        // whatever was opened ourselves if populating the index fails.
        boolean populated = false;
        try {
            populate();
            populated = true;
        } finally {
            if (!populated) {
                closeResources();
            }
        }
    }

    /**
     * Releases the reader and the directory opened for the test.
     *
     * @throws Exception if closing the reader or the directory fails
     */
    @Override
    protected void tearDown() throws Exception {
        try {
            closeResources();
        } finally {
            super.tearDown();
        }
    }

    /** Trivial test; its only real effect is to run the setUp()/tearDown() fixture once. */
    public void testDummy() {
        assertTrue(true);
    }

    /**
     * Reconstructs both indexed documents and checks that each of the four
     * field flavours yields the value that was originally indexed.
     *
     * @throws Exception if reconstruction fails
     */
    public void testVerifyReconstructionOfMultipleFieldTypesAcrossMultipleDocs() throws Exception {
        // First document (doc number 0) was indexed with "value1" in every field
        assertDocReconstructsTo(0, "value1");

        // Second document (doc number 1) was indexed with "value2" in every field
        assertDocReconstructsTo(1, "value2");
    }

    /**
     * Asserts that the stored field and the three reconstructed fields of the
     * given document all carry {@code expected}.
     *
     * @param docNum   document number passed to {@link DocReconstructor#reconstruct(int)}
     * @param expected value every field of that document was indexed with
     * @throws Exception if reconstruction fails
     */
    private void assertDocReconstructsTo(int docNum, String expected) throws Exception {
        DocReconstructor.Reconstructed reconstructed = recon.reconstruct(docNum);
        assertEquals(expected, (reconstructed.getStoredFields().get("stored"))[0].stringValue());
        assertEquals(expected, reconstructed.getReconstructedFields().get("stored+tvs").get(0));
        assertEquals(expected, reconstructed.getReconstructedFields().get("unstored-posns").get(0));
        assertEquals(expected, reconstructed.getReconstructedFields().get("unstored+posns").get(0));
    }

    /**
     * Recreates the index with two documents, then opens a reader on it and
     * builds the {@link DocReconstructor} used by the tests.
     *
     * @throws Exception if writing or reading the index fails
     */
    private void populate() throws Exception {
        // OpenMode.CREATE: start from an empty index on every run
        IndexWriterConfig indexCfg =
                new IndexWriterConfig(Version.LUCENE_4_10_3, new UAX29URLEmailAnalyzer());
        indexCfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        IndexWriter writer = new IndexWriter(directory, indexCfg);
        try {
            FieldType tvFtype = createUnstoredWithTermVectorsFieldType();
            writer.addDocument(buildDocument("value1", tvFtype));
            writer.addDocument(buildDocument("value2", tvFtype));
        } finally {
            // Always close the writer, even if adding a document failed
            writer.close();
        }

        reader = DirectoryReader.open(directory);
        IndexInfo idxInfo = new IndexInfo(reader, INDEX_PATH);
        String[] idxFields = idxInfo.getFieldNames().toArray(new String[0]);

        recon = new DocReconstructor(reader, idxFields, idxInfo.getNumTerms());
    }

    /**
     * Builds one test document carrying {@code value} in each of the four fields.
     *
     * @param value   text indexed into every field
     * @param tvFtype field type used for the {@code "stored+tvs"} field
     * @return the populated document
     */
    private Document buildDocument(String value, FieldType tvFtype) {
        Document doc = new Document();
        doc.add(new TextField("stored", value, Field.Store.YES));
        // NB despite its name, this field uses an unstored type with term vectors
        doc.add(new Field("stored+tvs", value, tvFtype));
        doc.add(new TextField("unstored+posns", value, Field.Store.NO));
        // StringField: presumably indexed without positions (see Lucene StringField docs)
        doc.add(new StringField("unstored-posns", value, Field.Store.NO));
        return doc;
    }

    /**
     * Creates a field type that is indexed and tokenized with positions and
     * offsets, keeps term vectors (with positions and offsets), and is not stored.
     *
     * @return the configured field type
     */
    private FieldType createUnstoredWithTermVectorsFieldType() {
        FieldType fType = new FieldType();
        fType.setStored(false);
        fType.setIndexed(true);
        fType.setTokenized(true);
        fType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        fType.setStoreTermVectors(true);
        fType.setStoreTermVectorOffsets(true);
        fType.setStoreTermVectorPositions(true);
        return fType;
    }

    /**
     * Closes the reader (if open) and then the directory (if open). The
     * directory is closed even when closing the reader throws.
     *
     * @throws Exception if closing either resource fails
     */
    private void closeResources() throws Exception {
        try {
            if (reader != null) {
                reader.close();
            }
        } finally {
            reader = null;
            if (directory != null) {
                directory.close();
                directory = null;
            }
        }
    }
}

0 comments on commit 75c0d1a

Please sign in to comment.