dk.statsbiblioteket.netark.dvenabler.DVReaderTest.java Source code

Java tutorial

Introduction

Here is the source code for dk.statsbiblioteket.netark.dvenabler.DVReaderTest.java

Source

/*
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */
package dk.statsbiblioteket.netark.dvenabler;

import dk.statsbiblioteket.netark.dvenabler.wrapper.DVDirectoryReader;
import junit.framework.TestCase;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;
import java.util.*;

public class DVReaderTest extends TestCase {
    private static Log log = LogFactory.getLog(DVReaderTest.class);

    private static final String ID = "id";
    private static final String DV = "dv";
    private static final String DV_CONTENT = "dvcontent";
    private static final String SEARCH = "search";
    private static final String SEARCH_CONTENT = "searchcontent";
    private static final String SINGLE = "singlestring";
    private static final String SINGLE_CONTENT = "plainstoreS";
    private static final String MULTI = "multistring";
    private static final String MULTI_CONTENT_1 = "plainstoreM1";
    private static final String MULTI_CONTENT_2 = "plainstoreM2";
    private static final String DOUBLE = "double";
    private static final double DOUBLE_CONTENT = 12.13;
    private static final String FLOAT = "float";
    private static final float FLOAT_CONTENT = 18.5f;
    private static final long LONG_CONTENT = 87L;
    private static final String LONG = "long";

    private static final Version LUCENE_VERSION = Version.LUCENE_48;

    public void testCreateAndReadPlainIndex() throws IOException, ParseException {
        log.info("testCreateAndReadPlainIndex started");
        final File INDEX = generateIndex();
        try {
            assertIndexValues(INDEX, false);
        } finally {
            delete(INDEX);
        }
    }

    public void testCreateAndReadWrappedIndex() throws IOException, ParseException {
        log.info("testCreateAndReadPlainIndex started");
        final File INDEX = generateIndex();

        try {
            Directory directory = MMapDirectory.open(INDEX);
            IndexReader reader = new DVDirectoryReader(DirectoryReader.open(directory),
                    createDVFieldDescriptions(INDEX));
            IndexSearcher searcher = new IndexSearcher(reader);

            assertIndexValues(reader, searcher, true);
        } finally {
            delete(INDEX);
        }
    }

    public void testDVEnableIndex() throws IOException, ParseException {
        log.info("testDVEnableIndex started");

        final File INDEX_SRC = generateIndex();
        final File INDEX_DEST = new File("target/testindex.deletefreely2");
        try {
            IndexUtils.convert(INDEX_SRC, INDEX_DEST, createDVFieldDescriptions(INDEX_SRC));
            assertIndexValues(INDEX_SRC, false);
            assertIndexValues(INDEX_DEST, true);
        } finally {
            delete(INDEX_SRC);
            delete(INDEX_DEST);
        }
    }

    public void testLargerDVEnableIndex() throws IOException {
        final int DOCS = 1000;

        log.info("testLargerDVEnableIndex started");

        final File INDEX_SRC = generateIndex(DOCS);
        final File INDEX_DEST = new File("target/testindex.deletefreely.dest");
        try {
            IndexUtils.convert(INDEX_SRC, INDEX_DEST, createDVFieldDescriptions(INDEX_SRC));
            IndexReader readerSrc = DirectoryReader.open(MMapDirectory.open(INDEX_SRC));
            IndexReader readerDest = DirectoryReader.open(MMapDirectory.open(INDEX_DEST));

            long multiCount = 0;
            long singleCount = 0;
            long longCount = 0;
            long doubleCount = 0;
            for (int docID = 0; docID < DOCS; docID++) {
                {
                    String[] multisSrc = readerSrc.document(docID).getValues(MULTI);
                    if (multisSrc != null) {
                        List<String> dvs = getSortedSetDocValues(readerDest, docID, MULTI);
                        Arrays.sort(multisSrc);
                        Collections.sort(dvs);
                        assertEquals("There should be as many DV as stored for field " + MULTI, multisSrc.length,
                                dvs.size());
                        for (int i = 0; i < multisSrc.length; i++) {
                            assertEquals("Value " + i + " for field " + MULTI + " should be equal", multisSrc[i],
                                    dvs.get(i));
                            multiCount++;
                        }
                    }
                }
                {
                    String singleSrc = readerSrc.document(docID).get(SINGLE);
                    if (singleSrc != null) {
                        String dv = getSortedDocValue(readerDest, docID, SINGLE);
                        assertEquals("The DV for field " + SINGLE + " should match the stored value", singleSrc,
                                dv);
                        singleCount++;
                    }
                }
                {
                    IndexableField fieldSrc = readerSrc.document(docID).getField(LONG);
                    if (fieldSrc != null) {
                        long longSrc = fieldSrc.numericValue().longValue();
                        long dv = getLongDocValue(readerDest, docID, LONG);
                        assertEquals("The DV for field " + LONG + " should match the stored value", longSrc, dv);
                        longCount++;
                    }
                }
                {
                    IndexableField fieldSrc = readerSrc.document(docID).getField(DOUBLE);
                    if (fieldSrc != null) {
                        double doubleSrc = fieldSrc.numericValue().doubleValue();
                        double dv = getDoubleDocValue(readerDest, docID, DOUBLE);
                        assertEquals("The DV for field " + DOUBLE + " should match the stored value", doubleSrc,
                                dv);
                        doubleCount++;
                    }
                }
            }
            assertTrue("There should be at least 1 value for field " + MULTI + " in a document", multiCount > 0);
            assertTrue("There should be at least 1 value for field " + SINGLE + " in a document", singleCount > 0);
            assertTrue("There should be at least 1 value for field " + LONG + " in a document", longCount > 0);
            assertTrue("There should be at least 1 value for field " + DOUBLE + " in a document", doubleCount > 0);
            readerSrc.close();
            readerDest.close();
        } finally {
            delete(INDEX_SRC);
            delete(INDEX_DEST);
        }
    }

    private Set<DVConfig> createDVFieldDescriptions(File index) throws IOException {
        List<DVConfig> baseConfigs = IndexUtils.getDVConfigs(index);
        List<DVConfig> dvConfigs = new ArrayList<DVConfig>();
        for (DVConfig baseConfig : baseConfigs) {
            if (SINGLE.equals(baseConfig.getName())) {
                dvConfigs.add(baseConfig.set(FieldInfo.DocValuesType.SORTED));
            } else if (MULTI.equals(baseConfig.getName())) {
                dvConfigs.add(baseConfig.set(FieldInfo.DocValuesType.SORTED_SET));
            } else if (DOUBLE.equals(baseConfig.getName())) {
                dvConfigs.add(baseConfig.set(FieldInfo.DocValuesType.NUMERIC, FieldType.NumericType.DOUBLE));
            } else if (FLOAT.equals(baseConfig.getName())) {
                dvConfigs.add(baseConfig.set(FieldInfo.DocValuesType.NUMERIC, FieldType.NumericType.FLOAT));
            } else if (LONG.equals(baseConfig.getName())) {
                dvConfigs.add(baseConfig.set(FieldInfo.DocValuesType.NUMERIC, FieldType.NumericType.LONG));
            }
        }
        return new HashSet<DVConfig>(dvConfigs);
    }

    public static void assertIndexValues(File index, boolean dvExpected) throws IOException, ParseException {
        IndexReader reader = DirectoryReader.open(MMapDirectory.open(index));
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            assertIndexValues(reader, searcher, dvExpected);
        } finally {
            reader.close();
        }
    }

    private static void assertIndexValues(IndexReader reader, boolean dvExpected)
            throws IOException, ParseException {
        IndexSearcher searcher = new IndexSearcher(reader);
        assertIndexValues(reader, searcher, dvExpected);
    }

    private static void assertIndexValues(IndexReader reader, IndexSearcher searcher, boolean dvExpected)
            throws ParseException, IOException {

        final String M = "dvExpected=" + dvExpected + ". ";
        Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
        QueryParser queryParser = new QueryParser(LUCENE_VERSION, SEARCH, analyzer);
        Query query = queryParser.parse(SEARCH_CONTENT);
        TopDocs topDocs = searcher.search(query, 10);
        assertEquals(M + "Search for 'somecontent' should give the right number of results", 1, topDocs.totalHits);
        Document doc = reader.document(topDocs.scoreDocs[0].doc);

        assertEquals(M + "The stored value for the document should be correct", SINGLE_CONTENT, doc.get(SINGLE));
        assertEquals(M + "The stored long value for the document should be correct", Long.toString(LONG_CONTENT),
                doc.get(LONG));
        for (String value : Arrays.asList(MULTI_CONTENT_1, MULTI_CONTENT_2)) {
            assertTrue("The value " + value + " should be stored in field " + MULTI,
                    Arrays.asList(doc.getValues(MULTI)).contains(value));
        }

        /*        assertEquals(M + "The stored double value for the document should be correct",
                 Double.toString(DOUBLE_CONTENT), doc.get(DOUBLE));
                assertEquals(M + "The stored float value for the document should be correct",
                 Double.toString(FLOAT_CONTENT), doc.get(FLOAT));*/

        String dv = getSortedDocValue(reader, topDocs.scoreDocs[0].doc, DV);
        assertEquals("The plain DocValues content for the document should be correct", DV_CONTENT, dv);

        // SINGLE (single value String)
        try {
            String nonexistingDV = getSortedDocValue(reader, topDocs.scoreDocs[0].doc, SINGLE);
            if (!dvExpected) {
                fail(M + "Requesting the DocValue from the non-DV field " + SINGLE
                        + " should have failed but returned " + nonexistingDV);
            }
            assertEquals("Requesting DV from a stored field should work due to the wrapper", SINGLE_CONTENT,
                    nonexistingDV);
        } catch (Exception e) {
            if (dvExpected) {
                fail(M + "There should have been a DV-value for field " + SINGLE);
            }
        }

        // LONG
        try {
            long nonexistingDV = getLongDocValue(reader, topDocs.scoreDocs[0].doc, LONG);
            if (!dvExpected) {
                fail(M + "Requesting the DocValue from the non-DV field " + LONG
                        + " should have failed but returned " + nonexistingDV);
            }
            assertEquals("Requesting DV from a stored field should work due to the wrapper", LONG_CONTENT,
                    nonexistingDV);
        } catch (Exception e) {
            if (dvExpected) {
                fail(M + "There should have been a DV-value for field " + LONG);
            }
        }

        // MULTI (multi value String)
        try {
            List<String> dvs = getSortedSetDocValues(reader, topDocs.scoreDocs[0].doc, MULTI);
            if (!dvExpected) {
                fail(M + "Requesting the SortedSet DocValues from the non-DV field " + SINGLE
                        + " should have failed but returned " + dvs);
            }
            assertEquals("The number of returned DVs for field " + MULTI + " should match", 2, dvs.size());
            for (String value : Arrays.asList(MULTI_CONTENT_1, MULTI_CONTENT_2)) {
                assertTrue("The value " + value + " should be DocValued in field " + MULTI, dvs.contains(value));
            }
        } catch (Exception e) {
            if (dvExpected) {
                fail(M + "There should have been a DV-value for field " + MULTI);
            }
        }

    }

    private static List<String> getSortedSetDocValues(IndexReader reader, int docID, String field)
            throws IOException {
        if (!reader.getContext().isTopLevel) {
            throw new IllegalStateException("Expected the reader to be topLevel");
        }
        for (AtomicReaderContext atom : reader.getContext().leaves()) {
            if (atom.docBase <= docID && atom.docBase + atom.reader().maxDoc() > docID) {
                return getSortedSetDocValues(atom, docID, field);
            }
        }
        throw new IllegalArgumentException("The docID " + docID + " exceeded the index size");
    }

    private static List<String> getSortedSetDocValues(AtomicReaderContext atomContext, int docID, String field)
            throws IOException {
        SortedSetDocValues dvs = atomContext.reader().getSortedSetDocValues(field);
        if (dvs == null) {
            throw new IllegalStateException("No SortedSetDocValues for field '" + field + "'");
        }
        dvs.setDocument(docID);
        List<String> values = new ArrayList<String>();
        BytesRef result = new BytesRef();
        long ord;
        while ((ord = dvs.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
            dvs.lookupOrd(ord, result);
            values.add(result.utf8ToString());
        }
        return values;
    }

    private static String getSortedDocValue(IndexReader reader, int docID, String field) throws IOException {
        if (!reader.getContext().isTopLevel) {
            throw new IllegalStateException("Expected the reader to be topLevel");
        }
        for (AtomicReaderContext atom : reader.getContext().leaves()) {
            if (atom.docBase <= docID && atom.docBase + atom.reader().maxDoc() > docID) {
                return getSortedDocValue(atom, docID, field);
            }
        }
        throw new IllegalArgumentException("The docID " + docID + " exceeded the index size");
    }

    private static String getSortedDocValue(AtomicReaderContext atomContext, int docID, String field)
            throws IOException {
        SortedDocValues dvs = atomContext.reader().getSortedDocValues(field);
        if (dvs == null) {
            throw new IllegalStateException("No SortedDocValues for field '" + field + "'");
        }
        BytesRef result = new BytesRef();
        dvs.get(docID - atomContext.docBase, result);
        return result.utf8ToString();
    }

    private double getDoubleDocValue(IndexReader reader, int docID, String field) throws IOException {
        if (!reader.getContext().isTopLevel) {
            throw new IllegalStateException("Expected the reader to be topLevel");
        }
        for (AtomicReaderContext atom : reader.getContext().leaves()) {
            if (atom.docBase <= docID && atom.docBase + atom.reader().maxDoc() > docID) {
                return getDoubleDocValue(atom, docID, field);
            }
        }
        throw new IllegalArgumentException("The docID " + docID + " exceeded the index size");
    }

    private static Double getDoubleDocValue(AtomicReaderContext atomContext, int docID, String field)
            throws IOException {
        NumericDocValues dvs = atomContext.reader().getNumericDocValues(field);
        if (dvs == null) {
            throw new IllegalStateException("No NumericDocValues for field '" + field + "'");
        }
        return NumericUtils.sortableLongToDouble(dvs.get(docID));
    }

    private static long getLongDocValue(IndexReader reader, int docID, String field) throws IOException {
        if (!reader.getContext().isTopLevel) {
            throw new IllegalStateException("Expected the reader to be topLevel");
        }
        for (AtomicReaderContext atom : reader.getContext().leaves()) {
            if (atom.docBase <= docID && atom.docBase + atom.reader().maxDoc() > docID) {
                return getLongDocValue(atom, docID, field);
            }
        }
        throw new IllegalArgumentException("The docID " + docID + " exceeded the index size");
    }

    private static Long getLongDocValue(AtomicReaderContext atomContext, int docID, String field)
            throws IOException {
        NumericDocValues dvs = atomContext.reader().getNumericDocValues(field);
        if (dvs == null) {
            throw new IllegalStateException("No NumericDocValues for field '" + field + "'");
        }
        return dvs.get(docID);
    }

    // Semi.random index with stored fields only: multi, single and long. Some values missing from some documents
    private static File generateIndex(int documents) throws IOException {
        final File INDEX = new File("target/testindex.deletefreely." + documents);
        final long seed = new Random().nextLong();
        Random random = new Random(seed);
        log.info("Testing with random seed" + seed);
        Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);

        final FieldType SINGLE_F = new FieldType();
        SINGLE_F.setIndexed(true);
        SINGLE_F.setStored(true);

        final FieldType MULTI_F = new FieldType();
        MULTI_F.setIndexed(true);
        MULTI_F.setStored(true);

        final FieldType SEARCH_F = new FieldType();
        SEARCH_F.setIndexed(true);

        final FieldType LONG_F = new FieldType();
        LONG_F.setIndexed(true);
        LONG_F.setStored(true);
        LONG_F.setNumericType(FieldType.NumericType.LONG);

        final FieldType DOUBLE_F = new FieldType();
        DOUBLE_F.setIndexed(true);
        DOUBLE_F.setStored(true);
        DOUBLE_F.setNumericType(FieldType.NumericType.DOUBLE);

        IndexWriter indexWriter = new IndexWriter(MMapDirectory.open(INDEX),
                new IndexWriterConfig(LUCENE_VERSION, analyzer));
        for (int docID = 0; docID < documents; docID++) {
            Document document = new Document();
            document.add(new Field(ID, Integer.toString(docID), SINGLE_F));
            document.add(new Field(SEARCH, SEARCH_CONTENT + "_" + docID, SEARCH_F));
            if (random.nextInt(5) > 0) {
                document.add(new Field(SINGLE, SINGLE_CONTENT + "_r" + random.nextInt(), SINGLE_F));
            }
            if (random.nextInt(5) > 0) {
                document.add(new Field(MULTI, MULTI_CONTENT_1 + "_" + docID, MULTI_F));
                if (random.nextInt(3) > 0) {
                    document.add(new Field(MULTI, MULTI_CONTENT_2 + "_random" + random.nextInt(5), MULTI_F));
                }
            }
            if (random.nextInt(5) > 0) {
                document.add(new LongField(LONG, random.nextLong(), LONG_F));
            }
            if (random.nextInt(5) > 0) {
                document.add(new DoubleField(DOUBLE, random.nextDouble(), DOUBLE_F));
            }
            indexWriter.addDocument(document);
            if (docID == documents / 3) {
                indexWriter.commit(); // Ensure multi-segment
            }
        }
        indexWriter.commit();
        indexWriter.close();
        return INDEX;
    }

    public static File generateIndex() throws IOException {
        final File INDEX = new File("target/testindex.deletefreely");
        Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);

        final FieldType SINGLE_F = new FieldType();
        SINGLE_F.setIndexed(true);
        SINGLE_F.setStored(true);

        final FieldType MULTI_F = new FieldType();
        MULTI_F.setIndexed(true);
        MULTI_F.setStored(true);

        final FieldType SEARCH_F = new FieldType();
        SEARCH_F.setIndexed(true);

        final FieldType LONG_F = new FieldType();
        LONG_F.setIndexed(true);
        LONG_F.setStored(true);
        LONG_F.setNumericType(FieldType.NumericType.LONG);

        /*        final FieldType DOUBLE_F = new FieldType();
                DOUBLE_F.setIndexed(true);
                DOUBLE_F.setStored(true);
                DOUBLE_F.setNumericType(FieldType.NumericType.DOUBLE);
            
                final FieldType FLOAT_F = new FieldType();
                FLOAT_F.setIndexed(true);
                FLOAT_F.setStored(true);
                FLOAT_F.setNumericType(FieldType.NumericType.FLOAT);
          */

        /*        final FieldType STR_DV = new FieldType();
                STR_DV.setIndexed(true);
                STR_DV.setStored(true);
                STR_DV.setDocValueType(FieldInfo.DocValuesType.SORTED);*/

        IndexWriter indexWriter = new IndexWriter(MMapDirectory.open(INDEX),
                new IndexWriterConfig(LUCENE_VERSION, analyzer));
        {
            Document document = new Document();
            document.add(new Field(ID, "1", MULTI_F));
            document.add(new Field(SEARCH, SEARCH_CONTENT, SEARCH_F));
            document.add(new Field(SINGLE, SINGLE_CONTENT, MULTI_F));
            document.add(new Field(MULTI, MULTI_CONTENT_1, MULTI_F));
            document.add(new Field(MULTI, MULTI_CONTENT_2, MULTI_F));
            document.add(new LongField(LONG, LONG_CONTENT, LONG_F));
            //            document.add(new DoubleField(DOUBLE, DOUBLE_CONTENT, DOUBLE_F));
            //            document.add(new FloatField(FLOAT, FLOAT_CONTENT, FLOAT_F));
            document.add(new SortedDocValuesField(DV, new BytesRef(DV_CONTENT)));
            indexWriter.addDocument(document);
        }
        indexWriter.commit();
        indexWriter.close();
        return INDEX;
    }

    public static void delete(File path) {
        File[] subs = path.listFiles();
        if (subs != null) {
            for (File subPath : subs) {
                delete(subPath);
            }
        }
        if (!path.delete()) {
            log.warn("Unable to delete '" + path + "'");
        }
    }
}