org.apache.lucene.benchmark.byTask.feeds.LineDocSourceTest.java Source code

Introduction

Here is the source code for org.apache.lucene.benchmark.byTask.feeds.LineDocSourceTest.java.
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Properties;

import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.HeaderLineParser;
import org.apache.lucene.benchmark.byTask.feeds.LineDocSource.LineParser;
import org.apache.lucene.benchmark.byTask.tasks.AddDocTask;
import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask;
import org.apache.lucene.benchmark.byTask.tasks.TaskSequence;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.IOUtils;

/** Tests the functionality of {@link LineDocSource}. */
public class LineDocSourceTest extends BenchmarkTestCase {

    /** Shared factory from which {@code createBZ2LineFile} obtains its "bzip2" compressor output stream. */
    private static final CompressorStreamFactory csFactory = new CompressorStreamFactory();

    /**
     * Writes the test content (an optional header line plus one document line) to {@code file}
     * through a "bzip2" compressor stream.
     *
     * @param file destination file
     * @param addHeader whether a header line is written before the document line
     * @throws Exception if the file cannot be opened, the compressor cannot be created, or writing fails
     */
    private void createBZ2LineFile(Path file, boolean addHeader) throws Exception {
        // The raw file stream is a resource of its own so that it is released even when
        // creating the compressor (or the writer on top of it) fails. Closing the writer
        // closes the compressor stream it wraps before the file stream itself is closed.
        try (OutputStream fileOut = Files.newOutputStream(file);
                BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                        csFactory.createCompressorOutputStream("bzip2", fileOut), StandardCharsets.UTF_8))) {
            writeDocsToFile(writer, addHeader, null);
        }
    }

    /**
     * Writes the test content through {@code writer}: an optional header line followed by a
     * single document line.
     *
     * <p>The header line consists of the header indicator and the title, date and body field
     * names, each separated by {@link WriteLineDocTask#SEP}. The document line consists of the
     * values {@code "title"}, {@code "date"} and the body field name. When {@code otherFields}
     * is given, its keys are appended to the header line and its values to the document line.
     *
     * @param writer destination of the lines; it is not closed by this method
     * @param addHeader whether the header line is written
     * @param otherFields extra field names (keys) and values, or {@code null} for none
     * @throws IOException if writing to {@code writer} fails
     */
    private void writeDocsToFile(BufferedWriter writer, boolean addHeader, Properties otherFields)
            throws IOException {
        final boolean hasExtraFields = otherFields != null;

        if (addHeader) {
            StringBuilder header = new StringBuilder();
            header.append(WriteLineDocTask.FIELDS_HEADER_INDICATOR)
                    .append(WriteLineDocTask.SEP).append(DocMaker.TITLE_FIELD)
                    .append(WriteLineDocTask.SEP).append(DocMaker.DATE_FIELD)
                    .append(WriteLineDocTask.SEP).append(DocMaker.BODY_FIELD);
            if (hasExtraFields) {
                // names of the additional fields go at the end of the header
                for (Object fieldName : otherFields.keySet()) {
                    header.append(WriteLineDocTask.SEP).append(fieldName.toString());
                }
            }
            writer.write(header.toString());
            writer.newLine();
        }

        // the single document line: title, date, body
        writer.write("title");
        writer.write(WriteLineDocTask.SEP);
        writer.write("date");
        writer.write(WriteLineDocTask.SEP);
        writer.write(DocMaker.BODY_FIELD);
        if (hasExtraFields) {
            // values of the additional fields go at the end of the document line
            for (Object fieldValue : otherFields.values()) {
                writer.write(WriteLineDocTask.SEP);
                writer.write(fieldValue.toString());
            }
        }
        writer.newLine();
    }

    /**
     * Writes the test content (an optional header line plus one document line) to {@code file}
     * as uncompressed UTF-8 text.
     *
     * @param file destination file
     * @param addHeader whether a header line is written before the document line
     * @throws Exception if the file cannot be opened or writing fails
     */
    private void createRegularLineFile(Path file, boolean addHeader) throws Exception {
        // try-with-resources guarantees the file handle is released even if writing fails
        try (BufferedWriter writer = Files.newBufferedWriter(file, StandardCharsets.UTF_8)) {
            writeDocsToFile(writer, addHeader, null);
        }
    }

    /**
     * Writes an uncompressed UTF-8 line file that always has a header line and that carries the
     * given extra fields in addition to title, date and body. Each extra field uses its own
     * name as its value.
     *
     * @param file destination file
     * @param extraFields names of the additional fields
     * @throws Exception if the file cannot be opened or writing fails
     */
    private void createRegularLineFileWithMoreFields(Path file, String... extraFields) throws Exception {
        // Build the extra fields first so the file is only opened once there is something to write.
        Properties extras = new Properties();
        for (String fieldName : extraFields) {
            extras.setProperty(fieldName, fieldName); // field-name == value
        }
        // try-with-resources guarantees the file handle is released even if writing fails
        try (BufferedWriter writer = Files.newBufferedWriter(file, StandardCharsets.UTF_8)) {
            writeDocsToFile(writer, true, extras);
        }
    }

    /**
     * Runs the index-and-search check three times over the same file, adding 1, 2 and then 4
     * documents.
     *
     * @param file line file to read documents from
     * @param lineParserClass line parser to configure, or {@code null} to leave it unset
     * @param storedField stored field to verify, or {@code null} to verify the body field
     * @throws Exception if indexing, searching or an assertion fails
     */
    private void doIndexAndSearchTest(Path file, Class<? extends LineParser> lineParserClass, String storedField)
            throws Exception {
        // 1 add = no extra repetitions, 2 adds = 1 extra repetition, 4 adds = 3 extra repetitions
        final int[] addCounts = { 1, 2, 4 };
        for (int addCount : addCounts) {
            doIndexAndSearchTestWithRepeats(file, lineParserClass, addCount, storedField);
        }
    }

    /**
     * Indexes {@code numAdds} documents read from {@code file} through a {@link LineDocSource},
     * then searches the resulting index and verifies the hit count and one stored field.
     *
     * @param file line file to read documents from
     * @param lineParserClass line parser to set as "line.parser", or {@code null} to leave it unset
     * @param numAdds number of {@link AddDocTask}s to run, which is also the expected hit count
     * @param storedField stored field whose value must equal its own name, or {@code null} to
     *        check the body field instead
     * @throws Exception if indexing, searching or an assertion fails
     */
    private void doIndexAndSearchTestWithRepeats(Path file, Class<? extends LineParser> lineParserClass,
            int numAdds, String storedField) throws Exception {

        IndexReader indexReader = null;
        PerfRunData perfRunData = null;
        try {
            Properties settings = new Properties();

            // Settings specific to LineDocSource.
            settings.setProperty("docs.file", file.toAbsolutePath().toString());
            if (lineParserClass != null) {
                settings.setProperty("line.parser", lineParserClass.getName());
            }

            // Settings for indexing.
            settings.setProperty("analyzer", WhitespaceAnalyzer.class.getName());
            settings.setProperty("content.source", LineDocSource.class.getName());
            settings.setProperty("directory", "RAMDirectory");
            settings.setProperty("doc.stored", "true");
            settings.setProperty("doc.index.props", "true");

            // Build the run data from the settings.
            perfRunData = new PerfRunData(new Config(settings));

            // Task sequence: create the index, add numAdds documents, close the index.
            TaskSequence sequence = new TaskSequence(perfRunData, "testBzip2", null, false);
            sequence.addTask(new CreateIndexTask(perfRunData));
            for (int added = 0; added < numAdds; added++) {
                sequence.addTask(new AddDocTask(perfRunData));
            }
            sequence.addTask(new CloseIndexTask(perfRunData));
            try {
                sequence.doLogic();
            } finally {
                sequence.close();
            }

            indexReader = DirectoryReader.open(perfRunData.getDirectory());
            IndexSearcher indexSearcher = newSearcher(indexReader);
            TopDocs hits = indexSearcher.search(new TermQuery(new Term("body", "body")), 10);
            assertEquals(numAdds, hits.totalHits);
            assertNotNull(hits.scoreDocs[0]);

            // The body field is added to all docs and satisfies field-name == value,
            // so it serves as the default field to verify.
            final String fieldToCheck = (storedField == null) ? DocMaker.BODY_FIELD : storedField;
            assertEquals("Wrong field value", fieldToCheck, indexSearcher.doc(0).get(fieldToCheck));
        } finally {
            IOUtils.close(indexReader, perfRunData);
        }

    }

    /** Tests LineDocSource with a bzip2 input stream whose content starts with a header line. */
    public void testBZip2() throws Exception {
        final Path bz2File = getWorkDir().resolve("one-line.bz2");
        createBZ2LineFile(bz2File, true);
        doIndexAndSearchTest(bz2File, null, null);
    }

    /** Tests LineDocSource with a bzip2 input stream whose content has no header line. */
    public void testBZip2NoHeaderLine() throws Exception {
        final Path bz2File = getWorkDir().resolve("one-line.bz2");
        createBZ2LineFile(bz2File, false);
        doIndexAndSearchTest(bz2File, null, null);
    }

    /** Tests LineDocSource with an uncompressed file that starts with a header line. */
    public void testRegularFile() throws Exception {
        final Path lineFile = getWorkDir().resolve("one-line");
        createRegularLineFile(lineFile, true);
        doIndexAndSearchTest(lineFile, null, null);
    }

    /**
     * Tests LineDocSource with an uncompressed file that starts with a header line, with
     * {@link HeaderLineParser} explicitly configured as the line parser.
     */
    public void testRegularFileSpecialHeader() throws Exception {
        final Path lineFile = getWorkDir().resolve("one-line");
        createRegularLineFile(lineFile, true);
        doIndexAndSearchTest(lineFile, HeaderLineParser.class, null);
    }

    /** Tests LineDocSource with an uncompressed file that has no header line. */
    public void testRegularFileNoHeaderLine() throws Exception {
        final Path lineFile = getWorkDir().resolve("one-line");
        createRegularLineFile(lineFile, false);
        doIndexAndSearchTest(lineFile, null, null);
    }

    /**
     * Verifies that indexing fails with an exception for lines that are not in a valid format.
     * Each malformed line is written to its own single-line file, without a header line.
     */
    public void testInvalidFormat() throws Exception {
        String[] testCases = new String[] { "", // empty line
                "title", // just title
                "title" + WriteLineDocTask.SEP, // title + SEP
                "title" + WriteLineDocTask.SEP + "body", // title + SEP + body
                // note that title + SEP + body + SEP is a valid line, which results in an
                // empty body
        };

        for (String testCase : testCases) {
            Path file = getWorkDir().resolve("one-line");
            // try-with-resources releases the file handle even if writing the line fails
            try (BufferedWriter writer = Files.newBufferedWriter(file, StandardCharsets.UTF_8)) {
                writer.write(testCase);
                writer.newLine();
            }
            expectThrows(Exception.class, () -> {
                doIndexAndSearchTest(file, null, null);
            });
        }
    }

    /**
     * Adds the doc name field, which is not part of the default header, as an extra field and
     * verifies its stored value.
     */
    public void testWithDocsName() throws Exception {
        final Path lineFile = getWorkDir().resolve("one-line");
        final String nameField = DocMaker.NAME_FIELD;
        createRegularLineFileWithMoreFields(lineFile, nameField);
        doIndexAndSearchTest(lineFile, null, nameField);
    }

    /**
     * Uses a field name that is not defined in DocMaker, and so will go to Properties, and
     * verifies its stored value.
     */
    public void testWithProperties() throws Exception {
        final String customField = "mySpecialField";
        final Path lineFile = getWorkDir().resolve("one-line");
        createRegularLineFileWithMoreFields(lineFile, customField);
        doIndexAndSearchTest(lineFile, null, customField);
    }

}