io.anserini.index.IndexGov2.java Source code

Introduction

Here is the source code for io.anserini.index.IndexGov2.java, a command-line tool that builds a Lucene index over the TREC GOV2 web collection. It streams documents through Lucene's benchmark TrecContentSource (with TrecGov2Parser) and indexes them on a configurable number of worker threads.

Source

package io.anserini.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.benchmark.byTask.feeds.TrecContentSource;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public final class IndexGov2 {
    private static final Logger LOG = LogManager.getLogger(IndexGov2.class);

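    /**
     * Builds a TrecContentSource that streams GOV2 documents from dataDir,
     * parsing them with Lucene's benchmark TrecGov2Parser. The source makes a
     * single pass over the collection (content.source.forever = false).
     */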
    private static TrecContentSource createGov2Source(String dataDir) {
        TrecContentSource tcs = new TrecContentSource();
        Properties props = new Properties();
        props.setProperty("print.props", "false");
        props.setProperty("content.source.verbose", "false");
        props.setProperty("content.source.excludeIteration", "true");
        props.setProperty("docs.dir", dataDir);
        props.setProperty("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
        props.setProperty("content.source.forever", "false");
        tcs.setConfig(new Config(props));
        try {
            tcs.resetInputs();
        } catch (IOException e) {
            // Fail fast: if the collection directory cannot be read, the
            // returned source would be unusable anyway.
            throw new RuntimeException(e);
        }
        return tcs;
    }

    private static final String INPUT_OPTION = "input";
    private static final String INDEX_OPTION = "index";
    private static final String DOCLIMIT_OPTION = "doclimit";
    private static final String THREADS_OPTION = "threads";
    private static final String UPDATE_OPTION = "update";
    private static final String POSITIONS_OPTION = "positions";
    private static final String OPTIMIZE_OPTION = "optimize";

    @SuppressWarnings("static-access")
    public static void main(String[] args) throws Exception {
        Options options = new Options();
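        // Register command-line options (OptionBuilder relies on static
        // access, hence the @SuppressWarnings("static-access") on this method).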
        options.addOption(
                OptionBuilder.withArgName("path").hasArg().withDescription("input data path").create(INPUT_OPTION));
        options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output index path")
                .create(INDEX_OPTION));
        options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexer threads")
                .create(THREADS_OPTION));
        options.addOption(OptionBuilder.withArgName("num").hasArg()
                .withDescription("max number of documents to index (-1 to index everything)")
                .create(DOCLIMIT_OPTION));

        options.addOption(UPDATE_OPTION, false, "append documents to an existing index");
        options.addOption(POSITIONS_OPTION, false, "index positions");
        options.addOption(OPTIMIZE_OPTION, false, "merge all index segments");

        CommandLine cmdline = null;
        CommandLineParser parser = new GnuParser();
        try {
            cmdline = parser.parse(options, args);
        } catch (ParseException exp) {
            System.err.println("Error parsing command line: " + exp.getMessage());
            System.exit(-1);
        }

        if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)
                || !cmdline.hasOption(THREADS_OPTION)) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.setWidth(100);
            formatter.printHelp(IndexGov2.class.getCanonicalName(), options);
            System.exit(-1);
        }

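        // Pull option values; the document limit defaults to -1 (index everything).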
        final String dirPath = cmdline.getOptionValue(INDEX_OPTION);
        final String dataDir = cmdline.getOptionValue(INPUT_OPTION);
        final int docCountLimit = cmdline.hasOption(DOCLIMIT_OPTION)
                ? Integer.parseInt(cmdline.getOptionValue(DOCLIMIT_OPTION))
                : -1;
        final int numThreads = Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION));

        final boolean doUpdate = cmdline.hasOption(UPDATE_OPTION);
        final boolean positions = cmdline.hasOption(POSITIONS_OPTION);
        final boolean optimize = cmdline.hasOption(OPTIMIZE_OPTION);

        final Analyzer a = new EnglishAnalyzer();
        final TrecContentSource trecSource = createGov2Source(dataDir);
        final Directory dir = FSDirectory.open(Paths.get(dirPath));

        LOG.info("Index path: " + dirPath);
        LOG.info("Doc limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
        LOG.info("Threads: " + numThreads);
        LOG.info("Positions: " + positions);
        LOG.info("Optimize (merge segments): " + optimize);

        final IndexWriterConfig config = new IndexWriterConfig(a);

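        // APPEND adds documents to an existing index; CREATE overwrites any
        // index already present at the output path.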
        if (doUpdate) {
            config.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        } else {
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        }

        final IndexWriter writer = new IndexWriter(dir, config);
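        // Gov2IndexThreads (a companion class in this package) drives the
        // worker threads that pull documents from the content source and feed
        // the shared writer.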
        Gov2IndexThreads threads = new Gov2IndexThreads(writer, positions, trecSource, numThreads, docCountLimit);
        LOG.info("Indexer: start");

        final long t0 = System.currentTimeMillis();

        threads.start();

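        // Poll until every worker thread reports completion.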
        while (!threads.done()) {
            Thread.sleep(100);
        }
        threads.stop();

        final long t1 = System.currentTimeMillis();
        LOG.info("Indexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + writer.maxDoc() + " docs");
        if (!doUpdate && docCountLimit != -1 && writer.maxDoc() != docCountLimit) {
            throw new RuntimeException("w.maxDoc()=" + writer.maxDoc() + " but expected " + docCountLimit);
        }
        if (threads.failed.get()) {
            throw new RuntimeException("exceptions during indexing");
        }

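        // Commit, attaching a small piece of user data to the commit point.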
        final long t2 = System.currentTimeMillis();

        final Map<String, String> commitData = new HashMap<>();
        commitData.put("userData", "multi");
        writer.setCommitData(commitData);
        writer.commit();
        final long t3 = System.currentTimeMillis();
        LOG.info("Indexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)");

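        // forceMerge(1) merges the index down to a single segment
        // (historically called "optimize").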
        if (optimize) {
            LOG.info("Indexer: merging all segments");
            writer.forceMerge(1);
            final long t4 = System.currentTimeMillis();
            LOG.info("Indexer: segments merged (took " + (t4 - t3) / 1000.0 + " sec)");
        }

        LOG.info("Indexer: at close: " + writer.segString());
        final long tCloseStart = System.currentTimeMillis();
        writer.close();
        LOG.info("Indexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec");
        dir.close();
        final long tFinal = System.currentTimeMillis();
        LOG.info("Indexer: finished (" + (tFinal - t0) / 1000.0 + " sec)");
        LOG.info("Indexer: net bytes indexed " + threads.getBytesIndexed());
        LOG.info("Indexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.))
                + " GB/hour plain text");
    }
}
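
Usage

A typical run supplies the collection path, the index path, and a thread count; positions and optimize are optional flags. The sketch below shows one way to invoke the tool programmatically. The class name IndexGov2Demo and all path and thread values are illustrative placeholders, not part of the original source.

import io.anserini.index.IndexGov2;

public class IndexGov2Demo {
    public static void main(String[] args) throws Exception {
        // Hypothetical paths and thread count, for illustration only.
        IndexGov2.main(new String[] {
                "-input", "/path/to/gov2",   // root of the GOV2 collection
                "-index", "/path/to/index",  // where the Lucene index is written
                "-threads", "8",             // number of indexer threads
                "-positions",                // also index term positions
                "-optimize"                  // merge down to a single segment
        });
    }
}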