io.anserini.index.IndexCollection.java Source code

Introduction

Here is the source code for io.anserini.index.IndexCollection.java
Source

/**
 * Anserini: An information retrieval toolkit built on Lucene
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.index;

import io.anserini.collection.Collection;
import io.anserini.document.SourceDocument;
import io.anserini.index.generator.LuceneDocumentGenerator;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.OptionHandlerFilter;
import org.kohsuke.args4j.ParserProperties;
import org.kohsuke.args4j.Option;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

public final class IndexCollection {
    private static final Logger LOG = LogManager.getLogger(IndexCollection.class);

    /**
     * Command-line options for {@link IndexCollection}. The fields are populated by args4j
     * from the {@code @Option} annotations when {@code main} parses the argument array.
     */
    public static final class Args {

        // required arguments

        /**
         * Directory holding the collection. The constructor rejects it unless it exists,
         * is readable and is a directory.
         */
        @Option(name = "-input", metaVar = "[Directory]", required = true, usage = "collection directory")
        public String input;

        /** Location of the index; the constructor creates the directory when it is missing. */
        @Option(name = "-index", metaVar = "[Path]", required = true, usage = "index path")
        public String index;

        /** Size of the fixed thread pool that {@code run()} uses to index file segments. */
        @Option(name = "-threads", metaVar = "[Number]", required = true, usage = "Number of Threads")
        public int threads;

        /** Simple class name, resolved by the constructor inside the {@code io.anserini.collection} package. */
        @Option(name = "-collection", required = true, usage = "collection class in io.anserini.collection")
        public String collectionClass;

        /**
         * Simple class name, resolved by the constructor inside the
         * {@code io.anserini.index.generator} package; each indexing task instantiates it once.
         */
        @Option(name = "-generator", required = true, usage = "document generator in io.anserini.index.generator")
        public String generatorClass;

        // optional arguments

        /** Value handed to {@code IndexWriterConfig.setRAMBufferSizeMB}, i.e. a size in megabytes. */
        @Option(name = "-memorybuffer", usage = "memory buffer size")
        public int memorybufferSize = 2048;

        /** When set, {@code run()} builds the {@code EnglishAnalyzer} with an empty stopword set. */
        @Option(name = "-keepStopwords", usage = "boolean switch to keep stopwords")
        public boolean keepStopwords = false;

        // The four store* switches below are only logged in this file; presumably they are
        // interpreted by the document generator, which receives this object through
        // config(args) -- confirm against the generator implementation.

        /** Logged by the constructor; not otherwise read in this file. */
        @Option(name = "-storePositions", usage = "boolean switch to index storePositions")
        public boolean storePositions = false;

        /** Logged by the constructor; not otherwise read in this file. */
        @Option(name = "-storeDocvectors", usage = "boolean switch to store document vectors")
        public boolean storeDocvectors = false;

        /** Logged by the constructor; not otherwise read in this file. */
        @Option(name = "-storeTransformedDocs", usage = "boolean switch to store transformed document text")
        public boolean storeTransformedDocs = false;

        /** Logged by the constructor; not otherwise read in this file. */
        @Option(name = "-storeRawDocs", usage = "boolean switch to store raw document text")
        public boolean storeRawDocs = false;

        /** When set, {@code run()} calls {@code forceMerge(1)} on the writer after committing. */
        @Option(name = "-optimize", usage = "boolean switch to optimize index (force merge)")
        public boolean optimize = false;
    }

    /**
     * Tallies shared by every indexing task. Each counter is an {@link AtomicLong}, so
     * concurrent tasks can update it without extra locking. The references themselves are
     * {@code final}: replacing a counter object while other threads still hold the old one
     * would silently drop their updates.
     */
    public final class Counters {
        /** Number of documents handed to the index writer; each task adds its per-segment total. */
        public final AtomicLong indexedDocuments = new AtomicLong();

        /**
         * Reported at the end of a run. Not updated in this file; presumably incremented by
         * the document generator, which receives this object via {@code setCounters}.
         */
        public final AtomicLong emptyDocuments = new AtomicLong();

        /**
         * Reported at the end of a run. Not updated in this file; presumably incremented by
         * the document generator, which receives this object via {@code setCounters}.
         */
        public final AtomicLong errors = new AtomicLong();
    }

    private final class IndexerThread extends Thread {
        final private Path inputFile;
        final private IndexWriter writer;
        final private Collection collection;

        private IndexerThread(IndexWriter writer, Collection collection, Path inputFile) throws IOException {
            this.writer = writer;
            this.collection = collection;
            this.inputFile = inputFile;
            setName(inputFile.getFileName().toString());
        }

        @Override
        public void run() {
            try {
                LuceneDocumentGenerator transformer = (LuceneDocumentGenerator) transformerClass.newInstance();
                transformer.config(args);
                transformer.setCounters(counters);

                int cnt = 0;
                Collection.FileSegment iter = collection.createFileSegment(inputFile);
                while (iter.hasNext()) {
                    SourceDocument d = (SourceDocument) iter.next();
                    if (d == null || !d.indexable()) {
                        continue;
                    }

                    @SuppressWarnings("unchecked") // Yes, we know what we're doing here.
                    Document doc = transformer.createDocument(d);

                    if (doc != null) {
                        writer.addDocument(doc);
                        cnt++;
                    }
                }
                iter.close();
                LOG.info(inputFile.getParent().getFileName().toString() + File.separator
                        + inputFile.getFileName().toString() + ": " + cnt + " docs added.");
                counters.indexedDocuments.addAndGet(cnt);
            } catch (Exception e) {
                LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e);
            }
        }
    }

    private final IndexCollection.Args args;
    private final Path indexPath;
    private final Path collectionPath;
    private final Class<?> collectionClass;
    private final Class<?> transformerClass;
    private final Collection collection;
    private final Counters counters;

    /**
     * Prepares an indexing run: logs the settings, validates the collection directory,
     * resolves the collection and generator classes and creates the index directory.
     *
     * <p>All validation happens before the index directory is created, so a bad input path
     * or an unknown class name does not leave an empty index directory behind.
     *
     * @param args parsed command-line options
     * @throws RuntimeException       if the collection directory does not exist, is not
     *                                readable or is not a directory
     * @throws ClassNotFoundException if the collection or generator class cannot be found
     * @throws Exception              if the collection class cannot be instantiated or the
     *                                index directory cannot be created
     */
    public IndexCollection(IndexCollection.Args args) throws Exception {
        this.args = args;

        LOG.info("Collection path: " + args.input);
        LOG.info("Index path: " + args.index);
        LOG.info("Threads: " + args.threads);
        LOG.info("Keep stopwords? " + args.keepStopwords);
        LOG.info("Store positions? " + args.storePositions);
        LOG.info("Store docvectors? " + args.storeDocvectors);
        LOG.info("Store transformed docs? " + args.storeTransformedDocs);
        LOG.info("Store raw docs? " + args.storeRawDocs);
        LOG.info("Optimize (merge segments)? " + args.optimize);

        collectionPath = Paths.get(args.input);
        if (!Files.exists(collectionPath) || !Files.isReadable(collectionPath)
                || !Files.isDirectory(collectionPath)) {
            throw new RuntimeException("Document directory " + collectionPath.toString()
                    + " does not exist or is not readable, please check the path");
        }

        this.transformerClass = Class.forName("io.anserini.index.generator." + args.generatorClass);

        this.collectionClass = Class.forName("io.anserini.collection." + args.collectionClass);
        // Class.newInstance() is deprecated; go through the no-argument constructor instead.
        collection = (Collection) this.collectionClass.getDeclaredConstructor().newInstance();
        collection.setCollectionPath(collectionPath);

        // Touch the filesystem only once every argument has been accepted.
        this.indexPath = Paths.get(args.index);
        if (!Files.exists(this.indexPath)) {
            Files.createDirectories(this.indexPath);
        }

        this.counters = new Counters();
    }

    /**
     * Builds the index: opens a writer in CREATE mode, indexes every file segment of the
     * collection on a fixed-size thread pool, commits, optionally force-merges to a single
     * segment and logs summary statistics.
     *
     * <p>The writer and the directory are always released. When indexing fails, the writer
     * is rolled back rather than closed, so the write lock is released without committing a
     * partial index.
     *
     * @throws IllegalArgumentException if the configured thread count is not positive
     * @throws RuntimeException         if not every submitted segment task completed
     *                                  (for example after an interrupt)
     * @throws IOException              if the index cannot be opened, committed or merged
     * @throws InterruptedException     declared for compatibility; an interrupt while waiting
     *                                  is handled by cancelling the tasks and restoring the
     *                                  interrupt flag
     */
    public void run() throws IOException, InterruptedException {
        final long start = System.nanoTime();
        LOG.info("Starting indexer...");

        final int numThreads = args.threads;
        // Reject a bad thread count before the writer is opened in CREATE mode.
        if (numThreads < 1) {
            throw new IllegalArgumentException("Number of threads must be positive: " + numThreads);
        }

        final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
                : new EnglishAnalyzer();
        final IndexWriterConfig config = new IndexWriterConfig(analyzer);
        config.setSimilarity(new BM25Similarity());
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        config.setRAMBufferSizeMB(args.memorybufferSize);
        config.setUseCompoundFile(false);
        config.setMergeScheduler(new ConcurrentMergeScheduler());

        int numIndexed;
        final Directory dir = FSDirectory.open(indexPath);
        try {
            final IndexWriter writer = new IndexWriter(dir, config);
            boolean succeeded = false;
            try {
                indexSegments(writer, numThreads);

                numIndexed = writer.maxDoc();

                writer.commit();
                if (args.optimize) {
                    writer.forceMerge(1);
                }
                succeeded = true;
            } finally {
                try {
                    if (succeeded) {
                        writer.close();
                    } else {
                        // Discard uncommitted work but still release the write lock.
                        writer.rollback();
                    }
                } catch (IOException e) {
                    // It is possible that this happens... but nothing much we can do at this point,
                    // so just log the error and move on.
                    LOG.error("Error while closing the index writer", e);
                }
            }
        } finally {
            try {
                dir.close();
            } catch (IOException e) {
                LOG.error("Error while closing the index directory", e);
            }
        }

        LOG.info("Indexed documents: " + counters.indexedDocuments.get());
        LOG.info("Empty documents: " + counters.emptyDocuments.get());
        LOG.info("Errors: " + counters.errors.get());

        final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
        LOG.info("Total " + numIndexed + " documents indexed in "
                + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
    }

    /**
     * Submits one indexing task per file segment to a fixed-size pool and waits until all
     * of them have finished, logging progress once a minute.
     *
     * @param writer     shared writer the tasks add documents to
     * @param numThreads size of the thread pool
     * @throws RuntimeException if the number of completed tasks differs from the number of segments
     */
    private void indexSegments(IndexWriter writer, int numThreads) throws IOException {
        final List<Path> segmentPaths = collection.getFileSegmentPaths();
        final int segmentCnt = segmentPaths.size();
        LOG.info(segmentCnt + " files found in " + collectionPath.toString());

        final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
        try {
            for (Path segmentPath : segmentPaths) {
                executor.execute(new IndexerThread(writer, collection, segmentPath));
            }

            executor.shutdown();

            try {
                // Wait for existing tasks to terminate
                while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
                    LOG.info(String.format("%.2f percent completed",
                            (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
                }
            } catch (InterruptedException ie) {
                // (Re-)Cancel if current thread also interrupted
                executor.shutdownNow();
                // Preserve interrupt status
                Thread.currentThread().interrupt();
            }
        } finally {
            // Only reached with a live pool when submitting a task failed; do not leave
            // non-daemon worker threads behind.
            if (!executor.isShutdown()) {
                executor.shutdownNow();
            }
        }

        if (segmentCnt != executor.getCompletedTaskCount()) {
            throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount =  "
                    + executor.getCompletedTaskCount());
        }
    }

    /**
     * Command-line entry point. Parses {@code args} into an {@link Args} instance and runs
     * the indexer. When the arguments cannot be parsed, the parser's error message, the
     * usage text and an example listing the required options are written to standard error
     * and the method returns without indexing.
     *
     * @param args raw command-line arguments
     * @throws Exception if constructing or running the indexer fails
     */
    public static void main(String[] args) throws Exception {
        final IndexCollection.Args options = new IndexCollection.Args();
        final ParserProperties properties = ParserProperties.defaults().withUsageWidth(90);
        final CmdLineParser cli = new CmdLineParser(options, properties);

        try {
            cli.parseArgument(args);
        } catch (CmdLineException badArguments) {
            final String program = IndexCollection.class.getSimpleName();
            System.err.println(badArguments.getMessage());
            cli.printUsage(System.err);
            System.err.println("Example: " + program + cli.printExample(OptionHandlerFilter.REQUIRED));
            return;
        }

        final IndexCollection indexer = new IndexCollection(options);
        indexer.run();
    }
}