io.anserini.IndexerCW09B.java Source code

Introduction

Here is the source code for io.anserini.IndexerCW09B.java
Source

package io.anserini;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoDeletionPolicy;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.jsoup.Jsoup;

import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;

/**
 * Multi-threaded Lucene indexer for the ClueWeb09 Category B corpus.
 * <p>
 * Every {@code *.warc.gz} file found under the document directory is handed to a worker task.
 * Each worker strips markup with Jsoup and adds one Lucene document per non-empty WARC
 * {@code response} record to a single {@link IndexWriter} shared by all workers.
 */
public final class IndexerCW09B {

    /** Name of the (unstored) field holding the extracted document text. */
    static final String FIELD_BODY = "contents";
    /** Name of the (stored) field holding the document identifier. */
    static final String FIELD_ID = "id";
    /** WARC header record type of the records that get indexed. */
    private static final String RESPONSE = "response";

    /** Matches the file names (not full paths) of gzipped WARC files. */
    private static final PathMatcher WARC_GZ_MATCHER =
            FileSystems.getDefault().getPathMatcher("glob:*.warc.gz");

    /**
     * Task that indexes one gzipped WARC file into the shared writer.
     * <p>
     * This is a plain {@link Runnable}: it is executed by the pool threads of an
     * {@link ExecutorService} and is never started as a thread of its own.
     */
    private static final class IndexerThread implements Runnable {

        private final Path inputWarcFile;

        private final IndexWriter writer;

        IndexerThread(IndexWriter writer, Path inputWarcFile) {
            this.writer = writer;
            this.inputWarcFile = inputWarcFile;
        }

        /**
         * Reads every record of the WARC file and indexes the non-empty response records.
         *
         * @return number of documents added to the writer
         * @throws IOException if the file cannot be read or a document cannot be added
         */
        private int indexWarcFile() throws IOException {

            int indexed = 0;

            // The raw stream is its own resource so that it is still closed when the
            // GZIPInputStream constructor rejects the file (e.g. it is not gzip data).
            try (java.io.InputStream raw = Files.newInputStream(inputWarcFile, StandardOpenOption.READ);
                 DataInputStream inStream = new DataInputStream(new GZIPInputStream(raw))) {

                ClueWeb09WarcRecord wDoc;
                while ((wDoc = ClueWeb09WarcRecord.readNextWarcRecord(inStream)) != null) {
                    // only response records carry page content
                    if (!RESPONSE.equals(wDoc.getHeaderRecordType())) {
                        continue;
                    }

                    String id = wDoc.getDocid();
                    String contents = Jsoup.parse(wDoc.getContent()).text();

                    // don't index empty documents; report their ids on stderr instead
                    if (contents.trim().isEmpty()) {
                        System.err.println(id);
                        continue;
                    }

                    Document document = new Document();
                    // document ID: stored, not tokenized
                    document.add(new StringField(FIELD_ID, id, Field.Store.YES));
                    // entire document text: tokenized, not stored
                    document.add(new TextField(FIELD_BODY, contents, Field.Store.NO));

                    writer.addDocument(document);
                    indexed++;
                }
            }
            return indexed;
        }

        /**
         * Indexes the file and prints "parentDir/fileName  count" on success. An
         * {@link IOException} is reported on stdout together with the path of the failing file.
         */
        @Override
        public void run() {
            try {
                int addCount = indexWarcFile();

                // A path may have no parent (single name element) and a parent may have no
                // file name (file-system root); print just the file name in those cases.
                Path parent = inputWarcFile.getParent();
                Path parentName = (parent == null) ? null : parent.getFileName();
                String prefix = (parentName == null) ? "" : parentName.toString() + File.separator;

                System.out.println("*./" + prefix + inputWarcFile.getFileName().toString() + "  " + addCount);
            } catch (IOException ioe) {
                // Identify the failing input file; the executing pool thread's name says nothing about it.
                System.out.println(inputWarcFile + ": ERROR: unexpected IOException:");
                ioe.printStackTrace(System.out);
            }
        }
    }

    private final Path indexPath;
    private final Path docDir;

    /**
     * Creates an indexer. Terminates the JVM with exit status 1 when the document directory
     * does not exist, is not readable, or is not a directory.
     *
     * @param docsPath  directory that is searched recursively for {@code *.warc.gz} files
     * @param indexPath directory of the Lucene index; created if it does not exist
     * @throws IOException if the index directory cannot be created
     */
    public IndexerCW09B(String docsPath, String indexPath) throws IOException {

        // Validate the input first so that a bad docsPath does not leave a freshly created,
        // empty index directory behind.
        docDir = Paths.get(docsPath);
        if (!Files.exists(docDir) || !Files.isReadable(docDir) || !Files.isDirectory(docDir)) {
            System.out.println("Document directory '" + docDir.toString()
                    + "' does not exist or is not readable, please check the path");
            System.exit(1);
        }

        this.indexPath = Paths.get(indexPath);
        if (!Files.exists(this.indexPath)) {
            Files.createDirectories(this.indexPath);
        }
    }

    /**
     * Recursively collects all files below {@code p} whose name matches {@code *.warc.gz}.
     * <p>
     * Files or directories that cannot be accessed are reported on stderr and skipped. If the
     * traversal itself fails, the stack trace is printed and the files found so far are returned.
     *
     * @param p root of the directory tree to search
     * @return the matching files, in traversal order; never {@code null}
     */
    static List<Path> discoverWarcFiles(Path p) {

        final List<Path> warcFiles = new ArrayList<>();

        FileVisitor<Path> fv = new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                Path name = file.getFileName();
                if (name != null && WARC_GZ_MATCHER.matches(name)) {
                    warcFiles.add(file);
                }
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {
                // The inherited implementation rethrows, which would abort the whole walk and
                // silently drop every file not visited yet. Skip just the offending entry.
                System.err.println("Skipping inaccessible path '" + file + "': " + exc);
                return FileVisitResult.CONTINUE;
            }
        };

        try {
            Files.walkFileTree(p, fv);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return warcFiles;
    }

    /**
     * KStemAnalyzer: Filters {@link ClassicTokenizer} with {@link org.apache.lucene.analysis.standard.ClassicFilter},
     * {@link org.apache.lucene.analysis.core.LowerCaseFilter} and {@link org.apache.lucene.analysis.en.KStemFilter}.
     *
     * @return KStemAnalyzer
     * @throws IOException if the analysis chain cannot be built
     */
    static Analyzer analyzer() throws IOException {
        return CustomAnalyzer.builder()
                .withTokenizer("classic")
                .addTokenFilter("classic")
                .addTokenFilter("lowercase")
                .addTokenFilter("kstem")
                .build();
    }

    /**
     * Indexes every WARC file under the document directory using a fixed-size thread pool,
     * then commits and closes the index.
     * <p>
     * If the calling thread is interrupted while waiting for the workers, the pool is shut down
     * immediately, the interrupt status is restored, and whatever was indexed so far is committed.
     *
     * @param numThreads number of worker threads; must be positive
     * @return the writer's {@code maxDoc()} once the workers have finished
     * @throws IOException          if the index cannot be opened, committed or closed
     * @throws InterruptedException kept in the signature for source compatibility; interruption
     *                              is handled as described above
     */
    public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

        System.out.println(
                "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

        final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

        iwc.setSimilarity(new BM25Similarity());
        iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        iwc.setRAMBufferSizeMB(256.0);
        iwc.setUseCompoundFile(false);
        iwc.setMergeScheduler(new ConcurrentMergeScheduler());

        // try-with-resources closes the writer first and the directory second, on every exit path.
        try (Directory dir = FSDirectory.open(indexPath);
             IndexWriter writer = new IndexWriter(dir, iwc)) {

            final ExecutorService executor = Executors.newFixedThreadPool(numThreads);
            try {
                for (Path f : discoverWarcFiles(docDir)) {
                    executor.execute(new IndexerThread(writer, f));
                }
            } finally {
                // Stops accepting new tasks; tasks already submitted still run to completion,
                // so no delay is needed before calling this.
                executor.shutdown();
            }

            try {
                // Wait for the submitted tasks to terminate, however long that takes.
                while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
                    // still indexing; keep waiting
                }
            } catch (InterruptedException ie) {
                // Cancel the remaining work and preserve the interrupt status for the caller.
                executor.shutdownNow();
                Thread.currentThread().interrupt();
            }

            int numIndexed = writer.maxDoc();
            writer.commit();
            return numIndexed;
        }
    }

    /**
     * Command-line entry point. Reads {@code -dataDir}, {@code -indexPath} and
     * {@code -threadCount}, runs the indexer and prints the document count and elapsed time.
     *
     * @param args command-line arguments
     * @throws IOException          if indexing fails
     * @throws InterruptedException declared by {@link #indexWithThreads(int)}
     */
    public static void main(String[] args) throws IOException, InterruptedException {

        Args clArgs = new Args(args);

        final String dataDir = clArgs.getString("-dataDir");
        final String indexPath = clArgs.getString("-indexPath");
        final int numThreads = clArgs.getInt("-threadCount");

        clArgs.check();

        // monotonic clock: unaffected by wall-clock adjustments during a long run
        final long startNanos = System.nanoTime();
        IndexerCW09B indexer = new IndexerCW09B(dataDir, indexPath);
        int numIndexed = indexer.indexWithThreads(numThreads);
        long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos);
        System.out.println("Total " + numIndexed + " documents indexed in "
                + DurationFormatUtils.formatDuration(elapsedMillis, "HH:mm:ss"));
    }
}