io.anserini.index.IndexUtils.java Source code

Introduction

Here is the source code for io.anserini.index.IndexUtils.java
Source

/**
 * Anserini: An information retrieval toolkit built on Lucene
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.index;

import io.anserini.index.generator.LuceneDocumentGenerator;
import io.anserini.index.generator.TweetGenerator;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;
import org.kohsuke.args4j.Option;
import org.kohsuke.args4j.ParserProperties;
import edu.stanford.nlp.simple.Sentence;
import org.jsoup.Jsoup;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import static io.anserini.search.SearchCollection.BREAK_SCORE_TIES_BY_DOCID;
import static io.anserini.search.SearchCollection.BREAK_SCORE_TIES_BY_TWEETID;
import static java.util.stream.Collectors.joining;

public class IndexUtils {
    private static final Logger LOG = LogManager.getLogger(IndexUtils.class);

    enum Compression {
        NONE, GZ, BZ2, ZIP
    }

    enum DocVectorWeight {
        NONE, TF_IDF
    }

    public static final class Args {
        @Option(name = "-index", metaVar = "[Path]", required = true, usage = "index path")
        String index;

        @Option(name = "-stats", usage = "print index statistics")
        boolean stats;

        @Option(name = "-printTermInfo", metaVar = "term", usage = "prints term info (stemmed, total counts, doc counts, etc.)")
        String term;

        @Option(name = "-dumpDocVector", metaVar = "docid", usage = "prints the document vector of a document")
        String docvectorDocid;

        @Option(name = "-dumpDocVectors", metaVar = "[Path]", usage = "dumps the document vector for all documents from input file")
        String docVectors;

        @Option(name = "-docVectorWeight", metaVar = "[str]", usage = "the weight for dumped document vector(s), NONE or TF_IDF")
        DocVectorWeight docVectorWeight;

        @Option(name = "-dumpAllDocids", usage = "dumps all docids in sorted order. For non-tweet collection the order is "
                + "in ascending of String docid; For tweets collection the order is in descending of Long tweet id"
                + "please provide the compression scheme for the output")
        Compression dumpAllDocids;

        @Option(name = "-dumpRawDoc", metaVar = "docid", usage = "dumps raw document (if stored in the index)")
        String rawDoc;

        @Option(name = "-dumpRawDocs", metaVar = "[Path]", usage = "dumps raw documents from the input file")
        String rawDocs;

        @Option(name = "-dumpRawDocsWithDocid", metaVar = "[Path]", usage = "By default there is no <DOCNO>docid<DOCNO> "
                + "stored in the raw docs. By prepending <DOCNO>docid<DOCNO> in front of the raw docs we can directly index them")
        String rawDocsWithDocid;

        @Option(name = "-dumpTransformedDoc", metaVar = "docid", usage = "dumps transformed document (if stored in the index)")
        String transformedDoc;

        @Option(name = "-dumpSentences", metaVar = "docid", usage = "splits the fetched document into sentences (if stored in the index)")
        String sentDoc;

        @Option(name = "-convertDocidToLuceneDocid", metaVar = "docid", usage = "converts a collection lookupDocid to a Lucene internal lookupDocid")
        String lookupDocid;

        @Option(name = "-convertLuceneDocidToDocid", metaVar = "docid", usage = "converts to a Lucene internal lookupDocid to a collection lookupDocid ")
        int lookupLuceneDocid;
    }

    public class NotStoredException extends Exception {
        public NotStoredException(String message) {
            super(message);
        }
    }

    private final FSDirectory directory;
    private final DirectoryReader reader;

    public IndexUtils(String indexPath) throws IOException {
        this.directory = FSDirectory.open(new File(indexPath).toPath());
        this.reader = DirectoryReader.open(directory);
    }

    public InputStream getReadFileStream(String path) throws IOException {
        InputStream fin = Files.newInputStream(Paths.get(path), StandardOpenOption.READ);
        BufferedInputStream in = new BufferedInputStream(fin);
        if (path.endsWith(".bz2")) {
            BZip2CompressorInputStream bzIn = new BZip2CompressorInputStream(in);
            return bzIn;
        } else if (path.endsWith(".gz")) {
            GzipCompressorInputStream gzIn = new GzipCompressorInputStream(in);
            return gzIn;
        } else if (path.endsWith(".zip")) {
            GzipCompressorInputStream zipIn = new GzipCompressorInputStream(in);
            return zipIn;
        }
        return in;
    }

    void printIndexStats() throws IOException {
        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms(LuceneDocumentGenerator.FIELD_BODY);

        System.out.println("Index statistics");
        System.out.println("----------------");
        System.out.println("documents:             " + reader.numDocs());
        System.out.println("documents (non-empty): " + reader.getDocCount(LuceneDocumentGenerator.FIELD_BODY));
        System.out.println("unique terms:          " + terms.size());
        System.out.println(
                "total terms:           " + reader.getSumTotalTermFreq(LuceneDocumentGenerator.FIELD_BODY));

        System.out.println("stored fields:");

        FieldInfos fieldInfos = MultiFields.getMergedFieldInfos(reader);
        for (String fd : fields) {
            FieldInfo fi = fieldInfos.fieldInfo(fd);
            System.out.println("  " + fd + " (" + "indexOption: " + fi.getIndexOptions() + ", hasVectors: "
                    + fi.hasVectors() + ")");
        }
    }

    public void printTermCounts(String termStr) throws IOException, ParseException {
        EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
        QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
        TermQuery q = (TermQuery) qp.parse(termStr);
        Term t = q.getTerm();

        System.out.println("raw term:             " + termStr);
        System.out.println("stemmed term:         " + q.toString(LuceneDocumentGenerator.FIELD_BODY));
        System.out.println("collection frequency: " + reader.totalTermFreq(t));
        System.out.println("document frequency:   " + reader.docFreq(t));

        PostingsEnum postingsEnum = MultiFields.getTermDocsEnum(reader, LuceneDocumentGenerator.FIELD_BODY,
                t.bytes());
        System.out.println("postings:\n");
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.printf("\t%s, %s\n", postingsEnum.docID(), postingsEnum.freq());
        }
    }

    public void printDocumentVector(String docid) throws IOException, NotStoredException {
        Terms terms = reader.getTermVector(convertDocidToLuceneDocid(docid), LuceneDocumentGenerator.FIELD_BODY);
        if (terms == null) {
            throw new NotStoredException("Document vector not stored!");
        }
        TermsEnum te = terms.iterator();
        if (te == null) {
            throw new NotStoredException("Document vector not stored!");
        }
        while ((te.next()) != null) {
            System.out.println(te.term().utf8ToString() + " " + te.totalTermFreq());
        }
    }

    public void dumpDocumentVectors(String reqDocidsPath, DocVectorWeight weight) throws IOException {
        String outFileName = weight == null ? reqDocidsPath + ".docvector.tar.gz"
                : reqDocidsPath + ".docvector." + weight + ".tar.gz";
        LOG.info("Start dump document vectors with weight " + weight);

        InputStream in = getReadFileStream(reqDocidsPath);
        BufferedReader bRdr = new BufferedReader(new InputStreamReader(in));
        FileOutputStream fOut = new FileOutputStream(new File(outFileName));
        BufferedOutputStream bOut = new BufferedOutputStream(fOut);
        GzipCompressorOutputStream gzOut = new GzipCompressorOutputStream(bOut);
        TarArchiveOutputStream tOut = new TarArchiveOutputStream(gzOut);

        Map<Term, Integer> docFreqMap = new HashMap<>();

        int numNonEmptyDocs = reader.getDocCount(LuceneDocumentGenerator.FIELD_BODY);

        String docid;
        int counter = 0;
        while ((docid = bRdr.readLine()) != null) {
            counter++;

            // get term frequency
            Terms terms = reader.getTermVector(convertDocidToLuceneDocid(docid),
                    LuceneDocumentGenerator.FIELD_BODY);
            if (terms == null) {
                // We do not throw exception here because there are some
                //  collections in which part of documents don't have document vectors
                LOG.warn("Document vector not stored for doc " + docid);
                continue;
            }

            TermsEnum te = terms.iterator();
            if (te == null) {
                LOG.warn("Document vector not stored for doc " + docid);
                continue;
            }

            Term term;
            long freq;

            // iterate every term and write and store in Map
            Map<String, String> docVectors = new HashMap<>();
            while ((te.next()) != null) {
                term = new Term(LuceneDocumentGenerator.FIELD_BODY, te.term());
                freq = te.totalTermFreq();

                switch (weight) {
                case NONE:
                    docVectors.put(term.bytes().utf8ToString(), String.valueOf(freq));
                    break;

                case TF_IDF:
                    int docFreq;
                    if (docFreqMap.containsKey(term)) {
                        docFreq = docFreqMap.get(term);
                    } else {
                        try {
                            docFreq = reader.docFreq(term);
                        } catch (Exception e) {
                            LOG.error("Cannot find term " + term.toString() + " in indexing file.");
                            continue;
                        }
                        docFreqMap.put(term, docFreq);
                    }
                    float tfIdf = (float) (freq * Math.log(numNonEmptyDocs * 1.0 / docFreq));
                    docVectors.put(term.bytes().utf8ToString(), String.format("%.6f", tfIdf));
                    break;
                }
            }

            // Count size and write
            byte[] bytesOut = docVectors.entrySet().stream().map(e -> e.getKey() + " " + e.getValue())
                    .collect(joining("\n")).getBytes(StandardCharsets.UTF_8);

            TarArchiveEntry tarEntry = new TarArchiveEntry(new File(docid));
            tarEntry.setSize(bytesOut.length + String.format("<DOCNO>%s</DOCNO>\n", docid).length());
            tOut.putArchiveEntry(tarEntry);
            tOut.write(String.format("<DOCNO>%s</DOCNO>\n", docid).getBytes());
            tOut.write(bytesOut);
            tOut.closeArchiveEntry();

            if (counter % 100000 == 0) {
                LOG.info(counter + " files have been dumped.");
            }
        }
        tOut.close();
        LOG.info("Document Vectors are output to: " + outFileName);
    }

    public void getAllDocids(Compression compression) throws IOException {
        Query q = new FieldValueQuery(LuceneDocumentGenerator.FIELD_ID);
        IndexSearcher searcher = new IndexSearcher(reader);
        ScoreDoc[] scoreDocs;
        try {
            scoreDocs = searcher.search(new FieldValueQuery(LuceneDocumentGenerator.FIELD_ID), reader.maxDoc(),
                    BREAK_SCORE_TIES_BY_DOCID).scoreDocs;
        } catch (IllegalStateException e) { // because this is tweets collection
            scoreDocs = searcher.search(new FieldValueQuery(TweetGenerator.StatusField.ID_LONG.name),
                    reader.maxDoc(), BREAK_SCORE_TIES_BY_TWEETID).scoreDocs;
        }

        String basePath = directory.getDirectory().getFileName().toString() + ".allDocids";
        OutputStream outStream = null;
        String outputPath = "";
        switch (compression) {
        case NONE:
            outputPath = basePath + ".txt";
            outStream = Files.newOutputStream(Paths.get(outputPath));
            break;
        case GZ:
            outputPath = basePath + ".gz";
            outStream = new GzipCompressorOutputStream(
                    new BufferedOutputStream(Files.newOutputStream(Paths.get(outputPath))));
            break;
        case ZIP:
            outputPath = basePath + ".zip";
            outStream = new ZipOutputStream(new BufferedOutputStream(Files.newOutputStream(Paths.get(outputPath))));
            ((ZipOutputStream) outStream).putNextEntry(new ZipEntry(basePath));
            break;
        case BZ2:
            outputPath = basePath + ".bz2";
            outStream = new BZip2CompressorOutputStream(
                    new BufferedOutputStream(Files.newOutputStream(Paths.get(outputPath))));
            break;
        }
        for (int i = 0; i < scoreDocs.length; i++) {
            StringBuilder builder = new StringBuilder();
            builder.append(searcher.doc(scoreDocs[i].doc).getField(LuceneDocumentGenerator.FIELD_ID).stringValue())
                    .append("\n");
            outStream.write(builder.toString().getBytes(StandardCharsets.UTF_8));
        }
        outStream.close();
        System.out.println(String.format("All Documents IDs are output to: %s", outputPath));
    }

    public String getRawDocument(String docid) throws IOException, NotStoredException {
        Document d = reader.document(convertDocidToLuceneDocid(docid));
        IndexableField doc = d.getField(LuceneDocumentGenerator.FIELD_RAW);
        if (doc == null) {
            throw new NotStoredException("Raw documents not stored!");
        }
        return doc.stringValue();
    }

    public void dumpRawDocuments(String reqDocidsPath, boolean prependDocid)
            throws IOException, NotStoredException {
        LOG.info("Start dump raw documents" + (prependDocid ? " with Docid prepended" : "."));

        InputStream in = getReadFileStream(reqDocidsPath);
        BufferedReader bRdr = new BufferedReader(new InputStreamReader(in));
        FileOutputStream fOut = new FileOutputStream(new File(reqDocidsPath + ".output.tar.gz"));
        BufferedOutputStream bOut = new BufferedOutputStream(fOut);
        GzipCompressorOutputStream gzOut = new GzipCompressorOutputStream(bOut);
        TarArchiveOutputStream tOut = new TarArchiveOutputStream(gzOut);

        String docid;
        int counter = 0;
        while ((docid = bRdr.readLine()) != null) {
            counter += 1;
            Document d = reader.document(convertDocidToLuceneDocid(docid));
            IndexableField doc = d.getField(LuceneDocumentGenerator.FIELD_RAW);
            if (doc == null) {
                throw new NotStoredException("Raw documents not stored!");
            }
            TarArchiveEntry tarEntry = new TarArchiveEntry(new File(docid));

            byte[] bytesOut = doc.stringValue().getBytes(StandardCharsets.UTF_8);
            tarEntry.setSize(
                    bytesOut.length + (prependDocid ? String.format("<DOCNO>%s</DOCNO>\n", docid).length() : 0));
            tOut.putArchiveEntry(tarEntry);
            if (prependDocid) {
                tOut.write(String.format("<DOCNO>%s</DOCNO>\n", docid).getBytes());
            }
            tOut.write(bytesOut);
            tOut.closeArchiveEntry();

            if (counter % 100000 == 0) {
                LOG.info(counter + " files have been dumped.");
            }
        }
        tOut.close();
        LOG.info(String.format("Raw documents are output to: %s", reqDocidsPath + ".output.tar.gz"));
    }

    public String getTransformedDocument(String docid) throws IOException, NotStoredException {
        Document d = reader.document(convertDocidToLuceneDocid(docid));
        IndexableField doc = d.getField(LuceneDocumentGenerator.FIELD_BODY);
        if (doc == null) {
            throw new NotStoredException("Transformed documents not stored!");
        }
        return doc.stringValue();
    }

    public List<Sentence> getSentDocument(String docid) throws IOException, NotStoredException {
        String toSplit;
        try {
            toSplit = getTransformedDocument(docid);
        } catch (NotStoredException e) {
            String rawDoc = getRawDocument(docid);
            org.jsoup.nodes.Document jDoc = Jsoup.parse(rawDoc);
            toSplit = jDoc.text();
        }
        edu.stanford.nlp.simple.Document doc = new edu.stanford.nlp.simple.Document(toSplit);
        return doc.sentences();
    }

    public int convertDocidToLuceneDocid(String docid) throws IOException {
        IndexSearcher searcher = new IndexSearcher(reader);

        Query q = new TermQuery(new Term(LuceneDocumentGenerator.FIELD_ID, docid));
        TopDocs rs = searcher.search(q, 1);
        ScoreDoc[] hits = rs.scoreDocs;

        if (hits == null) {
            throw new RuntimeException("Docid not found!");
        }

        return hits[0].doc;
    }

    public String convertLuceneDocidToDocid(int docid) throws IOException {
        Document d = reader.document(docid);
        IndexableField doc = d.getField(LuceneDocumentGenerator.FIELD_ID);
        if (doc == null) {
            // Really shouldn't happen!
            throw new RuntimeException();
        }
        return doc.stringValue();
    }

    public static void main(String[] argv) throws Exception {
        Args args = new Args();
        CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(90));
        try {
            parser.parseArgument(argv);
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            parser.printUsage(System.err);
            return;
        }

        final IndexUtils util = new IndexUtils(args.index);

        if (args.stats) {
            util.printIndexStats();
        }

        if (args.term != null) {
            util.printTermCounts(args.term);
        }

        if (args.docvectorDocid != null) {
            util.printDocumentVector(args.docvectorDocid);
        }

        if (args.docVectors != null) {
            if (args.docVectorWeight == null) {
                args.docVectorWeight = DocVectorWeight.NONE;
            }
            util.dumpDocumentVectors(args.docVectors, args.docVectorWeight);
        }

        if (args.dumpAllDocids != null) {
            util.getAllDocids(args.dumpAllDocids);
        }

        if (args.rawDoc != null) {
            System.out.println(util.getRawDocument(args.rawDoc));
        }

        if (args.rawDocs != null) {
            util.dumpRawDocuments(args.rawDocs, false);
        }

        if (args.rawDocsWithDocid != null) {
            util.dumpRawDocuments(args.rawDocs, true);
        }

        if (args.transformedDoc != null) {
            System.out.println(util.getTransformedDocument(args.transformedDoc));
        }

        if (args.sentDoc != null) {
            for (Sentence sent : util.getSentDocument(args.sentDoc)) {
                System.out.println(sent);
            }
        }

        if (args.lookupDocid != null) {
            System.out.println(util.convertDocidToLuceneDocid(args.lookupDocid));
        }

        if (args.lookupLuceneDocid > 0) {
            System.out.println(util.convertLuceneDocidToDocid(args.lookupLuceneDocid));
        }
    }
}