com.zimbra.cs.index.LuceneViewer.java Source code

Java tutorial

Introduction

Here is the source code for com.zimbra.cs.index.LuceneViewer.java

Source

/*
 * ***** BEGIN LICENSE BLOCK *****
 * Zimbra Collaboration Suite Server
 * Copyright (C) 2010, 2013, 2014, 2016 Synacor, Inc.
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software Foundation,
 * version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License along with this program.
 * If not, see <https://www.gnu.org/licenses/>.
 * ***** END LICENSE BLOCK *****
 */
package com.zimbra.cs.index;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.CheckIndex.Status;
import org.apache.lucene.store.Directory;

import com.zimbra.common.util.SetUtil;

public class LuceneViewer {

    private String mIndexDir;
    private String mOutputFile;
    private TermFilters mTermFilters;
    private Console mConsole;

    /**
     * If filters are used, intersection of doc nums that appear in all terms
     * that matched any filter. If filters are not used, null.
     */
    private Set<Integer> mDocsIntersection;

    private IndexReader mIndexReader;
    private FileWriter mWriter;

    /**
     * Holds the term filters that were supplied for a dump, in the order they were added.
     */
    private static class TermFilters {

        /**
         * One filter: a term field name and a term text. Either part may be null,
         * in which case that part is not constrained (see wantThisTerm).
         */
        private static class TermFilter {
            private final String mField;
            private final String mText;

            private TermFilter(String field, String text) {
                this.mField = field;
                this.mText = text;
            }
        }

        /** Filters in insertion order. */
        private final List<TermFilter> mFilters = new ArrayList<TermFilter>();

        /** Appends a new filter built from the given field name and text. */
        private void addFilter(String field, String text) {
            TermFilter filter = new TermFilter(field, text);
            mFilters.add(filter);
        }

        /** Returns the live (not copied) list of filters. */
        private List<TermFilter> getFilters() {
            return this.mFilters;
        }

    }

    /**
     * Creates a viewer and opens both the index reader and the output file writer.
     *
     * @param indexName   path of the index directory to read
     * @param outfileName path of the file the dump is written to
     * @param termFilters term filters to apply, may be null
     * @param console     console used for progress and warning messages
     * @throws Exception if the index or the output file cannot be opened
     */
    public LuceneViewer(String indexName, String outfileName, TermFilters termFilters, Console console)
            throws Exception {
        mIndexDir = indexName;
        mOutputFile = outfileName;
        mTermFilters = termFilters;
        mConsole = console;

        mIndexReader = IndexReader.open(LuceneDirectory.open(new File(mIndexDir)));
        try {
            mWriter = new FileWriter(mOutputFile);
        } catch (IOException e) {
            // The reader is already open at this point; release it so a bad output
            // path does not leak the index reader.
            try {
                mIndexReader.close();
            } catch (IOException ignored) {
                // the failure to open the output file is the error worth reporting
            }
            throw e;
        }

        // The intersection set only exists when filters are in use; null means "no filters".
        if (hasFilters()) {
            mDocsIntersection = new HashSet<Integer>();
        }
    }

    /**
     * Returns the configured term filters, or null when no TermFilters object was supplied.
     */
    private List<TermFilters.TermFilter> getFilters() {
        if (mTermFilters == null) {
            return null;
        }
        return mTermFilters.getFilters();
    }

    /**
     * Tells whether at least one term filter is configured.
     *
     * @return false when there is no TermFilters object or its filter list is null or empty
     */
    private boolean hasFilters() {
        if (mTermFilters == null) {
            return false;
        }
        List<TermFilters.TermFilter> configured = mTermFilters.getFilters();
        return configured != null && !configured.isEmpty();
    }

    /**
     * Decides whether a term should be included in the dump.
     *
     * With no filters every term is wanted. Otherwise the term is wanted when it
     * matches at least one filter; a filter matches when each of its non-null parts
     * (field, text) equals the corresponding term part, ignoring case.
     *
     * @param termField field name of the term
     * @param termText  text of the term
     * @return true if the term passes the filters
     */
    private boolean wantThisTerm(String termField, String termText) {
        if (!hasFilters()) {
            return true;
        }

        for (TermFilters.TermFilter termFilter : getFilters()) {
            String field = termFilter.mField;
            String text = termFilter.mText;
            // A null filter part is a wildcard. Use the short-circuit && (not &) so the
            // text comparison is skipped once the field has already failed to match.
            boolean fieldMatches = field == null || field.equalsIgnoreCase(termField);
            if (fieldMatches && (text == null || text.equalsIgnoreCase(termText))) {
                return true;
            }
        }

        return false;
    }

    /**
     * Closes the index reader opened by the constructor.
     *
     * @throws IOException if closing the reader fails
     */
    private void closeIndexReader() throws IOException {
        final IndexReader reader = mIndexReader;
        reader.close();
    }

    /**
     * Writes a section banner: a blank line, the banner text framed by two rule
     * lines, and another blank line.
     *
     * @param bannerText text shown between the rule lines
     * @throws IOException if writing to the output file fails
     */
    private void outputBanner(String bannerText) throws IOException {
        final String rule = "==============================";

        outputLn();
        outputLn(rule);
        outputLn(bannerText);
        outputLn(rule);
        outputLn();
    }

    /**
     * Writes an empty line to the output file.
     *
     * @throws IOException if writing to the output file fails
     */
    private void outputLn() throws IOException {
        final String newline = "\n";
        output(newline);
    }

    /**
     * Writes the given text followed by a newline to the output file.
     *
     * @param out text of the line
     * @throws IOException if writing to the output file fails
     */
    private void outputLn(String out) throws IOException {
        final String line = out + "\n";
        output(line);
    }

    /**
     * Writes the given text to the output file and flushes the writer right away.
     *
     * @param out text to write
     * @throws IOException if writing or flushing fails
     */
    private void output(String out) throws IOException {
        final FileWriter sink = mWriter;
        sink.write(out);
        sink.flush();
    }

    /**
     * Closes the output file writer opened by the constructor.
     *
     * @throws IOException if closing the writer fails
     */
    private void closeOutputWriter() throws IOException {
        final FileWriter sink = mWriter;
        sink.close();
    }

    /**
     * Writes the complete dump to the output file: a header with the index directory,
     * output file and term filters, then the fields, documents and terms sections,
     * the intersection of documents for filtered-in terms, and an "end" banner.
     *
     * The index reader and the output writer are closed when this method returns,
     * whether the dump succeeded or failed.
     *
     * @throws Exception if reading the index or writing the output fails
     */
    private void dump() throws Exception {
        try {
            outputLn("Index directory: " + mIndexDir);
            outputLn("Output file:     " + mOutputFile);
            dumpTermFilters();

            dumpFields();
            dumpDocuments();
            dumpTerms();

            dumpDocsIntersection();

            outputBanner("end");
        } finally {
            // Release both resources even when a section above threw. The nested
            // finally makes sure the writer is closed even if closing the reader fails.
            try {
                closeIndexReader();
            } finally {
                closeOutputWriter();
            }
        }
    }

    /**
     * Writes the list of configured term filters to the output file.
     * Nothing is written when no filters are configured. A null field or text
     * is shown as an empty string.
     *
     * @throws IOException if writing to the output file fails
     */
    private void dumpTermFilters() throws IOException {
        if (hasFilters()) {
            outputLn("Term filters:");
            for (TermFilters.TermFilter filter : getFilters()) {
                String shownField = filter.mField != null ? filter.mField : "";
                String shownText = filter.mText != null ? filter.mText : "";
                outputLn("   (field: " + shownField + ") (text: " + shownText + ")");
            }
        }
    }

    /**
     * Writes the "Fields" section: one indented line per field name reported by the
     * index reader for FieldOption.ALL.
     *
     * @throws IOException if reading the index or writing the output fails
     */
    private void dumpFields() throws IOException {
        outputBanner("Fields");

        Collection<String> allNames = mIndexReader.getFieldNames(IndexReader.FieldOption.ALL);
        for (String name : allNames) {
            String line = "    " + name.toString();
            outputLn(line);
        }
    }

    /**
     * Writes the "Documents" section: the live document count followed by one entry
     * per document number in the index.
     *
     * Document numbers run from 0 up to (but excluding) maxDoc(), while numDocs() only
     * counts documents that are not deleted. The loop therefore iterates to maxDoc();
     * iterating to numDocs() would skip the highest-numbered documents whenever the
     * index contains deletions. Deleted documents are reported as such.
     *
     * @throws IOException if reading the index or writing the output fails
     */
    private void dumpDocuments() throws IOException {
        outputBanner("Documents");

        int totalDocs = mIndexReader.numDocs();
        // upper bound (exclusive) of document numbers, including deleted ones
        int maxDoc = mIndexReader.maxDoc();

        outputLn();
        outputLn("There are " + totalDocs + " documents in this index.");

        mConsole.debug("Total number of documents: " + totalDocs);
        for (int i = 0; i < maxDoc; i++) {
            Document doc = null;
            try {
                doc = mIndexReader.document(i, null);
            } catch (IllegalArgumentException e) {
                // Only the "deleted document" failure is tolerated; doc stays null and
                // dumpDocument() reports the slot as deleted. Anything else is rethrown.
                if ("attempt to access a deleted document".equals(e.getMessage())) {
                    mConsole.warn("encountered exception while dumping document " + i + ": " + e.getMessage());
                } else {
                    throw e;
                }
            }
            dumpDocument(i, doc);

            if ((i + 1) % 100 == 0) {
                mConsole.debug("Dumped " + (i + 1) + " documents");
            }
        }
    }

    /**
     * Writes one document entry: its number, then either "deleted" (when doc is null)
     * or every field returned by the document with all values stored under that
     * field name. Values of the "l.date" field are additionally shown as a parsed
     * date and its epoch milliseconds.
     *
     * @param docNum document number shown in the entry header
     * @param doc    the document, or null if it is deleted
     * @throws IOException if writing to the output file fails
     */
    private void dumpDocument(int docNum, Document doc) throws IOException {

        outputLn();
        outputLn("Document " + docNum);

        if (doc != null) {
            // note: only stored fields will be returned
            for (Fieldable field : doc.getFields()) {
                String name = field.name();
                boolean dateField = "l.date".equals(name);

                outputLn("    Field [" + name + "]: " + field.toString());

                String[] values = doc.getValues(name);
                if (values == null) {
                    continue;
                }
                for (int idx = 0; idx < values.length; idx++) {
                    String value = values[idx];
                    output("         " + "(" + idx + ") " + value);
                    if (dateField) {
                        try {
                            Date parsed = DateTools.stringToDate(value);
                            output(" (" + parsed.toString() + " (" + parsed.getTime() + "))");
                        } catch (java.text.ParseException e) {
                            // an unparseable date only trips when assertions are enabled
                            assert false;
                        }
                    }
                    outputLn();
                }
            }
        } else {
            outputLn("    deleted");
        }
    }

    // keep track of docs that appear in all terms that are filtered in.
    /**
     * Folds the document numbers of one filtered-in term into the running intersection.
     *
     * The first call seeds the intersection with a copy of the given set; every later
     * call keeps only the documents that are also in the given set. An explicit flag
     * marks the seeded state: testing the set for emptiness instead would restart the
     * intersection from the next term as soon as it legitimately became empty.
     *
     * @param docs document numbers in which the current term appears; not modified
     */
    private void computeDocsIntersection(Set<Integer> docs) {
        // sanity check
        if (!hasFilters()) {
            return;
        }

        if (!mDocsIntersectionSeeded) {
            // copy so the running result never aliases the caller's set
            mDocsIntersection = new HashSet<Integer>(docs);
            mDocsIntersectionSeeded = true;
        } else {
            mDocsIntersection.retainAll(docs);
        }
    }

    /** True once the first filtered-in term has seeded mDocsIntersection. */
    private boolean mDocsIntersectionSeeded;

    /**
     * Writes the "Terms" section. For every term that passes the filters it prints the
     * term's ordinal, field and text, followed by one line per document containing the
     * term in the form {@code document, frequency, <positions>}.
     *
     * When filters are in use, the set of documents of each dumped term is passed to
     * computeDocsIntersection(). The term enumeration and each term-positions
     * enumeration are closed even if reading or writing fails part-way.
     *
     * @throws IOException if reading the index or writing the output fails
     */
    private void dumpTerms() throws IOException {
        outputBanner("Terms (in Term.compareTo() order)");

        TermEnum terms = mIndexReader.terms();
        try {
            // ordinal counts every term in the index, including those filtered out
            int order = 0;

            while (terms.next()) {
                order++;
                Term term = terms.term();
                String field = term.field();
                String text = term.text();

                if (!wantThisTerm(field, text)) {
                    continue;
                }

                outputLn(order + " " + field + ": " + text);

                /*
                 * for each term, print the
                 * <document, frequency, <position>* > tuples for a term.
                 *
                 * document:  document in which the Term appears
                 * frequency: number of time the Term appears in the document
                 * position:  position for each appearance in the document
                 *
                 * e.g. doc.add(new Field("field", "one two three two four five", Field.Store.YES, Field.Index.ANALYZED));
                 *      then the tuple for Term("field", "two") in this document would be like:
                 *      88, 2, <2, 4>
                 *      where
                 *      88 is the document number
                 *      2  is the frequency this term appear in the document
                 *      <2, 4> are the positions for each appearance in the document
                 */
                // by TermPositions
                outputLn("    document, frequency, <position>*");

                // keep track of docs that appear in all terms that are filtered in.
                Set<Integer> docNums = null;
                if (hasFilters()) {
                    docNums = new HashSet<Integer>();
                }

                TermPositions termPos = mIndexReader.termPositions(term);
                try {
                    while (termPos.next()) {
                        int docNum = termPos.doc();
                        int freq = termPos.freq();

                        if (docNums != null) {
                            docNums.add(docNum);
                        }

                        output("    " + docNum + ", " + freq + ", <");

                        // positions are separated by a single space
                        for (int f = 0; f < freq; f++) {
                            int positionInDoc = termPos.nextPosition();
                            if (f > 0) {
                                output(" ");
                            }
                            output(positionInDoc + "");
                        }
                        outputLn(">");
                    }
                } finally {
                    termPos.close();
                }

                if (docNums != null) {
                    computeDocsIntersection(docNums);
                }

                outputLn();

                if (order % 1000 == 0) {
                    mConsole.debug("Dumped " + order + " terms");
                }
            }
        } finally {
            terms.close();
        }
    }

    /**
     * Writes the section listing, in ascending order, the document numbers held in
     * mDocsIntersection. Nothing is written when that set is null.
     *
     * @throws IOException if writing to the output file fails
     */
    private void dumpDocsIntersection() throws IOException {
        if (mDocsIntersection != null) {
            outputBanner("Documents in which all (filtered in) terms appear");

            List<Integer> ordered = new ArrayList<Integer>();
            ordered.addAll(mDocsIntersection);
            Collections.sort(ordered);

            for (int idx = 0; idx < ordered.size(); idx++) {
                outputLn("    " + ordered.get(idx));
            }
        }
    }

    /**
     * Command line handling: option definitions, parsing, usage output and
     * extraction of term filters from the parsed command line.
     */
    private static class CLI {

        /** Highest n accepted for the -f[n] / -t[n] term filter options. */
        public static final int NUM_TERM_FILTERS = 10;

        public static final String O_ACTION = "a";
        public static final String O_HELP = "h";
        public static final String O_INPUT = "i";
        public static final String O_OUTPUT = "o";
        public static final String O_VERBOSE = "v";

        private static final String O_TERM_FILTER_FIELD_PREFIX = "f";
        private static final String O_TERM_FILTER_TEXT_PREFIX = "t";

        /** Returns the regular options plus the numbered term filter options. */
        private Options getAllOptions() {
            return getOptions(true);
        }

        /**
         * Builds the option set.
         *
         * @param includeHiddenOptions when true the numbered term filter options are
         *        included; usage() passes false and describes them in free text instead
         */
        private Options getOptions(boolean includeHiddenOptions) {
            Options options = new Options();

            options.addOption(O_ACTION, "action", true, "action, values are dump|check");
            options.addOption(O_HELP, "help", false, "print usage");
            options.addOption(O_INPUT, "input", true, "input directory");
            options.addOption(O_OUTPUT, "output", true, "output file");
            options.addOption(O_VERBOSE, "verbose", false, "verbose mode");

            if (includeHiddenOptions) {
                for (Object option : getTermFilterOptions().getOptions()) {
                    options.addOption((Option) option);
                }
            }

            return options;
        }

        /** Short option name of the i-th term filter field, e.g. "f1". */
        private static String termFilterFieldOption(int i) {
            return O_TERM_FILTER_FIELD_PREFIX + i;
        }

        /** Short option name of the i-th term filter text, e.g. "t1". */
        private static String termFilterTextOption(int i) {
            return O_TERM_FILTER_TEXT_PREFIX + i;
        }

        /** Builds the -f1..-fN and -t1..-tN options, N being NUM_TERM_FILTERS. */
        private Options getTermFilterOptions() {
            Options options = new Options();

            for (int i = 1; i <= NUM_TERM_FILTERS; i++) {
                options.addOption(termFilterFieldOption(i), "field" + i, true, "field name of term filter " + i);
                options.addOption(termFilterTextOption(i), "text" + i, true, "text of term filter " + i);
            }

            return options;
        }

        /** True when the only argument is -h or --help. */
        private boolean helpOptionSpecified(String[] args) {
            return args != null && args.length == 1 && ("-h".equals(args[0]) || "--help".equals(args[0]));
        }

        /** Prints usage without an error message. */
        private void usage(boolean exit) {
            usage(null, exit);
        }

        protected String getCommandUsage() {
            return ("zmjava com.zimbra.cs.index.LuceneViewer <options> [term filter options]");
        }

        /**
         * Prints the optional error message, the option help, the term filter
         * explanation and sample command lines to stderr.
         *
         * @param e    parse error whose message is printed first, or null
         * @param exit when true the JVM exits with status 1 after printing
         */
        private void usage(ParseException e, boolean exit) {
            if (e != null) {
                System.err.println(e.getMessage());
                System.err.println();
                System.err.println();
            }

            PrintWriter pw = new PrintWriter(System.err, true);
            HelpFormatter formatter = new HelpFormatter();
            formatter.setWidth(80);
            formatter.printHelp(pw, formatter.getWidth(), getCommandUsage(), null, getOptions(false),
                    formatter.getLeftPadding(), formatter.getDescPadding(), null);
            pw.flush();

            System.err.println();
            System.err.println("term filter: ");
            System.err.println(
                    "    - each f[n], t[n] pair represents a term filter, f[n], t[n] don't have to both exist");
            System.err.println("    - f[n] represents a term field name, t[n] represents a term text");
            System.err.println("    - the final filter is formed by ORing all term filters together");
            System.err.println("    - maximum of " + NUM_TERM_FILTERS + " term filters are allowed");

            System.err.println();
            System.err.println("    examples:");
            System.err.println("        -f1 l.content -t1 foo" + " (term l.content=foo)");
            System.err.println("        -f2 from" + "              (all terms with from as the field name)");
            System.err.println("        -t3 bar" + "               (all terms with bar as the text)");

            System.err.println();
            System.err.println();
            System.err.println("Sample command lines:");
            System.err.println(
                    "zmjava com.zimbra.cs.index.LuceneViewer -a dump -i /opt/zimbra/index/0/2/index/0 -o /tmp/user1-index-dump.txt");
            System.err.println(
                    "zmjava com.zimbra.cs.index.LuceneViewer -a dump -v -f1 l.content -t1 jay -f2 subject -t2 howdy -i /opt/zimbra/index/0/2/index/0 -o /tmp/user1-index-dump.txt");
            // the text value needs its own -t1 option; without it the address is not a filter
            System.err.println(
                    "zmjava com.zimbra.cs.index.LuceneViewer -a dump -f1 from -t1 jay@test.com -i /opt/zimbra/index/0/2/index/0 -o /tmp/user1-index-dump.txt");

            if (exit) {
                System.exit(1);
            }
        }

        /**
         * Parses the arguments against all options.
         *
         * On a parse error usage is printed and the JVM exits. If parsing succeeds and
         * the help option is present, usage is printed and the JVM exits as well.
         *
         * @param args raw command line arguments
         * @return the parsed command line
         */
        protected CommandLine getCommandLine(String[] args) {
            CommandLineParser clParser = new GnuParser();
            CommandLine cl = null;

            Options opts = getAllOptions();
            try {
                cl = clParser.parse(opts, args);
            } catch (ParseException e) {
                if (helpOptionSpecified(args)) {
                    usage(true);
                } else {
                    usage(e, true);
                }
            }

            // -h parses cleanly (no option is declared required), so a help request
            // has to be honoured here rather than in the catch block above.
            if (cl != null && cl.hasOption(O_HELP)) {
                usage(true);
            }

            return cl;
        }

        /**
         * Collects the term filters given on the command line. A filter is created for
         * every n for which -f[n] or -t[n] is present; the missing half is null.
         *
         * @param cl parsed command line
         * @return the term filters, possibly empty
         */
        static TermFilters getTermFilters(CommandLine cl) {
            TermFilters termFilters = new TermFilters();

            for (int i = 1; i <= NUM_TERM_FILTERS; i++) {
                String fieldOption = termFilterFieldOption(i);
                String textOption = termFilterTextOption(i);
                if (cl.hasOption(fieldOption) || cl.hasOption(textOption)) {
                    termFilters.addFilter(cl.hasOption(fieldOption) ? cl.getOptionValue(fieldOption) : null,
                            cl.hasOption(textOption) ? cl.getOptionValue(textOption) : null);
                }
            }

            return termFilters;
        }
    }

    /**
     * Minimal console reporter. All levels print to standard output; debug messages
     * are printed only in verbose mode.
     */
    private static class Console {
        private final boolean mVerbose;

        Console(boolean verbose) {
            this.mVerbose = verbose;
        }

        /** Prints the text only when verbose mode is on. */
        private void debug(String text) {
            if (!mVerbose) {
                return;
            }
            emit(text);
        }

        /** Always prints the text. */
        private void info(String text) {
            emit(text);
        }

        /** Always prints the text. */
        private void warn(String text) {
            emit(text);
        }

        /** Single place where a line is written to standard output. */
        private void emit(String text) {
            System.out.println(text);
        }
    }

    /**
     * Runs Lucene's CheckIndex on the directory named by the input option and
     * reports whether the index is clean. CheckIndex details go to standard output.
     * If the directory cannot be opened an error is printed and the JVM exits with
     * status 1. The directory is closed once the check has finished.
     *
     * @param cl parsed command line
     * @throws Exception if the check itself fails
     */
    private static void doCheck(CommandLine cl) throws Exception {
        Console console = new Console(cl.hasOption(CLI.O_VERBOSE));

        String indexDir = cl.getOptionValue(CLI.O_INPUT);
        console.info("Checking index " + indexDir);

        Directory dir = null;
        try {
            dir = LuceneDirectory.open(new File(indexDir));
        } catch (Throwable t) {
            console.info("ERROR: could not open directory \"" + indexDir + "\"; exiting");
            t.printStackTrace(System.out);
            System.exit(1);
        }

        try {
            CheckIndex checker = new CheckIndex(dir);
            checker.setInfoStream(System.out);

            Status result = checker.checkIndex();
            console.info("Result:" + (result.clean ? "clean" : "not clean"));
        } finally {
            // release the directory whether or not the check completed
            dir.close();
        }
    }

    /**
     * Dumps the index named by the input option into the file named by the output
     * option, applying any term filters found on the command line.
     *
     * @param cl parsed command line
     * @throws Exception if the viewer cannot be created or the dump fails
     */
    private static void doDump(CommandLine cl) throws Exception {
        boolean verbose = cl.hasOption(CLI.O_VERBOSE);
        Console console = new Console(verbose);

        String indexDir = cl.getOptionValue(CLI.O_INPUT);
        String outputFile = cl.getOptionValue(CLI.O_OUTPUT);
        TermFilters filters = CLI.getTermFilters(cl);

        console.info("Dumping index directory: " + indexDir);
        console.info("Output file: " + outputFile);

        new LuceneViewer(indexDir, outputFile, filters, console).dump();

        console.info("all done");
    }

    /**
     * Entry point. Parses the command line and runs the action named by the action
     * option: "dump" or "check". A missing or unknown action prints usage and exits.
     *
     * @param args command line arguments
     * @throws Exception if the selected action fails
     */
    public static void main(String args[]) throws Exception {

        CLI cli = new CLI();
        CommandLine cl = cli.getCommandLine(args);

        boolean actionGiven = cl.hasOption(CLI.O_ACTION);
        if (!actionGiven) {
            cli.usage(new ParseException("missing required option " + CLI.O_ACTION), true);
        }

        String action = cl.getOptionValue(CLI.O_ACTION);
        if ("check".equals(action)) {
            doCheck(cl);
        } else if ("dump".equals(action)) {
            doDump(cl);
        } else {
            cli.usage(new ParseException("invalid option " + action), true);
        }

    }
}