com.redhat.satellite.search.index.IndexManager.java Source code

Introduction

Here is the source code for com.redhat.satellite.search.index.IndexManager.java
Source

/**
 * Copyright (c) 2008--2012 Red Hat, Inc.
 *
 * This software is licensed to you under the GNU General Public License,
 * version 2 (GPLv2). There is NO WARRANTY for this software, express or
 * implied, including the implied warranties of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. You should have received a copy of GPLv2
 * along with this software; if not, see
 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt.
 *
 * Red Hat trademarks are not licensed under GPLv2. No permission is
 * granted to use or replicate Red Hat trademarks that are incorporated
 * in this software or its documentation.
 */

package com.redhat.satellite.search.index;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.nutch.analysis.AnalyzerFactory;
import org.apache.nutch.searcher.FetchedSegments;
import org.apache.nutch.searcher.HitDetails;
import org.apache.nutch.searcher.Summary;
import org.apache.nutch.util.NutchConfiguration;

import com.redhat.satellite.search.config.Configuration;
import com.redhat.satellite.search.index.builder.BuilderFactory;
import com.redhat.satellite.search.index.ngram.NGramAnalyzer;
import com.redhat.satellite.search.index.ngram.NGramQueryParser;
import com.redhat.satellite.search.rpc.handlers.IndexHandler;

/**
 * Indexing workhorse class
 *
 * @version $Rev$
 */
public class IndexManager {

    private static Logger log = Logger.getLogger(IndexManager.class);
    private String indexWorkDir;
    private int maxHits;
    private double score_threshold;
    private double system_score_threshold;
    private double errata_score_threshold;
    private double errata_advisory_score_threshold;
    private int min_ngram;
    private int max_ngram;
    private boolean filterDocResults = false;
    private boolean explainResults = false;
    private AnalyzerFactory nutchAnalyzerFactory;
    // Name conflict with our Configuration class and Hadoop's
    private org.apache.hadoop.conf.Configuration nutchConf;
    private Map<String, String> docLocaleLookUp = new TreeMap<String, String>(String.CASE_INSENSITIVE_ORDER);
    private Map<String, FetchedSegments> docSegments;

    /**
     * Constructor
     *
     * @param config application config
     */
    public IndexManager(Configuration config) {
        maxHits = config.getInt("search.max_hits_returned", 0);
        indexWorkDir = config.getString("search.index_work_dir", null);
        if (indexWorkDir == null) {
            throw new IllegalArgumentException("search.index_work_dir config entry " + "is missing");
        }
        if (!indexWorkDir.endsWith("/")) {
            indexWorkDir += "/";
        }
        score_threshold = config.getDouble("search.score_threshold", .30);
        system_score_threshold = config.getDouble("search.system_score_threshold", .30);
        errata_score_threshold = config.getDouble("search.errata_score_threshold", .30);
        errata_advisory_score_threshold = config.getDouble("search.errata.advisory_score_threshold", .30);
        min_ngram = config.getInt("search.min_ngram", 1);
        max_ngram = config.getInt("search.max_ngram", 5);
        initDocLocaleLookup();
        filterDocResults = config.getBoolean("search.doc.limit_results");
        explainResults = config.getBoolean("search.log.explain.results");
        initDocSummary();
    }

    /**
     * @return String of the index working directory
     */
    public String getIndexWorkDir() {
        return indexWorkDir;
    }

    /**
     * Query a index
     *
     * @param indexName name of the index
     * @param query search query
     * @param lang language
     * @return list of hits
     * @throws IndexingException if there is a problem indexing the content.
     * @throws QueryParseException
     */
    public List<Result> search(String indexName, String query, String lang)
            throws IndexingException, QueryParseException {
        return search(indexName, query, lang, false);
    }

    /**
     * Query a index
     *
     * @param indexName name of the index
     * @param query search query
     * @param lang language
     * @param isFineGrained
     *      true:   will limit results, less are returned but they are closer
     *              to the search query, useful for advanced/free form queries
     *
     *      false:  will allow queries to be more flexible returning words
     *              which are spelled similarly
     *
     * @return list of hits
     * @throws IndexingException if there is a problem indexing the content.
     * @throws QueryParseException
     */
    public List<Result> search(String indexName, String query, String lang, boolean isFineGrained)
            throws IndexingException, QueryParseException {
        IndexSearcher searcher = null;
        IndexReader reader = null;
        List<Result> retval = null;
        try {
            reader = getIndexReader(indexName, lang);
            searcher = getIndexSearcher(indexName, lang);
            QueryParser qp = getQueryParser(indexName, lang, isFineGrained);
            Query q = qp.parse(query);
            if (log.isDebugEnabled()) {
                log.debug("Original query was: " + query);
                log.debug("Parsed Query is: " + q.toString());
            }
            Hits hits = searcher.search(q);
            if (log.isDebugEnabled()) {
                log.debug(hits.length() + " results were found.");
            }
            Set<Term> queryTerms = null;
            try {
                queryTerms = new HashSet<Term>();
                Query newQ = q.rewrite(reader);
                newQ.extractTerms(queryTerms);
            } catch (Exception e) {
                e.printStackTrace();
                throw new QueryParseException(e);
            }
            retval = processHits(indexName, hits, queryTerms, query, lang);
            if (explainResults) {
                debugExplainResults(indexName, hits, searcher, q, queryTerms);
            }
        } catch (IOException e) {
            // this exception is thrown, when there're no packages or errata on the system
            // and the user performs a search
            // if this is the case, just return 0 results, otherwise rethrow the exception
            if (!e.getMessage().contains(
                    "no segments* file found in org.apache.lucene.store.FSDirectory@/var/lib/rhn/search/indexes")) {
                throw new IndexingException(e);
            }
            log.error(e.getMessage());
            retval = new ArrayList<Result>();
        } catch (ParseException e) {
            throw new QueryParseException("Could not parse query: '" + query + "'");
        } finally {
            try {
                if (searcher != null) {
                    searcher.close();
                }
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException ex) {
                throw new IndexingException(ex);
            }
        }
        return retval;
    }

    /**
     * Create an empty index if it exists
     *
     * @param indexName index to use
     * @param lang language.
     * @throws IndexingException something went wrong adding the document
     */
    public void createIndex(String indexName, String lang) throws IndexingException {

        try {
            IndexWriter writer = getIndexWriter(indexName, lang);
            try {
                writer.flush();
            } finally {
                try {
                    writer.close();
                } finally {
                    // unlock it if it is locked.
                    unlockIndex(indexName);
                }
            }
        } catch (CorruptIndexException e) {
            throw new IndexingException(e);
        } catch (LockObtainFailedException e) {
            throw new IndexingException(e);
        } catch (IOException e) {
            throw new IndexingException(e);
        }
    }

    /**
     * Adds a document to an index
     *
     * @param indexName index to use
     * @param doc Document to be indexed.
     * @param lang language.
     * @throws IndexingException something went wrong adding the document
     */
    public void addToIndex(String indexName, Document doc, String lang) throws IndexingException {

        try {
            IndexWriter writer = getIndexWriter(indexName, lang);
            try {
                writer.addDocument(doc);
                writer.flush();
            } finally {
                try {
                    writer.close();
                } finally {
                    // unlock it if it is locked.
                    unlockIndex(indexName);
                }
            }
        } catch (CorruptIndexException e) {
            throw new IndexingException(e);
        } catch (LockObtainFailedException e) {
            throw new IndexingException(e);
        } catch (IOException e) {
            throw new IndexingException(e);
        }
    }

    /**
     * @param indexName
     * @param doc document with data to index
     * @param uniqueField field in doc which identifies this uniquely
     * @param lang language
     * @throws IndexingException
     */
    public void addUniqueToIndex(String indexName, Document doc, String uniqueField, String lang)
            throws IndexingException {
        IndexReader reader = null;
        int numFound = 0;
        try {
            reader = getIndexReader(indexName, lang);
            Term term = new Term(uniqueField, doc.get(uniqueField));
            numFound = reader.docFreq(term);
        } catch (FileNotFoundException e) {
            // Index doesn't exist, so this add will be unique
            // we don't need to do anything/
        } catch (IOException e) {
            throw new IndexingException(e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    //
                }
            }
        }
        if (numFound > 0) {
            log.info("Found " + numFound + " <" + indexName + " docs for " + uniqueField + ":"
                    + doc.get(uniqueField) + " will remove them now.");
            removeFromIndex(indexName, uniqueField, doc.get(uniqueField));
        }
        addToIndex(indexName, doc, lang);
    }

    /**
     * Remove a document from an index
     *
     * @param indexName index to use
     * @param uniqueField field name which represents this data's unique id
     * @param objectId unique document id
     * @throws IndexingException something went wrong removing the document
     */
    public void removeFromIndex(String indexName, String uniqueField, String objectId) throws IndexingException {
        log.info("Removing <" + indexName + "> " + uniqueField + ":" + objectId);
        Term t = new Term(uniqueField, objectId);
        IndexReader reader;
        try {
            reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);
            try {
                reader.deleteDocuments(t);
                reader.flush();
            } finally {
                if (reader != null) {
                    reader.close();
                }
            }
        } catch (CorruptIndexException e) {
            throw new IndexingException(e);
        } catch (IOException e) {
            throw new IndexingException(e);
        }
    }

    /**
     * Unlocks the index at the given directory if it is currently locked.
     * Otherwise, does nothing.
     * @param indexName index name
     * @throws IOException thrown if there is a problem unlocking the index.
     */
    private void unlockIndex(String indexName) throws IOException {
        String path = indexWorkDir + indexName;
        File f = new File(path);
        Directory dir = FSDirectory.getDirectory(f);
        if (IndexReader.isLocked(dir)) {
            IndexReader.unlock(dir);
        }
    }

    private IndexWriter getIndexWriter(String name, String lang)
            throws CorruptIndexException, LockObtainFailedException, IOException {
        String path = indexWorkDir + name;
        File f = new File(path);
        f.mkdirs();
        Analyzer analyzer = getAnalyzer(name, lang);
        IndexWriter writer = new IndexWriter(path, analyzer);
        writer.setUseCompoundFile(true);
        return writer;
    }

    private IndexReader getIndexReader(String indexName, String locale) throws CorruptIndexException, IOException {
        String path = "";
        if (indexName.compareTo(BuilderFactory.DOCS_TYPE) == 0) {
            path = indexWorkDir + File.separator + getDocIndexPath(locale);
        } else {
            path = indexWorkDir + indexName;
        }
        log.info("IndexManager::getIndexReader(" + indexName + ", " + locale + ") path = " + path);
        File f = new File(path);
        IndexReader retval = IndexReader.open(FSDirectory.getDirectory(f));
        return retval;
    }

    private IndexSearcher getIndexSearcher(String indexName, String locale)
            throws CorruptIndexException, IOException {
        String path = "";
        if (indexName.compareTo(BuilderFactory.DOCS_TYPE) == 0) {
            path = indexWorkDir + File.separator + getDocIndexPath(locale);
        } else {
            path = indexWorkDir + indexName;
        }
        log.info("IndexManager::getIndexSearcher(" + indexName + ", " + locale + ") path = " + path);
        IndexSearcher retval = new IndexSearcher(path);
        return retval;
    }

    private QueryParser getQueryParser(String indexName, String lang, boolean isFineGrained) {
        if (log.isDebugEnabled()) {
            log.debug("getQueryParser(" + indexName + ", " + lang + ", " + isFineGrained + ")");
        }
        QueryParser qp;
        Analyzer analyzer = getAnalyzer(indexName, lang);
        if (indexName.compareTo(BuilderFactory.DOCS_TYPE) == 0) {
            qp = new QueryParser("content", analyzer);
        } else {
            qp = new NGramQueryParser("name", analyzer, isFineGrained);
        }
        qp.setDateResolution(DateTools.Resolution.MINUTE);
        return qp;
    }

    private Analyzer getAnalyzer(String indexName, String lang) {
        if (log.isDebugEnabled()) {
            log.debug("getAnalyzer(" + indexName + ", " + lang + ")");
        }
        if (indexName.compareTo(BuilderFactory.DOCS_TYPE) == 0) {
            return getDocAnalyzer(lang);
        } else if (indexName.compareTo(BuilderFactory.SERVER_TYPE) == 0) {
            return getServerAnalyzer();
        } else if (indexName.compareTo(BuilderFactory.ERRATA_TYPE) == 0) {
            return getErrataAnalyzer();
        } else if (indexName.compareTo(BuilderFactory.SNAPSHOT_TAG_TYPE) == 0) {
            return getSnapshotTagAnalyzer();
        } else if (indexName.compareTo(BuilderFactory.HARDWARE_DEVICE_TYPE) == 0) {
            return getHardwareDeviceAnalyzer();
        } else if (indexName.compareTo(BuilderFactory.SERVER_CUSTOM_INFO_TYPE) == 0) {
            return getServerCustomInfoAnalyzer();
        } else {
            log.debug(indexName + " using getDefaultAnalyzer()");
            return getDefaultAnalyzer();
        }
    }

    private List<Result> processHits(String indexName, Hits hits, Set<Term> queryTerms, String query, String lang)
            throws IOException {
        List<Result> retval = new ArrayList<Result>();
        for (int x = 0; x < hits.length(); x++) {
            Document doc = hits.doc(x);
            Result pr = null;
            if (!isScoreAcceptable(indexName, hits, x, query)) {
                break;
            }
            if (indexName.compareTo(BuilderFactory.DOCS_TYPE) == 0) {
                pr = new DocResult(x, hits.score(x), doc);
                String summary = lookupDocSummary(doc, query, lang);
                if (summary != null) {
                    ((DocResult) pr).setSummary(summary);
                }
            } else if (indexName.compareTo(BuilderFactory.HARDWARE_DEVICE_TYPE) == 0) {
                pr = new HardwareDeviceResult(x, hits.score(x), doc);
            } else if (indexName.compareTo(BuilderFactory.SNAPSHOT_TAG_TYPE) == 0) {
                pr = new SnapshotTagResult(x, hits.score(x), doc);
            } else if (indexName.compareTo(BuilderFactory.SERVER_CUSTOM_INFO_TYPE) == 0) {
                pr = new ServerCustomInfoResult(x, hits.score(x), doc);
            } else if (indexName.compareTo(BuilderFactory.XCCDF_IDENT_TYPE) == 0) {
                pr = new Result(x, doc.getField("id").stringValue(), doc.getField("identifier").stringValue(),
                        hits.score(x));
            } else {
                pr = new Result(x, doc.getField("id").stringValue(), doc.getField("name").stringValue(),
                        hits.score(x));
            }
            if (log.isDebugEnabled()) {
                log.debug("Hit[" + x + "] Score = " + hits.score(x) + ", Result = " + pr);
            }
            /**
             * matchingField will help the webUI to understand what field was responsible
             * for this match.  Later implementation should use "Explanation" to determine
             * field, for now we will simply grab one term and return it's field.
             */
            try {
                MatchingField match = new MatchingField(query, doc, queryTerms);
                pr.setMatchingField(match.getFieldName());
                pr.setMatchingFieldValue(match.getFieldValue());
                log.info("hit[" + x + "] matchingField is being set to: <" + pr.getMatchingField()
                        + "> based on passed in query field.  " + "matchingFieldValue = "
                        + pr.getMatchingFieldValue());
            } catch (Exception e) {
                log.error("Caught exception: ", e);
            }
            if (pr != null) {
                retval.add(pr);
            }
            if (maxHits > 0 && x == maxHits) {
                break;
            }
        }
        return retval;
    }

    /**
     *
     * @param indexName
     * @param hits
     * @param x
     * @param query
     * @return  true - score is acceptable
     *          false - score is NOT acceptable
     * @throws IOException
     */
    private boolean isScoreAcceptable(String indexName, Hits hits, int x, String queryIn) throws IOException {
        String guessMainQueryTerm = MatchingField.getFirstFieldName(queryIn);

        if ((indexName.compareTo(BuilderFactory.DOCS_TYPE) == 0) && (!filterDocResults)) {
            return true;
        }
        /**
         * Dropping matches which are a poor fit.
         * system searches are filtered based on "system_score_threshold"
         * other searches will return 10 best matches, then filter anything below
         * "score_threshold"
         */
        if ((indexName.compareTo(BuilderFactory.SERVER_TYPE) == 0)
                || (indexName.compareTo(BuilderFactory.SERVER_CUSTOM_INFO_TYPE) == 0)
                || (indexName.compareTo(BuilderFactory.SNAPSHOT_TAG_TYPE) == 0)
                || (indexName.compareTo(BuilderFactory.HARDWARE_DEVICE_TYPE) == 0)) {
            if (hits.score(x) < system_score_threshold) {
                if (log.isDebugEnabled()) {
                    log.debug("hits.score(" + x + ") is " + hits.score(x));
                    log.debug("Filtering out search results from " + x + " to " + hits.length()
                            + ", due to their score being below " + "system_score_threshold = "
                            + system_score_threshold);
                }
                return false;
            }
        } else if (indexName.compareTo(BuilderFactory.ERRATA_TYPE) == 0) {
            if (guessMainQueryTerm.compareTo("name") == 0) {
                if (hits.score(x) < errata_advisory_score_threshold) {
                    if (log.isDebugEnabled()) {
                        log.debug("hits.score(" + x + ") is " + hits.score(x));
                        log.debug("Filtering out search results from " + x + " to " + hits.length()
                                + ", due to their score being below " + "errata_advisory_score_threshold = "
                                + errata_advisory_score_threshold);
                    }
                    return false;
                }
            } else {
                if (hits.score(x) < errata_score_threshold) {
                    if (log.isDebugEnabled()) {
                        log.debug("hits.score(" + x + ") is " + hits.score(x));
                        log.debug("Filtering out search results from " + x + " to " + hits.length()
                                + ", due to their score being below " + "errata_score_threshold = "
                                + errata_score_threshold);
                    }
                    return false;
                }
            }
        } else if (((hits.score(x) < score_threshold) && (x > 10)) || (hits.score(x) < 0.001)) {
            /**
             * Dropping matches which are a poor fit.
             * First term is configurable, it allows matches like spelling errors or
             * suggestions to be possible.
             * Second term is intended to get rid of pure and utter crap hits
             */
            if (log.isDebugEnabled()) {
                log.debug("hits.score(" + x + ") is " + hits.score(x));
                log.debug("Filtering out search results from " + x + " to " + hits.length()
                        + ", due to their score being below " + "score_threshold = " + score_threshold);
            }
            return false;
        }
        return true;
    }

    /**
     * Removes any documents which are not related to the passed in Set of good value
     * @param ids Set of ids of all known/good values
     * @param indexName index name to operate on
     * @param uniqField the name of the field in the Document to uniquely identify
     * this record
     * @return the number of documents deleted
     */
    public int deleteRecordsNotInList(Set<String> ids, String indexName, String uniqField) {
        int count = 0;
        IndexReader reader = null;
        try {
            reader = getIndexReader(indexName, IndexHandler.DEFAULT_LANG);

            // Use maxDoc() to iterate over all docs, numDocs() returns the
            // number of currently alive docs leaving out the deleted ones.
            int maxDoc = reader.maxDoc();
            for (int i = 0; i < maxDoc; i++) {
                if (!reader.isDeleted(i)) {
                    Document doc = reader.document(i);
                    String uniqId = doc.getField(uniqField).stringValue();
                    if (!ids.contains(uniqId)) {
                        log.info(indexName + ":" + uniqField + ":  <" + uniqId
                                + "> not found in list of current/good values "
                                + "assuming this has been deleted from Database and we " + "should remove it.");
                        removeFromIndex(indexName, uniqField, uniqId);
                        count++;
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
            log.info("deleteRecordsNotInList() caught exception : " + e);
        } catch (IndexingException e) {
            e.printStackTrace();
            log.info("deleteRecordsNotInList() caught exception : " + e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    //
                }
            }
        }
        return count;
    }

    private void debugExplainResults(String indexName, Hits hits, IndexSearcher searcher, Query q,
            Set<Term> queryTerms) throws IOException {
        log.debug("Parsed Query is " + q.toString());
        log.debug("Looking at index:  " + indexName);
        for (int i = 0; i < hits.length(); i++) {
            if ((i < 10)) {
                Document doc = hits.doc(i);
                Float score = hits.score(i);
                Explanation ex = searcher.explain(q, hits.id(i));
                log.debug("Looking at hit<" + i + ", " + hits.id(i) + ", " + score + ">: " + doc);
                log.debug("Explanation: " + ex);
                MatchingField match = new MatchingField(q.toString(), doc, queryTerms);
                String fieldName = match.getFieldName();
                String fieldValue = match.getFieldValue();
                log.debug("Guessing that matched fieldName is " + fieldName + " = " + fieldValue);
            }
        }
    }

    private String getDocIndexPath(String lang) throws IOException {
        String l = lookupLocale(lang);
        if (!StringUtils.isBlank(l)) {
            return BuilderFactory.DOCS_TYPE + File.separator + l;
        }
        log.error("Unable to find docs index dir for language " + lang);
        throw new IOException("Unable to find docs index dir for language: " + lang);
    }

    private String lookupLocale(String lang) {
        String ret = docLocaleLookUp.get(lang.toLowerCase());
        if (StringUtils.isBlank(ret)) {
            Locale l = new Locale(lang);
            ret = docLocaleLookUp.get(l.getLanguage().toLowerCase());
        }
        return ret;
    }

    private Analyzer getDocAnalyzer(String lang) {
        /**
         * We want to use the same Analyzer nutch is using when the indexes are
         * generated
         * */
        Analyzer analyzer = null;
        try {
            analyzer = nutchAnalyzerFactory.get(lang);
        } catch (Exception e) {
            log.info("Caught exception, nutch is most likely not installed");
            log.info("Defaulting to generic analyzer for Documentation Search");
            log.info("Install nutch package to get summary info and better matches.");
            analyzer = new StandardAnalyzer();
        }
        log.info("Language choice is " + lang + ", analyzer chosen is " + analyzer);
        return analyzer;
    }

    private Analyzer getServerAnalyzer() {
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new NGramAnalyzer(min_ngram, max_ngram));
        analyzer.addAnalyzer("checkin", new KeywordAnalyzer());
        analyzer.addAnalyzer("registered", new KeywordAnalyzer());
        analyzer.addAnalyzer("ram", new KeywordAnalyzer());
        analyzer.addAnalyzer("swap", new KeywordAnalyzer());
        analyzer.addAnalyzer("cpuMHz", new KeywordAnalyzer());
        analyzer.addAnalyzer("cpuNumberOfCpus", new KeywordAnalyzer());

        return analyzer;
    }

    private Analyzer getErrataAnalyzer() {
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new NGramAnalyzer(min_ngram, max_ngram));
        analyzer.addAnalyzer("advisoryName", new KeywordAnalyzer());
        analyzer.addAnalyzer("synopsis", new StandardAnalyzer());
        analyzer.addAnalyzer("description", new StandardAnalyzer());
        analyzer.addAnalyzer("topic", new StandardAnalyzer());
        analyzer.addAnalyzer("solution", new StandardAnalyzer());

        return analyzer;
    }

    private Analyzer getSnapshotTagAnalyzer() {
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new NGramAnalyzer(min_ngram, max_ngram));
        analyzer.addAnalyzer("id", new KeywordAnalyzer());
        analyzer.addAnalyzer("snapshotId", new KeywordAnalyzer());
        analyzer.addAnalyzer("orgId", new KeywordAnalyzer());
        analyzer.addAnalyzer("serverId", new KeywordAnalyzer());
        analyzer.addAnalyzer("tagNameId", new KeywordAnalyzer());
        analyzer.addAnalyzer("created", new KeywordAnalyzer());
        analyzer.addAnalyzer("modified", new KeywordAnalyzer());
        return analyzer;
    }

    private Analyzer getHardwareDeviceAnalyzer() {
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new NGramAnalyzer(min_ngram, max_ngram));
        analyzer.addAnalyzer("id", new KeywordAnalyzer());
        analyzer.addAnalyzer("serverId", new KeywordAnalyzer());
        analyzer.addAnalyzer("pciType", new KeywordAnalyzer());
        return analyzer;
    }

    private Analyzer getServerCustomInfoAnalyzer() {
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new NGramAnalyzer(min_ngram, max_ngram));
        analyzer.addAnalyzer("id", new KeywordAnalyzer());
        analyzer.addAnalyzer("serverId", new KeywordAnalyzer());
        analyzer.addAnalyzer("created", new KeywordAnalyzer());
        analyzer.addAnalyzer("modified", new KeywordAnalyzer());
        analyzer.addAnalyzer("createdBy", new KeywordAnalyzer());
        analyzer.addAnalyzer("lastModifiedBy", new KeywordAnalyzer());
        return analyzer;
    }

    private Analyzer getDefaultAnalyzer() {
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new NGramAnalyzer(min_ngram, max_ngram));
        analyzer.addAnalyzer("id", new KeywordAnalyzer());
        analyzer.addAnalyzer("arch", new KeywordAnalyzer());
        analyzer.addAnalyzer("epoch", new KeywordAnalyzer());
        analyzer.addAnalyzer("version", new KeywordAnalyzer());
        analyzer.addAnalyzer("release", new KeywordAnalyzer());
        analyzer.addAnalyzer("filename", new KeywordAnalyzer());
        return analyzer;
    }

    private boolean initDocSummary() {
        /**
         * NOTE:  NutchConfiguration is expecting "nutch-default.xml" and "nutch-site.xml"
         * to be available in the CLASSPATH
         */
        try {
            nutchConf = NutchConfiguration.create();
            nutchAnalyzerFactory = new AnalyzerFactory(nutchConf);
            FileSystem fs = FileSystem.get(nutchConf);
            docSegments = new TreeMap<String, FetchedSegments>(String.CASE_INSENSITIVE_ORDER);
            for (String key : docLocaleLookUp.keySet()) {
                String segmentsDir = indexWorkDir + File.separator + getDocIndexPath(key) + File.separator
                        + "segments";
                FetchedSegments segments = new FetchedSegments(fs, segmentsDir, nutchConf);
                if (segments == null) {
                    log.info("Unable to create docSegments for language: " + key);
                    docSegments.put(key, null);
                }
                String[] segNames = segments.getSegmentNames();
                if (segNames == null || segNames.length == 0) {
                    log.info("Unable to find any segments for language: " + key);
                    docSegments.put(key, null);
                }
                log.info("Adding Documentation segments for language: " + key);
                docSegments.put(key, segments);
            }
        } catch (Exception e) {
            log.error("ignoring exception - most likely Nutch isn't present, so" + " doc summaries will be empty");
            e.printStackTrace();
        }
        return true;
    }

    private String lookupDocSummary(Document doc, String queryString, String lang) {
        if (docSegments == null) {
            log.info("docSegments is null, doc summary not possible");
            log.info("nutch is probably not installed, install nutch to get summary info");
            return "";
        }
        if (!docSegments.containsKey(lang)) {
            log.info("Couldn't find segments info for " + lang);
            log.info("Summary info will be missing for " + lang);
            return "";
        }
        FetchedSegments segments = docSegments.get(lang);
        if (segments == null) {
            log.info("Segments info for " + lang + " is null");
            return "";
        }
        try {
            if (log.isDebugEnabled()) {
                log.debug("Attempting lookupDocSummary<" + lang + "> for " + doc);
            }
            HitDetails hd = new HitDetails(doc.getField("segment").stringValue(),
                    doc.getField("url").stringValue());
            // NOTE: Name conflict with Nutch's Query versus Lucene Query
            org.apache.nutch.searcher.Query query = org.apache.nutch.searcher.Query.parse(queryString, nutchConf);
            Summary sum = segments.getSummary(hd, query);
            if (log.isDebugEnabled()) {
                log.debug("Will return summary<" + lang + "> = " + sum.toString());
            }
            return sum.toString();
        } catch (Exception e) {
            log.info("Failed to lookupDocSummary<" + lang + ">, caught Exception: " + e);
            e.printStackTrace();
        }
        return "";
    }

    private void initDocLocaleLookup() {
        docLocaleLookUp.put("bn", "bn-IN");
        docLocaleLookUp.put("bn_in", "bn-IN");
        docLocaleLookUp.put("de", "de-DE");
        docLocaleLookUp.put("en_us", "en-US");
        docLocaleLookUp.put("en", "en-US");
        docLocaleLookUp.put("es", "es-ES");
        docLocaleLookUp.put("fr", "fr-FR");
        docLocaleLookUp.put("gu", "gu-IN");
        docLocaleLookUp.put("hi", "hi-IN");
        docLocaleLookUp.put("it", "it-IT");
        docLocaleLookUp.put("ja", "ja-JP");
        docLocaleLookUp.put("ko", "ko-KR");
        docLocaleLookUp.put("pa", "pa-IN");
        docLocaleLookUp.put("pt_br", "pt-BR");
        docLocaleLookUp.put("pt", "pt-BR");
        docLocaleLookUp.put("pt_pt", "pt-BR");
        docLocaleLookUp.put("ru", "ru-RU");
        docLocaleLookUp.put("ta", "ta-IN");
        docLocaleLookUp.put("zh", "zh-CN");
        docLocaleLookUp.put("zh_cn", "zh-CN");
        docLocaleLookUp.put("zh_tw", "zh-TW");
        // Below exist in docs, but weren't available as a doc option from
        // satellite webui and they weren't available Locales on my machine
        // guessing at what they will look at.
        docLocaleLookUp.put("as", "as-IN");
        docLocaleLookUp.put("ml", "ml-IN");
        docLocaleLookUp.put("mr", "mr-IN");
        docLocaleLookUp.put("or", "or-IN");
        docLocaleLookUp.put("kn", "kn-IN");
        docLocaleLookUp.put("si_lk", "si-LK");
        docLocaleLookUp.put("te", "te-IN");
    }

}