dk.defxws.fgslucene.Statement.java Source code

Java tutorial

Introduction

Here is the source code for dk.defxws.fgslucene.Statement.java

Source

//$Id$
/*
 * <p><b>License and Copyright: </b>The contents of this file is subject to the
 * same open source license as the Fedora Repository System at www.fedora-commons.org
 * Copyright &copy; 2006, 2007, 2008, 2009, 2010, 2011, 2012 by The Technical University of Denmark.
 * All rights reserved.</p>
 */
package dk.defxws.fgslucene;

import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.ListIterator;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.FieldComparatorSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.Version;

import dk.defxws.fedoragsearch.server.errors.GenericSearchException;
import org.fcrepo.server.utilities.StreamUtility;

/**
 * queries the Lucene index 
 * 
 * @author  gsp@dtv.dk
 * @version 
 */
public class Statement {

    private static final Logger logger = Logger.getLogger(Statement.class);

    private IndexSearcher searcher;

    public ResultSet executeQuery(IndexSearcher searcher, String queryString, int startRecord, int maxResults,
            int snippetsMax, int fieldMaxLength, Analyzer analyzer, String defaultQueryFields,
            boolean allowLeadingWildcard, boolean lowercaseExpandedTerms, String indexPath, String indexName,
            String snippetBegin, String snippetEnd, String sortFields) throws GenericSearchException {
        if (logger.isDebugEnabled())
            logger.debug("executeQuery" + " query=" + queryString + " startRecord=" + startRecord + " maxResults="
                    + maxResults + " snippetsMax=" + snippetsMax + " fieldMaxLength=" + fieldMaxLength
                    + " indexName=" + indexName + " sortFields=" + sortFields + " defaultQueryFields="
                    + defaultQueryFields + " allowLeadingWildcard=" + allowLeadingWildcard
                    + " lowercaseExpandedTerms=" + lowercaseExpandedTerms);
        this.searcher = searcher;
        ResultSet rs = null;
        StringTokenizer defaultFieldNames = new StringTokenizer(defaultQueryFields);
        int countFields = defaultFieldNames.countTokens();
        String[] defaultFields = new String[countFields];
        for (int i = 0; i < countFields; i++) {
            defaultFields[i] = defaultFieldNames.nextToken();
        }
        Query query = null;
        if (defaultFields.length == 1) {
            QueryParser queryParser = new QueryParser(Version.LUCENE_36, defaultFields[0], analyzer);
            queryParser.setAllowLeadingWildcard(allowLeadingWildcard);
            queryParser.setLowercaseExpandedTerms(lowercaseExpandedTerms);
            if (logger.isDebugEnabled())
                logger.debug("executeQuery queryParser" + " allowLeadingWildcard="
                        + queryParser.getAllowLeadingWildcard() + " lowercaseExpandedTerms="
                        + queryParser.getLowercaseExpandedTerms());
            try {
                query = queryParser.parse(queryString);
            } catch (ParseException e) {
                throw new GenericSearchException(e.toString());
            }
        } else {
            MultiFieldQueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_36, defaultFields,
                    analyzer);
            queryParser.setAllowLeadingWildcard(allowLeadingWildcard);
            queryParser.setLowercaseExpandedTerms(lowercaseExpandedTerms);
            if (logger.isDebugEnabled())
                logger.debug("executeQuery mfqueryParser" + " allowLeadingWildcard="
                        + queryParser.getAllowLeadingWildcard() + " lowercaseExpandedTerms="
                        + queryParser.getLowercaseExpandedTerms());
            try {
                query = queryParser.parse(queryString);
            } catch (ParseException e) {
                throw new GenericSearchException(e.toString());
            }
        }
        if (logger.isDebugEnabled())
            logger.debug("executeQuery after parse query=" + query);
        try {
            query.rewrite(searcher.getIndexReader());
        } catch (Exception e) {
            throw new GenericSearchException(e.toString());
        }
        if (logger.isDebugEnabled())
            logger.debug("executeQuery after rewrite query=" + query);
        int start = Integer.parseInt(Integer.toString(startRecord));
        TopDocs hits = getHits(query, start + maxResults - 1, sortFields);
        ScoreDoc[] docs = hits.scoreDocs;
        int end = Math.min(hits.totalHits, start + maxResults - 1);
        if (logger.isDebugEnabled())
            logger.debug("executeQuery hits.totalHits=" + hits.totalHits);
        StringBuffer resultXml = new StringBuffer();
        resultXml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
        String queryStringEncoded = null;
        try {
            queryStringEncoded = URLEncoder.encode(queryString, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            errorExit(e.toString());
        }
        resultXml.append("<lucenesearch " + "   xmlns:dc=\"http://purl.org/dc/elements/1.1/" + "\" query=\""
                + queryStringEncoded + "\" indexName=\"" + indexName + "\" sortFields=\"" + sortFields
                + "\" hitPageStart=\"" + startRecord + "\" hitPageSize=\"" + maxResults + "\" hitTotal=\""
                + hits.totalHits + "\">");
        ScoreDoc hit = null;
        Document doc = null;
        String hitsScore = null;
        for (int i = start; i <= end; i++) {
            try {
                hit = docs[i - 1];
                doc = searcher.doc(hit.doc);
                hitsScore = "" + hit.score;
            } catch (CorruptIndexException e) {
                errorExit(e.toString());
            } catch (IOException e) {
                errorExit(e.toString());
            }
            resultXml.append("<hit no=\"" + i + "\" score=\"" + hitsScore + "\">");
            for (ListIterator li = doc.getFields().listIterator(); li.hasNext();) {
                Fieldable f = (Fieldable) li.next();
                resultXml.append("<field name=\"" + f.name() + "\"");
                String snippets = null;
                if (snippetsMax > 0) {
                    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("!!!SNIPPETBEGIN", "!!!SNIPPETEND");
                    QueryScorer scorer = new QueryScorer(query, f.name());
                    Highlighter highlighter = new Highlighter(formatter, scorer);
                    Fragmenter fragmenter = new SimpleFragmenter(fieldMaxLength);
                    highlighter.setTextFragmenter(fragmenter);
                    TokenStream tokenStream = analyzer.tokenStream(f.name(), new StringReader(f.stringValue()));
                    try {
                        snippets = highlighter.getBestFragments(tokenStream, f.stringValue(), snippetsMax, " ... ");
                    } catch (Exception e) { // all Exceptions to be caught, not just IOException 
                        errorExit(e.toString());
                    }
                    snippets = checkTruncatedWords(snippets, " ... ");
                    snippets = StreamUtility.enc(snippets);
                    snippets = snippets.replaceAll("!!!SNIPPETBEGIN", snippetBegin);
                    snippets = snippets.replaceAll("!!!SNIPPETEND", snippetEnd);
                    if (snippets != null && !snippets.equals("")) {
                        resultXml.append(" snippet=\"yes\">" + snippets);
                    }
                }
                if (snippets == null || snippets.equals(""))
                    if (fieldMaxLength > 0 && f.stringValue().length() > fieldMaxLength) {
                        String snippet = f.stringValue().substring(0, fieldMaxLength);
                        int iamp = snippet.lastIndexOf("&");
                        if (iamp > -1 && iamp > fieldMaxLength - 8)
                            snippet = snippet.substring(0, iamp);
                        resultXml.append(">" + StreamUtility.enc(snippet) + " ... ");
                    } else
                        resultXml.append(">" + StreamUtility.enc(f.stringValue()));
                resultXml.append("</field>");
            }
            resultXml.append("</hit>");
        }
        resultXml.append("</lucenesearch>");
        if (logger.isDebugEnabled()) {
            int size = 500;
            if (resultXml.length() < size)
                size = resultXml.length();
            String debugString = resultXml.substring(0, size);
            if (resultXml.length() > size)
                debugString += "...";
            logger.debug("executeQuery resultXml=" + debugString);
        }
        rs = new ResultSet(resultXml);
        return rs;
    }

    private void errorExit(String message) throws GenericSearchException {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
            }
        }
        throw new GenericSearchException(message);
    }

    //  sortFields      ::= [sortField[';'sortField]*]
    //  sortField       ::= sortFieldName[','(sortType | locale | comparatorClass)[','reverse]]]]
    //  sortFieldName   ::= #the name of an index field, which is UN_TOKENIZED and contains a single term per document
    //  sortType      ::= 'BYTE' | 'DOC' | 'DOUBLE' | 'FLOAT' | 'INT' | 'LONG' | 'SCORE' | 'SHORT' | 'STRING' | 'STRING_VAL'
    //  locale          ::= language['-'country['-'variant]]
    //  comparatorClass ::= package-path'.'className['('param['-'param]*')']
    //  reverse         ::= 'false' (default) | 'true' | 'reverse'
    private TopDocs getHits(Query query, int numHits, String sortFields) throws GenericSearchException {
        if (logger.isDebugEnabled())
            logger.debug("getHits" + " query=" + query + " numHits=" + numHits + " sortFields=" + sortFields);
        TopDocs hits = null;
        IndexReader ireader = searcher.getIndexReader();
        Collection<String> fieldNames = ReaderUtil.getIndexedFields(ireader);
        String sortFieldsString = sortFields;
        if (sortFields == null)
            sortFieldsString = "";
        StringTokenizer st = new StringTokenizer(sortFieldsString, ";");
        SortField[] sortFieldArray = new SortField[st.countTokens()];
        int i = 0;
        while (st.hasMoreTokens()) {
            SortField sortField = null;
            int sortType = -1;
            String sortFieldString = st.nextToken().trim();
            if (sortFieldString.length() == 0)
                errorExit("getHits sortFields='" + sortFields + "' : empty sortField string");
            StringTokenizer stf = new StringTokenizer(sortFieldString, ",");
            if (!stf.hasMoreTokens())
                errorExit("getHits sortFields='" + sortFields + "' : empty sortFieldName string in '"
                        + sortFieldString + "'");
            String sortFieldName = stf.nextToken().trim();
            if (sortFieldName.length() == 0)
                errorExit("getHits sortFields='" + sortFields + "' : empty sortFieldName string in '"
                        + sortFieldString + "'");
            if (!fieldNames.contains(sortFieldName))
                errorExit("getHits sortFields='" + sortFields + "' : sortFieldName '" + sortFieldName
                        + "' not found as index field name");
            if (!stf.hasMoreTokens()) {
                sortType = SortField.SCORE;
                sortField = new SortField(sortFieldName, sortType);
            } else {
                String sortTypeOrLocaleOrCompString = stf.nextToken().trim();
                if (sortTypeOrLocaleOrCompString.length() == 0)
                    errorExit("getHits sortFields='" + sortFields
                            + "' : empty sortType or locale or comparatorClass string in '" + sortFieldString
                            + "'");
                if (sortTypeOrLocaleOrCompString.indexOf(".") >= 0) {
                    String compString = sortTypeOrLocaleOrCompString;
                    String paramString = "";
                    Object[] params = new Object[] {};
                    if (sortTypeOrLocaleOrCompString.indexOf("(") >= 0) {
                        int p = compString.indexOf("(");
                        int q = compString.indexOf(")");
                        if (p < 3 || q < p + 1)
                            errorExit("getHits sortFields='" + sortFields
                                    + "' : comparatorClass parameters malformed in '" + compString + "'.");
                        paramString = compString.substring(p + 1, q);
                        compString = compString.substring(0, p);
                        StringTokenizer stp = new StringTokenizer(paramString, "-");
                        params = new Object[stp.countTokens()];
                        int ip = 0;
                        while (stp.hasMoreTokens()) {
                            params[ip++] = stp.nextToken().trim();
                        }
                    }
                    FieldComparatorSource scs = null;
                    Class comparatorClass = null;
                    try {
                        comparatorClass = Class.forName(compString);
                    } catch (ClassNotFoundException e) {
                        errorExit("getHits sortFields='" + sortFields + "' : comparatorClass '" + compString + "'"
                                + ": class not found:\n" + e.toString());
                    }
                    Constructor[] constructors = comparatorClass.getConstructors();
                    StringBuffer errorMessage = new StringBuffer();
                    for (int j = 0; j < constructors.length; j++) {
                        Constructor cj = constructors[j];
                        try {
                            scs = (FieldComparatorSource) cj.newInstance(params);
                            if (logger.isDebugEnabled())
                                logger.debug(
                                        "getHits sortFields='" + sortFields + "' : comparatorClass '" + compString
                                                + "'" + ": constructor[" + j + "]='" + cj.toGenericString() + "'");
                            break;
                        } catch (IllegalArgumentException e) {
                            errorMessage.append("\nconstructor[" + j + "]='" + cj.toGenericString() + "'" + "\n"
                                    + e.toString() + " ");
                        } catch (InstantiationException e) {
                            errorMessage.append("\nconstructor[" + j + "]='" + cj.toGenericString() + "'" + "\n"
                                    + e.toString() + " ");
                        } catch (IllegalAccessException e) {
                            errorMessage.append("\nconstructor[" + j + "]='" + cj.toGenericString() + "'" + "\n"
                                    + e.toString() + " ");
                        } catch (InvocationTargetException e) {
                            errorMessage.append("\nconstructor[" + j + "]='" + cj.toGenericString() + "'" + "\n"
                                    + e.toString() + " ");
                        }
                    }
                    if (scs == null) {
                        errorExit("getHits sortFields='" + sortFields + "' : comparatorClass '" + compString + "'"
                                + ": no constructor applied:\n" + errorMessage.toString());
                    }
                    if (!stf.hasMoreTokens()) {
                        sortField = new SortField(sortFieldName, scs);
                    } else {
                        String reverseString = stf.nextToken().trim();
                        if (reverseString.length() == 0)
                            errorExit("getHits sortFields='" + sortFields + "' : empty reverse string in '"
                                    + sortFieldString + "'");
                        boolean reverse = false;
                        if ("true".equalsIgnoreCase(reverseString))
                            reverse = true;
                        else if ("reverse".equalsIgnoreCase(reverseString))
                            reverse = true;
                        else if ("false".equalsIgnoreCase(reverseString))
                            reverse = false;
                        else
                            errorExit("getHits sortFields='" + sortFields + "' : unknown reverse string '"
                                    + reverseString + "' in '" + sortFieldString + "'");
                        sortField = new SortField(sortFieldName, scs, reverse);
                    }
                } else {
                    String sortTypeOrLocaleString = sortTypeOrLocaleOrCompString;
                    Locale locale = null;
                    if ("BYTE".equals(sortTypeOrLocaleString))
                        sortType = SortField.BYTE;
                    else if ("DOC".equals(sortTypeOrLocaleString))
                        sortType = SortField.DOC;
                    else if ("DOUBLE".equals(sortTypeOrLocaleString))
                        sortType = SortField.DOUBLE;
                    else if ("FLOAT".equals(sortTypeOrLocaleString))
                        sortType = SortField.FLOAT;
                    else if ("INT".equals(sortTypeOrLocaleString))
                        sortType = SortField.INT;
                    else if ("LONG".equals(sortTypeOrLocaleString))
                        sortType = SortField.LONG;
                    else if ("SCORE".equals(sortTypeOrLocaleString)) {
                        sortType = SortField.SCORE;
                        searcher.setDefaultFieldSortScoring(true, true);
                    } else if ("SHORT".equals(sortTypeOrLocaleString))
                        sortType = SortField.SHORT;
                    else if ("STRING".equals(sortTypeOrLocaleString))
                        sortType = SortField.STRING;
                    else if ("STRING_VAL".equals(sortTypeOrLocaleString))
                        sortType = SortField.STRING_VAL;
                    else if (((sortTypeOrLocaleString.substring(0, 1)).compareTo("A") >= 0)
                            && ((sortTypeOrLocaleString.substring(0, 1)).compareTo("Z") <= 0)) {
                        errorExit("getHits sortFields='" + sortFields + "' : unknown sortType string '"
                                + sortTypeOrLocaleString + "' in '" + sortFieldString + "'");
                    } else {
                        StringTokenizer stfl = new StringTokenizer(sortTypeOrLocaleString, "-");
                        if (stfl.countTokens() > 3)
                            errorExit("getHits sortFields='" + sortFields + "' : unknown locale string '"
                                    + sortTypeOrLocaleString + "' in '" + sortFieldString + "'");
                        String language = stfl.nextToken().trim();
                        if (language.length() == 0)
                            errorExit("getHits sortFields='" + sortFields + "' : empty language string in '"
                                    + sortFieldString + "'");
                        if (language.length() > 2)
                            errorExit("getHits sortFields='" + sortFields + "' : unknown language string '"
                                    + language + "' in '" + sortFieldString + "'");
                        if (!stfl.hasMoreTokens()) {
                            locale = new Locale(language);
                        } else {
                            String country = stfl.nextToken().trim();
                            if (country.length() == 0)
                                errorExit("getHits sortFields='" + sortFields + "' : empty country string in '"
                                        + sortFieldString + "'");
                            if (country.length() > 3)
                                errorExit("getHits sortFields='" + sortFields + "' : unknown country string '"
                                        + country + "' in '" + sortFieldString + "'");
                            if (!stfl.hasMoreTokens()) {
                                locale = new Locale(language, country);
                            } else {
                                String variant = stfl.nextToken().trim();
                                if (variant.length() == 0)
                                    errorExit("getHits sortFields='" + sortFields + "' : empty variant string in '"
                                            + sortFieldString + "'");
                                locale = new Locale(language, country, variant);
                            }
                        }
                    }
                    if (!stf.hasMoreTokens()) {
                        if (sortType >= 0)
                            sortField = new SortField(sortFieldName, sortType);
                        else
                            sortField = new SortField(sortFieldName, locale);
                    } else {
                        String reverseString = stf.nextToken().trim();
                        if (reverseString.length() == 0)
                            errorExit("getHits sortFields='" + sortFields + "' : empty reverse string in '"
                                    + sortFieldString + "'");
                        boolean reverse = false;
                        if ("true".equalsIgnoreCase(reverseString))
                            reverse = true;
                        else if ("reverse".equalsIgnoreCase(reverseString))
                            reverse = true;
                        else if ("false".equalsIgnoreCase(reverseString))
                            reverse = false;
                        else
                            throw new GenericSearchException(
                                    "getHits sortFields='" + sortFields + "' : unknown reverse string '"
                                            + reverseString + "' in '" + sortFieldString + "'");
                        if (sortType == SortField.SCORE)
                            reverse = !reverse;
                        if (sortType >= 0)
                            sortField = new SortField(sortFieldName, sortType, reverse);
                        else
                            sortField = new SortField(sortFieldName, locale, reverse);
                    }
                }
            }
            sortFieldArray[i++] = sortField;
        }
        if (sortFieldArray.length == 0) {
            try {
                hits = searcher.search(query, numHits);
            } catch (Exception e) {
                errorExit("getHits search : " + e.toString());
            }
        } else {
            try {
                Sort sort = new Sort(sortFieldArray);
                hits = searcher.search(query, numHits, sort);
            } catch (Exception e) {
                errorExit("getHits search sortFields='" + sortFields + "' : " + e.toString());
            }
        }
        return hits;
    }

    //   contributed by Leire Urcelay
    private String checkTruncatedWords(String snippets, String separator) {
        String transformedSnippets = "";

        if (snippets != null && !snippets.equals("")) {
            int separatorIndex = snippets.indexOf(separator);
            while (separatorIndex > -1) {
                transformedSnippets = transformedSnippets
                        .concat(removeLastWordIfNeeded(snippets.substring(0, separatorIndex)));
                transformedSnippets = transformedSnippets.concat(separator);
                snippets = snippets.substring(separatorIndex + separator.length());
                separatorIndex = snippets.indexOf(separator);
            }
            //add last node
            snippets = removeLastWordIfNeeded(snippets.substring(0, snippets.length()));
            transformedSnippets = transformedSnippets.concat(snippets);
        } else {
            transformedSnippets = snippets;
        }
        return transformedSnippets;
    }

    private String removeLastWordIfNeeded(String snippetsFragment) {
        int lastWordIndex = snippetsFragment.lastIndexOf(" ");
        if ((lastWordIndex > -1) && (lastWordIndex + 1 <= snippetsFragment.length())) {
            String lastWord = snippetsFragment.substring(lastWordIndex + 1, snippetsFragment.length());
            if ((lastWord.startsWith("&")) && (!lastWord.endsWith(";"))) {
                snippetsFragment = snippetsFragment.substring(0, lastWordIndex);
            }
        }
        return snippetsFragment;
    }

    void close() throws GenericSearchException {
    }
}