org.mskcc.pathdb.lucene.LuceneResults.java Source code

Introduction

Here is the source code for org.mskcc.pathdb.lucene.LuceneResults.java
Source

// $Id: LuceneResults.java,v 1.14 2009-10-12 18:23:46 cerami Exp $
//------------------------------------------------------------------------------
/** Copyright (c) 2006 Memorial Sloan-Kettering Cancer Center.
 **
 ** Code written by: Ethan Cerami
 ** Authors: Ethan Cerami, Gary Bader, Chris Sander
 **
 ** This library is free software; you can redistribute it and/or modify it
 ** under the terms of the GNU Lesser General Public License as published
 ** by the Free Software Foundation; either version 2.1 of the License, or
 ** any later version.
 **
 ** This library is distributed in the hope that it will be useful, but
 ** WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF
 ** MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  The software and
 ** documentation provided hereunder is on an "as is" basis, and
 ** Memorial Sloan-Kettering Cancer Center
 ** has no obligations to provide maintenance, support,
 ** updates, enhancements or modifications.  In no event shall
 ** Memorial Sloan-Kettering Cancer Center
 ** be liable to any party for direct, indirect, special,
 ** incidental or consequential damages, including lost profits, arising
 ** out of the use of this software and its documentation, even if
 ** Memorial Sloan-Kettering Cancer Center
 ** has been advised of the possibility of such damage.  See
 ** the GNU Lesser General Public License for more details.
 **
 ** You should have received a copy of the GNU Lesser General Public License
 ** along with this library; if not, write to the Free Software Foundation,
 ** Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 **/
package org.mskcc.pathdb.lucene;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.highlight.*;
import org.mskcc.pathdb.sql.dao.DaoExternalDb;
import org.mskcc.pathdb.sql.dao.DaoExternalDbSnapshot;
import org.mskcc.pathdb.sql.dao.DaoException;
import org.mskcc.pathdb.taglib.Pager;
import org.mskcc.pathdb.model.ExternalDatabaseRecord;
import org.mskcc.pathdb.model.ExternalDatabaseSnapshotRecord;
import org.mskcc.pathdb.util.xml.XmlStripper;
import org.mskcc.pathdb.model.GlobalFilterSettings;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import jena.query;

/**
 * Holds the data extracted from one page of Lucene search hits.
 * <p>
 * For every hit between the pager's start index (inclusive) and end index
 * (exclusive) the constructor records the cPath id, the hit score, an optional
 * score explanation, the highlighted text fragments, the data sources and the
 * family-tree counts (descendents, parents, parent pathways, parent
 * interactions).  The getters expose the collected values.
 *
 * @author Ethan Cerami, Benjamin Gross.
 */
public class LuceneResults {
    public static final String START_TAG = "<b>";
    public static final String END_TAG = "</b>";
    public static final String MEMBER_OF = "contains a molecule or protein with the keyword:  ";

    /** Matches a sentence piece that ends in "dr" (any case), e.g. the title "Dr". */
    private static final Pattern ENDS_WITH_DR = Pattern.compile("^.*(?i)dr$");

    private long[] cpathIds;
    private List<List<String>> fragments;
    private Map<Long, Set<String>> dataSourceMap;
    private ArrayList<Integer> numDescendentsList;
    private ArrayList<Integer> numParentsList;
    private ArrayList<Integer> numParentPathwaysList;
    private ArrayList<Integer> numParentInteractionsList;
    private Map<Long, Explanation> explanationMap = new HashMap<Long, Explanation>();
    private Map<Long, Float> scores;
    private int numHits;
    private Set<String> globalDataSources;

    /**
     * Extracts the current page of results from the given hits.
     *
     * @param pager                supplies the start (inclusive) and end (exclusive) hit index.
     * @param query                query used to produce the hits; only used for explanations.
     * @param indexSearcher        searcher used to explain scores when {@code debug} is set.
     * @param hits                 the Lucene hits.
     * @param term                 the user's search term; when null no fragments are extracted.
     * @param globalFilterSettings filter settings whose snapshot ids select the data sources
     *                             counted for parent pathways/interactions; may be null.
     * @param debug                when true, a score explanation is stored for every hit.
     * @throws IOException    on index access errors.
     * @throws ParseException when {@code term} cannot be parsed as a Lucene query.
     * @throws DaoException   on database access errors.
     */
    public LuceneResults(Pager pager, Query query, IndexSearcher indexSearcher, Hits hits, String term,
            GlobalFilterSettings globalFilterSettings, boolean debug)
            throws IOException, ParseException, DaoException {
        numHits = hits.length();
        int size = pager.getEndIndex() - pager.getStartIndex();

        // init private variables
        cpathIds = new long[size];
        fragments = new ArrayList<List<String>>();
        numDescendentsList = new ArrayList<Integer>();
        numParentsList = new ArrayList<Integer>();
        numParentPathwaysList = new ArrayList<Integer>();
        numParentInteractionsList = new ArrayList<Integer>();
        dataSourceMap = new HashMap<Long, Set<String>>();
        scores = new HashMap<Long, Float>();
        globalDataSources = new HashSet<String>();

        // Resolve the filtered snapshot ids to the master terms of their databases.
        if (globalFilterSettings != null) {
            DaoExternalDbSnapshot daoSnapShot = new DaoExternalDbSnapshot();
            for (Long snapshotId : globalFilterSettings.getSnapshotIdSet()) {
                ExternalDatabaseSnapshotRecord snapShotRecord = daoSnapShot.getDatabaseSnapshot(snapshotId);
                if (snapShotRecord == null) {
                    continue;
                }
                ExternalDatabaseRecord externalDatabaseRecord = snapShotRecord.getExternalDatabase();
                if (externalDatabaseRecord == null) {
                    continue;
                }
                globalDataSources.add(externalDatabaseRecord.getMasterTerm());
            }
        }

        DaoExternalDb dao = new DaoExternalDb();
        Highlighter highLighter = (term != null) ? createHighlighter(term) : null;

        int index = 0;
        for (int i = pager.getStartIndex(); i < pager.getEndIndex(); i++) {
            Document doc = hits.doc(i);

            Field field = doc.getField(LuceneConfig.FIELD_CPATH_ID);
            if (field != null) {
                // Parse the id once and reuse it for the array, the scores and the explanations.
                long cpathId = Long.parseLong(field.stringValue());
                cpathIds[index++] = cpathId;
                scores.put(cpathId, Float.valueOf(hits.score(i)));

                if (debug) {
                    explanationMap.put(cpathId, indexSearcher.explain(query, hits.id(i)));
                }
            }

            if (highLighter != null) {
                extractFragment(doc, highLighter, term);
            }

            extractNumFamilyTree(doc, LuceneConfig.FIELD_NUM_DESCENDENTS, numDescendentsList);
            extractNumFamilyTree(doc, LuceneConfig.FIELD_NUM_PARENTS, numParentsList);
            extractNumFamilyTree(doc, LuceneConfig.FIELD_NUM_PARENT_PATHWAYS, numParentPathwaysList);
            extractNumFamilyTree(doc, LuceneConfig.FIELD_NUM_PARENT_INTERACTIONS, numParentInteractionsList);
            extractDataSourceMap(doc, dao);
        }
    }

    /**
     * @return total number of hits, not just those on the current page.
     */
    public int getNumHits() {
        return numHits;
    }

    /**
     * @return cPath ids of the hits on the current page; the array is sized to the page,
     *         so slots for hits without a cPath id field remain 0 at the end.
     */
    public long[] getCpathIds() {
        return cpathIds;
    }

    /**
     * @return one entry per processed hit (only populated when a search term was given);
     *         an entry may be an empty list or null when nothing could be shown.
     */
    public List<List<String>> getFragments() {
        return fragments;
    }

    /**
     * @return map of cPath id to the names of the data sources of that record.
     */
    public Map<Long, Set<String>> getDataSourceMap() {
        return dataSourceMap;
    }

    /**
     * @return number of descendents, one entry per hit on the page.
     */
    public ArrayList<Integer> getNumDescendentsList() {
        return numDescendentsList;
    }

    /**
     * @return number of parents, one entry per hit on the page.
     */
    public ArrayList<Integer> getNumParentsList() {
        return numParentsList;
    }

    /**
     * @return number of parent pathways (restricted to the filtered data sources),
     *         one entry per hit on the page.
     */
    public ArrayList<Integer> getNumParentPathwaysList() {
        return numParentPathwaysList;
    }

    /**
     * @return number of parent interactions (restricted to the filtered data sources),
     *         one entry per hit on the page.
     */
    public ArrayList<Integer> getNumParentInteractionsList() {
        return numParentInteractionsList;
    }

    /**
     * @return map of cPath id to score explanation; empty unless debug was requested.
     */
    public Map<Long, Explanation> getExplanationMap() {
        return explanationMap;
    }

    /**
     * @return map of cPath id to Lucene score.
     */
    public Map<Long, Float> getScores() {
        return scores;
    }

    /**
     * Records the data source names of a document under its cPath id.
     * Nothing is recorded when either the cPath id or the data source field is missing.
     *
     * @param doc Lucene Document
     * @param dao used to translate each data source term into a database record
     * @throws DaoException on database access errors.
     */
    private void extractDataSourceMap(Document doc, DaoExternalDb dao) throws DaoException {
        Field cpathIdField = doc.getField(LuceneConfig.FIELD_CPATH_ID);
        Field dataSourceField = doc.getField(LuceneConfig.FIELD_DATA_SOURCE);
        if (cpathIdField == null || dataSourceField == null) {
            return;
        }

        HashSet<String> dataSourcesSet = new HashSet<String>();
        for (String fieldValue : dataSourceField.stringValue().split(" ")) {
            if (fieldValue.trim().length() == 0) {
                continue;
            }
            // NOTE(review): assumes the DAO may return null for an unknown term
            // (as the snapshot DAO used in the constructor does); such terms are skipped.
            ExternalDatabaseRecord record = dao.getRecordByTerm(fieldValue);
            if (record != null) {
                dataSourcesSet.add(record.getName());
            }
        }
        dataSourceMap.put(Long.parseLong(cpathIdField.stringValue()), dataSourcesSet);
    }

    /**
     * Generalized function to extract total number of descendents, parents, interactions, pathways.
     * <p>
     * Exactly one value is appended to {@code list} per call.  The parent pathway and
     * parent interaction fields hold tab-separated {@code MASTER_TERM:count} pairs; only
     * the counts whose master term is among the globally filtered data sources are summed.
     * All other fields hold a plain integer.  Missing or unparsable values count as 0.
     *
     * @param doc Document
     * @param fieldName String
     * @param list ArrayList<Integer>
     */
    private void extractNumFamilyTree(Document doc, String fieldName, ArrayList<Integer> list) {
        Field field = doc.getField(fieldName);
        if (field == null) {
            list.add(0);
            return;
        }

        // Compare by value: reference equality only works for interned constants.
        boolean countsPerDataSource = LuceneConfig.FIELD_NUM_PARENT_PATHWAYS.equals(fieldName)
                || LuceneConfig.FIELD_NUM_PARENT_INTERACTIONS.equals(fieldName);

        if (!countsPerDataSource) {
            int num;
            try {
                num = Integer.parseInt(field.stringValue());
            } catch (NumberFormatException ignored) {
                // an unparsable count is reported as zero
                num = 0;
            }
            list.add(num);
            return;
        }

        int total = 0;
        for (String entry : field.stringValue().split("\t")) {
            // first component is MASTER TERM, second is count
            String[] components = entry.split(":");
            if (components.length != 2) {
                continue;
            }
            if (globalDataSources.contains(components[0])) {
                try {
                    total += Integer.parseInt(components[1]);
                } catch (NumberFormatException ignored) {
                    // a malformed count contributes nothing to the total
                }
            }
        }
        list.add(total);
    }

    /**
     * Appends exactly one entry to {@link #fragments} for the given document.
     * <p>
     * When the highlighter finds no fragment but the document has a name, the hit is
     * assumed to match through a descendent and a "member of" sentence is produced,
     * unless the term is a data source query or contains several words, in which case
     * an empty list is added.  Otherwise the cooked fragment is added, which may be null.
     *
     * @param doc Lucene Document
     * @param highLighter highlighter built from the search term
     * @param term the search term
     * @throws IOException on errors while tokenizing the document fields.
     */
    private void extractFragment(Document doc, Highlighter highLighter, String term) throws IOException {
        String fragment = getFragment(doc, highLighter, term);

        // if fragment is null, assume descendent ?
        if (fragment == null || fragment.length() == 0) {
            Field nameField = doc.getField(LuceneConfig.FIELD_NAME);
            String name = (nameField == null) ? null : nameField.stringValue();
            if (name != null && name.length() > 0) {
                List<String> memberOf = new ArrayList<String>();
                //  Do not show sentences like this:
                //  Protein contains a molecule or protein with the keyword: DATA_SOURCE:REACTOME.
                boolean dataSourceQuery = term != null && term.indexOf(LuceneConfig.FIELD_DATA_SOURCE) > -1;
                //  Do not show sentences like this:
                //  Protein contains a molecule or protein with the keyword: dna repair.
                boolean multiWordQuery = term != null && term.trim().indexOf(" ") > 0;
                if (term != null && !dataSourceQuery && !multiWordQuery) {
                    memberOf.add(MEMBER_OF + START_TAG + term.toUpperCase() + END_TAG);
                }
                fragments.add(memberOf);
                return;
            }
        }
        fragments.add(cookFragment(term, fragment));
    }

    /**
     * If the query contains multiple terms, surrounds it with quotes (unless the query
     * already contains a quote), then removes all '*' characters.
     * Currently not invoked by the constructor.
     *
     * @param term the search term
     * @return the reformatted term
     */
    private String reformatTerm(String term) {
        String reformatted = term.trim();
        if (reformatted.matches("^[^\"]*\\s[^\"]*$")) {
            reformatted = "\"" + reformatted + "\"";
        }
        return reformatted.replace("*", "");
    }

    /**
     * Builds a highlighter for the given search term.
     * <p>
     * The query is rewritten against the index so that wildcard terms are expanded
     * before they are handed to the scorer.  The index reader is only needed for that
     * rewrite and is closed before this method returns.
     *
     * @param term the search term
     * @return highlighter producing fragments of at most 100 characters
     * @throws IOException    when the index cannot be opened or read.
     * @throws ParseException when the term cannot be parsed.
     */
    private Highlighter createHighlighter(String term) throws IOException, ParseException {

        //  Standard Analyzer to extract words using a list of English stop words.
        StandardAnalyzer analyzer = new StandardAnalyzer();

        //  Standard Query Parser
        QueryParser queryParser = new QueryParser(LuceneConfig.FIELD_ALL, analyzer);

        // for the usage of highlighting with wildcards
        // Necessary to expand search terms
        Query luceneQuery;
        IndexReader reader = IndexReader.open(new File(LuceneConfig.getLuceneDirectory()));
        try {
            luceneQuery = queryParser.parse(term).rewrite(reader);
        } finally {
            // always release the reader, even when parsing or rewriting fails
            reader.close();
        }

        //  Scorer implementation which scores text fragments by the number of
        //  unique query terms found.
        QueryScorer queryScorer = new QueryScorer(luceneQuery);

        //  HTML Formatted surrounds matching text with <B></B> tags.
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter();

        //  Highligher Class
        Highlighter highLighter = new Highlighter(htmlFormatter, queryScorer);

        //  100 Characters Max in Each Fragment
        Fragmenter fragmenter = new SimpleFragmenter(100);
        highLighter.setTextFragmenter(fragmenter);
        return highLighter;
    }

    /**
     * Grabs fragment of lucene field that matches query term & highlights term.
     * Method traverses the lucene fields indexed for match.  Fields that are absent
     * from the document are skipped.  If match is not found null is returned.
     *
     * @param doc Lucene Document
     * @param highLighter QueryHighlightExtractor
     * @param term the search term (not used by this method)
     * @return String
     * @throws IOException
     */
    private String getFragment(Document doc, Highlighter highLighter, String term) throws IOException {

        String[] fields = { LuceneConfig.FIELD_ALL, LuceneConfig.FIELD_SYNONYMS, LuceneConfig.FIELD_EXTERNAL_REFS };
        StandardAnalyzer analyzer = new StandardAnalyzer();

        for (String fieldName : fields) {
            //  Get the Field of Interest; not every document carries every field
            Field field = doc.getField(fieldName);
            if (field == null) {
                continue;
            }
            String text = field.stringValue();
            if (text == null) {
                continue;
            }

            //  Create the Token Stream
            TokenStream tokenStream = analyzer.tokenStream(LuceneConfig.FIELD_ALL, new StringReader(text));

            //  Get the Best Fragment
            String formattedText = highLighter.getBestFragments(tokenStream, text, 5, "...");
            if (formattedText != null && formattedText.length() > 0) {
                return formattedText;
            }
        }

        // made it here, assume descendent ?
        return null;
    }

    /**
     * Removes XML element delimiter placed in fragment by XmlStripper.
     *  Also removes subset(s) of fragments that do not contain query terms.
     * <p>
     * Each remaining fragment is re-highlighted with {@link #START_TAG}/{@link #END_TAG}.
     * Empty terms (caused by leading or repeated spaces) are ignored, and terms that are
     * not valid regular expressions are matched literally.
     *
     * @param terms String
     * @param fragments String
     * @return List<String>, or null when there are no fragments, no valid terms,
     *         or no fragment contains a term
     */
    private List<String> cookFragment(String terms, String fragments) {

        // check args
        if (terms == null || fragments == null || fragments.length() == 0) {
            return null;
        }

        // collect the regex for every valid term - see validTerm(..) for more information
        List<String> termRegexes = new ArrayList<String>();
        for (String term : terms.split(" ")) {
            // an empty term would match everywhere and wrap every character in tags
            if (term.length() > 0 && validTerm(term)) {
                termRegexes.add(toSafeRegex(term));
            }
        }

        // do we have valid terms ?
        if (termRegexes.isEmpty()) {
            return null;
        }

        // one highlighting pattern per term plus one alternation of all terms,
        // used to match terms anywhere in a fragment
        List<Pattern> highlightPatterns = new ArrayList<Pattern>();
        StringBuilder alternation = new StringBuilder();
        for (String termRegex : termRegexes) {
            highlightPatterns.add(Pattern.compile("(?i)(" + termRegex + ")"));
            if (alternation.length() > 0) {
                alternation.append('|');
            }
            alternation.append(termRegex);
        }
        // note: (?i) specifies case-insensitive matching
        Pattern containsTerm = Pattern.compile("^.*(?i)(" + alternation + ").*$");

        List<String> toReturn = null;
        Set<String> seenFragments = new HashSet<String>();
        for (String fragment : fragments.split(XmlStripper.ELEMENT_DELIMITER)) {
            if (!containsTerm.matcher(fragment).matches()) {
                continue;
            }
            // don't process duplicate fragments
            if (!seenFragments.add(fragment)) {
                continue;
            }
            if (toReturn == null) {
                toReturn = new ArrayList<String>();
            }

            // do our own highlighting
            String highlighted = fragment;
            for (Pattern highlightPattern : highlightPatterns) {
                highlighted = highlightPattern.matcher(highlighted).replaceAll(START_TAG + "$1" + END_TAG);
            }

            // lets remove sentences that are part of fragment but don't contain any terms
            StringBuilder kept = new StringBuilder();
            for (String sentence : highlighted.split("\\.")) {
                // try to determine if '.' is not part of title, like Dr.
                if (ENDS_WITH_DR.matcher(sentence).matches() || !containsTerm.matcher(sentence).matches()) {
                    continue;
                }
                kept.append(sentence);
                // do we append a '.' after the sentence ?
                int indexOfPeriod = highlighted.indexOf(sentence) + sentence.length();
                if (indexOfPeriod <= highlighted.length() - 1 && highlighted.charAt(indexOfPeriod) == '.') {
                    kept.append('.');
                }
            }

            // fragment may not have contained periods
            toReturn.add(kept.length() == 0 ? highlighted : kept.toString());
        }

        // outta here
        return toReturn;
    }

    /**
     * Returns the term unchanged when it can be embedded in a regex group, otherwise
     * returns a quoted form that matches the term literally.  This keeps terms such as
     * "(p53" from raising a PatternSyntaxException while leaving terms that already
     * compile untouched.
     *
     * @param term a non-empty search term
     * @return regex source that is safe to place inside a group
     */
    private static String toSafeRegex(String term) {
        try {
            Pattern.compile("(" + term + ")");
            return term;
        } catch (PatternSyntaxException notARegex) {
            return Pattern.quote(term);
        }
    }

    /**
     * The idea behind this method is that we only should
     * consider terms in cookFragment(..) to be valid if the term
     * contains "free text" entered by the user, ie 'p53'
     * If the term is a canned lucene query, like
     * 'data_source:"NCI_NATURE"' or 'entity_type:pathway'
     * or 'AND', the term should not be considered valid.
     *
     * @param term String
     * @return boolean
     */
    private static boolean validTerm(String term) {

        // see BioPaxToIndex for fields to check here...
        boolean fieldQuery = term.contains(LuceneConfig.FIELD_ALL + ":")
                || term.contains(LuceneConfig.FIELD_CPATH_ID + ":")
                || term.contains(LuceneConfig.FIELD_DATA_SOURCE + ":")
                || term.contains(LuceneConfig.FIELD_NAME + ":")
                || term.contains(LuceneConfig.FIELD_ORGANISM + ":")
                || term.contains(LuceneConfig.FIELD_SYNONYMS + ":")
                || term.contains(LuceneConfig.FIELD_EXTERNAL_REFS + ":")
                || term.contains(LuceneConfig.FIELD_DESCENDENTS + ":");

        // lucene boolean operators must be capitalized
        boolean booleanOperator = term.equalsIgnoreCase("AND")
                || term.equalsIgnoreCase("OR")
                || term.equalsIgnoreCase("NOT");

        return !(fieldQuery || booleanOperator);
    }
}