uk.ac.ebi.arrayexpress.utils.search.EFOExpansionLookupIndex.java Source code

Introduction

Here is the source code for uk.ac.ebi.arrayexpress.utils.search.EFOExpansionLookupIndex.java
Source

package uk.ac.ebi.arrayexpress.utils.search;

/*
 * Copyright 2009-2011 European Molecular Biology Laboratory
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.ebi.arrayexpress.utils.efo.EFONode;
import uk.ac.ebi.arrayexpress.utils.efo.IEFO;

import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Expansion lookup backed by a Lucene index stored on disk.
 * <p>
 * {@link #buildIndex()} walks the EFO tree starting at {@link IEFO#ROOT_ID} and writes one
 * document per indexed node: the node term and its synonyms go into the analyzed
 * <code>term</code> field, the terms of the node's children go into the non-analyzed
 * <code>efo</code> field. Custom synonym groups are written as additional documents.
 * {@link #getExpansionTerms(Query)} re-targets a query at the <code>term</code> field and
 * collects the stored values of the matching documents.
 */
public class EFOExpansionLookupIndex implements IEFOExpansionLookup {
    // logging machinery
    private final Logger logger = LoggerFactory.getLogger(getClass());

    // names of the two fields every index document is made of
    private static final String FIELD_TERM = "term";
    private static final String FIELD_EFO = "efo";

    // every character that is not a digit, a word character or a dash is
    // replaced with a space before a value is indexed
    private static final Pattern NON_INDEXABLE_CHARS = Pattern.compile("[^\\d\\w-]");

    // terms matching this pattern (in full) are never used as expansions
    private static final Pattern STOP_EXPANSION_PATTERN = Pattern
            .compile(".*(\\s\\(.+\\)|\\s\\[.+\\]|,\\s|\\s-\\s|/|NOS).*");

    private final FSDirectory indexDirectory;

    private IEFO efo;
    private final Set<String> stopWords;
    private Map<String, Set<String>> customSynonyms;

    // maximum number of index documents to be processed; in reality shouldn't
    // be more than 2
    private static final int MAX_INDEX_HITS = 16;

    // maximum number of index expansions
    // TODO: rpe must be configurable (its impossible to expand disease in more
    // than 3k terms!!! - it's too slow)
    private static final int MAX_NUMBER_EXPANSIONS = 100;

    /**
     * Opens (but does not build) the index directory.
     *
     * @param indexLocation file system path of the Lucene index directory
     * @param stopWords     terms that must never be indexed or used as expansions;
     *                      <code>null</code> is treated as an empty set
     * @throws IOException if the directory cannot be opened
     */
    public EFOExpansionLookupIndex(String indexLocation, Set<String> stopWords) throws IOException {
        this.stopWords = (null != stopWords) ? stopWords : Collections.<String>emptySet();
        this.indexDirectory = FSDirectory.open(new File(indexLocation));
    }

    private IEFO getEfo() {
        return this.efo;
    }

    /**
     * Sets the ontology the index is built from; must be called before {@link #buildIndex()}.
     *
     * @param efo ontology to index
     */
    public void setEfo(IEFO efo) {
        this.efo = efo;
    }

    /**
     * Sets optional custom synonyms that are merged into the EFO-derived documents and
     * additionally indexed as documents of their own.
     *
     * @param synonyms map of term to its set of synonyms, may be <code>null</code>
     */
    public void setCustomSynonyms(Map<String, Set<String>> synonyms) {
        this.customSynonyms = synonyms;
    }

    /**
     * (Re)creates the index from the configured EFO and custom synonyms.
     * <p>
     * Failures other than interruption are logged and not propagated. Whenever building
     * does not reach the commit step the writer is rolled back, so that it does not stay
     * open on the directory.
     *
     * @throws InterruptedException if the calling thread is interrupted while building
     */
    public void buildIndex() throws InterruptedException {
        IndexWriter w = null;
        // becomes true once the writer is handed to commitIndex, which closes it itself
        boolean handedToCommit = false;
        try {
            w = createIndex(this.indexDirectory, new LowercaseAnalyzer());
            if (null == w) {
                // createIndex has already logged the cause; without a writer every single
                // addDocument call would only fail again
                this.logger.error("Unable to create index writer, expansion lookup index was not built");
                return;
            }

            this.logger.debug("Building expansion lookup index");
            addNodeAndChildren(this.efo.getMap().get(IEFO.ROOT_ID), w);
            addCustomSynonyms(w);
            handedToCommit = true;
            commitIndex(w);
            this.logger.debug("Building completed");
        } catch (InterruptedException x) {
            throw x;
        } catch (Exception x) {
            this.logger.error("Caught an exception:", x);
        } finally {
            if (null != w && !handedToCommit) {
                rollbackIndex(w);
            }
        }
    }

    /**
     * Adds one document per custom synonym group, so that groups that were not picked up
     * during EFO processing get a chance to be included, too. Duplicates are not a
     * concern, they are removed during retrieval.
     *
     * @param w writer of the index being built
     * @throws InterruptedException if the calling thread is interrupted
     */
    private void addCustomSynonyms(IndexWriter w) throws InterruptedException {
        if (null == this.customSynonyms) {
            return;
        }

        // synonyms already written as part of an earlier group (case-insensitive)
        Set<String> addedTerms = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
        for (Map.Entry<String, Set<String>> entry : this.customSynonyms.entrySet()) {
            String term = entry.getKey();
            Set<String> syns = entry.getValue();
            if (null == syns || (null != term && addedTerms.contains(term))) {
                continue;
            }

            Document d = new Document();
            boolean hasFields = false;
            for (String syn : syns) {
                if (null != syn) {
                    addIndexField(d, FIELD_TERM, syn, true, true);
                    addedTerms.add(syn);
                    hasFields = true;
                }
            }

            // a document without any field cannot match anything, do not store it
            if (hasFields) {
                addIndexDocument(w, d);
            }
        }
    }

    /**
     * Indexes the given node and, depth first, all of its descendants.
     *
     * @param node node to start from, <code>null</code> is ignored
     * @param w    writer of the index being built
     * @throws InterruptedException if the calling thread is interrupted
     */
    private void addNodeAndChildren(EFONode node, IndexWriter w) throws InterruptedException {
        //TODO: rpe talk with Nikolay about this
        // NOTE(review): the sleep makes the recursion interruptible at every node
        // (and costs 1ms per node); whether throttling is also intended is unclear
        Thread.sleep(1);
        if (null != node) {
            addNodeToIndex(node, w);
            for (EFONode child : node.getChildren()) {
                addNodeAndChildren(child, w);
            }
        }
    }

    /**
     * Writes the index document of a single node: its term and synonyms as
     * <code>term</code> values, the terms of its children as <code>efo</code> values.
     * Nothing is written when the term is a stop term or when there is neither a synonym
     * nor a child term to expand to.
     *
     * @param node node to index
     * @param w    writer of the index being built
     * @throws InterruptedException if the calling thread is interrupted
     */
    private void addNodeToIndex(EFONode node, IndexWriter w) throws InterruptedException {
        String term = node.getTerm();
        // NOTE(review): this set is whatever the node returns and it is modified below;
        // if the node hands out its internal set, custom synonyms end up in the
        // ontology itself - confirm whether that side effect is relied upon
        Set<String> synonyms = node.getAlternativeTerms();

        // if the node represents organizational class, just include its
        // synonyms, but not children
        Set<String> childTerms = node.isOrganizationalClass() ? new HashSet<String>()
                : getEfo().getTerms(node.getId(), IEFO.INCLUDE_CHILDREN);

        // here we add custom synonyms to EFO synonyms/child terms and their
        // synonyms
        if (null != this.customSynonyms) {
            expandWithCustomSynonyms(synonyms);

            if (this.customSynonyms.containsKey(term)) {
                Set<String> custom = this.customSynonyms.get(term);
                if (null != custom) {
                    synonyms.addAll(custom);
                }
            }

            expandWithCustomSynonyms(childTerms);
        }
        // the term itself is always added last, it must not be listed as its own synonym
        synonyms.remove(term);

        if (isStopTerm(term) || (synonyms.isEmpty() && childTerms.isEmpty())) {
            return;
        }

        Document d = new Document();

        for (String syn : synonyms) {
            // synonyms that are child terms as well are covered by the "efo" field
            if (!childTerms.contains(syn) && !isStopExpansionTerm(syn)) {
                addIndexField(d, FIELD_TERM, syn, true, true);
            }
        }

        for (String efoTerm : childTerms) {
            if (!isStopExpansionTerm(efoTerm)) {
                addIndexField(d, FIELD_EFO, efoTerm, false, true);
            }
        }

        addIndexField(d, FIELD_TERM, term, true, true);
        addIndexDocument(w, d);
    }

    /**
     * Adds to the given set the custom synonyms of every term it contained on entry.
     * Must only be called when custom synonyms are set.
     *
     * @param terms set to expand in place
     */
    private void expandWithCustomSynonyms(Set<String> terms) {
        // iterate over a snapshot, the set itself grows while we go
        for (String t : new HashSet<String>(terms)) {
            if (null != t && this.customSynonyms.containsKey(t)) {
                Set<String> custom = this.customSynonyms.get(t);
                if (null != custom) {
                    terms.addAll(custom);
                }
            }
        }
    }

    /**
     * Looks up synonyms and EFO child terms for the given query.
     * <p>
     * The query is re-targeted at the <code>term</code> field, at most
     * {@value #MAX_INDEX_HITS} matching documents are read and their stored
     * <code>term</code> / <code>efo</code> values are collected. The EFO part of the
     * result is capped at {@value #MAX_NUMBER_EXPANSIONS} terms.
     *
     * @param origQuery query to find expansions for
     * @return collected expansions; empty when the index directory does not exist
     * @throws IOException if the index cannot be read
     */
    public EFOExpansionTerms getExpansionTerms(Query origQuery) throws IOException {
        EFOExpansionTerms expansion = new EFOExpansionTerms();

        if (!this.indexDirectory.getFile().exists()) {
            return expansion;
        }

        IndexReader reader = IndexReader.open(this.indexDirectory, true);
        try {
            // to show _all_ available nodes
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                Query q = overrideQueryField(origQuery, FIELD_TERM);

                TopDocs hits = searcher.search(q, MAX_INDEX_HITS);
                this.logger.debug("Expansion lookup for query [{}] returned [{}] hits", q.toString(),
                        hits.totalHits);

                for (ScoreDoc d : hits.scoreDocs) {
                    Document doc = searcher.doc(d.doc);
                    String[] terms = doc.getValues(FIELD_TERM);
                    String[] efoTerms = doc.getValues(FIELD_EFO);
                    if (this.logger.isDebugEnabled()) {
                        this.logger.debug("Synonyms [{}], EFO Terms [{}]", StringUtils.join(terms, ", "),
                                StringUtils.join(efoTerms, ", "));
                    }
                    expansion.synonyms.addAll(Arrays.asList(terms));
                    expansion.efo.addAll(Arrays.asList(efoTerms));
                }
                this.logger.debug("Number of expansion terms: {}", expansion.efo.size());

                // TODO: RPE I have to limit the number of expansions ... i I
                // expand the term disease I will have more 5k(5196) terms ...
                // it;s not performant)
                // which terms survive the cut depends on the iteration order of the
                // collection, i.e. the selection is arbitrary
                if (expansion.efo.size() > MAX_NUMBER_EXPANSIONS) {
                    List<String> all = new ArrayList<String>(expansion.efo);
                    expansion.efo = new HashSet<String>(all.subList(0, MAX_NUMBER_EXPANSIONS));
                }
            } finally {
                searcher.close();
            }
        } finally {
            // nested so the reader is released even when closing the searcher fails
            reader.close();
        }

        return expansion;
    }

    /**
     * Creates a writer that starts a brand new index in the given directory.
     *
     * @param indexDirectory directory to write to
     * @param analyzer       analyzer applied to analyzed fields
     * @return the writer, or <code>null</code> if it could not be created (the cause is logged)
     * @throws InterruptedException if the calling thread is interrupted
     */
    private IndexWriter createIndex(Directory indexDirectory, Analyzer analyzer) throws InterruptedException {
        IndexWriter iwriter = null;
        try {
            Thread.sleep(1);
            iwriter = new IndexWriter(indexDirectory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
        } catch (InterruptedException x) {
            throw x;
        } catch (Exception x) {
            this.logger.error("Caught an exception:", x);
        }

        return iwriter;
    }

    /**
     * Normalizes a value (non-indexable characters become spaces, everything is
     * lower-cased) and adds it to the document as a field without term vectors.
     *
     * @param document      document to add the field to
     * @param name          field name
     * @param value         raw field value; <code>null</code> is ignored
     * @param shouldAnalyze whether the field is analyzed or indexed as is
     * @param shouldStore   whether the value is stored for retrieval
     */
    private void addIndexField(Document document, String name, String value, boolean shouldAnalyze,
            boolean shouldStore) {
        if (null == value) {
            return;
        }
        // fixed locale: lower-casing must not depend on the JVM default (e.g. Turkish "I")
        String normalized = NON_INDEXABLE_CHARS.matcher(value).replaceAll(" ").toLowerCase(Locale.ENGLISH);
        document.add(new Field(name, normalized, shouldStore ? Field.Store.YES : Field.Store.NO,
                shouldAnalyze ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED, Field.TermVector.NO));
    }

    /**
     * Adds a document to the index; failures other than interruption are logged and
     * building continues with the next document.
     *
     * @param iwriter  writer of the index being built
     * @param document document to add
     * @throws InterruptedException if the calling thread is interrupted
     */
    private void addIndexDocument(IndexWriter iwriter, Document document) throws InterruptedException {
        try {
            Thread.sleep(1);
            iwriter.addDocument(document);
        } catch (InterruptedException x) {
            throw x;
        } catch (Exception x) {
            this.logger.error("Caught an exception:", x);
        }
    }

    /**
     * Optimizes and commits the index, then closes the writer. The writer is closed even
     * if optimizing or committing fails; all failures are logged, none is propagated.
     *
     * @param iwriter writer to finish
     */
    private void commitIndex(IndexWriter iwriter) {
        try {
            iwriter.optimize();
            iwriter.commit();
        } catch (Exception x) {
            this.logger.error("Caught an exception:", x);
        } finally {
            try {
                iwriter.close();
            } catch (Exception x) {
                this.logger.error("Caught an exception:", x);
            }
        }
    }

    /**
     * Abandons an unfinished build by rolling the writer back; failures are logged.
     *
     * @param iwriter writer to roll back
     */
    private void rollbackIndex(IndexWriter iwriter) {
        try {
            iwriter.rollback();
        } catch (Exception x) {
            this.logger.error("Caught an exception:", x);
        }
    }

    /**
     * Builds a query of the same kind as the original one, but against the given field.
     * Prefix, wildcard, term range, fuzzy and term queries keep their term text; a phrase
     * query becomes a single term query on the space-joined phrase.
     *
     * @param origQuery query to translate
     * @param fieldName field the new query is run against
     * @return translated query; for <code>null</code>, unsupported or failing input a
     *         term query on an empty term is returned instead
     */
    private Query overrideQueryField(Query origQuery, String fieldName) {
        Query query = new TermQuery(new Term(""));

        if (null == origQuery) {
            this.logger.error("Unable to override field of a null query");
            return query;
        }

        try {
            if (origQuery instanceof PrefixQuery) {
                Term term = ((PrefixQuery) origQuery).getPrefix();
                query = new PrefixQuery(new Term(fieldName, term.text()));
            } else if (origQuery instanceof WildcardQuery) {
                Term term = ((WildcardQuery) origQuery).getTerm();
                query = new WildcardQuery(new Term(fieldName, term.text()));
            } else if (origQuery instanceof TermRangeQuery) {
                TermRangeQuery trq = (TermRangeQuery) origQuery;
                query = new TermRangeQuery(fieldName, trq.getLowerTerm(), trq.getUpperTerm(), trq.includesLower(),
                        trq.includesUpper());
            } else if (origQuery instanceof FuzzyQuery) {
                Term term = ((FuzzyQuery) origQuery).getTerm();
                query = new FuzzyQuery(new Term(fieldName, term.text()));
            } else if (origQuery instanceof TermQuery) {
                Term term = ((TermQuery) origQuery).getTerm();
                query = new TermQuery(new Term(fieldName, term.text()));
            } else if (origQuery instanceof PhraseQuery) {
                Term[] terms = ((PhraseQuery) origQuery).getTerms();
                StringBuilder text = new StringBuilder();
                for (Term t : terms) {
                    text.append(t.text()).append(' ');
                }
                query = new TermQuery(new Term(fieldName, text.toString().trim()));
            } else {
                this.logger.error("Unsupported query type [{}]", origQuery.getClass().getCanonicalName());
            }
        } catch (Exception x) {
            this.logger.error("Caught an exception:", x);
        }

        return query;
    }

    /**
     * @param str term to check
     * @return <code>true</code> if the term is <code>null</code>, shorter than three
     *         characters or (lower-cased) one of the stop words
     */
    private boolean isStopTerm(String str) {
        return null == str || str.length() < 3 || stopWords.contains(str.toLowerCase(Locale.ENGLISH));
    }

    /**
     * @param str term to check
     * @return <code>true</code> if the term is a stop term or matches the stop expansion
     *         pattern
     */
    private boolean isStopExpansionTerm(String str) {
        return isStopTerm(str) || STOP_EXPANSION_PATTERN.matcher(str).matches();
    }
}