org.splevo.vpm.analyzer.semantic.lucene.finder.SharedTermFinder.java Source code

Introduction

Here is the source code for org.splevo.vpm.analyzer.semantic.lucene.finder.SharedTermFinder.java
Source

/*******************************************************************************
 * Copyright (c) 2014
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    Daniel Kojic - initial API and implementation and/or initial documentation
 *    Benjamin Klatt
 *******************************************************************************/
package org.splevo.vpm.analyzer.semantic.lucene.finder;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.util.BytesRef;
import org.splevo.vpm.analyzer.semantic.lucene.Indexer;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;

/**
 * This Finder matches documents with a specified minimum percentage of equal terms.
 */
public class SharedTermFinder implements RelationshipFinder {

    /** The logger for this class. */
    private Logger logger = Logger.getLogger(SharedTermFinder.class);

    /** The reader containing the content to be matched. */
    private DirectoryReader reader;

    /** Indicates whether to include comments for analysis or not. */
    private boolean matchComments;

    /** The minimum number of shared terms to match. */
    private int minSharedTerms;

    /**
     * Initializations. Queries the content of the given {@link DirectoryReader} . Documents having
     * the specified minimum similarity or having at least one common term (depending on the
     * parameter) are a match.
     *
     * @param reader
     *            The reader to be used by the Finder.
     * @param matchComments
     *            Indicates whether to include comments for analysis or not.
     * @param minSharedTerms
     *            The minimum number of terms to share for a relationship.
     */
    public SharedTermFinder(DirectoryReader reader, boolean matchComments, int minSharedTerms) {
        this.reader = reader;
        this.matchComments = matchComments;
        this.minSharedTerms = minSharedTerms;
    }

    @Override
    public Table<String, String, Set<String>> findSimilarEntries() {

        Table<String, String, Set<String>> sharedTermTable = HashBasedTable.create();

        try {
            IndexSearcher indexSearcher = new IndexSearcher(reader);

            // Iterate over all documents (VariationPoints).
            for (int i = 0; i < reader.maxDoc(); i++) {
                Document referenceDoc = indexSearcher.doc(i);

                if (referenceDoc.getField(Indexer.INDEX_CONTENT) != null) {
                    Table<String, String, Set<String>> sharedTerms = buildQueryAndExecuteSearch(indexSearcher,
                            Indexer.INDEX_CONTENT, i, referenceDoc);
                    sharedTermTable.putAll(sharedTerms);
                }

                if (matchComments && referenceDoc.getField(Indexer.INDEX_COMMENT) != null) {
                    Table<String, String, Set<String>> sharedTerms = buildQueryAndExecuteSearch(indexSearcher,
                            Indexer.INDEX_COMMENT, i, referenceDoc);
                    sharedTermTable.putAll(sharedTerms);
                }
            }
        } catch (IOException e) {
            logger.error("Failure while searching Lucene index.", e);
        }

        return sharedTermTable;
    }

    /**
     * Builds the relevant arguments for a search and executes it. Add the search results to the
     * given result parameter.
     *
     * @param indexSearcher
     *            The {@link IndexSearcher} to execute the search on.
     * @param field
     *            The field to search on.
     * @param docID
     *            The current document's ID.
     * @param referenceDoc
     *            The current document.
     * @return The table of identified term-sharing variation points.<br>
     *         Table entries: [sourceVPId, targetVPid, sharedTerms]
     * @throws IOException
     *             Thrown if there were problems during search.
     */
    private Table<String, String, Set<String>> buildQueryAndExecuteSearch(IndexSearcher indexSearcher, String field,
            int docID, Document referenceDoc) throws IOException {
        Map<String, Integer> frequencies = getTermFrequencies(docID, field);
        Query query = buildQuery(field, frequencies);
        int maxDoc = reader.maxDoc();
        ScoreDoc[] hits = executeQuery(indexSearcher, maxDoc, query);
        Set<Term> referenceDocTerms = new HashSet<Term>();
        query.extractTerms(referenceDocTerms);
        return buildSharedTermTable(indexSearcher, hits, referenceDoc, referenceDocTerms, field);
    }

    /**
     * Build a matrix assigning to each variation point id combination their shared terms.
     *
     * If no terms are shared, no entry will be contained in the according table cell.
     *
     * Each variation point id pair is ordered before the table is filled. This ensures the required
     * bi-directional relationship.
     *
     * @param indexSearcher
     *            The index searcher to access the Lucene search index.
     * @param hits
     *            The number of hits for the document.
     * @param referenceDoc
     *            The document itself.
     * @param referenceDocTerms
     *            The terms to query for.
     * @param field
     *            The field to get the terms for.
     * @return The table of identified term-sharing variation points.<br>
     *         Table entries: [sourceVPId, targetVPid, sharedTerms]
     * @throws IOException
     *             Any error while working with the search engine index.
     */
    private Table<String, String, Set<String>> buildSharedTermTable(IndexSearcher indexSearcher, ScoreDoc[] hits,
            Document referenceDoc, Set<Term> referenceDocTerms, String field) throws IOException {

        //[sourceVPId, targetVPid, sharedTerms]
        Table<String, String, Set<String>> sharedTermTable = HashBasedTable.create();
        for (int q = 0; q < hits.length; q++) {

            int indexDocId = hits[q].doc;

            String vpId = referenceDoc.get(Indexer.INDEX_VARIATIONPOINT);
            Document foundDoc = indexSearcher.doc(indexDocId);
            String foundVPId = foundDoc.get(Indexer.INDEX_VARIATIONPOINT);

            if (vpId.equals(foundVPId)) {
                continue;
            }

            Set<String> sharedTerms = determineSharedTerms(referenceDocTerms, foundDoc, indexDocId, field);

            // minShared terms is not check here because further
            // shared terms might be collected before this is evaluated.
            if (sharedTerms.size() > 0) {
                if (vpId.compareTo(foundVPId) > 0) {
                    String idTmp = vpId;
                    vpId = foundVPId;
                    foundVPId = idTmp;
                }

                // initialize the shared term list for the pair if not
                // done yet
                Set<String> set = sharedTermTable.get(vpId, foundVPId);
                if (set == null) {
                    set = new LinkedHashSet<String>();
                }
                set.addAll(sharedTerms);
                sharedTermTable.put(vpId, foundVPId, sharedTerms);
            }
        }
        return sharedTermTable;
    }

    /**
     * Determine the terms shared by the related variation points by looking up all terms included
     * in the search query AND a found document.
     *
     * @param referenceDocTerms
     *            The terms of the reference doc and used in the search query.
     * @param foundDoc
     *            A specific document found by the query.
     * @param foundDocId
     *            The id of the document found to get it's index terms.
     * @param field
     *            The field to get the terms for.
     *
     * @return The {@link Set} of terms shared between the query and the document.
     * @throws IOException
     */
    private Set<String> determineSharedTerms(Set<Term> referenceDocTerms, Document foundDoc, int foundDocId,
            String field) throws IOException {
        Set<String> sharedTerms = new TreeSet<String>();
        Terms termVector = reader.getTermVector(foundDocId, field);
        TermsEnum termsEnum = null;
        TermsEnum iterator = termVector.iterator(termsEnum);
        BytesRef br = null;
        while ((br = iterator.next()) != null) {
            String term = br.utf8ToString();
            for (Term t : referenceDocTerms) {
                if (t.text().equals(term)) {
                    sharedTerms.add(term);
                }
            }
        }
        return sharedTerms;
    }

    /**
     * Executes a query.
     *
     * @param indexSearcher
     *            The {@link IndexSearcher} to be used.
     * @param maxDoc
     *            The max. number of results.
     * @param query
     *            The {@link Query} to be executed.
     * @return The result of the search.
     * @throws IOException
     *             If there were errors while executing the query.
     */
    private ScoreDoc[] executeQuery(IndexSearcher indexSearcher, int maxDoc, Query query) throws IOException {
        TopScoreDocCollector collector = TopScoreDocCollector.create(maxDoc, true);
        indexSearcher.search(query, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;
        return hits;
    }

    /**
     * Extracts the frequencies of all {@link Term}s in the specified {@link Document}. Uses the
     * member reader.
     *
     * @param docId
     *            The ID of the {@link Document} to extract the {@link Term}s from.
     * @param fieldName
     *            The name of the field to extract frequencies from.
     * @return A {@link Map} containing the terms as the key and the related frequencies as
     *         {@link Integer} value.
     */
    private Map<String, Integer> getTermFrequencies(int docId, String fieldName) {
        Map<String, Integer> frequencies = new HashMap<String, Integer>();

        try {
            Terms vector = reader.getTermVector(docId, fieldName);
            if (vector == null) {
                return frequencies;
            }
            TermsEnum termsEnum = null;
            termsEnum = vector.iterator(termsEnum);
            BytesRef text = null;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int freq = (int) termsEnum.totalTermFreq();
                frequencies.put(term, freq);
            }
        } catch (IOException e) {
            logger.error("Failure while extracting Term Frequencies.");
        }
        return frequencies;
    }

    /**
     * This Method builds the {@link Query} the Finder uses to search similarities.
     *
     * @param fieldName
     *            The name of the field that should be searched.
     * @param termFrequencies
     *            A {@link Map} that contains all terms and their frequencies.
     * @return The {@link Query}.
     */
    private Query buildQuery(String fieldName, Map<String, Integer> termFrequencies) {
        BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
        BooleanQuery finalQuery = new BooleanQuery();

        // Add a TermQuery for each term in the document.
        for (String key : termFrequencies.keySet()) {
            Term t = new Term(fieldName, key);
            TermQuery termQuery = new TermQuery(t);
            finalQuery.add(termQuery, Occur.SHOULD);
        }

        finalQuery.setMinimumNumberShouldMatch(minSharedTerms);

        return finalQuery;
    }
}