org.splevo.vpm.analyzer.semantic.SemanticVPMAnalyzer.java Source code

Introduction

Here is the source code for org.splevo.vpm.analyzer.semantic.SemanticVPMAnalyzer.java
Source

/*******************************************************************************
 * Copyright (c) 2014
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *    Daniel Kojic - initial API and implementation and/or initial documentation
 *    Benjamin Klatt
 *    Christian Busch
 *******************************************************************************/
package org.splevo.vpm.analyzer.semantic;

import java.io.File;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.graphstream.graph.Node;
import org.splevo.vpm.analyzer.AbstractVPMAnalyzer;
import org.splevo.vpm.analyzer.VPMAnalyzerResult;
import org.splevo.vpm.analyzer.VPMEdgeDescriptor;
import org.splevo.vpm.analyzer.config.BooleanConfiguration;
import org.splevo.vpm.analyzer.config.ChoiceConfiguration;
import org.splevo.vpm.analyzer.config.NumericConfiguration;
import org.splevo.vpm.analyzer.config.Range;
import org.splevo.vpm.analyzer.config.StringConfiguration;
import org.splevo.vpm.analyzer.config.VPMAnalyzerConfigurationSet;
import org.splevo.vpm.analyzer.graph.VPMGraph;
import org.splevo.vpm.analyzer.semantic.extensionpoint.SemanticContent;
import org.splevo.vpm.analyzer.semantic.extensionpoint.SemanticContentProvider;
import org.splevo.vpm.analyzer.semantic.extensionpoint.SemanticContentProviderRegistry;
import org.splevo.vpm.analyzer.semantic.extensionpoint.UnsupportedSoftwareElementException;
import org.splevo.vpm.analyzer.semantic.lucene.Indexer;
import org.splevo.vpm.analyzer.semantic.lucene.Stemming;
import org.splevo.vpm.analyzer.semantic.lucene.finder.SharedTermFinder;
import org.splevo.vpm.software.SoftwareElement;
import org.splevo.vpm.variability.Variant;
import org.splevo.vpm.variability.VariationPoint;

import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.collect.Table;
import com.google.common.collect.Table.Cell;

/**
 * <h2>What it does</h2>
 * <p>
 * The semantic relationship VPM analyzer finds semantic relationships between several
 * {@link VariationPoint}s. Several configuration options allow the search to be customized
 * as needed.
 * </p>
 *
 * <h2>How it works</h2>
 * <p>
 * As a first step, the analyzer extracts all relevant content from a VPMGraph and stores it
 * in a Lucene index. By storing additional information about the indexed text, Lucene
 * provides the ability to extract vectors from the indexed content. The analyzer uses several
 * finders to search for semantic dependencies. The results can be displayed within the VPMGraph
 * or the Refinement Browser.
 * </p>
 */
public class SemanticVPMAnalyzer extends AbstractVPMAnalyzer {

    /** The relationship label of the analyzer, returned by {@link #getRelationshipLabel()}. */
    private static final String RELATIONSHIP_LABEL_SEMANTIC = "Semantic";

    /** The displayed name of the analyzer, returned by {@link #getName()}. */
    private static final String DISPLAYED_NAME = "Semantic VPM Analyzer";

    /** The logger for this class. */
    private Logger logger = Logger.getLogger(SemanticVPMAnalyzer.class);

    /** The indexer instance, obtained from {@link Indexer#getInstance()}. */
    private Indexer indexer = Indexer.getInstance();

    /**
     * The configuration-object for the include comments configuration. Its value is read both
     * when filling the index and when searching for relationships. The third constructor
     * argument is null here (presumably the explanation text slot; confirm against
     * BooleanConfiguration).
     */
    private BooleanConfiguration includeCommentsConfig = new BooleanConfiguration(Config.CONFIG_ID_INCLUDE_COMMENTS,
            Config.LABEL_INCLUDE_COMMENTS, null, Config.DEFAULT_INCLUDE_COMMENTS);

    /**
     * The configuration-object for the split on case change configuration. Its value is
     * forwarded to the indexer before indexing.
     */
    private BooleanConfiguration splitCamelCaseConfig = new BooleanConfiguration(Config.CONFIG_ID_SPLIT_CAMEL_CASE,
            Config.LABEL_SPLIT_CAMEL_CASE, null, Config.DEFAULT_SPLIT_CAMEL_CASE);

    /**
     * The configuration-object for the stemming to be used. The selected value is resolved to a
     * {@link Stemming} constant by name before indexing.
     */
    private ChoiceConfiguration stemmingConfig = new ChoiceConfiguration(Config.CONFIG_ID_STEMMING,
            Config.LABEL_STEMMING, null, Config.DEFAULT_STEMMING, Config.AVAILABLEVALUES_STEMMING);

    /**
     * The configuration-object for the stop words configuration. The value is split on single
     * spaces before it is handed to the indexer.
     */
    private StringConfiguration stopWordsConfig = new StringConfiguration(Config.CONFIG_ID_STOP_WORDS,
            Config.LABEL_STOP_WORDS, Config.EXPL_STOP_WORDS, Config.DEFAULT_STOP_WORDS_HOST, false);

    /**
     * The configuration-object for the minimum number of shared terms configuration. When no
     * value is set, the relationship search falls back to a minimum of 1.
     */
    private NumericConfiguration minSharedTermConfig = new NumericConfiguration(
            Config.CONFIG_ID_SHARED_TERM_MINIMUM, Config.LABEL_SHARED_TERM_MINIMUM, Config.EXPL_SHARED_TERM_MINIMUM,
            Config.DEFAULT_SHARED_TERM_MINIMUM, 1, new Range<Integer>(1, Integer.MAX_VALUE));

    /**
     * The configuration-object for the log indexed terms configuration. A non-empty value
     * enables the term log and is used as the base directory of the log file.
     */
    private StringConfiguration logIndexedTermsConfig = new StringConfiguration(Config.CONFIG_ID_LOG_INDEXED_TERMS,
            Config.LABEL_LOG_INDEXED_TERMS, Config.EXPL_LOG_INDEXED_TERMS, Config.DEFAULT_LOG_INDEXED_TERMS, true);

    /**
     * The configuration-object for the feature term configuration. The value is split on single
     * spaces into the feature term set handed to the indexer.
     */
    private StringConfiguration featureTermConfig = new StringConfiguration(Config.CONFIG_ID_FEATURE_TERMS,
            Config.LABEL_FEATURE_TERMS, Config.EXPL_FEATURE_TERMS, Config.DEFAULT_FEATURE_TERMS, false);

    /**
     * The configuration-object for the featured terms only configuration. Its value is
     * forwarded to the indexer before indexing.
     */
    private BooleanConfiguration featuredTermsOnlyConfig = new BooleanConfiguration(
            Config.CONFIG_ID_FEATURE_TERMS_ONLY, Config.LABEL_FEATURE_TERMS_ONLY, Config.EXPL_FEATURE_TERMS_ONLY,
            Config.DEFAULT_FEATURE_TERMS_ONLY);

    /**
     * The configuration-object for the similar term sets only configuration. When enabled,
     * variation points whose connections do not all share the same term set are filtered out.
     */
    private BooleanConfiguration similarTermSetOnlyConfig = new BooleanConfiguration(
            Config.CONFIG_ID_SIMILAR_TERM_SET_ONLY, Config.LABEL_SIMILAR_TERM_SET_ONLY,
            Config.EXPL_SIMILAR_TERM_SET_ONLY, Config.DEFAULT_SIMILAR_TERM_SET_ONLY);

    /**
     * The configuration-object for the one shared term only configuration. It is only evaluated
     * when the similar term sets only configuration is enabled.
     */
    private BooleanConfiguration oneSharedTermOnlyConfig = new BooleanConfiguration(
            Config.CONFIG_ID_ONE_SHARED_TERM_ONLY, Config.LABEL_ONE_SHARED_TERM_ONLY,
            Config.EXPL_ONE_SHARED_TERM_ONLY, Config.DEFAULT_ONE_SHARED_TERM_ONLY);

    /**
     * Analyzes the given graph for semantic relationships between its variation points.
     *
     * <p>
     * The content of all variation points is written to the index, the indexed terms are
     * optionally logged, and the shared-term search is executed. As soon as indexing has been
     * attempted, the index is cleared again before this method returns, no matter which of the
     * later steps failed. This prevents partially indexed content from leaking into a later run.
     * </p>
     *
     * @param vpmGraph
     *            The graph to analyze. Must not be null.
     * @return The analysis result, or null if the graph is empty or the index could not be
     *         written or read.
     * @throws IllegalArgumentException
     *             If the given graph is null.
     */
    @Override
    public VPMAnalyzerResult analyze(VPMGraph vpmGraph) {
        if (vpmGraph == null) {
            throw new IllegalArgumentException("The VPM graph to analyze must not be null.");
        }
        if (vpmGraph.getNodeCount() == 0) {
            logger.info("Got empty VPM Graph. No analysis executed.");
            return null;
        }

        VPMAnalyzerResult result = null;
        try {
            // Fill the index.
            try {
                fillIndex(vpmGraph);
            } catch (Exception e) {
                logger.error("Cannot write Index. Close all open IndexWriters first.", e);
                // The finally block below still clears whatever was indexed so far.
                return null;
            }

            if (!Strings.isNullOrEmpty(logIndexedTermsConfig.getCurrentValue())) {
                logIndexedTerms();
            }

            // Find relationships.
            try {
                result = findRelationships(vpmGraph);
            } catch (IOException e) {
                logger.error("Cannot read Index. Close all open IndexWriters first.", e);
            }
        } finally {
            cleanUp();
        }

        return result;
    }

    /**
     * Writes all terms currently stored in the index, each with its count as reported by
     * {@link #getTermsFromIndex()}, as CSV lines to the file "indexed-terms.csv" in the current
     * log sub directory. A failure to write the file is logged and not propagated.
     */
    private void logIndexedTerms() {
        List<String> csvLines = Lists.newLinkedList();
        csvLines.add("Term,VPCount");
        for (Map.Entry<String, Integer> termEntry : getTermsFromIndex().entrySet()) {
            csvLines.add(termEntry.getKey() + "," + termEntry.getValue());
        }

        File termLog = new File(getCurrentLogSubDirectory() + "indexed-terms.csv");
        try {
            FileUtils.writeLines(termLog, csvLines);
        } catch (IOException e) {
            logger.error("Failed to write term log", e);
        }
    }

    /**
     * Builds the log directory for the current run: the configured base path followed by a
     * sub directory named after the current time (pattern yyyy-MM-dd-HHmmss).
     *
     * @return The configured base log directory, a file separator, the timestamp and a trailing
     *         file separator.
     */
    private String getCurrentLogSubDirectory() {
        DateFormat timeStampFormat = new SimpleDateFormat("yyyy-MM-dd-HHmmss");
        String timeStamp = timeStampFormat.format(Calendar.getInstance().getTime());

        StringBuilder path = new StringBuilder();
        path.append(logIndexedTermsConfig.getCurrentValue());
        path.append(File.separator);
        path.append(timeStamp);
        path.append(File.separator);
        return path.toString();
    }

    /**
     * Clears the index of the shared {@link Indexer} instance. An {@link IOException} raised
     * while clearing is logged and not propagated.
     */
    private void cleanUp() {
        Indexer mainIndexer = Indexer.getInstance();
        try {
            mainIndexer.clearIndex();
        } catch (IOException e) {
            logger.error("Failure while trying to empty main index.", e);
        }
    }

    /**
     * Reads all terms of the content field from the index together with the number of indexed
     * documents each term occurs in.
     *
     * <p>
     * Any failure while reading is logged; the terms collected up to that point are returned.
     * The index reader is always closed, including when the field has no terms or reading fails.
     * </p>
     *
     * @return A map from term to document frequency in index iteration order. Empty if the
     *         content field has no terms or the index could not be read.
     */
    private Map<String, Integer> getTermsFromIndex() {
        Map<String, Integer> indexedTerms = Maps.newLinkedHashMap();
        DirectoryReader indexReader = null;
        try {
            indexReader = indexer.getIndexReader();
            Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(Indexer.INDEX_CONTENT);
            if (terms != null) {
                TermsEnum termEnum = terms.iterator(null);
                BytesRef byteRef = null;

                while ((byteRef = termEnum.next()) != null) {
                    String term = byteRef.utf8ToString();
                    int count = indexReader.docFreq(new Term(Indexer.INDEX_CONTENT, byteRef));
                    indexedTerms.put(term, Integer.valueOf(count));
                }
            }
        } catch (Exception e) {
            logger.error("Failed to dump index", e);
        } finally {
            // Close on every path; previously the reader leaked on early return and on errors.
            if (indexReader != null) {
                try {
                    indexReader.close();
                } catch (IOException e) {
                    logger.error("Failed to close index reader", e);
                }
            }
        }
        return indexedTerms;
    }

    /*
     * (non-Javadoc)
     *
     * Registers the analyzer's configuration objects in three groups: general options,
     * featured-term options and shared-term-finder options.
     *
     * @see org.splevo.vpm.analyzer.VPMAnalyzer#getConfigurations()
     */
    @Override
    public VPMAnalyzerConfigurationSet getConfigurations() {
        VPMAnalyzerConfigurationSet configSet = new VPMAnalyzerConfigurationSet();

        // General options.
        configSet.addConfigurations(Config.CONFIG_GROUP_GENERAL,
                stopWordsConfig, stemmingConfig, similarTermSetOnlyConfig, oneSharedTermOnlyConfig);

        // Featured term options.
        configSet.addConfigurations(Config.CONFIG_GROUP_FEATURED_TERMS,
                featuredTermsOnlyConfig, featureTermConfig);

        // Shared term finder options.
        configSet.addConfigurations(Config.CONFIG_GROUP_SHARED_TERM_FINDER,
                includeCommentsConfig, splitCamelCaseConfig, minSharedTermConfig, logIndexedTermsConfig);

        return configSet;
    }

    /**
     * Returns the name of this analyzer.
     *
     * @return The constant {@link #DISPLAYED_NAME}.
     */
    @Override
    public String getName() {
        return DISPLAYED_NAME;
    }

    /**
     * Returns the label of the relationships detected by this analyzer.
     *
     * @return The constant {@link #RELATIONSHIP_LABEL_SEMANTIC}.
     */
    @Override
    public String getRelationshipLabel() {
        return RELATIONSHIP_LABEL_SEMANTIC;
    }

    /**
     * Configures the indexer from the current user settings and then indexes the variation
     * point of every node of the given graph.
     *
     * @param vpmGraph
     *            The {@link VPMGraph} containing the information to be indexed. Must not be
     *            null.
     */
    private void fillIndex(VPMGraph vpmGraph) {
        if (vpmGraph == null) {
            throw new IllegalArgumentException();
        }

        // Read every user setting up front, before the indexer is modified.
        boolean indexComments = includeCommentsConfig.getCurrentValue();
        boolean splitCamelCase = splitCamelCaseConfig.getCurrentValue();
        String stopWords = stopWordsConfig.getCurrentValue();
        String featureTerms = featureTermConfig.getCurrentValue();
        boolean featuredTermsOnly = featuredTermsOnlyConfig.getCurrentValue();
        Stemming stemming = Stemming.valueOf(stemmingConfig.getCurrentValue());

        // Apply the settings to the indexer.
        this.indexer.setSplitCamelCase(splitCamelCase);
        this.indexer.setStemming(stemming);

        if (stopWords != null) {
            String[] stopWordArray = stopWords.split(" ");
            this.indexer.setStopWords(stopWordArray);
        }

        if (featureTerms != null) {
            // An empty configuration value results in an empty feature term set.
            HashSet<String> featureTermSet = new HashSet<String>();
            if (featureTerms.length() > 0) {
                featureTermSet.addAll(Arrays.asList(featureTerms.split(" ")));
            }
            this.indexer.setFeatureTermSet(featureTermSet);
        }
        this.indexer.setFeaturedTermsOnly(featuredTermsOnly);

        // Index the variation point attached to each graph node under the node's id.
        for (Node graphNode : vpmGraph.getNodeSet()) {
            VariationPoint variationPoint = graphNode.getAttribute(VPMGraph.VARIATIONPOINT, VariationPoint.class);
            indexNode(graphNode.getId(), variationPoint, indexComments);
        }
    }

    /**
     * Collects the code and comment terms of all software elements implementing the variants of
     * the given variation point and adds them to the index under the given id. An
     * {@link IOException} raised while adding to the index is logged and not propagated.
     *
     * @param id
     *            The ID used to store the text in the Lucene index.
     * @param vp
     *            The corresponding {@link VariationPoint}.
     * @param matchComments
     *            Determines if comments should be indexed or ignored.
     */
    private void indexNode(String id, VariationPoint vp, boolean matchComments) {
        if (id == null) {
            throw new IllegalArgumentException();
        }
        if (vp == null) {
            throw new IllegalArgumentException();
        }

        List<String> codeTerms = Lists.newLinkedList();
        List<String> commentTerms = Lists.newLinkedList();

        // Gather the terms of every implementing element of every variant.
        for (Variant variant : vp.getVariants()) {
            for (SoftwareElement element : variant.getImplementingElements()) {
                loadTermsForSoftwareElement(matchComments, element, codeTerms, commentTerms);
            }
        }

        String codeText = convertListToString(codeTerms);
        String commentText = convertListToString(commentTerms);

        try {
            this.indexer.addToIndex(id, codeText, commentText);
        } catch (IOException e) {
            logger.error("Failure while adding node to index.", e);
        }
    }

    /**
     * Asks every registered semantic content provider for the relevant content of a software
     * element and appends the returned code and comment terms to the given lists. Providers
     * that reject the element with an {@link UnsupportedSoftwareElementException} are skipped.
     *
     * @param matchComments
     *            The flag if comments should be considered; passed on to the providers.
     * @param softwareElement
     *            The software element to get the terms for.
     * @param codeTerms
     *            The list to append code terms to.
     * @param commentTerms
     *            The list to append comment terms to.
     */
    private void loadTermsForSoftwareElement(boolean matchComments, SoftwareElement softwareElement,
            List<String> codeTerms, List<String> commentTerms) {
        SemanticContentProviderRegistry registry = SemanticContentProviderRegistry.getInstance();
        List<SemanticContentProvider> providers = registry.getElements();

        for (SemanticContentProvider provider : providers) {
            SemanticContent content;
            try {
                content = provider.getRelevantContent(softwareElement, matchComments);
            } catch (UnsupportedSoftwareElementException e) {
                // This provider cannot handle the element; move on to the next one.
                continue;
            }
            codeTerms.addAll(content.getCode());
            commentTerms.addAll(content.getComments());
        }
    }

    /**
     * Joins the entries of a list into a single string, separated by single spaces. Null
     * entries are skipped.
     *
     * @param list
     *            The list of strings to join.
     * @return The space-separated concatenation of all non-null entries.
     */
    private String convertListToString(List<String> list) {
        Joiner spaceJoiner = Joiner.on(" ").skipNulls();
        return spaceJoiner.join(list);
    }

    /**
     * Finds semantic relationships between the variation points.
     *
     * <p>
     * The shared-term search is run on the index. Pairs of variation points sharing at least
     * the configured minimum number of terms are turned into edge descriptors, unless one of
     * the two variation points is excluded by the filter from
     * {@link #buildImpreciseVPFilter(Table)}. The index reader is closed even if the search
     * fails.
     * </p>
     *
     * @param graph
     *            The {@link VPMGraph} to extract the IDs of the result nodes from.
     * @return A {@link VPMAnalyzerResult} containing the search results.
     * @throws IOException
     *             If the index cannot be read, e.g. because there is already an open index
     *             writer.
     */
    private VPMAnalyzerResult findRelationships(VPMGraph graph) throws IOException {

        // Get the configurations
        boolean includeComments = includeCommentsConfig.getCurrentValue();
        int minSharedTerms = 1;
        if (minSharedTermConfig.getCurrentValue() != null) {
            minSharedTerms = minSharedTermConfig.getCurrentValue().intValue();
        }

        Table<String, String, Set<String>> sharedTermTable;
        DirectoryReader reader = Indexer.getInstance().getIndexReader();
        try {
            SharedTermFinder finder = new SharedTermFinder(reader, includeComments, minSharedTerms);
            sharedTermTable = finder.findSimilarEntries();
        } finally {
            // Release the reader on failure as well; previously it leaked when the search threw.
            reader.close();
        }

        Set<String> vpFilter = buildImpreciseVPFilter(sharedTermTable);

        VPMAnalyzerResult result = new VPMAnalyzerResult(this);
        ArrayList<String> edgeRegistry = Lists.newArrayList();
        for (Cell<String, String, Set<String>> cell : sharedTermTable.cellSet()) {
            String id1 = cell.getRowKey();
            String id2 = cell.getColumnKey();

            // Skip pairs involving a variation point that was filtered out as imprecise.
            if (vpFilter.contains(id1) || vpFilter.contains(id2)) {
                continue;
            }

            Set<String> sharedTerms = cell.getValue();

            if (sharedTerms.size() >= minSharedTerms) {
                Node node1 = graph.getNode(id1);
                Node node2 = graph.getNode(id2);
                String subLabel = convertListToString(Lists.newArrayList(sharedTerms));

                // A null descriptor means no edge is to be registered for this pair.
                VPMEdgeDescriptor edge = buildEdgeDescriptor(node1, node2, subLabel, edgeRegistry);
                if (edge != null) {
                    logAnalysisInfo(id1, id2, "", "", String.format("Shared terms: %s", subLabel));
                    result.getEdgeDescriptors().add(edge);
                }
            }
        }
        return result;
    }

    /**
     * Builds the filter of variation points that do not share the same set of terms with all of
     * their connected variation points.
     *
     * <p>
     * The filter is only built if the "similar term set only" configuration is enabled;
     * otherwise an empty set is returned. If the "one shared term only" configuration is
     * enabled as well, variation points sharing more than one term are also included in the
     * filter.
     * </p>
     *
     * <p>
     * Variation points are checked as row keys and as column keys, because sharing terms is a
     * bi-directional relationship and some variation points might be contained in only one
     * direction.
     * </p>
     *
     * @param sharedTermTable
     *            The table of variation points, their referenced other variation points and the
     *            terms they share.
     * @return The set of variation point ids to not register any relationships for.
     */
    private Set<String> buildImpreciseVPFilter(Table<String, String, Set<String>> sharedTermTable) {
        Set<String> vpFilter = Sets.newLinkedHashSet();
        if (!similarTermSetOnlyConfig.getCurrentValue()) {
            return vpFilter;
        }

        boolean oneSharedTermOnly = oneSharedTermOnlyConfig.getCurrentValue();

        // CHECK ROWS
        for (String referenceVP : sharedTermTable.rowKeySet()) {
            if (hasImpreciseTermSets(sharedTermTable.row(referenceVP).values(), oneSharedTermOnly)) {
                vpFilter.add(referenceVP);
            }
        }

        // CHECK COLUMNS
        for (String referenceVP : sharedTermTable.columnKeySet()) {
            if (hasImpreciseTermSets(sharedTermTable.column(referenceVP).values(), oneSharedTermOnly)) {
                vpFilter.add(referenceVP);
            }
        }
        return vpFilter;
    }

    /**
     * Checks whether the term sets a variation point shares with its connected variation points
     * are imprecise, i.e. not all equal or, if requested, containing more than one term.
     *
     * @param termSets
     *            The term sets shared with each connected variation point.
     * @param oneSharedTermOnly
     *            If true, a first term set with more than one term counts as imprecise.
     * @return True if the term sets differ from each other or violate the one-term restriction.
     */
    private boolean hasImpreciseTermSets(Iterable<Set<String>> termSets, boolean oneSharedTermOnly) {
        Set<String> referenceTerms = null;
        for (Set<String> currentTerms : termSets) {
            if (referenceTerms == null) {
                // The first set becomes the reference. All later sets must equal it, so
                // checking the size restriction on the first set is sufficient.
                if (oneSharedTermOnly && currentTerms.size() > 1) {
                    return true;
                }
                referenceTerms = currentTerms;
            } else if (!referenceTerms.containsAll(currentTerms)
                    || !currentTerms.containsAll(referenceTerms)) {
                return true;
            }
        }
        return false;
    }
}