knowledgeMiner.mining.SentenceParserHeuristic.java Source code

Introduction

Here is the source code for knowledgeMiner.mining.SentenceParserHeuristic.java
Source

/*******************************************************************************
 * Copyright (C) 2013 University of Waikato, Hamilton, New Zealand.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl.html
 * 
 * Contributors:
 *    Sam Sarjant - initial API and implementation
 ******************************************************************************/
package knowledgeMiner.mining;

import io.IOManager;
import io.ResourceAccess;
import io.ontology.OntologySocket;
import io.resources.WMISocket;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import knowledgeMiner.mapping.CycMapper;
import knowledgeMiner.mapping.textToCyc.TextMappedConcept;
import opennlp.tools.parser.Parse;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import util.Tree;
import util.text.OpenNLP;
import util.wikipedia.WikiParser;
import cyc.AssertionArgument;
import cyc.CycConstants;
import cyc.MappableConcept;

public class SentenceParserHeuristic extends MiningHeuristic {
    /** Verb forms treated as copulas; isCopula compares against these ignoring case. */
    private static final String[] COPULAS = { "is", "are", "was", "were", "be", "am", "being", "been" };
    /**
     * Extraction patterns, filled in by the static initialiser. No code visible
     * in this file reads the collection.
     */
    private static final Collection<ExtractionPattern> EXTRACTION_PATTERNS;
    /** Class logger. */
    private final static Logger logger_ = LoggerFactory.getLogger(SentenceParserHeuristic.class);
    /**
     * Patterns tried in order by parseLine when a sentence cannot be parsed:
     * first a leading "In ..., " clause, then a comma-delimited aside that
     * directly precedes " is", " was", " are" or " were". The first match of
     * the first pattern that changes the sentence is removed.
     */
    private static final Pattern[] SENTENCE_SIMPLIFIER = { Pattern.compile("^In [^,.]+, "),
            Pattern.compile("(?<=[^,.]+), [^,.]+,(?= (is|was|are|were))") };
    /**
     * Sentence prefix constant. Not referenced inside this class; presumably
     * used by callers to build synthetic sentences - TODO confirm.
     */
    public static final String SENTENCE_PREFIX = "TOPICS is a ";

    /**
     * Constructs the heuristic.
     *
     * @param mapper
     *            The mapper handed to the MiningHeuristic constructor.
     * @param miner
     *            The miner handed to the MiningHeuristic constructor.
     */
    public SentenceParserHeuristic(CycMapper mapper, CycMiner miner) {
        // The meaning of the leading boolean is defined by MiningHeuristic,
        // which is not visible from this file.
        super(false, mapper, miner);
        // Return value is discarded; presumably this forces the parser to be
        // loaded up front rather than on first use - TODO confirm.
        OpenNLP.getParser();
    }

    /**
     * Recursively walks a parse tree, tracking the predicate text contributed
     * by VP and PP nodes, and creates partial assertions for NP nodes that are
     * reached with a non-empty predicate.
     *
     * @param parse
     *            The parse node to process.
     * @param predicateStrs
     *            The predicate strings accumulated so far; null is treated as
     *            a single empty predicate. Each recursive call receives a copy
     *            of the array.
     * @param focusConcept
     *            The concept passed on to every created assertion.
     * @param anchors
     *            The anchor replacement map used when composing noun strings.
     * @param wmi
     *            Only passed down the recursion by this method.
     * @param cyc
     *            Only passed down the recursion by this method.
     * @param heuristic
     *            The heuristic recorded in the provenance of created
     *            assertions.
     * @param results
     *            The collection that newly created assertions are added to.
     * @return The covered text of the node, or null if the node is a VP or PP,
     *         if any child returned null, or if this node added at least one
     *         new assertion to the results.
     * @throws Exception
     *             Declared by the signature; no checked exception is raised
     *             explicitly in this method body.
     */
    private String disambiguateTree(Parse parse, String[] predicateStrs, MappableConcept focusConcept,
            SortedMap<String, String> anchors, WMISocket wmi, OntologySocket cyc, MiningHeuristic heuristic,
            Collection<PartialAssertion> results) throws Exception {
        // Top-level call: start with a single empty predicate
        if (predicateStrs == null) {
            predicateStrs = new String[1];
            predicateStrs[0] = "";
        }

        Parse[] children = parse.getChildren();
        String type = parse.getType();
        String text = parse.getCoveredText();

        // No children? Return value
        if (children.length == 0)
            return text;

        // Recurse to 'left' (the first child), handing it its own copy of the
        // predicates so its changes do not leak back into this frame
        int childIndex = 0;
        String left = disambiguateTree(children[childIndex++], Arrays.copyOf(predicateStrs, predicateStrs.length),
                focusConcept, anchors, wmi, cyc, heuristic, results);

        // If VP or PP, add to predicate
        boolean canCreate = true;
        if (left != null) {
            if (type.equals("VP"))
                // The first child's text replaces the primary predicate
                predicateStrs[0] = left.trim();
            else if (type.equals("PP")) {
                // If PP, split recursion into two predicates: the primary
                // predicate extended by the first child's text and, when
                // there was already predicate text, the first child's text on
                // its own
                predicateStrs[0] = (predicateStrs[0] + " " + left).trim();
                if (!predicateStrs[0].equals(left)) {
                    predicateStrs = Arrays.copyOf(predicateStrs, predicateStrs.length + 1);
                    predicateStrs[predicateStrs.length - 1] = left;
                }
            }
        } else
            // The first child returned null, so nothing is created here
            canCreate = false;

        // Process the remaining children with the (possibly updated)
        // predicates; any child returning null blocks creation at this node
        for (; childIndex < children.length; childIndex++) {
            Parse childParse = children[childIndex];
            String result = disambiguateTree(childParse, Arrays.copyOf(predicateStrs, predicateStrs.length),
                    focusConcept, anchors, wmi, cyc, heuristic, results);
            if (result == null) {
                canCreate = false;
            }
        }

        // VP and PP nodes always report null to their parent
        if (type.equals("VP") || type.equals("PP"))
            return null;

        // Can create and we have a target and predicate(s)
        if (canCreate && type.equals("NP") && !predicateStrs[0].isEmpty()) {
            for (String predStr : predicateStrs) {
                AssertionArgument predicate = null;
                if (isCopula(predStr)) {
                    predicate = CycConstants.ISA_GENLS.getConcept();
                } else {
                    // Non-copula predicates are currently left null and
                    // therefore skipped below.
                    // TODO Figure out a safe way to parse predicates. Probably
                    // need to look at the parse code again.
                    // predStr = reAnchorString(predStr, anchors);
                    // predicate = new TextMappedConcept(predStr, true, true);
                }

                if (predicate == null)
                    continue;

                // Return the possible noun strings
                Collection<Tree<String>> nounStrs = composeAdjNounsTree(parse, anchors);
                // NOTE(review): the regex below matches a backslash followed
                // by a newline character, not a plain newline - confirm that
                // this is what was intended.
                logger_.trace("createAssertions: " + predicate.toString() + " "
                        + nounStrs.toString().replaceAll("\\\\\n", " "));

                // Recurse through the tree and build the partial assertions
                HeuristicProvenance provenance = new HeuristicProvenance(heuristic, predStr + "+" + text);
                Collection<PartialAssertion> currAssertions = recurseStringTree(predicate, focusConcept, nounStrs,
                        provenance);

                // Add the assertions not already present. Adding one clears
                // canCreate, so this node returns null to its parent.
                for (PartialAssertion pa : currAssertions)
                    if (!results.contains(pa)) {
                        results.add(pa);
                        canCreate = false;
                    }
            }
        }

        if (!canCreate)
            return null;

        return text;
    }

    /**
     * Re-inserts the anchors into the text, then collects every anchor that
     * WikiParser.ANCHOR_PARSER_ROUGH finds in the result.
     *
     * @param text
     *            The text to re-anchor and scan.
     * @param anchors
     *            The re-anchoring map applied to the text.
     * @param anchorMap
     *            Output map; each found anchor tree is stored under group 2 of
     *            the match, or group 1 when group 2 is null.
     * @return One tree per match, in match order, each holding the full
     *         matched text.
     */
    private Collection<Tree<String>> extractAnchors(String text, SortedMap<String, String> anchors,
            Map<String, Tree<String>> anchorMap) {
        Collection<Tree<String>> found = new ArrayList<>();
        String reanchored = reAnchorString(text, anchors);
        Matcher anchorMatcher = WikiParser.ANCHOR_PARSER_ROUGH.matcher(reanchored);
        while (anchorMatcher.find()) {
            Tree<String> tree = new Tree<String>(anchorMatcher.group());
            // Prefer the second capture group, falling back to the first
            String key = anchorMatcher.group(2);
            if (key == null)
                key = anchorMatcher.group(1);
            anchorMap.put(key, tree);
            found.add(tree);
        }
        return found;
    }

    /**
     * Checks whether a verb phrase is one of the known copulas, ignoring
     * case.
     *
     * @param verbPhrase
     *            The phrase to check; must not be null.
     * @return True if the phrase equals an entry of COPULAS (case-insensitive).
     */
    private boolean isCopula(String verbPhrase) {
        int index = 0;
        while (index < COPULAS.length) {
            if (verbPhrase.equalsIgnoreCase(COPULAS[index]))
                return true;
            index++;
        }
        return false;
    }

    /**
     * Pairs the nouns and adjectives together to produce a number of result
     * text fragments to be resolved to concepts.
     * <p>
     * The children of the parse are visited right-to-left. Consecutive nouns
     * (NN* or NP) are accumulated into progressively longer noun phrases; each
     * run of adjectives (JJ* or ADJP) directly in front of them is combined
     * with every noun phrase collected so far. Any non-noun child ends the
     * current noun run, so the next noun encountered starts a fresh set.
     *
     * @param parse
     *            The parse to process and pair.
     * @param anchors
     *            The anchor map; must not be null.
     * @param existingAnchorTrees
     *            The anchor trees already added to the results (for reuse and
     *            subtree-ing)
     * @return A collection of possible mappable entities composed of at least
     *         one noun and possible adjectives (with sub-adjectives).
     */
    private Collection<Tree<String>> pairNounAdjs(Parse parse, SortedMap<String, String> anchors,
            Map<String, Tree<String>> existingAnchorTrees) {
        Collection<Tree<String>> results = new ArrayList<>();
        boolean createNewNounSet = false;
        ArrayList<String> nounPhrases = new ArrayList<>();
        Parse[] children = parse.getChildren();
        for (int i = children.length - 1; i >= 0; i--) {
            String childType = children[i].getType();
            if (childType.startsWith("NN") || childType.equals("NP")) {
                // A non-noun was seen since the last noun run: start afresh.
                // The flag is reset so that the nouns following this one (to
                // its left) accumulate onto it instead of each wiping the set.
                if (createNewNounSet) {
                    nounPhrases.clear();
                    createNewNounSet = false;
                }

                // Prepend this noun to the longest noun phrase so far
                String existingNounPhrase = nounPhrases.isEmpty() ? ""
                        : nounPhrases.get(nounPhrases.size() - 1);
                String np = (children[i].getCoveredText() + " " + existingNounPhrase).trim();
                nounPhrases.add(np);

                // Add to the tree (if not a pure anchor)
                if (!anchors.containsKey(np))
                    results.add(new Tree<String>(reAnchorString(np, anchors)));
                continue;
            }

            // Adjectives are only useful when there is a noun phrase to attach
            // them to
            if (isAdjectival(childType) && !nounPhrases.isEmpty()) {
                StringBuilder adjective = new StringBuilder();
                // Scan forward over the adjective run starting here; the bound
                // on j guards against running off the end of the children
                for (int j = i; j < children.length && isAdjectival(children[j].getType()); j++) {
                    // Build adjective combinations
                    if (adjective.length() != 0)
                        adjective.append(" ");
                    adjective.append(children[j].getCoveredText());
                    String adjText = adjective.toString();
                    for (String np : nounPhrases) {
                        // Create the tree (with sub adjective tree), reusing
                        // an existing anchor tree when there is one
                        String adjNP = adjText + " " + np;
                        Tree<String> adjP = existingAnchorTrees.get(adjNP);
                        if (adjP == null)
                            adjP = new Tree<String>(reAnchorString(adjNP, anchors));
                        if (!anchors.containsKey(adjText))
                            adjP.addSubValue(reAnchorString(adjText, anchors));

                        // Add to the tree
                        results.add(adjP);
                    }
                }
            }

            // Any non-noun child terminates the current noun run
            createNewNounSet = true;
        }
        return results;
    }

    /** Returns true if the parse type denotes an adjective (JJ*) or adjective phrase (ADJP). */
    private static boolean isAdjectival(String type) {
        return type.startsWith("JJ") || type.equals("ADJP");
    }

    /**
     * Strips "-&lt;digits&gt;" index suffixes from the words of a string and then
     * applies the anchor map to it via WikiParser.replaceAll.
     *
     * @param str
     *            A string with indices on the end of each word.
     * @param anchors
     *            The anchor map to apply; if null the string is returned
     *            untouched. Entries with a null value are skipped.
     * @return The string without index suffixes and with the anchors applied.
     */
    private String reAnchorString(String str, SortedMap<String, String> anchors) {
        if (anchors == null)
            return str;

        String reanchored = str.replaceAll("(\\S+)-\\d+(?= |$)", "$1");
        for (Map.Entry<String, String> entry : anchors.entrySet()) {
            String replacement = entry.getValue();
            if (replacement == null)
                continue;
            reanchored = WikiParser.replaceAll(reanchored, entry.getKey(), replacement);
        }
        return reanchored;
    }

    /**
     * Builds one partial assertion per text fragment, mirroring the hierarchy
     * of the fragment trees: the assertions built from a tree's subtrees are
     * attached as sub-assertions of the assertion built from the tree itself.
     *
     * @param predicate
     *            The predicate used for every assertion.
     * @param focusConcept
     *            The concept passed to every assertion along with the
     *            fragment.
     * @param nounStrs
     *            The fragment trees to convert.
     * @param provenance
     *            The provenance attached to every assertion.
     * @return The assertions for the top-level trees, in iteration order.
     */
    private Collection<PartialAssertion> recurseStringTree(AssertionArgument predicate,
            MappableConcept focusConcept, Collection<Tree<String>> nounStrs, HeuristicProvenance provenance) {
        Collection<PartialAssertion> built = new ArrayList<>();
        for (Tree<String> node : nounStrs) {
            TextMappedConcept fragment = new TextMappedConcept(node.getValue(), false, false);
            PartialAssertion assertion = new PartialAssertion(predicate, provenance, focusConcept, fragment);

            // Only descend when the node actually has subtrees
            if (!node.getSubTrees().isEmpty()) {
                Collection<PartialAssertion> nested = recurseStringTree(predicate, focusConcept,
                        node.getSubTrees(), provenance);
                for (PartialAssertion sub : nested)
                    assertion.addSubAssertion(sub);
            }
            built.add(assertion);
        }
        return built;
    }

    /**
     * Flags the TAXONOMIC and NON_TAXONOMIC entries of the given array, using
     * each constant's ordinal as the index.
     *
     * @param infoTypes
     *            The flag array to update in place.
     */
    @Override
    protected void setInformationTypes(boolean[] infoTypes) {
        InformationType[] flagged = { InformationType.TAXONOMIC, InformationType.NON_TAXONOMIC };
        for (InformationType infoType : flagged)
            infoTypes[infoType.ordinal()] = true;
    }

    /**
     * Composes the set of noun/adjective fragment trees for a parse node. The
     * anchors found in the node's covered text come first, followed by the
     * noun and adjective combinations built from the node's children.
     *
     * @param parse
     *            The parse to compose the strings from.
     * @param anchors
     *            The anchors to insert during composition.
     * @return The anchor trees followed by the noun/adjective trees.
     */
    public Collection<Tree<String>> composeAdjNounsTree(Parse parse, SortedMap<String, String> anchors) {
        // Filled by extractAnchors, then consulted by pairNounAdjs so that
        // fragments matching an anchor reuse that anchor's tree
        Map<String, Tree<String>> anchorTreesByText = new HashMap<>();

        Collection<Tree<String>> combined = new ArrayList<>();
        Collection<Tree<String>> anchorTrees = extractAnchors(parse.getCoveredText(), anchors,
                anchorTreesByText);
        combined.addAll(anchorTrees);

        Collection<Tree<String>> nounAdjTrees = pairNounAdjs(parse, anchors, anchorTreesByText);
        combined.addAll(nounAdjTrees);
        return combined;
    }

    /**
     * Extracts partial assertions from a sentence: optionally annotates it,
     * records its anchors, replaces triple-quoted spans with "THING", removes
     * runs of question marks, strips the markup, parses the result and walks
     * the parse tree.
     *
     * @param sentence
     *            The sentence to extract assertions from.
     * @param focusConcept
     *            The concept the sentence is being mined for.
     * @param wikifyText
     *            If true, the sentence is first passed through wmi.annotate.
     * @param wmi
     *            The WMI access.
     * @param cyc
     *            The Cyc access.
     * @param heuristic
     *            The heuristic recorded in the provenance of the assertions.
     * @return The assertions found; empty if there were none.
     * @throws Exception
     *             Should something go awry...
     */
    public Collection<PartialAssertion> extractAssertions(String sentence, MappableConcept focusConcept,
            boolean wikifyText, WMISocket wmi, OntologySocket cyc, MiningHeuristic heuristic) throws Exception {
        logger_.trace("mineSentence: " + sentence);

        String markedUp = wikifyText ? wmi.annotate(sentence, 0, false) : sentence;

        // Anchors are located before the text is altered below
        Map<String, Double> anchorWeights = new HashMap<>();
        SortedMap<String, String> anchors = locateAnchors(markedUp, anchorWeights);

        markedUp = markedUp.replaceAll("'{3,}.+?'{3,}", "THING");
        markedUp = markedUp.replaceAll("\\?{2,}", "");
        Parse parse = parseLine(WikiParser.cleanAllMarkup(markedUp));

        Collection<PartialAssertion> found = new ArrayList<>();
        logger_.trace("disambiguateTree: " + markedUp);
        disambiguateTree(parse, null, focusConcept, anchors, wmi, cyc, heuristic, found);
        return found;
    }

    // Builds the extraction patterns: a single ("VB", "NP") pattern.
    static {
        Collection<ExtractionPattern> patterns = new ArrayList<>();
        patterns.add(new ExtractionPattern("VB", "NP"));
        EXTRACTION_PATTERNS = patterns;
    }

    /**
     * Locates the anchors in a string and forms a replacement map for them.
     * Each match of WikiParser.ANCHOR_PARSER is stored under group 2 of the
     * match (group 1 when group 2 is null), with the full matched text as the
     * value.
     *
     * @param sentence
     *            The sentence to search for anchors.
     * @param anchorWeights
     *            Accepted for weight information, but this method never reads
     *            or writes it.
     * @return A SortedMap of anchors, ordered from longest key to shortest;
     *         keys of equal length are in natural String order.
     */
    public static SortedMap<String, String> locateAnchors(String sentence, Map<String, Double> anchorWeights) {
        Comparator<String> longestFirst = new Comparator<String>() {
            @Override
            public int compare(String first, String second) {
                if (first.length() != second.length())
                    return Integer.compare(second.length(), first.length());
                return first.compareTo(second);
            }
        };

        SortedMap<String, String> located = new TreeMap<>(longestFirst);
        Matcher anchorMatcher = WikiParser.ANCHOR_PARSER.matcher(sentence);
        while (anchorMatcher.find()) {
            String key = anchorMatcher.group(2);
            if (key == null)
                key = anchorMatcher.group(1);
            located.put(key, anchorMatcher.group());
        }
        return located;
    }

    /**
     * Interactive console driver: reads sentences from standard input, mines
     * each one against a placeholder concept and prints the resulting
     * assertions. The loop ends on the line "exit" (any case) or at end of
     * input.
     *
     * @param args
     *            Ignored.
     * @throws Exception
     *             If resource set-up, reading input or mining fails.
     */
    public static void main(String[] args) throws Exception {
        ResourceAccess.newInstance();
        CycMapper mapper = new CycMapper();
        CycMiner miner = new CycMiner(null, mapper);
        SentenceParserHeuristic sph = new SentenceParserHeuristic(mapper, miner);
        WMISocket wmi = ResourceAccess.requestWMISocket();
        OntologySocket cyc = ResourceAccess.requestOntologySocket();

        MappableConcept mappable = new TextMappedConcept("PLACEHOLDER", false, false);
        // One reader for the whole session: a fresh BufferedReader per
        // iteration could swallow input already buffered by the previous one.
        // It is deliberately not closed, as that would close System.in.
        BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
        while (true) {
            System.out.println("Enter sentence to parse:");
            String input = in.readLine();
            // readLine() returns null at end of input; "exit" is a command,
            // not a sentence to mine
            if (input == null || input.equalsIgnoreCase("exit"))
                break;
            Collection<PartialAssertion> assertions = sph.extractAssertions(input, mappable, true, wmi, cyc, null);
            System.out.println(assertions);
        }
    }

    /**
     * Parses a sentence, retrying with progressively simplified versions
     * while the parser reports an incomplete ("INC") parse.
     * <p>
     * Every sentence that yields an "INC" parse is recorded through
     * IOManager.writeFirstSentence. The SENTENCE_SIMPLIFIER patterns are then
     * tried in order; the first one that changes the sentence is applied, the
     * result is capitalised and parsing is retried. When no pattern changes
     * the sentence, the "INC" parse is returned as is.
     *
     * @param cleanSentence
     *            The markup-free sentence to parse.
     * @return The parse of the sentence or of its simplified form; this may
     *         still be of type "INC".
     */
    public static synchronized Parse parseLine(String cleanSentence) {
        String sentence = cleanSentence;
        while (true) {
            Parse parse = OpenNLP.parseLine(sentence);
            if (!"INC".equals(parse.getType()))
                return parse;

            // Could not parse: keep a record of the offending sentence
            try {
                IOManager.getInstance().writeFirstSentence(-1, sentence);
            } catch (IOException e) {
                // Best effort only; report through the class logger rather
                // than dumping the stack trace to stderr
                logger_.error("Could not record unparseable sentence: " + sentence, e);
            }

            // Simplify the sentence with the first pattern that changes it
            String simplified = null;
            for (Pattern p : SENTENCE_SIMPLIFIER) {
                String candidate = p.matcher(sentence).replaceFirst("");
                if (!candidate.equals(sentence)) {
                    simplified = StringUtils.capitalize(candidate);
                    break;
                }
            }

            // Nothing left to simplify: hand back the incomplete parse
            if (simplified == null)
                return parse;
            sentence = simplified;
        }
    }
}