opennlp.tools.parse_thicket.external_rst.ExternalRSTImporter.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.parse_thicket.external_rst.ExternalRSTImporter.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package opennlp.tools.parse_thicket.external_rst;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;

import edu.stanford.nlp.trees.Tree;
import opennlp.tools.parse_thicket.ArcType;
import opennlp.tools.parse_thicket.Pair;
import opennlp.tools.parse_thicket.ParseThicket;
import opennlp.tools.parse_thicket.ParseTreeNode;
import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc;
import opennlp.tools.parse_thicket.matching.Matcher;
import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;

/**
 * Imports the textual dump of an external RST (rhetorical structure) parser and converts it into
 * inter-sentence arcs for a {@link ParseThicket}.
 *
 * <p>
 * The workflow is: read the dump into a list of {@link RstNode}s, attach every sufficiently long
 * phrase of the thicket to the RST node whose text is most similar to it, and finally create an
 * arc for each pair of phrases from different sentences whose RST nodes share a non-trivial
 * common ancestor.
 *
 * <p>
 * Instances keep per-call working state in private maps and are therefore not safe for concurrent
 * use.
 */
public class ExternalRSTImporter extends PT2ThicketPhraseBuilder {
    /** Phrases and RST node texts shorter than this many characters are never matched. */
    private static final int MIN_TEXT_LENGTH = 10;

    private final StringDistanceMeasurer strDistProc = new StringDistanceMeasurer();
    private String resourceDir = null;

    /** Phrase (as rendered by {@code toString()}) to index of the RST node matched to it. */
    private final Map<String, Integer> phraseRstIndex = new HashMap<String, Integer>();
    /** RST node index to the last phrase that was matched to it. */
    private final Map<Integer, List<ParseTreeNode>> rstIndexPhrase = new HashMap<Integer, List<ParseTreeNode>>();

    /**
     * Creates an importer whose resource directory is {@code <working dir>/src/test/resources}.
     */
    public ExternalRSTImporter() {
        try {
            resourceDir = new File(".").getCanonicalPath() + "/src/test/resources";
        } catch (IOException e) {
            e.printStackTrace();
            // Never leave the directory null: a null here would later be concatenated into a
            // path starting with the literal text "null".
            resourceDir = new File("src/test/resources").getAbsolutePath();
        }
    }

    /**
     * Reads an RST parser dump and creates one {@link RstNode} per line, skipping the first
     * (header) line.
     *
     * @param pt
     *            not used by this method; kept for interface compatibility
     * @param jotyDumpFileName
     *            path of the dump file to read
     * @return the nodes in file order (index {@code i} corresponds to line {@code i + 1} of the
     *         dump), or {@code null} if the file cannot be read
     */
    public List<RstNode> buildArrayOfRSTnodes(ParseThicket pt, String jotyDumpFileName) {
        String dump;
        try {
            dump = FileUtils.readFileToString(new File(jotyDumpFileName), Charset.defaultCharset().name());
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
        List<RstNode> nodes = new ArrayList<RstNode>();
        // Accept both Unix and Windows line endings so that no stray '\r' reaches RstNode.
        String[] lines = dump.split("\\r?\\n");
        // Line 0 is the header; it carries no node of its own.
        for (int i = 1; i < lines.length; i++) {
            nodes.add(new RstNode(lines[i]));
        }
        return nodes;
    }

    /**
     * Builds RST arcs between phrases of different sentences.
     *
     * <p>
     * Every phrase is first labelled with the index of its best matching RST node. Then, for each
     * ordered pair of sentences and each pair of labelled phrases whose source RST node precedes
     * the target RST node, an arc is emitted when both RST nodes have a common ancestor with a
     * non-null relation. The arc type carries the relation of the target RST node.
     *
     * @param rstNodes
     *            nodes of the RST tree in dump order; {@code null} or empty yields no arcs
     * @param arcs
     *            not used by this method; kept for interface compatibility
     * @param sentNumPhrasesMap
     *            phrases of each sentence, keyed by sentence number
     * @param pt
     *            the parse thicket whose sentences are being connected
     * @return the discovered arcs, never {@code null}
     */
    public List<WordWordInterSentenceRelationArc> buildRSTArcsFromRSTparser(List<RstNode> rstNodes,
            List<WordWordInterSentenceRelationArc> arcs, Map<Integer, List<List<ParseTreeNode>>> sentNumPhrasesMap,
            ParseThicket pt) {
        List<WordWordInterSentenceRelationArc> arcsRST = new ArrayList<WordWordInterSentenceRelationArc>();

        // Labels from a previous call must not leak into this one.
        phraseRstIndex.clear();
        rstIndexPhrase.clear();

        if (rstNodes == null || rstNodes.isEmpty()) {
            return arcsRST;
        }
        int sentenceCount = pt.getSentences().size();
        if (sentenceCount < 2) {
            // A single sentence cannot take part in an inter-sentence arc.
            return arcsRST;
        }

        // Label all phrases with their RST node; one pass per sentence is sufficient.
        for (int nSent = 0; nSent < sentenceCount; nSent++) {
            labelPhrasesWithRstNodes(sentNumPhrasesMap.get(nSent), rstNodes);
        }

        // For a pair of phrases, discover the node in the RST tree which connects their sentences.
        for (int nSentFrom = 0; nSentFrom < sentenceCount; nSentFrom++) {
            for (int nSentTo = nSentFrom + 1; nSentTo < sentenceCount; nSentTo++) {
                System.out.println("Sent from # = " + nSentFrom + " -- " + "Sent to # = " + nSentTo);

                List<List<ParseTreeNode>> phrasesFrom = sentNumPhrasesMap.get(nSentFrom);
                List<List<ParseTreeNode>> phrasesTo = sentNumPhrasesMap.get(nSentTo);
                if (phrasesFrom == null || phrasesTo == null) {
                    continue;
                }
                for (List<ParseTreeNode> vpFrom : phrasesFrom) {
                    for (List<ParseTreeNode> vpTo : phrasesTo) {
                        System.out.println("Computing arc between phrases " + vpFrom + " => " + vpTo);
                        Integer rstNodeFrom = phraseRstIndex.get(vpFrom.toString());
                        Integer rstNodeTo = phraseRstIndex.get(vpTo.toString());
                        // Only connect labelled phrases, and only forward in RST node order.
                        if (rstNodeFrom == null || rstNodeTo == null || rstNodeFrom >= rstNodeTo) {
                            continue;
                        }
                        System.out.println("Finding RST path for phrases " + vpFrom + "' and '" + vpTo);
                        System.out.println("Sent from # = " + nSentFrom + " -- " + "Sent to # = " + nSentTo);

                        Integer commonAncestorIndex = findCommonAncestor(rstNodeFrom, rstNodeTo, rstNodes);
                        if (commonAncestorIndex == null) {
                            continue;
                        }
                        ArcType arcType = new ArcType("rst", rstNodes.get(rstNodeTo).getRel2par(), 0, 0);
                        // The arc runs from the first word of the source phrase to the first word
                        // of the TARGET phrase (labelled phrases are never empty).
                        WordWordInterSentenceRelationArc arcRST = new WordWordInterSentenceRelationArc(
                                new Pair<Integer, Integer>(nSentFrom, vpFrom.get(0).getId()),
                                new Pair<Integer, Integer>(nSentTo, vpTo.get(0).getId()), "", "", arcType);
                        arcsRST.add(arcRST);
                    }
                }
            }
        }

        return arcsRST;
    }

    /**
     * Records, for every phrase that has a best matching RST node, the mapping between the phrase
     * and that node's index.
     */
    private void labelPhrasesWithRstNodes(List<List<ParseTreeNode>> phrases, List<RstNode> rstNodes) {
        if (phrases == null) {
            return;
        }
        for (List<ParseTreeNode> phrase : phrases) {
            Integer rstIndex = findBestRstNodeTextForAPhrase(phrase, rstNodes);
            if (rstIndex != null) {
                phraseRstIndex.put(phrase.toString(), rstIndex);
                rstIndexPhrase.put(rstIndex, phrase);
            }
        }
    }

    /**
     * Finds the closest preceding node whose level is lower than the level of the given node.
     *
     * @return index of that node, or {@code null} if the given node has no level or no such
     *         preceding node exists
     */
    private Integer findAncestorForRSTnode(Integer rstNodeFrom, List<RstNode> rstNodes) {
        Integer initLevel = rstNodes.get(rstNodeFrom).level;
        if (initLevel == null) {
            return null;
        }
        // Walk backwards through the dump, one node at a time.
        for (int candidate = rstNodeFrom - 1; candidate >= 0; candidate--) {
            Integer currLevel = rstNodes.get(candidate).level;
            if (currLevel != null && currLevel < initLevel) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Lists a node followed by its successive ancestors, nearest first.
     */
    private List<Integer> ancestorChain(Integer start, List<RstNode> rstNodes) {
        List<Integer> chain = new ArrayList<Integer>();
        // Terminates because every ancestor index is strictly smaller than its descendant's.
        for (Integer current = start; current != null; current = findAncestorForRSTnode(current, rstNodes)) {
            chain.add(current);
        }
        return chain;
    }

    /**
     * Finds the nearest node that is an ancestor of (or identical to) both given nodes.
     *
     * <p>
     * Both ancestor chains are followed to their end, so the result does not depend on which of
     * the two nodes sits deeper in the tree.
     *
     * @return index of the common ancestor, or {@code null} if there is none or if its relation
     *         ({@code rel2par}) is null
     */
    private Integer findCommonAncestor(Integer rstNodeFrom, Integer rstNodeTo, List<RstNode> rstNodes) {
        List<Integer> ancestorsFrom = ancestorChain(rstNodeFrom, rstNodes);
        List<Integer> ancestorsTo = ancestorChain(rstNodeTo, rstNodes);

        for (Integer candidate : ancestorsFrom) {
            if (!ancestorsTo.contains(candidate)) {
                continue;
            }
            System.out.println("Found comm ancestor " + rstNodes.get(candidate).toString() + " id =  " + candidate
                    + " for two RST nodes | id = " + rstNodeFrom + "'" + rstNodes.get(rstNodeFrom).toString()
                    + "' and | id = " + rstNodeTo + "'" + rstNodes.get(rstNodeTo).toString() + "'");
            // If the common ancestor is trivial, return null and do not form a link.
            String rel2par = rstNodes.get(candidate).rel2par;
            if (rel2par == null) {
                return null;
            }
            return candidate;
        }
        return null;
    }

    /**
     * Finds the RST node whose text is most similar to the words of a phrase.
     *
     * @return index of the best scoring node, or {@code null} if the phrase text is shorter than
     *         {@value #MIN_TEXT_LENGTH} characters or no node has a long enough text
     */
    private Integer findBestRstNodeTextForAPhrase(List<ParseTreeNode> ps, List<RstNode> rstNodes) {
        // First get the phrase string.
        StringBuilder phraseBuilder = new StringBuilder();
        for (ParseTreeNode n : ps) {
            phraseBuilder.append(' ').append(n.getWord());
        }
        String phraseStr = phraseBuilder.toString().trim();
        if (phraseStr.length() < MIN_TEXT_LENGTH) {
            return null;
        }

        // Now look for the closest node text among all nodes; a higher score is a better match.
        double bestScore = -10000d;
        int bestIndex = -1;
        int position = 0;
        for (RstNode r : rstNodes) {
            String nodeText = r.getText();
            if (nodeText != null && nodeText.length() >= MIN_TEXT_LENGTH) {
                double score = strDistProc.measureStringDistanceNoStemming(phraseStr, nodeText);
                if (score > bestScore) {
                    bestScore = score;
                    bestIndex = position;
                }
            }
            position++;
        }
        if (bestIndex == -1) {
            return null;
        }
        System.out.println("Found RST node " + rstNodes.get(bestIndex) + " for phrase =" + phraseStr);
        return bestIndex;
    }

    /**
     * Forms phrases for each sentence of the parse thicket individually, loads the external RST
     * parser result and builds RST arcs between the phrases.
     *
     * @param pt
     *            the parse thicket to process
     * @param externalRSTresultFilename
     *            name of the RST dump file, resolved against the resource directory; a leading
     *            file separator is optional
     * @return the discovered RST arcs; empty if the dump file cannot be read
     */
    public List<WordWordInterSentenceRelationArc> buildPT2ptPhrases(ParseThicket pt,
            String externalRSTresultFilename) {
        Map<Integer, List<List<ParseTreeNode>>> sentNumPhrases = new HashMap<Integer, List<List<ParseTreeNode>>>();
        // Build regular phrases.
        for (int nSent = 0; nSent < pt.getSentences().size(); nSent++) {
            List<ParseTreeNode> sentence = pt.getNodesThicket().get(nSent);
            Tree ptree = pt.getSentences().get(nSent);
            List<List<ParseTreeNode>> phrases = buildPT2ptPhrasesForASentence(ptree, sentence);
            System.out.println(phrases);
            sentNumPhrases.put(nSent, phrases);
        }

        // TODO: code to run joty suite
        // File(parent, child) inserts the separator when the name lacks one and still accepts
        // names that start with one.
        String dumpPath = new File(resourceDir, externalRSTresultFilename).getPath();
        List<RstNode> rstNodes = buildArrayOfRSTnodes(pt, dumpPath);
        if (rstNodes == null) {
            // The dump could not be read (already reported); there is nothing to connect.
            return new ArrayList<WordWordInterSentenceRelationArc>();
        }

        // Discover and add RST arcs.
        List<WordWordInterSentenceRelationArc> arcsRST = buildRSTArcsFromRSTparser(rstNodes, null, sentNumPhrases,
                pt);
        System.out.println(arcsRST);
        return arcsRST;
    }

}