edu.cmu.ark.AnalysisUtilities.java Source code

Introduction

Here is the source code for edu.cmu.ark.AnalysisUtilities.java
Source

// Question Generation via Overgenerating Transformations and Ranking
// Copyright (c) 2010 Carnegie Mellon University.  All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Michael Heilman
//     Carnegie Mellon University
//     mheilman@cmu.edu
//     http://www.cs.cmu.edu/~mheilman

package edu.cmu.ark;

import java.io.*;
import java.net.*;
import java.util.*;

import org.apache.commons.lang.StringUtils;

import net.didion.jwnl.data.IndexWord;
import net.didion.jwnl.data.POS;
import net.didion.jwnl.dictionary.Dictionary;

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.parser.lexparser.*;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.*;
import edu.stanford.nlp.trees.tregex.tsurgeon.*;
import edu.stanford.nlp.util.Pair;

public class AnalysisUtilities {
    private AnalysisUtilities() {
        parser = null;

        conjugator = new VerbConjugator();
        conjugator.load(GlobalProperties.getProperties().getProperty("verbConjugationsFile",
                "config" + File.separator + "verbConjugations.txt"));
        headfinder = new CollinsHeadFinder();
        tree_factory = new LabeledScoredTreeFactory();
        tlp = new PennTreebankLanguagePack();
    }

    /**
     * Inserts a {@code (. .)} node as the last child of the main clause when
     * the tree matches {@code ROOT < (S=mainclause !< /\./)}, i.e. when the
     * S node directly under ROOT has no direct child whose label matches a
     * period. Trees that do not match are left untouched.
     *
     * @param input the tree to modify in place
     */
    public static void addPeriodIfNeeded(Tree input) {
        TregexPattern missingPeriod = TregexPatternFactory.getPattern("ROOT < (S=mainclause !< /\\./)");
        if (!missingPeriod.matcher(input).find()) {
            return;
        }

        List<TsurgeonPattern> insertions = new ArrayList<TsurgeonPattern>();
        insertions.add(Tsurgeon.parseOperation("insert (. .) >-1 mainclause"));
        TsurgeonPattern insertPeriod = Tsurgeon.collectOperations(insertions);

        List<Pair<TregexPattern, TsurgeonPattern>> edits = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
        edits.add(new Pair<TregexPattern, TsurgeonPattern>(missingPeriod, insertPeriod));
        Tsurgeon.processPatternsOnTree(edits, input);
    }

    /**
     * Counts how many times a Tregex expression matches in a tree.
     *
     * @param tregexExpression the Tregex pattern source
     * @param t the tree to search
     * @return the number of successful {@code find()} calls on the matcher
     */
    public static int getNumberOfMatchesInTree(String tregexExpression, Tree t) {
        TregexPattern pattern = TregexPatternFactory.getPattern(tregexExpression);
        TregexMatcher matches = pattern.matcher(t);

        int count = 0;
        for (boolean found = matches.find(); found; found = matches.find()) {
            count++;
        }
        return count;
    }

    /**
     * Splits a document into sentences. The document is first run through
     * {@code preprocess}, then split on newlines; each resulting paragraph is
     * handed to a {@link DocumentPreprocessor}, and the tokens of every
     * sentence it returns are joined with single spaces.
     *
     * @param document the raw document text
     * @return the sentences in document order, each as space-separated tokens
     */
    public static List<String> getSentences(String document) {
        DocumentPreprocessor splitter = new DocumentPreprocessor(false);
        List<String> sentences = new ArrayList<String>();

        for (String paragraph : preprocess(document).split("\\n")) {
            List<List<? extends HasWord>> tokenized;
            try {
                tokenized = splitter.getSentencesFromText(new StringReader(paragraph));
            } catch (Exception e) {
                // Best effort: report the problem and treat the paragraph as empty.
                e.printStackTrace();
                tokenized = new ArrayList<List<? extends HasWord>>();
            }

            for (List<? extends HasWord> tokens : tokenized) {
                StringBuilder joined = new StringBuilder();
                for (HasWord token : tokens) {
                    joined.append(token.word().toString()).append(" ");
                }
                sentences.add(joined.toString().trim());
            }
        }

        return sentences;
    }

    /**
     * Renders a tree compactly as {@code LABEL[leaf1 leaf2 ...]}: the root
     * label followed by the leaf labels joined with single spaces.
     *
     * @param tree the tree to abbreviate
     * @return the abbreviated string form
     */
    public static String abbrevTree(Tree tree) {
        List<Tree> leaves = tree.getLeaves();
        List<String> words = new ArrayList<String>(leaves.size());
        for (Tree leaf : leaves) {
            words.add(leaf.label().toString());
        }

        String head = tree.label().toString();
        String body = StringUtils.join(words, " ");
        return head + "[" + body + "]";
    }

    /**
     * Lower-cases the first character of the first leaf of a tree, in place.
     * The token is left alone when it is the pronoun "I" or when its parent
     * node's label starts with {@code NNP}.
     *
     * Trees without leaves and empty first tokens are ignored instead of
     * raising an index exception. If the first leaf has no parent inside
     * {@code inputTree}, the NNP check is skipped and the token is lower-cased.
     *
     * @param inputTree the tree whose first token should be lower-cased
     */
    public static void downcaseFirstToken(Tree inputTree) {
        List<Tree> leaves = inputTree.getLeaves();
        if (leaves.isEmpty()) {
            return;
        }
        Tree firstWordTree = leaves.get(0);
        if (firstWordTree == null) {
            return;
        }

        String firstWord = firstWordTree.yield().toString();
        if (firstWord.length() == 0 || firstWord.equals("I")) {
            return;
        }

        // The pre-terminal carries the POS tag; keep tokens under NNP* nodes capitalized.
        Tree preterm = firstWordTree.parent(inputTree);
        if (preterm != null && preterm.label().toString().matches("^NNP.*")) {
            return;
        }

        firstWord = firstWord.substring(0, 1).toLowerCase() + firstWord.substring(1);
        firstWordTree.label().setValue(firstWord);
    }

    /**
     * Upper-cases the first character of the first leaf of a tree, in place.
     * Trees without leaves and empty first tokens are ignored instead of
     * raising an index exception.
     *
     * @param inputTree the tree whose first token should be capitalized
     */
    public static void upcaseFirstToken(Tree inputTree) {
        List<Tree> leaves = inputTree.getLeaves();
        if (leaves.isEmpty()) {
            return;
        }
        Tree firstWordTree = leaves.get(0);
        if (firstWordTree == null) {
            return;
        }

        String firstWord = firstWordTree.yield().toString();
        if (firstWord.length() == 0) {
            return;
        }

        firstWord = firstWord.substring(0, 1).toUpperCase() + firstWord.substring(1);
        firstWordTree.label().setValue(firstWord);
    }

    /**
     * Normalizes raw text before it is tokenized and parsed:
     * <ol>
     * <li>trims surrounding whitespace;</li>
     * <li>removes parenthesized single tokens and empty parentheses;</li>
     * <li>maps some common typographic and accented characters to ASCII;</li>
     * <li>expands {@code n't} contractions;</li>
     * <li>drops every remaining character greater than {@code 'z'}.</li>
     * </ol>
     *
     * All non-ASCII characters are written as {@code \}{@code u} escapes so the
     * method does not depend on the encoding of this source file. NOTE(review):
     * the exact characters were reconstructed from the replacement letters and
     * the number of alternatives in each pattern, because the literals in the
     * available copy of this file were lost to an encoding error - confirm
     * against the upstream source.
     *
     * @param sentence the raw text; must not be {@code null}
     * @return the normalized text
     */
    public static String preprocess(String sentence) {
        //remove leading and trailing whitespace
        sentence = sentence.trim();

        //remove single words in parentheses.
        //the stanford parser api messed up on these
        //by removing the parentheses but not the word in them
        sentence = sentence.replaceAll("\\(\\S*\\)", "");
        sentence = sentence.replaceAll("\\(\\s*\\)", "");

        //some common unicode characters that the tokenizer throws out otherwise
        sentence = sentence.replaceAll("\u2014", "--");          // em dash
        sentence = sentence.replaceAll("\u2019", "'");           // right single quote
        sentence = sentence.replaceAll("[\u201C\u201D]", "\"");  // curly double quotes
        sentence = sentence.replaceAll("[\u00E8-\u00EB]", "e");
        sentence = sentence.replaceAll("[\u00C8-\u00CB]", "E");
        sentence = sentence.replaceAll("[\u00EC-\u00EF]", "i");
        sentence = sentence.replaceAll("[\u00CC-\u00CF]", "I");
        sentence = sentence.replaceAll("[\u00E0-\u00E6]", "a");
        sentence = sentence.replaceAll("[\u00C0-\u00C6]", "A");
        sentence = sentence.replaceAll("[\u00F2-\u00F6]", "o");
        sentence = sentence.replaceAll("[\u00D2-\u00D6]", "O");
        sentence = sentence.replaceAll("[\u00F9-\u00FC]", "u");
        sentence = sentence.replaceAll("[\u00D9-\u00DC]", "U");
        sentence = sentence.replaceAll("\u00F1", "n");

        //contractions
        sentence = sentence.replaceAll("can't", "can not");
        sentence = sentence.replaceAll("won't", "will not");
        sentence = sentence.replaceAll("n't", " not"); //aren't shouldn't don't isn't
        sentence = sentence.replaceAll("are n't", "are not");

        //simply remove other unicode characters
        //if not, the tokenizer replaces them with spaces,
        //which wreaks havoc on the final parse sometimes.
        //Copy the kept characters into a builder: deleting in place while
        //advancing the index would skip the character after each removal.
        StringBuilder kept = new StringBuilder(sentence.length());
        for (int i = 0; i < sentence.length(); i++) {
            char c = sentence.charAt(i);
            if (c <= 'z') {
                kept.append(c);
            }
        }

        return kept.toString();
    }

    /**
     * Expands contracted tokens inside a bracketed parse string, e.g.
     * {@code (MD ca) (RB n't)} becomes {@code (MD can) (RB not)}.
     * The substitutions are literal and applied in the order listed.
     *
     * @param sentence a parse tree in bracketed string form
     * @return the string with the contractions expanded
     */
    public static String preprocessTreeString(String sentence) {
        // {literal to find, replacement}; each pair needs only one pass because
        // no replacement text contains the text it replaces.
        final String[][] expansions = {
            {" n't", " not"},
            {"(MD ca)", "(MD can)"},
            {"(MD wo)", "(MD will)"},
            {"(MD 'd)", "(MD would)"},
            {"(VBD 'd)", "(VBD had)"},
            {"(VBZ 's)", "(VBZ is)"},
            {"(VBP 're)", "(VBP are)"},
        };

        for (String[] expansion : expansions) {
            sentence = sentence.replace(expansion[0], expansion[1]);
        }
        return sentence;
    }

    /**
     * @return the verb conjugator created and loaded by the constructor
     */
    public VerbConjugator getConjugator() {
        return this.conjugator;
    }

    /**
     * @return the Collins head finder created by the constructor
     */
    public CollinsHeadFinder getHeadFinder() {
        return this.headfinder;
    }

    /**
     * Returns the shared instance, creating it on the first call.
     * The check-then-create sequence is not synchronized.
     *
     * @return the shared {@code AnalysisUtilities} instance
     */
    public static AnalysisUtilities getInstance() {
        if (instance != null) {
            return instance;
        }
        instance = new AnalysisUtilities();
        return instance;
    }

    /**
     * Parses a sentence, preferring a parser socket server on 127.0.0.1
     * (port from the "parserServerPort" property, default 5556) and falling
     * back to an in-process {@link LexicalizedParser} when the server cannot
     * be used.
     *
     * If neither route yields a parse, the result holds the placeholder tree
     * {@code (ROOT (. .))}, a score of -99999.0 and a {@code false} success flag.
     *
     * @param sentence the sentence text to parse
     * @return the parse tree, its score and a success flag
     * @throws NumberFormatException if the "parserServerPort" property is not an integer
     */
    public ParseResult parseSentence(String sentence) {
        ParseResult fromServer = parseWithServer(sentence);
        if (fromServer != null) {
            return fromServer;
        }

        System.err.println("parsing:" + sentence);

        //if socket server not available, then use a local parser object
        if (parser == null) {
            loadLocalParser();
        }

        // parser stays null when loading failed; skip straight to the placeholder.
        if (parser != null) {
            try {
                if (parser.parse(sentence)) {
                    Tree parse = parser.getBestParse();

                    //remove all the parent annotations (this is a hacky way to do it)
                    String ps = parse.toString().replaceAll("\\[[^\\]]+/[^\\]]+\\]", "");
                    parse = readTreeFromString(ps);

                    double parseScore = parser.getPCFGScore();
                    return new ParseResult(true, parse, parseScore);
                }
            } catch (Exception e) {
                // Best effort: a parser failure yields the placeholder result below.
                if (GlobalProperties.getDebug()) {
                    e.printStackTrace();
                }
            }
        }

        return new ParseResult(false, readTreeFromString("(ROOT (. .))"), -99999.0);
    }

    /**
     * Sends the sentence to the parser socket server and reads back the reply.
     * Every line read while more input is immediately available is appended to
     * the tree text; the line read when no more input is ready is taken as the
     * parse score.
     *
     * @return the server's result, or {@code null} if anything went wrong while
     *         talking to the server (the caller then uses the local parser)
     */
    private ParseResult parseWithServer(String sentence) {
        // Parsed outside the try block so a malformed port setting still
        // propagates to the caller rather than being reported as "no server".
        int port = Integer.parseInt(GlobalProperties.getProperties().getProperty("parserServerPort", "5556"));
        String host = "127.0.0.1";

        Socket client = null;
        PrintWriter pw = null;
        BufferedReader br = null;
        try {
            client = new Socket(host, port);
            pw = new PrintWriter(client.getOutputStream());
            br = new BufferedReader(new InputStreamReader(client.getInputStream()));
            pw.println(sentence);
            pw.flush(); //flush to complete the transmission

            StringBuilder result = new StringBuilder();
            // Used only if the server never sends a score line.
            double parseScore = -99999.0;
            String line;
            while ((line = br.readLine()) != null) {
                if (br.ready()) {
                    result.append(line.replaceAll("\\s+", " ")).append(" ");
                } else {
                    parseScore = Double.parseDouble(line);
                }
            }

            if (GlobalProperties.getDebug()) {
                System.err.println("result (parse):" + result);
            }

            // Read the tree first and only then decide whether the placeholder
            // is needed, so a valid score from the server is not overwritten.
            Tree parse = readTreeFromString(result.toString());
            if (parse == null) {
                parse = readTreeFromString("(ROOT (. .))");
                parseScore = -99999.0;
            }
            return new ParseResult(true, parse, parseScore);
        } catch (Exception ex) {
            if (GlobalProperties.getDebug()) {
                System.err.println("Could not connect to parser server.");
            }
            return null;
        } finally {
            // Release the connection on every path, including failures.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // nothing useful can be done about a failed close
                }
            }
            if (pw != null) {
                pw.close();
            }
            if (client != null) {
                try {
                    client.close();
                } catch (IOException ignored) {
                    // nothing useful can be done about a failed close
                }
            }
        }
    }

    /**
     * Creates and configures the in-process parser from the
     * "parserGrammarFile" and "parserMaxLength" properties. Failures are
     * reported on stderr.
     */
    private void loadLocalParser() {
        try {
            Options op = new Options();
            String serializedInputFileOrUrl = GlobalProperties.getProperties().getProperty("parserGrammarFile",
                    "config" + File.separator + "englishFactored.ser.gz");
            parser = new LexicalizedParser(serializedInputFileOrUrl, op);
            int maxLength = Integer.parseInt(GlobalProperties.getProperties().getProperty("parserMaxLength", "40"));
            parser.setMaxLength(maxLength);
            parser.setOptionFlags("-outputFormat", "oneline");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns a lemma for a word given its Penn Treebank POS tag.
     * <ul>
     * <li>Tags that do not start with N, V, J or R, and tags starting with
     *     NNP, yield the lower-cased word unchanged.</li>
     * <li>"is", "are", "were" and "was" yield "be".</li>
     * <li>Otherwise the base form is looked up through the JWNL dictionary's
     *     morphological processor; if the lookup returns nothing or throws,
     *     the lower-cased word is returned.</li>
     * </ul>
     *
     * @param word the surface form
     * @param pos the part-of-speech tag
     * @return the lemma, in lower case when no dictionary entry is used
     */
    public String getLemma(String word, String pos) {
        boolean lemmatizable = pos.startsWith("N") || pos.startsWith("V")
                || pos.startsWith("J") || pos.startsWith("R");
        if (!lemmatizable || pos.startsWith("NNP")) {
            return word.toLowerCase();
        }

        String lowered = word.toLowerCase();
        if (lowered.equals("is") || lowered.equals("are") || lowered.equals("were") || lowered.equals("was")) {
            return "be";
        }

        // Choose the dictionary category once; tags starting with R fall through to ADVERB.
        POS category;
        if (pos.startsWith("V")) {
            category = POS.VERB;
        } else if (pos.startsWith("N")) {
            category = POS.NOUN;
        } else if (pos.startsWith("J")) {
            category = POS.ADJECTIVE;
        } else {
            category = POS.ADVERB;
        }

        try {
            IndexWord entry = Dictionary.getInstance().getMorphologicalProcessor().lookupBaseForm(category, lowered);
            if (entry == null) {
                return lowered;
            }
            return entry.getLemma();
        } catch (Exception e) {
            e.printStackTrace();
            return lowered;
        }
    }

    /**
     * Remove traces and non-terminal decorations (e.g., "-SUBJ" in "NP-SUBJ")
     * from a Penn Treebank-style tree. The root label is set to "ROOT", nodes
     * whose label matches {@code -NONE-} are pruned, and every non-leaf label
     * containing a hyphen is replaced by its basic category.
     *
     * @param inputTree the tree to normalize in place
     */
    public void normalizeTree(Tree inputTree) {
        inputTree.label().setFromString("ROOT");

        // Step 1: prune empty (-NONE-) nodes.
        // NOTE(review): the tree returned by processPatternsOnTree is ignored,
        // as in the original code; confirm the input tree is always updated in place.
        TregexPattern emptyNodePattern = TregexPatternFactory.getPattern("/\\-NONE\\-/=emptynode");
        List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>();
        ps.add(Tsurgeon.parseOperation("prune emptynode"));
        TsurgeonPattern pruneEmpty = Tsurgeon.collectOperations(ps);

        List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
        ops.add(new Pair<TregexPattern, TsurgeonPattern>(emptyNodePattern, pruneEmpty));
        Tsurgeon.processPatternsOnTree(ops, inputTree);

        // Step 2: strip decorations from labels of nodes that have children.
        TregexPattern decoratedPattern = TregexPatternFactory.getPattern("/.+\\-.+/=nonterminal < __");
        TregexMatcher matcher = decoratedPattern.matcher(inputTree);
        while (matcher.find()) {
            Label nonterminalLabel = matcher.getNode("nonterminal");
            if (nonterminalLabel == null) {
                continue;
            }
            nonterminalLabel.setFromString(tlp.basicCategory(nonterminalLabel.value()));
        }
    }

    /**
     * Removes unbalanced quotation marks from a tree
     * (a hack due to annoying PTB conventions by which quote marks aren't in
     * the same constituent). An opening quote node is pruned when no closing
     * quote follows it, and a closing quote node is pruned when no opening
     * quote precedes it.
     *
     * @param input the tree to modify in place
     */
    public static void removeExtraQuotes(Tree input) {
        TregexPattern unmatchedQuote = TregexPatternFactory.getPattern(
                "ROOT [ << (``=quote < `` !.. ('' < '')) | << (''=quote < '' !,, (`` < ``)) ] ");

        List<TsurgeonPattern> pruneSteps = new ArrayList<TsurgeonPattern>();
        pruneSteps.add(Tsurgeon.parseOperation("prune quote"));
        TsurgeonPattern pruneQuote = Tsurgeon.collectOperations(pruneSteps);

        List<Pair<TregexPattern, TsurgeonPattern>> edits = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>();
        edits.add(new Pair<TregexPattern, TsurgeonPattern>(unmatchedQuote, pruneQuote));
        Tsurgeon.processPatternsOnTree(edits, input);
    }

    /**
     * Returns the yield of a tree as a cleaned-up sentence string. The yield
     * is taken from a deep copy of the tree and passed through
     * {@code cleanUpSentenceString}.
     *
     * @param inputTree the tree whose words are wanted
     * @return the cleaned-up sentence text
     */
    public static String getCleanedUpYield(Tree inputTree) {
        String rawYield = inputTree.deeperCopy().yield().toString();
        return cleanUpSentenceString(rawYield);
    }

    /**
     * Turns a space-separated token string into more natural-looking text:
     * re-attaches punctuation, rewrites the -LRB-/-RRB- tokens as parentheses,
     * drops the -LCB-/-RCB- tokens, joins "can not", re-attaches possessive
     * 's, closes up digit groups such as "5, 000" and collapses whitespace.
     *
     * @param s the token string to clean up
     * @return the cleaned-up, trimmed string
     */
    public static String cleanUpSentenceString(String s) {
        // {regex, replacement} pairs; the order matters and must be preserved.
        final String[][] rewrites = {
            {"\\s([\\.,!\\?\\-;:])", "$1"},
            {"(\\$)\\s", "$1"},
            {"can not", "cannot"},
            {"\\s*-LRB-\\s*", " ("},
            {"\\s*-RRB-\\s*", ") "},
            {"\\s*([\\.,?!])\\s*", "$1 "},
            {"\\s+''", "''"},
            {"``\\s+", "``"},
            {"\\-[LR]CB\\-", ""},          //brackets, e.g., [sic]
            {"\\. \\?", ".?"},
            {" 's(\\W)", "'s$1"},
            {"(\\d,) (\\d)", "$1$2"},      //e.g., "5, 000, 000" -> "5,000,000"
            {"``''", ""},
            {"\\s\\s+", " "},              //remove extra spaces
        };

        String cleaned = s;
        for (String[] rewrite : rewrites) {
            cleaned = cleaned.replaceAll(rewrite[0], rewrite[1]);
        }
        return cleaned.trim();
    }

    /**
     * Tests whether {@code n1} c-commands {@code n2} within {@code root}:
     * {@code n1} must not dominate {@code n2}, and the closest ancestor of
     * {@code n1} that has more than one child must dominate {@code n2}.
     *
     * @param root the tree containing both nodes
     * @param n1 the candidate c-commanding node
     * @param n2 the candidate c-commanded node
     * @return {@code true} if {@code n1} c-commands {@code n2}
     */
    public static boolean cCommands(Tree root, Tree n1, Tree n2) {
        if (n1.dominates(n2)) {
            return false;
        }

        // Climb past unary ancestors to the first one with more than one child.
        Tree branching;
        for (branching = n1.parent(root);
                branching != null && branching.numChildren() == 1;
                branching = branching.parent(root)) {
            // nothing to do; the loop header does the climbing
        }

        return branching != null && branching.dominates(n2);
    }

    /**
     * Reads one tree from a Penn Treebank-style bracketed string using this
     * instance's tree factory.
     *
     * @param parseStr the bracketed tree text
     * @return the tree returned by the reader, or {@code null} if reading
     *         threw an {@link IOException} (the stack trace is printed)
     */
    public Tree readTreeFromString(String parseStr) {
        StringReader source = new StringReader(parseStr);
        TreeReader reader = new PennTreeReader(source, tree_factory);
        try {
            return reader.readTree();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Decides whether a sentence should be discarded because of its
     * punctuation. Currently the only criterion is the presence of an
     * asterisk.
     *
     * @param sentence the sentence text
     * @return {@code true} if the sentence contains "*"
     */
    public static boolean filterOutSentenceByPunctuation(String sentence) {
        return sentence.indexOf("*") >= 0;
    }

    /**
     * Delegates to the verb conjugator to obtain the surface form of a lemma
     * for the given part-of-speech tag.
     *
     * @param lemma the base form
     * @param pos the part-of-speech tag of the wanted form
     * @return whatever the conjugator returns for this lemma and tag
     */
    public String getSurfaceForm(String lemma, String pos) {
        String surfaceForm = this.conjugator.getSurfaceForm(lemma, pos);
        return surfaceForm;
    }

    // In-process parser; null until parseSentence needs it and creates it.
    private LexicalizedParser parser;
    // Shared instance handed out by getInstance(); created on first use.
    private static AnalysisUtilities instance;
    // Verb conjugation table, loaded in the constructor.
    private VerbConjugator conjugator;
    // Head finder exposed through getHeadFinder().
    private CollinsHeadFinder headfinder;
    // Factory passed to PennTreeReader in readTreeFromString.
    private LabeledScoredTreeFactory tree_factory;
    // Used by normalizeTree to reduce decorated labels to their basic category.
    private PennTreebankLanguagePack tlp;

}