Java tutorial
// Question Generation via Overgenerating Transformations and Ranking // Copyright (c) 2008, 2009 Carnegie Mellon University. All Rights Reserved. // // This program is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. // // For more information, bug reports, fixes, contact: // Michael Heilman // Carnegie Mellon University // mheilman@cmu.edu // http://www.cs.cmu.edu/~mheilman // // 9/2012 Michael Kutschke: fix compatibility issues with newer versions of StanfordNLP tools package edu.cmu.ark; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.tregex.TregexMatcher; import edu.stanford.nlp.trees.tregex.TregexPattern; import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon; import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern; import edu.stanford.nlp.util.Pair; /** * Class for converting declarative statements into questions. This class and the WhPhraseGenerator * class constitute "stage 2" as discussed in papers on the system. * * @author Michael Heilman (mheilman@cs.cmu.edu) */ public class QuestionTransducer { public QuestionTransducer() { whGen = new WhPhraseGenerator(); numWHPhrases = 0; } /** * This method removes question objects that have duplicate yields (i.e., output strings). It * goes in order so that higher ranked questions, which are expected to appear first, will * remain. * * @param givenQuestions */ public static void removeDuplicateQuestions(Collection<Question> givenQuestions) { Map<String, Question> yieldMap = new HashMap<String, Question>(); String yield; //add questions that used NP Clarification first for (Question q : givenQuestions) { if (q.getFeatureValue("performedNPClarification") == 0.0) { continue; } yield = q.getTree().yield().toString(); if (yieldMap.containsKey(yield)) { if (GlobalProperties.getDebug()) System.err.println("Removing duplicate: " + yield); continue; } yieldMap.put(yield, q); } //now add any new questions that don't involve NP Clarification for (Question q : givenQuestions) { if (q.getFeatureValue("performedNPClarification") == 1.0) { continue; } yield = q.getTree().yield().toString(); if (yieldMap.containsKey(yield)) { if (GlobalProperties.getDebug()) System.err.println("Removing duplicate: " + yield); //if a previous question that involved NP Clarification has the same yield (i.e., text), //then mark it as using NP Clarification for the answer only Question other = yieldMap.get(yield); if (other.getFeatureValue("performedNPClarification") == 1.0 && other.getSourceSentenceNumber() == q.getSourceSentenceNumber()) { //other.setFeatureValue("performedNPClarificationAnswerOnly", 1.0); other.setFeatureValue("performedNPClarification", 0.0); } continue; } yieldMap.put(yield, q); } givenQuestions.clear(); givenQuestions.addAll(yieldMap.values()); } /** * This method identifies whether the question contains personal pronouns or demonstrative * pronouns (e.g., ``THAT was interesting''), so that the system (by default) avoids outputting * them. If the noun phrase clarification has resolved a personal pronoun to something within * the same sentence, then a question will be produced as output (e.g., John knew he would win * -> Who knew he would win?) * * @param q * @return */ public static boolean containsUnresolvedPronounsOrDemonstratives(Question q) { boolean res = false; String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; //return false if noun phrase clarification //has been performed (i.e., if there are pronouns left, they are OK). if (q.getFeatureValue("performedNPClarification") == 0.0) { tregexOpStr = "/^PRP/"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(q.getTree()); res |= matcher.find(); } //tregexOpStr = "NP < (DT < that|this|those|these !$ NN|NNP|NNPS|NP|NNS|SBAR)"; tregexOpStr = "NP < (DT < that|this|those|these)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(q.getTree()); res |= matcher.find(); return res; } public void generateQuestionsFromParse(String inputParseStr) { Tree inputTree = AnalysisUtilities.getInstance().readTreeFromString(inputParseStr); generateQuestionsFromParse(inputTree); } public void generateQuestionsFromParse(Tree inputTree) { Question q = new Question(); q.setSourceTree(inputTree); q.setIntermediateTree(inputTree.deepCopy()); generateQuestionsFromParse(q); } /** * The top-level method for converting declarative sentences into yes-no and WH questions. * * @param inputQuestion */ public void generateQuestionsFromParse(Question inputQuestion) { //initialize the array used to store the output questions questions = new ArrayList<Question>(); //check if this is a sentence we want to create questions from. //E.g., avoid blank sentences, fragments, and sentences that are already questions if (!isUsableInputSentence(inputQuestion.getIntermediateTree())) { if (GlobalProperties.getDebug()) System.err.println("Not a usable sentence."); return; } Tree answerPhrase; Question tmp1 = inputQuestion.deeperCopy(); Question tmp2 = null; List<Tree> outputTrees; if (GlobalProperties.getDebug()) System.err.println("getQuestionsFromParse: input: " + tmp1.toString()); whGen.setCurrentQuestion(tmp1); tmp1.setTree(tmp1.getIntermediateTree().deepCopy()); putLeadingAbverbPhrasesInsideVPs(tmp1.getTree()); AnalysisUtilities.downcaseFirstToken(tmp1.getTree()); //mark phrases that should not be answer phrases, //either due to syntactic constraints or conservative restrictions tmp1.setTree(markUnmovablePhrases(tmp1.getTree())); tmp1.setTree(markPossibleAnswerPhrases(tmp1.getTree())); if (GlobalProperties.getDebug()) System.err.println("Number of Possible WH questions: " + numWHPhrases + "\n"); //iterate over the possible answer phrases, generate //questions for each one for (int i = 0; i < numWHPhrases; i++) { tmp2 = tmp1.deeperCopy(); answerPhrase = getAnswerPhrase(tmp2.getTree(), i); answerPhrase = removeMarkersFromTree(answerPhrase.deepCopy()); //check whether the current answer phrase is the subject. //if not, then decompose the main verb and perform subject auxiliary inversion boolean subjectMovement = isSubjectMovement(tmp2.getTree(), i); if (subjectMovement) { ensureVerbAgreementForSubjectWH(tmp2.getTree()); if (GlobalProperties.getComputeFeatures()) tmp2.setFeatureValue("isSubjectMovement", 1.0); if (GlobalProperties.getComputeFeatures()) tmp2.setFeatureValue("whQuestion", 1.0); } else { tmp2.setTree(decomposePredicate(tmp2.getTree())); tmp2.setTree(subjectAuxiliaryInversion(tmp2.getTree())); if (GlobalProperties.getComputeFeatures()) tmp2.setFeatureValue("isSubjectMovement", 0.0); if (GlobalProperties.getComputeFeatures()) tmp2.setFeatureValue("whQuestion", 1.0); } tmp2.setTree(relabelMainClause(tmp2.getTree())); //Now generate questions by analyzing the answer phrase and choosing possible //question words (e.g., what, who) from it. //Then, remove the answer phrase and put the question phrase at //the front of the main clause before the subject. outputTrees = moveWHPhrase(tmp2.getTree(), tmp2.getIntermediateTree(), i, subjectMovement); //post-process and filter the output for (Tree t : outputTrees) { tmp2 = tmp2.deeperCopy(); tmp2.setTree(t); AnalysisUtilities.upcaseFirstToken(tmp2.getTree()); relabelPunctuationAsQuestionMark(tmp2.getTree()); tmp2.setAnswerPhraseTree(answerPhrase); if (GlobalProperties.getComputeFeatures()) QuestionFeatureExtractor.getInstance().extractFinalFeatures(tmp2); if (avoidPronounsAndDemonstratives && (containsUnresolvedPronounsOrDemonstratives(tmp2))) { if (GlobalProperties.getDebug()) System.err.println("generateQuestionsFromParse: skipping due to pronouns"); } else { questions.add(tmp2); } if (GlobalProperties.getDebug()) System.err.println(); } } //add a yes-no question by performing subject auxiliary inversion tmp2 = tmp1.deeperCopy(); tmp2.setTree(decomposePredicate(tmp2.getTree())); if (canInvert(tmp2.getTree())) { tmp2.setTree(removeMarkersFromTree(tmp2.getTree())); tmp2.setTree(subjectAuxiliaryInversion(tmp2.getTree())); tmp2.setTree(relabelMainClause(tmp2.getTree())); tmp2.setTree(moveLeadingAdjuncts(tmp2.getTree())); relabelPunctuationAsQuestionMark(tmp2.getTree()); AnalysisUtilities.upcaseFirstToken(tmp2.getTree()); tmp2.setAnswerPhraseTree(null); if (GlobalProperties.getComputeFeatures()) tmp2.setFeatureValue("isSubjectMovement", 0.0); if (GlobalProperties.getComputeFeatures()) tmp2.setFeatureValue("whQuestion", 0.0); if (GlobalProperties.getComputeFeatures()) QuestionFeatureExtractor.getInstance().extractFinalFeatures(tmp2); if (avoidPronounsAndDemonstratives && containsUnresolvedPronounsOrDemonstratives(tmp2)) { if (GlobalProperties.getDebug()) System.err.println("generateQuestionsFromParse: skipping due to pronouns"); } else { questions.add(tmp2); } if (GlobalProperties.getDebug()) System.err.println(); } } private void relabelPunctuationAsQuestionMark(Tree inputTree) { List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); String tregexOpStr; List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); TregexPattern matchPattern; TsurgeonPattern p; tregexOpStr = "/^\\./ < /^\\./=period"; ps.add(Tsurgeon.parseOperation("relabel period /?/")); matchPattern = TregexPatternFactory.getPattern(tregexOpStr); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, inputTree); //Make sure there is a question mark at the end. //This catches odd cases like "I live in Pittsburg, PA.", //where the parser think the period at the end is part of an abbreviation. tregexOpStr = "/^\\./"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); if (!matchPattern.matcher(inputTree).find()) { inputTree.getChild(0).addChild(AnalysisUtilities.getInstance().readTreeFromString("(. ?)")); } } /** * Identifies whether a particular answer phrase i is the subject of the sentence. e.g., returns * true for: sentence: John met Sally. question: Who met Sally? returns false for: sentence: * John met Sally. question: Who did John meet? */ private boolean isSubjectMovement(Tree inputTree, int i) { String tregexOpStr = "ROOT=root < (S < NP-" + i + "|SBAR-" + i + ")"; TregexPattern matchPattern = TregexPatternFactory.getPattern(tregexOpStr); TregexMatcher matcher = matchPattern.matcher(inputTree); boolean res = matcher.find(); return res; } /** * The Stanford Parser (or maybe the Penn Treebank) oddly seems to only rarely include adverbs * that precede verbs in verb phrases (e.g., ''oddly'' in this sentence). This method adjusts * for that. */ private void putLeadingAbverbPhrasesInsideVPs(Tree inputTree) { List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); String tregexOpStr; List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); TregexPattern matchPattern; TsurgeonPattern p; tregexOpStr = "ADVP=mover $. VP=vp"; ps.add(Tsurgeon.parseOperation("move mover >0 vp")); matchPattern = TregexPatternFactory.getPattern(tregexOpStr); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, inputTree); } private Tree markUnmovablePhrases(Tree inputTree) { if (noAnswerPhraseMarking) { return inputTree.deepCopy(); } else { return markUnmovablePhrasesFull(inputTree); } } /** * This method marks phrases in the tree that should not undergo WH movement and become answers * to questions, either due to syntactic constraints or some conservative restrictions used to * avoid particular constructions that the system is not designed to handle. E.g., Sentence: * Darwin studied how SPECIES evolve. Avoided Question: * What did Darwin study how evolve? */ private Tree markUnmovablePhrasesFull(Tree inputTree) { Tree copyTree = inputTree.deepCopy(); //adjunct clauses under verb phrases (following commas) markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (VP < (S=unmovable $,, /,/))"); //anything under a sentence level subordinate clause markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root < (S < PP|ADJP|ADVP|S|SBAR=unmovable)"); //anything under a phrase directly dominating a conjunction markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (/\\.*/ < CC << NP|ADJP|VP|ADVP|PP=unmovable)"); //adjunct clauses -- assume subordinate clauses that have a complementizer other than "that" (or empty) are adjuncts markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (SBAR < (IN|DT < /[^that]/) << NP|PP=unmovable)"); //anything under a WH phrase markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (SBAR < /^WH.*P$/ << NP|ADJP|VP|ADVP|PP=unmovable)"); //"Complementizer-trace effect" //the subject of a complement phrase when an explicit complementizer is present (e.g., I knew that JOHN ran.) markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (SBAR <, IN|DT < (S < (NP=unmovable !$,, VP)))"); //anything under a clause that is a predicate nominative (e.g., my favorite activity is to run in THE PARK) markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (S < (VP <+(VP) (VB|VBD|VBN|VBZ < be|being|been|is|are|was|were|am) <+(VP) (S << NP|ADJP|VP|ADVP|PP=unmovable)))"); //objects of prepositional phrases with prepositions other than "of" or "about". //"of" and "about" signal that the modifier is a complement rather than an adjunct. //allows: "John visited the capital of Alaska." -> "What did John visit the capital of?" //disallows: "John visited a city in Alaska." -> ? "What did John visit a city in?" markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (NP << (PP=unmovable !< (IN < of|about)))"); //nested prepositional phrases of any kind //disallows: "Bill saw John in the hall of mirrors." -> * "What did Bill see John in the hall of?" markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (PP << PP=unmovable)"); //prepositional phrases in subjects (e.g., disallows: "The capital of Alaska is Juneau." -> * "What is the capital of Juneau?") //Nothing can be moved out of subjects. //I think the generative account is that phrases can only be moved to the level of the verb //that governs them, and subjects (along with adjuncts) are not governed by the verb. markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (NP $ VP << PP=unmovable)"); //subordinate clauses that are not complements of verbs markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (SBAR=unmovable [ !> VP | $-- /,/ | < RB ])"); //adjunct subordinate clauses //"how", "whether", and "that" under IN or WHADVP nodes signal complements. //WHNP always signals a complement. //otherwise, the SBAR is an adjunct. //Note: we mark words like "where" as unmovable because they are potentially adjuncts. // e.g., "he knew where it was" has a complement, but "he went to college where he grew up" has an adjunct markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (SBAR=unmovable !< WHNP < (/^[^S].*/ !<< that|whether|how))"); //dominates a non-S node that doesn't include one of the unambiguous complementizers ////////////////////////////////////////////////////////////// //MARK SOME AS UNMOVABLE TO AVOID OBVIOUSLY BAD QUESTIONS // //existential there NPs markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (NP=unmovable < EX)"); //phrases in quotations markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (/^S/ < `` << NP|ADJP|VP|ADVP|PP=unmovable)"); //prepositional phrases that don't have NP objects markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (PP=unmovable !< /.*NP/)"); //pronouns which are the subject of complement verb phrases //These would nearly always lead to silly/tricky questions (e.g., "GM says its profits will fall." -> "Whose profits did GM say will fall?") //markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (VP < (SBAR < (S <<, (NP=unmovable < PRP))))"); //both NPs that are under an S (MJH: we are punting on this). //If there are multiple NPs, one may be a temporal modifier markMultipleNPsAsUnmovable(copyTree); ///////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////// //PROPAGATE ABOVE CONSTRAINTS //any non-PP phrases under otherwise movable phrases (we assume movable phrases serve as islands) markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (NP|PP|ADJP|ADVP|PP << (NP|ADJP|VP|ADVP=unmovable))"); //anything under an unmovable node markNodesAsUnmovableUsingPattern(copyTree, "ROOT=root << (@UNMOVABLE << NP|ADJP|VP|ADVP|PP=unmovable)"); if (GlobalProperties.getDebug()) System.err.println("markUnmovablePhrases: " + copyTree.toString()); return copyTree; } /** * This method is used to mark noun phrases that are sisters of each other, such as in double * object dative constructions. I could not figure out how to get Tsurgeon to do this easily, so * phrases are just marked using the stanford parser API instead. E.g., sentence: John gave Mary * the book. avoided question: * Who did John give the book? (the system doesn't convert * "indirect" objects to oblique arguments) * * @param inputTree */ private void markMultipleNPsAsUnmovable(Tree inputTree) { List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); TregexPattern matchPattern; TsurgeonPattern p; String tregexOpStr = "(NP=unmovable $ @NP)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); ps.add(Tsurgeon.parseOperation("relabel unmovable NP-UNMOVABLE")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, inputTree); ops.clear(); ps.clear(); tregexOpStr = "NP-UNMOVABLE=unmovable"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); ps.add(Tsurgeon.parseOperation("relabel unmovable UNMOVABLE-NP")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, inputTree); } private Tree removeMarkersFromTree(Tree inputTree) { if (inputTree == null) return null; Tree res; String treeStr = inputTree.toString(); treeStr = treeStr.replaceAll("UNMOVABLE-", ""); treeStr = treeStr.replaceAll("-\\d+ ", " "); res = AnalysisUtilities.getInstance().readTreeFromString(treeStr); return res; } /** * Note: It would probably be easier to use the Tregex operation to find the nodes and then * change the labels directly rather than writing a Tsurgeon operation. But, when I wrote the * original code, I used Tsurgeon. Probably not worth refactoring. * * @param inputTree * @param tregexOpStr */ private void markNodesAsUnmovableUsingPattern(Tree inputTree, String tregexOpStr) { TregexPattern matchPattern; TregexMatcher matcher; String label; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(inputTree); Tree tmp; while (matcher.find()) { tmp = matcher.getNode("unmovable"); label = tmp.label().value(); tmp.label().setValue("UNMOVABLE-" + label); } } /** * Thsi method returns the node for the ith possible answer phrase in this sentence (after * potential answer phrases have been identified by marking unmovable ones) * * @param inputTree * @param i * @return */ private Tree getAnswerPhrase(Tree inputTree, int i) { Tree answerPhrase; String tregexOpStr; TregexPattern matchPattern; String marker = "/^(NP|PP|SBAR)-" + i + "$/"; tregexOpStr = marker + "=answer"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); if (GlobalProperties.getDebug()) System.err.println("moveWHPhrase: inputTree:" + inputTree.toString()); //if(GlobalProperties.getDebug()) System.err.println("moveWHPhrase: tregexOpStr:" + tregexOpStr); TregexMatcher matcher = matchPattern.matcher(inputTree); matcher.find(); answerPhrase = matcher.getNode("answer"); return answerPhrase; } /** * This method removes the answer phrase from its original position and places it at the front * of the main clause. Note: Tsurgeon operations are perhaps not optimal here. Using the * Stanford API to move nodes directly might be simpler... */ private List<Tree> moveWHPhrase(Tree inputTree, Tree intermediateTree, int i, boolean subjectMovement) { Tree copyTree; Tree copyTree2; List<Tree> res = new ArrayList<Tree>(); Tree mainclauseNode; Tree prepPlaceholderParent; String marker = "/^(NP|PP|SBAR)-" + i + "$/"; List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); String tregexOpStr; TregexPattern matchPattern; TsurgeonPattern p; //extract the "answer" phrase and generate a WH phrase from it tregexOpStr = "ROOT=root < (SQ=qclause << " + marker + "=answer < VP=predicate)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); if (GlobalProperties.getDebug()) System.err.println("moveWHPhrase: inputTree:" + inputTree.toString()); if (GlobalProperties.getDebug()) System.err.println("moveWHPhrase: tregexOpStr:" + tregexOpStr); TregexMatcher matcher = matchPattern.matcher(inputTree); matcher.find(); Tree phraseToMove = matcher.getNode("answer"); String whPhraseSubtree; String leftOverPreposition; if (printExtractedPhrases) System.out.println("EXTRACTED\t" + phraseToMove.yield().toString()); whGen.generateWHPhraseSubtrees(removeMarkersFromTree(phraseToMove), StringUtils .join(Arrays.asList(AnalysisUtilities.stringArrayFromLabels(intermediateTree.yield())), " ")); List<String> whPhraseSubtrees = whGen.getWHPhraseSubtrees(); List<String> leftOverPrepositions = whGen.getLeftOverPrepositions(); copyTree = inputTree.deepCopy(); //The placeholder is necessary because tsurgeon will complain //if an added node has no children. This placeholder is removed below. ps.add(Tsurgeon.parseOperation("insert (PREPPLACEHOLDER dummy) $+ answer")); ps.add(Tsurgeon.parseOperation("prune answer")); ps.add(Tsurgeon.parseOperation("insert (SBARQ PLACEHOLDER) >0 root")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); //move qclause in (9/2012 Michael Kutschke: seperated this from above operation, as newer TSurgeon versions // don't seem to support naming nodes created through TSurgeon operations) ps.clear(); tregexOpStr = "ROOT < SBARQ=mainclause < SQ=qclause"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); ps.add(Tsurgeon.parseOperation("move qclause >-1 mainclause")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, copyTree); copyTree = removeMarkersFromTree(copyTree); //Now put each WH phrase into the tree and remove the original answer. //Operate on the tree directly rather than using tsurgeon //because tsurgeon can't parse operations that insert trees with special characters (e.g., ":") for (int j = 0; j < whPhraseSubtrees.size(); j++) { copyTree2 = copyTree.deepCopy(); whPhraseSubtree = whPhraseSubtrees.get(j); leftOverPreposition = leftOverPrepositions.get(j); if (GlobalProperties.getDebug()) System.err.println("moveWHPhrase: whPhraseSubtree:" + whPhraseSubtree); tregexOpStr = "ROOT < (SBARQ=mainclause < PLACEHOLDER=ph1) << (__=ph2Parent < PREPPLACEHOLDER=ph2)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(copyTree2); if (!matcher.find()) { continue; } mainclauseNode = matcher.getNode("mainclause"); //replace the wh placeholder with a wh phrase mainclauseNode.removeChild(0); mainclauseNode.addChild(0, AnalysisUtilities.getInstance().readTreeFromString(whPhraseSubtree)); //Replace the pp placeholder with the left over preposition. //This may happen when the answer phrase was a PP. //e.g., John went to the game. -> What did John go to? prepPlaceholderParent = matcher.getNode("ph2Parent"); int index = prepPlaceholderParent.indexOf(matcher.getNode("ph2")); if (leftOverPreposition != null && leftOverPreposition.length() > 0) { prepPlaceholderParent.addChild(index, AnalysisUtilities.getInstance().readTreeFromString(leftOverPreposition)); } //now remove the left-over-preposition placeholder ps.clear(); ps.add(Tsurgeon.parseOperation("prune ph2")); p = Tsurgeon.collectOperations(ps); ops.clear(); ops.add(new Pair<TregexPattern, TsurgeonPattern>(TregexPatternFactory.getPattern("PREPPLACEHOLDER=ph2"), p)); Tsurgeon.processPatternsOnTree(ops, copyTree2); copyTree2 = moveLeadingAdjuncts(copyTree2); if (GlobalProperties.getDebug()) System.err.println("moveWHPhrase: " + copyTree2.toString()); res.add(copyTree2); } return res; } /** * This method moves adjunct phrases that appear prior to the first possible subject. e.g., in * order to produce "WHILE I WAS AT THE STORE, who did I meet?" from * "WHILE I WAS AT THE STORE, I met him." This operation is not actually used in the full system * because leading modifiers are either moved or removed by the simplified factual statement * extraction step in stage 1. */ private Tree moveLeadingAdjuncts(Tree inputTree) { if (GlobalProperties.getDebug()) System.err.println("moveLeadingAdjuncts:" + inputTree.toString()); Tree copyTree = inputTree.deepCopy(); String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; boolean matchFound = true; List<Pair<TregexPattern, TsurgeonPattern>> ops; List<TsurgeonPattern> ps; TsurgeonPattern p; while (true) { ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); ps = new ArrayList<TsurgeonPattern>(); tregexOpStr = "TMPROOT=root"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(copyTree); matchFound = matcher.find(); ps.add(Tsurgeon.parseOperation("relabel root ROOT")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, copyTree); ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); ps = new ArrayList<TsurgeonPattern>(); //for yes/no questions, find any phrases that precede the first possible subject (NP|SBAR) // and move them to the front of the question clause. tregexOpStr = "ROOT=root < (SQ=mainclause < (/,|ADVP|ADJP|SBAR|S|PP/=mover $,, /MD|VB.*/=pivot $ NP=subject))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(copyTree); matchFound = matcher.find(); if (!matchFound) { //for WH questions, move any phrases that precede the first potential subject //--or verb phrase for when the original subject is the answer phrase tregexOpStr = "ROOT=root < (SBARQ=mainclause < WHNP|WHPP|WHADJP|WHADVP=pivot < (SQ=invertedclause < (/,|S|ADVP|ADJP|SBAR|PP/=mover !$,, /\\*/ $.. /^VP|VB.*/)))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(copyTree); matchFound = matcher.find(); } if (!matchFound) { break; } //need to relabel as TMPROOT so things are moved one at a time, to preserve their order ps.add(Tsurgeon.parseOperation("move mover $+ pivot")); ps.add(Tsurgeon.parseOperation("relabel root TMPROOT")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, copyTree); //System.err.println("moving..."+copyTree.toString()); } //remove extra commas for sentences like "Bill, while walking, saw John." Tree firstChild = copyTree.getChild(0); if (firstChild.getChild(0).label().value().equals(",")) { firstChild.removeChild(0); } if (GlobalProperties.getDebug()) System.err.println("moveLeadingAdjuncts(out):" + copyTree.toString()); return copyTree; } /** * This method decomposes the main verb of the sentence for yes-no questions and WH questions * where the answer phrase is not the subject. e.g., I met John -> I did meet John. (which would * later become "Who did I meet?") */ private Tree decomposePredicate(Tree inputTree) { Tree copyTree = inputTree.deepCopy(); List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); String tregexOpStr; List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); TregexPattern matchPattern; TsurgeonPattern p; TregexMatcher matcher; Tree tmpNode; //tregexOpStr = "ROOT < (S=mainclause < (VP=predphrase < /VB.?/=tensedverb !< (VP < /VB.?/)))"; //tregexOpStr = "ROOT < (S=mainclause < (VP=predphrase < (/VB.?/=tensedverb !< is|was|were|am|are|has|have|had|do|does|did)))"; //This rather complex rule identifies predicates to decompose. //There are two cases, separated by a disjunction. //One could break it apart into separate rules to make it simpler... // //The first part of the disjunction //(i.e., < (/VB.?/=tensedverb !< is|was|were|am|are|has|have|had|do|does|did) ) //is for handling basic sentences //(e.g., John bought an apple -> What did John buy?), //sentences with auxiliaries //(e.g., John had bought an apple -> Had John bought an apple?), //and sentences with participial phrases //(e.g., John seemed finished with the apple -> What did John seem finished with?). // //The second part of the disjunction //(i.e., < /VB.?/=tensedverb !< VP ) //is for handling sentences that have predicates //that can also be auxiliaries (e.g., I have a book). //In these cases, we do want to decompose have, has, had, etc. //(e.g., What did I have?) tregexOpStr = "ROOT < (S=mainclause < (VP=predphrase [ < (/VB.?/=tensedverb !< is|was|were|am|are|has|have|had|do|does|did) | < /VB.?/=tensedverb !< VP ]) ! < MAINVP)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(copyTree); if (matcher.find()) { Tree subtree = matcher.getNode("tensedverb"); String lemma = AnalysisUtilities.getInstance().getLemma(subtree.getChild(0).label().value(), subtree.label().value()); String aux = getAuxiliarySubtree(subtree); if (!lemma.equals("be")) { ps.add(Tsurgeon.parseOperation("insert (MAINVP PLACEHOLDER) $+ predphrase")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); // 9/2012 Michael Kutschke: split these two steps as naming nodes created from TSurgeon operations // is unsupported in newer TSurgeon versions ps.clear(); tregexOpStr = "ROOT < (S=mainclause < (VP=predphrase [ < (/VB.?/=tensedverb !< is|was|were|am|are|has|have|had|do|does|did) | < /VB.?/=tensedverb !< VP ]) < MAINVP=mainvp)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); ps.add(Tsurgeon.parseOperation("move predphrase >-1 mainvp")); ps.add(Tsurgeon.parseOperation("insert (VBLEMMA PLACEHOLDER) $+ tensedverb")); ps.add(Tsurgeon.parseOperation("delete tensedverb")); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, copyTree); matchPattern = TregexPatternFactory.getPattern("MAINVP=mainvp"); matcher = matchPattern.matcher(copyTree); matcher.find(); tmpNode = matcher.getNode("mainvp"); tmpNode.removeChild(0); tmpNode.label().setValue("VP"); tmpNode.addChild(0, AnalysisUtilities.getInstance().readTreeFromString(aux)); matchPattern = TregexPatternFactory.getPattern("VBLEMMA=vblemma"); matcher = matchPattern.matcher(copyTree); matcher.find(); tmpNode = matcher.getNode("vblemma"); tmpNode.removeChild(0); tmpNode.label().setValue("VB"); tmpNode.addChild(0, tmpNode.treeFactory().newLeaf(lemma)); } } if (GlobalProperties.getDebug()) System.err.println("decomposePredicate: " + copyTree.toString()); return copyTree; } /** * Returns the singular present tense form of a tensed verb. This only affects the output when * generating from sentences where first and second person pronouns are the subject. E.g., * Affects: I walk -> Who walks? (rather than, Who walk?) Does not affect: He walks -> Who * walks? */ private String getSingularFormSubtree(Tree tensedVerbSubtree) { String res = ""; String lemma = AnalysisUtilities.getInstance().getLemma(tensedVerbSubtree.getChild(0).label().value(), tensedVerbSubtree.label().value()); String pos = tensedVerbSubtree.value(); if (pos.equals("VBD")) { res = tensedVerbSubtree.toString(); } else { res = "(VBZ " + AnalysisUtilities.getInstance().getSurfaceForm(lemma, "VBZ") + ")"; } return res; } /** * This method is used to decompose the main verb. e.g., input: (VBD walked) output: (VBD did) * Note: another method would extract the base form of the verb "(VB walk)" * * @param tensedverb * @return */ private String getAuxiliarySubtree(Tree tensedverb) { if (tensedverb == null) { return ""; } String res = ""; String label; Pattern p = Pattern.compile("\\((\\S+) [^\\)]*\\)"); Matcher m = p.matcher(tensedverb.toString()); m.find(); label = m.group(1); if (label.equals("VBD")) { res = "(VBD did)"; } else if (label.equals("VBZ")) { res = "(VBZ does)"; } else if (label.equals("VBP")) { res = "(VBP do)"; } else { res = "(VB do)"; } return res; } /** * relabels the main clause from S (declarative sentence clause) to SQ (inverted question * clause) * * @param inputTree * @return */ private Tree relabelMainClause(Tree inputTree) { Tree copyTree = inputTree.deepCopy(); String tregexOpStr = "ROOT < S=mainclause"; TregexPattern matchPattern = TregexPatternFactory.getPattern(tregexOpStr); TregexMatcher m = matchPattern.matcher(copyTree); if (m.matches()) { m.getNode("mainclause").label().setValue("SQ"); } return copyTree; } /** * Moves an auxiliary verb to the front of the main clause (i.e., before the subject). This is * used in yes-no questions and WH questions where the answer phrase is not the subject E.g., * John did meet Paul -> Did John meet Paul (which will then become "Who did John meet?") */ private Tree subjectAuxiliaryInversion(Tree inputTree) { Tree copyTree = inputTree.deepCopy(); List<Pair<TregexPattern, TsurgeonPattern>> ops = new ArrayList<Pair<TregexPattern, TsurgeonPattern>>(); List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); String tregexOpStr; TregexPattern matchPattern; TsurgeonPattern p; //auxilaries tregexOpStr = "ROOT=root < (S=mainclause <+(/VP.*/) (VP < /(MD|VB.?)/=aux < (VP < /VB.?/=baseform)))"; ps.add(Tsurgeon.parseOperation("relabel root TMPROOT")); ps.add(Tsurgeon.parseOperation("prune aux")); ps.add(Tsurgeon.parseOperation("insert aux >0 mainclause")); matchPattern = TregexPatternFactory.getPattern(tregexOpStr); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, copyTree); //copula ops.clear(); ps.clear(); tregexOpStr = "ROOT=root < (S=mainclause <+(/VP.*/) (VP < (/VB.?/=copula < is|are|was|were|am) !< VP))"; ps.add(Tsurgeon.parseOperation("relabel root TMPROOT")); ps.add(Tsurgeon.parseOperation("prune copula\n")); ps.add(Tsurgeon.parseOperation("insert copula >0 mainclause")); matchPattern = TregexPatternFactory.getPattern(tregexOpStr); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, copyTree); ops.clear(); ps.clear(); tregexOpStr = "TMPROOT=root"; ps.add(Tsurgeon.parseOperation("relabel root ROOT")); matchPattern = TregexPatternFactory.getPattern(tregexOpStr); p = Tsurgeon.collectOperations(ps); ops.add(new Pair<TregexPattern, TsurgeonPattern>(matchPattern, p)); Tsurgeon.processPatternsOnTree(ops, copyTree); if (GlobalProperties.getDebug()) System.err.println("subjectAuxiliaryInversion: " + copyTree.toString()); return copyTree; } /** * Changes the inflection of the main verb for questions with first and second person pronouns * are the subject. Note: this probably isn't necessary for most applications. E.g., Affects: I * walk -> Who walks? (rather than, Who walk?) Does not affect: He walks -> Who walks? */ private void ensureVerbAgreementForSubjectWH(Tree inputTree) { String tregexOpStr; TregexMatcher matcher; TregexPattern matchPattern; Tree subjectTree; String subjectString; tregexOpStr = "/^(NP|PP|SBAR)-" + 0 + "$/"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(inputTree); if (matcher.find()) { subjectTree = matcher.getMatch(); subjectString = AnalysisUtilities.orginialSentence(subjectTree.yield()); if (subjectString.equalsIgnoreCase("I") || subjectString.equalsIgnoreCase("you")) { tregexOpStr = "ROOT=root < (S=mainclause < (VP=verbphrase < (/VB.?/=tensedverb)))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(inputTree); if (matcher.find()) { Tree verbSubtree = matcher.getNode("tensedverb"); Tree vpSubtree = matcher.getNode("verbphrase"); Tree singularFormSubtree = AnalysisUtilities.getInstance() .readTreeFromString(getSingularFormSubtree(verbSubtree)); int index = vpSubtree.indexOf(verbSubtree); vpSubtree.removeChild(index); vpSubtree.addChild(index, singularFormSubtree); if (GlobalProperties.getDebug()) System.err.println("ensureVerbAgreementForSubjectWH: " + inputTree.toString()); } } } } /** * Marks possible answer phrase nodes with indexes for later processing. This step might be * easier with the Stanford Parser API's Tree class methods than with Tsurgeon... * * @param inputTree * @return */ private Tree markPossibleAnswerPhrases(Tree inputTree) { Tree copyTree = inputTree.deepCopy(); numWHPhrases = 0; List<TsurgeonPattern> ps = new ArrayList<TsurgeonPattern>(); String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; Tree tmp; //find and mark the main clause subject tregexOpStr = "ROOT < (S < (NP|SBAR=subj $+ /,/ !$++ NP|SBAR))"; ps.add(Tsurgeon.parseOperation("relabel subj NP-0")); matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(copyTree); if (matcher.find()) { tmp = matcher.getNode("subj"); tmp.label().setValue(tmp.label().value() + "-0"); numWHPhrases++; } //noun phrases tregexOpStr = "ROOT=root << NP|PP|SBAR=np"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(copyTree); while (matcher.find()) { tmp = matcher.getNode("np"); tmp.label().setValue(tmp.label().value() + "-" + numWHPhrases); numWHPhrases++; } if (GlobalProperties.getDebug()) System.err.println("markPossibleAnswerPhrases: " + copyTree.toString()); return copyTree; } /** * returns whether to perform subject-aux inversion (true if there is an auxiliary or modal verb * in addition to the predicate) E.g., true for "John did meet Paul" (which could lead to * "Who did John meet?") false for "John met Paul" (which could lead to "Who met Paul?") Note * that this occurs after the main verb decomposition step (which depends on whether the answer * phrase is the subject or not) * * @param inputTree * @return */ private boolean canInvert(Tree inputTree) { String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; tregexOpStr = "ROOT < (S < (VP < /(MD|VB.?)/))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(inputTree); return matcher.matches(); } /** * Filters out some stuff we don't want to process. Note: this method is somewhat redundant with * a similar method in stage 1. * * @param inputTree * @return */ private boolean isUsableInputSentence(Tree inputTree) { boolean res = false; String tregexOpStr; TregexPattern matchPattern; TregexMatcher matcher; //skip if there are leading conjunctions (need to drop these during stage 1) tregexOpStr = "ROOT < (S=mainclause < CC=frontedconj)"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); if (matchPattern.matcher(inputTree).matches()) { return false; } //make sure this is not just a single node tregexOpStr = "/\\./ !< /\\./"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); if (matchPattern.matcher(inputTree).matches()) { return false; } //MAKE SURE THERE IS A RECOGNIZABLE SUBJECT //PUNT IF THERE IS A NON-NP SUBJECT //also, avoid "there are ..." sentences tregexOpStr = "ROOT < (S < (NP !< EX))"; matchPattern = TregexPatternFactory.getPattern(tregexOpStr); matcher = matchPattern.matcher(inputTree); res = matcher.matches(); return res; } protected void setPrintExtractedPhrases(boolean b) { printExtractedPhrases = b; } public void setAvoidPronounsAndDemonstratives(boolean b) { avoidPronounsAndDemonstratives = b; } public boolean getAvoidPronounsAndDemonstratives() { return avoidPronounsAndDemonstratives; } public List<Question> getQuestions() { return questions; } public void setNoAnswerPhraseMarking(boolean b) { this.noAnswerPhraseMarking = b; } public boolean getNoAnswerPhraseMarking() { return noAnswerPhraseMarking; } /** * main method for testing stage 2 in isolation. The QuestionAsker class's main method should be * used to generate questions from the end-to-end system. * * @param args */ public static void main(String[] args) { QuestionTransducer qt = new QuestionTransducer(); AnalysisUtilities.getInstance(); String buf; Tree inputTree; boolean printParse = false; boolean printOriginal = false; boolean treeInput = false; boolean printFeatures = false; Set<Question> inputTrees = new HashSet<Question>(); qt.setAvoidPronounsAndDemonstratives(true); for (int i = 0; i < args.length; i++) { if (args[i].equals("--debug")) { GlobalProperties.setDebug(true); } else if (args[i].equals("--print-parse")) { printParse = true; } else if (args[i].equals("--print-original")) { printOriginal = true; } else if (args[i].equals("--print-features")) { printFeatures = true; } else if (args[i].equals("--print-extracted-phrases")) { qt.setPrintExtractedPhrases(true); } else if (args[i].equals("--tree-input")) { treeInput = true; } else if (args[i].equals("--keep-pro")) { qt.setAvoidPronounsAndDemonstratives(false); } else if (args[i].equals("--properties")) { GlobalProperties.loadProperties(args[i + 1]); } } try { BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); //take input from the user on stdin if (GlobalProperties.getDebug()) System.err.println("\nInput Declarative Sentence:"); while ((buf = br.readLine()) != null) { if (treeInput) { buf = AnalysisUtilities.preprocessTreeString(buf); inputTree = AnalysisUtilities.getInstance().readTreeFromString(buf); AnalysisUtilities.getInstance().normalizeTree(inputTree); } else { if (AnalysisUtilities.filterOutSentenceByPunctuation(buf)) { continue; } buf = AnalysisUtilities.preprocess(buf); if (printOriginal) System.out.println("\n" + buf); ParseResult parseRes = AnalysisUtilities.getInstance().parseSentence(buf); inputTree = parseRes.parse; if (GlobalProperties.getDebug()) System.err.println("Parse Score: " + parseRes.score); } if (printParse) System.out.println(inputTree); inputTrees.clear(); Question tmp = new Question(); tmp.setIntermediateTree(inputTree.deepCopy()); tmp.setSourceTree(inputTree); inputTrees.add(tmp); //iterate over the trees given by the input List<Question> questions; for (Question q : inputTrees) { try { qt.generateQuestionsFromParse(q); questions = qt.getQuestions(); QuestionTransducer.removeDuplicateQuestions(questions); //iterate over the questions for each tree for (Question curQuestion : questions) { System.out.print(curQuestion.yield()); if (printFeatures) { System.out.print("\t"); int cnt = 0; for (Double val : curQuestion.featureValueList()) { if (cnt > 0) System.out.print(";"); System.out.print(NumberFormat.getInstance().format(val)); cnt++; } } System.out.println(); } } catch (Exception e) { e.printStackTrace(); } } if (GlobalProperties.getDebug()) System.err.println("\nInput Declarative Sentence:"); } } catch (IOException e) { e.printStackTrace(); } } int numWHPhrases; //the number of possible answer phrases identified in the source sentence. private boolean avoidPronounsAndDemonstratives; //don't produce questions with pronouns private List<Question> questions; //output questions, co-indexed with sourceTrees and featureValueLists private WhPhraseGenerator whGen; private boolean printExtractedPhrases; //whether or not to print out answer phrases private boolean noAnswerPhraseMarking = false; }