Java tutorial: a walk through OpenNLP's AbstractBottomUpParser
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.parser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

import opennlp.tools.chunker.Chunker;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.ngram.NGramModel;
import opennlp.tools.parser.chunking.ParserEventStream;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
import opennlp.tools.util.TrainingParameters;

/**
 * Abstract class which contains code to tag and chunk parses for bottom-up parsing and
 * leaves the implementation of advancing and completing parses to extending classes.
 * <p>
 * <b>Note:</b><br>
 * The nodes within the returned parses are shared with other parses
 * and therefore their parent node references will not be consistent with their child
 * node references. {@link #setParents setParents} can be used to make the parents consistent
 * with a particular parse, but subsequent calls to <code>setParents</code> can invalidate
 * the results of earlier calls.
 */
public abstract class AbstractBottomUpParser implements Parser {

  /**
   * The maximum number of parses advanced from all preceding
   * parses at each derivation step.
   */
  protected int M;

  /**
   * The maximum number of parses to advance from a single preceding parse.
   */
  protected int K;

  /**
   * The minimum total probability mass of advanced outcomes.
   */
  protected double Q;

  /**
   * The default beam size used if no beam size is given.
   */
  public static final int defaultBeamSize = 20;

  /**
   * The default amount of probability mass required of advanced outcomes.
   */
  public static final double defaultAdvancePercentage = 0.95;

  /**
   * Completed parses.
   */
  private SortedSet<Parse> completeParses;

  /**
   * Incomplete parses which will be advanced.
   */
  private SortedSet<Parse> odh;

  /**
   * Incomplete parses which have been advanced.
   */
  private SortedSet<Parse> ndh;

  /**
   * The head rules for the parser.
   */
  protected HeadRules headRules;

  /**
   * The set of strings which are considered punctuation for the parser.
   * Punctuation is not attached, but floats to the top of the parse as attachment
   * decisions are made about its non-punctuation sister nodes.
   */
  protected Set<String> punctSet;

  /**
   * The label for the top node.
   */
  public static final String TOP_NODE = "TOP";

  /**
   * The label for the top node of an incomplete parse.
   */
  public static final String INC_NODE = "INC";

  /**
   * The label for a token node.
   */
  public static final String TOK_NODE = "TK";

  /**
   * The integer 0.
   */
  public static final Integer ZERO = 0;
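  // Note (illustrative, not part of the original source): the constructor further
  // below sets M and K to the same beamSize and Q to advancePercentage. With the
  // defaults (20 and 0.95), each derivation step advances at most 20 parses in
  // total, at most 20 from any single preceding parse, and advanceParses is asked
  // for outcomes covering at least 95% of the probability mass.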
  /**
   * Prefix for outcomes starting a constituent.
   */
  public static final String START = "S-";

  /**
   * Prefix for outcomes continuing a constituent.
   */
  public static final String CONT = "C-";

  /**
   * Outcome for a token which is not contained in a basal constituent.
   */
  public static final String OTHER = "O";

  /**
   * Outcome used when a constituent is complete.
   */
  public static final String COMPLETE = "c";

  /**
   * Outcome used when a constituent is incomplete.
   */
  public static final String INCOMPLETE = "i";

  /**
   * The pos-tagger that the parser uses.
   */
  protected POSTagger tagger;

  /**
   * The chunker that the parser uses to chunk non-recursive structures.
   */
  protected Chunker chunker;

  /**
   * Specifies whether failed parses should be reported to standard error.
   */
  protected boolean reportFailedParse;

  /**
   * Specifies whether a derivation string should be created during parsing.
   * This is useful for debugging.
   */
  protected boolean createDerivationString = false;

  /**
   * Turns debug print on or off.
   */
  protected boolean debugOn = false;

  public AbstractBottomUpParser(POSTagger tagger, Chunker chunker, HeadRules headRules,
      int beamSize, double advancePercentage) {
    this.tagger = tagger;
    this.chunker = chunker;
    this.M = beamSize;
    this.K = beamSize;
    this.Q = advancePercentage;
    reportFailedParse = true;
    this.headRules = headRules;
    this.punctSet = headRules.getPunctuationTags();
    odh = new TreeSet<>();
    ndh = new TreeSet<>();
    completeParses = new TreeSet<>();
  }

  /**
   * Specifies whether the parser should report when it was unable to find a parse for
   * a particular sentence.
   *
   * @param errorReporting If true then un-parsed sentences are reported, false otherwise.
   */
  public void setErrorReporting(boolean errorReporting) {
    this.reportFailedParse = errorReporting;
  }

  /**
   * Assigns parent references for the specified parse so that they
   * are consistent with the child references.
   *
   * @param p The parse whose parent references need to be assigned.
   */
  public static void setParents(Parse p) {
    Parse[] children = p.getChildren();
    for (int ci = 0; ci < children.length; ci++) {
      children[ci].setParent(p);
      setParents(children[ci]);
    }
  }

  /**
   * Removes the punctuation from the specified set of chunks, attaches it to the
   * adjacent non-punctuation parses, and returns a new array of parses with the
   * punctuation removed.
   *
   * @param chunks A set of parses.
   * @param punctSet The set of punctuation which is to be removed.
   * @return An array of parses which is a subset of chunks with punctuation removed.
   */
  public static Parse[] collapsePunctuation(Parse[] chunks, Set<String> punctSet) {
    List<Parse> collapsedParses = new ArrayList<>(chunks.length);
    int lastNonPunct = -1;
    int nextNonPunct;
    for (int ci = 0, cn = chunks.length; ci < cn; ci++) {
      if (punctSet.contains(chunks[ci].getType())) {
        if (lastNonPunct >= 0) {
          chunks[lastNonPunct].addNextPunctuation(chunks[ci]);
        }
        for (nextNonPunct = ci + 1; nextNonPunct < cn; nextNonPunct++) {
          if (!punctSet.contains(chunks[nextNonPunct].getType())) {
            break;
          }
        }
        if (nextNonPunct < cn) {
          chunks[nextNonPunct].addPreviousPunctuation(chunks[ci]);
        }
      } else {
        collapsedParses.add(chunks[ci]);
        lastNonPunct = ci;
      }
    }
    if (collapsedParses.size() == chunks.length) {
      return chunks;
    }
    return collapsedParses.toArray(new Parse[collapsedParses.size()]);
  }
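  // Worked example (illustrative, not from the original source): given chunk
  // parses of types [NP, ",", NP, VP, "."] with punctSet = {",", "."},
  // collapsePunctuation returns [NP, NP, VP]. The comma is recorded as
  // next-punctuation of the first NP and previous-punctuation of the second NP,
  // and the trailing period as next-punctuation of the VP, so punctuation nodes
  // float beside the parse rather than being attached as children.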
  /**
   * Advances the specified parse and returns an array of advanced parses whose
   * probability accounts for more than the specified amount of probability mass.
   *
   * @param p The parse to advance.
   * @param probMass The amount of probability mass that should be accounted for
   *     by the advanced parses.
   */
  protected abstract Parse[] advanceParses(final Parse p, double probMass);

  /**
   * Adds the "TOP" node to the specified parse.
   *
   * @param p The complete parse.
   */
  protected abstract void advanceTop(Parse p);

  public Parse[] parse(Parse tokens, int numParses) {
    if (createDerivationString)
      tokens.setDerivation(new StringBuffer(100));
    odh.clear();
    ndh.clear();
    completeParses.clear();
    int derivationStage = 0; //derivation length
    int maxDerivationLength = 2 * tokens.getChildCount() + 3;
    odh.add(tokens);
    Parse guess = null;
    double minComplete = 2;
    double bestComplete = -100000; //approximating -infinity/0 in ln domain
    while (odh.size() > 0 && (completeParses.size() < M || (odh.first()).getProb() < minComplete)
        && derivationStage < maxDerivationLength) {
      ndh = new TreeSet<>();
      int derivationRank = 0;
      for (Iterator<Parse> pi = odh.iterator(); pi.hasNext() && derivationRank < K;
          derivationRank++) { // foreach derivation
        Parse tp = pi.next();
        //TODO: Need to look at this for K-best parsing cases
        /*
        //this parse and the ones which follow will never win, stop advancing.
        if (tp.getProb() < bestComplete) {
          break;
        }
        */
        if (guess == null && derivationStage == 2) {
          guess = tp;
        }
        if (debugOn) {
          System.out.print(derivationStage + " " + derivationRank + " " + tp.getProb());
          tp.show();
          System.out.println();
        }
        Parse[] nd;
        if (0 == derivationStage) {
          nd = advanceTags(tp);
        } else if (1 == derivationStage) {
          if (ndh.size() < K) {
            nd = advanceChunks(tp, bestComplete);
          } else {
            nd = advanceChunks(tp, (ndh.last()).getProb());
          }
        } else { // derivationStage > 1
          nd = advanceParses(tp, Q);
        }
        if (nd != null) {
          for (int k = 0, kl = nd.length; k < kl; k++) {
            if (nd[k].complete()) {
              advanceTop(nd[k]);
              if (nd[k].getProb() > bestComplete) {
                bestComplete = nd[k].getProb();
              }
              if (nd[k].getProb() < minComplete) {
                minComplete = nd[k].getProb();
              }
              completeParses.add(nd[k]);
            } else {
              ndh.add(nd[k]);
            }
          }
        } else {
          //if (reportFailedParse) {
          //  System.err.println("Couldn't advance parse " + derivationStage
          //      + " stage " + derivationRank + "!\n");
          //}
          advanceTop(tp);
          completeParses.add(tp);
        }
      }
      derivationStage++;
      odh = ndh;
    }
    if (completeParses.size() == 0) {
      //if (reportFailedParse) System.err.println("Couldn't find parse for: " + tokens);
      return new Parse[] { guess };
    } else if (numParses == 1) {
      return new Parse[] { completeParses.first() };
    } else {
      List<Parse> topParses = new ArrayList<>(numParses);
      while (!completeParses.isEmpty() && topParses.size() < numParses) {
        Parse tp = completeParses.last();
        completeParses.remove(tp);
        topParses.add(tp);
      }
      return topParses.toArray(new Parse[topParses.size()]);
    }
  }

  public Parse parse(Parse tokens) {
    if (tokens.getChildCount() > 0) {
      Parse p = parse(tokens, 1)[0];
      setParents(p);
      return p;
    } else {
      return tokens;
    }
  }
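  // Usage sketch (an assumption, not part of the original source): parse(Parse)
  // expects an INC_NODE parse covering the sentence with one TOK_NODE child per
  // token. A caller (such as opennlp.tools.cmdline.parser.ParserTool) would
  // typically build that input like this before invoking parse:
  //
  //   String sent = "The cat sat .";
  //   Parse p = new Parse(sent, new Span(0, sent.length()), INC_NODE, 1, 0);
  //   int idx = 0;
  //   int start = 0;
  //   for (String tok : sent.split(" ")) {
  //     p.insert(new Parse(sent, new Span(start, start + tok.length()), TOK_NODE, 0, idx++));
  //     start += tok.length() + 1;
  //   }
  //   Parse best = parser.parse(p); // runs tagging, chunking, then build/check stages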
  /**
   * Returns the top chunk sequences for the specified parse.
   *
   * @param p A pos-tag assigned parse.
   * @param minChunkScore A minimum score below which chunks should not be advanced.
   * @return The top chunk assignments to the specified parse.
   */
  protected Parse[] advanceChunks(final Parse p, double minChunkScore) {
    // chunk
    Parse[] children = p.getChildren();
    String[] words = new String[children.length];
    String[] ptags = new String[words.length];
    double[] probs = new double[words.length];
    for (int i = 0, il = children.length; i < il; i++) {
      Parse sp = children[i];
      words[i] = sp.getHead().getCoveredText();
      ptags[i] = sp.getType();
    }
    Sequence[] cs = chunker.topKSequences(words, ptags, minChunkScore - p.getProb());
    Parse[] newParses = new Parse[cs.length];
    for (int si = 0, sl = cs.length; si < sl; si++) {
      newParses[si] = (Parse) p.clone(); //copies top level
      if (createDerivationString)
        newParses[si].getDerivation().append(si).append(".");
      String[] tags = cs[si].getOutcomes().toArray(new String[words.length]);
      cs[si].getProbs(probs);
      int start = -1;
      int end = 0;
      String type = null;
      for (int j = 0; j <= tags.length; j++) {
        if (j != tags.length) {
          newParses[si].addProb(Math.log(probs[j]));
        }
        // if continue, just update the end; chunking tags don't use contTypeMap
        if (j != tags.length && tags[j].startsWith(CONT)) {
          end = j;
        } else {
          //make previous constituent if it exists
          if (type != null) {
            Parse p1 = p.getChildren()[start];
            Parse p2 = p.getChildren()[end];
            Parse[] cons = new Parse[end - start + 1];
            cons[0] = p1;
            if (end - start != 0) {
              cons[end - start] = p2;
              for (int ci = 1; ci < end - start; ci++) {
                cons[ci] = p.getChildren()[ci + start];
              }
            }
            Parse chunk = new Parse(p1.getText(),
                new Span(p1.getSpan().getStart(), p2.getSpan().getEnd()),
                type, 1, headRules.getHead(cons, type));
            chunk.isChunk(true);
            newParses[si].insert(chunk);
          }
          if (j != tags.length) { //update for new constituent
            if (tags[j].startsWith(START)) { // don't use startTypeMap, these are chunk tags
              type = tags[j].substring(START.length());
              start = j;
              end = j;
            } else { // other
              type = null;
            }
          }
        }
      }
    }
    return newParses;
  }
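  // Worked example (illustrative, not from the original source): for the tagged
  // tokens "the/DT cat/NN sat/VBD", a chunker outcome sequence [S-NP, C-NP, S-VP]
  // makes advanceChunks build an NP constituent spanning "the cat" (START opens
  // it, CONT extends it) and a VP over "sat". An OTHER ("O") outcome closes any
  // open constituent and leaves its token outside all basal constituents.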
  /**
   * Advances the parse by assigning it POS tags and returns multiple tag sequences.
   *
   * @param p The parse to be tagged.
   * @return Parses with different POS-tag sequence assignments.
   */
  protected Parse[] advanceTags(final Parse p) {
    Parse[] children = p.getChildren();
    String[] words = new String[children.length];
    double[] probs = new double[words.length];
    for (int i = 0, il = children.length; i < il; i++) {
      words[i] = children[i].getCoveredText();
    }
    Sequence[] ts = tagger.topKSequences(words);
    Parse[] newParses = new Parse[ts.length];
    for (int i = 0; i < ts.length; i++) {
      String[] tags = ts[i].getOutcomes().toArray(new String[words.length]);
      ts[i].getProbs(probs);
      newParses[i] = (Parse) p.clone(); //copies top level
      if (createDerivationString)
        newParses[i].getDerivation().append(i).append(".");
      for (int j = 0; j < words.length; j++) {
        Parse word = children[j];
        double prob = probs[j];
        newParses[i].insert(new Parse(word.getText(), word.getSpan(), tags[j], prob, j));
        newParses[i].addProb(Math.log(prob));
      }
    }
    return newParses;
  }

  /**
   * Determines the mapping between the specified index into the specified parses without
   * punctuation and the corresponding index into the specified parses with punctuation.
   *
   * @param index An index into the parses without punctuation.
   * @param nonPunctParses The parses without punctuation.
   * @param parses The parses with punctuation.
   * @return An index into the specified parses with punctuation which corresponds to the
   *     same node as the specified index into the parses without punctuation.
   */
  protected int mapParseIndex(int index, Parse[] nonPunctParses, Parse[] parses) {
    int parseIndex = index;
    while (parses[parseIndex] != nonPunctParses[index]) {
      parseIndex++;
    }
    return parseIndex;
  }

  private static boolean lastChild(Parse child, Parse parent, Set<String> punctSet) {
    if (parent == null) {
      return false;
    }
    Parse[] kids = collapsePunctuation(parent.getChildren(), punctSet);
    return (kids[kids.length - 1] == child);
  }
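  // Illustrative example (not part of the original source): with
  // parses = [NP, ",", VP] and nonPunctParses = collapsePunctuation(parses, {","})
  // = [NP, VP], calling mapParseIndex(1, nonPunctParses, parses) starts scanning
  // at position 1 (the comma), finds that parses[2] == nonPunctParses[1], and
  // returns 2, the VP's index in the punctuated array.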
  /**
   * Creates an n-gram dictionary from the specified data stream using the specified
   * head rules and cut-off.
   *
   * @param data The data stream of parses.
   * @param rules The head rules for the parses.
   * @param params Can contain a cutoff, the minimum number of entries required for the
   *     n-gram to be saved as part of the dictionary.
   * @return A dictionary object.
   */
  public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules,
      TrainingParameters params) throws IOException {
    int cutoff = params.getIntParameter("dict", TrainingParameters.CUTOFF_PARAM, 5);
    NGramModel mdict = new NGramModel();
    Parse p;
    while ((p = data.read()) != null) {
      p.updateHeads(rules);
      Parse[] pwords = p.getTagNodes();
      String[] words = new String[pwords.length];
      //add all uni-grams
      for (int wi = 0; wi < words.length; wi++) {
        words[wi] = pwords[wi].getCoveredText();
      }
      mdict.add(new StringList(words), 1, 1);
      //add tri-grams and bi-grams for the initial sequence
      Parse[] chunks = collapsePunctuation(ParserEventStream.getInitialChunks(p),
          rules.getPunctuationTags());
      String[] cwords = new String[chunks.length];
      for (int wi = 0; wi < cwords.length; wi++) {
        cwords[wi] = chunks[wi].getHead().getCoveredText();
      }
      mdict.add(new StringList(cwords), 2, 3);
      //emulate reductions to produce additional n-grams
      int ci = 0;
      while (ci < chunks.length) {
        if (chunks[ci].getParent() == null) {
          chunks[ci].show();
        }
        if (lastChild(chunks[ci], chunks[ci].getParent(), rules.getPunctuationTags())) {
          //perform reduce
          int reduceStart = ci;
          while (reduceStart >= 0 && chunks[reduceStart].getParent() == chunks[ci].getParent()) {
            reduceStart--;
          }
          reduceStart++;
          chunks = ParserEventStream.reduceChunks(chunks, ci, chunks[ci].getParent());
          ci = reduceStart;
          if (chunks.length != 0) {
            String[] window = new String[5];
            int wi = 0;
            if (ci - 2 >= 0) window[wi++] = chunks[ci - 2].getHead().getCoveredText();
            if (ci - 1 >= 0) window[wi++] = chunks[ci - 1].getHead().getCoveredText();
            window[wi++] = chunks[ci].getHead().getCoveredText();
            if (ci + 1 < chunks.length) window[wi++] = chunks[ci + 1].getHead().getCoveredText();
            if (ci + 2 < chunks.length) window[wi++] = chunks[ci + 2].getHead().getCoveredText();
            if (wi < 5) {
              String[] subWindow = new String[wi];
              System.arraycopy(window, 0, subWindow, 0, wi);
              window = subWindow;
            }
            if (window.length >= 3) {
              mdict.add(new StringList(window), 2, 3);
            } else if (window.length == 2) {
              mdict.add(new StringList(window), 2, 2);
            }
          }
          ci = reduceStart - 1; //ci will be incremented at end of loop
        }
        ci++;
      }
    }
    mdict.cutoff(cutoff, Integer.MAX_VALUE);
    return mdict.toDictionary(true);
  }

  /**
   * Creates an n-gram dictionary from the specified data stream using the specified
   * head rules and cut-off.
   *
   * @param data The data stream of parses.
   * @param rules The head rules for the parses.
   * @param cutoff The minimum number of entries required for the n-gram to be
   *     saved as part of the dictionary.
   * @return A dictionary object.
   */
  public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules, int cutoff)
      throws IOException {
    TrainingParameters params = new TrainingParameters();
    params.put("dict", TrainingParameters.CUTOFF_PARAM, cutoff);
    return buildDictionary(data, rules, params);
  }
}
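In practice this abstract class is not used directly; a concrete subclass such as opennlp.tools.parser.chunking.Parser is obtained through ParserFactory. The sketch below shows one common way to drive it with a pre-trained model. The model file name "en-parser-chunking.bin" and the use of ParserTool.parseLine for tokenization and input construction are assumptions based on the standard OpenNLP distribution, not part of the class above.

import java.io.FileInputStream;
import java.io.InputStream;

import opennlp.tools.cmdline.parser.ParserTool;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
import opennlp.tools.parser.ParserFactory;
import opennlp.tools.parser.ParserModel;

public class ParserExample {

  public static void main(String[] args) throws Exception {
    // Assumed model file from the standard OpenNLP model downloads.
    try (InputStream in = new FileInputStream("en-parser-chunking.bin")) {
      ParserModel model = new ParserModel(in);
      // ParserFactory wires the pos-tagger, chunker, and head rules from the
      // model into a concrete AbstractBottomUpParser subclass.
      Parser parser = ParserFactory.create(model);
      // parseLine tokenizes on whitespace, builds the INC_NODE/TOK_NODE input
      // structure shown earlier, and requests the single best parse.
      Parse[] parses = ParserTool.parseLine("The quick brown fox jumps .", parser, 1);
      parses[0].show(); // prints the bracketed parse tree to stdout
    }
  }
}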