// Java tutorial
/******************************************************************************* * Copyright 2008, 2009, 2014 Institute of Mathematics and Computer Science, University of Latvia * Author: Pteris Paikens * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. *******************************************************************************/ package lv.semti.morphology.analyzer; import java.io.IOException; import java.io.PrintStream; import java.io.PrintWriter; import java.io.Writer; import java.util.ArrayList; import java.util.Iterator; import java.util.Observable; import org.json.simple.JSONValue; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import lv.semti.morphology.attributes.AttributeNames; import lv.semti.morphology.attributes.AttributeValues; import lv.semti.morphology.corpus.Statistics; /** * Morphologically analyzed token with potentially multiple variants of * analysis. 
* * @author Pteris Paikens */ public class Word extends Observable implements Cloneable { private String token; public ArrayList<Wordform> wordforms = new ArrayList<Wordform>(); private Wordform correctWordform = null; public Word(String token) { this.token = token.trim(); this.wordforms = new ArrayList<Wordform>(1); } public Word(Node node) { if (node.getNodeName().equalsIgnoreCase("V?rds")) { NodeList nodes = node.getChildNodes(); for (int i = 0; i < nodes.getLength(); i++) { Node n = nodes.item(i); if (n.getNodeName().equalsIgnoreCase("V?rdforma")) wordforms.add(new Wordform(n)); } Node n = node.getAttributes().getNamedItem("v?rds"); if (n != null) token = n.getTextContent(); n = node.getAttributes().getNamedItem("pareiz?V?rdforma"); if (n != null) setCorrectWordform(wordforms.get(Integer.parseInt(n.getTextContent()))); } else if (node.getNodeName().equalsIgnoreCase("V?rdforma")) { token = node.getAttributes().getNamedItem("v?rds").getTextContent(); wordforms.add(new Wordform(node)); } else throw new Error("Node " + node.getNodeName() + " nav ne V?rds, ne V?rdforma"); } @Override public String toString() { return token; } @Override public Object clone() { try { Word kopija = (Word)super.clone(); kopija.token = this.token; kopija.wordforms = new ArrayList<Wordform>(); for (Wordform v?rdforma : wordforms) { Wordform klons = (Wordform) v?rdforma.clone(); kopija.wordforms.add(klons); if (this.getCorrectWordform() == v?rdforma) kopija.setCorrectWordform(klons); } return kopija; } catch (CloneNotSupportedException e) { throw new Error("Guks - nu vajag vart klasi V?rds noklont."); } } @Override public boolean equals(Object o) { try { Word w = (Word) o; if (token == null ^ w.token == null || wordforms == null ^ w.wordforms == null || correctWordform == null ^ w.correctWordform == null) return false; return (token == w.token || token.equals(w.token)) && (wordforms == w.wordforms || wordforms.equals(w.wordforms)) && (correctWordform == w.correctWordform || 
correctWordform.equals(w.correctWordform)); } catch (ClassCastException e) { return false; } } @Override public int hashCode() { //return 0; String signature = "1117 " + token + " " + wordforms; // TODO: Ilmaar, paskaties. // It's a kind of magic: adding the lower one makes Word-s unfindable in // LinkedHashMap, even there exists an key to which .equals gives true // and .hashCode gives the same value as for the searched object. // signature = signature + " " + correctWordform + " "; return signature.hashCode(); } public void addWordform(Wordform wordform) { wordform.setToken(this.token); wordforms.add(wordform); } public boolean isRecognized() { return !wordforms.isEmpty(); } public void print(PrintWriter stream) { stream.format("Aprakstam v?rdu '%s'%n", token); if (wordforms.isEmpty()) { stream.println("\tV?rds nav atpazts.\n"); } else { if (wordforms.size() == 1) { stream.println("\tV?rds ir atpazts viennozmgi.\n"); wordforms.get(0).describe(stream); } else { stream.format("\tV?rds ir atpazts %d variantos%n", wordforms.size()); for (Wordform variants : wordforms) { stream.format("\tVariants %d%n", wordforms.indexOf(variants) + 1); variants.describe(stream); } } } stream.flush(); } public void printShort(PrintWriter stream) { if (wordforms.isEmpty()) { stream.printf("%s : nav atpazts.\n", token); } else { for (Wordform variants : wordforms) variants.shortDescription(stream); } stream.flush(); } public void addAttribute(String attribute, String value) { for (Wordform variants : wordforms) variants.addAttribute(attribute, value); } /** * gets rid of those wordforms that match (weakly) the attributes provided. Destructive! 
* @param attributes */ public void filterByAttributes(AttributeValues attributes) { ArrayList<Wordform> derg?s = new ArrayList<Wordform>(); for (Wordform v?rdforma : wordforms) { if (v?rdforma.isMatchingWeak(attributes)) derg?s.add(v?rdforma); } wordforms = derg?s; } public String getToken() { return token; } // variantuSkaits public int wordformsCount() { return wordforms.size(); } public void setCorrectWordform(Wordform wordform) { if (wordforms.indexOf(wordform) == -1) throw new Error(String.format("V?rdam %s mina uzlikt par pareizo sveu v?rdformu %s.", token, wordform.getToken())); correctWordform = wordform; } public Wordform getCorrectWordform() { return correctWordform; } public void toXML(Writer stream) throws IOException { stream.write("<V?rds"); stream.write(" v?rds=\"" + token.replace("\"", """) + "\""); if (correctWordform != null) stream.write(" pareiz?V?rdforma=\""+wordforms.indexOf(correctWordform)+"\""); stream.write(">\n"); for (Wordform v?rdforma : wordforms) { v?rdforma.toXML(stream); } stream.write("</V?rds>"); } public String toJSON() { Iterator<Wordform> i = wordforms.iterator(); String out = "["; while (i.hasNext()) { out += i.next().toJSON(); if (i.hasNext()) out += ", "; } out += "]"; return out; } public String toJSONsingle() { if (isRecognized()) { /* is ir tad, ja vajag tikai vienu - ticam?ko formu. t? jau vartu atgriezt visu sarakstu. 
*/ Wordform maxwf = getBestWordform(); //return maxwf.toJSON(); TODO - varbt ar o te vajag atgriezt return String.format("{\"V?rds\":\"%s\",\"Marjums\":\"%s\",\"Pamatforma\":\"%s\"}", JSONValue.escape(maxwf.getToken()), JSONValue.escape(maxwf.getTag()), JSONValue.escape(maxwf.getValue(AttributeNames.i_Lemma))); } else return String.format("{\"V?rds\":\"%s\",\"Marjums\":\"-\",\"Pamatforma\":\"%s\"}", JSONValue.escape(getToken()), JSONValue.escape(getToken())); } public Wordform getBestWordform() { if (wordforms.size() == 0) return null; Wordform maxwf = wordforms.get(0); double maxticamba = -1; for (Wordform wf : wordforms) { // Paskatamies visus atrastos variantus un emam statistiski ticam?ko //tag += String.format("%s\t%d\n", wf.getDescription(), MorphoServer.statistics.getTicamba(wf)); double estimate = Statistics.getStatistics().getEstimate(wf); if (estimate > maxticamba) { maxticamba = estimate; maxwf = wf; } } return maxwf; } public Wordform getMatchingWordform(String answerTag, boolean complain) { Wordform result = null; AttributeValues av = MarkupConverter.fromKamolsMarkup(answerTag); //FIXME - hardcoded workaround tagera kdai if (this.getToken().endsWith("ais") && av.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective) && av.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Indefinite)) { av.addAttribute(AttributeNames.i_Definiteness, AttributeNames.v_Definite); } double maxticamba = -1; for (Wordform wf : this.wordforms) { if (wf.isMatchingWeak(av)) { double estimate = Statistics.getStatistics().getEstimate(wf); if (estimate > maxticamba) { maxticamba = estimate; result = wf; } // if (complain && result != null) // System.err.printf("Multiple valid word(lemma) options for word %s tag %s: %s and %s\n", this.getToken(), answerTag, wf.getTag(), result.getTag()); } } if (result == null) { result = new Wordform(this.getToken()); result.addAttributes(av); result.addAttribute(AttributeNames.i_Source, "CMM tagger guess"); 
result.addAttribute(AttributeNames.i_Lemma, this.getToken()); //FIXME - most likely wrong lemma, guesser should be used to obtain a realistic one if (complain) System.err.printf("Tagger chose a tag that's not one of analysis options for word %s tag %s\n", this.getToken(), answerTag); if (complain) this.addWordform(result); //FIXME - nav sti atbilstos complain } if (complain && (result.getValue(AttributeNames.i_Lemma).equalsIgnoreCase("nav") || result.getValue(AttributeNames.i_Lemma).equalsIgnoreCase("nen?k"))) { result.describe(); } return result; } public String toTabSepsingle() { // akargs form?ts haskell-pipe-export ?trdarbbai if (isRecognized()) { Wordform maxwf = getBestWordform(); //return maxwf.toJSON(); TODO - varbt ar o te vajag atgriezt return String.format("%s\t%s\t%s", maxwf.getToken(), maxwf.getTag(), maxwf.getValue(AttributeNames.i_Lemma)); } else return String.format("%s\t-\t%s", getToken(), getToken()); } public String toTabSep(boolean probabilities) { // akargs form?ts postagera pitonam if (isRecognized()) { double sumTicamba = 0; for (Wordform wf : wordforms) sumTicamba += Statistics.getStatistics().getEstimate(wf); if (sumTicamba < 0.001) sumTicamba = 0.001; Iterator<Wordform> i = wordforms.iterator(); String out = ""; while (i.hasNext()) { Wordform wf = i.next(); out += String.format("%s\t%s\t%s", wf.getToken(), wf.getTag(), wf.getValue(AttributeNames.i_Lemma));; if (probabilities) out += String.format("\t%.5f", Statistics.getStatistics().getEstimate(wf)/sumTicamba); if (i.hasNext()) out += "\t"; } return out; } else { String out = String.format("%s\t-\t%s", getToken(), getToken()); if (probabilities) out += "\t1.0"; return out; } } public boolean hasAttribute(String attribute, String value){ boolean results = false; for (Wordform v?rdforma : wordforms) if (v?rdforma.isMatchingStrong(attribute, value)) results = true; return results; } public void describe(PrintWriter pipe) { pipe.println(this.token); for (Wordform wf : wordforms) 
wf.describe(pipe); } public void describe(PrintStream out) { this.describe(new PrintWriter(out)); } }