lv.semti.morphology.analyzer.Word.java Source code

Introduction

Here is the source code for lv.semti.morphology.analyzer.Word.java
Source

    /*******************************************************************************
     * Copyright 2008, 2009, 2014 Institute of Mathematics and Computer Science, University of Latvia
     * Author: Pēteris Paikens
     * 
     *     This program is free software: you can redistribute it and/or modify
     *     it under the terms of the GNU General Public License as published by
     *     the Free Software Foundation, either version 3 of the License, or
     *     (at your option) any later version.
     * 
     *     This program is distributed in the hope that it will be useful,
     *     but WITHOUT ANY WARRANTY; without even the implied warranty of
     *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     *     GNU General Public License for more details.
     * 
     *     You should have received a copy of the GNU General Public License
     *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
     *******************************************************************************/
    package lv.semti.morphology.analyzer;

    import java.io.IOException;
    import java.io.PrintStream;
    import java.io.PrintWriter;
    import java.io.Writer;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.Locale;
    import java.util.Observable;

    import org.json.simple.JSONValue;
    import org.w3c.dom.Node;
    import org.w3c.dom.NodeList;

    import lv.semti.morphology.attributes.AttributeNames;
    import lv.semti.morphology.attributes.AttributeValues;
    import lv.semti.morphology.corpus.Statistics;

    /**
     * Morphologically analyzed token with potentially multiple variants of
     * analysis.
     * 
     * @author Pēteris Paikens
     */
    public class Word extends Observable implements Cloneable {

        /** Surface text of the token; the String constructor stores it trimmed. */
        private String token;
        /** Analysis variants of this token; an empty list means the token was not recognized. */
        public ArrayList<Wordform> wordforms = new ArrayList<Wordform>();
        /** Variant marked as correct via setCorrectWordform, or null when none has been marked. */
        private Wordform correctWordform = null;

        public Word(String token) {
            this.token = token.trim();
            this.wordforms = new ArrayList<Wordform>(1);
        }

        public Word(Node node) {
            if (node.getNodeName().equalsIgnoreCase("V?rds")) {
                NodeList nodes = node.getChildNodes();
                for (int i = 0; i < nodes.getLength(); i++) {
                    Node n = nodes.item(i);
                    if (n.getNodeName().equalsIgnoreCase("V?rdforma"))
                        wordforms.add(new Wordform(n));
                }

                Node n = node.getAttributes().getNamedItem("v?rds");
                if (n != null)
                    token = n.getTextContent();
                n = node.getAttributes().getNamedItem("pareiz?V?rdforma");
                if (n != null)
                    setCorrectWordform(wordforms.get(Integer.parseInt(n.getTextContent())));

            } else if (node.getNodeName().equalsIgnoreCase("V?rdforma")) {
                token = node.getAttributes().getNamedItem("v?rds").getTextContent();
                wordforms.add(new Wordform(node));
            } else
                throw new Error("Node " + node.getNodeName() + " nav ne V?rds, ne V?rdforma");
        }

        /**
         * @return the token text of this word (the analysis variants are not included)
         */
        @Override
        public String toString() {
            return this.token;
        }

   @Override
   public Object clone() {
      try {
         Word kopija = (Word)super.clone();
         kopija.token = this.token;
         kopija.wordforms = new ArrayList<Wordform>();
         for (Wordform v?rdforma : wordforms) {
            Wordform klons = (Wordform) v?rdforma.clone();
            kopija.wordforms.add(klons);
            if (this.getCorrectWordform() == v?rdforma)
               kopija.setCorrectWordform(klons);
         }
         return kopija;
        } catch (CloneNotSupportedException e) {
            throw new Error("Guks - nu vajag vart klasi V?rds noklont.");
        }
   }

        /**
         * Two words are equal when their token, wordform list and correct wordform are all equal
         * (a null field only equals another null field).
         *
         * @param o object to compare with; null or a non-Word object yields false
         * @return true if o is a Word with equal token, wordforms and correct wordform
         */
        @Override
        public boolean equals(Object o) {
            if (this == o)
                return true;
            // instanceof is false for null, so equals(null) returns false instead of throwing.
            if (!(o instanceof Word))
                return false;
            Word other = (Word) o;
            return (token == other.token || (token != null && token.equals(other.token)))
                    && (wordforms == other.wordforms
                            || (wordforms != null && wordforms.equals(other.wordforms)))
                    && (correctWordform == other.correctWordform
                            || (correctWordform != null && correctWordform.equals(other.correctWordform)));
        }

        /**
         * Hash computed from the token and the wordform list only.
         * <p>
         * correctWordform is intentionally left out although equals() compares it. The original
         * TODO (asking "Ilmaar" to take a look) records that appending
         * {@code " " + correctWordform + " "} to the signature made Word keys unfindable in a
         * LinkedHashMap.
         * NOTE(review): presumably because correctWordform gets assigned after the Word is already
         * stored as a key, which would change its hash - confirm before re-adding it.
         *
         * @return hash code of the signature string
         */
        @Override
        public int hashCode() {
            StringBuilder signature = new StringBuilder("1117 ");
            signature.append(token).append(" ").append(wordforms);
            return signature.toString().hashCode();
        }

        /**
         * Appends an analysis variant to this word, first stamping it with this word's token.
         *
         * @param wordform variant to add
         */
        public void addWordform(Wordform wordform) {
            wordform.setToken(token);
            this.wordforms.add(wordform);
        }

        /**
         * @return true if at least one analysis variant is present
         */
        public boolean isRecognized() {
            return wordforms.size() > 0;
        }

        /**
         * Writes a verbose description of this word and all of its analysis variants, then
         * flushes the writer.
         *
         * @param stream writer that receives the description
         */
        public void print(PrintWriter stream) {
            stream.format("Aprakstam v?rdu '%s'%n", token);
            if (wordforms.isEmpty()) {
                stream.println("\tV?rds nav atpazts.\n");
            } else if (wordforms.size() == 1) {
                stream.println("\tV?rds ir atpazts viennozmgi.\n");
                wordforms.get(0).describe(stream);
            } else {
                stream.format("\tV?rds ir atpazts %d variantos%n", wordforms.size());
                // Count positions directly: indexOf() would report the first *equal* wordform,
                // giving duplicate numbers when two variants compare equal.
                int variantNumber = 0;
                for (Wordform variant : wordforms) {
                    variantNumber++;
                    stream.format("\tVariants %d%n", variantNumber);
                    variant.describe(stream);
                }
            }
            stream.flush();
        }

        /**
         * Writes a short description of every analysis variant, or a single "not recognized"
         * line when there are none, then flushes the writer.
         *
         * @param stream writer that receives the description
         */
        public void printShort(PrintWriter stream) {
            if (!wordforms.isEmpty()) {
                for (Wordform variant : wordforms) {
                    variant.shortDescription(stream);
                }
            } else {
                stream.printf("%s : nav atpazts.\n", token);
            }
            stream.flush();
        }

        /**
         * Adds the given attribute/value pair to every analysis variant of this word.
         *
         * @param attribute attribute name
         * @param value attribute value
         */
        public void addAttribute(String attribute, String value) {
            for (Wordform variant : wordforms) {
                variant.addAttribute(attribute, value);
            }
        }

   /**
    *    gets rid of those wordforms that match (weakly) the attributes provided. Destructive!
    * @param attributes
    */
   public void filterByAttributes(AttributeValues attributes) {
      ArrayList<Wordform> derg?s = new ArrayList<Wordform>();

      for (Wordform v?rdforma : wordforms) {
         if (v?rdforma.isMatchingWeak(attributes)) derg?s.add(v?rdforma);
      }

      wordforms = derg?s;
   }

        /**
         * @return the token text of this word
         */
        public String getToken() {
            return this.token;
        }

        /**
         * Number of analysis variants of this word (the original Latvian label was
         * "variantuSkaits").
         *
         * @return size of the wordform list
         */
        public int wordformsCount() {
            final int count = wordforms.size();
            return count;
        }

        /**
         * Marks one of this word's analysis variants as the correct one.
         *
         * @param wordform variant to mark; must be contained in the wordform list
         * @throws Error if the wordform is not one of this word's variants
         */
        public void setCorrectWordform(Wordform wordform) {
            final boolean known = wordforms.contains(wordform);
            if (!known) {
                throw new Error(String.format("V?rdam %s mina uzlikt par pareizo sveu v?rdformu %s.", token,
                        wordform.getToken()));
            }

            this.correctWordform = wordform;
        }

        /**
         * @return the variant marked as correct, or null when none has been marked
         */
        public Wordform getCorrectWordform() {
            return this.correctWordform;
        }

   public void toXML(Writer stream) throws IOException {
      stream.write("<V?rds");
      stream.write(" v?rds=\"" + token.replace("\"", "&quot;") + "\"");
      if (correctWordform != null)
         stream.write(" pareiz?V?rdforma=\""+wordforms.indexOf(correctWordform)+"\"");
      stream.write(">\n");
      for (Wordform v?rdforma : wordforms) {
         v?rdforma.toXML(stream);
      }
      stream.write("</V?rds>");
   }

        /**
         * Renders all analysis variants as a JSON array, using each wordform's own JSON form.
         *
         * @return "[...]" with the variants separated by ", "; "[]" when there are none
         */
        public String toJSON() {
            StringBuilder json = new StringBuilder("[");
            String separator = "";
            for (Wordform wordform : wordforms) {
                json.append(separator).append(wordform.toJSON());
                separator = ", ";
            }
            json.append("]");
            return json.toString();
        }

        /**
         * Renders a single JSON object for this word: token, tag and lemma of the best wordform,
         * or the token itself with "-" as tag when the word is not recognized.
         * <p>
         * This variant exists for callers that need only the single most probable form; the
         * whole list is available via toJSON().
         *
         * @return JSON object as a string
         */
        public String toJSONsingle() {
            if (!isRecognized()) {
                return String.format("{\"V?rds\":\"%s\",\"Marjums\":\"-\",\"Pamatforma\":\"%s\"}",
                        JSONValue.escape(getToken()), JSONValue.escape(getToken()));
            }

            Wordform best = getBestWordform();
            // TODO (from the original): maybe best.toJSON() should be returned here as well.
            String escapedToken = JSONValue.escape(best.getToken());
            String escapedTag = JSONValue.escape(best.getTag());
            String escapedLemma = JSONValue.escape(best.getValue(AttributeNames.i_Lemma));
            return String.format("{\"V?rds\":\"%s\",\"Marjums\":\"%s\",\"Pamatforma\":\"%s\"}",
                    escapedToken, escapedTag, escapedLemma);
        }

   /**
    * Picks the analysis variant with the highest statistical estimate.
    *
    * @return the wordform whose estimate is highest; the first wordform if no estimate
    *         exceeds -1; null when there are no wordforms
    */
   public Wordform getBestWordform() {
      if (wordforms.isEmpty()) return null;

      Wordform best = wordforms.get(0);
      double bestEstimate = -1;
      // Walk through every variant found and keep the statistically most probable one.
      for (Wordform candidate : wordforms) {
         double estimate = Statistics.getStatistics().getEstimate(candidate);
         if (!(estimate > bestEstimate)) continue;
         bestEstimate = estimate;
         best = candidate;
      }
      return best;
   }

   /**
    * Finds the analysis variant that corresponds to a tag chosen by the tagger.
    * Among the wordforms that weakly match the tag, the one with the highest statistical
    * estimate is returned. If none matches, a new wordform is built from the tag with the
    * token as its lemma and "CMM tagger guess" as its source.
    *
    * @param answerTag tag in Kamols markup
    * @param complain if true, a mismatch is reported on System.err and the guessed wordform is
    *        also added to this word; wordforms whose lemma is one of two specific values
    *        are additionally printed via describe()
    * @return the matching or guessed wordform, never null
    */
   public Wordform getMatchingWordform(String answerTag, boolean complain) {
      AttributeValues av = MarkupConverter.fromKamolsMarkup(answerTag);

      // FIXME - hardcoded workaround for a tagger error: adjectives ending in "ais" that were
      // tagged as indefinite are treated as definite.
      if (this.getToken().endsWith("ais")
            && av.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)
            && av.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Indefinite)) {
         av.addAttribute(AttributeNames.i_Definiteness, AttributeNames.v_Definite);
      }

      Wordform result = null;
      double bestEstimate = -1;
      for (Wordform wf : this.wordforms) {
         if (wf.isMatchingWeak(av)) {
            double estimate = Statistics.getStatistics().getEstimate(wf);
            if (estimate > bestEstimate) {
               bestEstimate = estimate;
               result = wf;
            }
         }
      }

      if (result == null) {
         result = new Wordform(this.getToken());
         result.addAttributes(av);
         result.addAttribute(AttributeNames.i_Source, "CMM tagger guess");
         result.addAttribute(AttributeNames.i_Lemma, this.getToken()); //FIXME - most likely wrong lemma, guesser should be used to obtain a realistic one
         if (complain) {
            System.err.printf("Tagger chose a tag that's not one of analysis options for word %s tag %s\n", this.getToken(), answerTag);
            this.addWordform(result); //FIXME - 'complain' is not really the appropriate switch for this
         }
      }

      if (complain) {
         // Constant-first comparison: a matched wordform without a lemma must not cause a
         // NullPointerException here.
         String lemma = result.getValue(AttributeNames.i_Lemma);
         if ("nav".equalsIgnoreCase(lemma) || "nen?k".equalsIgnoreCase(lemma)) {
            result.describe();
         }
      }

      return result;
   }

        /**
         * Tab-separated "token, tag, lemma" line for the best wordform; for an unrecognized
         * word the tag is "-" and the lemma is the token itself.
         * Per the original comment this format serves the haskell-pipe-export path
         * (NOTE(review): the original Latvian comment is garbled; it appears to mention speed).
         *
         * @return one tab-separated record without a trailing newline
         */
        public String toTabSepsingle() {
            if (!isRecognized()) {
                return String.format("%s\t-\t%s", getToken(), getToken());
            }

            Wordform best = getBestWordform();
            // TODO (from the original): maybe best.toJSON() should be returned here as well.
            return String.format("%s\t%s\t%s", best.getToken(), best.getTag(),
                    best.getValue(AttributeNames.i_Lemma));
        }

   /**
    * Tab-separated listing of all analysis variants: "token, tag, lemma" per wordform, all on
    * one line. Per the original comment this format is consumed by the Python part of the
    * POS tagger.
    *
    * @param probabilities if true, each wordform is followed by its estimate divided by the
    *        sum of all estimates (the sum is floored at 0.001), printed with 5 decimals; an
    *        unrecognized word gets the probability 1.0
    * @return one tab-separated record without a trailing newline
    */
   public String toTabSep(boolean probabilities) {
      if (!isRecognized()) {
         String unrecognized = String.format("%s\t-\t%s", getToken(), getToken());
         if (probabilities) unrecognized += "\t1.0";
         return unrecognized;
      }

      double estimateSum = 0;
      for (Wordform wf : wordforms) estimateSum += Statistics.getStatistics().getEstimate(wf);
      if (estimateSum < 0.001) estimateSum = 0.001; // avoid dividing by (almost) zero

      StringBuilder out = new StringBuilder();
      Iterator<Wordform> i = wordforms.iterator();
      while (i.hasNext()) {
         Wordform wf = i.next();
         out.append(String.format("%s\t%s\t%s", wf.getToken(), wf.getTag(), wf.getValue(AttributeNames.i_Lemma)));
         // Locale.ROOT keeps the decimal point a '.', matching the hardcoded "1.0" above,
         // regardless of the default locale of the JVM.
         if (probabilities) out.append(String.format(Locale.ROOT, "\t%.5f", Statistics.getStatistics().getEstimate(wf) / estimateSum));
         if (i.hasNext()) out.append("\t");
      }
      return out.toString();
   }

   public boolean hasAttribute(String attribute, String value){
      boolean results = false;
      for (Wordform v?rdforma : wordforms)
         if (v?rdforma.isMatchingStrong(attribute, value)) results = true;
      return results;
   }

        /**
         * Prints the token on its own line, followed by the description of every wordform.
         * The writer is not flushed by this method itself.
         *
         * @param pipe writer that receives the description
         */
        public void describe(PrintWriter pipe) {
            pipe.println(token);
            for (Wordform wordform : wordforms) {
                wordform.describe(pipe);
            }
        }

        /**
         * Prints the token and the description of every wordform to the given stream.
         *
         * @param out stream that receives the description
         */
        public void describe(PrintStream out) {
            PrintWriter writer = new PrintWriter(out);
            this.describe(writer);
            // The wrapping writer buffers and is discarded afterwards; without a flush the
            // output could be lost. The stream itself is left open for the caller.
            writer.flush();
        }

    }