lv.semti.Thesaurus.struct.ThesaurusEntry.java Source code

Introduction

Here is the source code for lv.semti.Thesaurus.struct.ThesaurusEntry.java
Source

/*******************************************************************************
 * Copyright 2013, 2014 Institute of Mathematics and Computer Science, University of Latvia
 * Author: Lauma Pretkalniņa
 * 
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 * 
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 * 
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
package lv.semti.Thesaurus.struct;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;

import lv.semti.Thesaurus.utils.Loaders;
import lv.semti.Thesaurus.utils.JSONUtils;
import lv.semti.morphology.analyzer.Analyzer;
import lv.semti.morphology.analyzer.Wordform;
import lv.semti.morphology.attributes.AttributeNames;

import org.json.simple.JSONObject;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Structured representation of entry header.
 */
public class ThesaurusEntry {
    /**
     * Value of the "i" attribute of the entry node; null when the attribute
     * is absent or empty. (Presumably the homonym index - confirm.)
     */
    public String homId;

    /**
     * avots (source) field. Stays null when the entry has no such field.
     */
    public Sources sources;

    /**
     * Lemma and all-entry related grammar information, built from the 'v'
     * field. Stays null when the entry has no 'v' field.
     */
    public Header head;

    /**
     * g_n (group of senses) field. Stays null when the entry has no such
     * field.
     */
    public LinkedList<Sense> senses;

    /**
     * g_fraz (group of phraseologisms) field. Stays null when the entry has
     * no such field.
     */
    public LinkedList<Phrase> phrases;

    /**
     * g_de (group of derivatives) field: one Header per 'v' found inside the
     * 'de' elements. Stays null when the entry has no such field.
     */
    public LinkedList<Header> derivs;

    /**
     * Lemmas identifying entries currently ignored. Filled once, when the
     * class is initialised, by initBlacklist(). See also inBlacklist().
     */
    private static HashSet<String> blacklist = initBlacklist();

    /**
     * Reads data of a single thesaurus entry from the XML format.
     * <p>
     * Children of the entry node are handled in two passes. The first pass
     * builds the header from the 'v' (word info) child and sets every other
     * non-text child aside; the second pass processes the postponed children,
     * because the senses and phrases loaders are given the lemma text taken
     * from the header.
     * <p>
     * If the entry has no 'v' child, the header stays null. In that case the
     * 'g_n' and 'g_fraz' children cannot be loaded (there is no lemma to hand
     * to the loaders), so they are skipped with a message on System.err
     * instead of failing with a NullPointerException; the missing header
     * itself is reported once all fields have been looked at.
     *
     * @param sNode XML node of the entry; it is cast to
     *              org.w3c.dom.Element in order to read its "i" attribute
     */
    public ThesaurusEntry(Node sNode) {
        NodeList fields = sNode.getChildNodes();
        LinkedList<Node> postponed = new LinkedList<Node>();
        for (int i = 0; i < fields.getLength(); i++) {
            Node field = fields.item(i);
            String fieldname = field.getNodeName();
            if (fieldname.equals("v")) { // word info
                if (head != null) {
                    System.err.printf("Entry \"%s\" contains more than one \'v\'\n", head.lemma.text);
                }
                head = new Header(field);
            } else if (!fieldname.equals("#text")) { // Text nodes here are ignored.
                postponed.add(field);
            }
        }
        for (Node field : postponed) {
            String fieldname = field.getNodeName();
            if (fieldname.equals("avots")) { // source
                sources = new Sources(field);
            } else if (fieldname.equals("g_n") || fieldname.equals("g_fraz")) {
                if (head == null) {
                    // Both loaders need the lemma text, which only the header provides.
                    System.err.printf("Entry - s - field %s skipped, entry has no \'v\' field\n", fieldname);
                } else if (fieldname.equals("g_n")) { // all senses
                    senses = Loaders.loadSenses(field, head.lemma.text);
                } else { // phraseological forms
                    phrases = Loaders.loadPhrases(field, head.lemma.text, "fraz");
                }
            } else if (fieldname.equals("g_de")) { // derived forms
                loadDerivs(field);
            } else {
                System.err.printf("Entry - s - field %s not processed\n", fieldname);
            }
        }

        // An absent attribute is returned as an empty string; store null instead.
        homId = ((org.w3c.dom.Element) sNode).getAttribute("i");
        if ("".equals(homId)) {
            homId = null;
        }

        //if (inBlacklist()) return;

        if (head == null) {
            System.err.printf("Thesaurus entry without a lemma/header :(\n");
        }
    }

    /**
     * Process g_de field.
     * Derived forms - in Lexicon sense, they are separate lexemes, alternate
     * wordforms but with a link to the same dictionary entry.
     * <p>
     * Every 'v' element found inside a 'de' child is turned into a Header and
     * appended to derivs (the list is created on first use). Text nodes are
     * ignored; any other unexpected element is reported on System.err.
     *
     * @param allDerivs the g_de node
     */
    private void loadDerivs(Node allDerivs) {
        if (derivs == null) {
            derivs = new LinkedList<Header>();
        }
        NodeList groupChildren = allDerivs.getChildNodes();
        for (int d = 0; d < groupChildren.getLength(); d++) {
            Node groupChild = groupChildren.item(d);
            String groupChildName = groupChild.getNodeName();
            if (!groupChildName.equals("de")) {
                // Text nodes here are ignored.
                if (!groupChildName.equals("#text")) {
                    System.err.printf("g_de entry field %s not processed, expected only 'de'.\n", groupChildName);
                }
                continue;
            }

            NodeList derivParts = groupChild.getChildNodes();
            for (int p = 0; p < derivParts.getLength(); p++) {
                Node part = derivParts.item(p);
                String partName = part.getNodeName();
                if (partName.equals("v")) {
                    derivs.add(new Header(part));
                    continue;
                }
                // Text nodes here are ignored.
                if (!partName.equals("#text")) {
                    System.err.printf("g_de/de entry field %s not processed, expected only 'v'.\n",
                            partName);
                }
            }
        }
    }

    /**
     * Tells whether the lemma of this entry is on the list of lemmas to
     * ignore.
     *
     * @return true if the header's lemma text is in the blacklist; false
     *         otherwise, including when the entry has no header
     */
    public boolean inBlacklist() {
        //if (sources == null || !sources.s.contains("LLVV")) return true; // FIXME - temporary restriction to focus on LLVV first
        // The constructor tolerates entries without a 'v' field, leaving head null.
        if (head == null) {
            return false;
        }
        return blacklist.contains(head.lemma.text);
    }

    /**
     * Constructing a list of lemmas to ignore - basically meant to ease
     * development and testing.
     * <p>
     * Reads the file "blacklist.txt" (resolved against the current working
     * directory) as UTF-8, one word (lemma) per line; each line is trimmed
     * before being stored. Loading is best effort: on any failure a message
     * is printed to System.err and whatever was read so far is returned.
     *
     * @return the set of blacklisted lemmas, possibly empty, never null
     */
    private static HashSet<String> initBlacklist() {
        HashSet<String> blist = new HashSet<String>();
        BufferedReader ieeja = null;
        try {
            // Blacklist file format - one word (lemma) per line.
            ieeja = new BufferedReader(new InputStreamReader(new FileInputStream("blacklist.txt"), "UTF-8"));
            String rinda;
            while ((rinda = ieeja.readLine()) != null) {
                //if (rinda.contains("<s>") || rinda.contains("</s>") || rinda.isEmpty())
                //   continue;
                blist.add(rinda.trim());
            }
        } catch (Exception e) {
            System.err.println("Blacklist was not loaded.");
        } finally { //TODO - any IO issues ignored
            // Close the reader on every path, so a failed read does not leak the file handle.
            if (ieeja != null) {
                try {
                    ieeja.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        return blist;
    }

    /**
     * Tells whether paradigm information is available for this entry.
     * Not sure if this is the best way to treat paradigms.
     * Currently, to yield true, a paradigm must be set for all derivatives
     * and for either the header or at least one sense. Phrases are not
     * consulted. An entry without a header is treated like one whose header
     * has no paradigm.
     *
     * @return true if the condition described above holds
     */
    public boolean hasParadigm() {
        // head stays null when the entry had no 'v' field; do not fail on it.
        boolean res = head != null && head.hasParadigm();
        if (senses != null) {
            for (Sense s : senses) {
                if (s != null && s.hasParadigm()) {
                    res = true;
                }
            }
        }

        // A single derivative without a paradigm overrides everything found so far.
        if (derivs != null) {
            for (Header d : derivs) {
                if (!d.hasParadigm()) {
                    res = false;
                }
            }
        }
        return res;
    }

    /**
     * Tells whether any part of this entry reports an unparsed gram: the
     * header, then the senses, then the phrases, then the derivatives are
     * asked in turn, stopping at the first one that answers true. Parts that
     * are null are skipped.
     *
     * @return true as soon as one part reports an unparsed gram, false if
     *         none does
     */
    public boolean hasUnparsedGram() {
        boolean headUnparsed = head != null && head.hasUnparsedGram();
        if (headUnparsed) {
            return true;
        }

        if (senses != null) {
            Iterator<Sense> senseIt = senses.iterator();
            while (senseIt.hasNext()) {
                if (senseIt.next().hasUnparsedGram()) {
                    return true;
                }
            }
        }

        if (phrases != null) {
            Iterator<Phrase> phraseIt = phrases.iterator();
            while (phraseIt.hasNext()) {
                if (phraseIt.next().hasUnparsedGram()) {
                    return true;
                }
            }
        }

        if (derivs != null) {
            Iterator<Header> derivIt = derivs.iterator();
            while (derivIt.hasNext()) {
                if (derivIt.next().hasUnparsedGram()) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Build a JSON representation, designed to load in Tezaurs2 webapp well.
     * <p>
     * The object starts with whatever the header's toJSON() produces,
     * followed by "ID" (only when homId is set), "Senses" (always),
     * "Phrases" and "Derivatives" (only when the respective list is not
     * null) and finally the sources' own JSON (only when sources are present
     * and not empty).
     * <p>
     * NOTE(review): head is dereferenced unconditionally, so calling this on
     * an entry without a header throws a NullPointerException.
     *
     * @return JSON representation
     */
    public String toJSON() {
        StringBuilder json = new StringBuilder();
        json.append('{').append(head.toJSON());

        if (homId != null) {
            json.append(", \"ID\":\"").append(JSONObject.escape(homId)).append("\"");
        }

        json.append(", \"Senses\":").append(JSONUtils.objectsToJSON(senses));

        if (phrases != null) {
            json.append(", \"Phrases\":").append(JSONUtils.objectsToJSON(phrases));
        }

        if (derivs != null) {
            json.append(", \"Derivatives\":").append(JSONUtils.objectsToJSON(derivs));
        }

        boolean haveSources = sources != null && !sources.isEmpty();
        if (haveSources) {
            json.append(",").append(sources.toJSON());
        }

        return json.append('}').toString();
    }

    // Here the magic must happen.
    /*   private void setParadigm() {
          // 1: Lietv?rds 1. deklin?cija -s
          if (( lemma.l.endsWith("s") 
     && gramContains("v.") && !gramContains("-ais")) //FIXME pabasv?rdi kas nav pai nor?dti????
     && !lemma.l.endsWith("is") && !lemma.l.endsWith("us") && !gramContains("nenoteiktais vietn.") 
     && !gramContains("-s?ls") && !gramContains("-rudens")
     && !lemma.l.endsWith("rudens") && !lemma.l.endsWith("debess")
     && !lemma.l.endsWith("akmens") && !lemma.l.endsWith("asmens")
     && !lemma.l.endsWith("dens") && !lemma.l.endsWith("suns")
     && !lemma.l.endsWith("zibens") && !lemma.l.endsWith("mness")) {
     if (paradigm > 0)
        System.err.printf(
           "V?rds '%s' gram '%s' atbilst paradigm?m %d un %d\n", lemma,
           gram, paradigm, 1);
         
     removeGram("v.");
     removeGram("lietv.");
     paradigm = 1;
          }
               
          // 21: Apst?ka v?rds
          //if (gram != null && gram.equalsIgnoreCase("apst.")) return 21;
              
          if (paradigm > 0) {
     // ja gramatik? ir -a, tad p?rbaudam vai tie?m izpild?s
     assertNounEnding("-a","a", AttributeNames.v_Singular, AttributeNames.v_Genitive);
     assertNounEnding("a","a", AttributeNames.v_Singular, AttributeNames.v_Genitive);
     assertNounEnding("- a","a", AttributeNames.v_Singular, AttributeNames.v_Genitive); //TODO - typo pirmavot?
     assertNounEnding("-a","a", AttributeNames.v_Singular, AttributeNames.v_Genitive);
     assertNounEnding("-sa","sa", AttributeNames.v_Singular, AttributeNames.v_Genitive);
     assertNounEnding("-ja","ja", AttributeNames.v_Singular, AttributeNames.v_Genitive);
     assertNounEnding("-a","a", AttributeNames.v_Singular, AttributeNames.v_Genitive);
     assertNounEnding("-ra","ra", AttributeNames.v_Singular, AttributeNames.v_Genitive);
     assertNounEnding("-u","u", AttributeNames.v_Plural, AttributeNames.v_Genitive); //TODO - vai vienmr t??
         
     if (gram != null && gram.length() != 0)
        System.err.printf("%s\t('%s' - gram bija %s)\n",gram,lemma.l,originalGram);
                  
     if (analyzer != null) {
        Word analze = analyzer.analyzeLemma(lemma.l);
        boolean found = false;
        String paradigmas = "";
        for (Wordform variants : analze.wordforms) {
           Paradigm paradigmas_variants = variants.getEnding().getParadigm();
           if (paradigmas_variants.getID() == paradigm
              || (paradigmas_variants.getID() == 13 && paradigm==1)
              || (paradigmas_variants.getID() == 15 && paradigm==1)) //-iens atvasin?jumi
              found = true;
           else paradigmas = paradigmas + " " + String.valueOf(paradigmas_variants.getID());
        }
        if (analze.isRecognized() && !found) 
           System.err.printf("'%s' - iet %d bet leksikon? ir %s\n", lemma, paradigm, paradigmas);
        
     }
          }
              
          //if (true_gram != null) System.out.printf("Truegram: '%s' out of '%s'\n",true_gram,original_gram);
        
          //if (gram != null && gram.contains(".:")) System.err.println(original_gram); FIXME - te ir puse typo ...
       }//*/

    // What is this?
    // This is for test purposes.
    /*   private void assertNounEnding(
          String gramDesc, String ending, String number, String nouncase)
       {
          // Assertion to verify if analyzer stemchanges match the dictionary.
          if (gramContains(gramDesc) && analyzer != null) { 
     Paradigm p = analyzer.paradigmByID(paradigm);
     //FIXME - k? tad is str?d? ar daudzskaitliniekiem?
         
     ArrayList<Wordform> inflections = analyzer.generateInflections(
        lemma.l, paradigm);
     for (Wordform wf : inflections) {
        if (wf.isMatchingStrong(AttributeNames.i_Case, nouncase) &&
           wf.isMatchingStrong(AttributeNames.i_Number, number)) {
               
           if (!wf.getToken().endsWith(ending)) 
              System.err.printf(
                 "Gram '%s' mismatch - expected to end with -%s but got %s\n",
                 gramDesc, ending, wf.getToken());
        }
     }
          }      
          removeGram(gramDesc);
       }//*/

    /**
     *  Formats a list of inflections as an JSON array.
     */
    /*   private static Object formatInflections(ArrayList<Wordform> inflections) {
          StringBuilder s = new StringBuilder();
          s.append('[');
              
          LinkedList<String> showAttrs = new LinkedList<String>();
          showAttrs.add(AttributeNames.i_Word);
          showAttrs.add(AttributeNames.i_Case);
          showAttrs.add(AttributeNames.i_Number);
              
          Iterator<Wordform> i = inflections.iterator();
          while (i.hasNext()) {
     Wordform wf = i.next();
     wf.filterAttributes(showAttrs);
     s.append(wf.toJSON());
     if (i.hasNext()) s.append(", ");
          }
          s.append(']');
          return s.toString();
       }//*/

    /**
     * Adds this entry to the lexicon by delegating to the header and then to
     * every derivative, in that order. A missing header or a missing
     * derivative list is skipped.
     *
     * @param analizators  analyzer handed on to each Header.addToLexicon call
     * @param importSource handed on unchanged to each Header.addToLexicon call
     */
    public void addToLexicon(Analyzer analizators, String importSource) {
        // head stays null when the entry had no 'v' field; derivatives can still be added.
        if (this.head != null) {
            this.head.addToLexicon(analizators, importSource);
        }
        if (this.derivs != null) {
            for (Header h : this.derivs) {
                h.addToLexicon(analizators, importSource);
            }
        }
    }

}