Java tutorial
/******************************************************************************* * Copyright 2013, 2014 Institute of Mathematics and Computer Science, University of Latvia * Author: Lauma Pretkalnia * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. *******************************************************************************/ package lv.semti.Thesaurus.struct; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.regex.Matcher; import java.util.regex.Pattern; import lv.semti.Thesaurus.utils.HasToJSON; import lv.semti.Thesaurus.utils.MappingSet; import lv.semti.Thesaurus.utils.Tuple; import lv.semti.Thesaurus.utils.JSONUtils; import org.json.simple.JSONObject; import org.w3c.dom.Node; /** * g (gramatika) field. */ public class Gram implements HasToJSON { public String orig; public HashSet<String> flags; public LinkedList<LinkedList<String>> leftovers; public HashSet<Integer> paradigm; /** * If grammar contains additional information about lemmas, it is * collected here. Mapping from paradigms to lemma-flagset tuples. * Flag set contains only flags for which alternate lemma differs from * general flags given in "flags" field in this grammar. */ public MappingSet<Integer, Tuple<Lemma, HashSet<String>>> altLemmas; /** * Known abbreviations and their de-abbreviations. 
*/ public static MappingSet<String, String> knownAbbr = generateKnownAbbr(); private static MappingSet<String, String> generateKnownAbbr() { MappingSet<String, String> res = new MappingSet<String, String>(); // TODO Sort out this mess. // Source: LLVV, data. res.put("adj.", "pabas v?rds"); res.put("adv.", "Apst?ka v?rds"); res.put("apst.", "Apst?ka v?rds"); res.put("divd.", "Divdabis"); res.put("Divd.", "Divdabis"); res.put("interj.", "Izsauksmes v?rds"); res.put("p. v.", "pabas v?rds"); res.put("p.", "pabas v?rds"); res.put("izsauk.", "Izsauksmes v?rds"); res.put("jaut.", "Jaut?jamais vietniekv?rds"); res.put("lietv.", "Lietv?rds"); res.put("noliedz.", "Noliedzamais vietniekv?rds"); res.put("nor?d.", "Nor?d?mais vietniekv?rds"); res.put("noteic.", "Noteicamais vietniekv?rds"); res.put("part.", "Partikula"); res.put("pieder.", "Piederbas vietniekv?rds"); res.put("pried.", "Priedklis"); // Specific processing needed. res.put("priev.", "Priev?rds"); res.put("skait.", "Skaita v?rds"); res.put("vietn.", "Vietniekv?rds"); res.put("vietniekv.", "Vietniekv?rds"); // ? res.put("visp?rin.", "Visp?rin?mais vietniekv?rds"); res.put("sas.", "Sasin?jums"); res.put("simb.", "Sasin?jums"); // ? res.put("salikteu pirm? daa.", "Salikteu daa"); res.put("salikteu pirm? daa", "Salikteu daa"); res.put("saliktea pirm? daa.", "Salikteu daa"); res.put("saliktea pirm? daa", "Salikteu daa"); res.put("salikteu daa.", "Salikteu daa"); res.put("salikteu daa", "Salikteu daa"); res.put("priev. ar en.", "Priev?rds"); res.put("priev. ar en.", "Lieto ar enetvu"); res.put("ar en.", "Priev?rds"); // It seems that without additional comments this is used for prepositions only res.put("ar en.", "Lieto ar enetvu"); res.put("priev. ar dat.", "Priev?rds"); res.put("priev. 
ar dat.", "Lieto ar datvu"); res.put("persv.", "Personv?rds"); res.put("vietv.", "Vietv?rds"); res.put("akuz.", "Akuzatvs"); res.put("dat.", "Datvs"); res.put("en.", "enitvs"); res.put("instr.", "Instrument?lis"); res.put("lok.", "Lokatvs"); res.put("nom.", "Nominatvs"); res.put("divsk.", "Divskaitlis"); // Do we really still have one of these?! res.put("dsk.", "Daudzskaitlis"); res.put("vsk.", "Vienskaitlis"); res.put("n?k.", "N?kotne"); res.put("pag.", "Pag?tne"); res.put("tag.", "Tagadne"); res.put("nenot.", "Nenoteikt? galotne"); res.put("not.", "Noteikt? galotne"); res.put("s.", "Sievieu dzimte"); res.put("v.", "Vrieu dzimte"); res.put("kopdz.", "Kopdzimte"); res.put("intrans.", "Nep?rejos"); res.put("intr.", "Nep?rejos"); res.put("trans.", "P?rejos"); // TODO vai ie vienmr ir darbbas v?rdi? res.put("konj.", "Konjug?cija"); res.put("pers.", "Persona"); //res.put("atgr.", "Atgriezensisks (vietniekv?rds?)"); //not present res.put("dem.", "Deminutvs"); res.put("Dem.", "Deminutvs"); res.put("imperf.", "Imperfektva forma"); //??? res.put("nelok.", "Nelok?ms v?rds"); res.put("Nol.", "Noliegums"); // Check with other sources! res.put("refl.", "Refleksvs"); res.put("refl.", "Darbbas v?rds"); res.put("Refl.", "Refleksvs"); res.put("Refl.", "Darbbas v?rds"); res.put("aeron.", "Aeronautika"); // ? res.put("anat.", "Anatomija"); res.put("arheol.", "Arheoloija"); res.put("arhit.", "Arhitektra"); res.put("arh.", "Arhitektra"); res.put("astr.", "Astronomija"); res.put("av.", "Avi?cija"); res.put("biol.", "Bioloija"); res.put("bikop.", "Bikopba"); res.put("bot.", "Bot?nika"); res.put("bvn.", "Bvniecba"); res.put("ek.", "Ekonomika"); res.put("ekol.", "Ekoloija"); // ? res.put("ekon.", "Ekonomika"); res.put("el.", "Elektrotehnika"); res.put("etn.", "Etnogr?fija"); res.put("farm.", "Farmakoloija"); res.put("filoz.", "Filozofija"); res.put("fin.", "Finanses"); res.put("fiz.", "Fizika"); res.put("fiziol.", "Fizioloija"); res.put("fizk.", "Fizisk? 
kultra un sports"); res.put("folkl.", "Folklora"); res.put("ent.", "entika"); // ? res.put("eod.", "eodzija"); res.put("eogr.", "eogr?fija"); res.put("eol.", "eoloija"); res.put("eom.", "eometrija"); res.put("gr?matv.", "Gr?matvedba"); res.put("hidr.", "Hidroloija"); res.put("hidrotehn.", "Hidrotehnika"); res.put("inf.", "Inform?tika"); res.put("jur.", "Jurisprudence"); res.put("jrn.", "Jrniecba"); res.put("kap.", "Attiecas uz kapit?listisko iek?rtu, kapit?listisko sabiedrbu"); res.put("kardioloij?", "Kardioloija"); res.put("kart.", "Kartogr?fija"); // ? res.put("kibern.", "Kiberntika"); res.put("kino", "Kinematogr?fija"); res.put("kokapstr.", "Kokapstr?de"); // ? res.put("kul.", "Kulin?rija"); res.put("m.", "mija"); res.put("lauks.", "Lauksaimniecba"); res.put("lauks. tehn.", "Lauksaimniecbas tehnika"); // ? res.put("literat.", "Literatrzin?tne"); res.put("lo.", "Loika"); res.put("lopk.", "Lopkopba"); res.put("mat.", "Matem?tika"); res.put("matem.", "Matem?tika"); // ? res.put("med.", "Medicna"); res.put("medn.", "Medniecba"); res.put("met.", "Meteoroloija"); // ? res.put("metal.", "Metalurija"); res.put("met?l.", "Met?lapstr?de"); // ? res.put("meteorol.", "Meteoroloija"); res.put("me.", "Meniecba"); // ? res.put("mer.", "Merpniecba"); res.put("mes.", "Mesaimniecba"); res.put("mil.", "Milit?r?s zin?tnes"); res.put("min.", "Mineraloija"); res.put("mit.", "Mitoloija"); res.put("mz.", "Mzika"); res.put("oftalmoloij?", "Oftalmoloija"); res.put("ornit.", "Ornitoloija"); res.put("pol.", "Politika"); res.put("poligr.", "Poligr?fija"); res.put("psih.", "Psiholoija"); res.put("rel.", "Reliija"); res.put("social.", "Socioloija"); // ? res.put("sociol.", "Socioloija"); res.put("tehn.", "Tehnika"); res.put("tehnol.", "Tehnoloija"); res.put("telek.", "Telekomunik?cijas"); // ? res.put("tekst.", "Tekstilrpniecba"); res.put("tekstilr.", "Tekstilrpniecba"); // ? 
res.put("TV", "Televzija"); res.put("val.", "Valodniecba"); res.put("vet.", "Veterin?rija"); res.put("zool.", "Zooloija"); res.put("ar?bu", "Ar?bu"); res.put("ar?bu", "V?rds svevalod?"); res.put("ar?bu val.", "Ar?bu"); res.put("ar?bu val.", "V?rds svevalod?"); res.put("v?cu val.", "V?cu"); res.put("v?cu val.", "V?rds svevalod?"); res.put("fr.", "Fran?u"); res.put("fr.", "V?rds svevalod?"); res.put("grieu", "Grieu"); res.put("grieu", "V?rds svevalod?"); res.put("gr.", "Grieu"); res.put("gr.", "V?rds svevalod?"); res.put("it.", "It?lieu"); //Muz res.put("it.", "V?rds svevalod?"); res.put("lat.", "Latu"); res.put("lat.", "V?rds svevalod?"); res.put("liet.", "Lietuvieu"); res.put("liet.", "V?rds svevalod?"); res.put("sengr.", "Sengrieu"); res.put("sengr.", "V?rds svevalod?"); res.put("dial. (augzemnieku)", "Aguzemnieku"); // Unique. res.put("dial. (augzemnieku)", "Dialekts"); // Unique. res.put("latg.", "Latgalieu"); res.put("latg.", "Dialekts"); res.put("apv.", "Apvidv?rds"); res.put("vst.", "Vsturisks"); res.put("novec.", "Novecojis"); res.put("neakt.", "Neaktu?ls"); res.put("pot.", "Potiska stilistisk? nokr?sa"); res.put("niev.", "Nievga ekspresv? nokr?sa"); res.put("iron.", "Ironiska ekspresv? nokr?sa"); res.put("hum.", "Humoristiska ekspresv? nokr?sa"); res.put("vienk.", "Vienk?rrunas stilistisk? nokr?sa"); res.put("p?rn.", "P?rnest? nozm"); res.put("nevl.", "Nevlams"); // TODO - nevlamos, neliter?ros un argonus apvienot?? res.put("nelit.", "Neliter?rs"); res.put("arg.", "argonv?rds"); res.put("sar.", "Sarunvaloda"); res.put("vulg.", "Vulg?risms"); // ? //TODO - os drz?k k? atseviu koment?ru lauku(s) res.put("ar vsk.", "Ar vienskaitl"); // aunums. res.put("parasti vsk.", "Parasti vienskaitl"); res.put("parasti vsk", "Parasti vienskaitl"); res.put("par. vsk.", "Parasti vienskaitl"); res.put("tikai vsk.", "Tikai vienskaitl"); res.put("parasti dsk.", "Parasti daudzskaitl"); res.put("tikai dsk.", "Tikai daudzskaitl"); res.put("parasti 3. pers.", "Parasti 3. 
person?"); res.put("parasti saliktajos laikos", "Parasti saliktajos laikos"); res.put("parasti saliktajos laikos.", "Parasti saliktajos laikos"); res.put("parasti nenoteiksm", "Parasti nenoteiksm"); res.put("parasti nenoteiksm", "Darbbas v?rds"); res.put("parasti pavles form?", "Parasti pavles izteiksm"); res.put("parasti pavles form?", "Darbbas v?rds"); res.put("parasti pavles form?.", "Parasti pavles izteiksm"); res.put("parasti pavles form?.", "Darbbas v?rds"); res.put("nelok.", "Nelok?ms"); res.put("subst. noz.", "Lietv?rda nozm"); res.put("lietv. nozm.", "Lietv?rda nozm"); res.put("p. nozm.", "pabas v?rda nozm"); res.put("ar not. gal.", "Ar noteikto galotni"); res.put("ar lielo s?kumburtu", "Ar lielo s?kumburtu"); res.put("pareti.", "Pareti"); res.put("pareti", "Pareti"); res.put("reti.", "Reti"); res.put("reti", "Reti"); res.put("ret?k", "Ret?k"); res.put("hip.", "Hipotze"); return res; } /*/* * Patterns for identifying (true) grammatical information. */ // public static LinkedList<Pattern> knownPatterns = generateKnownPatterns(); /* private static LinkedList<Pattern> generateKnownPatterns() { LinkedList<Pattern> res = new LinkedList<Pattern>(); res.add(Pattern.compile("^(.*)(vokatvs [^ ,;:]+)(.*)$")); res.add(Pattern.compile("^(.*)(biei lok\\.: [^ ,;:]+)(.*)$")); res.add(Pattern.compile("^(.*)(parasti lok\\.: [^ ,;:]+)(.*)$")); res.add(Pattern.compile("^(.*)(parasti vsk\\. lok\\.: [^ ,;:]+)(.*)$")); res.add(Pattern.compile("^(.*)(parasti en\\.: [^ ,;:]+)(.*)$")); res.add(Pattern.compile("^(.*)(pamata skait(\\.|a v?rds) lietv(\\.|?rda) nozm\\.?)(.*)$")); res.add(Pattern.compile("^(.*)(\\(?parasti folkl\\.(\\)\\.)?)(.*)$")); res.add(Pattern.compile("^(.*)(parasti saistt? valod?\\.)(.*)$")); res.add(Pattern.compile("^(.*)(apst\\. 
nozm)(.*)$")); res.add(Pattern.compile("^(.*)(\\(v?cu \"krava\"\\))(.*)$")); return res; }//*/ public Gram() { orig = null; flags = null; leftovers = null; paradigm = null; altLemmas = null; } /** * @param lemma is used for grammar parsing. */ public Gram(Node gramNode, String lemma) { orig = gramNode.getTextContent(); leftovers = null; flags = new HashSet<String>(); paradigm = new HashSet<Integer>(); altLemmas = null; parseGram(lemma); } /** * @param lemma is used for grammar parsing. */ public void set(String gramText, String lemma) { orig = gramText; leftovers = null; flags = new HashSet<String>(); paradigm = new HashSet<Integer>(); altLemmas = null; parseGram(lemma); } public boolean hasParadigm() { return !paradigm.isEmpty(); } /** * Only works correctly, if cleanupLeftovers is used, when needed. */ public boolean hasUnparsedGram() { //cleanupLeftovers(); // What is better - unexpected side effects or not working, when used incorrectly? return !leftovers.isEmpty(); } /** * @param lemma is used for grammar parsing. */ private void parseGram(String lemma) { String correctedGram = correctOCRErrors(orig); altLemmas = new MappingSet<Integer, Tuple<Lemma, HashSet<String>>>(); // First process ending patterns, usually located in the beginning // of the grammar string. correctedGram = processBeginingWithPatterns(correctedGram, lemma); String[] subGrams = correctedGram.split("\\s*;\\s*"); leftovers = new LinkedList<LinkedList<String>>(); // Process each semicolon-separated substring. for (String subGram : subGrams) { subGram = processWithNoSemicolonPatterns(subGram, lemma); String[] gramElems = subGram.split("\\s*,\\s*"); LinkedList<String> toDo = new LinkedList<String>(); // Process each comma-separated substring. for (String gramElem : gramElems) { gramElem = gramElem.trim(); // Check for abbreviations. if (knownAbbr.containsKey(gramElem)) flags.addAll(knownAbbr.getAll(gramElem)); else { // Check for matches regular expressions. 
gramElem = processWithNoCommaPatterns(gramElem, lemma); // Unprocessed leftovers. if (!gramElem.equals("")) toDo.add(gramElem); } } // TODO: magical patterns for processing endings. leftovers.add(toDo); } // Try to deduce paradigm from flags. paradigmFromFlags(lemma); cleanupLeftovers(); // TODO cleanup altLemmas; } /** * This method contains collection of ending patterns, found in data. * These patterns are meant for using on the beginning of the * unsegmented grammar string. * Thus,e.g., if there was no plural-only nouns with ending -as, then * there is no rule for processing such words (at least in most cases). * @param lemma is used for grammar parsing. */ private String processBeginingWithPatterns(String gramText, String lemma) { gramText = gramText.trim(); int newBegin = -1; // Blocks of rules. if (newBegin == -1) newBegin = firstConjDirVerb3PersRules(gramText, lemma); if (newBegin == -1) newBegin = firstConjDirVerbAllPersRules(gramText, lemma); if (newBegin == -1) newBegin = secondConjDirVerbRules(gramText, lemma); if (newBegin == -1) newBegin = thirdConjDir3PersVerbRules(gramText, lemma); if (newBegin == -1) newBegin = thirdConjDirAllPersVerbRules(gramText, lemma); if (newBegin == -1) newBegin = firstConjRef3PersVerbRules(gramText, lemma); if (newBegin == -1) newBegin = firstConjRefAllPersVerbRules(gramText, lemma); if (newBegin == -1) newBegin = secondConjRefVerbRules(gramText, lemma); if (newBegin == -1) newBegin = thirdConjRef3PersVerbRules(gramText, lemma); if (newBegin == -1) newBegin = thirdConjRefAllPersVerbRules(gramText, lemma); if (newBegin == -1) newBegin = sixthDeclNounFullWordRules(gramText, lemma); // Complicated rules: grammar contains lemma variation spelled out. if (newBegin == -1) { // Super-complicated case: pronunciation included. // Paradigm 1: Lietv?rds 1. deklin?cija -s // Changed in new version /*if (lemma.endsWith("di") && gramText.matches("(-u, vsk\\. 
(\\Q" + lemma.substring(0, lemma.length() - 1) + "s\\E) \\[([^\\]]*?)\\] -a, v\\.)(.*)?")) // ?beziedi: -u, vsk. ?bezieds [a^be`zie^c] -a, v. { Pattern pattern = Pattern.compile("(-u, vsk\\. (\\Q" + lemma.substring(0, lemma.length() - 1) + "s\\E) \\[([^\\]]*?)\\] -a, v\\.)(.*)?"); Matcher matcher = pattern.matcher(gramText); if (!matcher.matches()) System.err.printf("Problem matching \"%s\" with \"?bezieds\" rule\n", lemma); newBegin = matcher.group(1).length(); Lemma altLemma = new Lemma(matcher.group(2)); altLemma.pronunciation = matcher.group(3); HashSet<String> altParams = new HashSet<String> (); altParams.add("irkav?rds vienskaitl"); altLemmas.put(1, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(1); flags.add("Vrieu dzimte"); flags.add("Lietv?rds"); flags.add("irkav?rds daudzskaitl"); }//*/ // Paradigm 2: Lietv?rds 1. deklin?cija - if (lemma.endsWith("i") && gramText .startsWith("-u, vsk. " + lemma.substring(0, lemma.length() - 2) + ", -a, v.")) // ditaurii: -u, vsk. ditauri, -a, v. { newBegin = ("-u, vsk. " + lemma.substring(0, lemma.length() - 2) + ", -a, v.").length(); Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 2) + ""); HashSet<String> altParams = new HashSet<String>(); altParams.add("irkav?rds vienskaitl"); altLemmas.put(2, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(2); flags.add("Vrieu dzimte"); flags.add("Lietv?rds"); flags.add("irkav?rds daudzskaitl"); } // Paradigm 3: Lietv?rds 2. deklin?cija -is else if (lemma.endsWith("i") && gramText .startsWith("-u, vsk. " + lemma.substring(0, lemma.length() - 2) + "nis, -a, v.")) // aizvirti: -u, vsk. aizvirtnis, -a, v. { newBegin = ("-u, vsk. 
" + lemma.substring(0, lemma.length() - 2) + "nis, -a, v.").length(); Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 2) + "nis"); HashSet<String> altParams = new HashSet<String>(); altParams.add("irkav?rds vienskaitl"); altLemmas.put(3, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(3); flags.add("Vrieu dzimte"); flags.add("Lietv?rds"); flags.add("irkav?rds daudzskaitl"); } else if (lemma.endsWith("i") && gramText .startsWith("-u, vsk. " + lemma.substring(0, lemma.length() - 3) + "lnis, -a, v.")) // starpvii: -u, vsk. starpvilnis, -a, v. { newBegin = ("-u, vsk. " + lemma.substring(0, lemma.length() - 3) + "lnis, -a, v.").length(); Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 3) + "lnis"); HashSet<String> altParams = new HashSet<String>(); altParams.add("irkav?rds vienskaitl"); altLemmas.put(3, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(3); flags.add("Vrieu dzimte"); flags.add("Lietv?rds"); flags.add("irkav?rds daudzskaitl"); } else if (lemma.endsWith("ji") && gramText.startsWith("-u, vsk. " + lemma + "s, -ja, v.")) // airk?ji: -u, vsk. airk?jis, -ja, v. { newBegin = ("-u, vsk. " + lemma + "s, -ja, v.").length(); Lemma altLemma = new Lemma(lemma + "s"); HashSet<String> altParams = new HashSet<String>(); altParams.add("irkav?rds vienskaitl"); altLemmas.put(3, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(3); flags.add("Vrieu dzimte"); flags.add("Lietv?rds"); flags.add("irkav?rds daudzskaitl"); } // Paradigm 1: Lietv?rds 1. deklin?cija -s else if (lemma.endsWith("i") && gramText.startsWith("-u, vsk. " + lemma.substring(0, lemma.length() - 1) + "s, -a, v.")) // aizkari: -u, vsk. aizkars, -a, v. { newBegin = ("-u, vsk. 
" + lemma.substring(0, lemma.length() - 1) + "s, -a, v.").length(); Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 1) + "s"); HashSet<String> altParams = new HashSet<String>(); altParams.add("irkav?rds vienskaitl"); altLemmas.put(1, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(1); flags.add("Vrieu dzimte"); flags.add("Lietv?rds"); flags.add("irkav?rds daudzskaitl"); } } // "-es, dsk. en. -??u, s." if (newBegin == -1) newBegin = esEndingPluralGenUEndingFemRules(gramText, lemma); // More rules if (newBegin == -1) { // Long, specific patterns. // Paradigm Unknown: Atgriezeniskie lietv?rdi -an?s if (gramText.startsWith("en. -?s, akuz. -os, instr. -os, dsk. -?s, en. -os, akuz. -?s, s.")) //aizbildin?an?s { newBegin = "en. -?s, akuz. -os, instr. -os, dsk. -?s, en. -os, akuz. -?s, s.".length(); if (lemma.endsWith("an?s")) { paradigm.add(0); flags.add("Lietv?rds"); flags.add("Atgriezeniskais lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm -an?s\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } // Paradigm 25: Vietniekv?rdi else if (gramText.matches("en\\. -k?, dat\\. -kam, akuz\\., instr\\. -ko([.,;].*)?")) //daudzkas { newBegin = "en. -k?, dat. -kam, akuz., instr. -ko".length(); if (lemma.endsWith("kas")) { paradigm.add(25); flags.add("Vietniekv?rds"); flags.add("Loct k? \"kas\""); } else { System.err.printf("Problem matching \"%s\" with paradigm 25\n", lemma); newBegin = 0; } } // Paradigm 7: Lietv?rds 4. deklin?cija -a siev. dz. // Paradigm 8: Lietv?rds 4. deklin?cija -a vr. dz. else if (gramText.startsWith("en. -as, v. dat. -am, s. dat. -ai, kopdz.")) { newBegin = "en. -as, v. dat. -am, s. dat. -ai, kopdz.".length(); if (lemma.endsWith("a")) { paradigm.add(7); paradigm.add(8); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 7, 8\n", lemma); newBegin = 0; } flags.add("Kopdzimte"); } // Paradigm 1: Lietv?rds 1. 
deklin?cija -s // Paradigm 2: Lietv?rds 1. deklin?cija - else if (gramText.startsWith("lietv. -a, v.")) // aerobs { newBegin = "lietv. -a, v.".length(); //if (lemma.matches(".*[jr]is")) paradigm.add(3); //else //{ //if (lemma.matches(".*[aeiou??]s") || lemma.matches(".*[^s]")) // System.err.printf("Problem matching \"%s\" with paradigms 1, 2, 3\n", lemma); if (lemma.endsWith("")) paradigm.add(2); else if (lemma.matches(".*[^aeiou??]s")) paradigm.add(1); else { System.err.printf("Problem matching \"%s\" with paradigms 1, 2, 3\n", lemma); newBegin = 0; } //} flags.add("Vrieu dzimte"); flags.add("Lietv?rds"); } else if (gramText.startsWith("vsk. -a, v.")) // acteks { newBegin = "vsk. -a, v.".length(); if (lemma.endsWith("")) { paradigm.add(2); flags.add("Lietv?rds"); } else if (lemma.matches(".*[^aeiou??]s")) { paradigm.add(1); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigms 1, 2\n", lemma); newBegin = 0; } flags.add("Vrieu dzimte"); flags.add("Vienskaitlis"); } // Paradigm 3: Lietv?rds 2. deklin?cija -is else if (gramText.startsWith("-a, dsk. en. -u, v.")) // bizmanis { newBegin = "-a, dsk. en. -u, v.".length(); if (lemma.endsWith("nis")) { paradigm.add(3); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigms 3\n", lemma); newBegin = 0; } flags.add("Vrieu dzimte"); } else if (gramText.matches("-a, dsk\\. en\\. -u([;,.].*)?")) // afroamerik?i { newBegin = "-a, dsk. en. -u".length(); if (lemma.endsWith("i")) { paradigm.add(3); flags.add("irkav?rds daudzskaitl"); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigms 3\n", lemma); newBegin = 0; } flags.add("Vrieu dzimte"); } // Paradigm 9: Lietv?rds 5. deklin?cija -e siev. dz. else if (gramText.matches("-es, s\\., dsk\\. en\\. -bju([;,.].*)?")) //acetilsalicilsk?be { newBegin = "-es, s., dsk. en. 
-bju".length(); if (lemma.endsWith("be")) { paradigm.add(9); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } else if (gramText.matches("-es, dsk\\. en\\. -ru([;,.].*)?")) //?dere { newBegin = "-es, dsk. en. -ru".length(); if (lemma.endsWith("re")) { paradigm.add(9); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } else if (gramText.matches("-es, dsk\\. en\\. -u([;,.].*)?")) //aizkr?sne { newBegin = "-es, dsk. en. -u".length(); if (lemma.matches(".*[s][n]e")) { paradigm.add(9); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } else if (gramText.startsWith("-es, s.")) //aizture { newBegin = "-es, s.".length(); if (lemma.endsWith("e")) { paradigm.add(9); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } // Paradigm 11: Lietv?rds 6. deklin?cija -s // Ending rules else if (gramText.matches("-ts, -u([;,.].*)?")) //abonentpults { newBegin = "-ts, -u".length(); if (lemma.endsWith("ts")) { paradigm.add(11); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 11\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } else if (gramText.matches("-vs, -vju([;,.].*)?")) //adatzivs { newBegin = "-vs, -vju".length(); if (lemma.endsWith("vs")) { paradigm.add(11); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 11\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } // Paradigm 7: Lietv?rds 4. deklin?cija -a siev. dz. // Paradigm 11: Lietv?rds 6. deklin?cija -s siev. dz. 
else if (gramText.startsWith("-as, s.")) //aber?cija, milns, naj?das { newBegin = "-as, s.".length(); if (lemma.matches(".*[^aeiou??]s")) { paradigm.add(11); flags.add("Lietv?rds"); } else if (lemma.endsWith("a")) { paradigm.add(7); flags.add("Lietv?rds"); } else if (lemma.matches(".*[^aeiou??]as")) { paradigm.add(7); flags.add("irkav?rds daudzskaitl"); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 7, 11\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } // Paradigm 9: Lietv?rds 5. deklin?cija -e siev. dz. // Paradigm 11: Lietv?rds 6. deklin?cija -s else if (gramText.startsWith("dsk. en. -u, s.")) //?dmine, b?kuguns, b?rksaknes { newBegin = "dsk. en. -u, s.".length(); if (lemma.endsWith("ns")) { paradigm.add(11); flags.add("Lietv?rds"); } else if (lemma.endsWith("nes")) { paradigm.add(9); flags.add("Lietv?rds"); flags.add("irkav?rds daudzskaitl"); } else if (lemma.endsWith("ne")) { paradigm.add(9); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 9, 11\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } // Grammar includes endings for other lemma variants. // Paradigm 1: Lietv?rds 1. deklin?cija -s // Paradigm 9: Lietv?rds 5. deklin?cija -e siev. dz. else if (gramText.matches("s\\. -te, -u([;.].*)?")) //abstinents { newBegin = "s. -te, -u".length(); if (lemma.endsWith("ts")) { Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 1) + "e"); HashSet<String> altParams = new HashSet<String>(); altParams.add("Sievieu dzimte"); altParams.add("Cita paradigma"); altLemmas.put(9, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(1); flags.add("Lietv?rds"); flags.add("Vrieu dzimte"); } else { System.err.printf("Problem matching \"%s\" with paradigm 1 & 5\n", lemma); newBegin = 0; } } // Paradigm 3: Lietv?rds 2. deklin?cija -is // Paradigm 9: Lietv?rds 5. deklin?cija -e siev. dz. else if (gramText.matches("-a; s\\. 
-e -u([;.].*)?")) //agonistiis { newBegin = "-a; s. -e -u".length(); if (lemma.endsWith("is")) { Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 2) + "e"); HashSet<String> altParams = new HashSet<String>(); altParams.add("Sievieu dzimte"); altParams.add("Cita paradigma"); altLemmas.put(9, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(2); flags.add("Lietv?rds"); flags.add("Vrieu dzimte"); } else { System.err.printf("Problem matching \"%s\" with paradigm 3 & 5\n", lemma); newBegin = 0; } } else if (gramText.matches("-a; s. -te, -u([;.].*)?")) //aiolietis { newBegin = "-a; s. -te, -u".length(); if (lemma.endsWith("tis")) { Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 2) + "e"); HashSet<String> altParams = new HashSet<String>(); altParams.add("Sievieu dzimte"); altParams.add("Cita paradigma"); altLemmas.put(9, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(2); flags.add("Lietv?rds"); flags.add("Vrieu dzimte"); } else { System.err.printf("Problem matching \"%s\" with paradigm 3 & 5\n", lemma); newBegin = 0; } } // Paradigm 13: pabas v?rdi ar -s // Paradigm 14: pabas v?rdi ar - else if (gramText.matches("p\\. v\\. -ais; s\\. -a, -?([;,.].*)?")) //aerobs { newBegin = "p. v. -ais; s. -a, -?".length(); if (lemma.matches(".*[^aeiou??]")) { paradigm.add(14); flags.add("pabas v?rds"); } else if (lemma.matches(".*[^aeiou??]s")) { paradigm.add(13); flags.add("pabas v?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigms 13, 14\n", lemma); newBegin = 0; } } else if (gramText.matches("-ais[;,] s\\. -a, -?([;,.].*)?")) //abj?ds, acains, ag?ms { newBegin = "-ais; s. 
-a, -?".length(); if (lemma.matches(".*[^aeiou??]")) { paradigm.add(14); flags.add("pabas v?rds"); } else if (lemma.matches(".*[^aeiou??]s")) { paradigm.add(13); flags.add("pabas v?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigms 13, 14\n", lemma); newBegin = 0; } } // Paradigm 13-14: plural forms else if (gramText.startsWith("s. -as; adj.")) //abji 2 { newBegin = "s. -as; adj.".length(); if (lemma.endsWith("i")) { paradigm.add(13); paradigm.add(14); flags.add("pabas v?rds"); flags.add("irkav?rds daudzskaitl"); flags.add("Neviennozmga paradigma"); } else { System.err.printf("Problem matching \"%s\" with paradigms 13-14\n", lemma); newBegin = 0; } } else if (gramText.startsWith("s. -as; tikai dsk.")) //abji 1 { // This exception is on purpose! this way "tikai dsk." is later // transformed to appropriate flag. newBegin = "s. -as;".length(); if (lemma.endsWith("i")) { paradigm.add(13); paradigm.add(14); flags.add("pabas v?rds"); flags.add("irkav?rds daudzskaitl"); flags.add("Neviennozmga paradigma"); } else { System.err.printf("Problem matching \"%s\" with paradigms 13-14\n", lemma); newBegin = 0; } } // Paradigm 25: Vietniekv?rdi else if (gramText.startsWith("s. -as; vietniekv.")) // abi { newBegin = "s. -as; vietniekv.".length(); if (lemma.endsWith("i")) { paradigm.add(25); flags.add("irkav?rds daudzskaitl"); } else { System.err.printf("Problem matching \"%s\" with paradigm 25\n", lemma); newBegin = 0; } flags.add("Vietniekv?rds"); } // Paradigm 30: jaundzimuais, pdjais else if (gramText.startsWith("-?, v. -?s, s.")) //iereibuais { newBegin = "-?, v. 
-?s, s.".length(); if (lemma.endsWith("uais")) { paradigm.add(30); flags.add("pabas v?rds"); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 30\n", lemma); newBegin = 0; } } else if (gramText.startsWith("-?, v.")) //pirmdzimtais { newBegin = "-?, v.".length(); if (lemma.endsWith("ais")) { paradigm.add(30); flags.add("pabas v?rds"); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 30\n", lemma); newBegin = 0; } flags.add("Vrieu dzimte"); } else if (gramText.startsWith("-?s, s.")) //pirmdzimt?, -an?s { newBegin = "-?s, s.".length(); if (lemma.endsWith("an?s")) { paradigm.add(0); flags.add("Atgriezeniskais lietv?rds"); flags.add("Lietv?rds"); } else if (lemma.endsWith("?")) { paradigm.add(30); flags.add("pabas v?rds"); flags.add("Lietv?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigms 30, -an?s\n", lemma); newBegin = 0; } flags.add("Sievieu dzimte"); } else if (gramText.matches("s\\. -?([.;].*)?")) //agr?kais { newBegin = "s. -?".length(); if (lemma.endsWith("ais")) { paradigm.add(30); flags.add("pabas v?rds"); } else { System.err.printf("Problem matching \"%s\" with paradigm 30\n", lemma); newBegin = 0; } } // Paradigm Unknown: Divdabis // Grammar includes endings for other lemma variants. else if (gramText.matches("-guais; s\\. -gusi, -gus([.;].*)?")) //aizdudzis { newBegin = "-guais; s. -gusi, -gus".length(); if (lemma.endsWith("dzis")) { Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 4) + "gusi"); HashSet<String> altParams = new HashSet<String>(); altParams.add("Sievieu dzimte"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(0); flags.add("Divdabis"); flags.add("Lok?mais dar?m?s k?rtas pag?tnes divdabis (-is, -usi, -ies, -usies)"); flags.add("Vrieu dzimte"); } else { System.err.printf("Problem matching \"%s\" with paradigm 0 (Divdabis)\n", lemma); newBegin = 0; } } else if (gramText.matches("-uais; s. 
-usi\\, -us([.;].*)?")) //aizkpis { newBegin = "-uais; s. -usi, -us".length(); if (lemma.matches(".*[cdjlmprstv]is")) { Lemma altLemma = new Lemma(lemma.substring(0, lemma.length() - 3) + "usi"); HashSet<String> altParams = new HashSet<String>(); altParams.add("Sievieu dzimte"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); paradigm.add(0); flags.add("Divdabis"); flags.add("Lok?mais dar?m?s k?rtas pag?tnes divdabis (-is, -usi, -ies, -usies)"); flags.add("Vrieu dzimte"); } else { System.err.printf("Problem matching \"%s\" with paradigm 0 (Divdabis)\n", lemma); newBegin = 0; } } } // "-??a, v." if (newBegin == -1) newBegin = aEndingMascRules(gramText, lemma); // "-??u, v." if (newBegin == -1) newBegin = uEndingMascRules(gramText, lemma); // "-??u, s." if (newBegin == -1) newBegin = uEndingFemRules(gramText, lemma); // === Risky rules ================================================= // These rules matches prefix of some other rule. if (newBegin == -1) newBegin = singleEndingOnlyRules(gramText, lemma); if (newBegin > 0 && newBegin <= gramText.length()) gramText = gramText.substring(newBegin); else if (newBegin > gramText.length()) { System.err.printf("Problem with processing lemma \"%s\" and grammar \"%s\": obtained cut index \"%d\"", lemma, gramText, newBegin); } if (gramText.matches("[.,;].*")) gramText = gramText.substring(1); return gramText; } /** * This method contains collection of patterns with no commas in them - * these patterns can be applied to any segmented grammar substring, not * only on the beginning of the grammar. Only patterns found in data are * given. Thus,e.g., if there was no plural-only nouns with ending -as, * then there is no rule for processing such words (at least in most * cases). * @param lemma is used for grammar parsing. 
* @return leftovers (unprocessed part of string) */ private String processWithNoCommaPatterns(String gramText, String lemma) { gramText = gramText.trim(); int newBegin = -1; // Alternative form processing. if (gramText.matches("parasti divd\\. form?: (\\w+)")) //aizdzert->aizdzerts { Matcher m = Pattern.compile("(parasti divd\\. form?: (\\w+))([.;].*)?").matcher(gramText); m.matches(); String newLemma = m.group(2); Lemma altLemma = new Lemma(newLemma); HashSet<String> altParams = new HashSet<String>(); altParams.add("Divdabis"); altParams.add("Cita paradigma"); newBegin = m.group(1).length(); if (newLemma.endsWith("ts")) // aizdzert->aizdzerts { altParams.add("Lok?mais cieam?s k?rtas pag?tnes divdabis (-ts, -ta)"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); flags.add("Darbbas v?rds"); flags.add("Parasti divdabja form?"); flags.add("Parasti lok?m? cieam?s k?rtas pag?tnes divdabja form?"); } else if (newLemma.endsWith("is") || newLemma.endsWith("ies")) // aizmakt->aizsmacis, pieriesties->pieriesies { altParams.add("Lok?mais dar?m?s k?rtas pag?tnes divdabis (-is, -usi, -ies, -usies)"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); flags.add("Darbbas v?rds"); flags.add("Parasti divdabja form?"); flags.add("Parasti lok?m? dar?m?s k?rtas pag?tnes divdabja form?"); } else if (newLemma.endsWith("damies")) //aizvilkties->aizvilkdamies { altParams.add("Daji lok?mais divdabis (-dams, -dama, -damies, -dam?s)"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); flags.add("Darbbas v?rds"); flags.add("Parasti divdabja form?"); flags.add("Parasti daji lok?m? divdabja form?"); } else { System.err.printf("Problem matching \"%s\" in entry \"%s\" with paradigm 0 (Divdabis)\n", newLemma, lemma); newBegin = 0; } } else if (gramText.matches("biei lok\\.: (\\w+)")) // agrums->agrum? 
{ Matcher m = Pattern.compile("(biei lok\\.: (\\w+))([.;].*)?").matcher(gramText); newBegin = m.group(1).length(); flags.add("Biei lokatva form?"); } if (newBegin > 0) gramText = gramText.substring(newBegin); return gramText; } /** * This method contains collection of patterns with no semicolon in them - * these patterns can be applied to grammar segmented on ';', but not * segmented on ','. Only patterns found in data are * given. Thus,e.g., if there was no plural-only nouns with ending -as, * then there is no rule for processing such words (at least in most * cases). * @param lemma is used for grammar parsing. * @return leftovers (unprocessed part of string) */ private String processWithNoSemicolonPatterns(String gramText, String lemma) { gramText = gramText.trim(); int newBegin = -1; // Alternative form processing. if (gramText.matches("parasti divd\\. form?: (\\w+), (\\w+)")) //aizelsties->aizelsies, aizelsdamies { Matcher m = Pattern.compile("(parasti divd\\. form?: (\\w+), (\\w+))([.;].*)?").matcher(gramText); m.matches(); String[] newLemmas = { m.group(2), m.group(3) }; newBegin = m.group(1).length(); for (String newLemma : newLemmas) { Lemma altLemma = new Lemma(newLemma); HashSet<String> altParams = new HashSet<String>(); altParams.add("Divdabis"); altParams.add("Cita paradigma"); if (newLemma.endsWith("ts")) // noliegt->noliegts { altParams.add("Lok?mais cieam?s k?rtas pag?tnes divdabis (-ts, -ta)"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); flags.add("Darbbas v?rds"); flags.add("Parasti divdabja form?"); flags.add("Parasti lok?m? cieam?s k?rtas pag?tnes divdabja form?"); } else if (newLemma.endsWith("is") || newLemma.endsWith("ies")) // aizelsties->aizelsies { altParams.add("Lok?mais dar?m?s k?rtas pag?tnes divdabis (-is, -usi, -ies, -usies)"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); flags.add("Darbbas v?rds"); flags.add("Parasti divdabja form?"); flags.add("Parasti lok?m? 
dar?m?s k?rtas pag?tnes divdabja form?"); } else if (newLemma.endsWith("ams") || newLemma.endsWith("?ms")) // noliegt->noliedzams { altParams.add("Lok?mais cieam?s k?rtas tagadnes divdabis (-ams, -ama, -?ms, -?ma)"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); flags.add("Darbbas v?rds"); flags.add("Parasti divdabja form?"); flags.add("Parasti lok?m? cieam?s k?rtas tagadnes divdabja form?"); } else if (newLemma.endsWith("damies")) //aizelsties->aizelsdamies { altParams.add("Daji lok?mais divdabis (-dams, -dama, -damies, -dam?s)"); altLemmas.put(0, new Tuple<Lemma, HashSet<String>>(altLemma, altParams)); flags.add("Darbbas v?rds"); flags.add("Parasti divdabja form?"); flags.add("Parasti daji lok?m? divdabja form?"); } else { System.err.printf("Problem matching \"%s\" in entry \"%s\" with paradigm 0 (Divdabis)\n", newLemma, lemma); newBegin = 0; } } } if (newBegin > 0) gramText = gramText.substring(newBegin); return gramText; } /** * Simple rule - tries to match grammar text to given string and lemma * ending. If matched, adds a single paradigm. * @param pattern Unescaped ending string grammar text must begin with * to apply this rule. * @param requiredEnding Required ending for the lemma to apply this * rule. * @param paradigmId Paradigm ID to set if rule matched. * @param positiveFlags These flags are added if rule and lemma ending * matched. * @param alwaysFlags These flags are added if rule matched. * @param gramText Grammar string currently being processed. * @param lemma Lemma string for this header. * @return New begining for gram string if one of these rulles matched, * -1 otherwise. 
*/ private int simpleRule(String pattern, String requiredEnding, int paradigmId, String[] positiveFlags, String[] alwaysFlags, String gramText, String lemma) { int newBegin = -1; if (gramText.matches("\\Q" + pattern + "\\E([;,.].*)?")) { newBegin = pattern.length(); if (lemma.endsWith(requiredEnding)) { paradigm.add(paradigmId); if (positiveFlags != null) flags.addAll(Arrays.asList(positiveFlags)); } else { System.err.printf("Problem matching \"%s\" with paradigm %s\n", lemma, paradigmId); newBegin = 0; } if (alwaysFlags != null) flags.addAll(Arrays.asList(alwaysFlags)); } return newBegin; } /** * The same as simple rule, but hyperns ar optional. It tries to match * grammar text to given pattern and lemma ending. If matched, adds a single * paradigm. * @param pattern Unescaped ending string grammar text must begin with * to apply this rule. * @param requiredEnding Required ending for the lemma to apply this * rule. * @param paradigmId Paradigm ID to set if rule matched. * @param positiveFlags These flags are added if rule and lemma ending * matched. * @param alwaysFlags These flags are added if rule matched. * @param gramText Grammar string currently being processed. * @param lemma Lemma string for this header. * @return New begining for gram string if one of these rulles matched, * -1 otherwise. 
*/
    private int simpleRuleOptHyperns(String pattern, String requiredEnding, int paradigmId,
            String[] positiveFlags, String[] alwaysFlags, String gramText, String lemma)
    {
        int newBegin = -1;
        // Build a regex from the literal rule text: the text is quoted with
        // \Q...\E, and every '-' is taken out of the quote and made optional.
        pattern = pattern.replace("-", "\\E-?\\Q");
        // Group 1 is the rule text as it actually occurs in gramText; it must
        // be followed by ';', ',' or '.' (or by nothing).
        pattern = "(\\Q" + pattern + "\\E)([;,.].*)?";
        Matcher m = Pattern.compile(pattern).matcher(gramText);
        if (m.matches())
        {
            // Length of the matched text, which is shorter than the rule
            // text when hyphens were omitted.
            newBegin = m.group(1).length();
            if (lemma.endsWith(requiredEnding))
            {
                paradigm.add(paradigmId);
                if (positiveFlags != null) flags.addAll(Arrays.asList(positiveFlags));
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm %s\n", lemma, paradigmId);
                // 0 = grammar text matched, but the lemma ending did not.
                newBegin = 0;
            }
            // Added whenever the grammar text matched, regardless of lemma.
            if (alwaysFlags != null) flags.addAll(Arrays.asList(alwaysFlags));
        }
        return newBegin;
    }

    /**
     * Paradigm 9: noun, 5th declension, -e, feminine.
     * Rules in form "-es, dsk. en. -?u, s.".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these
     * rules are long and highly specific and, thus, do not conflict
     * with other rules.
     * Every matching branch adds the "Sievieu dzimte" flag, even when the
     * lemma ending is rejected.
     * NOTE(review): in this copy several branches test the identical prefix
     * literal "-es, dsk. en. -u, s."; as written only the first of them can
     * ever match. The differing example words suggest the literals originally
     * differed (presumably non-ASCII letters lost in transcoding) - restore
     * them from a correctly encoded source instead of merging the branches.
     * @return new beginning for gram string if one of these rules matched
     *         (0 if the grammar text matched but the lemma did not),
     *         -1 otherwise.
     */
    private int esEndingPluralGenUEndingFemRules(String gramText, String lemma)
    {
        int newBegin = -1;

        // Paradigm 9: noun, 5th declension, -e, feminine.
        if (gramText.startsWith("-es, dsk. en. -?u, s.")) //?bece
        {
            newBegin = "-es, dsk. en. -?u, s.".length();
            if (lemma.matches(".*[c?]e"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -u, s.")) //?bele
        {
            newBegin = "-es, dsk. en. -u, s.".length();
            if (lemma.endsWith("le"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -u, s.")) //abate
        {
            newBegin = "-es, dsk. en. -u, s.".length();
            if (lemma.matches(".*[ts]e"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -u, s.")) //?bolaine
        {
            newBegin = "-es, dsk. en. -u, s.".length();
            if (lemma.endsWith("ne"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -u, s.")) //?bolmaize
        {
            newBegin = "-es, dsk. en. -u, s.".length();
            if (lemma.matches(".*[zd]e"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -ru, s.")) //administratore
        {
            newBegin = "-es, dsk. en. -ru, s.".length();
            if (lemma.endsWith("re"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -stu, s.")) //abolicioniste
        {
            newBegin = "-es, dsk. en. -stu, s.".length();
            if (lemma.endsWith("ste"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -u, s.")) //aeroloe
        {
            newBegin = "-es, dsk. en. -u, s.".length();
            if (lemma.endsWith("e"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -vju, s.")) //agave
        {
            newBegin = "-es, dsk. en. -vju, s.".length();
            if (lemma.endsWith("ve"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -u, s.")) //agnostie
        {
            newBegin = "-es, dsk. en. -u, s.".length();
            if (lemma.endsWith("e"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -mju, s.")) //agronome
        {
            newBegin = "-es, dsk. en. -mju, s.".length();
            if (lemma.endsWith("me"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-es, dsk. en. -pju, s.")) //aitkope, tsklapes
        {
            newBegin = "-es, dsk. en. -pju, s.".length();
            if (lemma.endsWith("pe"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            // Lemma given in "-pes" form: additionally flagged with
            // "irkav?rds daudzskaitl".
            else if (lemma.endsWith("pes"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        return newBegin;
    }

    /**
     * Paradigm 7: noun, 4th declension, -a, feminine.
     * Paradigm 9: noun, 5th declension, -e, feminine.
     * Paradigm 11: noun, 6th declension, -s.
     * Rules in form "-u, s." and "-u, s.".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these
     * rules are long and highly specific and, thus, do not conflict
     * with other rules.
     * @return new beginning for gram string if one of these rules matched,
     *         -1 otherwise.
*/
    private int uEndingFemRules(String gramText, String lemma)
    {
        // Result convention: -1 = no rule applied, 0 = grammar text matched
        // but the lemma ending was rejected, otherwise length of the prefix.
        // NOTE(review): every branch below tests the identical prefix "-u, s."
        // in this copy, so only the first branch can ever run; the later
        // branches (including the empty character class ".*[]es") look like
        // literals that lost non-ASCII characters - restore from a correctly
        // encoded source.
        int newBegin = -1;

        // Paradigms: 7, 9, 11
        if (gramText.startsWith("-u, s.")) //ahajiete, aizkulises, bikses, klauas
        {
            newBegin = "-u, s.".length();
            if (lemma.endsWith("te"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else if (lemma.endsWith("as"))
            {
                paradigm.add(7);
                flags.add("irkav?rds daudzskaitl");
                flags.add("Lietv?rds");
            }
            else if (lemma.endsWith("tis"))
            {
                paradigm.add(11);
                flags.add("irkav?rds daudzskaitl");
                flags.add("Lietv?rds");
            }
            else if (lemma.matches(".*[st]es"))
            {
                paradigm.add(9);
                flags.add("irkav?rds daudzskaitl");
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 7, 9, 11\n", lemma);
                newBegin = 0;
            }
            // Added whenever the prefix matched, even if the lemma was rejected.
            flags.add("Sievieu dzimte");
        }
        // Paradigms: 7, 9
        else if (gramText.startsWith("-u, s.")) //mir?des, graizes, baas
        {
            newBegin = "-u, s.".length();
            if (lemma.endsWith("as"))
            {
                paradigm.add(7);
                flags.add("irkav?rds daudzskaitl");
                flags.add("Lietv?rds");
            }
            else if (lemma.matches(".*[dz]es"))
            {
                paradigm.add(9);
                flags.add("irkav?rds daudzskaitl");
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 7, 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-u, s.")) //acenes, iemaas
        {
            newBegin = "-u, s.".length();
            if (lemma.endsWith("as"))
            {
                paradigm.add(7);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
            }
            else if (lemma.endsWith("ne"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
            }
            else if (lemma.endsWith("nes"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 7, 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        else if (gramText.startsWith("-u, s.")) // aijas, zees
        {
            newBegin = "-u, s.".length();
            if (lemma.endsWith("as"))
            {
                paradigm.add(7);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
            }
            else if (lemma.endsWith("a"))
            {
                paradigm.add(7);
                flags.add("Lietv?rds");
            }
            else if (lemma.matches(".*[]es"))
            {
                paradigm.add(9);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 7, 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        // Paradigms: 9
        else if (gramText.startsWith("-u, s.")) //bailes
        {
            newBegin = "-u, s.".length();
            if (lemma.endsWith("les"))
            {
                paradigm.add(9);
                flags.add("irkav?rds daudzskaitl");
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma);
                newBegin = 0;
            }
            flags.add("Sievieu dzimte");
        }
        return newBegin;
    }

    /**
     * Paradigm 1: noun, 1st declension, -s
     * Paradigm 2: noun, 1st declension, - (ending letter missing in this copy)
     * Paradigm 3: noun, 2nd declension, -is
     * Paradigm 4: noun, 2nd declension, -s (nom. == en.)
     * Paradigm 5: noun, 2nd declension, -suns
     * Rules in form "-a, v." and "-a, v.".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these
     * rules are long and highly specific and, thus, do not conflict
     * with other rules.
     * @return new beginning for gram string if one of these rules matched,
     *         -1 otherwise.
*/
    private int aEndingMascRules(String gramText, String lemma)
    {
        // Result convention: -1 = no rule applied, 0 = grammar text matched
        // but the lemma ending was rejected, otherwise length of the prefix.
        // NOTE(review): most branches test the identical prefix "-a, v." in
        // this copy, so the later ones cannot be reached; together with
        // endsWith("") (always true) and the empty character classes
        // (".*[]is", ".*[]i") this looks like literals that lost non-ASCII
        // characters - restore from a correctly encoded source.
        int newBegin = -1;

        // Paradigms: 3, 5
        if (gramText.startsWith("-a, v.")) // acumirklis, durkls
        {
            newBegin = "-a, v.".length();
            if (lemma.endsWith("ls"))
            {
                paradigm.add(5);
                flags.add("Lietv?rds");
            }
            else if (lemma.endsWith("lis"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 3, 5\n", lemma);
                newBegin = 0;
            }
            // Added whenever the prefix matched, even if the lemma was rejected.
            flags.add("Vrieu dzimte");
        }
        else if (gramText.startsWith("-a, v.")) // abrkasis, lemess
        {
            newBegin = "-a, v.".length();
            if (lemma.endsWith("ss"))
            {
                paradigm.add(5);
                flags.add("Lietv?rds");
            }
            else if (lemma.matches(".*[st]is"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 3, 5\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        // Paradigm 3
        else if (gramText.startsWith("-a, v.")) // agnostiis
        {
            newBegin = "-a, v.".length();
            if (lemma.matches(".*[]is"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 3\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        else if (gramText.startsWith("-pja, v.")) // aitkopis
        {
            newBegin = "-pja, v.".length();
            if (lemma.endsWith("pis"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 3\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        else if (gramText.startsWith("-a, v.")) // aizb?znis
        {
            newBegin = "-a, v.".length();
            if (lemma.endsWith("znis"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 3\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        else if (gramText.startsWith("-a, vsk.")) // ?dgrauzis
        {
            newBegin = "-a, vsk.".length();
            if (lemma.matches(".*[zd]is"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 3\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        //Paradigms: 1, 3
        // Accepts both "-ra, v." and "-ra; v."; the whole text must match,
        // and the '.' after "v" is an unescaped regex wildcard.
        else if (gramText.matches("-ra[,;] v.(.*)?")) // airis, mrniekmeistars
        {
            newBegin = "-ra, v.".length();
            if (lemma.endsWith("ris"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else if (lemma.endsWith("rs"))
            {
                paradigm.add(1);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 3\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        // Paradigms: 2, 3, 5
        else if (gramText.startsWith("-a, v.")) // abesnis
        {
            newBegin = "-a, v.".length();
            if (lemma.endsWith("suns"))
            {
                paradigm.add(5);
                flags.add("Lietv?rds");
            }
            else if (lemma.endsWith(""))
            {
                paradigm.add(2);
                flags.add("Lietv?rds");
            }
            else if (lemma.endsWith("nis"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigms 2, 3, 5\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        // Paradigms: 1, 2, 3 (if no sound changes), 1-5 (if plural)
        else if (gramText.startsWith("-a, v.")) // abats, akustiis, spargui, skostii
        {
            newBegin = "-a, v.".length();
            if (lemma.matches(".*[jr]is"))
            {
                paradigm.add(3);
                flags.add("Lietv?rds");
            }
            else if (lemma.endsWith(""))
            {
                paradigm.add(2);
                flags.add("Lietv?rds");
            }
            else if (lemma.matches(".*[^aeiou??]s"))
            {
                paradigm.add(1);
                flags.add("Lietv?rds");
            }
            else if (lemma.matches(".*[]i"))
            {
                // Several paradigms are possible; all candidates are recorded
                // and the result is flagged "Neviennozmga paradigma".
                paradigm.add(1);
                paradigm.add(2);
                paradigm.add(3);
                paradigm.add(4);
                paradigm.add(5);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
                flags.add("Neviennozmga paradigma");
            }
            else if (lemma.matches(".*[]i"))
            {
                paradigm.add(1);
                paradigm.add(2);
                paradigm.add(3);
                paradigm.add(5);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
                flags.add("Neviennozmga paradigma");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigms 1, 2, 3\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        return newBegin;
    }

    /**
     * Paradigm 1: Lietv?rds 1. deklin?cija -s
     * Paradigm 2: Lietv?rds 1. deklin?cija -
     * Paradigm 3: Lietv?rds 2. deklin?cija -is
     * Paradigm 4: Lietv?rds 2.
deklin?cija -s (piem., mness) (vsk. nom. = vsk. gen)
     * Paradigm 5: Lietv?rds 2. deklin?cija -suns
     * Paradigm 32: Lietv?rds 6. deklin?cija - audis
     * Rules in form "-u, v." and "-u, v.".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these
     * rules are long and highly specific and, thus, do not conflict
     * with other rules.
     * @return new beginning for gram string if one of these rules matched
     *         (0 if the grammar text matched but the lemma did not),
     *         -1 otherwise.
     */
    private int uEndingMascRules(String gramText, String lemma)
    {
        // NOTE(review): all three branches test the identical prefix "-u, v."
        // in this copy, so only the first can ever run; together with the
        // empty character class ".*[]i" this looks like literals that lost
        // non-ASCII characters - restore from a correctly encoded source.
        int newBegin = -1;

        // Paradigm 32
        // NOTE(review): the comment and the error message say paradigm 32,
        // but the code records paradigm 11 - confirm which one is intended.
        if (gramText.startsWith("-u, v.")) //audis
        {
            newBegin = "-u, v.".length();
            if (lemma.endsWith("audis"))
            {
                paradigm.add(11);
                flags.add("irkav?rds daudzskaitl");
                flags.add("Lietv?rds");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigm 32\n", lemma);
                newBegin = 0;
            }
            // Added whenever the prefix matched, even if the lemma was rejected.
            flags.add("Vrieu dzimte");
            // TODO Daudzskaitlinieks?
        }
        // Paradigms: 1-5 (plural forms)
        else if (gramText.startsWith("-u, v.")) // bretoi
        {
            newBegin = "-u, v.".length();
            if (lemma.endsWith("i"))
            {
                paradigm.add(1);
                paradigm.add(2);
                paradigm.add(3);
                paradigm.add(4);
                paradigm.add(5);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
                flags.add("Neviennozmga paradigma");
            }
            else
            {
                System.err.printf("Problem matching \"%s\" with paradigms 1-5\n", lemma);
                newBegin = 0;
            }
            flags.add("Vrieu dzimte");
        }
        else if (gramText.startsWith("-u, v.")) // abesi, abh?zi, ?dgraui, adigejiei, ad?ri, alimenti, angi, antinukloni, apakbrun?i
        {
            newBegin = "-u, v.".length();
            // Endings with a single known paradigm are handled first.
            if (lemma.endsWith("nieki") || lemma.endsWith("umi") || lemma.endsWith("ot?ji"))
            {
                paradigm.add(1);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
            }
            else if (lemma.endsWith("iei"))
            {
                paradigm.add(3);
                paradigm.add(5);
                flags.add("Lietv?rds");
                flags.add("irkav?rds daudzskaitl");
                flags.add("Neviennozmga paradigma");
            }
            else
            {
                // Remaining plural lemmas: the final consonant decides which
                // paradigms stay possible; all candidates are recorded.
                if (lemma.matches(".*[]i")) // akmei, mnei etc.
                {
                    paradigm.add(1);
                    paradigm.add(2);
                    paradigm.add(3);
                    paradigm.add(4);
                    paradigm.add(5);
                    flags.add("Lietv?rds");
                    flags.add("irkav?rds daudzskaitl");
                    flags.add("Neviennozmga paradigma");
                }
                else if (lemma.matches(".*[vpm]ji")) // looks like these are predefined sound changes always
                {
                    paradigm.add(3);
                    paradigm.add(5);
                    flags.add("Lietv?rds");
                    flags.add("irkav?rds daudzskaitl");
                    flags.add("Neviennozmga paradigma");
                }
                else if (lemma.matches(".*[bgkhrst?]i") || lemma.matches(".*[aeiou??]ji")) // can't determine if there is sound change (t - tti, s - viesi, j - airk?ji)
                {
                    paradigm.add(1);
                    paradigm.add(2);
                    paradigm.add(3);
                    paradigm.add(5);
                    flags.add("Lietv?rds");
                    flags.add("irkav?rds daudzskaitl");
                    flags.add("Neviennozmga paradigma");
                }
                else if (lemma.matches(".*[cdlmnpvz]i")) // there is no sound change
                {
                    paradigm.add(1);
                    paradigm.add(2);
                    flags.add("Lietv?rds");
                    flags.add("irkav?rds daudzskaitl");
                    flags.add("Neviennozmga paradigma");
                }
                else
                {
                    System.err.printf("Problem matching \"%s\" with paradigms 1-5\n", lemma);
                    newBegin = 0;
                }
            }
            flags.add("Vrieu dzimte");
        }
        return newBegin;
    }

    /**
     * Paradigm 3: noun, 2nd declension, -is
     * Paradigm 9: noun, 5th declension, -e, feminine.
     * Rules containing single ending with no other information, e.g. "-u".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these
     * rules are long and highly specific and, thus, do not conflict
     * with other rules.
     * @return new beginning for gram string if one of these rules matched,
     *         -1 otherwise.
*/ private int singleEndingOnlyRules(String gramText, String lemma) { int newBegin = -1; // Paradigm 9 if (gramText.matches("-u([;.].*)?")) //abioenze, ablumozes, akol?de, nematodes { newBegin = "-u".length(); if (lemma.matches(".*[dz]es")) { paradigm.add(9); flags.add("Lietv?rds"); flags.add("Sievieu dzimte"); flags.add("irkav?rds daudzskaitl"); } else if (lemma.matches(".*[dz]e")) { paradigm.add(9); flags.add("Lietv?rds"); flags.add("Sievieu dzimte"); } else { System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma); newBegin = 0; } } else if (gramText.matches("-u([;.].*)?")) //agrene, aizlaidnes { newBegin = "-u".length(); if (lemma.endsWith("nes")) { paradigm.add(9); flags.add("Lietv?rds"); flags.add("Sievieu dzimte"); flags.add("irkav?rds daudzskaitl"); } else if (lemma.endsWith("ne")) { paradigm.add(9); flags.add("Lietv?rds"); flags.add("Sievieu dzimte"); } else { System.err.printf("Problem matching \"%s\" with paradigm 9\n", lemma); newBegin = 0; } } // Paradigm 3 else if (gramText.matches("-a([;,.].*)?")) //?bolainis { newBegin = "-a".length(); if (lemma.endsWith("nis")) { paradigm.add(3); flags.add("Lietv?rds"); flags.add("Vrieu dzimte"); } else { System.err.printf("Problem matching \"%s\" with paradigm 3\n", lemma); newBegin = 0; } } return newBegin; } /** * // Paradigm 11: Lietv?rds 6. deklin?cija -s * Rules in form "-valsts, dsk. en. -valstu, s.", i.e containing full 6th * ceclension nouns. * This function is seperated out for readability from * {@link #processBeginingWithPatterns(String, String)} as currently these rules * for verbs are long and highly specific and, thus, do not conflict * with other rules. * @return new begining for gram string if one of these rulles matched, * -1 otherwise. */ private int sixthDeclNounFullWordRules(String gramText, String lemma) { int newBegin = -1; if (newBegin == -1) newBegin = simpleRuleOptHyperns("-acs, dsk. en. 
-acu, s.", "acs", 11, new String[] { "Lietv?rds" }, new String[] { "Sievieu dzimte" }, gramText, lemma); //uzacs, acs if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kr?sns, dsk. en. -kr?u, s.", "kr?sns", 11, new String[] { "Lietv?rds" }, new String[] { "Sievieu dzimte" }, gramText, lemma); //aizkr?sns if (newBegin == -1) newBegin = simpleRuleOptHyperns("-valsts, dsk. en. -valstu, s.", "valsts", 11, new String[] { "Lietv?rds" }, new String[] { "Sievieu dzimte" }, gramText, lemma); //agr?rvalsts return newBegin; } /** * Paradigm 15: Darbbas v?rdi 1. konjug?cija tieie * Rules in form "parasti 3. pers., -alc, pag. -alca". * This function is seperated out for readability from * {@link #processBeginingWithPatterns(String, String)} as currently these rules * for verbs are long and highly specific and, thus, do not conflict * with other rules. * @return new begining for gram string if one of these rulles matched, * -1 otherwise. */ private int firstConjDirVerb3PersRules(String gramText, String lemma) { int newBegin = -1; // Rules ordered alphabetically by verb infinitive. // A if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -aug, pag. -auga", "augt", 15, new String[] { "Darbbas v?rds", "Loct k? \"augt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizaugt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -aust, pag. -ausa", "aust", 15, new String[] { "Darbbas v?rds", "Loct k? \"aust\" (k? gaisma)" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizaust 1 // B if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -birst, pag. -bira", "birt", 15, new String[] { "Darbbas v?rds", "Loct k? \"birt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizbirt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -brk, pag. -bruka", "brukt", 15, new String[] { "Darbbas v?rds", "Loct k? \"brukt\"" }, new String[] { "Parasti 3. person?" 
}, gramText, lemma); //aizbrukt // C // D if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -deg, pag. -dega", "degt", 15, new String[] { "Darbbas v?rds", "Loct k? \"degt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdegt 2 if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dim, pag. -dima", "dimt", 15, new String[] { "Darbbas v?rds", "Loct k? \"dimt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdimt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dip, pag. -dipa", "dipt", 15, new String[] { "Darbbas v?rds", "Loct k? \"dipt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdipt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dc, pag. -dca", "dkt", 15, new String[] { "Darbbas v?rds", "Loct k? \"dkt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdkt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dze, pag. -dzla", "dzelt", 15, new String[] { "Darbbas v?rds", "Loct k? \"dzelt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdzelt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dzst, pag. -dzija", "dzt", 15, new String[] { "Darbbas v?rds", "Loct k? \"dzt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdzt 2 // E, F // G if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -grimst, pag. -grima", "grimt", 15, new String[] { "Darbbas v?rds", "Loct k? \"grimt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizgrimt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -grst, pag. -gruva", "grt", 15, new String[] { "Darbbas v?rds", "Loct k? \"grt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizgrt if (newBegin == -1) newBegin = simpleRuleOptHyperns("3. pers. -guldz, pag. -guldza", "gulgt", 15, new String[] { "Darbbas v?rds", "Loct k? 
\"gulgt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizgulgt // H // I if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -irst, pag. -ira", "irt", 15, new String[] { "Darbbas v?rds", "Loct k? \"irt\" (k? audums)" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //irt 2 // J // K if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -kalst, pag. -kalta", "kalst", 15, new String[] { "Darbbas v?rds", "Loct k? \"kalst\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizkalst if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -kauc, pag. -kauca", "kaukt", 15, new String[] { "Darbbas v?rds", "Loct k? \"kaukt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizkaukt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -kn?bj, pag. -kn?ba", "kn?bt", 15, new String[] { "Darbbas v?rds", "Loct k? \"kn?bt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizkn?bt if (newBegin == -1) newBegin = simpleRule("parasti 3. pers., -kvpst, pag. -kvpa", "kvpt", 15, new String[] { "Darbbas v?rds", "Loct k? \"kvpt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizkvpt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -kviec, pag. -kvieca", "kviekt", 15, new String[] { "Darbbas v?rds", "Loct k? \"kviekt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizkviekt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -ep, pag. -epa", "ept", 15, new String[] { "Darbbas v?rds", "Loct k? \"ept\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizept // L if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -lkst, pag. -lka", "lkt", 15, new String[] { "Darbbas v?rds", "Loct k? \"lkt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizlkt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -lp, pag. 
-lipa", "lipt", 15, new String[] { "Darbbas v?rds", "Loct k? \"lipt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizlipt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -lst, pag. -lija", "lt", 15, new String[] { "Darbbas v?rds", "Loct k? \"lt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizlt // M, N, O, P, R // S, if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -alc, pag. -alca", "alkt", 15, new String[] { "Darbbas v?rds", "Loct k? \"alkt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizalkt // T if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -tkst, pag. -tka", "tkt", 15, new String[] { "Darbbas v?rds", "Loct k? \"tkt\"" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aiztkt // U, V, Z return newBegin; } /** * Paradigm 15: Darbbas v?rdi 1. konjug?cija tieie * Rules in form "-tupstu, -tupsti, -tupst, pag. -tupu". * This function is seperated out for readability from * {@link #processBeginingWithPatterns(String, String)} as currently these rules * for verbs are long and highly specific and, thus, do not conflict * with other rules. * @return new begining for gram string if one of these rulles matched, * -1 otherwise. */ private int firstConjDirVerbAllPersRules(String gramText, String lemma) { int newBegin = -1; // Rules ordered alphabetically by verb infinitive. // A if (newBegin == -1) newBegin = simpleRuleOptHyperns("-aru, -ar, -ar, pag. -aru", "art", 15, new String[] { "Darbbas v?rds", "Loct k? \"art\"" }, null, gramText, lemma); //aizart if (newBegin == -1) newBegin = simpleRuleOptHyperns("-auu, -aud, -au, pag. -audu", "aust", 15, new String[] { "Darbbas v?rds", "Loct k? \"aust\" (k? zirneklis)" }, null, gramText, lemma); //aizaust 2 // B if (newBegin == -1) newBegin = simpleRuleOptHyperns("-b?u, -b?z, -b?, pag. -b?zu", "b?zt", 15, new String[] { "Darbbas v?rds", "Loct k? 
\"b?zt\"" }, null, gramText, lemma); //aizb?zt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-bgu, -bdz, -bg, pag. -bgu", "bgt", 15, new String[] { "Darbbas v?rds", "Loct k? \"bgt\"" }, null, gramText, lemma); //aizbgt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-beru, -ber, -ber, pag. -bru", "brt", 15, new String[] { "Darbbas v?rds", "Loct k? \"brt\"" }, null, gramText, lemma); //aizbrt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-bilstu, -bilsti, -bilst, pag. -bildu", "bilst", 15, new String[] { "Darbbas v?rds", "Loct k? \"bilst\"" }, null, gramText, lemma); //aizbilst if (newBegin == -1) newBegin = simpleRuleOptHyperns("-braucu, -brauc, -brauc, pag. -braucu", "braukt", 15, new String[] { "Darbbas v?rds", "Loct k? \"braukt\"" }, null, gramText, lemma); //aizbraukt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-br?u, -br?z, -br?, pag. -br?zu", "br?zt", 15, new String[] { "Darbbas v?rds", "Loct k? \"br?zt\"" }, null, gramText, lemma); //aizbr?zt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-brienu, -brien, -brien, pag. -bridu", "brist", 15, new String[] { "Darbbas v?rds", "Loct k? \"brist\"" }, null, gramText, lemma); //aizbrist // C if (newBegin == -1) newBegin = simpleRuleOptHyperns("-ceu, -cel, -ce, pag. -clu", "celt", 15, new String[] { "Darbbas v?rds", "Loct k? \"celt\"" }, null, gramText, lemma); //aizcelt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-crtu, -crt, -crt, pag. -cirtu", "cirst", 15, new String[] { "Darbbas v?rds", "Loct k? \"cirst\"" }, null, gramText, lemma); //aizcirst // D if (newBegin == -1) newBegin = simpleRuleOptHyperns("-dedzu, -dedz, -dedz, pag. -dedzu", "degt", 15, new String[] { "Darbbas v?rds", "Loct k? \"degt\"" }, null, gramText, lemma); //aizdegt 1 if (newBegin == -1) newBegin = simpleRuleOptHyperns("-diebju, -dieb, -diebj, pag. -diebu", "diebt", 15, new String[] { "Darbbas v?rds", "Loct k? 
\"diebt\"" }, null, gramText, lemma); //aizdiebt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-diedzu, -diedz, -diedz, pag. -diedzu", "diegt", 15, new String[] { "Darbbas v?rds", "Loct k? \"diegt\"" }, null, gramText, lemma); //aizdiegt 1 if (newBegin == -1) newBegin = simpleRuleOptHyperns("-dodu, -dod, -dod, pag. -devu", "dot", 15, new String[] { "Darbbas v?rds", "Loct k? \"dot\"" }, null, gramText, lemma); //aizdot if (newBegin == -1) newBegin = simpleRuleOptHyperns("-dr?u, -dr?z, -dr?, pag. -dr?zu", "dr?zt", 15, new String[] { "Darbbas v?rds", "Loct k? \"dr?zt\"" }, null, gramText, lemma); //aizdr?zt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-duru, -dur, -dur, pag. -dru", "durt", 15, new String[] { "Darbbas v?rds", "Loct k? \"durt\"" }, null, gramText, lemma); //aizdurt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-dzeru, -dzer, -dzer, pag. -dzru", "dzert", 15, new String[] { "Darbbas v?rds", "Loct k? \"dzert\"" }, null, gramText, lemma); //aizdzert if (newBegin == -1) newBegin = simpleRuleOptHyperns("-dzenu, -dzen, -dzen, pag. -dzinu", "dzt", 15, new String[] { "Darbbas v?rds", "Loct k? \"dzt\"" }, null, gramText, lemma); //aizdzt 1 // E if (newBegin == -1) newBegin = simpleRuleOptHyperns("-du, -d, -d, pag. -du", "st", 15, new String[] { "Darbbas v?rds", "Loct k? \"st\"" }, null, gramText, lemma); //aizst // F // G if (newBegin == -1) newBegin = simpleRuleOptHyperns("-g?u, -g?z, -g?, pag. -g?zu", "g?zt", 15, new String[] { "Darbbas v?rds", "Loct k? \"g?zt\"" }, null, gramText, lemma); //aizg?zt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-glauu, -glaud, -glau, pag. -glaudu", "glaust", 15, new String[] { "Darbbas v?rds", "Loct k? \"glaust\"" }, null, gramText, lemma); //aizglaust if (newBegin == -1) newBegin = simpleRuleOptHyperns("-gr?bju, -gr?b, -gr?bj, pag. -gr?bu", "gr?bt", 15, new String[] { "Darbbas v?rds", "Loct k? 
\"gr?bt\"" }, null, gramText, lemma); //aizgr?bt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-grauu, -grauz, -grau, pag. -grauzu", "grauzt", 15, new String[] { "Darbbas v?rds", "Loct k? \"grauzt\"" }, null, gramText, lemma); //aizgrauzt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-grieu, -griez, -grie, pag. -griezu", "griezt", 15, new String[] { "Darbbas v?rds", "Loct k? \"griezt\"" }, null, gramText, lemma); //aizgriezt 2 if (newBegin == -1) newBegin = simpleRuleOptHyperns("-gru, -grd, -gr, pag. -grdu", "grst", 15, new String[] { "Darbbas v?rds", "Loct k? \"grst\"" }, null, gramText, lemma); //aizgrst if (newBegin == -1) newBegin = simpleRuleOptHyperns("-gulstu, -gulsti, -gulst, pag. -glu, ar -gulu", "gult", 15, new String[] { "Darbbas v?rds", "Loct k? \"gult\"", "Parall?s formas" }, null, gramText, lemma); //aizgult if (newBegin == -1) newBegin = simpleRuleOptHyperns("-gstu, -gsti, -gst, pag. -guvu", "gt", 15, new String[] { "Darbbas v?rds", "Loct k? \"gt\"" }, null, gramText, lemma); //aizgt // if (newBegin == -1) newBegin = simpleRuleOptHyperns("-iedu, -ied, -ied, pag. -gidu", "ist", 15, new String[] { "Darbbas v?rds", "Loct k? \"ist\"" }, null, gramText, lemma); //apist // H // I if (newBegin == -1) newBegin = simpleRuleOptHyperns("-eju, -ej, -iet, pag. -g?ju", "iet", 15, new String[] { "Darbbas v?rds", "Loct k? \"iet\"" }, null, gramText, lemma); //apiet if (newBegin == -1) newBegin = simpleRuleOptHyperns("-iru, -ir, -ir, pag. -ru", "irt", 15, new String[] { "Darbbas v?rds", "Loct k? \"irt\" (k? ar airiem)" }, null, gramText, lemma); //aizirt 1 // J if (newBegin == -1) newBegin = simpleRuleOptHyperns("-j?ju, -j?j, -j?j, pag. -j?ju", "j?t", 15, new String[] { "Darbbas v?rds", "Loct k? \"j?t\"" }, null, gramText, lemma); //aizj?t if (newBegin == -1) newBegin = simpleRuleOptHyperns("-jou, -joz, -jo, pag. -jozu", "jozt", 15, new String[] { "Darbbas v?rds", "Loct k? 
\"jozt\"" }, null, gramText, lemma); //aizjozt 1, 2 if (newBegin == -1) newBegin = simpleRuleOptHyperns("-jdzu, -jdz, -jdz, pag. -jdzu", "jgt", 15, new String[] { "Darbbas v?rds", "Loct k? \"jgt\"" }, null, gramText, lemma); //aizjgt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-jumju, -jum, -jumj, pag. -jmu, ar -jumu", "jumt", 15, new String[] { "Darbbas v?rds", "Loct k? \"jumt\"", "Parall?s formas" }, null, gramText, lemma); //aizjumt // K if (newBegin == -1) newBegin = simpleRuleOptHyperns("-k?pju, -k?p, -k?pj, pag. -k?pu", "k?pt", 15, new String[] { "Darbbas v?rds", "Loct k? \"k?pt\"" }, null, gramText, lemma); //aizk?pt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-karu, -kar, -kar, pag. -k?ru", "k?rt", 15, new String[] { "Darbbas v?rds", "Loct k? \"k?rt\"" }, null, gramText, lemma); //aizk?rt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kauju, -kauj, -kauj, pag. -k?vu", "kaut", 15, new String[] { "Darbbas v?rds", "Loct k? \"kaut\"" }, null, gramText, lemma); //apkaut if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kl?ju, -kl?j, -kl?j, pag. -kl?ju", "kl?t", 15, new String[] { "Darbbas v?rds", "Loct k? \"kl?t\"" }, null, gramText, lemma); //apkl?t if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kliedzu, -kliedz, -kliedz, pag. -kliedzu", "kliegt", 15, new String[] { "Darbbas v?rds", "Loct k? \"kliegt\"" }, null, gramText, lemma); //aizkliegt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-klimstu, -klimsti, -klimst, pag. -klimtu", "klimst", 15, new String[] { "Darbbas v?rds", "Loct k? \"klimst\"" }, null, gramText, lemma); //aizklimst if (newBegin == -1) newBegin = simpleRuleOptHyperns("-klstu, -klsti, -klst, pag. -kldu", "klst", 15, new String[] { "Darbbas v?rds", "Loct k? \"klst\"" }, null, gramText, lemma); //aizklst if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kstu, -ksti, -kst, pag. -kuvu", "kt", 15, new String[] { "Darbbas v?rds", "Loct k? 
\"kt\"" }, null, gramText, lemma); //aizkt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kou, -kod, -ko, pag. -kodu", "kost", 15, new String[] { "Darbbas v?rds", "Loct k? \"kost\"" }, null, gramText, lemma); //aizkost if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kr?pju, -kr?p, -kr?pj, pag. -kr?pu", "kr?pt", 15, new String[] { "Darbbas v?rds", "Loct k? \"kr?pt\"" }, null, gramText, lemma); //aizkr?pt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-krauju, -krauj, -krauj, pag. -kr?vu", "kraut", 15, new String[] { "Darbbas v?rds", "Loct k? \"kraut\"" }, null, gramText, lemma); //aizkraut if (newBegin == -1) newBegin = simpleRuleOptHyperns("-krtu, -krti, -krt, pag. -kritu", "krist", 15, new String[] { "Darbbas v?rds", "Loct k? \"krist\"" }, null, gramText, lemma); //aizkrist if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kuru, -kur, -kur, pag. -kru", "kurt", 15, new String[] { "Darbbas v?rds", "Loct k? \"kurt\"" }, null, gramText, lemma); //aizkurt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kstu, -kusti, -kst, pag. -kusu", "kust", 15, new String[] { "Darbbas v?rds", "Loct k? \"kust\"" }, null, gramText, lemma); //aizkust if (newBegin == -1) newBegin = simpleRuleOptHyperns("-eru, -er, -er, pag. -ru", "ert", 15, new String[] { "Darbbas v?rds", "Loct k? \"ert\"" }, null, gramText, lemma); //aizert // L if (newBegin == -1) newBegin = simpleRuleOptHyperns("-laiu, -laid, -lai, pag. -laidu", "laist", 15, new String[] { "Darbbas v?rds", "Loct k? \"laist\"" }, null, gramText, lemma); //aizlaist if (newBegin == -1) newBegin = simpleRuleOptHyperns("-lauu, -lauz, -lau, pag. -lauzu", "lauzt", 15, new String[] { "Darbbas v?rds", "Loct k? \"lauzt\"" }, null, gramText, lemma); //aizlauzt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-lecu, -lec, -lec, pag. -lcu", "lkt", 15, new String[] { "Darbbas v?rds", "Loct k? 
\"lkt\"" }, null, gramText, lemma); //aizlkt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-liedzu, -liedz, -liedz, pag. -liedzu", "liegt", 15, new String[] { "Darbbas v?rds", "Loct k? \"liegt\"" }, null, gramText, lemma); //aizliegt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-leju, -lej, -lej, pag. -lju", "liet", 15, new String[] { "Darbbas v?rds", "Loct k? \"liet\"" }, null, gramText, lemma); //aizliet if (newBegin == -1) newBegin = simpleRuleOptHyperns("-lieku, -liec, -liek, pag. -liku", "likt", 15, new String[] { "Darbbas v?rds", "Loct k? \"likt\"" }, null, gramText, lemma); //aizlikt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-lienu, -lien, -lien, pag. -ldu", "lst", 15, new String[] { "Darbbas v?rds", "Loct k? \"lst\"" }, null, gramText, lemma); //aizlst if (newBegin == -1) newBegin = simpleRuleOptHyperns("-lobju, -lob, -lobj, pag. -lobu", "lobt", 15, new String[] { "Darbbas v?rds", "Loct k? \"lobt\"" }, null, gramText, lemma); //aizlobt // M, N, O, P, R, S // T if (newBegin == -1) newBegin = simpleRuleOptHyperns("-tupstu, -tupsti, -tupst, pag. -tupu", "tupt", 15, new String[] { "Darbbas v?rds", "Loct k? \"tupt\"", "Parall?s formas" }, null, gramText, lemma); //aiztupt // TODO tupu/tupstu if (newBegin == -1) newBegin = simpleRuleOptHyperns("-tveru, -tver, -tver, pag. -tvru", "tvert", 15, new String[] { "Darbbas v?rds", "Loct k? \"tvert\"" }, null, gramText, lemma); //aiztvert // U, V, Z return newBegin; } /** * Paradigm 16: Darbbas v?rdi 2. konjug?cija tieie * Rules in form "parasti 3. pers., -o, pag. -oja", * "-oju, -o, -o, -ojam, -ojat, pag. -oju; -oj?m, -oj?t; pav. -o, -ojiet" * and "-ju, -, -, pag. -ju". * This function is seperated out for readability from * {@link #processBeginingWithPatterns(String, String)} as currently these rules * for verbs are long and highly specific and, thus, do not conflict * with other rules. * @return new begining for gram string if one of these rulles matched, * -1 otherwise. 
*/ private int secondConjDirVerbRules(String gramText, String lemma) { int newBegin = -1; // Paradigm 16: Darbbas v?rdi 2. konjug?cija tieie if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -kko, pag. -kkoja", "kkot", 16, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizkkot if (newBegin == -1) newBegin = simpleRule("parasti 3. pers., -?, pag. -?ja", "?t", 16, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizk?b?t if (newBegin == -1) newBegin = simpleRule("parasti 3. pers., -, pag. -ja", "t", 16, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //adsorbt if (newBegin == -1) newBegin = simpleRule("parasti 3. pers., -o, pag. -oja", "ot", 16, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizalkot, aizbangot if (newBegin == -1) newBegin = simpleRuleOptHyperns("-dabju, -dab, -dab, pag. -dabju", "dabt", 16, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizdabt if (newBegin == -1) newBegin = simpleRule("-oju, -o, -o, -ojam, -ojat, pag. -oju; -oj?m, -oj?t; pav. -o, -ojiet", "ot", 16, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //acot if (newBegin == -1) newBegin = simpleRule( "-ju, -, -, -jam, -jat, pag. -ju, -j?m, -j?t; pav. -, -jiet", "t", 16, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //adverbializt if (newBegin == -1) newBegin = simpleRule("-?ju, -?, -?, pag. -?ju", "?t", 16, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aij?t if (newBegin == -1) newBegin = simpleRule("-ju, -, -, pag. -ja", "t", 16, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizdelvert if (newBegin == -1) newBegin = simpleRule("-ju, -, -, pag. -ju", "t", 16, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //absolutizt if (newBegin == -1) newBegin = simpleRule("-oju, -o, -o, pag. 
-oju", "ot", 16, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aiztuntuot return newBegin; } /** * Paradigm 17: Darbbas v?rdi 3. konjug?cija tieie * Rules in form "parasti 3. pers., -bl?k, pag. -bl?kja" * This function is seperated out for readability from * {@link #processBeginingWithPatterns(String, String)} as currently these rules * for verbs are long and highly specific and, thus, do not conflict * with other rules. * @return new begining for gram string if one of these rulles matched, * -1 otherwise. */ private int thirdConjDir3PersVerbRules(String gramText, String lemma) { int newBegin = -1; // Verb-specific rules. // Rules ordered alphabetically by verb infinitive. // A // B newBegin = simpleRuleOptHyperns("parasti 3. pers., -bl?k, pag. -bl?kja", "bl?kt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizbl?kt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -bl?k, pag. -bl?kja", "bl?kt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizbl?kt // C, if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -?ab, pag. -?abja", "?abt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aiz?abt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -?aukst, pag. -?aukstja", "?aukstt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aiz?aukstt // D if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -d?rd, pag. -d?rdja", "d?rdt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizd?rdt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dimd, pag. -dimdja", "dimdt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" 
}, gramText, lemma); //aizdimdt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dip, pag. -dipja", "dipt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdipt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dun, pag. -dunja", "dunt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdunt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -dinkst, pag. -dinkstja", "dinkstt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdinkstt // E, F // G if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -grab, pag. -grabja", "grabt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizgrabt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -gurkst, pag. -gurkstja", "gurkstt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizgurkstt // H, I, J // K if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -klab, pag, -klabja", "klabt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizklabt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -klakst, pag. -klakstja", "klakstt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizklakstt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -klaudz, pag. -klaudzja", "klaudzt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizklaudzt if (newBegin == -1) newBegin = simpleRuleOptHyperns("parasti 3. pers., -kp, pag. -kpja", "kpt", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" 
}, gramText, lemma); //aizkpt // L, M, N, O, P, R, S, T, U, V, Z // Generic ending rules. if (newBegin == -1) newBegin = simpleRule("parasti 3. pers., -, pag. -ja", "t", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizdzirkstt if (newBegin == -1) newBegin = simpleRule("parasti 3. pers., -ina, pag. -in?ja", "in?t", 17, new String[] { "Darbbas v?rds" }, new String[] { "Parasti 3. person?" }, gramText, lemma); //aizducin?t return newBegin; } /** * Paradigm 17: Darbbas v?rdi 3. konjug?cija tieie * Rules in form "-dziedu, -dziedi, -dzied, pag. -dzied?ju" and * "-u, -i, -a, pag. -ju". * This function is seperated out for readability from * {@link #processBeginingWithPatterns(String, String)} as currently these rules * for verbs are long and highly specific and, thus, do not conflict * with other rules. * @return new begining for gram string if one of these rulles matched, * -1 otherwise. */ private int thirdConjDirAllPersVerbRules(String gramText, String lemma) { int newBegin = -1; // Verb-specific rules. if (newBegin == -1) newBegin = simpleRuleOptHyperns("-dziedu, -dziedi, -dzied, pag. -dzied?ju", "dzied?t", 17, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizdzied?t if (newBegin == -1) newBegin = simpleRuleOptHyperns("-guu, -guli, -gu, pag. -gulju", "gult", 17, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizgult if (newBegin == -1) newBegin = simpleRuleOptHyperns("-klimstu, -klimsti, -klimst, pag. -klimstju", "klimstt", 17, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizklimstt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-kustu, -kusti, -kust, pag. -kustju", "kustt", 17, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizkustt if (newBegin == -1) newBegin = simpleRuleOptHyperns("-turu, -turi, -tur, pag. -turju", "turt", 17, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizturt // Generic ending rules. 
if (newBegin == -1) newBegin = simpleRule("-u, -i, -a, pag. -ju", "t", 17, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizstt
        if (newBegin == -1) newBegin = simpleRule("-inu, -ini, -ina, pag. -in?ju", "in?t", 17,
                new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizsvilin?t
        return newBegin;
    }

    /**
     * Paradigm 18: verbs, 1st conjugation, reflexive.
     * Rules in form "parasti 3. pers., -alcas, pag. -alc?s".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these rules
     * for verbs are long and highly specific and, thus, do not conflict
     * with other rules.
     * <p>
     * Rules are tried in the order written; the first result other than -1
     * is returned immediately.
     * @return new beginning for gram string if one of these rules matched,
     * -1 otherwise.
     */
    private int firstConjRef3PersVerbRules(String gramText, String lemma) {
        int matchedAt;
        // Rules ordered alphabetically by verb infinitive.
        // A, B, C
        // D
        matchedAt = simpleRule("parasti 3. pers., -dcas, pag. -dc?s", "dkties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"dkties\"" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // aizdkties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("parasti 3. pers., -duras, pag. -dr?s", "durties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"durties\"" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // aizdurties
        if (matchedAt != -1) return matchedAt;

        // E, F
        // G
        matchedAt = simpleRuleOptHyperns("parasti 3. pers., -g?as, pag. -g?z?s", "g?zties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"g?zties\"" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // aizg?zties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("parasti 3. pers., -grauas, pag. -grauz?s", "grauzties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"grauzties\"" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // aizgrauzties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("parasti 3. pers., -grieas, pag. -griez?s", "griezties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"griezties\"" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // aizgriezties 2
        if (matchedAt != -1) return matchedAt;

        // H, I, J
        // K
        matchedAt = simpleRule("parasti 3. pers., -kaucas, pag. -kauc?s", "kaukties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"kaukties\"" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // aizkaukties
        if (matchedAt != -1) return matchedAt;

        // L, M, N, O, P, R
        // S
        // Last rule: its result (match position or -1) is the overall result.
        return simpleRule("parasti 3. pers., -alcas, pag. -alc?s", "alkties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"alkties\"" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // aizalkties
        // T, U, V, Z
    }

    /**
     * Paradigm 18: verbs, 1st conjugation, reflexive.
     * Rules in form "-tupstos, -tupsties, -tupstas, pag. -tupos".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these rules
     * for verbs are long and highly specific and, thus, do not conflict
     * with other rules.
     * <p>
     * Rules are tried in the order written; the first result other than -1
     * is returned immediately.
     * NOTE(review): the string literals appear to have lost their non-ASCII
     * Latvian letters; they are reproduced exactly as found.
     * @return new beginning for gram string if one of these rules matched,
     * -1 otherwise.
     */
    private int firstConjRefAllPersVerbRules(String gramText, String lemma) {
        int matchedAt;
        // Rules ordered alphabetically by verb infinitive.
        // A
        // B
        matchedAt = simpleRuleOptHyperns("-br?os, -br?zies, -br?as, pag. -br?os", "br?zties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"br?zties\"" }, null, gramText, lemma); // aizbr?zties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("-brcos, -brcies, -brcas, pag. -brcos", "brkties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"brkties\"" }, null, gramText, lemma); // aizbrkties
        if (matchedAt != -1) return matchedAt;

        // C
        matchedAt = simpleRuleOptHyperns("-cieos, -cieties, -cieas, pag. -cietos", "ciesties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"ciesties\"" }, null, gramText, lemma); // aizciesties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-crtos, -crties, -crtas, pag. -cirtos", "cirsties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"cirsties\"" }, null, gramText, lemma); // aizcirsties
        if (matchedAt != -1) return matchedAt;

        // D
        matchedAt = simpleRule("-degos, -dedzies, -degas, pag. -degos", "degties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"degties\"" }, null, gramText, lemma); // aizdegties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-dr?os, -dr?zies, -dr?as, pag. -dr?zos", "dr?zties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"dr?zties\"" }, null, gramText, lemma); // aizdr?zties
        if (matchedAt != -1) return matchedAt;

        // E
        matchedAt = simpleRule("-elos, -elsies, -elas, pag. -elsos", "elsties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"elsties\"" }, null, gramText, lemma); // aizelsties
        if (matchedAt != -1) return matchedAt;

        // F
        // G
        matchedAt = simpleRule("-g?rdzos, -g?rdzies, -g?rdzas, pag. -g?rdzos", "g?rgties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"g?rgties\"" }, null, gramText, lemma); // aizg?rgties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-grieos, -griezies, -grieas, pag. -griezos", "griezties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"griezties\"" }, null, gramText, lemma); // aizgriezties 1
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns(
                "-gulstos, -gulsties, -gulstas, ar -guos, -gulies, -guas, pag. -glos, ar -gulos", "gulties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"gulties\"", "Parall?s formas" },
                null, gramText, lemma); // aizgulties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("-gstos, -gsties, -gstas, pag. -guvos", "gties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"gties\"" }, null, gramText, lemma); // aizgties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("-iedos, -iedies, -iedas, pag. -gidos", "isties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"isties\"" }, null, gramText, lemma); // apisties
        if (matchedAt != -1) return matchedAt;

        // H
        // I
        // NOTE(review): "-ejos" appears twice in this pattern - confirm it
        // mirrors the dictionary entry.
        matchedAt = simpleRule("-ejos, -ejos, -ietas, pag. -g?jos", "ieties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"ieties\"" }, null, gramText, lemma); // apieties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-iros, -iries, -iras, pag. -ros", "irties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"irties\" (k? ar airiem)" },
                null, gramText, lemma); // aizirties
        if (matchedAt != -1) return matchedAt;

        // J
        matchedAt = simpleRuleOptHyperns("-jdzos, -jdzies, -jdzas, pag. -jdzos", "jgties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"jgties\"" }, null, gramText, lemma); // aizjgties
        if (matchedAt != -1) return matchedAt;

        // K
        matchedAt = simpleRuleOptHyperns("-karos, -karies, -karas, pag. -k?ros", "k?rties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"k?rties\"" }, null, gramText, lemma); // apk?rties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-kl?jos, -kl?jies, -kl?jas, pag. -kl?jos", "kl?ties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"kl?ties\"" }, null, gramText, lemma); // apkl?ties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("-kliedzos, -kliedzies, -kliedzas, pag. -kliedzos", "kliegties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"kliegties\"" }, null, gramText, lemma); // aizkliegties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("-kr?cos, -kr?cies, -kr?cas, pag. -kr?cos", "kr?kties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"kr?kties\"" }, null, gramText, lemma); // aizkr?kties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-kuos, -kulies, -kuas, pag. -klos", "kulties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"kulties\"" }, null, gramText, lemma); // aizkulties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-ros, -eries, -eras, pag. -ros", "erties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"erties\"" }, null, gramText, lemma); // aizerties
        if (matchedAt != -1) return matchedAt;

        // L
        matchedAt = simpleRuleOptHyperns("-laios, -laidies, -laias, pag. -laidos", "laisties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"laisties\"" }, null, gramText, lemma); // aizlaisties
        if (matchedAt != -1) return matchedAt;

        // NOTE(review): past form ends in "-lauz?s" while the sibling rules
        // end in "-os" - confirm it mirrors the dictionary entry.
        matchedAt = simpleRuleOptHyperns("-lauos, -lauzies, -lauas, pag. -lauz?s", "lauzties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"lauzties\"" }, null, gramText, lemma); // aizlauzties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-liedzos, -liedzies, -liedzas, pag. -liedzos", "liegties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"liegties\"" }, null, gramText, lemma); // aizliegties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-liecos, -liecies, -liecas, pag. -liecos", "liekties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"liekties\"" }, null, gramText, lemma); // aizliekties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRuleOptHyperns("-liekos, -liecies, -liekas, pag. -likos", "likties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"likties\"" }, null, gramText, lemma); // aizlikties
        if (matchedAt != -1) return matchedAt;

        // M, N, O, P, R, S
        // T
        // Last rule: its result (match position or -1) is the overall result.
        return simpleRuleOptHyperns("-tupstos, -tupsties, -tupstas, pag. -tupos", "tupties", 18,
                new String[] { "Darbbas v?rds", "Loct k? \"tupties\"", "Parall?s formas" },
                null, gramText, lemma); // aiztupties; TODO check parallel forms.
        // U, V, Z
    }

    /**
     * Paradigm 19: verbs, 2nd conjugation, reflexive.
     * Rules in form "parasti 3. pers., -jas, pag. -j?s",
     * "-jos, -jies, -jas, -jamies, -jaties, pag. -jos, -j?mies, -j?ties; pav. -jies, -jieties",
     * and "-ojos, -ojies, -ojas, pag. -ojos".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these rules
     * for verbs are long and highly specific and, thus, do not conflict
     * with other rules.
     * @return new beginning for gram string if one of these rules matched,
     * -1 otherwise.
     */
    private int secondConjRefVerbRules(String gramText, String lemma) {
        // Paradigm 19: verbs, 2nd conjugation, reflexive.
        // Rules are tried in the order written; first result other than -1 wins.
        int matchedAt;

        matchedAt = simpleRule("parasti 3. pers., -jas, pag. -j?s", "ties", 19,
                new String[] { "Darbbas v?rds" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // absorbties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("parasti 3. pers., -ojas, pag. -oj?s", "oties", 19,
                new String[] { "Darbbas v?rds" },
                new String[] { "Parasti 3. person?" }, gramText, lemma); // daudzk?roties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule(
                "-jos, -jies, -jas, -jamies, -jaties, pag. -jos, -j?mies, -j?ties; pav. -jies, -jieties", "ties", 19,
                new String[] { "Darbbas v?rds" }, null, gramText, lemma); // adverbities
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("-ojos, -ojies, -ojas, pag. -ojos", "oties", 19,
                new String[] { "Darbbas v?rds" }, null, gramText, lemma); // aiztuntuoties, apgrkoties
        if (matchedAt != -1) return matchedAt;

        matchedAt = simpleRule("-jos, -jies, -jas, pag. -jos", "ties", 19,
                new String[] { "Darbbas v?rds" }, null, gramText, lemma); // abstrahties
        if (matchedAt != -1) return matchedAt;

        return simpleRule("-?jos, -?jies, -?jas, pag. -?jos", "?ties", 19,
                new String[] { "Darbbas v?rds" }, null, gramText, lemma); // aizdom?ties
    }

    /**
     * Paradigm 20: verbs, 3rd conjugation, reflexive.
     * Rules in form "parasti 3. pers., -?s, pag. -j?s".
     * This function is separated out for readability from
     * {@link #processBeginingWithPatterns(String, String)} as currently these rules
     * for verbs are long and highly specific and, thus, do not conflict
     * with other rules.
     * <p>
     * Every row is tried in order with simpleRule; the first result other
     * than -1 is returned and no further rows are tried.
     * NOTE(review): the string literals appear to have lost their non-ASCII
     * Latvian letters; adjacent identical rows presumably differed only in
     * those letters. They are kept exactly as found - confirm against the
     * original file.
     * @return new beginning for gram string if one of these rules matched,
     * -1 otherwise.
     */
    private int thirdConjRef3PersVerbRules(String gramText, String lemma) {
        // Rows are { pattern, lemma ending }.
        final String[][] rules = {
            // Verb-specific rules, ordered alphabetically by verb infinitive.
            // A
            // B
            { "parasti 3. pers., -brikas, pag. -brikj?s", "brikties" }, // aizbrikties
            { "parasti 3. pers., -brikas, pag. -brikj?s", "brikties" }, // aizbrikties
            { "parasti 3. pers., -brkas, pag. -brkj?s", "brkties" }, // aizbrkties
            { "parasti 3. pers., -brkas, pag. -brkj?s", "brkties" }, // aizbrkties
            // C
            { "parasti 3. pers., -?abas, pag. -?abj?s", "?abties" }, // aiz?abties
            { "parasti 3. pers., -?aukstas, pag. -?aukstj?s", "?aukstties" }, // aiz?aukstties
            // D
            { "parasti 3. pers., -d?rdas, pag. -d?rdj?s", "d?rdties" }, // aizd?rdties
            { "parasti 3. pers., -drebas, pag. -drebj?s", "drebties" }, // aizdrebties
            // E, F
            // G
            // NOTE(review): pattern has "-gr?b?s" but the ending is
            // "grabties" - confirm against the dictionary entry.
            { "parasti 3. pers., -gr?b?s, pag. -gr?bj?s", "grabties" }, // aizgrabties
            { "parasti 3. pers., -gurkstas, pag. -gurkstj?s", "gurkstties" }, // aizgurkstties
            // H, I, J
            // K
            { "parasti 3. pers., -klabas, pag. -klabj?s", "klabties" }, // aizklabties
            { "parasti 3. pers., -klaudzas, pag. -klaudzj?s", "klaudzties" }, // aizklaudzties
            { "parasti 3. pers., -klukstas, pag. -klukstj?s", "klukstties" }, // aizklukstties
            { "parasti 3. pers., -klunkas, pag. -klunkj?s", "klunkties" }, // aizklunkties
            { "parasti 3. pers., -klunkas, pag. -klunkj?s", "klunkties" }, // aizklunkties
            { "parasti 3. pers., -knakst?s, pag. -knakstj?s", "knakstties" }, // aizknakstties
            { "parasti 3. pers., -knakas, pag. -knakj?s", "knakties" }, // aizknakties
            { "parasti 3. pers., -knakas, pag. -knakj?s", "knakties" }, // aizknakties
            { "parasti 3. pers., -knaukas, pag. -knaukj?s", "knaukties" }, // aizknaukties
            { "parasti 3. pers., -knaukas, pag. -knaukj?s", "knaukties" }, // aizknaukties
            { "parasti 3. pers., -knikas, pag. -knikj?s", "knikties" }, // aizknikties
            { "parasti 3. pers., -knikas, pag. -knikj?s", "knikties" }, // aizknikties
            { "parasti 3. pers., -krakstas, pag. -krakstj?s", "krakstties" }, // aizkrakstties
            { "parasti 3. pers., -krakas, pag. -krakj?s", "krakties" }, // aizkrakties
            { "parasti 3. pers., -kurkstas, pag. -kurkstj?s", "kurkstties" }, // aizkurkstties
            { "parasti 3. pers., -kurkas, pag. -kurkj?s", "kurkties" }, // aizkurkties
            // L, M, N, O, P, R, S, T, U, V, Z
            // Generic ending rules.
            { "parasti 3. pers., -as, pag. -j?s", "ties" }, // aiz?iepstties
            { "parasti 3. pers., -in?s, pag. -in?j?s", "in?ties" }, // aizbubin?ties
            { "parasti 3. pers., -?s, pag. -j?s", "ties" }, // aizbdties
        };
        for (String[] rule : rules) {
            int matchedAt = simpleRule(rule[0], rule[1], 20,
                    new String[] { "Darbbas v?rds" },
                    new String[] { "Parasti 3. person?" }, gramText, lemma);
            if (matchedAt != -1) return matchedAt;
        }
        return -1;
    }

    /**
     * Paradigm 20: Darbbas v?rdi 3. konjug?cija atgriezeniski
     * Rules in form "parasti 3. pers., -?s, pag. -j?s" and
     * "-os, -ies, -?s, pag. -jos".
* This function is seperated out for readability from * {@link #processBeginingWithPatterns(String, String)} as currently these rules * for verbs are long and highly specific and, thus, do not conflict * with other rules. * @return new begining for gram string if one of these rulles matched, * -1 otherwise. */ private int thirdConjRefAllPersVerbRules(String gramText, String lemma) { int newBegin = -1; // Verb-specific rules. if (newBegin == -1) newBegin = simpleRule("-dziedos, -dziedies, -dziedas, pag. -dzied?jos", "dzied?ties", 20, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizdzied?ties if (newBegin == -1) newBegin = simpleRuleOptHyperns("-dzenos, -dzenies, -dzenas, pag. -dzinos", "dzties", 20, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizdzties if (newBegin == -1) newBegin = simpleRule("-guos, -gulies, -guas, pag. -guljos", "gulties", 20, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizgulties if (newBegin == -1) newBegin = simpleRule("-kustos, -kusties, -kustas, pag. -kustjos", "kustties", 20, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //aizkustties // Generic ending rules. if (newBegin == -1) newBegin = simpleRule("-os, -ies, -as, pag. -jos", "ties", 20, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //apkaunties if (newBegin == -1) newBegin = simpleRule("-inos, -inies, -in?s, pag. -in?jos", "in?ties", 20, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //apklauin?ties if (newBegin == -1) newBegin = simpleRule("-os, -ies, -?s, pag. -jos", "ties", 20, new String[] { "Darbbas v?rds" }, null, gramText, lemma); //apklausties return newBegin; } /** * @param lemma is used for paradigm detection in cases where endings * matter. 
*/ private void paradigmFromFlags(String lemma) { if (flags.contains("pabas v?rds")) { if (lemma.endsWith("ais") || lemma.endsWith("?")) paradigm.add(30); else if (lemma.matches(".*[^aeiou??]s")) paradigm.add(13); else if (lemma.matches(".*[^aeiou??]")) paradigm.add(14); } if (flags.contains("Darbbas v?rds")) { if (lemma.endsWith("t") || lemma.endsWith("in?t")) paradigm.add(17); if (lemma.endsWith("ties") || lemma.endsWith("in?ties")) paradigm.add(20); } if (flags.contains("Apst?ka v?rds")) paradigm.add(21); if (flags.contains("Partikula")) paradigm.add(28); if (flags.contains("Priev?rds")) paradigm.add(26); if (flags.contains("Izsauksmes v?rds")) paradigm.add(29); // Hardcoded if (flags.contains("Sasin?jums")) paradigm.add(29); // Hardcoded if (flags.contains("V?rds svevalod?")) paradigm.add(29); if (flags.contains("Vietniekv?rds")) paradigm.add(25); if (flags.contains("Jaut?jamais vietniekv?rds")) paradigm.add(25); if (flags.contains("Noliedzamais vietniekv?rds")) paradigm.add(25); if (flags.contains("Nor?d?mais vietniekv?rds")) paradigm.add(25); if (flags.contains("Noteicamais vietniekv?rds")) paradigm.add(25); if (flags.contains("Piederbas vietniekv?rds")) paradigm.add(25); if (flags.contains("Visp?rin?mais vietniekv?rds")) paradigm.add(25); if (flags.contains("Priedklis")) paradigm.add(0); //Prefixes are not words. if (flags.contains("Salikteu daa")) paradigm.add(0); //Prefixes are not words. } /** * This should be called after something is removed from leftovers. 
*/
    public void cleanupLeftovers() {
        // Drop every empty inner list. Removing through the iterator keeps
        // this a single pass over the linked list (indexed get/remove on a
        // LinkedList walks the list on every call).
        Iterator<LinkedList<String>> it = leftovers.iterator();
        while (it.hasNext()) {
            if (it.next().isEmpty()) {
                it.remove();
            }
        }
    }

    /**
     * Hopefully, this method will be empty for final data ;)
     * Currently all corrections are disabled and the text is returned as is.
     * @param gramText grammar text to correct.
     * @return the given text, unchanged.
     */
    private String correctOCRErrors(String gramText) {
        //Inconsistencies in data

        //gramText = gramText.replaceAll("^m?t\\.", "mat\\.");
        //gramText = gramText.replace(" m?t.", " mat.");
        //gramText = gramText.replace("vsk..", "vsk.");
        //gramText = gramText.replace("vsk .", "vsk.");
        //gramText = gramText.replaceAll("^gen\\.", "en\\.");
        //gramText = gramText.replace(" gen.", " en.");
        //gramText = gramText.replaceAll("^trans;", "trans\\.;");
        //gramText = gramText.replace(" trans;", " trans.;");

        //gramText = gramText.replace("-ais; s. -a: -?;", "-ais; s. -a, -?;"); //apgrcgs

        return gramText;
    }

    /**
     * Same as {@link #toJSON(boolean)} with the original text included.
     * @return JSON fragment describing this grammar.
     */
    public String toJSON() {
        return toJSON(true);
    }

    /**
     * Serializes this grammar as a JSON member of the form
     * {@code "Gram":{...}}. Inside the braces the members "Paradigm",
     * "AltLemmas", "Flags", "Leftovers" and "Original" are written, in this
     * order, each only when the corresponding field is non-null and
     * non-empty. Empty alternative-lemma sets and empty leftover lists are
     * skipped without leaving a dangling separator behind.
     * In case of speed problems StringBuilder can be returned.
     * @param printOrig whether to include the "Original" member (the
     * unprocessed grammar text).
     * @return JSON fragment describing this grammar.
     */
    public String toJSON(boolean printOrig) {
        StringBuilder res = new StringBuilder();

        res.append("\"Gram\":{");
        // True once at least one member has been written into "Gram".
        boolean hasPrev = false;

        if (paradigm != null && !paradigm.isEmpty()) {
            if (hasPrev) res.append(", ");
            res.append("\"Paradigm\":");
            res.append(JSONUtils.simplesToJSON(paradigm));
            hasPrev = true;
        }

        if (altLemmas != null && !altLemmas.isEmpty()) {
            if (hasPrev) res.append(", ");
            res.append("\"AltLemmas\":{");
            // The separator is written BEFORE every emitted entry except the
            // first one, so skipped (empty) entries can never leave a
            // trailing comma.
            boolean hasPrevKey = false;
            Iterator<Integer> it = altLemmas.keySet().iterator();
            while (it.hasNext()) {
                Integer next = it.next();
                if (altLemmas.getAll(next).isEmpty()) continue;

                if (hasPrevKey) res.append(", ");
                res.append("\"");
                res.append(JSONObject.escape(next.toString()));
                res.append("\":[");
                Iterator<Tuple<Lemma, HashSet<String>>> flagIt =
                        altLemmas.getAll(next).iterator();
                while (flagIt.hasNext()) {
                    Tuple<Lemma, HashSet<String>> alt = flagIt.next();
                    res.append("{");
                    res.append(alt.first.toJSON());
                    if (alt.second != null && !alt.second.isEmpty()) {
                        res.append(", \"Flags\":");
                        res.append(JSONUtils.simplesToJSON(alt.second));
                    }
                    res.append("}");
                    // Every tuple is emitted, so a look-ahead separator is safe here.
                    if (flagIt.hasNext()) res.append(", ");
                }
                res.append("]");
                hasPrevKey = true;
            }
            res.append("}");
            hasPrev = true;
        }

        if (flags != null && !flags.isEmpty()) {
            if (hasPrev) res.append(", ");
            res.append("\"Flags\":");
            res.append(JSONUtils.simplesToJSON(flags));
            hasPrev = true;
        }

        if (leftovers != null && leftovers.size() > 0) {
            if (hasPrev) res.append(", ");
            res.append("\"Leftovers\":[");
            // Same separator-before scheme as for AltLemmas: empty lists are
            // skipped, wherever they occur.
            boolean hasPrevList = false;
            for (LinkedList<String> next : leftovers) {
                if (next.isEmpty()) continue;
                if (hasPrevList) res.append(", ");
                res.append(JSONUtils.simplesToJSON(next));
                hasPrevList = true;
            }
            res.append("]");
            hasPrev = true;
        }

        if (printOrig && orig != null && orig.length() > 0) {
            if (hasPrev) res.append(", ");
            res.append("\"Original\":\"");
            res.append(JSONObject.escape(orig));
            res.append("\"");
        }

        res.append("}");
        return res.toString();
    }
}