Java tutorial
/******************************************************************************* * Copyright 2012, 2013, 2014 Institute of Mathematics and Computer Science, University of Latvia * Author: Pteris Paikens * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. *******************************************************************************/ package lv.semti.morphology.webservice; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.LinkedList; import org.json.simple.JSONValue; import org.restlet.resource.Get; import org.restlet.resource.ServerResource; import lv.semti.morphology.analyzer.*; import lv.semti.morphology.attributes.AttributeNames; import lv.semti.morphology.corpus.Statistics; public class VerbResource extends ServerResource { @Get public String retrieve() { return parsequery(true); } public String parsequery(Boolean verb) { String query = (String) getRequest().getAttributes().get("query"); try { query = URLDecoder.decode(query, "UTF8"); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } MorphoServer.analyzer.defaultSettings(); LinkedList<Word> tokens = Splitting.tokenize(MorphoServer.analyzer, query); String debug = ""; for (Word token : tokens) { if (token.isRecognized()) debug += token.wordforms.get(0).getDescription(); else debug += token.getToken(); debug += "\n"; } debug += String.valueOf(tokens.size()); String tag = ""; if (tokens.size() == 1) tag = tagWord(tokens.get(0), verb); else tag = tagChunk(tokens); // Heiristikas vair?kv?rdu situ?cijas risin?anai if (tag == "") return debug; else return tag; } private String tagWord(Word word, Boolean verb) { // Atrodam ticam?ko tagu, ja ir viens v?rds analizjams LinkedHashSet<String> tags = new LinkedHashSet<String>(); if (word.isRecognized()) { Wordform maxwf = word.wordforms.get(0); double maxticamba = -1; for (Wordform wf : word.wordforms) { // Paskatamies visus atrastos variantus un emam statistiski ticam?ko //tag += String.format("%s\t%d\n", wf.getDescription(), MorphoServer.statistics.getTicamba(wf)); double ticamba = Statistics.getStatistics().getEstimate(wf); if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb) == verb) ticamba += 200; if (ticamba > maxticamba) { maxticamba = ticamba; maxwf = wf; } } //System.out.printf("Ticamiiba vaardam %s ir %d", maxwf.getToken(), maxticamba); if (maxwf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb)) { // Verbiem tagi ar verba personu String person = maxwf.getValue(AttributeNames.i_Person); if (person != null && person.length() == 1) tags.add("V" + person); if (maxwf.isMatchingStrong(AttributeNames.i_Izteiksme, AttributeNames.v_Nenoteiksme)) tags.add("Inf"); Collections.addAll(tags, "V1","V2","V3","Inf"); } if (maxwf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun) || maxwf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Pronoun) ) { // Lietv?rdiem un vietniekv?rdiem tagi ar locjumu String ncase = caseCode(maxwf.getValue(AttributeNames.i_Case)); if (ncase != null) tags.add(ncase); Collections.addAll(tags, "Nom","Gen","Dat","Acc","Loc"); } if (maxwf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adverb)) { // Apst?ka v?rdi Collections.addAll(tags, "Adv"); } } if (tags.isEmpty()) Collections.addAll(tags, "Nom","Gen","Dat","Acc","Loc", "V1","V2","V3", "Inf","S","TR", "Adv"); // Ja nesaprat?m, dodam visus variantus return formatJSON(tags); } private String tagChunk(LinkedList<Word> tokens) { LinkedHashSet<String> tags = new LinkedHashSet<String>(); // da?di minjumi. norm?li bu tikai ar sintakses analzi //tags.add(String.valueOf(tokens.size())); //tags.add(tokens.get(0).getToken()); //tags.add(tokens.get(0).getPartOfSpeech()); if (tokens.size() > 1 && tokens.get(0).isRecognized() && tokens.get(0).hasAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Preposition)) { // ja fr?ze s?kas ar priev?rdu for (Wordform wf : tokens.get(0).wordforms) { //tags.add(wf.getDescription()); if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Preposition)) { String ncase = wf.getValue(AttributeNames.i_Rekcija); if (ncase != null) tags.add(wf.getToken() + caseCode(ncase)); } } } //ja s?kas ar saikli, tad vareetu buut paliigteikums if (tokens.size() > 1 && tokens.get(0).isRecognized() && tokens.get(0).hasAttribute(AttributeNames.i_PartOfSpeech, AttributeNames.v_Conjunction)) { tags.add("S"); } if (tags.isEmpty()) return tagWord(tokens.getLast(), false); // Ja nesaprat?m, dodam pdj? v?rda analzi - Gunta teica, ka esot ticam?k t? return formatJSON(tags); } private String formatJSON(Collection<String> tags) { Iterator<String> i = tags.iterator(); String out = "["; while (i.hasNext()) { out += "\"" + JSONValue.escape(i.next()) + "\""; if (i.hasNext()) out += ", "; } out += "]"; return out; } private String caseCode(String caseName) { if (caseName == null) return null; String result = null; if (caseName.equals(AttributeNames.v_Nominative)) result = "Nom"; if (caseName.equals(AttributeNames.v_Genitive)) result = "Gen"; if (caseName.equals(AttributeNames.v_Dative)) result = "Dat"; if (caseName.equals(AttributeNames.v_Accusative)) result = "Acc"; if (caseName.equals(AttributeNames.v_Locative)) result = "Loc"; return result; } }