// Java tutorial
/******************************************************************************* * Copyright (c) 2012 György Orosz, Attila Novák. * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Lesser Public License v3 * which accompanies this distribution, and is available at * http://www.gnu.org/licenses/ * * This file is part of PurePos. * * PurePos is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * PurePos is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * Contributors: * György Orosz - initial API and implementation ******************************************************************************/ package hu.ppke.itk.nlpg.purepos.common; import hu.ppke.itk.nlpg.docmodel.IToken; import hu.ppke.itk.nlpg.docmodel.internal.Token; import hu.ppke.itk.nlpg.purepos.model.IProbabilityModel; import hu.ppke.itk.nlpg.purepos.model.IVocabulary; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; /** * Global holder for tags and stems in a pretagged input. 
* * @author Gyrgy Orosz * */ public class AnalysisQueue { // (position, (tag+stem, prob) protected ArrayList<Map<String, Double>> anals; // (tag, (stem, prob)) // protected ArrayList<Map<String, Double>> stems; // if the input contains probability information protected ArrayList<Boolean> useProb; protected ArrayList<String> words; // protected static String alnumPat = "\\p{L}\\p{N}"; // protected static String punctPat = "\\p{P}"; // protected static String tagPat = "[^\\]]"; // // protected static String stringPat = "([" + alnumPat + punctPat + "]+)"; // protected static String analPat = "((" + stringPat + "(\\[[" + tagPat // + "|]+\\])+)(\\$\\$-?[0-9]+(\\.[0-9]+)?)?)"; // protected static String analSplitRe = "\\|\\|"; // protected static String weightSplitPat = "\\$\\$"; // // protected static Pattern analFormPat = Pattern.compile(stringPat // + "\\{\\{(" + analPat + "(\\|\\|" + analPat + ")*" + ")\\}\\}"); public void init(int capacity) { anals = new ArrayList<Map<String, Double>>(capacity); // stems = new ArrayList<Map<String, Double>>(capacity); useProb = new ArrayList<Boolean>(capacity); words = new ArrayList<String>(capacity); for (int i = 0; i < capacity; ++i) { anals.add(null); // stems.add(null); useProb.add(null); words.add(null); } } public void addWord(String input, Integer position) { Pair<String, List<String>> res = parse(input); String word = res.getLeft(); List<String> analsList = res.getRight(); words.set(position, word); anals.set(position, new HashMap<String, Double>()); for (String anal : analsList) { int indexOfValSep = anal.indexOf("$$"); String lemmaTag = anal; double prob = 1; if (indexOfValSep > -1) { useProb.set(position, true); prob = Double.parseDouble(anal.substring(indexOfValSep + 2)); lemmaTag = anal.substring(0, indexOfValSep); } anals.get(position).put(lemmaTag, prob); } } public boolean hasAnal(Integer position) { return anals.size() > position && anals.get(position) != null; } public Map<String, Double> getAnals(Integer 
position) { return anals.get(position); } public boolean useProbabilties(Integer position) { if (useProb.size() > position) return useProb.get(position) != null; return false; } public IProbabilityModel<Integer, String> getLexicalModelForWord(Integer position, IVocabulary<String, Integer> tagVocabulary) { Map<Integer, Double> retMap = transformTags(position, tagVocabulary); return new OneWordLexicalModel(retMap, this.words.get(position)); } protected Map<Integer, Double> transformTags(Integer position, IVocabulary<String, Integer> tagVocabulary) { Map<Integer, Double> retMap = new HashMap<Integer, Double>(); for (Map.Entry<String, Double> entry : this.anals.get(position).entrySet()) { String tagStr = anal2tag(entry.getKey()); Integer tag = tagVocabulary.getIndex(tagStr); if (tag == null) { tag = tagVocabulary.addElement(tagStr); } retMap.put(tag, entry.getValue()); } return retMap; } public Set<Integer> getTags(Integer position, IVocabulary<String, Integer> tagVocabulary) { Map<Integer, Double> retMap = transformTags(position, tagVocabulary); return retMap.keySet(); } public Set<IToken> getAnalysises(Integer position) { Set<String> fanals = anals.get(position).keySet(); Set<IToken> ret = new HashSet<IToken>(); for (String fa : fanals) { ret.add(new Token(words.get(position), anal2lemma(fa), anal2tag(fa))); } return ret; } public static Pair<String, List<String>> parse(String token) { int wordRB = token.indexOf("{{"); int analRB = token.indexOf("}}"); String word = token.substring(0, wordRB); String analsStrings = token.substring(wordRB + 2, analRB); List<String> analsList = Arrays.asList(analsStrings.split(analSplitRe)); return ImmutablePair.of(word, analsList); } public static boolean isPreanalysed(String word) { return word.indexOf("{{") > 0 && word.lastIndexOf("}}") > 0; } public static String clean(String word) { return word.substring(0, word.indexOf("{{")); } public static String anal2tag(String anal) { return anal.substring(anal.indexOf("[")); } public static 
String anal2lemma(String anal) { return anal.substring(0, anal.indexOf("[")); } }