Source code

Java tutorial


Here is the source code for hu.ppke.itk.nlpg.purepos.common.AnalysisQueue.java


/*******************************************************************************
 * Copyright (c) 2012 György Orosz, Attila Novák.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v3
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/
 *
 * This file is part of PurePos.
 *
 * PurePos is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * PurePos is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * Contributors:
 *     György Orosz - initial API and implementation
 ******************************************************************************/
package hu.ppke.itk.nlpg.purepos.common;

import hu.ppke.itk.nlpg.docmodel.IToken;
import hu.ppke.itk.nlpg.docmodel.internal.Token;
import hu.ppke.itk.nlpg.purepos.model.IProbabilityModel;
import hu.ppke.itk.nlpg.purepos.model.IVocabulary;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;

/**
 * Global holder for tags and stems in a pretagged input.
 *
 * @author György Orosz
 */
public class AnalysisQueue {
    // Indexed by token position. Each entry maps an analysis string
    // (lemma immediately followed by its bracketed tag) to the weight
    // read from the input, or 1 when the input gives no weight.
    protected ArrayList<Map<String, Double>> anals;
    // (tag, (stem, prob))
    // protected ArrayList<Map<String, Double>> stems;
    // Indexed by token position. An entry is set to true when at least one
    // analysis at that position carries an explicit "$$" weight; readers only
    // test the entry for being non-null.
    protected ArrayList<Boolean> useProb;
    // Indexed by token position: the word form that precedes "{{" in the input.
    protected ArrayList<String> words;

    // protected static String alnumPat = "\\p{L}\\p{N}";
    // protected static String punctPat = "\\p{P}";
    // protected static String tagPat = "[^\\]]";
    // protected static String stringPat = "([" + alnumPat + punctPat + "]+)";
    // protected static String analPat = "((" + stringPat + "(\\[[" + tagPat
    // + "|]+\\])+)(\\$\\$-?[0-9]+(\\.[0-9]+)?)?)";
    // Regular expression matching the literal "||" that separates the
    // alternative analyses of one token.
    protected static String analSplitRe = "\\|\\|";

    // protected static String weightSplitPat = "\\$\\$";
    // protected static Pattern analFormPat = Pattern.compile(stringPat
    // + "\\{\\{(" + analPat + "(\\|\\|" + analPat + ")*" + ")\\}\\}");

    public void init(int capacity) {
        anals = new ArrayList<Map<String, Double>>(capacity);
        // stems = new ArrayList<Map<String, Double>>(capacity);
        useProb = new ArrayList<Boolean>(capacity);
        words = new ArrayList<String>(capacity);
        for (int i = 0; i < capacity; ++i) {
            // stems.add(null);

    public void addWord(String input, Integer position) {
        Pair<String, List<String>> res = parse(input);
        String word = res.getLeft();
        List<String> analsList = res.getRight();

        words.set(position, word);
        anals.set(position, new HashMap<String, Double>());

        for (String anal : analsList) {
            int indexOfValSep = anal.indexOf("$$");
            String lemmaTag = anal;
            double prob = 1;
            if (indexOfValSep > -1) {
                useProb.set(position, true);
                prob = Double.parseDouble(anal.substring(indexOfValSep + 2));
                lemmaTag = anal.substring(0, indexOfValSep);
            anals.get(position).put(lemmaTag, prob);



    public boolean hasAnal(Integer position) {
        return anals.size() > position && anals.get(position) != null;

    public Map<String, Double> getAnals(Integer position) {
        return anals.get(position);

    public boolean useProbabilties(Integer position) {
        if (useProb.size() > position)
            return useProb.get(position) != null;
        return false;


    public IProbabilityModel<Integer, String> getLexicalModelForWord(Integer position,
            IVocabulary<String, Integer> tagVocabulary) {
        Map<Integer, Double> retMap = transformTags(position, tagVocabulary);
        return new OneWordLexicalModel(retMap, this.words.get(position));

    protected Map<Integer, Double> transformTags(Integer position, IVocabulary<String, Integer> tagVocabulary) {
        Map<Integer, Double> retMap = new HashMap<Integer, Double>();
        for (Map.Entry<String, Double> entry : this.anals.get(position).entrySet()) {
            String tagStr = anal2tag(entry.getKey());
            Integer tag = tagVocabulary.getIndex(tagStr);
            if (tag == null) {
                tag = tagVocabulary.addElement(tagStr);
            retMap.put(tag, entry.getValue());

        return retMap;

    public Set<Integer> getTags(Integer position, IVocabulary<String, Integer> tagVocabulary) {
        Map<Integer, Double> retMap = transformTags(position, tagVocabulary);
        return retMap.keySet();


    public Set<IToken> getAnalysises(Integer position) {
        Set<String> fanals = anals.get(position).keySet();
        Set<IToken> ret = new HashSet<IToken>();
        for (String fa : fanals) {
            ret.add(new Token(words.get(position), anal2lemma(fa), anal2tag(fa)));
        return ret;

    public static Pair<String, List<String>> parse(String token) {
        int wordRB = token.indexOf("{{");
        int analRB = token.indexOf("}}");
        String word = token.substring(0, wordRB);
        String analsStrings = token.substring(wordRB + 2, analRB);
        List<String> analsList = Arrays.asList(analsStrings.split(analSplitRe));
        return ImmutablePair.of(word, analsList);

    public static boolean isPreanalysed(String word) {
        return word.indexOf("{{") > 0 && word.lastIndexOf("}}") > 0;

    public static String clean(String word) {
        return word.substring(0, word.indexOf("{{"));

    public static String anal2tag(String anal) {
        return anal.substring(anal.indexOf("["));

    public static String anal2lemma(String anal) {
        return anal.substring(0, anal.indexOf("["));