nl.rug.eco.lucene.EnglishLemmaTokenizer.java Source code

Introduction

Here is the source code for nl.rug.eco.lucene.EnglishLemmaTokenizer.java
Source

/*
 * Lemmatizing library for Lucene Copyright (c) 2010-2011 Lars Buitinck
 * 
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */

package nl.rug.eco.lucene;

import java.io.*;
import java.util.*;
import java.util.regex.*;
import com.google.common.collect.Iterables;
import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/**
 * A tokenizer that retrieves the lemmas (base forms) of English words. Relies
 * internally on the sentence splitter and tokenizer supplied with the Stanford
 * POS tagger.
 * 
 * @author Lars Buitinck
 * @version 2011.0122
 */
public class EnglishLemmaTokenizer extends TokenStream {
    private Iterator<TaggedWord> tagged;
    private PositionIncrementAttribute posIncr;
    private TaggedWord currentWord;
    private TermAttribute termAtt;
    private boolean lemmaNext;
    private MaxentTagger tagger;

    public Iterator<TaggedWord> getTaggedIterator() {
        return tagged;
    }

    public TermAttribute getTermAtt() {
        return termAtt;
    }

    /**
     * Construct a tokenizer processing the given input and a tagger using the
     * given model file.
     */
    /*   public EnglishLemmaTokenizer(Reader input, String posModelFile) throws Exception {
          this(input, EnglishLemmaAnalyzer.makeTagger(posModelFile));
       }
    */
    /**
     * Construct a tokenizer processing the given input using the given
     * tagger.
     */
    public EnglishLemmaTokenizer(MaxentTagger tagger) {
        super();

        this.tagger = tagger;
        lemmaNext = false;
        posIncr = addAttribute(PositionIncrementAttribute.class);
        termAtt = addAttribute(TermAttribute.class);
    }

    public void start(Reader input) {

        List<List<HasWord>> tokenized = MaxentTagger.tokenizeText(input);
        tagged = Iterables.concat(tagger.process(tokenized)).iterator();

    }

    /**
     * Consumers use this method to advance the stream to the next token. The
     * token stream emits inflected forms and lemmas interleaved (form1,
     * lemma1, form2, lemma2, etc.), giving lemmas and their inflected forms
     * the same PositionAttribute.
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (lemmaNext) {
            // Emit a lemma
            posIncr.setPositionIncrement(1);
            String tag = currentWord.tag();
            String form = currentWord.word();
            termAtt.setTermBuffer(Morphology.stemStatic(form, tag).word());
        } else {
            // Emit inflected form, if not filtered out.

            // 0 because the lemma will come in the same position
            int increment = 0;
            for (;;) {
                if (!tagged.hasNext())
                    return false;
                currentWord = tagged.next();
                if (!unwantedPOS(currentWord.tag()))
                    break;
                increment++;
            }

            posIncr.setPositionIncrement(increment);
            termAtt.setTermBuffer(currentWord.word());
        }

        lemmaNext = !lemmaNext;
        return true;
    }

    private static final Pattern unwantedPosRE = Pattern
            .compile("^(CC|DT|[LR]RB|MD|POS|PRP|UH|WDT|WP|WP\\$|WRB|\\$|\\#|\\.|\\,|:)$");

    /**
     * Determines if words with a given POS tag should be omitted from the
     * index. Defaults to filtering out punctuation and function words
     * (pronouns, prepositions, "the", "a", etc.).
     * 
     * @see <a
     *      href="http://www.ims.uni-stuttgart.de/projekte/CorpusWorkbench/CQP-HTMLDemo/PennTreebankTS.html">The
     *      Penn Treebank tag set</a> used by Stanford NLP
     */
    protected boolean unwantedPOS(String tag) {
        return false;
        // return unwantedPosRE.matcher(tag).matches();
    }

    public static void main(String argv[]) throws IOException, ClassNotFoundException {
        MaxentTagger tagger = new MaxentTagger("resources.english/wsj-0-18-bidirectional-distsim.tagger");
        StringReader sr = new StringReader(//".,;'");
                "I just left JW Marriott by there. Narrowly missed seeing fire? RT @JeffVaughn: Major wildfire burning near Stone Oak & 281 in Northeast #SA.");
        EnglishLemmaTokenizer elt = new EnglishLemmaTokenizer(tagger);
        Iterator<TaggedWord> i = elt.tagged;
        // elt.lemmaNext = true;
        int ij = 0;
        elt.start(sr);
        sr = new StringReader("xcvjksdfjk");
        elt.start(sr);

        while (elt.incrementToken()) {

            if (ij++ % 2 == 1)
                System.out.println("[" + elt.termAtt + "]");
        }
    }
}