opennlp.tools.tokenize.TokenizerME.java Source code

Introduction

Here is the source code for opennlp.tools.tokenize.TokenizerME.java, from Apache OpenNLP.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.tokenize;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;

/**
 * A Tokenizer for converting raw text into separated tokens.  It uses
 * Maximum Entropy to make its decisions.  The features are loosely
 * based on Jeff Reynar's UPenn thesis "Topic Segmentation:
 * Algorithms and Applications", which is available from his
 * homepage: <a href="http://www.cis.upenn.edu/~jcreynar">http://www.cis.upenn.edu/~jcreynar</a>.
 * <p>
 * This tokenizer needs a statistical model to tokenize text; it reproduces
 * the tokenization observed in the training data used to create the model.
 * The {@link TokenizerModel} class encapsulates the model and provides
 * methods to create it from the binary representation.
 * <p>
 * A tokenizer instance is not thread-safe. Each thread must instantiate
 * its own tokenizer, and the tokenizers can share one <code>TokenizerModel</code>
 * instance to save memory.
 * <p>
 * To train a new model, the
 * {@link #train(ObjectStream, TokenizerFactory, TrainingParameters)} method
 * can be used.
 * <p>
 * Sample usage:
 * <p>
 * <code>
 * InputStream modelIn;<br>
 * <br>
 * ...<br>
 * <br>
 * TokenizerModel model = new TokenizerModel(modelIn);<br>
 * <br>
 * Tokenizer tokenizer = new TokenizerME(model);<br>
 * <br>
 * String[] tokens = tokenizer.tokenize("A sentence to be tokenized.");
 * </code>
 *
 * @see Tokenizer
 * @see TokenizerModel
 * @see TokenSample
 */
public class TokenizerME extends AbstractTokenizer {

    /**
     * Constant indicating a token split.
     */
    public static final String SPLIT = "T";

    /**
     * Constant indicating no token split.
     */
    public static final String NO_SPLIT = "F";

    /**
     * Alpha-Numeric Pattern
     * @deprecated As of release 1.5.2, replaced by {@link Factory#getAlphanumeric(String)}
     */
    @Deprecated
    public static final Pattern alphaNumeric = Pattern.compile(Factory.DEFAULT_ALPHANUMERIC);

    private final Pattern alphanumeric;

    /**
     * The maximum entropy model to use to evaluate contexts.
     */
    private MaxentModel model;

    /**
     * The context generator.
     */
    private final TokenContextGenerator cg;

    /**
     * Optimization flag: when set, tokens matching the alphanumeric
     * pattern are not considered for further tokenization.
     */
    private boolean useAlphaNumericOptimization;

    /**
     * List of probabilities for each token returned from a call to
     * <code>tokenize</code> or <code>tokenizePos</code>.
     */
    private List<Double> tokProbs;

    private List<Span> newTokens;

    public TokenizerME(TokenizerModel model) {
        TokenizerFactory factory = model.getFactory();
        this.alphanumeric = factory.getAlphaNumericPattern();
        this.cg = factory.getContextGenerator();
        this.model = model.getMaxentModel();
        this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptmization();

        newTokens = new ArrayList<>();
        tokProbs = new ArrayList<>(50);
    }

    /**
     * @deprecated use {@link TokenizerFactory} to extend the Tokenizer
     *             functionality
     */
    @Deprecated
    public TokenizerME(TokenizerModel model, Factory factory) {
        String languageCode = model.getLanguage();

        this.alphanumeric = factory.getAlphanumeric(languageCode);
        this.cg = factory.createTokenContextGenerator(languageCode, getAbbreviations(model.getAbbreviations()));

        this.model = model.getMaxentModel();
        this.useAlphaNumericOptimization = model.useAlphaNumericOptimization();

        newTokens = new ArrayList<>();
        tokProbs = new ArrayList<>(50);
    }

    private static Set<String> getAbbreviations(Dictionary abbreviations) {
        if (abbreviations == null) {
            return Collections.emptySet();
        }
        return abbreviations.asStringSet();
    }

    /**
     * Returns the probabilities associated with the most recent
     * call to {@link TokenizerME#tokenize(String)} or {@link TokenizerME#tokenizePos(String)}.
     *
     * @return the probability for each token returned by the most recent
     *     call to tokenize. If not applicable, an empty array is returned.
     */
    public double[] getTokenProbabilities() {
        double[] tokProbArray = new double[tokProbs.size()];
        for (int i = 0; i < tokProbArray.length; i++) {
            tokProbArray[i] = tokProbs.get(i);
        }
        return tokProbArray;
    }

    /**
     * Tokenizes the string.
     *
     * @param d  The string to be tokenized.
     *
     * @return   A span array; each span holds the start and end offsets of one token.
     */
    public Span[] tokenizePos(String d) {
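        // First pass: split the input on whitespace. Each whitespace-delimited
        // chunk is then considered for further, model-based splitting below.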
        Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
        newTokens.clear();
        tokProbs.clear();
        for (Span s : tokens) {
            String tok = d.substring(s.getStart(), s.getEnd());
            // Can't tokenize single characters
            if (tok.length() < 2) {
                newTokens.add(s);
                tokProbs.add(1d);
            } else if (useAlphaNumericOptimization() && alphanumeric.matcher(tok).matches()) {
                newTokens.add(s);
                tokProbs.add(1d);
            } else {
                int start = s.getStart();
                int end = s.getEnd();
                final int origStart = s.getStart();
                double tokenProb = 1.0;
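                // Walk every interior character boundary of the chunk. The model
                // labels each boundary SPLIT or NO_SPLIT, and the running product
                // of the outcome probabilities becomes the token probability.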
                for (int j = origStart + 1; j < end; j++) {
                    double[] probs = model.eval(cg.getContext(tok, j - origStart));
                    String best = model.getBestOutcome(probs);
                    tokenProb *= probs[model.getIndex(best)];
                    if (best.equals(TokenizerME.SPLIT)) {
                        newTokens.add(new Span(start, j));
                        tokProbs.add(tokenProb);
                        start = j;
                        tokenProb = 1.0;
                    }
                }
                newTokens.add(new Span(start, end));
                tokProbs.add(tokenProb);
            }
        }

        return newTokens.toArray(new Span[newTokens.size()]);
    }

    /**
     * Trains a model for the {@link TokenizerME}.
     *
     * @param samples
     *          the samples used for the training.
     * @param factory
     *          a {@link TokenizerFactory} to get resources from
     * @param mlParams
     *          the machine learning train parameters
     * @return the trained {@link TokenizerModel}
     * @throws IOException
     *           if an I/O error occurs on the temp file created during
     *           training, or if reading from the {@link ObjectStream}
     *           fails.
     */
    public static TokenizerModel train(ObjectStream<TokenSample> samples, TokenizerFactory factory,
            TrainingParameters mlParams) throws IOException {

        Map<String, String> manifestInfoEntries = new HashMap<>();

        // Each TokenSample is converted into a stream of SPLIT / NO_SPLIT
        // classification events from which the maxent model is trained.
        ObjectStream<Event> eventStream = new TokSpanEventStream(samples, factory.isUseAlphaNumericOptmization(),
                factory.getAlphaNumericPattern(), factory.getContextGenerator());

        EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries);

        MaxentModel maxentModel = trainer.train(eventStream);

        return new TokenizerModel(maxentModel, manifestInfoEntries, factory);
    }

    /**
     * Returns the value of the alpha-numeric optimization flag.
     *
     * @return true if the tokenizer should use alpha-numeric optimization, false otherwise.
     */
    public boolean useAlphaNumericOptimization() {
        return useAlphaNumericOptimization;
    }
}
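
Example Usage

The snippet below expands the sample usage from the class Javadoc into a
complete program. It is a minimal sketch: the model file name "en-token.bin"
and the example sentence are illustrative assumptions, not part of the
original source.

import java.io.FileInputStream;
import java.io.InputStream;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

public class TokenizerExample {

    public static void main(String[] args) throws Exception {
        // "en-token.bin" is a placeholder; substitute any tokenizer model
        // trained for your language.
        try (InputStream modelIn = new FileInputStream("en-token.bin")) {
            TokenizerModel model = new TokenizerModel(modelIn);

            // TokenizerME is not thread-safe: create one instance per thread,
            // though all of them may share a single TokenizerModel.
            TokenizerME tokenizer = new TokenizerME(model);

            String[] tokens = tokenizer.tokenize("A sentence to be tokenized.");
            double[] probs = tokenizer.getTokenProbabilities();
            for (int i = 0; i < tokens.length; i++) {
                System.out.println(tokens[i] + "\t" + probs[i]);
            }

            // tokenizePos returns character offsets instead of strings.
            Span[] spans = tokenizer.tokenizePos("A sentence to be tokenized.");
            System.out.println(spans.length + " tokens found");
        }
    }
}

Training a new model with the static train method could look like the
following sketch. It assumes OpenNLP 1.7 or later and a training file
"en-token.train" in the token sample format (one sentence per line, split
points marked with <SPLIT>); the file names and the "en" language code are
assumptions.

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class TokenizerTrainingExample {

    public static void main(String[] args) throws Exception {
        // Read the training data line by line and parse each line into
        // a TokenSample. "en-token.train" is an illustrative file name.
        ObjectStream<String> lineStream = new PlainTextByLineStream(
                new MarkableFileInputStreamFactory(new File("en-token.train")),
                StandardCharsets.UTF_8);

        try (ObjectStream<TokenSample> samples = new TokenSampleStream(lineStream)) {
            // No abbreviation dictionary, alphanumeric optimization disabled,
            // default alphanumeric pattern for the language.
            TokenizerFactory factory = new TokenizerFactory("en", null, false, null);

            TokenizerModel model = TokenizerME.train(samples, factory,
                    TrainingParameters.defaultParams());

            // Persist the trained model for later use with TokenizerME.
            try (OutputStream modelOut = new FileOutputStream("en-token.bin")) {
                model.serialize(modelOut);
            }
        }
    }
}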