opennlp.tools.tokenize.TokenizerFactory.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.tokenize.TokenizerFactory.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.tokenize;

import java.util.Collections;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;

/**
 * The factory that provides {@link Tokenizer} default implementations and
 * resources. Users can extend this class if their application requires
 * overriding the {@link TokenContextGenerator}, {@link Dictionary} etc.
 *
 * <p>Most getters resolve their value lazily: a value passed in programmatically
 * is used when present, otherwise the value is looked up through the inherited
 * {@code artifactProvider} (when one is set).
 */
public class TokenizerFactory extends BaseToolFactory {

    private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
    private static final String USE_ALPHA_NUMERIC_OPTIMIZATION = "useAlphaNumericOptimization";
    private static final String ALPHA_NUMERIC_PATTERN = "alphaNumericPattern";

    private String languageCode;
    private Dictionary abbreviationDictionary;
    private boolean useAlphaNumericOptimization;
    private Pattern alphaNumericPattern;

    /**
     * Creates a {@link TokenizerFactory} that provides the default implementation
     * of the resources.
     */
    public TokenizerFactory() {
    }

    /**
     * Creates a {@link TokenizerFactory}. Use this constructor to
     * programmatically create a factory.
     *
     * @param languageCode
     *          the language of the natural text
     * @param abbreviationDictionary
     *          an abbreviations dictionary
     * @param useAlphaNumericOptimization
     *          if true alpha numerics are skipped
     * @param alphaNumericPattern
     *          null or a custom alphanumeric pattern (default is:
     *          "^[A-Za-z0-9]+$", provided by {@link Factory#DEFAULT_ALPHANUMERIC})
     */
    public TokenizerFactory(String languageCode, Dictionary abbreviationDictionary,
            boolean useAlphaNumericOptimization, Pattern alphaNumericPattern) {
        this.init(languageCode, abbreviationDictionary, useAlphaNumericOptimization, alphaNumericPattern);
    }

    /**
     * Stores the given settings in this factory. Called by the four-argument
     * constructor and by {@link #create} after a subclass was instantiated.
     *
     * @param languageCode
     *          the language of the natural text
     * @param abbreviationDictionary
     *          an abbreviations dictionary, may be null
     * @param useAlphaNumericOptimization
     *          whether the alpha numeric optimization is enabled
     * @param alphaNumericPattern
     *          null or a custom alphanumeric pattern
     */
    protected void init(String languageCode, Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization,
            Pattern alphaNumericPattern) {
        this.languageCode = languageCode;
        this.useAlphaNumericOptimization = useAlphaNumericOptimization;
        this.alphaNumericPattern = alphaNumericPattern;
        this.abbreviationDictionary = abbreviationDictionary;
    }

    /**
     * Checks that the mandatory manifest property is present and that the
     * optional abbreviations artifact, if present, is a {@link Dictionary}.
     *
     * @throws InvalidFormatException if the {@code useAlphaNumericOptimization}
     *     manifest property is missing or the abbreviations artifact has the wrong type
     */
    @Override
    public void validateArtifactMap() throws InvalidFormatException {
        if (this.artifactProvider.getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION) == null) {
            throw new InvalidFormatException(USE_ALPHA_NUMERIC_OPTIMIZATION + " is a mandatory property!");
        }

        Object abbreviationsEntry = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);

        // The abbreviations dictionary is optional, so only a present entry is type-checked.
        if (abbreviationsEntry != null && !(abbreviationsEntry instanceof Dictionary)) {
            throw new InvalidFormatException("Abbreviations dictionary '" + abbreviationsEntry
                    + "' has wrong type, needs to be of type Dictionary!");
        }
    }

    /**
     * Extends the artifact map of the super class with the abbreviation
     * dictionary, when one was set.
     *
     * @return the artifact map
     */
    @Override
    public Map<String, Object> createArtifactMap() {
        Map<String, Object> artifactMap = super.createArtifactMap();

        // Abbreviations are optional
        if (abbreviationDictionary != null) {
            artifactMap.put(ABBREVIATIONS_ENTRY_NAME, abbreviationDictionary);
        }

        return artifactMap;
    }

    /**
     * Extends the manifest entries of the super class with the alpha numeric
     * optimization flag and, when available, the alpha numeric pattern.
     *
     * @return the manifest entries
     */
    @Override
    public Map<String, String> createManifestEntries() {
        Map<String, String> manifestEntries = super.createManifestEntries();

        manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION, Boolean.toString(isUseAlphaNumericOptmization()));

        // alphanumeric pattern is optional
        Pattern pattern = getAlphaNumericPattern();
        if (pattern != null) {
            manifestEntries.put(ALPHA_NUMERIC_PATTERN, pattern.pattern());
        }

        return manifestEntries;
    }

    /**
     * Factory method the framework uses to create a new {@link TokenizerFactory}.
     *
     * @param subclassName the name of the class implementing the {@link TokenizerFactory},
     *     or null to create the default factory
     * @param languageCode the language code the tokenizer should use
     * @param abbreviationDictionary an optional dictionary containing abbreviations, or null if not present
     * @param useAlphaNumericOptimization indicate if the alpha numeric optimization
     *     should be enabled or disabled
     * @param alphaNumericPattern the pattern the alpha numeric optimization should use
     *
     * @return the instance of the Tokenizer Factory
     *
     * @throws InvalidFormatException if the subclass could not be instantiated or initialized;
     *     the original exception is attached as the cause
     */
    public static TokenizerFactory create(String subclassName, String languageCode,
            Dictionary abbreviationDictionary, boolean useAlphaNumericOptimization, Pattern alphaNumericPattern)
            throws InvalidFormatException {
        if (subclassName == null) {
            // will create the default factory
            return new TokenizerFactory(languageCode, abbreviationDictionary, useAlphaNumericOptimization,
                    alphaNumericPattern);
        }
        try {
            TokenizerFactory theFactory = ExtensionLoader.instantiateExtension(TokenizerFactory.class,
                    subclassName);
            theFactory.init(languageCode, abbreviationDictionary, useAlphaNumericOptimization, alphaNumericPattern);
            return theFactory;
        } catch (Exception e) {
            // The thrown exception carries both the message and the cause, so nothing
            // is printed here; reporting is left to the caller.
            String msg = "Could not instantiate the " + subclassName + ". The initialization throw an exception.";
            throw new InvalidFormatException(msg, e);
        }
    }

    /**
     * Gets the alpha numeric pattern. The pattern is resolved once and cached:
     * an explicitly set pattern wins, then the manifest property of the artifact
     * provider (if any), and finally the pattern {@link Factory} supplies for the
     * language returned by {@link #getLanguageCode()}.
     *
     * @return the user specified alpha numeric pattern or a default.
     */
    public Pattern getAlphaNumericPattern() {
        if (this.alphaNumericPattern == null) {
            if (this.artifactProvider != null) {
                String prop = this.artifactProvider.getManifestProperty(ALPHA_NUMERIC_PATTERN);
                if (prop != null) {
                    this.alphaNumericPattern = Pattern.compile(prop);
                }
            }
            // could not load from manifest, will get from language dependent factory
            if (this.alphaNumericPattern == null) {
                Factory f = new Factory();
                // Use the getter, as getContextGenerator() does: the languageCode field
                // may still be unset when the language is only known to the artifact provider.
                this.alphaNumericPattern = f.getAlphanumeric(getLanguageCode());
            }
        }
        return this.alphaNumericPattern;
    }

    /**
     * Gets whether to use alphanumeric optimization. When an artifact provider is
     * set, its manifest property is re-read on every call and replaces the stored
     * value; otherwise the programmatically supplied value is returned.
     *
     * @return true if the alpha numeric optimization is enabled, otherwise false
     */
    public boolean isUseAlphaNumericOptmization() {
        if (artifactProvider != null) {
            this.useAlphaNumericOptimization = Boolean
                    .parseBoolean(this.artifactProvider.getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION));
        }
        return this.useAlphaNumericOptimization;
    }

    /**
     * Gets the abbreviation dictionary. If none was set programmatically, it is
     * looked up in the artifact provider (when one is set) and cached.
     *
     * @return null or the abbreviation dictionary
     */
    public Dictionary getAbbreviationDictionary() {
        if (this.abbreviationDictionary == null && artifactProvider != null) {
            this.abbreviationDictionary = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);
        }
        return this.abbreviationDictionary;
    }

    /**
     * Retrieves the language code. If none was set programmatically, it is
     * taken from the artifact provider (when one is set) and cached.
     *
     * @return the language code
     */
    public String getLanguageCode() {
        if (this.languageCode == null && this.artifactProvider != null) {
            this.languageCode = this.artifactProvider.getLanguage();
        }
        return this.languageCode;
    }

    /**
     * Gets the context generator, built by {@link Factory} from the language code
     * and the abbreviations of the abbreviation dictionary (an empty set when no
     * dictionary is available).
     *
     * @return a new instance of the context generator
     */
    public TokenContextGenerator getContextGenerator() {
        Factory f = new Factory();
        Dictionary abbDict = getAbbreviationDictionary();
        Set<String> abbs = (abbDict != null) ? abbDict.asStringSet() : Collections.<String>emptySet();
        return f.createTokenContextGenerator(getLanguageCode(), abbs);
    }
}