gov.va.vinci.leo.ae.ExampleWhitespaceTokenizer.java Source code

Introduction

Here is the source code for gov.va.vinci.leo.ae.ExampleWhitespaceTokenizer.java
Source

package gov.va.vinci.leo.ae;

/*
 * #%L
 * Leo Service
 * %%
 * Copyright (C) 2010 - 2014 Department of Veterans Affairs
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import gov.va.vinci.leo.descriptors.LeoTypeSystemDescription;
import gov.va.vinci.leo.descriptors.TypeDescriptionBuilder;
import gov.va.vinci.leo.tools.ConfigurationParameterImpl;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Feature;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.ConfigurationParameter;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashSet;

/**
 * Example annotator that walks the document text one character at a time, classifies each
 * character by its Unicode category and groups runs of characters into token annotations.
 * Tokens classified as words additionally receive a word annotation, unless stop word filtering
 * is active and the word appears in the stop word list.
 *
 * User: Thomas Ginter
 * Date: 11/1/13
 * Time: 1:35 PM
 */
public class ExampleWhitespaceTokenizer extends LeoBaseAnnotator {
    /**
     * Set of stop words to exclude from the document results. Entries are stored lower-cased
     * and trimmed so lookups in {@link #createToken} can normalize the same way.
     */
    private HashSet<String> stopWordSet = new HashSet<String>();

    /**
     * Token type definitions
     */
    public static final int TK_SPECIAL = 0;
    public static final int TK_NUMBER = 1;
    public static final int TK_LETTER = 2;
    public static final int TK_WORD = 3;
    public static final int TK_WHITESPACE = 4;
    public static final int TK_PUNCTUATION = 5;
    public static final int TK_NEWLINE = 6;
    public static final int TK_CONTROL = 7;
    public static final int TK_IDENTIFIER = 8;
    public static final int TK_INVALID = -1;
    public static final int TK_UNKNOWN = -2;

    /**
     * Current token type being processed
     */
    private int mTokenType;

    /**
     * True once a stop words file has been read successfully; enables filtering in createToken.
     */
    private boolean filterStopWords = false;

    //Parameter settings
    protected String tokenOutputType = null;
    protected String tokenOutputTypeFeature = null;
    protected String wordOutputType = null;
    protected String stopWordsFile = null;

    protected final static Logger logger = Logger.getLogger(ExampleWhitespaceTokenizer.class.getCanonicalName());

    /**
     * Type system constants
     */
    public static final String TOKEN_OUTPUT_TYPE_NAME = "gov.va.vinci.leo.types.Token";
    public static final String TOKEN_OUTPUT_TYPE_FEATURE_NAME = "TokenType";
    public static final String WORD_OUTPUT_TYPE_NAME = "gov.va.vinci.leo.types.WordToken";

    /**
     * No-argument constructor; leaves every parameter field at its null default.
     */
    public ExampleWhitespaceTokenizer() {
        /** Default for UIMA Init **/
    }

    /**
     * Create a tokenizer with all parameter values supplied directly.
     *
     * @param tokenOutputType        name of the type created for every token
     * @param tokenOutputTypeFeature base name of the feature on the token type that receives the token type code
     * @param wordOutputType         name of the type created for word tokens
     * @param stopWordsFile          location of the stop words file, blank or null to disable filtering
     */
    public ExampleWhitespaceTokenizer(String tokenOutputType, String tokenOutputTypeFeature, String wordOutputType,
            String stopWordsFile) {
        this.tokenOutputType = tokenOutputType;
        this.tokenOutputTypeFeature = tokenOutputTypeFeature;
        this.wordOutputType = wordOutputType;
        this.stopWordsFile = stopWordsFile;
    }

    /**
     * Read in the words from the stop words file, one word per line. Blank lines are skipped and
     * each word is lower-cased and trimmed before it is stored. Stop word filtering is only
     * switched on after the whole file has been read without error.
     *
     * @param aContext context handed to the parent initializer
     * @throws ResourceInitializationException if the stop words file cannot be opened or read
     * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext)
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        if (StringUtils.isNotBlank(stopWordsFile)) {
            //Read in the file and populate the set
            BufferedReader br = null;
            try {
                br = new BufferedReader(new InputStreamReader(getResourceAsInputStream(stopWordsFile)));
                String line = null;
                while ((line = br.readLine()) != null) {
                    if (StringUtils.isNotBlank(line)) {
                        stopWordSet.add(line.toLowerCase().trim());
                    } //if
                } //while
                filterStopWords = true;
            } catch (Exception e) {
                throw new ResourceInitializationException(e);
            } finally {
                // br is still null when opening the resource failed; closing it unguarded would
                // raise a NullPointerException here and hide the exception thrown above.
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException e) {
                        /** Just trying to close the handle **/
                        logger.warn("Could not close buffered reader: " + e);
                    }
                }
            } //finally
        }
    }//initialize method

    /**
     * Tokenize the document text of the CAS. Letters and numbers accumulate into an open token;
     * whitespace and newlines close the open token without producing a token of their own;
     * punctuation, control and unclassified characters each produce a single-character token.
     * A CAS without document text produces no tokens.
     *
     * @param aJCas CAS whose document text is tokenized
     * @throws AnalysisEngineProcessException if an output annotation cannot be created
     * @see org.apache.uima.analysis_component.JCasAnnotator_ImplBase#process(org.apache.uima.jcas.JCas)
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        super.process(aJCas);

        Feature tokenTypeFeature = aJCas.getTypeSystem().getType(tokenOutputType)
                .getFeatureByBaseName(tokenOutputTypeFeature);

        String documentText = aJCas.getDocumentText();
        if (documentText == null) {
            //Nothing to tokenize, avoid a NullPointerException on toCharArray()
            return;
        }

        char[] documentChars = documentText.toCharArray();
        int documentLength = documentChars.length;
        //TK_UNKNOWN doubles as the "no token currently open" marker for tokenBegin
        int tokenBegin = TK_UNKNOWN;
        char currCh = '0';
        int currPos = 0;
        this.mTokenType = TK_UNKNOWN;

        //Build tokens based on character classifications
        while (currPos < documentLength) {
            currCh = documentChars[currPos];

            //get current character type
            int currChType = characterType(currCh);
            switch (currChType) {
            case TK_LETTER:
                if (tokenBegin == TK_UNKNOWN) {
                    tokenBegin = currPos;
                    this.mTokenType = TK_WORD;
                } else if (this.mTokenType == TK_NUMBER) {
                    //Create a Number Token for the digits seen so far
                    createToken(aJCas, tokenBegin, currPos, this.mTokenType, tokenTypeFeature);
                    //Identifier is a combination of letters and numbers.
                    //NOTE(review): tokenBegin is not reset, so the identifier token created later
                    //starts at the same offset as the number token above - confirm the overlap is intended.
                    this.mTokenType = TK_IDENTIFIER;
                }
                break;
            case TK_NUMBER:
                if (tokenBegin == TK_UNKNOWN) {
                    tokenBegin = currPos;
                    this.mTokenType = TK_NUMBER;
                } else if (this.mTokenType == TK_WORD) {
                    //Token is an identifier if a combination of letters and numbers
                    this.mTokenType = TK_IDENTIFIER;
                }
                break;
            case TK_PUNCTUATION:
                if (tokenBegin != TK_UNKNOWN) {
                    if (!(this.mTokenType == TK_WORD && currCh == '\'')) {
                        //not a contraction so end the current token, then create one for this punctuation
                        createToken(aJCas, tokenBegin, currPos, this.mTokenType, tokenTypeFeature);
                        tokenBegin = TK_UNKNOWN;
                        this.mTokenType = TK_UNKNOWN;
                    } //if not a contraction
                } //if token already started
                //NOTE(review): an apostrophe inside a word keeps the word open but still gets its
                //own punctuation token here - confirm that is intended.
                createToken(aJCas, currPos, currPos + 1, TK_PUNCTUATION, tokenTypeFeature);
                break;
            case TK_CONTROL:
            case TK_UNKNOWN:
                //End the current token if one is in progress then create token for current char
                if (tokenBegin != TK_UNKNOWN) {
                    createToken(aJCas, tokenBegin, currPos, this.mTokenType, tokenTypeFeature);
                }
                createToken(aJCas, currPos, currPos + 1, currChType, tokenTypeFeature);
                tokenBegin = TK_UNKNOWN;
                this.mTokenType = TK_UNKNOWN;
                break;
            default:
                //whitespace and newlines: end the current open token, no token for the character itself
                if (tokenBegin != TK_UNKNOWN) {
                    createToken(aJCas, tokenBegin, currPos, this.mTokenType, tokenTypeFeature);
                    tokenBegin = TK_UNKNOWN;
                    this.mTokenType = TK_UNKNOWN;
                } //if tokenBegin != TK_UNKNOWN
            }//switch
            currPos++;
        } //while currPos < documentLength

        //We reached the end of the document, close any open tokens
        if (tokenBegin != TK_UNKNOWN) {
            createToken(aJCas, tokenBegin, currPos, this.mTokenType, tokenTypeFeature);
            this.mTokenType = TK_UNKNOWN;
        }
    }//process method

    /**
     * Create a Token annotation in the CAS using the begin and end points, type feature
     * will also be set. For tokens of type TK_WORD a word annotation over the same span is
     * created as well, unless stop word filtering is active and the covered text is blank or
     * is found (lower-cased and trimmed) in the stop word set.
     *
     * @param aJCas current CAS
     * @param begin start index of the annotation
     * @param end end index of the annotation
     * @param tokenType type of Token being created
     * @param tokenTypeFeature feature on the token annotation that receives tokenType
     * @throws AnalysisEngineProcessException if an output annotation cannot be created
     */
    private void createToken(JCas aJCas, int begin, int end, int tokenType, Feature tokenTypeFeature)
            throws AnalysisEngineProcessException {
        Annotation token = this.addOutputAnnotation(tokenOutputType, aJCas, begin, end);
        token.setIntValue(tokenTypeFeature, tokenType);

        //Only word tokens can produce a word annotation
        if (tokenType != TK_WORD) {
            return;
        }

        if (filterStopWords) {
            String coveredText = token.getCoveredText();
            if (StringUtils.isBlank(coveredText)
                    || stopWordSet.contains(coveredText.toLowerCase().trim())) {
                //no covered text or the word is a stop word, skip the word annotation
                return;
            }
        }

        //Create the WordToken annotation
        this.addOutputAnnotation(wordOutputType, aJCas, begin, end);
    }//createToken method

    /**
     * Given a character c return the type definition from the
     * list of public static type definitions in this class.
     * The mapping is driven by {@link Character#getType(char)}; marks, private use, surrogate and
     * modifier symbol characters are counted as letters, and control characters are split into
     * newline, whitespace and other control.
     *
     * @param c character to classify
     * @return type definition for the character c
     */
    private static int characterType(char c) {
        switch (Character.getType(c)) {
        //letters
        case Character.UPPERCASE_LETTER:
        case Character.LOWERCASE_LETTER:
        case Character.TITLECASE_LETTER:
        case Character.MODIFIER_LETTER:
        case Character.OTHER_LETTER:
        case Character.NON_SPACING_MARK:
        case Character.ENCLOSING_MARK:
        case Character.COMBINING_SPACING_MARK:
        case Character.PRIVATE_USE:
        case Character.SURROGATE:
        case Character.MODIFIER_SYMBOL:
            return TK_LETTER;
        //numbers
        case Character.DECIMAL_DIGIT_NUMBER:
        case Character.LETTER_NUMBER:
        case Character.OTHER_NUMBER:
            return TK_NUMBER;
        //Regular Whitespace
        case Character.SPACE_SEPARATOR:
            return TK_WHITESPACE;
        //Punctuation
        case Character.DASH_PUNCTUATION:
        case Character.START_PUNCTUATION:
        case Character.END_PUNCTUATION:
        case Character.OTHER_PUNCTUATION:
            return TK_PUNCTUATION;
        //Simple NewLine
        case Character.LINE_SEPARATOR:
        case Character.PARAGRAPH_SEPARATOR:
            return TK_NEWLINE;
        //Other types of "control" characters
        case Character.CONTROL:
            if (c == '\n' || c == '\r') {
                return TK_NEWLINE;
            }
            if (Character.isWhitespace(c)) { //Tab char is a "Control" character
                return TK_WHITESPACE;
            }
            return TK_CONTROL;
        default:
            if (Character.isWhitespace(c)) {
                return TK_WHITESPACE;
            } //if
            return TK_UNKNOWN;
        }//switch
    }//characterType method

    /**
     * Build the type system used by this annotator: the token type with its integer token type
     * feature, and the word token type, both derived from uima.tcas.Annotation.
     *
     * @return type system description containing the token and word token types
     * @throws RuntimeException wrapping any exception raised while building the description
     */
    @Override
    public LeoTypeSystemDescription getLeoTypeSystemDescription() {
        LeoTypeSystemDescription ftsd = new LeoTypeSystemDescription();
        try {
            ftsd.addType(TypeDescriptionBuilder.create(TOKEN_OUTPUT_TYPE_NAME, "", "uima.tcas.Annotation")
                    .addFeature(TOKEN_OUTPUT_TYPE_FEATURE_NAME, "", "uima.cas.Integer").getTypeDescription())
                    .addType(WORD_OUTPUT_TYPE_NAME, "Annotates collections of letters", "uima.tcas.Annotation");
        } catch (Exception e) {
            logger.warn("Exception occurred generating WhitespaceTypeSystem", e);
            throw new RuntimeException(e);
        } //catch
        return ftsd;
    }

    /**
     * Configuration parameter declarations; each name matches a parameter field of the annotator.
     */
    public static class Param {
        /**
         * The output type for tokens. This is the full type name, including package.
         */
        public static ConfigurationParameter TOKEN_OUTPUT_TYPE = new ConfigurationParameterImpl("tokenOutputType",
                "tokenOutputType", "String", true, false, new String[] {});

        /**
         * The feature on output type to set the token type (
         *          TK_SPECIAL       = 0;
         *          TK_NUMBER        = 1;
         *          TK_LETTER        = 2;
         *          TK_WORD          = 3;
         *          TK_WHITESPACE    = 4;
         *          TK_PUNCTUATION   = 5;
         *          TK_NEWLINE      = 6;
         *          TK_CONTROL      = 7;
         *          TK_IDENTIFIER   = 8;
         *          TK_INVALID      = -1;
         *          TK_UNKNOWN      = -2;
         *   )
         *
         * NOTE(review): the parameter is declared as "Integer" while the tokenOutputTypeFeature field
         * is a String holding the feature name - looks like this should be "String"; confirm before changing.
         */
        public static ConfigurationParameter TOKEN_OUTPUT_TYPE_FEATURE = new ConfigurationParameterImpl(
                "tokenOutputTypeFeature", "tokenOutputTypeFeature", "Integer", true, false, new String[] {});

        /**
         * The output type for words. This is the full type name, including package.
         */
        public static ConfigurationParameter WORD_OUTPUT_TYPE = new ConfigurationParameterImpl("wordOutputType",
                "wordOutputType", "String", true, false, new String[] {});

        /**
         * The full path to a stop words file. Words in the stop word file are not annotated as word token if found
         * in the document.
         */
        public static ConfigurationParameter STOP_WORDS_FILE = new ConfigurationParameterImpl("stopWordsFile",
                "stopWordsFile", "String", false, false, new String[] {});

    }
}