com.textocat.textokit.dictmatcher.TaggedChunkerBuilderResource.java Source code

Java tutorial

Introduction

Here is the source code for com.textocat.textokit.dictmatcher.TaggedChunkerBuilderResource.java

Source

/*
 *    Copyright 2015 Textocat
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 */

package com.textocat.textokit.dictmatcher;

import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.textocat.textokit.chunk.Chunker;
import com.textocat.textokit.chunk.ChunkerBuilder;
import com.textocat.textokit.resource.SpringResourceLocator;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.resource.ExternalResourceDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

/**
 * @author Rinat Gareev
 */
public class TaggedChunkerBuilderResource extends SpringResourceLocator {

    public static final String PARAM_CHUNKER_BUILDER_CLASS = "chunkerBuilderClass";

    public static ExternalResourceDescription createDescription(String resourceLocation) {
        return createDescription(resourceLocation, "com.textocat.textokit.dictmatcher.mensa.MensaChunkerBuilder");
    }

    public static ExternalResourceDescription createDescription(String resourceLocation,
            Class<? extends ChunkerBuilder> builderClass) {
        return createDescription(resourceLocation, builderClass.getName());
    }

    public static ExternalResourceDescription createDescription(String resourceLocation, String builderClassName) {
        return ExternalResourceFactory.createExternalResourceDescription(TaggedChunkerBuilderResource.class,
                PARAM_CHUNKER_BUILDER_CLASS, builderClassName, PARAM_RESOURCE_LOCATION, resourceLocation);
    }

    @ConfigurationParameter(name = PARAM_CHUNKER_BUILDER_CLASS)
    private Class<? extends ChunkerBuilder> chunkerBuilderClass;
    // state fields
    private final Logger log = LoggerFactory.getLogger(getClass());
    private Chunker<String> chunker;

    @Override
    public boolean initialize(ResourceSpecifier aSpecifier, Map<String, Object> aAdditionalParams)
            throws ResourceInitializationException {
        if (!super.initialize(aSpecifier, aAdditionalParams))
            return false;
        //
        ChunkerBuilder<String> builder;
        try {
            //noinspection unchecked
            builder = chunkerBuilderClass.newInstance();
        } catch (Exception e) {
            throw new ResourceInitializationException(e);
        }
        //
        try (InputStream in = resourceMeta.getInputStream()) {
            LineIterator lineIterator = IOUtils.lineIterator(in, "UTF-8");
            int lineNum = 0;
            while (lineIterator.hasNext()) {
                lineNum++;
                DictEntry de = parseLine(lineIterator.nextLine(), lineNum);
                if (de != null)
                    builder.addEntry(de.tokens, de.tag);
            }
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
        chunker = builder.build();
        return true;
    }

    private DictEntry parseLine(String line, int lineNum) {
        line = line.trim();
        if (line.isEmpty()) {
            return null;
        }
        if (line.startsWith("#")) {
            return null;
        }
        String[] tokTagSplit = line.split(TAG_DELIMITER);
        if (tokTagSplit.length != 2)
            cantParseLine(line);
        String tag = tokTagSplit[1].trim().intern();
        Iterable<String> tokens = TOKEN_SPLITTER.split(tokTagSplit[0]);
        if (Iterables.isEmpty(tokens)) {
            log.warn("Line {} contains empty record!", lineNum);
            return null;
        }
        return new DictEntry(tokens, tag);
    }

    public static final String TOKEN_SEPARATOR = " ";
    public static final String TAG_DELIMITER = "\t";
    private static Splitter TOKEN_SPLITTER = Splitter.on(TOKEN_SEPARATOR).trimResults().omitEmptyStrings();

    private void cantParseLine(String line) {
        throw new IllegalStateException("Can't parse line:\n" + line);
    }

    @Override
    public Object getResource() {
        return chunker;
    }

    private class DictEntry {
        private final Iterable<String> tokens;
        private final String tag;

        public DictEntry(Iterable<String> tokens, String tag) {
            this.tokens = tokens;
            this.tag = tag;
        }
    }
}