opennlp.tools.namefind.TokenNameFinderFactory.java Source code

Java tutorial

Introduction

Here is the source code for opennlp.tools.namefind.TokenNameFinderFactory.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.namefind;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

import opennlp.tools.namefind.TokenNameFinderModel.FeatureGeneratorCreationError;
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.SequenceCodec;
import opennlp.tools.util.ext.ExtensionLoader;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
import opennlp.tools.util.featuregen.AggregatedFeatureGenerator;
import opennlp.tools.util.featuregen.BigramNameFeatureGenerator;
import opennlp.tools.util.featuregen.CachedFeatureGenerator;
import opennlp.tools.util.featuregen.GeneratorFactory;
import opennlp.tools.util.featuregen.OutcomePriorFeatureGenerator;
import opennlp.tools.util.featuregen.PreviousMapFeatureGenerator;
import opennlp.tools.util.featuregen.SentenceFeatureGenerator;
import opennlp.tools.util.featuregen.TokenClassFeatureGenerator;
import opennlp.tools.util.featuregen.TokenFeatureGenerator;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;

// The idea behind this factory is that most resources and implementations used
// by the name finder can be customized through this class.
// That only works if this is the central class used for both training and runtime.

/**
 * Factory that supplies the building blocks of a name finder: the feature
 * generator descriptor, the resources that descriptor may refer to, and the
 * {@link SequenceCodec}.
 *
 * <p>Values are taken either from what was handed to the factory directly
 * (constructor, {@link #create}) or, when the inherited {@code artifactProvider}
 * is set, from that provider.
 */
public class TokenNameFinderFactory extends BaseToolFactory {

    /** Classpath location of the feature generator descriptor used as a last resort. */
    private static final String DEFAULT_FEATURE_DESCRIPTOR = "/opennlp/tools/namefind/ner-default-features.xml";

    /** Raw feature generator descriptor; may be {@code null} until it is resolved lazily. */
    private byte[] featureGeneratorBytes;

    /** Resources looked up by key while building feature generators; may be {@code null}. */
    private Map<String, Object> resources;

    /** Codec returned by {@link #createSequenceCodec()} when no artifact provider is set. */
    private SequenceCodec<String> seqCodec;

    /**
     * Creates a {@link TokenNameFinderFactory} that provides the default implementation
     * of the resources. The sequence codec defaults to a {@link BioCodec}; no feature
     * generator descriptor and no resources are set.
     */
    public TokenNameFinderFactory() {
        this.seqCodec = new BioCodec();
    }

    /**
     * Creates a factory from explicitly supplied values. The arguments are stored
     * as given; no validation or copying is performed.
     *
     * @param featureGeneratorBytes the feature generator descriptor, or {@code null}
     * @param resources the resources the descriptor may refer to, or {@code null}
     * @param seqCodec the sequence codec to use
     */
    public TokenNameFinderFactory(byte[] featureGeneratorBytes, final Map<String, Object> resources,
            SequenceCodec<String> seqCodec) {
        init(featureGeneratorBytes, resources, seqCodec);
    }

    /**
     * Replaces all three values held by this factory with the given ones.
     *
     * @param featureGeneratorBytes the feature generator descriptor, or {@code null}
     * @param resources the resources the descriptor may refer to, or {@code null}
     * @param seqCodec the sequence codec to use
     */
    void init(byte[] featureGeneratorBytes, final Map<String, Object> resources, SequenceCodec<String> seqCodec) {
        this.featureGeneratorBytes = featureGeneratorBytes;
        this.resources = resources;
        this.seqCodec = seqCodec;
    }

    /**
     * Reads the default feature generator descriptor from the classpath.
     *
     * @return the complete content of the descriptor
     * @throws IllegalStateException if the descriptor is not on the classpath or cannot be read
     */
    private static byte[] loadDefaultFeatureGeneratorBytes() {

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (InputStream in = TokenNameFinderFactory.class.getResourceAsStream(DEFAULT_FEATURE_DESCRIPTOR)) {

            if (in == null) {
                throw new IllegalStateException("Classpath must contain ner-default-features.xml file!");
            }

            byte[] buf = new byte[1024];
            int len;
            // read() signals end of stream with -1; loop until exactly that value
            while ((len = in.read(buf)) != -1) {
                bytes.write(buf, 0, len);
            }
        } catch (IOException e) {
            // keep the cause so the underlying I/O failure stays diagnosable
            throw new IllegalStateException("Failed reading from ner-default-features.xml file on classpath!", e);
        }

        return bytes.toByteArray();
    }

    /**
     * @return the sequence codec held by this factory, as passed in or defaulted
     */
    protected SequenceCodec<String> getSequenceCodec() {
        return seqCodec;
    }

    /**
     * @return the resource map held by this factory; may be {@code null}
     */
    protected Map<String, Object> getResources() {
        return resources;
    }

    /**
     * @return the feature generator descriptor held by this factory; may be {@code null}
     */
    protected byte[] getFeatureGenerator() {
        return featureGeneratorBytes;
    }

    /**
     * Instantiates a factory and initializes it with the given values.
     *
     * @param subclassName the name of the factory class to instantiate through the
     *        {@link ExtensionLoader}, or {@code null} to use {@link TokenNameFinderFactory} itself
     * @param featureGeneratorBytes the feature generator descriptor, or {@code null}
     * @param resources the resources the descriptor may refer to, or {@code null}
     * @param seqCodec the sequence codec to use
     * @return the initialized factory
     * @throws InvalidFormatException if the named factory class cannot be instantiated
     */
    public static TokenNameFinderFactory create(String subclassName, byte[] featureGeneratorBytes,
            final Map<String, Object> resources, SequenceCodec<String> seqCodec) throws InvalidFormatException {
        TokenNameFinderFactory theFactory;
        if (subclassName == null) {
            // will create the default factory
            theFactory = new TokenNameFinderFactory();
        } else {
            try {
                theFactory = ExtensionLoader.instantiateExtension(TokenNameFinderFactory.class, subclassName);
            } catch (Exception e) {
                // The thrown exception carries both message and cause; reporting is left to the caller.
                String msg = "Could not instantiate the " + subclassName
                        + ". The initialization throw an exception.";
                throw new InvalidFormatException(msg, e);
            }
        }
        theFactory.init(featureGeneratorBytes, resources, seqCodec);
        return theFactory;
    }

    @Override
    public void validateArtifactMap() throws InvalidFormatException {
        // no additional artifacts
    }

    /**
     * Provides the sequence codec. When an artifact provider is set, the codec class
     * name is read from its manifest and instantiated via
     * {@link #instantiateSequenceCodec(String)}; otherwise the codec held by this
     * factory is returned.
     *
     * @return the sequence codec
     */
    public SequenceCodec<String> createSequenceCodec() {

        if (artifactProvider != null) {
            String sequenceCodecImplName = artifactProvider
                    .getManifestProperty(TokenNameFinderModel.SEQUENCE_CODEC_CLASS_NAME_PARAMETER);
            return instantiateSequenceCodec(sequenceCodecImplName);
        } else {
            return seqCodec;
        }
    }

    /**
     * Creates a {@link DefaultNameContextGenerator} backed by the generator returned
     * from {@link #createFeatureGenerators()}.
     *
     * @return a new context generator
     */
    public NameContextGenerator createContextGenerator() {

        AdaptiveFeatureGenerator featureGenerator = createFeatureGenerators();

        // Defensive fallback in case no generator was produced (for example by an
        // overriding subclass): use a hard-coded default feature set.
        if (featureGenerator == null) {
            featureGenerator = new CachedFeatureGenerator(
                    new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
                    new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 2, 2),
                    new OutcomePriorFeatureGenerator(), new PreviousMapFeatureGenerator(),
                    new BigramNameFeatureGenerator(), new SentenceFeatureGenerator(true, false));
        }

        return new DefaultNameContextGenerator(featureGenerator);
    }

    /**
     * Creates the {@link AdaptiveFeatureGenerator}. Usually this
     * is a set of generators contained in the {@link AggregatedFeatureGenerator}.
     *
     * <p>The descriptor is resolved in this order: the bytes held by this factory,
     * the descriptor artifact of the artifact provider (if one is set), and finally
     * the default descriptor bundled on the classpath. The resolved bytes are kept
     * in this factory for subsequent calls.
     *
     * <p>Resources referenced by the descriptor are looked up in the artifact
     * provider when one is set, otherwise in the resource map of this factory. A
     * missing resource map is treated like an empty one, i.e. the lookup yields
     * {@code null}; how a missing resource is reported is up to {@link GeneratorFactory}.
     *
     * Note:
     * The generators are created on every call to this method.
     *
     * @return the feature generator built from the resolved descriptor
     * @throws FeatureGeneratorCreationError if the descriptor is invalid
     * @throws IllegalStateException if the default descriptor cannot be loaded or
     *         reading the descriptor fails
     */
    public AdaptiveFeatureGenerator createFeatureGenerators() {

        if (featureGeneratorBytes == null && artifactProvider != null) {
            featureGeneratorBytes = artifactProvider
                    .getArtifact(TokenNameFinderModel.GENERATOR_DESCRIPTOR_ENTRY_NAME);
        }

        if (featureGeneratorBytes == null) {
            featureGeneratorBytes = loadDefaultFeatureGeneratorBytes();
        }

        InputStream descriptorIn = new ByteArrayInputStream(featureGeneratorBytes);

        AdaptiveFeatureGenerator generator;
        try {
            generator = GeneratorFactory.create(descriptorIn, key -> {
                if (artifactProvider != null) {
                    return artifactProvider.getArtifact(key);
                }
                // resources is null after the default constructor or create(..., null, ...);
                // answer "not found" instead of failing with a NullPointerException
                return resources != null ? resources.get(key) : null;
            });
        } catch (InvalidFormatException e) {
            // It is assumed that the creation of the feature generation does not
            // fail after it succeeded once during model loading.

            // But it might still be possible that such an exception is thrown,
            // in this case the caller should not be forced to handle the exception
            // and a Runtime Exception is thrown instead.

            // If the re-creation of the feature generation fails it is assumed
            // that this can only be caused by a programming mistake and therefore
            // throwing a Runtime Exception is reasonable

            throw new FeatureGeneratorCreationError(e);
        } catch (IOException e) {
            throw new IllegalStateException("Reading from mem cannot result in an I/O error", e);
        }

        return generator;
    }

    /**
     * Instantiates the sequence codec with the given class name through the
     * {@link ExtensionLoader}.
     *
     * @param sequenceCodecImplName the codec class name, or {@code null} for the default
     * @return the instantiated codec, or a new {@link BioCodec} if no name is given
     */
    public static SequenceCodec<String> instantiateSequenceCodec(String sequenceCodecImplName) {

        if (sequenceCodecImplName != null) {
            // Only the raw class literal is available, so the element type cannot be
            // checked here; the codec is assumed to operate on String outcomes.
            @SuppressWarnings("unchecked")
            SequenceCodec<String> codec =
                    ExtensionLoader.instantiateExtension(SequenceCodec.class, sequenceCodecImplName);
            return codec;
        } else {
            // If nothing is specified return old default!
            return new BioCodec();
        }
    }
}