org.knime.ext.textprocessing.data.StanfordNERModelPortObject.java Source code

Java tutorial

Introduction

Here is the source code for org.knime.ext.textprocessing.data.StanfordNERModelPortObject.java

Source

/*
 * ------------------------------------------------------------------------
 *
 *  Copyright by KNIME AG, Zurich, Switzerland
 *  Website: http://www.knime.com; Email: contact@knime.com
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License, Version 3, as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses>.
 *
 *  Additional permission under GNU GPL version 3 section 7:
 *
 *  KNIME interoperates with ECLIPSE solely via ECLIPSE's plug-in APIs.
 *  Hence, KNIME and ECLIPSE are both independent programs and are not
 *  derived from each other. Should, however, the interpretation of the
 *  GNU GPL Version 3 ("License") under any applicable laws result in
 *  KNIME and ECLIPSE being a combined program, KNIME AG herewith grants
 *  you the additional permission to use and propagate KNIME together with
 *  ECLIPSE with only the license terms in place for ECLIPSE applying to
 *  ECLIPSE and the GNU GPL Version 3 applying for KNIME, provided the
 *  license terms of ECLIPSE themselves allow for the respective use and
 *  propagation of ECLIPSE together with KNIME.
 *
 *  Additional permission relating to nodes for KNIME that extend the Node
 *  Extension (and in particular that are based on subclasses of NodeModel,
 *  NodeDialog, and NodeView) and that only interoperate with KNIME through
 *  standard APIs ("Nodes"):
 *  Nodes are deemed to be separate and independent programs and to not be
 *  covered works.  Notwithstanding anything to the contrary in the
 *  License, the License does not apply to Nodes, you are not required to
 *  license Nodes under the License, and you are granted a license to
 *  prepare and propagate Nodes, in each case even if such Nodes are
 *  propagated with or for interoperation with KNIME.  The owner of a Node
 *  may freely choose the license terms applicable to such Node, including
 *  when such Node is propagated with or for interoperation with KNIME.
 * ---------------------------------------------------------------------
 *
 * History
 *   01.08.2016 (Julian): created
 */
package org.knime.ext.textprocessing.data;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.file.Files;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.zip.ZipEntry;

import org.apache.commons.io.IOUtils;
import org.knime.core.data.util.NonClosableInputStream;
import org.knime.core.data.util.NonClosableOutputStream;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.ModelContent;
import org.knime.core.node.ModelContentRO;
import org.knime.core.node.port.PortObjectSpec;
import org.knime.core.node.port.PortObjectZipInputStream;
import org.knime.core.node.port.PortObjectZipOutputStream;
import org.knime.core.node.port.PortType;
import org.knime.core.node.port.PortTypeRegistry;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

/**
 * This port object is a specific implementation of {@code NERModelPortObject} used for models trained by the
 * StanfordNLP NE Learner node.
 *
 * @author Julian Bunzel, KNIME.com, Berlin, Germany
 * @since 3.3
 */
public class StanfordNERModelPortObject extends NERModelPortObject<CRFClassifier<CoreLabel>> {

    /**
     * Define port type of objects of this class when used as PortObjects.
     */
    @SuppressWarnings("hiding")
    public static final PortType TYPE = PortTypeRegistry.getInstance()
            .getPortType(StanfordNERModelPortObject.class);

    /**
     * The serializer used to save/load the port object.
     * <p>
     * A port object is written as three consecutive ZIP entries: the XML configuration, the serialized model bytes
     * and the dictionary. {@link #loadPortObject(PortObjectZipInputStream, PortObjectSpec, ExecutionMonitor)} reads
     * the entries back purely by position (it does not inspect entry names), so the order used in
     * {@link #savePortObject(StanfordNERModelPortObject, PortObjectZipOutputStream, ExecutionMonitor)} must not be
     * changed.
     *
     * @author Julian Bunzel, KNIME.com, Berlin, Germany
     */
    public static final class Serializer extends AbstractSimplePortObjectSerializer<StanfordNERModelPortObject> {

        /** Name of the ZIP entry holding the XML configuration; also used as the name of the config object. */
        private static final String XML_CONFIG_NAME = "config.xml";

        /** Prefix of the ZIP entry holding the model; the port object's file extension is appended. */
        private static final String MODEL_FILE_PREFIX = "outputmodel";

        /** Name of the ZIP entry holding the dictionary. */
        private static final String DICT_FILE_NAME = "dict.bin";

        /** Config key of the tag value. */
        private static final String CFG_TAG_VALUE = "tagValue";

        /** Config key of the tag type. */
        private static final String CFG_TAG_TYPE = "tagType";

        /** Config key of the tokenizer name. */
        private static final String CFG_TOKENIZER_NAME = "tokenizerName";

        /** Tokenizer name used when the stored configuration has no tokenizer name entry. */
        private static final String DEFAULT_TOKENIZER_NAME = "OpenNLP English WordTokenizer";

        /**
         * {@inheritDoc}
         * <p>
         * Writes the configuration (tag value, tag type, tokenizer name), the model bytes and the dictionary bytes
         * as three ZIP entries, in that order.
         *
         * @throws IOException If writing to the stream fails or the dictionary cannot be converted/written.
         */
        @Override
        public void savePortObject(final StanfordNERModelPortObject portObject, final PortObjectZipOutputStream out,
                final ExecutionMonitor exec) throws IOException, CanceledExecutionException {
            // 1st entry: XML configuration
            out.putNextEntry(new ZipEntry(XML_CONFIG_NAME));
            final ModelContent config = new ModelContent(XML_CONFIG_NAME);
            config.addString(CFG_TAG_VALUE, portObject.getTagValue());
            config.addString(CFG_TAG_TYPE, portObject.getTagType());
            config.addString(CFG_TOKENIZER_NAME, portObject.getTokenizerName());
            // wrap the stream so that writing the XML does not close the underlying zip stream
            config.saveToXML(new NonClosableOutputStream.Zip(out));

            // 2nd entry: serialized model
            out.putNextEntry(new ZipEntry(MODEL_FILE_PREFIX + portObject.getFileExtension()));
            out.write(portObject.getByteArray());

            // 3rd entry: dictionary
            out.putNextEntry(new ZipEntry(DICT_FILE_NAME));
            try {
                out.write(portObject.getDictAsByteArray());
            } catch (final IOException e) {
                throw new IOException("Could not convert dictionary to byte array", e);
            }
        }

        /**
         * {@inheritDoc}
         * <p>
         * Reads the three ZIP entries in the order they were written by
         * {@link #savePortObject(StanfordNERModelPortObject, PortObjectZipOutputStream, ExecutionMonitor)}. If the
         * stored configuration contains no tokenizer name, {@code "OpenNLP English WordTokenizer"} is used.
         *
         * @throws IOException If reading from the stream fails or the port object cannot be re-created from the
         *             stored content; the underlying exception is attached as cause.
         */
        @Override
        public StanfordNERModelPortObject loadPortObject(final PortObjectZipInputStream in,
                final PortObjectSpec spec, final ExecutionMonitor exec)
                throws IOException, CanceledExecutionException {
            // 1st entry: XML configuration (wrapped so that XML loading does not close the zip stream)
            in.getNextEntry();
            final InputStream configStream = new NonClosableInputStream.Zip(in);
            final ModelContentRO config = ModelContent.loadFromXML(configStream);

            // 2nd entry: serialized model
            in.getNextEntry();
            final byte[] modelBytes = IOUtils.toByteArray(in);

            // 3rd entry: dictionary, one element per line
            in.getNextEntry();
            final Set<String> dictSet = readDictionary(IOUtils.toByteArray(in));

            try {
                final String tagValue = config.getString(CFG_TAG_VALUE);
                final String tagType = config.getString(CFG_TAG_TYPE);
                final String tokenizerName = config.containsKey(CFG_TOKENIZER_NAME)
                        ? config.getString(CFG_TOKENIZER_NAME)
                        : DEFAULT_TOKENIZER_NAME;
                return new StanfordNERModelPortObject(modelBytes, new Tag(tagValue, tagType), dictSet,
                        tokenizerName);
            } catch (final InvalidSettingsException e) {
                throw new IOException("Failed to deserialize port object", e);
            } catch (final Exception e) {
                // keep the root cause; without it the actual reason for the failure is not diagnosable
                throw new IOException("Could not create NLPModelPortObject", e);
            }
        }

        /**
         * Splits the raw dictionary bytes into lines and collects them in encounter order.
         *
         * @param dictBytes The content of the dictionary entry.
         * @return The dictionary lines without duplicates, in the order they were read.
         * @throws IOException If reading the lines fails.
         */
        private static Set<String> readDictionary(final byte[] dictBytes) throws IOException {
            final Set<String> entries = new LinkedHashSet<>();
            // NOTE(review): decoding uses the platform default charset. It has to match whatever
            // getDictAsByteArray() uses for encoding, which is not visible here -- confirm before changing.
            try (BufferedReader reader = new BufferedReader(new StringReader(new String(dictBytes)))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    entries.add(line);
                }
            }
            return entries;
        }
    }

    /**
     * Creates an instance of {@code StanfordNERModelPortObject}.
     *
     * @param outputBuffer The byte array containing the NER model.
     * @param tag The used tag.
     * @param dict The used dictionary.
     * @param tokenizerName The name of the tokenizer used for word tokenization.
     * @since 3.3
     */
    public StanfordNERModelPortObject(final byte[] outputBuffer, final Tag tag, final Set<String> dict,
            final String tokenizerName) {
        super(outputBuffer, tag, dict, tokenizerName);
    }

    /**
     * {@inheritDoc}
     * <p>
     * The classifier is loaded from the file returned by {@code getModelFile()}. That file is deleted afterwards,
     * both when loading succeeds and when it fails.
     *
     * @throws IOException If the model file cannot be created or if there are problems accessing the input stream.
     * @throws ClassNotFoundException If there are problems interpreting the serialized data.
     * @throws ClassCastException If there are problems interpreting the serialized data.
     */
    @Override
    public CRFClassifier<CoreLabel> getNERModel() throws IOException, ClassNotFoundException {
        final File modelFile = getModelFile();
        final CRFClassifier<CoreLabel> crf;
        try {
            crf = CRFClassifier.getClassifier(modelFile);
        } catch (final IOException | ClassNotFoundException | RuntimeException e) {
            // do not leave the model file behind when loading fails; a cleanup problem must not hide
            // the original failure, so it is only attached as suppressed exception
            try {
                Files.deleteIfExists(modelFile.toPath());
            } catch (final IOException cleanupFailure) {
                e.addSuppressed(cleanupFailure);
            }
            throw e;
        }
        Files.delete(modelFile.toPath());
        return crf;
    }

    /**
     * {@inheritDoc}
     *
     * @return The file extension {@code ".crf.ser.gz"}.
     */
    @Override
    public String getFileExtension() {
        return ".crf.ser.gz";
    }
}