org.knime.ext.textprocessing.nodes.misc.bratdocumentwriter.BratDocumentWriterNodeModel.java Source code

Java tutorial

Introduction

Here is the source code for org.knime.ext.textprocessing.nodes.misc.bratdocumentwriter.BratDocumentWriterNodeModel.java

Source

/*
 * ------------------------------------------------------------------------
 *
 *  Copyright by KNIME AG, Zurich, Switzerland
 *  Website: http://www.knime.com; Email: contact@knime.com
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License, Version 3, as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses>.
 *
 *  Additional permission under GNU GPL version 3 section 7:
 *
 *  KNIME interoperates with ECLIPSE solely via ECLIPSE's plug-in APIs.
 *  Hence, KNIME and ECLIPSE are both independent programs and are not
 *  derived from each other. Should, however, the interpretation of the
 *  GNU GPL Version 3 ("License") under any applicable laws result in
 *  KNIME and ECLIPSE being a combined program, KNIME AG herewith grants
 *  you the additional permission to use and propagate KNIME together with
 *  ECLIPSE with only the license terms in place for ECLIPSE applying to
 *  ECLIPSE and the GNU GPL Version 3 applying for KNIME, provided the
 *  license terms of ECLIPSE themselves allow for the respective use and
 *  propagation of ECLIPSE together with KNIME.
 *
 *  Additional permission relating to nodes for KNIME that extend the Node
 *  Extension (and in particular that are based on subclasses of NodeModel,
 *  NodeDialog, and NodeView) and that only interoperate with KNIME through
 *  standard APIs ("Nodes"):
 *  Nodes are deemed to be separate and independent programs and to not be
 *  covered works.  Notwithstanding anything to the contrary in the
 *  License, the License does not apply to Nodes, you are not required to
 *  license Nodes under the License, and you are granted a license to
 *  prepare and propagate Nodes, in each case even if such Nodes are
 *  propagated with or for interoperation with KNIME.  The owner of a Node
 *  may freely choose the license terms applicable to such Node, including
 *  when such Node is propagated with or for interoperation with KNIME.
 * ---------------------------------------------------------------------
 *
 * History
 *   Oct 18, 2018 (dewi): created
 */
package org.knime.ext.textprocessing.nodes.misc.bratdocumentwriter;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.SystemUtils;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.container.CloseableRowIterator;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModelBoolean;
import org.knime.core.node.defaultnodesettings.SettingsModelString;
import org.knime.core.node.util.CheckUtils;
import org.knime.core.util.FileUtil;
import org.knime.ext.textprocessing.data.Document;
import org.knime.ext.textprocessing.data.DocumentValue;
import org.knime.ext.textprocessing.data.IndexedTerm;
import org.knime.ext.textprocessing.util.ColumnSelectionVerifier;
import org.knime.ext.textprocessing.util.DataTableSpecVerifier;
import org.knime.ext.textprocessing.util.DocumentUtil;

/**
 * The {@link NodeModel} for the Brat Document Writer. This node writes document tags and terms in an .ann file and the
 * document text in a .txt file.
 *
 * @author Andisa Dewi, KNIME AG, Berlin, Germany
 */
final class BratDocumentWriterNodeModel extends NodeModel {

    /**
     * The delimiter splitting the document title from its body.
     */
    private static final String TITLE_DELIMITER = "\n";

    /**
     * Boolean to check whether the OS is windows.
     */
    private static final boolean IS_WINDOWS = SystemUtils.IS_OS_WINDOWS;

    /**
     * The Logger for BratDocumentWriterNodeModel.
     */
    private static final NodeLogger LOGGER = NodeLogger.getLogger(BratDocumentWriterNodeModel.class);

    /**
     * The SettingsModelString for the document column.
     */
    private final SettingsModelString m_docColModel = BratDocumentWriterNodeDialog.getDocColModel();

    /**
     * The SettingsModelString for the directory path.
     */
    private final SettingsModelString m_directoryModel = BratDocumentWriterNodeDialog.getDirectoryModel();

    /**
     * The SettingsModelBoolean for the overwrite flag.
     */
    private final SettingsModelBoolean m_overwriteModel = BratDocumentWriterNodeDialog.getOverwriteModel();

    /**
     * The SettingsModelString for the file name prefix.
     */
    private final SettingsModelString m_prefixModel = BratDocumentWriterNodeDialog.getPrefixModel();

    /**
     * The SettingsModelString for the file name suffix.
     */
    private final SettingsModelString m_suffixModel = BratDocumentWriterNodeDialog.getSuffixModel();

    /**
     * The constructor of the Brat Document Writer node. The node has one input and no output port.
     */
    BratDocumentWriterNodeModel() {
        super(1, 0);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected DataTableSpec[] configure(final DataTableSpec[] inSpecs) throws InvalidSettingsException {
        checkDataTableSpec(inSpecs[0]);

        // check target directory
        CheckUtils.checkDestinationDirectory(m_directoryModel.getStringValue());

        // check suffix and prefix for invalid chars
        checkForInvalidChars(m_prefixModel.getStringValue());
        checkForInvalidChars(m_suffixModel.getStringValue());

        return new DataTableSpec[] {};
    }

    /**
     * Check the input data table spec.
     *
     * @param spec the data table spec to be checked
     * @throws InvalidSettingsException
     */
    private final void checkDataTableSpec(final DataTableSpec spec) throws InvalidSettingsException {
        // check that input spec has at least 1 document column
        DataTableSpecVerifier verifier = new DataTableSpecVerifier(spec);
        verifier.verifyMinimumDocumentCells(1, true);

        ColumnSelectionVerifier.verifyColumn(m_docColModel, spec, DocumentValue.class, null)
                .ifPresent(msg -> setWarningMessage(msg));
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec)
            throws Exception {
        final DataTableSpec inputSpec = inData[0].getDataTableSpec();
        checkDataTableSpec(inputSpec);
        final int docColIndex = inputSpec.findColumnIndex(m_docColModel.getStringValue());

        final double rowCount = inData[0].size();
        long currRow = 0;
        try (final CloseableRowIterator it = inData[0].iterator()) {
            int countMissing = 0;

            while (it.hasNext()) {
                final DataRow row = it.next();
                // get document cell from original data table
                final DataCell docCell = row.getCell(docColIndex);
                // if cell is not missing, try to read the doc and write to files
                if (!docCell.isMissing()) {
                    final Document doc = ((DocumentValue) docCell).getDocument();
                    // add prefix and suffix to filename if available
                    // and verify that the filename does not contain forbidden symbols
                    // if it is okay, try to write to files
                    writeDocumentToFiles(doc, buildFilename(row.getKey().getString()));
                } else { // otherwise count as missing
                    countMissing++;
                    LOGGER.debug("Skipping row " + row.getKey().getString() + " since the cell is missing.");
                }
                // report status
                final long fCurrRow = ++currRow;
                exec.setProgress(currRow / rowCount, () -> "Processing document " + fCurrRow + " of " + rowCount);
                exec.checkCanceled();
            }
            if (countMissing > 0) {
                setWarningMessage("Skipped " + countMissing + " rows due to missing values.");
            }
            if (rowCount == 0) {
                setWarningMessage("Input table is empty.");
            }
        }
        return new BufferedDataTable[] {};
    }

    /**
     * Write the document text and its tags and terms to .txt and .ann files. If there is an error while writing, it
     * will first try to delete both of the files, and then throw an exception.
     *
     * @param doc the document to be stored
     * @param filename the supposed name of the file to be written
     * @throws InvalidSettingsException if the file path is problematic
     * @throws InvalidPathException if the file path looks like a file system path but is invalid
     * @throws URISyntaxException if the passed URL does not conform with RFC2396 for URIs
     * @throws IOException if an I/O error occurs
     */
    private void writeDocumentToFiles(final Document doc, final String filename)
            throws InvalidSettingsException, InvalidPathException, URISyntaxException, IOException {
        final String dirPath = m_directoryModel.getStringValue();
        // check the directory path
        final String dirWarning = CheckUtils.checkDestinationDirectory(dirPath);
        // set a warning message if there is one
        if (dirWarning != null) {
            setWarningMessage(dirWarning);
        }
        // add extensions to the filename
        final String txtFilename = dirPath + "/" + filename + ".txt";
        final String annFilename = dirPath + "/" + filename + ".ann";

        try {
            // write document text to txt file
            writeToFile(createOutputStream(txtFilename), doc.getTitle().isEmpty() ? doc.getText()
                    : String.join(TITLE_DELIMITER, doc.getTitle(), doc.getDocumentBodyText()));

            // fetch and write the tags and terms
            writeToFile(createOutputStream(annFilename),
                    convertToString(DocumentUtil.getIndexedTerms(doc, true, TITLE_DELIMITER)));
        } catch (final IOException e) {
            // if something is wrong mid writing, try to delete both files
            try {
                deleteFile(txtFilename);
                deleteFile(annFilename);
            } catch (final IOException ex) {
                // if an error occurs while deleting the files then
                // nothing we can do
            }
            throw e;
            // for remote files nothing will be done
        }
    }

    /**
     * Add prefix and suffix to filename if exist and then check the filename if it contains any forbidden symbol.
     *
     * @param filename the file name
     * @return verified file name with its suffix and prefix
     * @throws InvalidSettingsException if the file name contains forbidden symbol
     */
    private String buildFilename(final String filename) throws InvalidSettingsException {
        String result = filename;
        if (!m_prefixModel.getStringValue().isEmpty()) {
            result = m_prefixModel.getStringValue() + filename;
        }
        if (!m_suffixModel.getStringValue().isEmpty()) {
            result += m_suffixModel.getStringValue();
        }
        checkForInvalidChars(result);
        checkReservedNamesInWindows(result);

        return result;
    }

    /**
     * Verify the file name to make sure it does not contain any forbidden symbol.
     *
     * @param filename the file name
     * @throws InvalidSettingsException if the file name contains forbidden symbol
     */
    static void checkForInvalidChars(final String filename) throws InvalidSettingsException {
        // forbid /:?<>*"|\
        Pattern pattern = Pattern.compile("[/:?<>*\"|\\\\]");
        Matcher matcher = pattern.matcher(filename);
        if (matcher.find()) {
            final int invalidIdx = matcher.start();
            throw new InvalidSettingsException(
                    "Invalid file name: contains invalid char " + filename.charAt(invalidIdx));
        }
    }

    /**
     * Check if the filename is the same as any filenames that are reserved in Windows.
     *
     * @param filename the filename to be checked
     * @throws InvalidSettingsException if the filename is a reserved name in Windows
     */
    private static void checkReservedNamesInWindows(final String filename) throws InvalidSettingsException {
        if (IS_WINDOWS) {
            Pattern forbiddenWindowsNames = Pattern.compile("^(CON|PRN|AUX|NUL|COM[1-9]|LPT[1-9])$");
            Matcher matcher = forbiddenWindowsNames.matcher(filename);
            if (matcher.find()) {
                throw new InvalidSettingsException(
                        "Invalid file name: the file name " + filename + " is forbidden on Windows.");
            }
        }
    }

    /**
     * Delete local file.
     *
     * @param filepath the to be deleted file path
     * @throws IOException if an I/O error occurs
     */
    private static void deleteFile(final String filepath) throws IOException {
        final Path path = Paths.get(filepath);
        // check if file exists, if yes then delete
        if (Files.exists(path)) {
            Files.delete(path);
        }
        // if file does not exist anyway, do nothing
    }

    /**
     * Open an output stream based on a given file path.
     *
     * @param filepath the input file path
     * @throws InvalidSettingsException if the file path is problematic
     * @throws InvalidPathException if the file path looks like a file system path but is invalid
     * @throws URISyntaxException if the passed URL does not conform with RFC2396 for URIs
     * @throws IOException if an I/O error occurs
     */
    private OutputStream createOutputStream(final String filepath)
            throws InvalidSettingsException, InvalidPathException, IOException, URISyntaxException {
        // check the validity of file path
        final String warning = CheckUtils.checkDestinationFile(filepath, m_overwriteModel.getBooleanValue());
        // set a warning message if there is one
        if (warning != null) {
            setWarningMessage(warning);
        }
        final URL url = FileUtil.toURL(filepath);
        final Path localPath = FileUtil.resolveToPath(url);
        if (localPath != null) {
            return new BufferedOutputStream(Files.newOutputStream(localPath));
        } else {
            return new BufferedOutputStream(FileUtil.openOutputConnection(url, "PUT").getOutputStream());
        }
    }

    /**
     * Write strings to the output stream.
     *
     * @param out the output stream
     * @param content the string to be written
     * @throws IOException if an I/O error occurs
     */
    private static void writeToFile(final OutputStream out, final String content) throws IOException {
        out.write(content.getBytes());
        out.flush();
        out.close();
    }

    /**
     * Concatenate all the terms and tags into one string. Each line contains one term with one particular tag. So if a
     * term has multiple tags, each one will be written in one line.
     *
     * An example of a line is like this: T1<tab>Location 61 69<tab>Germany
     *
     * Where T1 is the term index (Brat-style), Location is the tag, both 61 and 69 are start and stop index of the term
     * respectively, while Germany is the term.
     *
     * @param list the list of the terms
     * @return the string containing all the terms
     */
    private static String convertToString(final List<IndexedTerm> list) {
        StringBuilder out = new StringBuilder();
        int idx = 1;
        for (IndexedTerm obj : list) {
            List<String> tags = obj.getTagValues();
            for (String tag : tags) {
                out.append("T" + idx++);
                out.append("\t");
                out.append(tag);
                out.append(" ");
                out.append(obj.getStartIndex());
                out.append(" ");
                out.append(obj.getStopIndex());
                out.append("\t");
                out.append(obj.getTermValue());
                out.append("\n");
            }
        }
        return out.toString();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadInternals(final File nodeInternDir, final ExecutionMonitor exec)
            throws IOException, CanceledExecutionException {
        // nothing to do
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveInternals(final File nodeInternDir, final ExecutionMonitor exec)
            throws IOException, CanceledExecutionException {
        // nothing to do
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveSettingsTo(final NodeSettingsWO settings) {
        m_docColModel.saveSettingsTo(settings);
        m_directoryModel.saveSettingsTo(settings);
        m_overwriteModel.saveSettingsTo(settings);
        m_prefixModel.saveSettingsTo(settings);
        m_suffixModel.saveSettingsTo(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void validateSettings(final NodeSettingsRO settings) throws InvalidSettingsException {
        m_docColModel.validateSettings(settings);
        m_directoryModel.validateSettings(settings);
        m_overwriteModel.validateSettings(settings);
        m_prefixModel.validateSettings(settings);
        m_suffixModel.validateSettings(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadValidatedSettingsFrom(final NodeSettingsRO settings) throws InvalidSettingsException {
        m_docColModel.loadSettingsFrom(settings);
        m_directoryModel.loadSettingsFrom(settings);
        m_overwriteModel.loadSettingsFrom(settings);
        m_prefixModel.loadSettingsFrom(settings);
        m_suffixModel.loadSettingsFrom(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void reset() {
        // nothing to do
    }

}