org.knime.ext.textprocessing.nodes.transformation.stringstodocument.StringsToDocumentCellFactory.java Source code

Java tutorial

Introduction

Here is the source code for org.knime.ext.textprocessing.nodes.transformation.stringstodocument.StringsToDocumentCellFactory.java

Source

/*
 * ------------------------------------------------------------------------
 *  Copyright by KNIME AG, Zurich, Switzerland
 *  Website: http://www.knime.com; Email: contact@knime.com
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License, Version 3, as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses>.
 *
 *  Additional permission under GNU GPL version 3 section 7:
 *
 *  KNIME interoperates with ECLIPSE solely via ECLIPSE's plug-in APIs.
 *  Hence, KNIME and ECLIPSE are both independent programs and are not
 *  derived from each other. Should, however, the interpretation of the
 *  GNU GPL Version 3 ("License") under any applicable laws result in
 *  KNIME and ECLIPSE being a combined program, KNIME AG herewith grants
 *  you the additional permission to use and propagate KNIME together with
 *  ECLIPSE with only the license terms in place for ECLIPSE applying to
 *  ECLIPSE and the GNU GPL Version 3 applying for KNIME, provided the
 *  license terms of ECLIPSE themselves allow for the respective use and
 *  propagation of ECLIPSE together with KNIME.
 *
 *  Additional permission relating to nodes for KNIME that extend the Node
 *  Extension (and in particular that are based on subclasses of NodeModel,
 *  NodeDialog, and NodeView) and that only interoperate with KNIME through
 *  standard APIs ("Nodes"):
 *  Nodes are deemed to be separate and independent programs and to not be
 *  covered works.  Notwithstanding anything to the contrary in the
 *  License, the License does not apply to Nodes, you are not required to
 *  license Nodes under the License, and you are granted a license to
 *  prepare and propagate Nodes, in each case even if such Nodes are
 *  propagated with or for interoperation with KNIME.  The owner of a Node
 *  may freely choose the license terms applicable to such Node, including
 *  when such Node is propagated with or for interoperation with KNIME.
 * ---------------------------------------------------------------------
 *
 * History
 *   29.07.2008 (thiel): created
 */
package org.knime.ext.textprocessing.nodes.transformation.stringstodocument;

import java.text.ParseException;
import java.time.LocalDate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.concurrent.ConcurrentException;
import org.apache.commons.lang3.concurrent.LazyInitializer;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataRow;
import org.knime.core.data.StringValue;
import org.knime.core.data.container.AbstractCellFactory;
import org.knime.core.data.date.DateAndTimeValue;
import org.knime.core.data.filestore.FileStoreFactory;
import org.knime.core.data.time.localdate.LocalDateValue;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.util.CheckUtils;
import org.knime.ext.textprocessing.data.Author;
import org.knime.ext.textprocessing.data.DocumentBuilder;
import org.knime.ext.textprocessing.data.DocumentCategory;
import org.knime.ext.textprocessing.data.DocumentSource;
import org.knime.ext.textprocessing.data.DocumentType;
import org.knime.ext.textprocessing.data.PublicationDate;
import org.knime.ext.textprocessing.data.SectionAnnotation;
import org.knime.ext.textprocessing.preferences.TextprocessingPreferenceInitializer;
import org.knime.ext.textprocessing.util.DataCellCache;
import org.knime.ext.textprocessing.util.LRUDataCellCache;
import org.knime.ext.textprocessing.util.TextContainerDataCellFactory;
import org.knime.ext.textprocessing.util.TextContainerDataCellFactoryBuilder;

/**
 * A {@code CellFactory} that builds one document per data row. The given {@link StringsToDocumentConfig} instance
 * specifies which columns of the row to use as title, full text, authors, etc.
 *
 * @author Hermann Azong, KNIME.com, Berlin, Germany
 * @deprecated
 */
@Deprecated
public class StringsToDocumentCellFactory extends AbstractCellFactory {

    private static final NodeLogger LOGGER = NodeLogger.getLogger(StringsToDocumentCellFactory.class);

    // Standard date pattern for the strings to document node: two digits (day), two digits (month) and four
    // digits (year), separated by dashes.
    private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})-([\\d]{2})-([\\d]{4})");

    private final StringsToDocumentConfig m_config;

    // The cache is created lazily on first use, see initializeDataCellCache().
    private final LazyInitializer<DataCellCache> m_cacheInitializer;

    // Set inside the lazy initializer and read in afterProcessing(). Parallel processing is enabled in the
    // constructor, so the writer may be a worker thread; volatile makes the write visible to the reader.
    private volatile boolean m_cacheCreated = false;

    // Name of the tokenizer handed to every DocumentBuilder created by this factory.
    private final String m_tokenizerName;

    /**
     * Creates new instance of {@code StringsToDocumentCellFactory} with given configuration. The tokenizer name is
     * taken from {@link TextprocessingPreferenceInitializer#tokenizerName()}.
     *
     * @param config The configuration how to build a document.
     * @param newColSpecs The specs of the new columns that are created.
     * @param numberOfThreads The number of parallel threads to use.
     * @throws IllegalArgumentException If given configuration is {@code null}.
     * @since 3.1
     * @deprecated Use {@link #StringsToDocumentCellFactory(StringsToDocumentConfig, DataColumnSpec[], int, String)} for
     *             tokenizer selection.
     */
    @Deprecated
    public StringsToDocumentCellFactory(final StringsToDocumentConfig config, final DataColumnSpec[] newColSpecs,
            final int numberOfThreads) throws IllegalArgumentException {
        this(config, newColSpecs, numberOfThreads, TextprocessingPreferenceInitializer.tokenizerName());
    }

    /**
     * Creates new instance of {@code StringsToDocumentCellFactory} with given configuration.
     *
     * @param config The configuration how to build a document.
     * @param newColSpecs The specs of the new columns that are created.
     * @param numberOfThreads The number of parallel threads to use.
     * @param tokenizerName The tokenizer used for word tokenization.
     * @throws IllegalArgumentException If given configuration is {@code null}.
     * @since 3.3
     */
    public StringsToDocumentCellFactory(final StringsToDocumentConfig config, final DataColumnSpec[] newColSpecs,
            final int numberOfThreads, final String tokenizerName) throws IllegalArgumentException {
        super(newColSpecs);

        if (config == null) {
            throw new IllegalArgumentException("Configuration object may not be null!");
        }

        this.setParallelProcessing(true, numberOfThreads, 10 * numberOfThreads);

        m_cacheInitializer = new LazyInitializer<DataCellCache>() {
            @Override
            protected DataCellCache initialize() throws ConcurrentException {
                final DataCellCache dataCellCache = initializeDataCellCache();
                // Remember that there is a cache that has to be closed in afterProcessing().
                m_cacheCreated = true;
                return dataCellCache;
            }
        };
        m_config = config;
        m_tokenizerName = tokenizerName;
    }

    /**
     * Callback from the lazy initializer; creates the data cell cache backed by a document cell factory.
     *
     * @return the newly created cache
     * @throws IllegalStateException (via {@link CheckUtils#checkState}) if no file store factory is available
     */
    private DataCellCache initializeDataCellCache() {
        final TextContainerDataCellFactory docCellFac = TextContainerDataCellFactoryBuilder
                .createDocumentCellFactory();
        final FileStoreFactory fileStoreFactory = getFileStoreFactory();
        CheckUtils.checkState(fileStoreFactory != null, "File store factory not expected to be null at this point");
        docCellFac.prepare(fileStoreFactory);
        return new LRUDataCellCache(docCellFac);
    }

    /**
     * @return the cache from the lazy initializer (created on first call)
     * @throws RuntimeException wrapping the {@link ConcurrentException} if the initializer fails
     */
    private DataCellCache getDataCellCache() {
        try {
            return m_cacheInitializer.get();
        } catch (ConcurrentException e) {
            throw new RuntimeException("Couldn't retrieve data cell cache", e);
        }
    }

    /**
     * Builds a document from the given row according to the configuration and returns it as the single new cell.
     * Title, full text, authors, source, category, type and publication date are applied in that order.
     *
     * {@inheritDoc}
     */
    @Override
    public DataCell[] getCells(final DataRow row) {
        final DocumentBuilder docBuilder = new DocumentBuilder(m_tokenizerName);

        addTitle(docBuilder, row);
        addFulltext(docBuilder, row);
        addAuthors(docBuilder, row);
        addSource(docBuilder, row);
        addCategory(docBuilder, row);
        docBuilder.setDocumentType(DocumentType.stringToDocumentType(m_config.getDocType()));
        addPublicationDate(docBuilder, row);

        return new DataCell[]{getDataCellCache().getInstance(docBuilder.createDocument())};
    }

    /**
     * Closes data cell cache, if one has been created.
     *
     * @since 2.8
     */
    @Override
    public void afterProcessing() {
        super.afterProcessing();
        // Only touch the initializer if a cache exists, otherwise get() would create one just to close it.
        if (m_cacheCreated) {
            getDataCellCache().close();
        }
    }

    /** Returns the string value of the cell, or an empty string if it is missing or not string compatible. */
    private static String stringValueOrEmpty(final DataCell cell) {
        if (!cell.isMissing() && cell.getType().isCompatible(StringValue.class)) {
            return ((StringValue)cell).getStringValue();
        }
        return "";
    }

    /**
     * Returns the string of the given column if a column is to be used and its index is valid, otherwise the
     * given default value.
     */
    private static String columnValueOrDefault(final DataRow row, final boolean useColumn, final int columnIndex,
            final String defaultValue) {
        if (useColumn && columnIndex >= 0) {
            return stringValueOrEmpty(row.getCell(columnIndex));
        }
        return defaultValue;
    }

    /** Adds the title: the title column value if configured (and the index is valid), otherwise the row key. */
    private void addTitle(final DocumentBuilder docBuilder, final DataRow row) {
        if (!m_config.getUseTitleColumn()) {
            docBuilder.addTitle(row.getKey().toString());
            return;
        }
        final int titleIndex = m_config.getTitleStringIndex();
        // No title is added at all if the title column is requested but its index is invalid.
        if (titleIndex >= 0) {
            docBuilder.addTitle(stringValueOrEmpty(row.getCell(titleIndex)));
        }
    }

    /** Adds the full text column value as a single section, if a full text column index is configured. */
    private void addFulltext(final DocumentBuilder docBuilder, final DataRow row) {
        final int fulltextIndex = m_config.getFulltextStringIndex();
        if (fulltextIndex >= 0) {
            docBuilder.addSection(stringValueOrEmpty(row.getCell(fulltextIndex)), SectionAnnotation.UNKNOWN);
        }
    }

    /**
     * Adds the authors: parsed from the authors column if configured, otherwise (or if the cell is missing) the
     * author configured in the dialog, provided its first or last name is not empty.
     */
    private void addAuthors(final DocumentBuilder docBuilder, final DataRow row) {
        if (!m_config.getUseAuthorsColumn()) {
            addConfiguredAuthorIfPresent(docBuilder);
            return;
        }
        final int authorsIndex = m_config.getAuthorsStringIndex();
        if (authorsIndex < 0) {
            return;
        }
        final DataCell authorsCell = row.getCell(authorsIndex);
        if (authorsCell.isMissing()) {
            // Missing value in the authors column: fall back to the configured author.
            addConfiguredAuthorIfPresent(docBuilder);
            return;
        }
        if (!authorsCell.getType().isCompatible(StringValue.class)) {
            return;
        }
        final String authors = ((StringValue)authorsCell).getStringValue();
        // NOTE(review): String.split interprets the configured separator as a regular expression - confirm
        // that this is intended for separators containing regex meta characters.
        for (final String author : authors.split(m_config.getAuthorsSplitChar())) {
            docBuilder.addAuthor(parseAuthor(author));
        }
    }

    /** Adds the author configured in the dialog unless both its first and last name are empty. */
    private void addConfiguredAuthorIfPresent(final DocumentBuilder docBuilder) {
        if (!m_config.getAuthorFirstName().isEmpty() || !m_config.getAuthorLastName().isEmpty()) {
            docBuilder.addAuthor(new Author(m_config.getAuthorFirstName(), m_config.getAuthorLastName()));
        }
    }

    /**
     * Splits a single author string at spaces: the last token becomes the last name, all preceding tokens the
     * first name. Configured first/last names are used for the parts that cannot be derived from the string.
     */
    private Author parseAuthor(final String author) {
        String firstName = m_config.getAuthorFirstName();
        String lastName = m_config.getAuthorLastName();

        final String[] names = author.split(" ");
        if (names.length > 1) {
            final StringBuilder firstNames = new StringBuilder();
            for (int i = 0; i < names.length - 1; i++) {
                firstNames.append(names[i]).append(' ');
            }
            // The trailing space is removed by the trim() below.
            firstName = firstNames.toString();
            lastName = names[names.length - 1];
        } else if (names.length == 1) {
            lastName = names[0];
        }
        return new Author(firstName.trim(), lastName.trim());
    }

    /** Adds the document source (from column or configuration) unless it is empty. */
    private void addSource(final DocumentBuilder docBuilder, final DataRow row) {
        final String docSource = columnValueOrDefault(row, m_config.getUseSourceColumn(),
            m_config.getSourceStringIndex(), m_config.getDocSource());
        if (!docSource.isEmpty()) {
            docBuilder.addDocumentSource(new DocumentSource(docSource));
        }
    }

    /** Adds the document category (from column or configuration) unless it is empty. */
    private void addCategory(final DocumentBuilder docBuilder, final DataRow row) {
        final String docCat = columnValueOrDefault(row, m_config.getUseCatColumn(),
            m_config.getCategoryStringIndex(), m_config.getDocCat());
        if (!docCat.isEmpty()) {
            docBuilder.addDocumentCategory(new DocumentCategory(docCat));
        }
    }

    /**
     * Sets the publication date, either from the configured column (local date, legacy date and time, or string
     * cells are supported) or from the date string of the configuration.
     */
    private void addPublicationDate(final DocumentBuilder docBuilder, final DataRow row) {
        if (!m_config.getUsePubDateColumn()) {
            extractAndSetPublicationDate(m_config.getPublicationDate(), docBuilder);
            return;
        }
        final int pubDateIndex = m_config.getPubDateStringIndex();
        if (pubDateIndex < 0) {
            return;
        }
        final DataCell pubDateCell = row.getCell(pubDateIndex);
        if (pubDateCell.isMissing()) {
            return;
        }
        if (pubDateCell.getType().isCompatible(LocalDateValue.class)) {
            // new LocalDate type
            final LocalDate date = ((LocalDateValue)pubDateCell).getLocalDate();
            setPublicationDate(docBuilder, date.getYear(), date.getMonthValue(), date.getDayOfMonth());
        } else if (pubDateCell.getType().isCompatible(DateAndTimeValue.class)) {
            final DateAndTimeValue dateTime = (DateAndTimeValue)pubDateCell;
            // The month of the legacy type is shifted by one before it is handed over.
            setPublicationDate(docBuilder, dateTime.getYear(), dateTime.getMonth() + 1,
                dateTime.getDayOfMonth());
        } else if (pubDateCell.getType().isCompatible(StringValue.class)) {
            extractAndSetPublicationDate(((StringValue)pubDateCell).getStringValue(), docBuilder);
        }
    }

    /**
     * Extracts and sets the date if incoming date information is a String. Strings that are {@code null} or do
     * not match {@link #DATE_PATTERN} are ignored.
     */
    private void extractAndSetPublicationDate(final String dateStr, final DocumentBuilder docBuilder) {
        if (dateStr == null) {
            return;
        }
        final Matcher m = DATE_PATTERN.matcher(dateStr);
        if (m.matches()) {
            // Groups are day, month, year; the setter expects year, month, day.
            setPublicationDate(docBuilder, Integer.parseInt(m.group(3)), Integer.parseInt(m.group(2)),
                Integer.parseInt(m.group(1)));
        }
    }

    /**
     * Sets the publication date to the document builder. A date that is rejected with a {@link ParseException}
     * is logged and otherwise ignored, the document is built without it.
     */
    private void setPublicationDate(final DocumentBuilder docBuilder, final int year, final int month,
            final int day) {
        try {
            docBuilder.setPublicationDate(new PublicationDate(year, month, day));
        } catch (ParseException e) {
            LOGGER.info("Publication date could not be set! (year=" + year + ", month=" + month + ", day=" + day
                + ")", e);
        }
    }
}