// Java tutorial
/* * ------------------------------------------------------------------------ * Copyright by KNIME AG, Zurich, Switzerland * Website: http://www.knime.com; Email: contact@knime.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License, Version 3, as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see <http://www.gnu.org/licenses>. * * Additional permission under GNU GPL version 3 section 7: * * KNIME interoperates with ECLIPSE solely via ECLIPSE's plug-in APIs. * Hence, KNIME and ECLIPSE are both independent programs and are not * derived from each other. Should, however, the interpretation of the * GNU GPL Version 3 ("License") under any applicable laws result in * KNIME and ECLIPSE being a combined program, KNIME AG herewith grants * you the additional permission to use and propagate KNIME together with * ECLIPSE with only the license terms in place for ECLIPSE applying to * ECLIPSE and the GNU GPL Version 3 applying for KNIME, provided the * license terms of ECLIPSE themselves allow for the respective use and * propagation of ECLIPSE together with KNIME. * * Additional permission relating to nodes for KNIME that extend the Node * Extension (and in particular that are based on subclasses of NodeModel, * NodeDialog, and NodeView) and that only interoperate with KNIME through * standard APIs ("Nodes"): * Nodes are deemed to be separate and independent programs and to not be * covered works. 
Notwithstanding anything to the contrary in the * License, the License does not apply to Nodes, you are not required to * license Nodes under the License, and you are granted a license to * prepare and propagate Nodes, in each case even if such Nodes are * propagated with or for interoperation with KNIME. The owner of a Node * may freely choose the license terms applicable to such Node, including * when such Node is propagated with or for interoperation with KNIME. * --------------------------------------------------------------------- * * History * 29.07.2008 (thiel): created */ package org.knime.ext.textprocessing.nodes.transformation.stringstodocument; import java.text.ParseException; import java.time.LocalDate; import org.apache.commons.lang3.concurrent.ConcurrentException; import org.apache.commons.lang3.concurrent.LazyInitializer; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataRow; import org.knime.core.data.StringValue; import org.knime.core.data.container.AbstractCellFactory; import org.knime.core.data.container.CellFactory; import org.knime.core.data.filestore.FileStoreFactory; import org.knime.core.data.time.localdate.LocalDateValue; import org.knime.core.node.NodeLogger; import org.knime.core.node.util.CheckUtils; import org.knime.ext.textprocessing.data.Author; import org.knime.ext.textprocessing.data.DocumentBuilder; import org.knime.ext.textprocessing.data.DocumentCategory; import org.knime.ext.textprocessing.data.DocumentSource; import org.knime.ext.textprocessing.data.DocumentType; import org.knime.ext.textprocessing.data.PublicationDate; import org.knime.ext.textprocessing.data.SectionAnnotation; import org.knime.ext.textprocessing.preferences.TextprocessingPreferenceInitializer; import org.knime.ext.textprocessing.util.DataCellCache; import org.knime.ext.textprocessing.util.LRUDataCellCache; import org.knime.ext.textprocessing.util.TextContainerDataCellFactory; import 
org.knime.ext.textprocessing.util.TextContainerDataCellFactoryBuilder; /** * A {@link CellFactory} implementation to build a document for each data row. The given * {@code StringsToDocumentConfig2} instance specifies which columns of the row to use as title, text authors, etc. * * @author Hermann Azong & Julian Bunzel, KNIME.com, Berlin, Germany * @since 3.5 */ final class StringsToDocumentCellFactory2 extends AbstractCellFactory { private static final NodeLogger LOGGER = NodeLogger.getLogger(StringsToDocumentCellFactory2.class); private final StringsToDocumentConfig2 m_config; private final LazyInitializer<DataCellCache> m_cacheInitializer; private boolean m_cacheCreated = false; private String m_tokenizerName = TextprocessingPreferenceInitializer.tokenizerName(); /** * Creates new instance of {@code StringsToDocumentCellFactory2} with given configuration. * * @param config The configuration how to build a document. * @param newColSpecs The specs of the new columns that are created. * @param numberOfThreads The number of parallel threads to use. * @param tokenizerName The tokenizer used for word tokenization. * @throws IllegalArgumentException If given configuration is {@code null}. */ public StringsToDocumentCellFactory2(final StringsToDocumentConfig2 config, final DataColumnSpec[] newColSpecs, final int numberOfThreads, final String tokenizerName) throws IllegalArgumentException { super(newColSpecs); this.setParallelProcessing(true, numberOfThreads, 10 * numberOfThreads); if (config == null) { throw new IllegalArgumentException("Configuration object may not be null!"); } m_cacheInitializer = new LazyInitializer<DataCellCache>() { @Override protected DataCellCache initialize() throws ConcurrentException { DataCellCache dataCellCache = initializeDataCellCache(); m_cacheCreated = true; return dataCellCache; } }; m_config = config; m_tokenizerName = tokenizerName; } /** Callback from initializer - only be called when executing. 
*/ private DataCellCache initializeDataCellCache() { final TextContainerDataCellFactory docCellFac = TextContainerDataCellFactoryBuilder .createDocumentCellFactory(); final FileStoreFactory fileStoreFactory = getFileStoreFactory(); CheckUtils.checkState(fileStoreFactory != null, "File store factory not expected to be null at this point"); docCellFac.prepare(fileStoreFactory); return new LRUDataCellCache(docCellFac); } /** @return the cache from the initializer, not null. Throws RuntimeException if needed. */ private DataCellCache getDataCellCache() { DataCellCache dataCellCache; try { dataCellCache = m_cacheInitializer.get(); } catch (ConcurrentException e) { throw new RuntimeException("Couldn't retrieve data cell cache", e); } return dataCellCache; } /** * {@inheritDoc} */ @Override public DataCell[] getCells(final DataRow row) { final DocumentBuilder docBuilder = new DocumentBuilder(m_tokenizerName); // Set title if (m_config.getUseTitleColumn()) { final DataCell titleCell = row.getCell(m_config.getTitleColumnIndex()); if (!titleCell.isMissing()) { docBuilder.addTitle(((StringValue) titleCell).getStringValue()); } } else if (m_config.getTitleMode().contentEquals(StringsToDocumentConfig2.TITLEMODE_ROWID)) { docBuilder.addTitle(row.getKey().toString()); } //Set fulltext final DataCell textCell = row.getCell(m_config.getFulltextColumnIndex()); if (!textCell.isMissing()) { docBuilder.addSection(((StringValue) textCell).getStringValue(), SectionAnnotation.UNKNOWN); } // Set authors if (m_config.getUseAuthorsColumn()) { final DataCell authorsCell = row.getCell(m_config.getAuthorsColumnIndex()); if (!authorsCell.isMissing()) { final String authors = ((StringValue) authorsCell).getStringValue(); final String[] authorsArr = authors.split(m_config.getAuthorsSplitChar()); for (String author : authorsArr) { String firstName = ""; String lastName = ""; final String[] names = author.split(" "); if (names.length > 1) { final StringBuilder sb = new StringBuilder(); for (int i = 
0; i < names.length - 1; i++) { sb.append(names[i]); sb.append(" "); } firstName = sb.toString(); lastName = names[names.length - 1]; } else if (names.length == 1) { lastName = names[0]; } docBuilder.addAuthor(new Author(firstName.trim(), lastName.trim())); } } } else if (!m_config.getAuthorFirstName().isEmpty() || !m_config.getAuthorLastName().isEmpty()) { docBuilder.addAuthor(new Author(m_config.getAuthorFirstName(), m_config.getAuthorLastName())); } // set document source if (m_config.getUseSourceColumn()) { final DataCell sourceCell = row.getCell(m_config.getSourceColumnIndex()); if (!sourceCell.isMissing()) { docBuilder.addDocumentSource(new DocumentSource(((StringValue) sourceCell).getStringValue())); } } else if (m_config.getDocSource().length() > 0) { docBuilder.addDocumentSource(new DocumentSource(m_config.getDocSource())); } // set document category if (m_config.getUseCatColumn()) { final DataCell catCell = row.getCell(m_config.getCategoryColumnIndex()); if (!catCell.isMissing()) { docBuilder.addDocumentCategory(new DocumentCategory(((StringValue) catCell).getStringValue())); } } else if (m_config.getDocCat().length() > 0) { docBuilder.addDocumentCategory(new DocumentCategory(m_config.getDocCat())); } // set document type docBuilder.setDocumentType(DocumentType.stringToDocumentType(m_config.getDocType())); // set publication date if (m_config.getUsePubDateColumn()) { final DataCell pubDateCell = row.getCell(m_config.getPubDateColumnIndex()); if (!pubDateCell.isMissing()) { LocalDate date = ((LocalDateValue) pubDateCell).getLocalDate(); setPublicationDate(docBuilder, date.getYear(), date.getMonthValue(), date.getDayOfMonth()); } } else { LocalDate date = m_config.getPublicationDate(); setPublicationDate(docBuilder, date.getYear(), date.getMonthValue(), date.getDayOfMonth()); } // return datacells cells return new DataCell[] { getDataCellCache().getInstance(docBuilder.createDocument()) }; } /** * Closes data cell cache. 
*/ @Override public void afterProcessing() { super.afterProcessing(); if (m_cacheCreated) { getDataCellCache().close(); } } // sets the publication date to the document builder private void setPublicationDate(final DocumentBuilder docBuilder, final int year, final int month, final int day) { try { docBuilder.setPublicationDate(new PublicationDate(year, month, day)); } catch (ParseException e) { LOGGER.info("Publication date could not be set!"); } } }