org.jesterj.ingest.scanners.JdbcScanner.java Source code

Java tutorial

Introduction

Here is the source code for org.jesterj.ingest.scanners.JdbcScanner.java

Source

/*
 * Copyright 2016 Needham Software LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.jesterj.ingest.scanners;

import com.copyright.easiertest.SimpleProperty;
import com.google.common.io.CharStreams;
import net.jini.space.JavaSpace;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.io.IOUtils;
import org.jesterj.ingest.model.ConfiguredBuildable;
import org.jesterj.ingest.model.Document;
import org.jesterj.ingest.model.Router;
import org.jesterj.ingest.model.exception.ConfigurationException;
import org.jesterj.ingest.model.exception.PersistenceException;
import org.jesterj.ingest.model.impl.DocumentImpl;
import org.jesterj.ingest.model.impl.ScannerImpl;
import org.jesterj.ingest.utils.SqlUtils;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.sql.Blob;
import java.sql.Clob;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.function.Consumer;
import java.util.function.Function;

/**
 * Scans a JDBC source such as an RDBMS (e.g. MySQL). Obtains a connection through the specified
 * JDBC driver and extracts rows using the specified SQL statement; converts extracted rows to
 * documents and passes the documents downstream.
 *
 * @author dgoldenberg
 */
public class JdbcScanner extends ScannerImpl {

    private static final Logger log = LogManager.getLogger();

    private String jdbcDriver;
    private String jdbcUrl;
    private String jdbcUser;
    private String jdbcPassword;
    private String sqlStatement;
    private String table;
    private transient volatile boolean ready;

    // For MySQL, this may need to be set by user to Integer.MIN_VALUE:
    // http://stackoverflow.com/questions/2447324/streaming-large-result-sets-with-mysql
    // http://stackoverflow.com/questions/24098247/jdbc-select-batching-fetch-size-with-mysql
    // http://stackoverflow.com/questions/20899977/what-and-when-should-i-specify-setfetchsize
    private int fetchSize = -1;

    private boolean autoCommit;
    private int queryTimeout = -1;

    // The (optional) name for the column that contains the document content
    private String contentColumn;

    private SqlUtils sqlUtils = new SqlUtils();

    // Use the ISO 8601 date format supported by Lucene, e.g. 2011-12-03T10:15:30Z
    private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ISO_INSTANT;

    @Override
    public Function<String, String> getIdFunction() {
        return s -> s;
    }

    @Override
    public Consumer<Document> getDocumentTracker() {
        return document -> {
        };
    }

    @Override
    public void activate() {
        ready = true;
        super.activate();
    }

    @Override
    public void deactivate() {
        ready = false;
        super.deactivate();
    }

    @Override
    public Runnable getScanOperation() {
        return () -> {

            log.info("connecting to database {}", jdbcUrl);
            // Establish a connection and execute the query.
            this.ready = false;
            scanStarted();
            try (Connection conn = sqlUtils.createJdbcConnection(jdbcDriver, jdbcUrl, jdbcUser, jdbcPassword,
                    autoCommit);
                    Statement statement = createStatement(conn);
                    ResultSet rs = statement.executeQuery(sqlStatement)) {
                log.info("Successfully queried database {}", jdbcUrl);

                String[] columnNames = getColumnNames(rs);
                int docIdColumnIdx = getDocIdColumnIndex(columnNames, getPlan().getDocIdField());

                // For each row
                while (rs.next()) {
                    String docId = rs.getString(docIdColumnIdx);
                    docId = jdbcUrl + "/" + table + "/" + docId;
                    Document doc = makeDoc(rs, columnNames, docId);
                    JdbcScanner.this.docFound(doc);
                }

            } catch (ConfigurationException | PersistenceException | SQLException ex) {
                log.error("JDBC scanner error.", ex);
            }
            scanFinished();
            this.ready = true;
        };
    }

    /**
     * Creates a document from a result set row.
     *
     * @param rs          the result set
     * @param columnNames the column names
     * @param docId       the document ID to use
     * @return the document instance
     * @throws SQLException if the connection to the database has a problem
     */
    Document makeDoc(ResultSet rs, String[] columnNames, String docId) throws SQLException {
        // TODO - deletion tracking. Configure a separate query to identify soft deletes.
        // TODO - query Cassandra for whether the ID is in it, if so then it's an update

        byte[] rawBytes = getContentBytes(rs);

        DocumentImpl doc = new DocumentImpl(rawBytes, docId, getPlan(), Document.Operation.NEW, JdbcScanner.this);

        // For each column value
        for (int i = 1; i <= columnNames.length; i++) {
            // Skip over the content column; if any is specified, we've already processed it as raw bytes.
            // similarly skip over the ID field as well.
            String columnName = columnNames[i - 1];
            if (!columnName.equals(contentColumn) && !columnName.equals(doc.getIdField())) {
                Object value = rs.getObject(i);
                String strValue;
                if (value != null) {
                    // Take care of java.sql.Date, java.sql.Time, and java.sql.Timestamp
                    if (value instanceof Date) {
                        strValue = convertDateToString(value);
                    } else {
                        strValue = value.toString();
                    }
                    doc.put(columnName, strValue);
                }
            }
        }

        return doc;
    }

    private byte[] getContentBytes(ResultSet rs) throws SQLException {
        byte[] rawBytes = null;

        // If the content column was specified
        if (StringUtils.isNotBlank(contentColumn)) {
            // Get its value.
            Object content = rs.getObject(contentColumn);

            if (content != null) {
                // Clob
                if (content instanceof Clob) {
                    Clob clob = (Clob) content;
                    try (Reader reader = clob.getCharacterStream()) {
                        rawBytes = CharStreams.toString(reader).getBytes();
                    } catch (IOException ex) {
                        String msg = String.format("I/O error while reading value of content column '%s'.",
                                contentColumn);
                        log.error(msg, ex);
                    }
                }
                // Blob
                else if (content instanceof Blob) {
                    Blob blob = (Blob) content;
                    try (InputStream stream = blob.getBinaryStream()) {
                        rawBytes = IOUtils.toByteArray(stream);
                    } catch (IOException ex) {
                        String msg = String.format("I/O error while reading value of content column '%s'.",
                                contentColumn);
                        log.error(msg, ex);
                    }
                }
                // Date (unlikely, but)
                else if (content instanceof Date) {
                    rawBytes = convertDateToString(content).getBytes();
                }
                // Anything else
                else {
                    rawBytes = content.toString().getBytes();
                }
            }
        }
        return rawBytes;
    }

    private static String convertDateToString(Object value) {
        Instant instant = Instant.ofEpochMilli(((Date) value).getTime());
        return DATE_FORMATTER.format(instant);
    }

    // Creates a statement to execute
    private Statement createStatement(Connection conn) throws SQLException {
        Statement statement = conn.createStatement(java.sql.ResultSet.TYPE_FORWARD_ONLY,
                java.sql.ResultSet.CONCUR_READ_ONLY);
        if (fetchSize != -1) {
            statement.setFetchSize(fetchSize);
        }
        if (queryTimeout > 0) {
            statement.setQueryTimeout(queryTimeout);
        }
        return statement;
    }

    // Gets the list of columns from the result set
    private String[] getColumnNames(ResultSet rs) throws SQLException {
        ResultSetMetaData meta = rs.getMetaData();
        String[] names = new String[meta.getColumnCount()];

        for (int i = 0; i < names.length; i++) {
            // NB: we're getting the label here, to support the SQL's "select a AS b".
            // If label not available, we'll get the plain column name.
            names[i] = meta.getColumnLabel(i + 1);
        }
        return names;
    }

    // Gets the document ID column index
    private int getDocIdColumnIndex(String[] columnNames, String docIdColumnName) throws PersistenceException {
        int itemIdColNum = -1;
        if (docIdColumnName != null) {
            for (int i = 0; i < columnNames.length; i++) {
                if (columnNames[i].equals(docIdColumnName)) {
                    itemIdColNum = i + 1; // + 1 for JDBC
                    break;
                }
            }
            if (itemIdColNum == -1) {
                throw new PersistenceException(String.format(
                        "The document ID column could not be found in the SQL result set. docIdColumn: '%s', SQL: %s, columns: %s.",
                        docIdColumnName, sqlStatement, String.join(", ", (CharSequence[]) columnNames)));
            }
        }
        return itemIdColNum;
    }

    @Override
    public boolean isReady() {
        return ready;
    }

    /**
     * Handles configuration parameters for the JDBC scanner.
     *
     * @author dgoldenberg
     */
    public static class Builder extends ScannerImpl.Builder {

        private JdbcScanner obj;

        public Builder() {
            if (whoAmI() == this.getClass()) {
                obj = new JdbcScanner();
            }
        }

        private Class<?> whoAmI() {
            return new Object() {
            }.getClass().getEnclosingMethod().getDeclaringClass();
        }

        // TODO DG - validate method

        public Builder withJdbcDriver(String jdbcDriver) {
            getObject().jdbcDriver = jdbcDriver;
            return this;
        }

        public Builder withJdbcUrl(String jdbcUrl) {
            getObject().jdbcUrl = jdbcUrl;
            return this;
        }

        public Builder withJdbcUser(String jdbcUser) {
            getObject().jdbcUser = jdbcUser;
            return this;
        }

        public Builder withJdbcPassword(String jdbcPassword) {
            getObject().jdbcPassword = jdbcPassword;
            return this;
        }

        public Builder withSqlStatement(String sqlStatement) {
            getObject().sqlStatement = sqlStatement;
            return this;
        }

        public Builder representingTable(String table) {
            getObject().table = table;
            return this;
        }

        public Builder withContentColumn(String contentColumn) {
            getObject().contentColumn = contentColumn;
            return this;
        }

        public Builder withFetchSize(int fetchSize) {
            getObject().fetchSize = fetchSize;
            return this;
        }

        public Builder withAutoCommit(boolean autoCommit) {
            getObject().autoCommit = autoCommit;
            return this;
        }

        public Builder withQueryTimeout(int queryTimeout) {
            getObject().queryTimeout = queryTimeout;
            return this;
        }

        @Override
        public JdbcScanner.Builder batchSize(int size) {
            super.batchSize(size);
            return this;
        }

        @Override
        public JdbcScanner.Builder outputSpace(JavaSpace outputSpace) {
            super.outputSpace(outputSpace);
            return this;
        }

        @Override
        public JdbcScanner.Builder inputSpace(JavaSpace inputSpace) {
            super.inputSpace(inputSpace);
            return this;
        }

        @Override
        public JdbcScanner.Builder named(String stepName) {
            super.named(stepName);
            return this;
        }

        @Override
        public JdbcScanner.Builder routingBy(ConfiguredBuildable<? extends Router> router) {
            super.routingBy(router);
            return this;
        }

        @Override
        public JdbcScanner.Builder scanFreqMS(long interval) {
            super.scanFreqMS(interval);
            return this;
        }

        @Override
        public ScannerImpl build() {
            if (obj.jdbcDriver == null || obj.jdbcPassword == null || obj.jdbcUser == null || obj.jdbcUrl == null
                    || obj.table == null) {
                throw new IllegalStateException(
                        "jdbc driver, password, user, url, and the table being represented " + "must be supplied");
            }
            JdbcScanner tmp = obj;
            this.obj = new JdbcScanner();
            return tmp;
        }

        @Override
        protected JdbcScanner getObject() {
            return obj;
        }
    }

    @SimpleProperty
    public String getJdbcDriver() {
        return jdbcDriver;
    }

    @SimpleProperty
    public String getJdbcUrl() {
        return jdbcUrl;
    }

    @SimpleProperty
    public String getJdbcUser() {
        return jdbcUser;
    }

    @SimpleProperty
    public String getJdbcPassword() {
        return jdbcPassword;
    }

    @SimpleProperty
    public String getSqlStatement() {
        return sqlStatement;
    }

    @SimpleProperty
    public int getFetchSize() {
        return fetchSize;
    }

    @SimpleProperty
    public boolean isAutoCommit() {
        return autoCommit;
    }

    @SimpleProperty
    public int getQueryTimeout() {
        return queryTimeout;
    }

    @SimpleProperty
    public String getContentColumn() {
        return contentColumn;
    }
}