org.apache.nifi.util.hive.HiveJdbcCommon.java Source code

Introduction

Here is the source code for org.apache.nifi.util.hive.HiveJdbcCommon.java, a utility class from Apache NiFi that provides common functions for converting JDBC/Hive result sets into Avro and CSV streams.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nifi.util.hive;

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.SchemaBuilder.FieldAssembler;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.nifi.components.PropertyDescriptor;

import java.io.IOException;
import java.io.OutputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static java.sql.Types.ARRAY;
import static java.sql.Types.BIGINT;
import static java.sql.Types.BINARY;
import static java.sql.Types.BIT;
import static java.sql.Types.BLOB;
import static java.sql.Types.BOOLEAN;
import static java.sql.Types.CHAR;
import static java.sql.Types.CLOB;
import static java.sql.Types.DATE;
import static java.sql.Types.DECIMAL;
import static java.sql.Types.DOUBLE;
import static java.sql.Types.FLOAT;
import static java.sql.Types.INTEGER;
import static java.sql.Types.JAVA_OBJECT;
import static java.sql.Types.LONGNVARCHAR;
import static java.sql.Types.LONGVARBINARY;
import static java.sql.Types.LONGVARCHAR;
import static java.sql.Types.NCHAR;
import static java.sql.Types.NUMERIC;
import static java.sql.Types.NVARCHAR;
import static java.sql.Types.REAL;
import static java.sql.Types.ROWID;
import static java.sql.Types.SMALLINT;
import static java.sql.Types.STRUCT;
import static java.sql.Types.TIME;
import static java.sql.Types.TIMESTAMP;
import static java.sql.Types.TINYINT;
import static java.sql.Types.VARBINARY;
import static java.sql.Types.VARCHAR;

/**
 * JDBC / HiveQL common functions.
 */
public class HiveJdbcCommon {

    public static final String AVRO = "Avro";
    public static final String CSV = "CSV";

    public static final String MIME_TYPE_AVRO_BINARY = "application/avro-binary";
    public static final String CSV_MIME_TYPE = "text/csv";

    public static final PropertyDescriptor NORMALIZE_NAMES_FOR_AVRO = new PropertyDescriptor.Builder()
            .name("hive-normalize-avro").displayName("Normalize Table/Column Names")
            .description(
                    "Whether to change non-Avro-compatible characters in column names to Avro-compatible characters. For example, colons and periods "
                            + "will be changed to underscores in order to build a valid Avro record.")
            .allowableValues("true", "false").defaultValue("false").required(true).build();

    public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, final int maxRows,
            boolean convertNames) throws SQLException, IOException {
        return convertToAvroStream(rs, outStream, null, maxRows, convertNames, null);
    }

    public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, String recordName,
            final int maxRows, boolean convertNames, ResultSetRowCallback callback)
            throws SQLException, IOException {
        final Schema schema = createSchema(rs, recordName, convertNames);
        final GenericRecord rec = new GenericData.Record(schema);

        final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
        try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
            dataFileWriter.create(schema, outStream);

            final ResultSetMetaData meta = rs.getMetaData();
            final int nrOfColumns = meta.getColumnCount();
            long nrOfRows = 0;
            while (rs.next()) {
                if (callback != null) {
                    callback.processRow(rs);
                }
                for (int i = 1; i <= nrOfColumns; i++) {
                    final int javaSqlType = meta.getColumnType(i);
                    Object value = rs.getObject(i);

                    if (value == null) {
                        rec.put(i - 1, null);

                    } else if (javaSqlType == BINARY || javaSqlType == VARBINARY || javaSqlType == LONGVARBINARY
                            || javaSqlType == BLOB || javaSqlType == CLOB) {
                        // Binary types require slightly different handling
                        ByteBuffer bb = null;
                        if (value instanceof byte[]) {
                            bb = ByteBuffer.wrap((byte[]) value);
                        } else if (value instanceof ByteBuffer) {
                            bb = (ByteBuffer) value;
                        }
                        if (bb != null) {
                            rec.put(i - 1, bb);
                        } else {
                            throw new IOException(
                                    "Could not process binary object of type " + value.getClass().getName());
                        }

                    } else if (value instanceof Byte) {
                        // The tinyint(1) type is reported by the JDBC driver as java.sql.Types.TINYINT,
                        // but the value is returned as java.lang.Byte (at least the H2 JDBC driver
                        // behaves this way). A direct put into the Avro record results in:
                        // org.apache.avro.AvroRuntimeException: Unknown datum type java.lang.Byte
                        rec.put(i - 1, ((Byte) value).intValue());

                    } else if (value instanceof BigDecimal || value instanceof BigInteger) {
                        // Avro can't handle BigDecimal and BigInteger as numbers - it will throw an AvroRuntimeException such as: "Unknown datum type: java.math.BigDecimal: 38"
                        rec.put(i - 1, value.toString());

                    } else if (value instanceof Number) {
                        // Need to call the right getXYZ() method (instead of the getObject() call above),
                        // since some drivers return a Double even when the JDBC type is FLOAT (6), for example.
                        if (javaSqlType == FLOAT) {
                            value = rs.getFloat(i);
                        } else if (javaSqlType == DOUBLE) {
                            value = rs.getDouble(i);
                        } else if (javaSqlType == INTEGER || javaSqlType == TINYINT || javaSqlType == SMALLINT) {
                            value = rs.getInt(i);
                        }

                        rec.put(i - 1, value);

                    } else if (value instanceof Boolean) {
                        rec.put(i - 1, value);
                    } else {
                        // The supported types are numbers (int, long, double, float), booleans, and Strings.
                        // Since Avro doesn't provide timestamp types, those are converted to Strings; so
                        // anything other than a number or boolean is converted via its toString() method.
                        rec.put(i - 1, value.toString());
                    }
                }
                dataFileWriter.append(rec);
                nrOfRows += 1;

                if (maxRows > 0 && nrOfRows == maxRows) {
                    break;
                }
            }

            return nrOfRows;
        }
    }

    public static Schema createSchema(final ResultSet rs, boolean convertNames) throws SQLException {
        return createSchema(rs, null, convertNames);
    }

    /**
     * Creates an Avro schema from a result set. If the table/record name is known a priori and provided, use that as a
     * fallback for the record name if it cannot be retrieved from the result set, and finally fall back to a default value.
     *
     * @param rs         The result set to convert to Avro
     * @param recordName The a priori record name to use if it cannot be determined from the result set.
     * @param convertNames  Whether to convert column/table names to be legal Avro names
     * @return A Schema object representing the result set converted to an Avro record
     * @throws SQLException if any error occurs during conversion
     */
    public static Schema createSchema(final ResultSet rs, String recordName, boolean convertNames)
            throws SQLException {
        final ResultSetMetaData meta = rs.getMetaData();
        final int nrOfColumns = meta.getColumnCount();
        String tableName = StringUtils.isEmpty(recordName) ? "NiFi_SelectHiveQL_Record" : recordName;
        try {
            if (nrOfColumns > 0) {
                // Hive JDBC doesn't support getTableName; instead, it returns table.column as the
                // column name, so grab the table name from the first column
                String firstColumnNameFromMeta = meta.getColumnName(1);
                int tableNameDelimiter = firstColumnNameFromMeta.lastIndexOf(".");
                if (tableNameDelimiter > -1) {
                    String tableNameFromMeta = firstColumnNameFromMeta.substring(0, tableNameDelimiter);
                    if (!StringUtils.isBlank(tableNameFromMeta)) {
                        tableName = tableNameFromMeta;
                    }
                }
            }
        } catch (SQLException se) {
            // Not all drivers support getTableName, so just use the previously-set default
        }

        if (convertNames) {
            tableName = normalizeNameForAvro(tableName);
        }
        final FieldAssembler<Schema> builder = SchemaBuilder.record(tableName).namespace("any.data").fields();

        // Some types (Decimal, Date/Time) have no direct Avro equivalent; they are mapped to
        // strings below and may need some additional work.
        for (int i = 1; i <= nrOfColumns; i++) {
            String columnNameFromMeta = meta.getColumnName(i);
            // Hive returns table.column as the column name; take the part after the last period
            int columnNameDelimiter = columnNameFromMeta.lastIndexOf(".");
            String columnName = columnNameFromMeta.substring(columnNameDelimiter + 1);
            switch (meta.getColumnType(i)) {
            case CHAR:
            case LONGNVARCHAR:
            case LONGVARCHAR:
            case NCHAR:
            case NVARCHAR:
            case VARCHAR:
            case ARRAY:
            case STRUCT:
            case JAVA_OBJECT:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion()
                        .noDefault();
                break;

            case BIT:
            case BOOLEAN:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().booleanType().endUnion()
                        .noDefault();
                break;

            case INTEGER:
                // Default to signed type unless otherwise noted. Some JDBC drivers don't implement isSigned()
                boolean signedType = true;
                try {
                    signedType = meta.isSigned(i);
                } catch (SQLException se) {
                    // Use signed types as default
                }
                if (signedType) {
                    builder.name(columnName).type().unionOf().nullBuilder().endNull().and().intType().endUnion()
                            .noDefault();
                } else {
                    builder.name(columnName).type().unionOf().nullBuilder().endNull().and().longType().endUnion()
                            .noDefault();
                }
                break;

            case SMALLINT:
            case TINYINT:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().intType().endUnion()
                        .noDefault();
                break;

            case BIGINT:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().longType().endUnion()
                        .noDefault();
                break;

            // java.sql.RowId is an interface whose implementations are database-specific,
            // so convert it to a String
            case ROWID:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion()
                        .noDefault();
                break;

            case FLOAT:
            case REAL:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().floatType().endUnion()
                        .noDefault();
                break;

            case DOUBLE:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().doubleType().endUnion()
                        .noDefault();
                break;

            // No directly suitable Avro type was found; represent as a String for now
            case DECIMAL:
            case NUMERIC:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion()
                        .noDefault();
                break;

            // No directly suitable Avro type was found; represent as a String for now
            case DATE:
            case TIME:
            case TIMESTAMP:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion()
                        .noDefault();
                break;

            case BINARY:
            case VARBINARY:
            case LONGVARBINARY:
            case BLOB:
            case CLOB:
                builder.name(columnName).type().unionOf().nullBuilder().endNull().and().bytesType().endUnion()
                        .noDefault();
                break;

            default:
                throw new IllegalArgumentException("createSchema: Unknown SQL type " + meta.getColumnType(i)
                        + " cannot be converted to Avro type");
            }
        }

        return builder.endRecord();
    }
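
    // Illustrative sketch (not part of the original class): assuming the driver reports columns
    // as users.id and users.name (as Hive JDBC does), a query like "SELECT id, name FROM users"
    // with INTEGER and VARCHAR columns yields a schema approximately like:
    //   {"type":"record","name":"users","namespace":"any.data","fields":[
    //     {"name":"id","type":["null","int"]},
    //     {"name":"name","type":["null","string"]}]}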

    public static long convertToCsvStream(final ResultSet rs, final OutputStream outStream,
            CsvOutputOptions outputOptions) throws SQLException, IOException {
        return convertToCsvStream(rs, outStream, null, null, outputOptions);
    }

    public static long convertToCsvStream(final ResultSet rs, final OutputStream outStream, String recordName,
            ResultSetRowCallback callback, CsvOutputOptions outputOptions) throws SQLException, IOException {

        final ResultSetMetaData meta = rs.getMetaData();
        final int nrOfColumns = meta.getColumnCount();
        List<String> columnNames = new ArrayList<>(nrOfColumns);

        if (outputOptions.isHeader()) {
            if (outputOptions.getAltHeader() == null) {
                for (int i = 1; i <= nrOfColumns; i++) {
                    String columnNameFromMeta = meta.getColumnName(i);
                    // Hive returns table.column as the column name; take the part after the last period
                    int columnNameDelimiter = columnNameFromMeta.lastIndexOf(".");
                    columnNames.add(columnNameFromMeta.substring(columnNameDelimiter + 1));
                }
            } else {
                String[] altHeaderNames = outputOptions.getAltHeader().split(",");
                columnNames = Arrays.asList(altHeaderNames);
            }
        }

        // Write the column names as a header row, if enabled
        if (outputOptions.isHeader()) {
            outStream.write(
                    StringUtils.join(columnNames, outputOptions.getDelimiter()).getBytes(StandardCharsets.UTF_8));
            outStream.write("\n".getBytes(StandardCharsets.UTF_8));
        }

        // Iterate over the rows
        int maxRows = outputOptions.getMaxRowsPerFlowFile();
        long nrOfRows = 0;
        while (rs.next()) {
            if (callback != null) {
                callback.processRow(rs);
            }
            List<String> rowValues = new ArrayList<>(nrOfColumns);
            for (int i = 1; i <= nrOfColumns; i++) {
                final int javaSqlType = meta.getColumnType(i);
                final Object value = rs.getObject(i);

                switch (javaSqlType) {
                case CHAR:
                case LONGNVARCHAR:
                case LONGVARCHAR:
                case NCHAR:
                case NVARCHAR:
                case VARCHAR:
                    String valueString = rs.getString(i);
                    if (valueString != null) {
                        // StringEscapeUtils.escapeCsv supplies its own surrounding quotes when the value
                        // requires them, so explicit quotes are only added when the quote option is set.
                        StringBuilder sb = new StringBuilder();
                        if (outputOptions.isQuote()) {
                            sb.append("\"");
                            if (outputOptions.isEscape()) {
                                sb.append(StringEscapeUtils.escapeCsv(valueString));
                            } else {
                                sb.append(valueString);
                            }
                            sb.append("\"");
                            rowValues.add(sb.toString());
                        } else {
                            if (outputOptions.isEscape()) {
                                rowValues.add(StringEscapeUtils.escapeCsv(valueString));
                            } else {
                                rowValues.add(valueString);
                            }
                        }
                    } else {
                        rowValues.add("");
                    }
                    break;
                case ARRAY:
                case STRUCT:
                case JAVA_OBJECT:
                    String complexValueString = rs.getString(i);
                    if (complexValueString != null) {
                        rowValues.add(StringEscapeUtils.escapeCsv(complexValueString));
                    } else {
                        rowValues.add("");
                    }
                    break;
                default:
                    if (value != null) {
                        rowValues.add(value.toString());
                    } else {
                        rowValues.add("");
                    }
                }
            }
            // Write row values
            outStream.write(
                    StringUtils.join(rowValues, outputOptions.getDelimiter()).getBytes(StandardCharsets.UTF_8));
            outStream.write("\n".getBytes(StandardCharsets.UTF_8));
            nrOfRows++;

            if (maxRows > 0 && nrOfRows == maxRows) {
                break;
            }
        }
        return nrOfRows;
    }

    public static String normalizeNameForAvro(String inputName) {
        String normalizedName = inputName.replaceAll("[^A-Za-z0-9_]", "_");
        if (Character.isDigit(normalizedName.charAt(0))) {
            normalizedName = "_" + normalizedName;
        }
        return normalizedName;
    }
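
    // For example (assumed behavior, following the regex and leading-digit check above):
    //   normalizeNameForAvro("order.total$") returns "order_total_"
    //   normalizeNameForAvro("2017_sales")   returns "_2017_sales"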

    /**
     * A callback interface that allows processing of each row during the convertToXYZStream() methods.
     * <b>IMPORTANT:</b> Implementations should only operate on the row currently pointed at by the
     * given ResultSet. Advancing the cursor (by calling next(), for example) can cause rows to be
     * skipped during conversion.
     */
    public interface ResultSetRowCallback {
        void processRow(ResultSet resultSet) throws IOException;
    }
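
    // Because ResultSetRowCallback has a single abstract method, a lambda can serve as a
    // callback, e.g. (hypothetical sketch):
    //   ResultSetRowCallback callback = row -> { /* inspect the current row; never advance the cursor */ };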

    public static Configuration getConfigurationFromFiles(final String configFiles) {
        final Configuration hiveConfig = new HiveConf();
        if (StringUtils.isNotBlank(configFiles)) {
            for (final String configFile : configFiles.split(",")) {
                hiveConfig.addResource(new Path(configFile.trim()));
            }
        }
        return hiveConfig;
    }
}
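
Example usage

Below is a minimal, hypothetical sketch of how HiveJdbcCommon might be driven from plain JDBC. The connection URL, query, and output file name are placeholders, and the Hive JDBC driver is assumed to be on the classpath; this is an illustration, not part of the class above.

import org.apache.nifi.util.hive.HiveJdbcCommon;

import java.io.FileOutputStream;
import java.io.OutputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class HiveJdbcCommonExample {

    public static void main(String[] args) throws Exception {
        // Placeholder URL, query, and file name; adjust for your environment.
        try (Connection conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SELECT * FROM my_table");
             OutputStream out = new FileOutputStream("my_table.avro")) {

            // maxRows = 0 means no row limit; convertNames = true normalizes
            // any non-Avro-compatible table/column names.
            long rows = HiveJdbcCommon.convertToAvroStream(rs, out, 0, true);
            System.out.println("Wrote " + rows + " rows");
        }
    }
}

The CSV path works analogously through convertToCsvStream(), with formatting controlled by the companion CsvOutputOptions class (header, delimiter, quoting, escaping, and maximum rows per flow file, per the getters used above).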