com.thinkbiganalytics.discovery.util.ParserHelper.java Source code

Java tutorial

Introduction

Here is the source code for com.thinkbiganalytics.discovery.util.ParserHelper.java

Source

package com.thinkbiganalytics.discovery.util;

/*-
 * #%L
 * thinkbig-schema-discovery-api
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.fasterxml.jackson.annotation.JsonProperty;
import com.thinkbiganalytics.discovery.schema.DataTypeDescriptor;
import com.thinkbiganalytics.discovery.schema.Field;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.sql.JDBCType;
import java.sql.Types;
import java.util.List;
import java.util.Locale;

/**
 * Provides utility methods useful for writing parsers
 */
public class ParserHelper {

    private static final Logger log = LoggerFactory.getLogger(ParserHelper.class);

    /**
     * Maximum number of characters read from an input stream when sampling, which bounds the memory used by a sample
     */
    protected static int MAX_CHARS = 128000;

    /**
     * Upper bound applied when validating the number of sample rows requested from {@code extractSampleLines}
     */
    protected static int MAX_ROWS = 1000;

    /**
     * Extracts up to the given number of rows from the start of the stream and returns them as a string.
     * At most {@code MAX_CHARS} characters are read from the stream, which protects memory in the case
     * where a large file is submitted with no line delimiters.
     *
     * @param is      the input stream to sample; this method reads from it but does not close it
     * @param charset the character set used to decode the stream
     * @param rows    the number of rows to extract, from 1 to {@code MAX_ROWS} inclusive
     * @return the sampled rows, each terminated by a single {@code \n}
     * @throws IOException              if the stream cannot be read, or if at most one line was found and the
     *                                  sample reached the character limit
     * @throws NullPointerException     if {@code is} or {@code charset} is {@code null}
     * @throws IllegalArgumentException if {@code rows} is outside the allowed range
     */
    public static String extractSampleLines(InputStream is, Charset charset, int rows) throws IOException {
        Validate.notNull(is, "empty input stream");
        Validate.notNull(charset, "charset cannot be null");
        // Both ends of the range are valid requests: a single row, and exactly MAX_ROWS rows
        Validate.inclusiveBetween(1, MAX_ROWS, rows, "invalid number of sample rows");

        // Sample the file in case there are no newlines; the negative offset means nothing is skipped
        StringWriter swBlock = new StringWriter();
        IOUtils.copyLarge(new InputStreamReader(is, charset), swBlock, -1, MAX_CHARS);

        StringBuilder sample = new StringBuilder();
        int linesRead = 0;
        try (BufferedReader br = new BufferedReader(new StringReader(swBlock.toString()))) {
            String line;
            while (linesRead < rows && (line = br.readLine()) != null) {
                sample.append(line).append("\n");
                linesRead++;
            }
        }

        // A single line that fills the whole character budget means no row boundary could be found
        if (linesRead <= 1 && sample.length() >= MAX_CHARS) {
            throw new IOException("Failed to detect newlines for sample file.");
        }
        return sample.toString();
    }

    /**
     * Converts a {@link java.sql.Types} constant to the name of the Hive type used to store it.
     * NUMERIC, DOUBLE and DECIMAL all map to {@code double}; any type without an explicit mapping
     * maps to {@code string}.
     *
     * @param type the {@link java.sql.Types} constant, may be {@code null}
     * @return the Hive type name, or {@code null} when {@code type} is {@code null}
     */
    public static String sqlTypeToHiveType(Integer type) {
        // Switching on a null Integer would fail while unboxing; mirror the JDBCType overload instead
        if (type == null) {
            return null;
        }
        switch (type) {
        case Types.BIGINT:
            return "bigint";
        case Types.NUMERIC:
        case Types.DOUBLE:
        case Types.DECIMAL:
            return "double";
        case Types.INTEGER:
            return "int";
        case Types.FLOAT:
            return "float";
        case Types.TINYINT:
            return "tinyint";
        case Types.DATE:
            return "date";
        case Types.TIMESTAMP:
            return "timestamp";
        case Types.BOOLEAN:
            return "boolean";
        case Types.BINARY:
            return "binary";
        default:
            return "string";
        }
    }

    /**
     * Converts a JDBC type to the name of the corresponding Hive type.
     *
     * @param jdbcType the JDBC type, may be {@code null}
     * @return the Hive type name, or {@code null} when {@code jdbcType} is {@code null}
     */
    public static String sqlTypeToHiveType(JDBCType jdbcType) {
        if (jdbcType == null) {
            return null;
        }
        // Delegate to the java.sql.Types based overload using the type's numeric code
        Integer vendorTypeNumber = jdbcType.getVendorTypeNumber();
        return sqlTypeToHiveType(vendorTypeNumber);
    }

    /**
     * Derive the corresponding data type from sample values.
     * <p>
     * Null and empty values are ignored. The result is the narrowest of INTEGER, BIGINT and DOUBLE that
     * can hold every remaining value: INTEGER when all values fit in an {@code int}, BIGINT when all are
     * integral but at least one needs a {@code long}, and DOUBLE as soon as any value only parses as a
     * double. Integral values beyond the {@code int} range are reported as BIGINT rather than DOUBLE so
     * that large identifiers do not lose precision.
     *
     * @param values a list of string values, may be {@code null}
     * @return the JDBC data type; VARCHAR if any non-empty value is not numeric, or if there are no
     * non-empty values
     */
    public static JDBCType deriveJDBCDataType(List<String> values) {
        if (values == null) {
            return JDBCType.VARCHAR;
        }

        JDBCType guess = null;
        for (String v : values) {
            if (v == null || v.isEmpty()) {
                continue;
            }
            JDBCType currentPass = classifyNumeric(v);
            if (currentPass == null) {
                // return immediately for non-numeric case
                return JDBCType.VARCHAR;
            }
            // Only ever widen the guess: INTEGER -> BIGINT -> DOUBLE
            boolean widensToBigint = currentPass == JDBCType.BIGINT && guess == JDBCType.INTEGER;
            if (guess == null || currentPass == JDBCType.DOUBLE || widensToBigint) {
                guess = currentPass;
            }
        }
        return guess == null ? JDBCType.VARCHAR : guess;
    }

    /**
     * Returns the narrowest numeric JDBC type able to parse the value, or {@code null} if it is not numeric.
     */
    private static JDBCType classifyNumeric(String value) {
        try {
            Integer.parseInt(value);
            return JDBCType.INTEGER;
        } catch (NumberFormatException notAnInt) {
            // fall through to the wider types
        }
        try {
            Long.parseLong(value);
            return JDBCType.BIGINT;
        } catch (NumberFormatException notALong) {
            // fall through to floating point
        }
        try {
            Double.parseDouble(value);
            return JDBCType.DOUBLE;
        } catch (NumberFormatException notANumber) {
            return null;
        }
    }

    /**
     * Derive data types for every field that does not already have a derived data type.
     * <p>
     * The JDBC type is taken from the field's native data type when one is set, otherwise it is derived
     * from the field's sample values. A native data type that is not a valid {@link JDBCType} name is
     * logged and treated as VARCHAR. For HIVE the derived type is the Hive type name and the data type
     * descriptor is set as well; for RDBMS the derived type is the JDBC type name.
     *
     * @param type   the target database platform
     * @param fields the fields
     */
    public static void deriveDataTypes(TableSchemaType type, List<? extends Field> fields) {
        for (Field field : fields) {
            if (!StringUtils.isEmpty(field.getDerivedDataType())) {
                // Already derived; leave the field untouched
                continue;
            }

            JDBCType jdbcType = JDBCType.VARCHAR;
            try {
                if (!StringUtils.isEmpty(field.getNativeDataType())) {
                    jdbcType = JDBCType.valueOf(field.getNativeDataType());
                } else {
                    jdbcType = deriveJDBCDataType(field.getSampleValues());
                }
            } catch (IllegalArgumentException e) {
                // SLF4J substitutes arguments only for "{}" placeholders
                log.warn("Unable to convert data type [{}] will be converted to VARCHAR",
                        field.getNativeDataType());
            }

            switch (type) {
            case HIVE:
                String hiveType = sqlTypeToHiveType(jdbcType);
                field.setDerivedDataType(hiveType);
                field.setDataTypeDescriptor(hiveTypeToDescriptor(hiveType));
                break;
            case RDBMS:
                field.setDerivedDataType(jdbcType.getName());
                break;
            default:
                // No derivation is defined for other platforms
                break;
            }
        }
    }

    /**
     * Builds a descriptor that classifies the given Hive type as numeric, date or complex.
     * <p>
     * {@code boolean} and {@code string} set no flag. The integer and floating point types, and any type
     * whose name contains {@code decimal}, are numeric. {@code date} and {@code timestamp} are dates.
     * Every other type name (for example ARRAY, STRUCT or BINARY) is flagged as complex.
     *
     * @param hiveType the Hive type name, matched case-insensitively; may be {@code null}
     * @return a new descriptor; no flag is set when {@code hiveType} is {@code null}
     */
    public static DataTypeDescriptor hiveTypeToDescriptor(String hiveType) {
        HiveDataTypeDescriptor descriptor = new HiveDataTypeDescriptor();
        if (hiveType == null) {
            return descriptor;
        }

        // Locale.ROOT keeps the match independent of the JVM default locale (e.g. Turkish dotless i)
        String normalized = hiveType.toLowerCase(Locale.ROOT);
        switch (normalized) {
        case "boolean":
        case "string":
            break;
        case "bigint":
        case "double":
        case "int":
        case "float":
        case "smallint":
        case "tinyint":
            descriptor.setNumeric(true);
            break;
        case "date":
        case "timestamp":
            descriptor.setDate(true);
            break;
        default:
            // Decimal types carry precision and scale, e.g. decimal(10,2), so match on the name only
            if (normalized.contains("decimal")) {
                descriptor.setNumeric(true);
            } else {
                descriptor.setComplex(true);
            }
            break;
        }
        return descriptor;
    }

    /**
     * Returns the JDBC type name for a {@link java.sql.Types} constant.
     *
     * @param dataType the {@link java.sql.Types} constant
     * @return the name of the matching {@link JDBCType}
     * @throws IllegalArgumentException if {@code dataType} is not a valid {@link java.sql.Types} value
     */
    public static String toNativeType(Integer dataType) {
        JDBCType jdbcType = JDBCType.valueOf(dataType);
        return jdbcType.getName();
    }

    /**
     * Mutable {@link DataTypeDescriptor} holding the date, numeric and complex flags of a Hive type.
     * All flags start out {@code false}.
     */
    static class HiveDataTypeDescriptor implements DataTypeDescriptor {

        /** Whether the type is a date type; exposed under the JSON property {@code date}. */
        @JsonProperty("date")
        boolean isDate;

        /** Whether the type is a numeric type; exposed under the JSON property {@code numeric}. */
        @JsonProperty("numeric")
        boolean isNumeric;

        /** Whether the type is a complex type; exposed under the JSON property {@code complex}. */
        @JsonProperty("complex")
        boolean isComplex;

        /** @return the date flag */
        @Override
        public Boolean isDate() {
            return Boolean.valueOf(this.isDate);
        }

        /** @return the numeric flag */
        @Override
        public Boolean isNumeric() {
            return Boolean.valueOf(this.isNumeric);
        }

        /** @return the complex flag */
        @Override
        public Boolean isComplex() {
            return Boolean.valueOf(this.isComplex);
        }

        /** @param date the new date flag */
        public void setDate(boolean date) {
            this.isDate = date;
        }

        /** @param numeric the new numeric flag */
        public void setNumeric(boolean numeric) {
            this.isNumeric = numeric;
        }

        /** @param complex the new complex flag */
        public void setComplex(boolean complex) {
            this.isComplex = complex;
        }

    }
}