co.nubetech.hiho.hive.HiveUtility.java Source code

Java tutorial

Introduction

Here is the source code for co.nubetech.hiho.hive.HiveUtility.java

Source

/**
 * Copyright 2011 Nube Technologies
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and limitations under the License.
 */
package co.nubetech.hiho.hive;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Types;
import java.util.ArrayList;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.log4j.Logger;

import co.nubetech.apache.hadoop.DBConfiguration;
import co.nubetech.hiho.common.HIHOConf;
import co.nubetech.hiho.common.HIHOException;
import co.nubetech.hiho.common.OutputStrategyEnum;
import co.nubetech.hiho.mapreduce.lib.db.ColumnInfo;
import co.nubetech.hiho.mapreduce.lib.db.GenericDBWritable;

public class HiveUtility {

    private final static Logger logger = Logger.getLogger(co.nubetech.hiho.hive.HiveUtility.class);

    /**
     * Generates the hive queries for one export job, appends them to a script
     * file and runs them over the hive JDBC connection.
     *
     * Fixes over the original: the script stream/writer is now closed in a
     * finally block (it leaked whenever query generation or execution threw),
     * and the redundant printStackTrace before wrapping was removed.
     *
     * @param conf       job configuration carrying the HIHOConf.HIVE_* settings
     * @param job        unused; kept for interface compatibility with callers
     * @param writable   provides the column metadata of the source table
     * @param jobCounter 0 for the first job (table is created), >0 for
     *                   follow-up jobs (data is only loaded/appended)
     * @throws HIHOException if the script cannot be written or a query fails
     */
    public static void createTable(Configuration conf, Job job, GenericDBWritable writable, int jobCounter)
            throws HIHOException {
        String tableName = getTableName(conf);
        // Only the first job of a sequence creates the table.
        boolean isCreateTable = (jobCounter <= 0);
        String createQuery = null;
        String loadQuery = null;
        boolean isDynamicPartition = false;
        FileOutputStream fos = null;
        BufferedWriter w = null;
        try {
            fos = getFileOutputStream(conf, jobCounter, tableName);
            w = new BufferedWriter(new OutputStreamWriter(fos));

            String partitionBy = conf.get(HIHOConf.HIVE_PARTITION_BY);
            if (partitionBy == null) {
                if (isCreateTable) {
                    createQuery = getCreateQuery(conf, writable);
                    w.append(createQuery + "\n");
                }
                loadQuery = getLoadQuery(conf, conf.get(HIHOConf.INPUT_OUTPUT_PATH), writable);
                w.append(loadQuery + "\n");
                logger.warn("\nThe queries are: " + createQuery + "\n" + loadQuery);
            } else {
                loadQuery = getLoadQuery(conf, conf.get(HIHOConf.INPUT_OUTPUT_PATH), writable, partitionBy);
                if (isCreateTable) {
                    createQuery = getCreateQuery(conf, writable, partitionBy);
                    w.append(createQuery + "\n");
                }
                w.append(loadQuery + "\n");
                logger.warn("\nThe queries are: " + createQuery + "\n" + loadQuery);
                // A partition spec without an explicit value (col:type) marks
                // dynamic partitioning.
                isDynamicPartition = (getDynamicPartitionBy(partitionBy) != null);
            }

            if (!isDynamicPartition) {
                runQuery(createQuery, loadQuery, isCreateTable, conf);
            } else {
                // Dynamic partitions go through a temporary table: create tmp,
                // load into tmp, then INSERT ... PARTITION into the main table.
                String insertQuery = getInsertQueryFromTmpToMain(conf, writable, partitionBy);
                String tmpCreateQuery = getTmpCreateQuery(conf, writable);
                runQuery(tmpCreateQuery, insertQuery, createQuery, loadQuery, conf, isCreateTable);
                w.append(tmpCreateQuery + "\n" + loadQuery + "\n");
                if (isCreateTable) {
                    w.append(createQuery + "\n");
                }
                w.append(insertQuery + "\n");
                logger.warn("\nThe queries are: " + createQuery + "\n" + loadQuery + "\n" + tmpCreateQuery + "\n"
                        + insertQuery);
            }
        } catch (Exception e) {
            throw new HIHOException("Could not generate hive table", e);
        } finally {
            // Close the script file even when generation/execution failed.
            try {
                if (w != null) {
                    w.close(); // also closes the wrapped FileOutputStream
                } else if (fos != null) {
                    fos.close();
                }
            } catch (IOException e) {
                logger.warn("Could not close hive script file", e);
            }
        }
    }

    /**
     * Opens the script file that records the generated hive queries.
     *
     * The file is created (truncated) for the first job (jobCounter == 0) and
     * opened in append mode for follow-up jobs. Fixes over the original: the
     * append branch concatenated the directory and file name without a path
     * separator, so it could target a different file than the create branch;
     * both branches now build the same {@code File}. The thrown HIHOException
     * also carries a message and the original cause instead of being empty.
     *
     * @param conf       configuration providing INPUT_OUTPUT_LOADTO_PATH
     * @param jobCounter 0 to create/overwrite the script, >0 to append
     * @param tableName  hive table name, used in the script file name
     * @return an open stream positioned for write or append
     * @throws HIHOException if the file cannot be opened
     */
    public static FileOutputStream getFileOutputStream(Configuration conf, int jobCounter, String tableName)
            throws HIHOException {
        File script = new File(conf.get(HIHOConf.INPUT_OUTPUT_LOADTO_PATH), "hiveScript" + tableName + ".txt");
        try {
            // append == true for every job after the first
            return new FileOutputStream(script, jobCounter != 0);
        } catch (FileNotFoundException e) {
            throw new HIHOException("Could not open hive script file " + script, e);
        }
    }

    /**
     * Determines the name of the hive table to create/load.
     *
     * Resolution order: explicit HIHOConf.HIVE_TABLE_NAME, then the source
     * table of the DB import, then the token after the last "FROM" in the
     * import query. Multi-table queries (FROM t1, t2 / joins) are not handled;
     * users should set the hive table name explicitly in those cases.
     *
     * Fix over the original: if "FROM" was the final token of the query the
     * code called nextToken() without a guard and threw an unchecked
     * NoSuchElementException; it now falls through to the HIHOException below.
     *
     * @param conf job configuration
     * @return the resolved table name, never null
     * @throws HIHOException if no table name can be deduced
     */
    public static String getTableName(Configuration conf) throws HIHOException {
        String tableName = conf.get(HIHOConf.HIVE_TABLE_NAME);
        // If the user did not specify a hive table name, reuse the name of the
        // table we are querying against.
        if (tableName == null) {
            tableName = conf.get(DBConfiguration.INPUT_TABLE_NAME_PROPERTY);
        }
        // Last resort: parse it out of the free-form import query.
        if (tableName == null && conf.get(DBConfiguration.INPUT_QUERY) != null) {
            StringTokenizer queryTokens = new StringTokenizer(conf.get(DBConfiguration.INPUT_QUERY), " ");
            while (queryTokens.hasMoreTokens()) {
                // Guard the trailing-FROM case; keep the token after the
                // last FROM, matching the original behaviour.
                if (queryTokens.nextToken().equalsIgnoreCase("FROM") && queryTokens.hasMoreTokens()) {
                    tableName = queryTokens.nextToken();
                }
            }
        }
        if (tableName == null) {
            throw new HIHOException("Cannot get hive table name");
        }
        return tableName;
    }

    /**
     * Builds the INSERT query that moves rows from the temporary table
     * ("{table}tmp") into the main partitioned table. Used for dynamic
     * partitioning, where data is first loaded into a temp table.
     *
     * The partition clause is rendered as name[='value'],name... from a spec
     * of the form columnName:columnType[:columnValue],...
     *
     * NOTE(review): the spec is split on "," here while getPartitionBy /
     * getDynamicPartitionBy split on ";" — confirm the intended delimiter.
     *
     * @param conf        job configuration (for the table name)
     * @param writable    column metadata for the SELECT list
     * @param partitionBy partition specification string
     * @return the full "FROM ... INSERT OVERWRITE ..." statement
     * @throws HIHOException if the table name cannot be resolved
     */
    public static String getInsertQueryFromTmpToMain(Configuration conf, GenericDBWritable writable,
            String partitionBy) throws HIHOException {
        String tableName = getTableName(conf);
        // Partition clause; starts with a single space, matching the
        // historical output format.
        StringBuilder spec = new StringBuilder(" ");
        StringTokenizer entries = new StringTokenizer(partitionBy, ",");
        while (entries.hasMoreTokens()) {
            StringTokenizer fields = new StringTokenizer(entries.nextToken(), ":");
            spec.append(fields.nextToken()); // partition column name
            fields.nextToken(); // column type, not used in the insert
            if (fields.hasMoreTokens()) {
                // static partition value, quoted
                spec.append("='").append(fields.nextToken()).append("'");
            }
            if (entries.hasMoreTokens()) {
                spec.append(",");
            }
        }
        return "FROM `" + tableName + "tmp` tmp INSERT OVERWRITE TABLE `" + tableName + "` PARTITION ("
                + spec + ") SELECT " + getTmpTableColumns(writable);
    }

    /**
     * Renders the SELECT column list for the tmp-to-main insert, in the form
     * `tmp`.`col1`,`tmp`.`col2`,... ("tmp" is the alias of the temp table).
     *
     * @param writable column metadata of the source table
     * @return comma-separated, backtick-quoted column references; empty string
     *         when there are no columns
     * @throws HIHOException declared for interface compatibility
     */
    public static String getTmpTableColumns(GenericDBWritable writable) throws HIHOException {
        ArrayList<ColumnInfo> columns = writable.getColumns();
        StringBuilder selectList = new StringBuilder();
        for (int i = 0; i < columns.size(); i++) {
            if (i > 0) {
                selectList.append(",");
            }
            selectList.append("`tmp`.`").append(columns.get(i).getName()).append("`");
        }
        return selectList.toString();
    }

    /**
     * Builds the CREATE TABLE statement for the temporary table used with
     * dynamic partitioning (named "{table}tmp").
     *
     * Fixes over the original: the statement no longer ends with a dangling
     * " ROW FORMAT " when the output strategy is not DELIMITED (which produced
     * invalid HQL), and a missing INPUT_OUTPUT_STRATEGY now raises a clear
     * HIHOException instead of a NullPointerException from Enum.valueOf.
     *
     * @param conf     job configuration (output strategy, delimiter)
     * @param writable column metadata for the column list
     * @return the CREATE TABLE statement for the temp table
     * @throws HIHOException if the strategy is missing or metadata is invalid
     */
    public static String getTmpCreateQuery(Configuration conf, GenericDBWritable writable) throws HIHOException {
        StringBuilder builder = new StringBuilder();
        builder.append("CREATE TABLE `" + getTableName(conf) + "tmp` ( ");
        builder.append(getColumns(writable));
        builder.append(")");

        String strategyName = conf.get(HIHOConf.INPUT_OUTPUT_STRATEGY);
        if (strategyName == null) {
            throw new HIHOException("The input/output strategy is not defined");
        }
        if (OutputStrategyEnum.valueOf(strategyName) == OutputStrategyEnum.DELIMITED) {
            builder.append(" ROW FORMAT DELIMITED FIELDS TERMINATED BY '");
            builder.append(conf.get(HIHOConf.INPUT_OUTPUT_DELIMITER));
            builder.append("' STORED AS TEXTFILE");
        }
        return builder.toString();
    }

    /**
     * Renders the PARTITIONED BY column list for a create-table query.
     *
     * Input format: columnName:columnType[:columnValue];columnName:... (the
     * entries are separated by ';'). The optional value is ignored here.
     * Output format: "name type,name type,...".
     *
     * @param partitionBy raw partition specification
     * @return the rendered clause body; empty string for an empty spec
     */
    public static String getPartitionBy(String partitionBy) {
        StringBuilder clause = new StringBuilder();
        StringTokenizer specs = new StringTokenizer(partitionBy, ";");
        boolean first = true;
        while (specs.hasMoreTokens()) {
            StringTokenizer parts = new StringTokenizer(specs.nextToken(), ":");
            if (!first) {
                clause.append(",");
            }
            // name followed by type; any trailing :value token is dropped
            clause.append(parts.nextToken()).append(" ").append(parts.nextToken());
            first = false;
        }
        return clause.toString();
    }

    /**
     * Filters the dynamic-partition column out of a rendered column list.
     *
     * The input is the output of getColumns(): "`name` type,`name2` type2,...".
     * The column named by the dynamic partition spec must not appear in the
     * CREATE TABLE column list (it is declared in PARTITIONED BY instead), so
     * it is removed here. Returns the remaining columns, prefixed with a
     * single space to match the historical output format.
     *
     * @param conf    configuration carrying HIHOConf.HIVE_PARTITION_BY
     * @param columns rendered column definitions, comma separated
     * @return the filtered column list (leading space preserved)
     */
    public static String getColumnsForPartitionedCreateTables(Configuration conf, String columns) {
        String dynamicPartitionBy = getDynamicPartitionBy(conf.get(HIHOConf.HIVE_PARTITION_BY));
        StringBuilder kept = new StringBuilder(" ");
        int keptCount = 0;
        StringTokenizer columnTokens = new StringTokenizer(columns, ",");
        while (columnTokens.hasMoreTokens()) {
            String columnData = columnTokens.nextToken();
            // First token with '`' and ' ' as delimiters is the bare column
            // name without backticks.
            String columnName = new StringTokenizer(columnData, "` ").nextToken();
            if (!columnName.equals(dynamicPartitionBy)) {
                if (keptCount > 0) {
                    kept.append(",");
                }
                kept.append(columnData);
                keptCount++;
            }
        }
        return kept.toString();
    }

    /**
     * Builds the CREATE TABLE statement when a PARTITIONED BY clause was
     * requested. The partition column itself is excluded from the regular
     * column list and declared in the PARTITIONED BY clause instead.
     *
     * @param conf        job configuration (table name, overwrite flag,
     *                    clustering, output strategy)
     * @param writable    column metadata
     * @param partitionBy raw partition specification
     * @return the complete CREATE TABLE statement
     * @throws HIHOException if required configuration is missing or invalid
     */
    public static String getCreateQuery(Configuration conf, GenericDBWritable writable, String partitionBy)
            throws HIHOException {
        String tableName = getTableName(conf);
        boolean ifNotExists = conf.get(HIHOConf.HIVE_TABLE_OVERWRITE, "false").equals("true");
        StringBuilder query = new StringBuilder(ifNotExists ? "CREATE TABLE IF NOT EXISTS `" : "CREATE TABLE `");
        query.append(tableName).append("` (");
        query.append(getColumnsForPartitionedCreateTables(conf, getColumns(writable)));
        query.append(") PARTITIONED BY (").append(getPartitionBy(partitionBy)).append(")");
        query = appendClusteredByToCreateQuery(conf, query);
        query.append(" ROW FORMAT ");
        query = appendDelimitedDataToCreateQuery(conf, query);
        return query.toString();
    }

    /**
     * Builds the CREATE TABLE statement for the non-partitioned case.
     *
     * NOTE(review): HIVE_TABLE_OVERWRITE=true yields "IF NOT EXISTS", which
     * skips creation rather than overwriting — confirm the intended semantics.
     *
     * @param conf     job configuration (table name, overwrite flag,
     *                 clustering, output strategy)
     * @param writable column metadata
     * @return the complete CREATE TABLE statement
     * @throws HIHOException if required configuration is missing or invalid
     */
    public static String getCreateQuery(Configuration conf, GenericDBWritable writable) throws HIHOException {
        String tableName = getTableName(conf);
        boolean ifNotExists = conf.get(HIHOConf.HIVE_TABLE_OVERWRITE, "false").equals("true");
        StringBuilder query = new StringBuilder(ifNotExists ? "CREATE TABLE IF NOT EXISTS `" : "CREATE TABLE `");
        query.append(tableName).append("` ( ");
        query.append(getColumns(writable));
        query.append(")");
        query = appendClusteredByToCreateQuery(conf, query);
        query.append(" ROW FORMAT ");
        query = appendDelimitedDataToCreateQuery(conf, query);
        return query.toString();
    }

    /**
     * Appends the "DELIMITED FIELDS ... STORED AS TEXTFILE" tail to a create
     * query when the output strategy is DELIMITED; otherwise the builder is
     * returned unchanged (callers have already appended " ROW FORMAT ").
     *
     * @param conf    configuration providing strategy and field delimiter
     * @param builder the query under construction
     * @return the same builder, for chaining
     */
    public static StringBuilder appendDelimitedDataToCreateQuery(Configuration conf, StringBuilder builder) {
        OutputStrategyEnum strategy = OutputStrategyEnum.valueOf(conf.get(HIHOConf.INPUT_OUTPUT_STRATEGY));
        if (OutputStrategyEnum.DELIMITED == strategy) {
            builder.append("DELIMITED FIELDS TERMINATED BY '")
                    .append(conf.get(HIHOConf.INPUT_OUTPUT_DELIMITER))
                    .append("' STORED AS TEXTFILE");
        }
        return builder;
    }

    /**
     * Appends CLUSTERED BY / SORTED BY / INTO n BUCKETS clauses to a create
     * query when HIVE_CLUSTERED_BY is configured (format "column:buckets").
     *
     * Fixes over the original: the dead {@code buckets == null} check is gone
     * (StringTokenizer.nextToken never returns null — it throws), the missing
     * bucket count is detected with {@code hasMoreTokens} instead of catching
     * NoSuchElementException as control flow, and the error message typo
     * ("wih") is corrected.
     *
     * @param conf    configuration providing HIVE_CLUSTERED_BY / HIVE_SORTED_BY
     * @param builder the query under construction
     * @return the same builder, for chaining
     * @throws HIHOException if clustering is requested without a bucket count
     */
    public static StringBuilder appendClusteredByToCreateQuery(Configuration conf, StringBuilder builder)
            throws HIHOException {
        String clusteredBy = conf.get(HIHOConf.HIVE_CLUSTERED_BY);
        if (clusteredBy != null) {
            StringTokenizer clusterData = new StringTokenizer(clusteredBy, ":");
            builder.append(" CLUSTERED BY (").append(clusterData.nextToken()).append(")");
            if (conf.get(HIHOConf.HIVE_SORTED_BY) != null) {
                builder.append(" SORTED BY (").append(conf.get(HIHOConf.HIVE_SORTED_BY)).append(")");
            }
            if (!clusterData.hasMoreTokens()) {
                throw new HIHOException("The number of buckets with clustered by is not defined");
            }
            builder.append(" INTO ").append(clusterData.nextToken()).append(" BUCKETS");
        }
        return builder;
    }

    /**
     * Parses a partition specification of the form
     * col:type[:value];col1:type1[:value1] and returns the name of the
     * dynamic partition column — i.e. the last entry that has no value part —
     * or null when every entry carries an explicit value.
     *
     * @param partitionBy raw partition specification (';'-separated entries)
     * @return name of the dynamic partition column, or null if none
     */
    public static String getDynamicPartitionBy(String partitionBy) {
        String dynamicColumn = null;
        for (StringTokenizer specs = new StringTokenizer(partitionBy, ";"); specs.hasMoreTokens();) {
            StringTokenizer parts = new StringTokenizer(specs.nextToken(), ":");
            String name = parts.nextToken();
            parts.nextToken(); // skip the type
            // No third (value) token => this column is partitioned dynamically.
            if (!parts.hasMoreTokens()) {
                dynamicColumn = name;
            }
        }
        return dynamicColumn;
    }

    /**
     * Builds the LOAD DATA statement for the partitioned case. For static
     * partitions a PARTITION (name='value',...) clause is appended to the
     * base load query (which deliberately leaves the table-name backtick
     * open); for dynamic partitions the data is loaded into "{table}tmp"
     * instead and the partition clause is omitted.
     *
     * @param conf        job configuration
     * @param hdfsDir     HDFS path of the exported data
     * @param writable    column metadata (passed through to the base query)
     * @param partitionBy raw partition specification (';'-separated)
     * @return the complete LOAD DATA statement
     * @throws HIHOException if the table name cannot be resolved
     */
    public static String getLoadQuery(Configuration conf, String hdfsDir, GenericDBWritable writable,
            String partitionBy) throws HIHOException {
        String loadQuery = getLoadQuery(conf, hdfsDir, writable);
        // Partition clause; leading space preserved for the historical format.
        StringBuilder spec = new StringBuilder(" ");
        StringTokenizer entries = new StringTokenizer(partitionBy, ";");
        while (entries.hasMoreTokens()) {
            StringTokenizer fields = new StringTokenizer(entries.nextToken(), ":");
            spec.append(fields.nextToken()); // partition column name
            fields.nextToken(); // type, unused in the load query
            if (fields.hasMoreTokens()) {
                spec.append("='").append(fields.nextToken()).append("'");
            }
            if (entries.hasMoreTokens()) {
                spec.append(",");
            }
        }
        if (getDynamicPartitionBy(conf.get(HIHOConf.HIVE_PARTITION_BY)) == null) {
            return loadQuery + "` PARTITION (" + spec + ")";
        }
        return loadQuery + "tmp`";
    }

    /**
     * Builds the base "LOAD DATA INPATH ... OVERWRITE INTO TABLE" statement.
     * When a partition spec is configured, the closing backtick around the
     * table name is intentionally left off so the partitioned overload can
     * append either the PARTITION clause or the "tmp`" suffix.
     *
     * @param conf     job configuration
     * @param hdfsDir  HDFS path of the exported data
     * @param writable column metadata (unused here; kept for symmetry)
     * @return the load statement, possibly with an unclosed trailing backtick
     * @throws HIHOException if the table name cannot be resolved
     */
    public static String getLoadQuery(Configuration conf, String hdfsDir, GenericDBWritable writable)
            throws HIHOException {
        String tableSuffix = (conf.get(HIHOConf.HIVE_PARTITION_BY) == null) ? "`" : "";
        return "LOAD DATA INPATH '" + hdfsDir + "' OVERWRITE INTO TABLE `" + getTableName(conf) + tableSuffix;
    }

    /**
     * Runs the create (optional) and load queries against hive over JDBC.
     *
     * Fixes over the original: failures are no longer swallowed with
     * printStackTrace — they are rethrown as HIHOException with the cause
     * attached (the method already declared it but never threw); Statement
     * and Connection are now closed in a finally block instead of leaking on
     * error; DDL/DML is executed with {@code execute} rather than
     * {@code executeQuery}, which the JDBC spec reserves for statements that
     * return a ResultSet.
     *
     * @param createQuery CREATE TABLE statement (may be null if !createTable)
     * @param loadQuery   LOAD DATA statement
     * @param createTable whether to drop and (re)create the table first
     * @param conf        configuration providing driver/URL/credentials
     * @throws HIHOException if the driver is missing or a statement fails
     */
    public static void runQuery(String createQuery, String loadQuery, boolean createTable, Configuration conf)
            throws HIHOException {
        Connection con = null;
        Statement stmt = null;
        try {
            Class.forName(conf.get(HIHOConf.HIVE_DRIVER));
            con = DriverManager.getConnection(conf.get(HIHOConf.HIVE_URL),
                    conf.get(HIHOConf.HIVE_USR_NAME), conf.get(HIHOConf.HIVE_PASSWORD));
            stmt = con.createStatement();
            if (createTable) {
                stmt.execute("drop table " + getTableName(conf));
                stmt.execute(createQuery);
            }
            stmt.execute(loadQuery);
        } catch (ClassNotFoundException e) {
            throw new HIHOException("Could not load hive driver " + conf.get(HIHOConf.HIVE_DRIVER), e);
        } catch (SQLException e) {
            throw new HIHOException("Could not run hive queries", e);
        } finally {
            // Best-effort cleanup; close failures must not mask the real error.
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (SQLException ignored) {
                    // intentionally ignored: connection close follows anyway
                }
            }
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException ignored) {
                    // intentionally ignored: nothing further to release
                }
            }
        }
    }

    /**
     * Runs the dynamic-partition query sequence over JDBC: drop/create the
     * temporary table, load into it, optionally (re)create the main table,
     * insert from tmp into main, then drop the tmp table.
     *
     * Fixes over the original: the main table is no longer dropped when
     * {@code createTable} is false (dropping it without recreating it made
     * the subsequent INSERT target a missing table); failures are rethrown as
     * HIHOException with the cause instead of being swallowed; resources are
     * closed in a finally block; {@code execute} replaces {@code executeQuery}
     * for DDL/DML.
     *
     * @param tmpCreateQuery CREATE TABLE for the temporary table
     * @param insertQuery    INSERT from tmp into the main table
     * @param createQuery    CREATE TABLE for the main table (used if createTable)
     * @param loadQuery      LOAD DATA into the temporary table
     * @param conf           configuration providing driver/URL/credentials
     * @param createTable    whether to drop and (re)create the main table
     * @throws HIHOException if the driver is missing or a statement fails
     */
    public static void runQuery(String tmpCreateQuery, String insertQuery, String createQuery, String loadQuery,
            Configuration conf, boolean createTable) throws HIHOException {
        Connection con = null;
        Statement stmt = null;
        try {
            Class.forName(conf.get(HIHOConf.HIVE_DRIVER));
            con = DriverManager.getConnection(conf.get(HIHOConf.HIVE_URL),
                    conf.get(HIHOConf.HIVE_USR_NAME), conf.get(HIHOConf.HIVE_PASSWORD));
            stmt = con.createStatement();
            String tableName = getTableName(conf);
            stmt.execute("drop table " + tableName + "tmp");
            stmt.execute(tmpCreateQuery);
            stmt.execute(loadQuery);
            if (createTable) {
                // Only replace the main table when this is the creating job.
                stmt.execute("drop table " + tableName);
                stmt.execute(createQuery);
            }
            stmt.execute(insertQuery);
            stmt.execute("drop table " + tableName + "tmp");
        } catch (ClassNotFoundException e) {
            throw new HIHOException("Could not load hive driver " + conf.get(HIHOConf.HIVE_DRIVER), e);
        } catch (SQLException e) {
            throw new HIHOException("Could not run hive queries", e);
        } finally {
            if (stmt != null) {
                try {
                    stmt.close();
                } catch (SQLException ignored) {
                    // intentionally ignored: connection close follows anyway
                }
            }
            if (con != null) {
                try {
                    con.close();
                } catch (SQLException ignored) {
                    // intentionally ignored: nothing further to release
                }
            }
        }
    }

    /**
     * Renders the column definition list for a CREATE TABLE statement in the
     * form "`name` type,`name2` type2,...".
     *
     * @param writable column metadata of the source table
     * @return comma-separated column definitions; empty string for no columns
     * @throws HIHOException if a column has an unsupported SQL type
     */
    public static String getColumns(GenericDBWritable writable) throws HIHOException {
        StringBuilder ddl = new StringBuilder();
        boolean first = true;
        for (ColumnInfo column : writable.getColumns()) {
            if (!first) {
                ddl.append(",");
            }
            ddl.append("`").append(column.getName()).append("` ")
                    .append(getColumnType(column.getType()));
            first = false;
        }
        return ddl.toString();
    }

    /**
     * Maps a {@link java.sql.Types} code to the hive column type used in the
     * generated DDL.
     *
     * Fixes over the original: BIGINT now maps to hive's "bigint" ("long" is
     * not a hive type) and the binary/LOB types map to "binary" ("bytearray"
     * is a Pig type, not a hive one) — the previous names produced DDL that
     * hive rejects.
     * NOTE(review): BOOLEAN is still mapped to "string" as before, although
     * hive has a native boolean type — confirm before changing downstream
     * table schemas.
     *
     * @param columnType a java.sql.Types constant
     * @return the hive type name
     * @throws HIHOException for SQL types with no hive mapping
     */
    public static String getColumnType(int columnType) throws HIHOException {
        switch (columnType) {
        case Types.BIGINT:
            return "bigint";
        case Types.BOOLEAN:
        case Types.CHAR:
        case Types.VARCHAR:
        case Types.LONGVARCHAR:
            return "string";
        case Types.DECIMAL:
        case Types.REAL:
        case Types.NUMERIC:
        case Types.DOUBLE:
            return "double";
        case Types.FLOAT:
            return "float";
        case Types.TINYINT:
        case Types.SMALLINT:
        case Types.INTEGER:
            return "int";
        case Types.BINARY:
        case Types.CLOB:
        case Types.BLOB:
        case Types.VARBINARY:
        case Types.LONGVARBINARY:
            return "binary";
        default:
            // include the offending code to make the failure diagnosable
            throw new HIHOException("Unsupported type " + columnType);
        }
    }

}