hydrograph.engine.cascading.scheme.hive.parquet.HiveParquetSchemeHelper.java Source code


Introduction

Here is the source code for hydrograph.engine.cascading.scheme.hive.parquet.HiveParquetSchemeHelper.java, a small utility class that derives a Parquet schema string and Hive SerDe table properties from a Cascading HiveTableDescriptor, or from Cascading sink Fields and a list of Hive type names.

Source

/*******************************************************************************
 * Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 *******************************************************************************/
package hydrograph.engine.cascading.scheme.hive.parquet;

import cascading.tap.hive.HiveTableDescriptor;
import cascading.tuple.Fields;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.ql.io.parquet.convert.HiveSchemaConverter;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

public class HiveParquetSchemeHelper {

    // Hive SerDe property keys for a table's column names and column types.
    private static final String COLUMNS = "columns";
    private static final String COLUMNS_TYPES = "columns.types";

    // Returns the table's column names as a list.
    public static List<String> getTableColumns(HiveTableDescriptor hiveTableDescriptor) {
        return Arrays.asList(hiveTableDescriptor.getColumnNames());
    }

    // Joins the table's column types into a colon-delimited Hive type string and parses it into TypeInfo objects.
    public static List<TypeInfo> getColumnsDataTypes(HiveTableDescriptor hiveTableDescriptor) {
        String dataTypes = StringUtils.join(hiveTableDescriptor.getColumnTypes(), ":");
        return TypeInfoUtils.getTypeInfosFromTypeString(dataTypes);
    }

    // Converts the table's Hive columns and types into a Parquet message type, returned in its string form.
    public static String getParquetSchemeMessage(HiveTableDescriptor hiveTableDescriptor) {
        return HiveSchemaConverter
                .convert(getTableColumns(hiveTableDescriptor), getColumnsDataTypes(hiveTableDescriptor)).toString();
    }

    // Builds the SerDe-style properties ("columns" and "columns.types") describing the table's columns.
    public static Properties getTableProperties(HiveTableDescriptor hiveTableDescriptor) {
        Properties properties = new Properties();
        String columns = StringUtils.join(hiveTableDescriptor.getColumnNames(), ",");
        String columnTypes = StringUtils.join(hiveTableDescriptor.getColumnTypes(), ":");
        properties.put(COLUMNS, columns);
        properties.put(COLUMNS_TYPES, columnTypes);
        return properties;
    }

    // Overload for sinks without a table descriptor: builds the Parquet message type
    // from the Cascading sink fields and an array of Hive type names.
    public static String getParquetSchemeMessage(Fields sinkFields, String[] columnTypes) {
        List<String> columnNames = new ArrayList<>();
        for (int i = 0; i < sinkFields.size(); i++) {
            columnNames.add(sinkFields.get(i).toString());
        }
        String dataTypes = StringUtils.join(columnTypes, ":");
        return HiveSchemaConverter.convert(columnNames, TypeInfoUtils.getTypeInfosFromTypeString(dataTypes))
                .toString();
    }
}
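
Usage

The standalone sketch below is not part of the original file; it only illustrates how the helpers above might be called. It assumes the three-argument HiveTableDescriptor(tableName, columnNames, columnTypes) constructor from cascading-hive, and the table name and columns are made up for the example.

package hydrograph.engine.cascading.scheme.hive.parquet;

import cascading.tap.hive.HiveTableDescriptor;

import java.util.Properties;

public class HiveParquetSchemeHelperUsageSketch {

    public static void main(String[] args) {
        // Hypothetical two-column Hive table; name, columns, and types are illustrative only.
        HiveTableDescriptor descriptor = new HiveTableDescriptor(
                "employee",
                new String[] { "id", "name" },
                new String[] { "int", "string" });

        // Parquet message type (as a string) derived from the Hive columns.
        String parquetSchema = HiveParquetSchemeHelper.getParquetSchemeMessage(descriptor);
        System.out.println(parquetSchema);

        // SerDe-style properties holding "columns" and "columns.types".
        Properties tableProperties = HiveParquetSchemeHelper.getTableProperties(descriptor);
        System.out.println(tableProperties);
    }
}

In the Hydrograph code base these values would typically be passed to a Parquet scheme rather than printed; printing them here just keeps the sketch self-contained.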