hydrograph.engine.cascading.assembly.InputFileHiveTextAssembly.java Source code

Introduction

Here is the source code for hydrograph.engine.cascading.assembly.InputFileHiveTextAssembly.java, the Hive input-file component that reads records from a Hive table stored in text format.

Source

/*******************************************************************************
 * Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 *******************************************************************************/
package hydrograph.engine.cascading.assembly;

import cascading.scheme.hadoop.TextDelimited;
import cascading.tap.SinkMode;
import cascading.tap.hive.HivePartitionTap;
import cascading.tap.hive.HiveTableDescriptor;
import cascading.tap.hive.HiveTap;
import cascading.tuple.Fields;
import hydrograph.engine.cascading.assembly.base.InputFileHiveBase;
import hydrograph.engine.cascading.assembly.infra.ComponentParameters;
import hydrograph.engine.cascading.assembly.utils.HiveTypeToCoercibleTypeMapping;
import hydrograph.engine.cascading.filters.PartitionFilter;
import hydrograph.engine.cascading.scheme.HydrographDelimitedParser;
import hydrograph.engine.cascading.scheme.hive.text.HiveTextTableDescriptor;
import hydrograph.engine.core.component.entity.InputFileHiveTextEntity;
import hydrograph.engine.core.component.entity.base.HiveEntityBase;
import hydrograph.engine.utilities.HiveConfigurationMapping;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.reflect.Type;
import java.util.Arrays;

/**
 * Hive Input File Component - reads records from a Hive table stored in
 * text format.
 */
public class InputFileHiveTextAssembly extends InputFileHiveBase {

    private static final long serialVersionUID = -2946197683137950707L;

    protected InputFileHiveTextEntity inputHiveFileEntity;
    private HiveTextTableDescriptor tableDesc;

    private static final Logger LOG = LoggerFactory.getLogger(InputFileHiveTextAssembly.class);

    public InputFileHiveTextAssembly(InputFileHiveTextEntity baseComponentEntity,
            ComponentParameters componentParameters) {
        super(baseComponentEntity, componentParameters);
    }

    public void validateParameters() {
        // No component-specific parameter validation is required for this assembly.
    }

    /*
     * (non-Javadoc)
     * 
     * @see hydrograph.engine.cascading.assembly.base.InputFileHiveBase#prepareScheme()
     */
    @Override
    protected void prepareScheme() {
        LOG.debug("Applying HiveTextScheme to read data from Hive");

        // HiveTextTableDescriptor is developed specifically for handling the
        // text file format with Hive, so the table descriptor is created in
        // this assembly rather than in the base class.
        Configuration conf = new Configuration();
        conf.addResource(new Path(HiveConfigurationMapping.getHiveConf("path_to_hive_site_xml")));

        HiveTableDescriptor.Factory factory = new HiveTableDescriptor.Factory(conf);
        HiveTableDescriptor tb = factory.newInstance(inputHiveFileEntity.getDatabaseName(),
                inputHiveFileEntity.getTableName());

        tableDesc = new HiveTextTableDescriptor(tb.getDatabaseName(), tb.getTableName(), tb.getColumnNames(),
                tb.getColumnTypes(), tb.getPartitionKeys(), tb.getDelimiter(), "",
                getHiveExternalTableLocationPath(), false);

        Fields fields = getFieldsToWrite(tb);
        // "\1" (Ctrl-A) is Hive's default field delimiter; it is used when the
        // entity does not specify a delimiter of its own.
        HydrographDelimitedParser delimitedParser = new HydrographDelimitedParser(
                inputHiveFileEntity.getDelimiter() != null ? inputHiveFileEntity.getDelimiter() : "\1",
                inputHiveFileEntity.getQuote(), null, inputHiveFileEntity.isStrict(), inputHiveFileEntity.isSafe());

        scheme = new TextDelimited(fields, null, false, false, "UTF-8", delimitedParser);
        scheme.setSourceFields(fields);
        scheme.setSinkFields(fields);
    }
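
    // Illustration with a hypothetical table (id int, name string) partitioned
    // by (dt string): each delimited line, e.g. 1^AJohn (where ^A is Ctrl-A),
    // is parsed into the typed tuple {1, "John"}; dt is not in the data files.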

    protected void initializeHiveTap() {
        LOG.debug("Initializing Hive Tap using HiveTextTableDescriptor");
        hiveTap = new HiveTap(tableDesc, scheme, SinkMode.KEEP, true);
        // Wrap the tap in a HivePartitionTap when the table is partitioned, and
        // attach a source partition filter if filter conditions are configured.
        if (inputHiveFileEntity.getPartitionKeys() != null && inputHiveFileEntity.getPartitionKeys().length > 0) {
            hiveTap = new HivePartitionTap((HiveTap) hiveTap);
            if (isPartitionFilterEnabled()) {
                addPartitionFilter((HivePartitionTap) hiveTap);
            }
        }
    }
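
    // For a table partitioned by dt, the HivePartitionTap wrapper reads leaf
    // directories such as .../<table>/dt=2017-01-01/ and derives the partition
    // column values from the path (hypothetical layout, shown for illustration).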

    private boolean isPartitionFilterEnabled() {
        // A partition filter is applied only when filter conditions are configured.
        return !inputHiveFileEntity.getPartitionFilterList().isEmpty();
    }

    private void addPartitionFilter(HivePartitionTap hivePartitionTap) {
        hivePartitionTap.addSourcePartitionFilter(
                new Fields(convertLowerCase(inputHiveFileEntity.getPartitionKeys())),
                new PartitionFilter(inputHiveFileEntity.getPartitionFilterList()));
    }

    /*
     * (non-Javadoc)
     * 
     * @see hydrograph.engine.cascading.assembly.base.InputFileHiveBase#
     * castHiveEntityFromBase(hydrograph.engine.core.component.entity.base.HiveEntityBase)
     * 
     * Casts the hiveEntityBase to InputFileHiveTextEntity.
     */
    @Override
    public void castHiveEntityFromBase(HiveEntityBase hiveEntityBase) {
        inputHiveFileEntity = (InputFileHiveTextEntity) hiveEntityBase;
    }

    /**
     * Builds the Fields to be read from the data files. Partition key columns
     * are excluded because their values are encoded in the partition paths,
     * not stored in the files themselves.
     * 
     * @return the non-partition Fields with their coercible types applied.
     */
    private Fields getFieldsToWrite(HiveTableDescriptor tb) {
        String[] fieldNames = new String[tb.getColumnNames().length - tb.getPartitionKeys().length];
        int i = 0;
        for (String inputField : tb.getColumnNames()) {
            if (!Arrays.asList(tb.getPartitionKeys()).contains(inputField)) {
                fieldNames[i++] = inputField;
            }
        }
        return new Fields(fieldNames).applyTypes(getTypes(tb));
    }
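
    // Note: the Fields names built above and the Type array from getTypes(tb)
    // are index-aligned, which relies on the table descriptor listing partition
    // key columns after the regular data columns.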

    /**
     * Resolves the coercible type of each non-partition column so that text
     * values can be coerced to the correct datatype. Partition key columns are
     * skipped because they are not written to the data files.
     */
    private Type[] getTypes(HiveTableDescriptor tb) {
        Type[] types = new Type[tb.getColumnTypes().length - tb.getPartitionKeys().length];
        String[] colTypes = tb.getColumnTypes();

        for (int i = 0; i < types.length; i++) {
            HiveTypeToCoercibleTypeMapping hiveTo;
            if (colTypes[i].contains("decimal")) {
                // Decimal types carry precision and scale (e.g. "decimal(10,2)"),
                // so they cannot be looked up through valueOf() directly.
                hiveTo = HiveTypeToCoercibleTypeMapping.valueOf("DECIMAL");
            } else {
                hiveTo = HiveTypeToCoercibleTypeMapping.valueOf(colTypes[i].toUpperCase());
            }
            types[i] = hiveTo.getMappingType(colTypes[i]);
        }

        return types;
    }
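
    // Illustration (hypothetical): colTypes ["int", "string", "decimal(10,2)"]
    // resolve to the INT, STRING and DECIMAL mappings; the raw type string is
    // passed to getMappingType(), presumably so precision and scale can be read.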

}
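
Example

For context, here is a minimal sketch of how this assembly might be instantiated directly. The setter names on InputFileHiveTextEntity (setDatabaseName, setTableName, setDelimiter) are assumptions that mirror the getters used in the source above, and the ComponentParameters construction is simplified; in Hydrograph the wiring is normally driven by the engine from a graph definition rather than written by hand.

// A minimal wiring sketch; the setter names below are assumed, not confirmed.
InputFileHiveTextEntity entity = new InputFileHiveTextEntity();
entity.setDatabaseName("default");   // assumed setter mirroring getDatabaseName()
entity.setTableName("customers");    // assumed setter mirroring getTableName()
entity.setDelimiter("\1");           // assumed setter; Ctrl-A is Hive's default

ComponentParameters params = new ComponentParameters(); // assumed no-arg constructor
InputFileHiveTextAssembly assembly = new InputFileHiveTextAssembly(entity, params);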