com.teradata.adsbserde.AdsbSerDe.java Source code

Java tutorial

Introduction

Here is the source code for com.teradata.adsbserde.AdsbSerDe.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.teradata.adsbserde;

import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * A Hive SerDe (Serialiser / Deserialiser) for raw ADSB Data.
 * <p>
 * <pre>
 * The structure of the raw ADSB data is as follows (without the spaces):
 * Key \t value [ \t key \t value ...]
 * An example might be:
 *     clock\t124234\thexid\t\A1B13\talt\t30000
 * The above record contains three fields as follows:
 * </pre>
 * <ul>
 *   <li>clock    124234</li>
 *   <li>hexid    A1B13</li>
 *   <li>alt      30000</li>
 * </ul>
 * </p>
 * <p>
 * This SerDe uses the names of the columns in the table definition to identify
 * the attributes to extract from the ADSB data file.
 * Thus a table defined as follows:
 * <pre>
 *    create table x (
 *       clock    timestamp,
 *       hexid    string,
 *       speed    int
 *    ) ...
 * </pre>
 * Would return the clock (124234), hexid (A1B13) and null for the speed (as
 * there was no speed key in the raw data set. The alt in the raw data set
 * will be ignored because it is not in the table definition.
 * </p>
 * @author Glenn McCall
 *
 */
public class AdsbSerDe extends AbstractSerDe {

    /**
     * The row type information.
     */
    private StructTypeInfo rowTypeInfo;

    /**
     * The row Object Inspector.
     */
    private ObjectInspector rowOI;

    /**
     * The column names.
     */
    private List<String> colNames;

    /**
     * The rows.
     */
    private List<Object> row = new ArrayList<Object>();

    private static final String PROP_DEBUG_ENABLED = "debugEnabled";

    @Override
    public void initialize(Configuration c, Properties tbl) throws SerDeException {
        MyLogger.println("AsdbSerDe called ... ");
        MyLogger.println("c: " + c.toString());
        MyLogger.println("tbl: " + tbl.toString());

        System.out.println(PROP_DEBUG_ENABLED + "=" + tbl.getProperty(PROP_DEBUG_ENABLED));
        MyLogger.enabled = "true".equalsIgnoreCase(tbl.getProperty(PROP_DEBUG_ENABLED));

        // Generate the regular expression used to extract the ADSB data pairs.
        // This is being done here in the anticipation that one day, parts of this
        // e.g. the separator character - might be accepted as a property of the
        // table definition.
        pairs = Pattern.compile("([\\S^]+)\\s+([\\S$]+)");

        String colNamesStr = tbl.getProperty(serdeConstants.LIST_COLUMNS);
        List<String> colNamesWrk = Arrays.asList(colNamesStr.split(","));
        colNames = new LinkedList();
        for (String name : colNamesWrk) {
            if (name != null) {
                colNames.add(name.toLowerCase());
            } else {
                colNames.add(name);
            }
        }

        MyLogger.println("colNames: " + colNamesStr);

        // Get a list of TypeInfos for the columns. This list lines up with 
        // the list of column names.
        String colTypesStr = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
        List<TypeInfo> colTypes = TypeInfoUtils.getTypeInfosFromTypeString(colTypesStr);
        MyLogger.println("colTypes: " + colTypesStr);

        rowTypeInfo = (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(colNames, colTypes);
        MyLogger.println("rowTypeInfo: " + rowTypeInfo);
        MyLogger.println("clock rowTypeInfo: " + rowTypeInfo.getStructFieldTypeInfo("clock"));
        MyLogger.println("speed rowTypeInfo: " + rowTypeInfo.getStructFieldTypeInfo("speed"));

        rowOI = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(rowTypeInfo);
        MyLogger.println("rowOI: " + rowOI);

    }

    @Override
    public Class<? extends Writable> getSerializedClass() {
        return Text.class;
    }

    @Override
    public Writable serialize(Object o, ObjectInspector oi) throws SerDeException {
        throw new UnsupportedOperationException("The ADS-B SerDe does not support writes."); //To change body of generated methods, choose Tools | Templates.
    }

    @Override
    public SerDeStats getSerDeStats() {
        return null;
    }

    @Override
    public Object deserialize(Writable wrtbl) throws SerDeException {
        return processRecord(wrtbl.toString(), colNames);
    }

    @Override
    public ObjectInspector getObjectInspector() throws SerDeException {
        return rowOI;
    }

    private Pattern pairs;

    public List<Object> processRecord(String record, List<String> colNames) {
        HashMap<String, String> valueMap = new HashMap();

        Matcher m = pairs.matcher(record);
        while (m.find()) {
            String key = m.group(1);
            String value = m.group(2);
            //            MyLogger.println("Key:" + key + ", value:" + value);
            valueMap.put(key.toLowerCase(), value);
        }

        List<Object> workingRow = new LinkedList<>();
        for (String colName : colNames) {
            String value = valueMap.get(colName);
            Object retValue = value;

            TypeInfo typeInfo = rowTypeInfo.getStructFieldTypeInfo(colName);
            if (value != null) {
                try {
                    if ("int".equalsIgnoreCase(typeInfo.getTypeName())) {
                        retValue = Integer.parseInt(value);
                    } else if ("float".equalsIgnoreCase(typeInfo.getTypeName())) {
                        retValue = Float.parseFloat(value);
                    } else if ("timestamp".equalsIgnoreCase(typeInfo.getTypeName())) {
                        long longVal = Long.parseLong(value);
                        retValue = new Timestamp(longVal * 1000);
                        //MyLogger.println("timestamp: " + value + ": " + longVal + ": " + retValue.toString());
                    }
                } catch (Exception e) {
                    MyLogger.println(
                            "Caught an exception: " + e.toString() + "\nColumn: " + colName + ", value: " + value);
                    retValue = null;
                }
            }
            workingRow.add(retValue);
        }

        return workingRow;
    }
}