com.snowplowanalytics.snowplow.hadoop.hive.SnowPlowEventDeserializer.java Source code

Java tutorial

Introduction

Here is the source code for com.snowplowanalytics.snowplow.hadoop.hive.SnowPlowEventDeserializer.java

Source

/* 
 * Copyright (c) 2012-2013 SnowPlow Analytics Ltd. All rights reserved.
 *
 * This program is licensed to you under the Apache License Version 2.0,
 * and you may not use this file except in compliance with the Apache License Version 2.0.
 * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the Apache License Version 2.0 is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
 */
package com.snowplowanalytics.snowplow.hadoop.hive;

// Java
import java.nio.charset.CharacterCodingException;
import java.util.List;
import java.util.Properties;

// Commons Logging
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// Hadoop
import org.apache.hadoop.conf.Configuration;

// Hive
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ReflectionStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

/**
 * SnowPlowEventDeserializer reads SnowPlow event and page view log data into Hive.
 *
 * For documentation please see the introductory README.md in the project root.
 *
 * Each instance reuses a single SnowPlowEventStruct for every row it deserializes,
 * so the object returned from deserialize(...) is overwritten by the next call on
 * the same instance. An instance must therefore not be shared between threads
 * without external synchronization.
 */
public class SnowPlowEventDeserializer implements Deserializer {

    // -------------------------------------------------------------------------------------------------------------------
    // Initial setup
    // -------------------------------------------------------------------------------------------------------------------

    // Setup logging
    public static final Log LOG = LogFactory.getLog(SnowPlowEventDeserializer.class.getName());

    // What's our continue on errors property called?
    private static final String CONTINUE_ON = "continue_on_unexpected_error";

    // We'll initialize our object inspector in initialize(...)
    private ObjectInspector cachedObjectInspector;

    // For performance reasons we reuse the same object to deserialize all of our rows.
    // Held per instance (not static) so that separate deserializers never overwrite each other's rows.
    private final SnowPlowEventStruct cachedStruct = new SnowPlowEventStruct();

    // When true, deserialize(...) logs an unexpected parsing error and returns null instead of throwing
    private boolean continueOnUnexpectedError = false;

    // -------------------------------------------------------------------------------------------------------------------
    // Helper for deserializing a single line. Used for testing
    // -------------------------------------------------------------------------------------------------------------------

    /**
     * A helper which deserializes and inspects a single row
     *
     * @param line The line of text to deserialize
     * @param verbose Whether to debug-print the contents of the struct using reflection (null is treated as false)
     * @param continueOn Whether to continue on an unexpected error or not (null is treated as false)
     * @return The struct object from deserializing the text
     * @throws SerDeException if there is a problem deserializing the line, or reflection-inspecting the struct's contents
     */
    public static Object deserializeLine(String line, Boolean verbose, Boolean continueOn) throws SerDeException {

        // Prep the deserializer
        SnowPlowEventDeserializer serDe = new SnowPlowEventDeserializer();
        Configuration conf = new Configuration();
        Properties tbl = new Properties();
        // Boolean.TRUE.equals(...) avoids a NullPointerException from unboxing a null flag
        tbl.setProperty(CONTINUE_ON, Boolean.TRUE.equals(continueOn) ? "1" : "0");

        serDe.initialize(conf, tbl);

        // Run the deserializer with the sample row
        Object row = serDe.deserialize(new Text(line));

        // Loop through and output each field in the struct, if required.
        if (Boolean.TRUE.equals(verbose)) {
            ReflectionStructObjectInspector oi = (ReflectionStructObjectInspector) serDe.getObjectInspector();
            for (StructField fieldRef : oi.getAllStructFieldRefs()) {
                System.out.println(fieldRef.toString());
                Object fieldData = oi.getStructFieldData(row, fieldRef);
                if (fieldData == null) {
                    System.out.println("null");
                } else {
                    System.out.println(fieldData.toString());
                }
            }
        }

        return row;
    }

    // -------------------------------------------------------------------------------------------------------------------
    // Constructor & initializer
    // -------------------------------------------------------------------------------------------------------------------

    /**
     * Empty constructor
     *
     * @throws SerDeException Declared for compatibility with existing callers; not thrown by this constructor
     */
    public SnowPlowEventDeserializer() throws SerDeException {
    }

    /**
     * Initialize the SnowPlowEventDeserializer.
     *
     * Builds the reflection-based object inspector for SnowPlowEventStruct and reads the
     * "continue_on_unexpected_error" table property (defaulting to "0" when absent).
     *
     * @param conf System properties
     * @param tbl Table properties
     * @throws SerDeException For any exception during initialization
     */
    @Override
    public void initialize(Configuration conf, Properties tbl) throws SerDeException {

        cachedObjectInspector = ObjectInspectorFactory.getReflectionObjectInspector(SnowPlowEventStruct.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);

        this.continueOnUnexpectedError = SnowPlowEventStruct.stringToBoolean(tbl.getProperty(CONTINUE_ON, "0"));

        LOG.debug(this.getClass().getName() + " initialized");
    }

    // -------------------------------------------------------------------------------------------------------------------
    // Deserializer
    // -------------------------------------------------------------------------------------------------------------------

    /**
     * Deserialize an object out of a Writable blob. In most cases, the return
     * value of this function will be constant since the function will reuse the
     * returned object. If the client wants to keep a copy of the object, the
     * client needs to clone the returned value by calling
     * ObjectInspectorUtils.getStandardObject().
     *
     * @param field The Writable object containing a serialized row; must be a Text or a BytesWritable
     * @return A Java object representing the contents in the blob, or null if the
     *         struct reports that the row could not be used, or if parsing failed
     *         unexpectedly and continue-on-error is enabled
     * @throws SerDeException If the Writable is of an unsupported type, cannot be decoded as text,
     *         or fails to parse while continue-on-error is disabled
     */
    @Override
    public Object deserialize(Writable field) throws SerDeException {
        final String row = decodeRow(field);
        try {
            // Update in place the cached struct with the row data
            if (cachedStruct.updateByParsing(row)) {
                return cachedStruct;
            }
            return null;
        } catch (ClassCastException e) {
            // Kept from the original implementation so that a ClassCastException raised
            // while parsing is still surfaced regardless of the continue-on-error setting
            throw new SerDeException(this.getClass().getName() + " expects Text or BytesWritable", e);
        } catch (Exception e) {
            if (this.continueOnUnexpectedError) {
                LOG.error("Could not parse row: \"" + row + "\"", e);
                return null;
            }
            throw new SerDeException(e);
        }
    }

    /**
     * Extracts the textual row from a supported Writable.
     *
     * @param field The Writable handed to deserialize(...)
     * @return The row as a String
     * @throws SerDeException If the Writable is neither a Text nor a BytesWritable (including null),
     *         or if its bytes cannot be decoded as text
     */
    private String decodeRow(Writable field) throws SerDeException {
        if (field instanceof BytesWritable) {
            BytesWritable bytes = (BytesWritable) field;
            try {
                // Only the first getLength() bytes of the backing array are valid data
                return Text.decode(bytes.getBytes(), 0, bytes.getLength());
            } catch (CharacterCodingException e) {
                throw new SerDeException(e);
            }
        }
        if (field instanceof Text) {
            return field.toString();
        }
        // Fail explicitly rather than handing a null row to the parser
        String actualType = (field == null) ? "null" : field.getClass().getName();
        throw new SerDeException(this.getClass().getName() + " expects Text or BytesWritable, got " + actualType);
    }

    // -------------------------------------------------------------------------------------------------------------------
    // Getters
    // -------------------------------------------------------------------------------------------------------------------

    /**
     * Retrieve statistics for this SerDe. Returns null
     * because we don't support statistics (yet).
     *
     * @return The SerDe's statistics (null in this case)
     */
    @Override
    public SerDeStats getSerDeStats() {
        return null;
    }

    /**
     * Get the object inspector that can be used to navigate through the internal
     * structure of the Object returned from deserialize(...).
     *
     * @return The ObjectInspector for this Deserializer (null until initialize(...) has been called)
     * @throws SerDeException Declared by the Deserializer interface; not thrown by this implementation
     */
    @Override
    public ObjectInspector getObjectInspector() throws SerDeException {
        return cachedObjectInspector;
    }
}