Java tutorial
/**
 * This code is made available under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */
package com.acme.io;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.pig.Expression;
import org.apache.pig.LoadCaster;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.LoadPushDown;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.LoadPushDown.OperatorSet;
import org.apache.pig.LoadPushDown.RequiredFieldList;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.builtin.Utf8StorageConverter;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

/**
 * A loader for data stored using {@link JsonStorage}. This is not a generic
 * JSON loader. It depends on the schema being stored with the data, when
 * conceivably you could write a loader that determines the schema from the
 * JSON. It is also not well tested, for functionality or performance. It
 * works for simple demonstrations.
 *
 * Also note that this loader and the associated storage function require a
 * version of Pig that has PIG-2112 to work with complex data.
 */
public class JsonLoader extends LoadFunc implements LoadMetadata {

    protected RecordReader reader = null;
    protected ResourceFieldSchema[] fields = null;
    protected final Log log = LogFactory.getLog(getClass());

    private String udfcSignature = null;
    private JsonFactory jsonFactory = null;
    private TupleFactory tupleFactory = TupleFactory.getInstance();
    private BagFactory bagFactory = BagFactory.getInstance();

    /**
     * Communicate to the loader the location of the object(s) being loaded.
     * The location string passed to the LoadFunc here is the return value of
     * {@link LoadFunc#relativeToAbsolutePath(String, Path)}. Implementations
     * should use this method to communicate the location (and any other
     * information) to its underlying InputFormat through the Job object.
     *
     * This method will be called in the backend multiple times. Implementations
     * should bear in mind that this method is called multiple times and should
     * ensure there are no inconsistent side effects due to the multiple calls.
     *
     * @param location Location as returned by
     * {@link LoadFunc#relativeToAbsolutePath(String, Path)}
     * @param job the {@link Job} object; store or retrieve earlier stored
     * information from the {@link UDFContext}
     * @throws IOException if the location is not valid.
     */
    public void setLocation(String location, Job job) throws IOException {
        // Tell our input format where we will be reading from
        FileInputFormat.setInputPaths(job, location);
    }

    /**
     * This will be called during planning on the front end. It returns an
     * instance of InputFormat (rather than the class name) because the
     * load function may need to instantiate the InputFormat in order
     * to control how it is constructed.
     * @return the InputFormat associated with this loader.
     * @throws IOException if there is an exception during InputFormat
     * construction
     */
    @SuppressWarnings("unchecked")
    public InputFormat getInputFormat() throws IOException {
        // We will use TextInputFormat, the default Hadoop input format for
        // text. It has a LongWritable key that we will ignore, and the value
        // is a Text (a string writable) that the JSON data is in.
        return new TextInputFormat();
    }

    /**
     * This will be called on the front end during planning and not on the back
     * end during execution.
     * @return the {@link LoadCaster} associated with this loader.
     * Returning null indicates that casts from byte array are not supported
     * for this loader.
     * @throws IOException if there is an exception during LoadCaster
     * construction
     */
    public LoadCaster getLoadCaster() throws IOException {
        // We do not expect to do casting of byte arrays, because we will be
        // returning typed data.
        return null;
    }

    /**
     * Initializes LoadFunc for reading data. This will be called during
     * execution before any calls to getNext. The RecordReader needs to be
     * passed here because it has been instantiated for a particular InputSplit.
     * @param reader {@link RecordReader} to be used by this instance of
     * the LoadFunc
     * @param split The input {@link PigSplit} to process
     * @throws IOException if there is an exception during initialization
     */
    @SuppressWarnings("unchecked")
    public void prepareToRead(RecordReader reader, PigSplit split)
    throws IOException {
        this.reader = reader;

        // Get the schema string from the UDFContext object.
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p =
            udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature});
        String strSchema = p.getProperty("pig.jsonloader.schema");
        if (strSchema == null) {
            throw new IOException("Could not find schema in UDF context");
        }

        // Parse the schema from the string stored in the properties object.
        ResourceSchema schema =
            new ResourceSchema(Utils.getSchemaFromString(strSchema));
        fields = schema.getFields();

        jsonFactory = new JsonFactory();
    }

    /**
     * Retrieves the next tuple to be processed. Implementations should NOT
     * reuse tuple objects (or inner member objects) they return across calls
     * and should return a different tuple object in each call.
     * @return the next tuple to be processed or null if there are no more
     * tuples to be processed.
     * @throws IOException if there is an exception while retrieving the next
     * tuple
     */
    public Tuple getNext() throws IOException {
        Text val = null;
        try {
            // Read the next key-value pair from the record reader. If it's
            // finished, return null.
            if (!reader.nextKeyValue()) return null;

            // Get the current value. We don't use the key.
            val = (Text)reader.getCurrentValue();
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        }

        // Create a parser specific for this input line. This may not be the
        // most efficient approach.
        ByteArrayInputStream bais = new ByteArrayInputStream(val.getBytes());
        JsonParser p = jsonFactory.createJsonParser(bais);

        // Create the tuple we will be returning. We create it with the right
        // number of fields, as the Tuple object is optimized for this case.
        Tuple t = tupleFactory.newTuple(fields.length);

        // Read the start object marker. Throughout this file if the parsing
        // isn't what we expect we return a tuple with null fields rather than
        // throwing an exception. That way a few mangled lines don't fail the
        // job.
        if (p.nextToken() != JsonToken.START_OBJECT) {
            log.warn("Bad record, could not find start of record " +
                val.toString());
            return t;
        }

        // Read each field in the record
        for (int i = 0; i < fields.length; i++) {
            t.set(i, readField(p, fields[i], i));
        }

        if (p.nextToken() != JsonToken.END_OBJECT) {
            log.warn("Bad record, could not find end of record " +
                val.toString());
            return t;
        }
        p.close();
        return t;
    }

    private Object readField(JsonParser p,
                             ResourceFieldSchema field,
                             int fieldnum) throws IOException {
        // Read the next token
        JsonToken tok = p.nextToken();
        if (tok == null) {
            log.warn("Early termination of record, expected " + fields.length
                + " fields but found " + fieldnum);
            return null;
        }

        // Check to see if this value was null
        if (tok == JsonToken.VALUE_NULL) return null;

        // Read based on our expected type
        switch (field.getType()) {
        case DataType.INTEGER:
            // Advance past the field name to the value
            p.nextToken();
            return p.getValueAsInt();

        case DataType.LONG:
            p.nextToken();
            return p.getValueAsLong();

        case DataType.FLOAT:
            p.nextToken();
            return (float)p.getValueAsDouble();

        case DataType.DOUBLE:
            p.nextToken();
            return p.getValueAsDouble();

        case DataType.BYTEARRAY:
            p.nextToken();
            byte[] b = p.getBinaryValue();
            // Use the DBA constructor that copies the bytes so that we own
            // the memory
            return new DataByteArray(b, 0, b.length);

        case DataType.CHARARRAY:
            p.nextToken();
            return p.getText();

        case DataType.MAP:
            // Should be a start of the map object
            if (p.nextToken() != JsonToken.START_OBJECT) {
                log.warn("Bad map field, could not find start of object, "
                    + "field " + fieldnum);
                return null;
            }
            Map<String, String> m = new HashMap<String, String>();
            while (p.nextToken() != JsonToken.END_OBJECT) {
                String k = p.getCurrentName();
                String v = p.getText();
                m.put(k, v);
            }
            return m;

        case DataType.TUPLE:
            if (p.nextToken() != JsonToken.START_OBJECT) {
                log.warn("Bad tuple field, could not find start of object, "
                    + "field " + fieldnum);
                return null;
            }

            ResourceSchema s = field.getSchema();
            ResourceFieldSchema[] fs = s.getFields();
            Tuple t = tupleFactory.newTuple(fs.length);

            for (int j = 0; j < fs.length; j++) {
                t.set(j, readField(p, fs[j], j));
            }

            if (p.nextToken() != JsonToken.END_OBJECT) {
                log.warn("Bad tuple field, could not find end of object, "
                    + "field " + fieldnum);
                return null;
            }
            return t;

        case DataType.BAG:
            if (p.nextToken() != JsonToken.START_ARRAY) {
                log.warn("Bad bag field, could not find start of array, "
                    + "field " + fieldnum);
                return null;
            }
            s = field.getSchema();
            fs = s.getFields();
            // Drill down the next level to the tuple's schema.
            s = fs[0].getSchema();
            fs = s.getFields();

            DataBag bag = bagFactory.newDefaultBag();

            JsonToken innerTok;
            while ((innerTok = p.nextToken()) != JsonToken.END_ARRAY) {
                if (innerTok != JsonToken.START_OBJECT) {
                    log.warn("Bad bag tuple field, could not find start of "
                        + "object, field " + fieldnum);
                    return null;
                }

                t = tupleFactory.newTuple(fs.length);
                for (int j = 0; j < fs.length; j++) {
                    t.set(j, readField(p, fs[j], j));
                }

                if (p.nextToken() != JsonToken.END_OBJECT) {
                    log.warn("Bad bag tuple field, could not find end of "
                        + "object, field " + fieldnum);
                    return null;
                }
                bag.add(t);
            }
            return bag;

        default:
            throw new IOException("Unknown type in input schema: " +
                field.getType());
        }
    }

    //------------------------------------------------------------------------

    /**
     * This method will be called by Pig both in the front end and back end to
     * pass a unique signature to the {@link LoadFunc}. The signature can be used
     * to store into the {@link UDFContext} any information which the
     * {@link LoadFunc} needs to store between various method invocations in the
     * front end and back end. A use case is to store {@link RequiredFieldList}
     * passed to it in {@link LoadPushDown#pushProjection(RequiredFieldList)} for
     * use in the back end before returning tuples in {@link LoadFunc#getNext()}.
     * This method will be called before other methods in {@link LoadFunc}.
     * @param signature a unique signature to identify this LoadFunc
     */
    public void setUDFContextSignature(String signature) {
        udfcSignature = signature;
    }

    /**
     * Get a schema for the data to be loaded.
     * @param location Location as returned by
     * {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
     * @param job The {@link Job} object - this should be used only to obtain
     * cluster properties through {@link Job#getConfiguration()} and not to
     * set/query any runtime job information.
     * @return schema for the data to be loaded. This schema should represent
     * all tuples of the returned data. If the schema is unknown or it is
     * not possible to return a schema that represents all returned data,
     * then null should be returned. The schema should not be affected by
     * pushProjection, i.e. getSchema should always return the original schema
     * even after pushProjection.
     * @throws IOException if an exception occurs while determining the schema
     */
    public ResourceSchema getSchema(String location, Job job)
    throws IOException {
        // Open the schema file and read the schema.
        // Get an HDFS handle.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        DataInputStream in = fs.open(new Path(location + "/_schema"));
        String line = in.readLine();
        in.close();

        // Parse the schema
        ResourceSchema s =
            new ResourceSchema(Utils.getSchemaFromString(line));
        if (s == null) {
            throw new IOException("Unable to parse schema found in file " +
                location + "/_schema");
        }

        // Now that we have determined the schema, store it in our
        // UDFContext properties object so we have it when we need it on the
        // backend
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p =
            udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature});
        p.setProperty("pig.jsonloader.schema", line);

        return s;
    }

    /**
     * Get statistics about the data to be loaded. If no statistics are
     * available, then null should be returned.
     * @param location Location as returned by
     * {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
     * @param job The {@link Job} object - this should be used only to obtain
     * cluster properties through {@link Job#getConfiguration()} and not to
     * set/query any runtime job information.
     * @return statistics about the data to be loaded. If no statistics are
     * available, then null should be returned.
     * @throws IOException if an exception occurs while retrieving statistics
     */
    public ResourceStatistics getStatistics(String location, Job job)
    throws IOException {
        // We don't implement this one.
        return null;
    }

    /**
     * Find what columns are partition keys for this input.
     * @param location Location as returned by
     * {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
     * @param job The {@link Job} object - this should be used only to obtain
     * cluster properties through {@link Job#getConfiguration()} and not to
     * set/query any runtime job information.
     * @return array of field names of the partition keys. Implementations
     * should return null to indicate that there are no partition keys.
     * @throws IOException if an exception occurs while retrieving partition keys
     */
    public String[] getPartitionKeys(String location, Job job)
    throws IOException {
        // We don't have partitions
        return null;
    }

    /**
     * Set the filter for partitioning. It is assumed that this filter
     * will only contain references to fields given as partition keys in
     * getPartitionKeys. So if the implementation returns null in
     * {@link #getPartitionKeys(String, Job)}, then this method is not
     * called by the Pig runtime. This method is also not called by the Pig
     * runtime if there are no partition filter conditions.
     * @param partitionFilter that describes filter for partitioning
     * @throws IOException if the filter is not compatible with the storage
     * mechanism or contains non-partition fields.
     */
    public void setPartitionFilter(Expression partitionFilter)
    throws IOException {
        // We don't have partitions
    }
}
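To see the loader end to end, here is a minimal sketch that exercises it from Java in Pig's local mode. It first lays out an input directory the way the companion JsonStorage would: a data file with one JSON object per line plus a _schema file containing the Pig schema string that getSchema() reads. The directory path (/tmp/json-demo), file names, and fields (user, age) are hypothetical and chosen only for this demonstration; the sketch assumes JsonLoader and its Jackson dependency are already on the classpath (otherwise register the containing jar first). Note that the loader matches JSON values to schema fields by position, so the keys in each line must appear in schema order.

package com.acme.io;

import java.io.File;
import java.io.PrintWriter;
import java.util.Iterator;

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;

/**
 * Minimal local-mode driver for JsonLoader. Paths, file names, and field
 * names below are illustrative only; they are not part of the loader.
 */
public class JsonLoaderDemo {

    public static void main(String[] args) throws Exception {
        // Lay out an input directory the way JsonStorage would have written
        // it: one data file with a JSON object per line, plus a _schema file
        // holding the schema string that JsonLoader.getSchema() reads.
        // FileInputFormat's default filter skips files starting with "_",
        // so the _schema file is not treated as input data.
        File dir = new File("/tmp/json-demo");   // hypothetical location
        dir.mkdirs();

        PrintWriter data = new PrintWriter(new File(dir, "part-m-00000"));
        data.println("{\"user\":\"alice\", \"age\":30}");
        data.println("{\"user\":\"bob\", \"age\":25}");
        data.close();

        PrintWriter schema = new PrintWriter(new File(dir, "_schema"));
        schema.println("user:chararray,age:int");
        schema.close();

        // Run Pig in local mode. JsonLoader (and Jackson) must be on the
        // classpath; otherwise call pig.registerJar(...) first.
        PigServer pig = new PigServer(ExecType.LOCAL);
        pig.registerQuery(
            "A = load '/tmp/json-demo' using com.acme.io.JsonLoader();");

        // Each JSON line comes back as a typed Pig tuple, e.g. (alice,30).
        Iterator<Tuple> it = pig.openIterator("A");
        while (it.hasNext()) {
            System.out.println(it.next());
        }
    }
}

Running this should print (alice,30) and (bob,25); a malformed line would come back as a tuple of nulls rather than failing the job, as described in getNext().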