com.qq.pig.udf.CustomJsonLoader.java Source code

Introduction

Here is the source code for com.qq.pig.udf.CustomJsonLoader.java, a custom Apache Pig LoadFunc that parses one JSON record per input line into a Pig tuple, driven by a schema supplied by the caller or stored alongside the data.
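
For example (illustrative only, not from the original source), given a schema
string and an input line such as the following, the loader emits one tuple per
input line, matching JSON keys to schema fields by name:

name:chararray, age:int, scores:{t:(score:int)}

{"name": "alice", "age": 7, "scores": [{"score": 1}, {"score": 2}]}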

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.qq.pig.udf;

import com.google.common.collect.Maps;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.*;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.builtin.JsonMetadata;
import org.apache.pig.data.*;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

//import org.joda.time.format.DateTimeFormatter;
//import org.joda.time.format.ISODateTimeFormat;

/**
 * A loader for data stored using {@link org.apache.pig.builtin.JsonStorage}.  This is not a generic
 * JSON loader: it depends on the schema being stored with the data (or
 * supplied by the caller), though one could conceivably write a loader that
 * infers the schema from the JSON itself.
 */
public class CustomJsonLoader extends LoadFunc implements LoadMetadata {

    protected RecordReader reader = null;
    protected ResourceSchema schema = null;
    //    protected ResourceFieldSchema[] fields = null;

    private String udfcSignature = null;
    private JsonFactory jsonFactory = null;
    private TupleFactory tupleFactory = TupleFactory.getInstance();
    private BagFactory bagFactory = BagFactory.getInstance();

    private static final String SCHEMA_SIGNATURE = "pig.jsonloader.schema";

    // Flattened lookups built from the schema in init(): field name -> its
    // schema, and field name -> its position within the enclosing tuple.
    private Map<String, ResourceFieldSchema> nameToFields = Maps.newHashMap();
    private Map<String, Integer> nameToPosition = Maps.newHashMap();

    public CustomJsonLoader() {
    }

    public CustomJsonLoader(String schemaString) throws IOException {
        schema = new ResourceSchema(Utils.parseSchema(schemaString));
    }

    public void setLocation(String location, Job job) throws IOException {
        // Tell our input format where we will be reading from
        FileInputFormat.setInputPaths(job, location);
    }

    @SuppressWarnings("unchecked")
    public InputFormat getInputFormat() throws IOException {
        // We will use TextInputFormat, the default Hadoop input format for
        // text.  It has a LongWritable key that we will ignore, and the value
        // is a Text (a string writable) that the JSON data is in.
        return new TextInputFormat();
    }

    public LoadCaster getLoadCaster() throws IOException {
        // We do not expect to do casting of byte arrays, because we will be
        // returning typed data.
        return null;
    }

    @SuppressWarnings("unchecked")
    public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
        this.reader = reader;

        // Get the schema string from the UDFContext object.
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] { udfcSignature });
        String strSchema = p.getProperty(SCHEMA_SIGNATURE);
        if (strSchema == null) {
            throw new IOException("Could not find schema in UDF context");
        }

        // Parse the schema from the string stored in the properties object.
        schema = new ResourceSchema(Utils.getSchemaFromString(strSchema));
        jsonFactory = new JsonFactory();

        init(nameToFields, nameToPosition, schema);
    }

    /**
     * Recursively walk the schema, recording each field's schema and its
     * position within its enclosing tuple, descending into tuples and bags.
     * Both maps are keyed by bare field name, so names must be unique across
     * nesting levels or later entries will overwrite earlier ones.
     */
    private void init(Map<String, ResourceFieldSchema> nameToFields, Map<String, Integer> nameToPosition,
            ResourceSchema schema) {
        ResourceFieldSchema[] fields = schema.getFields();
        int index = 0;
        for (ResourceFieldSchema field : fields) {
            nameToFields.put(field.getName(), field);
            nameToPosition.put(field.getName(), index++);

            switch (field.getType()) {
            case DataType.TUPLE:
                ResourceSchema tupleSchema = field.getSchema();
                init(nameToFields, nameToPosition, tupleSchema);
                break;
            case DataType.BAG:
                ResourceSchema bagSchema = field.getSchema();
                ResourceFieldSchema[] fs = bagSchema.getFields();
                // Drill down the next level to the tuple's schema.
                ResourceSchema t = fs[0].getSchema();
                init(nameToFields, nameToPosition, t);
                break;
            default:
            }
        }
    }

    public Tuple getNext() throws IOException {
        Text val = null;
        try {
            // Read the next key value pair from the record reader.  If it's
            // finished, return null
            if (!reader.nextKeyValue())
                return null;

            // Get the current value.  We don't use the key.
            val = (Text) reader.getCurrentValue();
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        }
        return parseTuple(val);

    }

    public Tuple parseTuple(Text val) throws IOException {
        // Create a parser specific for this input line.  This may not be the
        // most efficient approach.

        // Copy out only the valid region: Text reuses its backing buffer, so
        // getBytes() may contain stale bytes beyond getLength().
        byte[] newBytes = new byte[val.getLength()];
        System.arraycopy(val.getBytes(), 0, newBytes, 0, val.getLength());

        ByteArrayInputStream bais = new ByteArrayInputStream(newBytes);
        JsonParser p = jsonFactory.createJsonParser(bais);

        // Create the tuple we will be returning.  We create it with the right
        // number of fields, as the Tuple object is optimized for this case.
        ResourceFieldSchema[] fields = schema.getFields();
        Tuple t = tupleFactory.newTuple(fields.length);

        // Read the start object marker.  Throughout this file if the parsing
        // isn't what we expect we return a tuple with null fields rather than
        // throwing an exception.  That way a few mangled lines don't fail the
        // job.
        if (p.nextToken() != JsonToken.START_OBJECT) {
            warn("Bad record, could not find start of record " + val.toString(), PigWarning.UDF_WARNING_1);
            return t;
        }
        readFields(p, t);
        p.close();
        return t;
    }

    /**
     * Scan the remaining tokens of the current JSON object, looking up each
     * field name in the schema maps.  Unknown fields are skipped; known
     * fields are converted and stored in the tuple at their recorded
     * position.
     */
    private Object readFields(JsonParser p, Tuple t) throws IOException {

        int fieldNum = 0;
        while (true) {
            JsonToken tok = p.nextToken();
            if (tok == null) // end of stream
                break;

            // Check to see if this value was null
            if (tok == JsonToken.VALUE_NULL)
                continue;

            // getCurrentName() can return null for tokens that carry no
            // associated field name; skip those tokens.
            String name = p.getCurrentName();
            if (name == null)
                continue;

            ResourceFieldSchema fieldSchema = nameToFields.get(name);
            if (fieldSchema == null) {
                //drain next useless token
                p.nextToken();
                continue;
            }
            Integer position = nameToPosition.get(name);
            if (position == null) {
                throw new RuntimeException("Field has a schema but no recorded position: " + name);
            }
            t.set(position, getValue(p, fieldSchema, fieldNum++));
        }

        return t;
    }

    private Object readField(JsonParser p, ResourceFieldSchema field, int fieldnum) throws IOException {
        // Read the next token
        JsonToken tok = p.nextToken();
        if (tok == null) {
            warn("Early termination of record, expected " + schema.getFields().length + " fields but found "
                    + fieldnum, PigWarning.UDF_WARNING_1);
            return null;
        }

        // Check to see if this value was null
        if (tok == JsonToken.VALUE_NULL)
            return null;

        //        String name = p.getCurrentName();
        //        int position = nameToFields.get(name);
        //        field = fields[position];
        return getValue(p, field, fieldnum);

    }

    /**
     * Read a nested JSON object into a Tuple, matching keys against the
     * schema maps and ignoring unknown fields.
     */
    private Object readTuple(JsonParser p, ResourceSchema s, int fieldnum) throws IOException {
        //        ResourceSchema s = field.getSchema();
        ResourceFieldSchema[] fs = s.getFields();
        Tuple t = tupleFactory.newTuple(fs.length);

        JsonToken currentToken;
        while (true) {
            currentToken = p.nextToken();

            if (currentToken == null) {
                warn("Early termination of record, expected " + schema.getFields().length + " fields but found "
                        + fieldnum, PigWarning.UDF_WARNING_1);
                return null;
            }
            // Check to see if this value was null
            if (currentToken == JsonToken.VALUE_NULL)
                return null;

            if (currentToken == JsonToken.END_OBJECT)
                break;

            String currentName = p.getCurrentName();
            ResourceFieldSchema fieldSchema = nameToFields.get(currentName);
            if (fieldSchema == null)
                continue;//unknown field
            Integer position = nameToPosition.get(currentName);
            t.set(position, getValue(p, fieldSchema, 0));
        }

        if (currentToken != JsonToken.END_OBJECT) {
            warn("Bad tuple field, could not find end of object, " + "field " + fieldnum, PigWarning.UDF_WARNING_1);
        }
        return t;
    }

    /**
     * Read a JSON array into a DataBag.  JSON objects in the array become
     * tuples via readTuple(); bare scalar values are wrapped in single-field
     * tuples according to the bag's inner schema.
     */
    private Object readBag(JsonParser p, ResourceSchema s, int fieldnum) throws IOException {

        //        ResourceSchema s = field.getSchema();
        ResourceFieldSchema[] fs = s.getFields();
        // Drill down the next level to the tuple's schema.
        s = fs[0].getSchema();
        fs = s.getFields();

        DataBag bag = bagFactory.newDefaultBag();

        JsonToken innerTok;
        while ((innerTok = p.nextToken()) != JsonToken.END_ARRAY) {
            // Guard against truncated input: at end of stream nextToken()
            // returns null, which would otherwise loop forever.
            if (innerTok == null) {
                warn("Early termination of record in bag, field " + fieldnum, PigWarning.UDF_WARNING_1);
                return null;
            }
            Tuple tuple = null;
            if (innerTok == JsonToken.START_OBJECT) {
                tuple = (Tuple) readTuple(p, s, 0);
            } else {
                tuple = tupleFactory.newTuple(1);

                Object value = null;

                switch (fs[0].getType()) {
                case DataType.BOOLEAN:

                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    value = p.getBooleanValue();
                    break;

                case DataType.INTEGER:
                    // Read the field name
                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    value = p.getIntValue();
                    break;
                case DataType.LONG:
                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    value = p.getLongValue();
                    break;
                case DataType.FLOAT:
                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    value = p.getFloatValue();
                    break;
                case DataType.DOUBLE:
                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    value = p.getDoubleValue();
                    break;
                case DataType.DATETIME:
                    //                tok = p.nextToken();
                    //                if (tok == JsonToken.VALUE_NULL) return null;
                    //                DateTimeFormatter formatter = ISODateTimeFormat.dateTimeParser();
                    //                return formatter.withOffsetParsed().parseDateTime(p.getText());

                    value = null;
                    break;
                case DataType.BYTEARRAY:
                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    byte[] b = p.getText().getBytes();
                    // Use the DBA constructor that copies the bytes so that we own
                    // the memory
                    value = new DataByteArray(b, 0, b.length);
                    break;
                case DataType.CHARARRAY:
                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    value = p.getText();
                    break;
                case DataType.BIGINTEGER:
                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    value = p.getBigIntegerValue();
                    break;
                case DataType.BIGDECIMAL:
                    if (innerTok == JsonToken.VALUE_NULL)
                        return null;
                    value = p.getDecimalValue();
                    break;
                default:
                    value = null;
                }

                tuple.set(0, value);
            }

            bag.add(tuple);
        }
        return bag;
    }

    /**
     * Convert the next JSON value into the Java type implied by the field's
     * declared Pig type, recursing into readTuple()/readBag() for complex
     * types.
     */
    private Object getValue(JsonParser p, ResourceFieldSchema field, int fieldnum) throws IOException {
        try {
            JsonToken tok; // Read based on our expected type
            switch (field.getType()) {
            case DataType.BOOLEAN:
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                return p.getBooleanValue();

            case DataType.INTEGER:
                // Read the field name
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                return p.getIntValue();

            case DataType.LONG:
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                return p.getLongValue();

            case DataType.FLOAT:
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                return p.getFloatValue();

            case DataType.DOUBLE:
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                return p.getDoubleValue();

            case DataType.DATETIME:
                //                tok = p.nextToken();
                //                if (tok == JsonToken.VALUE_NULL) return null;
                //                DateTimeFormatter formatter = ISODateTimeFormat.dateTimeParser();
                //                return formatter.withOffsetParsed().parseDateTime(p.getText());

                return null;
            case DataType.BYTEARRAY:
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                byte[] b = p.getText().getBytes();
                // Use the DBA constructor that copies the bytes so that we own
                // the memory
                return new DataByteArray(b, 0, b.length);

            case DataType.CHARARRAY:
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                return p.getText();

            case DataType.BIGINTEGER:
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                return p.getBigIntegerValue();

            case DataType.BIGDECIMAL:
                tok = p.nextToken();
                if (tok == JsonToken.VALUE_NULL)
                    return null;
                return p.getDecimalValue();

            case DataType.MAP:
                // Should be a start of the map object
                if (p.nextToken() != JsonToken.START_OBJECT) {
                    warn("Bad map field, could not find start of object, field " + fieldnum,
                            PigWarning.UDF_WARNING_1);
                    return null;
                }
                Map<String, String> m = new HashMap<String, String>();
                // Each iteration consumes one token: a FIELD_NAME token puts
                // (name, name) into the map, and the following value token
                // overwrites it with (name, value).
                while (p.nextToken() != JsonToken.END_OBJECT) {
                    String k = p.getCurrentName();
                    String v = p.getCurrentToken() == JsonToken.VALUE_NULL ? null : p.getText();
                    m.put(k, v);
                }
                return m;

            case DataType.TUPLE:
                if (p.nextToken() != JsonToken.START_OBJECT) {
                    warn("Bad tuple field, could not find start of object, " + "field " + fieldnum,
                            PigWarning.UDF_WARNING_1);
                    return null;
                }
                return readTuple(p, field.getSchema(), 0);

            case DataType.BAG:
                if (p.nextToken() != JsonToken.START_ARRAY) {
                    warn("Bad bag field, could not find start of array, " + "field " + fieldnum,
                            PigWarning.UDF_WARNING_1);
                    return null;
                }

                return readBag(p, field.getSchema(), 0);

            default:
                throw new IOException("Unknown type in input schema: " + field.getType());
            }

        } catch (Exception e) {
            throw new RuntimeException("Failed to read value of field " + p.getCurrentName(), e);
        }

    }

    //------------------------------------------------------------------------

    public void setUDFContextSignature(String signature) {
        udfcSignature = signature;
    }

    public ResourceSchema getSchema(String location, Job job) throws IOException {

        ResourceSchema s;
        if (schema != null) {
            s = schema;
        } else {
            // Parse the schema
            s = (new JsonMetadata()).getSchema(location, job, true);

            if (s == null) {
                throw new IOException("Unable to parse the schema found in " + location);
            }
        }

        // Now that we have determined the schema, store it in our
        // UDFContext properties object so we have it when we need it on the
        // backend
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p = udfc.getUDFProperties(this.getClass(), new String[] { udfcSignature });
        p.setProperty(SCHEMA_SIGNATURE, s.toString());

        return s;
    }

    public ResourceStatistics getStatistics(String location, Job job) throws IOException {
        // We don't implement this one.
        return null;
    }

    public String[] getPartitionKeys(String location, Job job) throws IOException {
        // We don't have partitions
        return null;
    }

    public void setPartitionFilter(Expression partitionFilter) throws IOException {
        // We don't have partitions
    }
}
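
As a minimal usage sketch (not part of the original source; the jar name,
input path, and schema string below are assumptions for illustration), the
loader can be exercised from Java through PigServer:

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;

import java.util.Iterator;

public class CustomJsonLoaderExample {
    public static void main(String[] args) throws Exception {
        // Run Pig in local mode so no cluster is needed.
        PigServer pig = new PigServer(ExecType.LOCAL);
        // Hypothetical jar containing the compiled loader class.
        pig.registerJar("custom-json-loader.jar");
        // The schema string is passed to the loader's constructor.
        pig.registerQuery("a = LOAD 'input.json' USING "
                + "com.qq.pig.udf.CustomJsonLoader('name:chararray, age:int');");
        // Pull the loaded tuples back to the client and print them.
        Iterator<Tuple> it = pig.openIterator("a");
        while (it.hasNext()) {
            System.out.println(it.next());
        }
    }
}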