datafu.hourglass.avro.AvroMultipleInputsUtil.java Source code

Java tutorial

Introduction

Here is the source code for datafu.hourglass.avro.AvroMultipleInputsUtil.java

Source

/**
* Copyright 2013 LinkedIn, Inc
* 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
* 
* http://www.apache.org/licenses/LICENSE-2.0
* 
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package datafu.hourglass.avro;

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.Logger;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Helper methods for dealing with multiple Avro input schemas.  A mapping is stored in the configuration
 * that maps each input path to its corresponding schema.  Methods in this class help with loading and
 * storing these schema mappings.
 * 
 * @author "Matthew Hayes"
 *
 */
public class AvroMultipleInputsUtil {
    private static final Logger _log = Logger.getLogger(AvroMultipleInputsUtil.class);

    private static final String CONF_INPUT_KEY_SCHEMA = "avro.schema.multiple.inputs.keys";

    /**
     * Gets the schema for a particular input split. 
     * 
     * @param conf configuration to get schema from
     * @param split input split to get schema for
     * @return schema
     */
    public static Schema getInputKeySchemaForSplit(Configuration conf, InputSplit split) {
        String path = ((FileSplit) split).getPath().toString();
        _log.info("Determining schema for input path " + path);
        JSONObject schemas;
        try {
            schemas = getInputSchemas(conf);
        } catch (JSONException e1) {
            throw new RuntimeException(e1);
        }
        Schema schema = null;
        if (schemas != null) {
            for (String key : JSONObject.getNames(schemas)) {
                _log.info("Checking against " + key);
                if (path.startsWith(key)) {
                    try {
                        schema = new Schema.Parser().parse(schemas.getString(key));
                        _log.info("Input schema found: " + schema.toString(true));
                        break;
                    } catch (JSONException e) {
                        throw new RuntimeException(e);
                    }
                }
            }
        }
        if (schema == null) {
            _log.info("Could not determine input schema");
        }
        return schema;
    }

    /**
     * Sets the job input key schema for a path.
     *
     * @param job The job to configure.
     * @param schema The input key schema.
     * @param path the path to set the schema for
     */
    public static void setInputKeySchemaForPath(Job job, Schema schema, String path) {
        JSONObject schemas;
        try {
            schemas = getInputSchemas(job.getConfiguration());
            schemas.put(path, schema.toString());
        } catch (JSONException e) {
            throw new RuntimeException(e);
        }
        job.getConfiguration().set(CONF_INPUT_KEY_SCHEMA, schemas.toString());
    }

    /**
     * Get a mapping from path to input schema.
     * 
     * @param conf
     * @return mapping from path to input schem
     * @throws JSONException
     */
    private static JSONObject getInputSchemas(Configuration conf) throws JSONException {
        JSONObject schemas;

        String schemasJson = conf.get(CONF_INPUT_KEY_SCHEMA);

        if (schemasJson == null) {
            schemas = new JSONObject();
        } else {
            schemas = new JSONObject(schemasJson);
        }

        return schemas;
    }
}