com.cloudera.science.avro.streaming.AvroAsJSONInputFormat.java Source code

Introduction

Here is the source code for com.cloudera.science.avro.streaming.AvroAsJSONInputFormat.java
Source

/**
 * Copyright (c) 2013, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.science.avro.streaming;

import java.io.IOException;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import com.cloudera.science.avro.common.SchemaLoader;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;

/**
 * File input format that creates an {@code AvroAsJSONRecordReader} for each file split,
 * passing it the Avro schema configured for that split.
 *
 * <p>Schemas are read from the job configuration the first time a record reader is
 * requested. The configuration keys are consulted in this order, and the first one that is
 * set wins:
 * <ol>
 *   <li>{@link #SCHEMA_LITERAL} - a single schema given as a literal;</li>
 *   <li>{@link #SCHEMA_URL} - one or more schema URLs;</li>
 *   <li>{@link #SCHEMA_TYPE_NAME} - one or more type names.</li>
 * </ol>
 *
 * <p>When more than one schema is configured, the number of schemas must equal the number of
 * job input paths; the schema at position {@code i} is used for files that best match the
 * input path at position {@code i}.
 *
 * <p>Some changes introduced in the original code by Mateusz Kobos
 */
public class AvroAsJSONInputFormat extends FileInputFormat<Text, Text> {
    /** Configuration key holding a single schema literal. */
    public static final String SCHEMA_LITERAL = "input.schema.literal";
    /** Configuration key holding one or more schema URLs. */
    public static final String SCHEMA_URL = "input.schema.url";
    /** Configuration key holding one or more type names to load schemas from. */
    public static final String SCHEMA_TYPE_NAME = "eu.dnetlib.iis.avro.input.class";

    /** Loaded schemas; {@code null} until {@link #loadSchemas(JobConf)} has completed successfully. */
    private List<Schema> schemas;
    /** Job input paths as strings, index-aligned with {@link #schemas}; only set when there are several schemas. */
    private String[] inputPaths;

    /**
     * Creates the record reader for the given split, loading the configured schemas on first use.
     *
     * @param split the split to read; it is cast to {@link FileSplit}
     * @param job the job configuration, used to load schemas and handed to the reader
     * @param reporter not used by this method
     * @return a new {@code AvroAsJSONRecordReader} for the split
     * @throws IOException declared for schema loading and reader construction
     * @throws IllegalArgumentException if no schema information is configured, or the number
     *         of schemas does not match the number of input paths
     */
    @Override
    public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
        if (schemas == null) {
            loadSchemas(job);
        }
        FileSplit fileSplit = (FileSplit) split;
        return new AvroAsJSONRecordReader(selectSchema(fileSplit), job, fileSplit);
    }

    /**
     * Picks the schema that applies to the given split.
     *
     * <p>With a single schema there is nothing to choose. Otherwise the split's path is
     * compared with every recorded input path and the one sharing the longest common string
     * prefix wins; on a tie the earliest input path is kept because the comparison is strict.
     */
    private Schema selectSchema(FileSplit fileSplit) {
        if (schemas.size() == 1) {
            return schemas.get(0);
        }
        String current = fileSplit.getPath().toString();
        int index = -1;
        int bestMatchLength = -1;
        for (int i = 0; i < inputPaths.length; i++) {
            int match = Strings.commonPrefix(current, inputPaths[i]).length();
            if (match > bestMatchLength) {
                bestMatchLength = match;
                index = i;
            }
        }
        return schemas.get(index);
    }

    /**
     * Loads the schemas described by the job configuration and records them on this instance.
     *
     * <p>Everything is collected in local variables and the fields are assigned only once
     * loading and validation have succeeded. A failure therefore leaves {@link #schemas}
     * as {@code null}, so a later call retries the load and reports the real configuration
     * problem instead of working on a half-initialised instance.
     *
     * @throws IOException declared for the schema loader calls
     * @throws IllegalArgumentException if no schema information is configured, or the number
     *         of schemas does not match the number of input paths
     */
    private void loadSchemas(JobConf job) throws IOException {
        SchemaLoader loader = new SchemaLoader(job);
        List<Schema> loaded = Lists.newArrayList();
        String[] paths = null;

        String schemaLiteral = job.get(SCHEMA_LITERAL);
        if (schemaLiteral != null) {
            // A literal always describes exactly one schema, so no path tracking is needed.
            loaded.add(loader.loadLiteral(schemaLiteral));
        } else {
            String[] schemaUrls = job.getStrings(SCHEMA_URL);
            String[] typeNames = job.getStrings(SCHEMA_TYPE_NAME);
            if (schemaUrls != null) {
                for (String schemaUrl : schemaUrls) {
                    loaded.add(loader.loadFromUrl(schemaUrl));
                }
            } else if (typeNames != null) {
                for (String typeName : typeNames) {
                    loaded.add(loader.loadFromTypeName(typeName));
                }
            } else {
                throw new IllegalArgumentException("No schema information provided");
            }

            if (loaded.size() > 1) {
                // Several schemas: remember the input paths so each split can be matched to one.
                Path[] inputs = FileInputFormat.getInputPaths(job);
                if (inputs.length != loaded.size()) {
                    throw new IllegalArgumentException(String.format(
                            "Number of input paths (%d) does not match number of schemas specified (%d)",
                            inputs.length, loaded.size()));
                }
                paths = new String[inputs.length];
                for (int i = 0; i < inputs.length; i++) {
                    paths[i] = inputs[i].toString();
                }
            }
        }

        // Publish the paths first: a non-null 'schemas' is what marks loading as complete.
        this.inputPaths = paths;
        this.schemas = loaded;
    }
}