Java tutorial: AvroStorage, Cubert's Storage implementation for Avro
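The class below is AvroStorage from LinkedIn's Apache-licensed Cubert project. It implements Cubert's Storage interface for Avro data: prepareInput publishes the input's Avro schema in the job configuration and selects the input format, prepareOutput derives an output Avro schema from the Cubert BlockSchema, and the remaining methods hand back the Avro-specific tuple, block, tee, and cached-file components.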
/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.io.avro;

import java.io.IOException;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.BlockWriter;
import com.linkedin.cubert.block.TupleCreator;
import com.linkedin.cubert.io.CachedFileReader;
import com.linkedin.cubert.io.Storage;
import com.linkedin.cubert.io.TeeWriter;
import com.linkedin.cubert.operator.PostCondition;
import com.linkedin.cubert.utils.AvroUtils;

public class AvroStorage implements Storage
{
    @Override
    public void prepareInput(Job job,
                             Configuration conf,
                             JsonNode params,
                             List<Path> paths) throws IOException
    {
        // Read the Avro schema from the first input path.
        Schema avroSchema = AvroUtils.getSchema(conf, paths.get(0));

        // set the schema for this index
        conf.set("cubert.avro.input.schema", avroSchema.toString());

        // Honor an optional "unsplittable" flag from the operator params.
        if (params.has("unsplittable")
                && Boolean.parseBoolean(params.get("unsplittable").getTextValue()))
            conf.set("cubert.avro.input.unsplittable", "true");
        else
            conf.set("cubert.avro.input.unsplittable", "false");

        job.setInputFormatClass(PigAvroInputFormatAdaptor.class);
    }

    @Override
    public void prepareOutput(Job job,
                              Configuration conf,
                              JsonNode params,
                              BlockSchema schema,
                              Path path)
    {
        Schema avroSchema = null;

        // we can specify the avro schema of the output directly, instead of deriving
        // it from the BlockSchema
        // if (json.has("avroSchema"))
        // {
        //     avroSchema =
        //         new org.apache.avro.Schema.Parser().parse(JsonUtils.getText(json,
        //                                                                     "avroSchema"));
        // }
        // else
        // {
        //     schema = new BlockSchema(json.get("schema"));

        // Derive the output Avro schema from the BlockSchema and publish it.
        avroSchema = AvroUtils.convertFromBlockSchema("record", schema);
        conf.set("cubert.avro.output.schema", avroSchema.toString());

        job.setOutputFormatClass(PigAvroOutputFormatAdaptor.class);

        // AvroJob.setOutputKeySchema(job, avroSchema);
        // AvroJob.setOutputValueSchema(job, Schema.create(Type.NULL));
        // job.setOutputFormatClass(AvroKeyOutputFormat.class);
    }

    @Override
    public TupleCreator getTupleCreator()
    {
        return new AvroTupleCreator();
    }

    @Override
    public BlockWriter getBlockWriter()
    {
        return new AvroBlockWriter();
    }

    @Override
    public TeeWriter getTeeWriter()
    {
        return new AvroTeeWriter();
    }

    @Override
    public CachedFileReader getCachedFileReader()
    {
        return new AvroFileReader();
    }

    @Override
    public PostCondition getPostCondition(Configuration conf, JsonNode json, Path path)
            throws IOException
    {
        // Expose the schema of the data at 'path' as a Cubert PostCondition,
        // reconstructed from the file's Avro schema.
        Schema avroSchema = AvroUtils.getSchema(conf, path);
        BlockSchema schema = AvroUtils.convertToBlockSchema(avroSchema);
        return new PostCondition(schema, null, null);
    }
}
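To make the flow concrete, here is a minimal sketch of how a job driver might invoke prepareInput. It is not part of the Cubert source: the class name AvroStorageExample, the HDFS path /data/input.avro, and the params JSON are illustrative assumptions, and running it requires that path to hold a real Avro file so that AvroUtils.getSchema can read a schema from it.

import java.io.IOException;
import java.util.Collections;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import com.linkedin.cubert.io.avro.AvroStorage;

public class AvroStorageExample
{
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException
    {
        Job job = Job.getInstance();

        // Operator parameters as they might appear in a Cubert script;
        // here we mark the input as unsplittable (illustrative value).
        JsonNode params = new ObjectMapper().readTree("{\"unsplittable\": \"true\"}");

        // Assumes /data/input.avro exists on the cluster and is a valid Avro file.
        AvroStorage storage = new AvroStorage();
        storage.prepareInput(job,
                             job.getConfiguration(),
                             params,
                             Collections.singletonList(new Path("/data/input.avro")));

        // The job is now configured to read Avro records:
        // cubert.avro.input.schema and cubert.avro.input.unsplittable are set,
        // and the input format is PigAvroInputFormatAdaptor.
    }
}

The output side mirrors this: prepareOutput converts the job's BlockSchema into an Avro record schema with AvroUtils.convertFromBlockSchema, stores it under cubert.avro.output.schema, and installs PigAvroOutputFormatAdaptor as the output format.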