Java tutorial: AvroStorage, Cubert's Storage implementation for Avro
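The class below is AvroStorage from LinkedIn's Apache-licensed Cubert project. It implements Cubert's Storage interface for Avro data: prepareInput publishes the input's Avro schema in the job configuration and selects the input format, prepareOutput derives an output Avro schema from the Cubert BlockSchema, and the remaining methods hand back the Avro-specific tuple, block, tee, and cached-file components.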
/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.io.avro;

import java.io.IOException;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.BlockWriter;
import com.linkedin.cubert.block.TupleCreator;
import com.linkedin.cubert.io.CachedFileReader;
import com.linkedin.cubert.io.Storage;
import com.linkedin.cubert.io.TeeWriter;
import com.linkedin.cubert.operator.PostCondition;
import com.linkedin.cubert.utils.AvroUtils;

public class AvroStorage implements Storage
{
    @Override
    public void prepareInput(Job job,
                             Configuration conf,
                             JsonNode params,
                             List<Path> paths) throws IOException
    {
        // Read the Avro schema from the first input path.
        Schema avroSchema = AvroUtils.getSchema(conf, paths.get(0));

        // set the schema for this index
        conf.set("cubert.avro.input.schema", avroSchema.toString());

        // Honor an optional "unsplittable" flag from the operator params.
        if (params.has("unsplittable")
                && Boolean.parseBoolean(params.get("unsplittable").getTextValue()))
            conf.set("cubert.avro.input.unsplittable", "true");
        else
            conf.set("cubert.avro.input.unsplittable", "false");

        job.setInputFormatClass(PigAvroInputFormatAdaptor.class);
    }

    @Override
    public void prepareOutput(Job job,
                              Configuration conf,
                              JsonNode params,
                              BlockSchema schema,
                              Path path)
    {
        Schema avroSchema = null;

        // we can specify the avro schema of the output directly, instead of deriving
        // it from the BlockSchema
        // if (json.has("avroSchema"))
        // {
        //     avroSchema =
        //         new org.apache.avro.Schema.Parser().parse(JsonUtils.getText(json,
        //                                                                     "avroSchema"));
        // }
        // else
        // {
        //     schema = new BlockSchema(json.get("schema"));

        // Derive the output Avro schema from the BlockSchema and publish it.
        avroSchema = AvroUtils.convertFromBlockSchema("record", schema);
        conf.set("cubert.avro.output.schema", avroSchema.toString());

        job.setOutputFormatClass(PigAvroOutputFormatAdaptor.class);

        // AvroJob.setOutputKeySchema(job, avroSchema);
        // AvroJob.setOutputValueSchema(job, Schema.create(Type.NULL));
        // job.setOutputFormatClass(AvroKeyOutputFormat.class);
    }

    @Override
    public TupleCreator getTupleCreator()
    {
        return new AvroTupleCreator();
    }

    @Override
    public BlockWriter getBlockWriter()
    {
        return new AvroBlockWriter();
    }

    @Override
    public TeeWriter getTeeWriter()
    {
        return new AvroTeeWriter();
    }

    @Override
    public CachedFileReader getCachedFileReader()
    {
        return new AvroFileReader();
    }

    @Override
    public PostCondition getPostCondition(Configuration conf, JsonNode json, Path path)
            throws IOException
    {
        // Expose the schema of the data at 'path' as a Cubert PostCondition,
        // reconstructed from the file's Avro schema.
        Schema avroSchema = AvroUtils.getSchema(conf, path);
        BlockSchema schema = AvroUtils.convertToBlockSchema(avroSchema);
        return new PostCondition(schema, null, null);
    }
}
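To make the flow concrete, here is a minimal sketch of how a job driver might invoke prepareInput. It is not part of the Cubert source: the class name AvroStorageExample, the HDFS path /data/input.avro, and the params JSON are illustrative assumptions, and running it requires that path to hold a real Avro file so that AvroUtils.getSchema can read a schema from it.

import java.io.IOException;
import java.util.Collections;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import com.linkedin.cubert.io.avro.AvroStorage;

public class AvroStorageExample
{
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException
    {
        Job job = Job.getInstance();

        // Operator parameters as they might appear in a Cubert script;
        // here we mark the input as unsplittable (illustrative value).
        JsonNode params = new ObjectMapper().readTree("{\"unsplittable\": \"true\"}");

        // Assumes /data/input.avro exists on the cluster and is a valid Avro file.
        AvroStorage storage = new AvroStorage();
        storage.prepareInput(job,
                             job.getConfiguration(),
                             params,
                             Collections.singletonList(new Path("/data/input.avro")));

        // The job is now configured to read Avro records:
        // cubert.avro.input.schema and cubert.avro.input.unsplittable are set,
        // and the input format is PigAvroInputFormatAdaptor.
    }
}

The output side mirrors this: prepareOutput converts the job's BlockSchema into an Avro record schema with AvroUtils.convertFromBlockSchema, stores it under cubert.avro.output.schema, and installs PigAvroOutputFormatAdaptor as the output format.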