// Java tutorial
/*
 * Copyright 2010 LinkedIn, Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.linkedin.json;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import voldemort.serialization.json.JsonTypeSerializer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * A {@link FileInputFormat} that reads key/value pairs from Hadoop
 * SequenceFiles whose file metadata carries JSON type schemas under the
 * {@code "key.schema"} and {@code "value.schema"} entries, and deserializes
 * each pair into JSON-typed objects via Voldemort's {@link JsonTypeSerializer}.
 *
 * <p>Input listing recurses into subdirectories of each configured input path
 * and skips any file or directory whose name starts with {@code "_"} (Hadoop
 * side files such as {@code _logs} and {@code _SUCCESS}).
 */
public class JsonSequenceFileInputFormat extends FileInputFormat<Object, Object> {
    protected static final Logger log =
            Logger.getLogger(JsonSequenceFileInputFormat.class.getName());

    /** Delegate that performs the raw SequenceFile record reading. */
    private final SequenceFileInputFormat<BytesWritable, BytesWritable> baseInputFormat =
            new SequenceFileInputFormat<BytesWritable, BytesWritable>();

    /**
     * Creates a record reader for the given split by loading the key/value
     * JSON schemas from the split file's SequenceFile metadata and wrapping
     * the base SequenceFile reader with JSON deserialization.
     *
     * @param split   the input split; must be a {@link FileSplit}
     * @param context the task attempt context providing the configuration
     * @return a reader yielding JSON-typed key/value objects
     * @throws IOException if the schemas cannot be loaded from the file metadata
     */
    @Override
    public RecordReader<Object, Object> createRecordReader(final InputSplit split,
                                                           final TaskAttemptContext context)
            throws IOException {
        Configuration conf = context.getConfiguration();
        String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
        log.info("Input file path:" + inputPathString);
        Path inputPath = new Path(inputPathString);

        SequenceFile.Reader reader =
                new SequenceFile.Reader(inputPath.getFileSystem(conf), inputPath, conf);
        try {
            SequenceFile.Metadata meta = reader.getMetadata();
            final Text keySchema = meta.get(new Text("key.schema"));
            final Text valueSchema = meta.get(new Text("value.schema"));

            // FIX: also reject missing metadata entries. Previously a null schema
            // surfaced as an opaque NullPointerException on getLength().
            if (keySchema == null || valueSchema == null
                    || 0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
                throw new IOException(String.format(
                        "Cannot have a 0 length schema. keySchema[%s], valueSchema[%s]",
                        keySchema, valueSchema));
            }

            return new JsonObjectRecordReader(new JsonTypeSerializer(keySchema.toString()),
                                              new JsonTypeSerializer(valueSchema.toString()),
                                              baseInputFormat.createRecordReader(split, context));
        } catch (Exception e) {
            // FIX: chain the underlying exception instead of discarding it, so the
            // real failure is visible in task logs.
            throw new IOException("Failed to Load Schema from file:" + inputPathString + "\n", e);
        } finally {
            // FIX: the metadata reader was previously never closed (resource leak).
            reader.close();
        }
    }

    /**
     * Lists all input files for the job, expanding each path configured under
     * {@code mapred.input.dir} recursively and filtering out "_"-prefixed
     * entries.
     *
     * @param job the job context providing the configuration
     * @return the statuses of every eligible input file
     * @throws IOException on filesystem errors
     */
    @Override
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
        String dirs = job.getConfiguration().get("mapred.input.dir", "");
        String[] list = StringUtils.split(dirs);
        List<FileStatus> status = new ArrayList<FileStatus>();
        for (int i = 0; i < list.length; i++) {
            status.addAll(getAllSubFileStatus(job, new Path(list[i])));
        }
        return status;
    }

    /**
     * Recursively collects the {@link FileStatus} of every file under the given
     * path, skipping files and directories whose name starts with {@code "_"}.
     *
     * @param jobContext       the job context providing the configuration
     * @param filterMemberPath the file or directory to expand
     * @return the statuses of all eligible files under the path
     * @throws IOException on filesystem errors
     */
    private List<FileStatus> getAllSubFileStatus(JobContext jobContext, Path filterMemberPath)
            throws IOException {
        List<FileStatus> list = new ArrayList<FileStatus>();
        FileSystem fs = filterMemberPath.getFileSystem(jobContext.getConfiguration());
        FileStatus[] subFiles = fs.listStatus(filterMemberPath);
        if (null != subFiles) {
            if (fs.getFileStatus(filterMemberPath).isDir()) {
                for (FileStatus subFile : subFiles) {
                    if (!subFile.getPath().getName().startsWith("_")) {
                        list.addAll(getAllSubFileStatus(jobContext, subFile.getPath()));
                    }
                }
            } else {
                // For a plain file, listStatus returns a single-element array
                // holding that file's own status.
                if (subFiles.length > 0 && !subFiles[0].getPath().getName().startsWith("_")) {
                    list.add(subFiles[0]);
                }
            }
        }
        return list;
    }

    /**
     * A {@link RecordReader} that delegates to a SequenceFile reader of
     * {@link BytesWritable} pairs and deserializes each key and value into a
     * JSON-typed object.
     */
    private static class JsonObjectRecordReader extends RecordReader<Object, Object> {
        final JsonTypeSerializer inputKeySerializer;
        final JsonTypeSerializer inputValueSerializer;
        final RecordReader<BytesWritable, BytesWritable> delegateReader;

        /**
         * @param inputKeySerializer   serializer for the key schema
         * @param inputValueSerializer serializer for the value schema
         * @param recordReader         the underlying SequenceFile record reader
         */
        public JsonObjectRecordReader(final JsonTypeSerializer inputKeySerializer,
                                      final JsonTypeSerializer inputValueSerializer,
                                      final RecordReader<BytesWritable, BytesWritable> recordReader) {
            this.delegateReader = recordReader;
            this.inputKeySerializer = inputKeySerializer;
            this.inputValueSerializer = inputValueSerializer;
        }

        @Override
        public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
                throws IOException, InterruptedException {
            delegateReader.initialize(inputSplit, taskAttemptContext);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return delegateReader.nextKeyValue();
        }

        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            // NOTE(review): BytesWritable.getBytes() returns the backing buffer,
            // which may be longer than getLength(). This assumes JsonTypeSerializer
            // ignores trailing bytes — TODO confirm against its implementation.
            return inputKeySerializer.toObject(delegateReader.getCurrentKey().getBytes());
        }

        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            // NOTE(review): same trailing-bytes assumption as getCurrentKey().
            return inputValueSerializer.toObject(delegateReader.getCurrentValue().getBytes());
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return delegateReader.getProgress();
        }

        @Override
        public void close() throws IOException {
            delegateReader.close();
        }
    }
}