Java tutorial
/*
 * Copyright 2008-2009 LinkedIn, Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package voldemort.store.readonly.mr.serialization;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;

import voldemort.store.readonly.mr.utils.HadoopUtils;

/**
 * Extends {@link SequenceFileInputFormat} to support our JSON based
 * serialization format.
 *
 * Reads a SequenceFile, extracts the key and value schemas from its metadata,
 * and saves them in the job configuration under the keys:
 * <ul>
 * <li>"mapper.input.key.schema"</li>
 * <li>"mapper.input.value.schema"</li>
 * </ul>
 *
 * @author bbansal
 */
public class JsonSequenceFileInputFormat extends
        SequenceFileInputFormat<BytesWritable, BytesWritable> {

    protected static final Logger log = Logger.getLogger(JsonSequenceFileInputFormat.class.getName());

    @Override
    public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split,
                                                                      JobConf conf,
                                                                      Reporter reporter) throws IOException {
        String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
        log.info("Input file path:" + inputPathString);
        Path inputPath = new Path(inputPathString);

        // Open the SequenceFile and pull the JSON schemas out of its metadata
        SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf),
                                                             inputPath,
                                                             conf);
        SequenceFile.Metadata meta = reader.getMetadata();

        try {
            Text keySchema = meta.get(new Text("key.schema"));
            Text valueSchema = meta.get(new Text("value.schema"));

            if(0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
                throw new Exception();
            }

            // update JobConf with schemas
            conf.set("mapper.input.key.schema", keySchema.toString());
            conf.set("mapper.input.value.schema", valueSchema.toString());

        } catch(Exception e) {
            throw new IOException("Failed to load schema from file:" + inputPathString + "\n");
        }

        return super.getRecordReader(split, conf, reporter);
    }

    @Override
    protected FileStatus[] listStatus(JobConf job) throws IOException {
        String dirs = job.get("mapred.input.dir", "");
        String[] list = StringUtils.split(dirs);

        List<FileStatus> status = new ArrayList<FileStatus>();
        for(int i = 0; i < list.length; i++) {
            status.addAll(getAllSubFileStatus(job, new Path(list[i])));
        }

        return status.toArray(new FileStatus[0]);
    }

    /**
     * Recursively collects the status of all files under the given path,
     * skipping any path that {@link HadoopUtils#shouldPathBeIgnored} flags.
     */
    private List<FileStatus> getAllSubFileStatus(JobConf inputConf, Path filterMemberPath)
            throws IOException {
        List<FileStatus> list = new ArrayList<FileStatus>();

        FileSystem fs = filterMemberPath.getFileSystem(inputConf);
        FileStatus[] subFiles = fs.listStatus(filterMemberPath);

        if(null != subFiles) {
            if(fs.isDirectory(filterMemberPath)) {
                for(FileStatus subFile: subFiles) {
                    if(!HadoopUtils.shouldPathBeIgnored(subFile.getPath())) {
                        list.addAll(getAllSubFileStatus(inputConf, subFile.getPath()));
                    }
                }
            } else {
                if(subFiles.length > 0
                   && !HadoopUtils.shouldPathBeIgnored(subFiles[0].getPath())) {
                    list.add(subFiles[0]);
                }
            }
        }

        return list;
    }
}
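For context, this input format is meant to be wired into an old-style mapred job: because getRecordReader() runs on the task side before the mapper is configured, the schema strings it copies into the JobConf are visible to the mapper's configure() method. The sketch below is a minimal, hypothetical driver plus pass-through mapper illustrating that flow; the class names SchemaAwareJobDriver and PassThroughMapper, the job name, and the argument-based input/output paths are illustrative assumptions, not part of Voldemort, and the job presumes the input SequenceFiles carry non-empty "key.schema" and "value.schema" entries in their metadata.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.IdentityReducer;

import voldemort.store.readonly.mr.serialization.JsonSequenceFileInputFormat;

public class SchemaAwareJobDriver {

    // Hypothetical mapper: picks up the schema strings that
    // JsonSequenceFileInputFormat.getRecordReader() copied into the JobConf.
    public static class PassThroughMapper extends MapReduceBase
            implements Mapper<BytesWritable, BytesWritable, BytesWritable, BytesWritable> {

        private String keySchema;
        private String valueSchema;

        @Override
        public void configure(JobConf conf) {
            // These keys are set by JsonSequenceFileInputFormat before records flow
            keySchema = conf.get("mapper.input.key.schema");
            valueSchema = conf.get("mapper.input.value.schema");
        }

        @Override
        public void map(BytesWritable key,
                        BytesWritable value,
                        OutputCollector<BytesWritable, BytesWritable> output,
                        Reporter reporter) throws IOException {
            // Pass records through unchanged; a real job would deserialize the
            // raw bytes using the JSON schemas captured in configure().
            output.collect(key, value);
        }
    }

    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(SchemaAwareJobDriver.class);
        conf.setJobName("json-sequence-file-example"); // illustrative name

        // Use the JSON-aware input format so the schemas are read from the
        // SequenceFile metadata before mapping starts.
        conf.setInputFormat(JsonSequenceFileInputFormat.class);
        conf.setMapperClass(PassThroughMapper.class);
        conf.setReducerClass(IdentityReducer.class);

        conf.setOutputKeyClass(BytesWritable.class);
        conf.setOutputValueClass(BytesWritable.class);

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

Setting the schemas on the JobConf inside getRecordReader(), rather than in the driver, keeps the schema discovery next to the file that actually carries it, so the driver above needs no schema configuration of its own.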