com.linkedin.json.JsonSequenceFileInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.json.JsonSequenceFileInputFormat.java

Source

/*
 * Copyright 2010 LinkedIn, Inc
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.linkedin.json;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;
import voldemort.serialization.json.JsonTypeSerializer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 *
 */
public class JsonSequenceFileInputFormat extends FileInputFormat<Object, Object> {
    protected static final Logger log = Logger.getLogger(JsonSequenceFileInputFormat.class.getName());

    private final SequenceFileInputFormat<BytesWritable, BytesWritable> baseInputFormat = new SequenceFileInputFormat<BytesWritable, BytesWritable>();

    @Override
    public RecordReader<Object, Object> createRecordReader(final InputSplit split, final TaskAttemptContext context)
            throws IOException {
        Configuration conf = context.getConfiguration();

        String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
        log.info("Input file path:" + inputPathString);
        Path inputPath = new Path(inputPathString);

        SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf), inputPath, conf);
        SequenceFile.Metadata meta = reader.getMetadata();

        try {
            final Text keySchema = meta.get(new Text("key.schema"));
            final Text valueSchema = meta.get(new Text("value.schema"));

            if (0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
                throw new Exception(String.format("Cannot have a 0 length schema. keySchema[%s], valueSchema[%s]",
                        keySchema, valueSchema));
            }

            return new JsonObjectRecordReader(new JsonTypeSerializer(keySchema.toString()),
                    new JsonTypeSerializer(valueSchema.toString()),
                    baseInputFormat.createRecordReader(split, context));
        } catch (Exception e) {
            throw new IOException("Failed to Load Schema from file:" + inputPathString + "\n");
        }
    }

    @Override
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
        String dirs = job.getConfiguration().get("mapred.input.dir", "");
        String[] list = StringUtils.split(dirs);

        List<FileStatus> status = new ArrayList<FileStatus>();
        for (int i = 0; i < list.length; i++) {
            status.addAll(getAllSubFileStatus(job, new Path(list[i])));
        }

        return status;
    }

    private List<FileStatus> getAllSubFileStatus(JobContext jobContext, Path filterMemberPath) throws IOException {
        List<FileStatus> list = new ArrayList<FileStatus>();

        FileSystem fs = filterMemberPath.getFileSystem(jobContext.getConfiguration());
        FileStatus[] subFiles = fs.listStatus(filterMemberPath);

        if (null != subFiles) {
            if (fs.getFileStatus(filterMemberPath).isDir()) {
                for (FileStatus subFile : subFiles) {
                    if (!subFile.getPath().getName().startsWith("_")) {
                        list.addAll(getAllSubFileStatus(jobContext, subFile.getPath()));
                    }
                }
            } else {
                if (subFiles.length > 0 && !subFiles[0].getPath().getName().startsWith("_")) {
                    list.add(subFiles[0]);
                }
            }
        }

        return list;
    }

    private static class JsonObjectRecordReader extends RecordReader<Object, Object> {
        final JsonTypeSerializer inputKeySerializer;
        final JsonTypeSerializer inputValueSerializer;
        final RecordReader<BytesWritable, BytesWritable> delegateReader;

        public JsonObjectRecordReader(final JsonTypeSerializer inputKeySerializer,
                final JsonTypeSerializer inputValueSerializer,
                final RecordReader<BytesWritable, BytesWritable> recordReader) {
            this.delegateReader = recordReader;
            this.inputKeySerializer = inputKeySerializer;
            this.inputValueSerializer = inputValueSerializer;
        }

        @Override
        public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
                throws IOException, InterruptedException {
            delegateReader.initialize(inputSplit, taskAttemptContext);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return delegateReader.nextKeyValue();
        }

        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            return inputKeySerializer.toObject(delegateReader.getCurrentKey().getBytes());
        }

        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            return inputValueSerializer.toObject(delegateReader.getCurrentValue().getBytes());
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return delegateReader.getProgress();
        }

        @Override
        public void close() throws IOException {
            delegateReader.close();
        }
    }
}