// Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ca.sparkera.adapters.mapreduce;

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;

/**
 * A reader for variable-length records in a split. Every record is preceded by
 * a 4-byte record descriptor word (RDW); the first two RDW bytes hold, as a
 * big-endian unsigned value, the record length including the RDW itself, and
 * the last two RDW bytes are ignored by this reader.
 *
 * <p>The key is the stream offset of the first data byte of the record (the
 * byte after its RDW) and the value holds the record's data bytes without the
 * RDW.
 *
 * <p>NOTE(review): the class name suggests the mainframe "VB" record layout;
 * confirm against the producer of the input files.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class MainframeVBRecordReader extends RecordReader<LongWritable, BytesWritable> {

  private static final Log LOG = LogFactory.getLog(MainframeVBRecordReader.class);

  /** Number of bytes in the record descriptor word preceding each record. */
  private static final int RDW_LENGTH = 4;

  /** Data length (RDW excluded) of the record most recently announced by an RDW. */
  private int recordLength;
  private long start;
  /** Offset of the next byte to be consumed, counted in (uncompressed) stream bytes. */
  private long pos;
  private long end;
  private long numBytesRemainingInSplit;
  private FSDataInputStream fileIn;
  private Seekable filePosition;
  private LongWritable key;
  private BytesWritable value;
  private boolean isCompressedInput;
  private Decompressor decompressor;
  private InputStream inputStream;

  public MainframeVBRecordReader() {
    super();
  }

  /**
   * Initializes the reader from a {@link FileSplit}.
   *
   * @param genericSplit the split to read; must be a {@link FileSplit}
   * @param context task context supplying the job configuration
   * @throws IOException if the file cannot be opened or positioned
   */
  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext context)
      throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    final Path file = split.getPath();
    initialize(job, split.getStart(), split.getLength(), file);
  }

  /**
   * Opens {@code file} and positions the reader at the start of the split.
   *
   * <p>This is also called from the old FixedLengthRecordReader API
   * implementation.
   *
   * @param job configuration used to resolve the file system and codec
   * @param splitStart byte offset at which the split begins
   * @param splitLength number of bytes in the split
   * @param file the file to read
   * @throws IOException if the file cannot be opened or positioned
   */
  public void initialize(Configuration job, long splitStart, long splitLength, Path file)
      throws IOException {
    start = splitStart;
    end = start + splitLength;
    LOG.info("Start of the split:" + start + "-End of split:" + end);
    LOG.debug("VLR initialize started: start pos:" + start + "endpos:" + end);

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
      isCompressedInput = true;
      decompressor = CodecPool.getDecompressor(codec);
      CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
      filePosition = cIn;
      inputStream = cIn;
      // The uncompressed size is unknown, so read until the stream reports
      // EOF. Leaving this at 0 would make nextKeyValue() return false on the
      // very first call and no record would ever be produced.
      numBytesRemainingInSplit = Long.MAX_VALUE;
      LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
      fileIn.seek(start);
      filePosition = fileIn;
      inputStream = fileIn;
      numBytesRemainingInSplit = splitLength;
      LOG.info("Variable length input; cannot compute number of records in the split");
    }
    this.pos = start;
  }

  /**
   * Decodes the first two bytes of {@code b} as a big-endian unsigned 16-bit
   * integer. Despite the name, no hexadecimal text is involved.
   *
   * @param b array holding at least two bytes; further bytes are ignored
   * @return a value in the range 0..65535
   */
  public static int hex2decimal(byte[] b) {
    return ((b[0] & 0xFF) << 8) | (b[1] & 0xFF);
  }

  /**
   * @return the data length (RDW excluded) derived from the most recently read
   *     RDW; 0 before any record has been read
   */
  public int getRecordLength() {
    return recordLength;
  }

  /**
   * Reads the next RDW and the record that follows it.
   *
   * @return {@code true} if a complete record was read into the current value;
   *     {@code false} when the split or the input is exhausted, or when an RDW
   *     announces a record without data bytes (treated as end of input)
   * @throws IOException if the input ends in the middle of an RDW or of a
   *     record, or if the underlying stream fails
   */
  @Override
  public synchronized boolean nextKeyValue() throws IOException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("VLR nextKey value started: pos" + pos + "leninit" + recordLength);
    }
    if (key == null) {
      key = new LongWritable();
    }
    if (value == null) {
      value = new BytesWritable();
    }

    if (numBytesRemainingInSplit <= 0) {
      return false;
    }
    if (!readRecordDescriptor()) {
      numBytesRemainingInSplit = 0L; // End of input.
      return false;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("VLR nextKey record length" + recordLength + ":pos:" + pos);
    }
    if (recordLength <= 0) {
      // An RDW that announces no data (or fewer than RDW_LENGTH bytes in
      // total) ends the input, without ever sizing a buffer with a negative
      // length.
      numBytesRemainingInSplit = 0L;
      return false;
    }

    value.setSize(recordLength);
    if (numBytesRemainingInSplit <= 0) {
      // The split ended within the RDW; there is no record body to read.
      return false;
    }

    key.set(pos);
    byte[] record = value.getBytes();
    int filled = 0;
    while (filled < recordLength) {
      int n = inputStream.read(record, filled, recordLength - filled);
      if (n == -1) {
        break; // EOF
      }
      filled += n;
    }
    pos += filled;

    if (filled == 0) {
      numBytesRemainingInSplit = 0L; // End of input.
      return false;
    }
    if (filled < recordLength) {
      throw new IOException(
          "Partial record(length = " + filled + ") found at the end of split.");
    }
    if (!isCompressedInput) {
      numBytesRemainingInSplit -= filled;
    }
    return true;
  }

  /**
   * Reads one RDW from the stream, then updates {@code recordLength},
   * {@code pos} and, for uncompressed input, {@code numBytesRemainingInSplit}.
   *
   * @return {@code false} if the stream was already at EOF, {@code true} if a
   *     full RDW was consumed
   * @throws IOException if EOF is hit after only part of the RDW was read
   */
  private boolean readRecordDescriptor() throws IOException {
    byte[] rdw = new byte[RDW_LENGTH];
    int filled = 0;
    while (filled < RDW_LENGTH) {
      int n = inputStream.read(rdw, filled, RDW_LENGTH - filled);
      if (n == -1) {
        if (filled == 0) {
          return false; // clean EOF on a record boundary
        }
        throw new IOException("Error Reading RDW at pos = " + pos);
      }
      filled += n;
    }
    // Account for the whole RDW, not just the size of the last read() call,
    // so that short reads cannot skew the offsets reported as keys.
    pos += RDW_LENGTH;
    if (!isCompressedInput) {
      numBytesRemainingInSplit -= RDW_LENGTH;
    }
    recordLength = hex2decimal(rdw) - RDW_LENGTH;
    return true;
  }

  @Override
  public LongWritable getCurrentKey() {
    return key;
  }

  @Override
  public BytesWritable getCurrentValue() {
    return value;
  }

  /**
   * @return the fraction of the split consumed so far, capped at 1.0; 0.0 for
   *     an empty split
   */
  @Override
  public synchronized float getProgress() throws IOException {
    if (start == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start));
    }
  }

  /** Closes the input stream and returns the decompressor, if any, to the pool. */
  @Override
  public synchronized void close() throws IOException {
    try {
      if (inputStream != null) {
        inputStream.close();
        inputStream = null;
      }
    } finally {
      if (decompressor != null) {
        CodecPool.returnDecompressor(decompressor);
        decompressor = null;
      }
    }
  }

  // This is called from the old FixedLengthRecordReader API implementation.
  public long getPos() {
    return pos;
  }

  /**
   * Returns the position used for progress reporting: the position reported by
   * the compression stream for compressed input, otherwise the number of bytes
   * this reader has consumed.
   */
  private long getFilePosition() throws IOException {
    if (isCompressedInput && null != filePosition) {
      return filePosition.getPos();
    }
    return pos;
  }
}