com.ery.hadoop.mrddx.file.LineRecordReader.java Source code

Introduction

Here is the source code for com.ery.hadoop.mrddx.file.LineRecordReader.java
Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.ery.hadoop.mrddx.file;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.zip.ZipInputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.tools.tar.TarInputStream;

import com.ery.hadoop.mrddx.MRConfiguration;

/**
 * Treats keys as offset in file and value as line.
 */
public class LineRecordReader extends RecordReader<LongWritable, Text> {
    private static final Log LOG = LogFactory.getLog(LineRecordReader.class.getName());

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private long totalend;
    private long finishLen;
    private LineReader in;
    int maxLineLength;
    private Seekable filePosition;
    private CompressionCodec codec;
    private Decompressor decompressor;
    private FileSplit split;
    private Configuration job;
    private LongWritable key = null;
    private Text value = null;
    int perFileSkipRowNum;
    private String fileEncodeing;

    /**
     * A class that provides a line reader from an input stream.
     * 
     * @deprecated Use {@link org.apache.hadoop.util.LineReader} instead.
     */
    @Deprecated
    public static class LineReader extends LineReaders {

        LineReader(InputStream in, int skipNum) {
            super(in, LineReaders.DEFAULT_BUFFER_SIZE, skipNum);
        }

        LineReader(InputStream in, int bufferSize, int skipNum) {
            super(in, bufferSize, skipNum);
        }

        public LineReader(InputStream in, Configuration conf) throws IOException {
            super(in, conf);
        }
    }

    public LineRecordReader(Configuration job, FileSplit split) throws IOException {
        this.perFileSkipRowNum = job.getInt(FileConfiguration.INPUT_FILE_SKIP_ROWNUM, 0);
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
        this.job = job;
        this.split = split;
        for (long l : split.getLengths()) {
            totalend += l;
        }
        this.fileEncodeing = job
                .get(MRConfiguration.FILE_CONTENT_ENCODING, MRConfiguration.FILE_CONTENT_ENCODING_DEFAULT)
                .toLowerCase();
        if (this.fileEncodeing.equals("")) {
            this.fileEncodeing = "utf-8";
        }
        this.split.setFileIndex(0);
        this.openFile();
    }

    private boolean isCompressedInput() {
        return (codec != null);
    }

    private int maxBytesToConsume(long pos) {
        return isCompressedInput() ? Integer.MAX_VALUE : (int) Math.min(Integer.MAX_VALUE, end - pos);
    }

    private long getFilePosition() throws IOException {
        long retVal;
        if (isCompressedInput() && null != filePosition) {
            retVal = filePosition.getPos();
        } else {
            retVal = pos;
        }
        return retVal;
    }

    public LineRecordReader(InputStream in, long offset, long endOffset, int maxLineLength, int skipNum) {
        this.maxLineLength = maxLineLength;
        this.perFileSkipRowNum = skipNum;
        this.in = new LineReader(in, perFileSkipRowNum);
        this.start = offset;
        this.pos = offset;
        this.end = endOffset;
        this.filePosition = null;
        this.fileEncodeing = job
                .get(MRConfiguration.FILE_CONTENT_ENCODING, MRConfiguration.FILE_CONTENT_ENCODING_DEFAULT)
                .toLowerCase();
        if (this.fileEncodeing.equals("")) {
            this.fileEncodeing = "utf-8";
        }
    }

    public LineRecordReader(InputStream in, long offset, long endOffset, Configuration job) throws IOException {
        this.job = job;
        this.perFileSkipRowNum = job.getInt(FileConfiguration.INPUT_FILE_SKIP_ROWNUM, 0);
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
        this.in = new LineReader(in, job);
        this.start = offset;
        this.pos = offset;
        this.end = endOffset;
        this.filePosition = null;
    }

    public LongWritable createKey() {
        return new LongWritable();
    }

    public Text createValue() {
        return new Text();
    }

    /** Read a line. */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // TODO Auto-generated method stub
        // public synchronized boolean next(LongWritable key, Text value) throws
        // IOException {
        if (key == null) {
            key = new LongWritable();
        }
        if (value == null) {
            value = createValue();
        }
        // We always read one extra line, which lies outside the upper
        // split limit i.e. (end - 1)
        do {
            while (getFilePosition() <= end) {
                key.set(pos);
                int newSize = in.readLine(value, maxLineLength, Math.max(maxBytesToConsume(pos), maxLineLength));
                if (newSize <= 0) {
                    if (getNextFile()) {
                        continue;
                        // return nextKeyValue();
                    } else {
                        return false;
                    }
                }
                pos += newSize;
                if (newSize < maxLineLength) {
                    return true;
                }
                // line too long. try again
                LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
            }
        } while (getNextFile());
        return false;
    }

    public String getFilePath() {
        return split.getPath().getName();
    }

    public String getContextFileName() {
        if (in.getInputStream() instanceof TarInputStream || in.getInputStream() instanceof ZipInputStream) {
            return in.getContextFileName();
        }
        return split.getPath().getName();
    }

    void openFile() throws IOException {
        start = split.getStart();
        end = start + split.getLength();
        final Path file = split.getPath();
        LOG.info("split.getFileIndex=" + split.getFileIndex() + ",file.path=" + file.toString() + " fileEncodeing="
                + fileEncodeing + " " + split.getStart() + ":" + split.getLength());
        // open the file and seek to the start of the split
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(split.getPath());

        compressionCodecs = new CompressionCodecFactory(job);
        codec = compressionCodecs.getCodec(file);
        if (file.getName().endsWith(".zip")) {
            LOG.info("use ZipInputStream read file " + split.getPath());
            ZipInputStream zin = new ZipInputStream(fileIn, Charset.forName(fileEncodeing));
            in = new LineReader(zin, job);
            filePosition = fileIn;
            codec = new GzipCodec();
            return;
        }
        if (isCompressedInput()) {
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                        fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
                // tar.gzTarInputStream
                // new TarInputStream(codec.createInputStream(fileIn,
                // decompressor)
                String filename = file.getName();
                if (filename.endsWith(".tar.gz")) {
                    in = new LineReader(new TarInputStream(cIn), job);
                } else {
                    in = new LineReader(cIn, job);
                }
                start = cIn.getAdjustedStart();
                end = cIn.getAdjustedEnd();
                filePosition = cIn; // take pos from compressed stream
            } else {
                String filename = file.getName();
                if (filename.endsWith(".tar.gz") || filename.endsWith(".tar")) {
                    in = new LineReader(new TarInputStream(codec.createInputStream(fileIn, decompressor)), job);
                } else {
                    in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
                }
                filePosition = fileIn;
            }
        } else {
            fileIn.seek(start);
            String filename = file.getName();
            if (filename.endsWith(".tar")) {
                in = new LineReader(new TarInputStream(fileIn), job);
            } else {
                in = new LineReader(fileIn, job);
            }

            filePosition = fileIn;
        }
        // If this is not the first split, we always throw away first record
        // because we always (except the last split) read one extra line in
        // next() method.
        if (start != 0) {
            start += in.readLine(new Text(), 0, maxBytesToConsume(start));
        }
        this.pos = start;
    }

    private boolean getNextFile() throws IOException {
        if (split.getFileIndex() + 1 >= split.getFileMaxIndex()) {// ?
            LOG.info("split.getFileIndex=" + split.getFileIndex() + ",totalFiles=" + split.getFileMaxIndex());
            return false;
        }

        finishLen += split.getLength();
        split.setFileIndex(split.getFileIndex() + 1);
        this.openFile();
        return true;
    }

    /**
     * Get the progress within the split
     */
    public float getProgress() throws IOException {
        if (start == totalend) {
            return 0.0f;
        } else {
            return Math.min(1.0f, ((getFilePosition() - start) + finishLen) / (float) totalend);
        }
    }

    public synchronized long getPos() throws IOException {
        return pos;
    }

    public synchronized void close() throws IOException {
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            if (decompressor != null) {
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // TODO Auto-generated method stub
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        // TODO Auto-generated method stub
        return this.key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        // TODO Auto-generated method stub
        return this.value;
    }
}