org.apache.hadoop.mapred.SequenceFileRecordReader.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.mapred.SequenceFileRecordReader.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapred;

import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.SequenceFile.ValueBytes;
import org.apache.hadoop.util.ReflectionUtils;

import skewtune.mapreduce.SkewTuneJobConfig;
import skewtune.mapreduce.lib.input.RecordLength;

/** A {@link RecordReader} for {@link SequenceFile}s. */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class SequenceFileRecordReader<K, V> implements SplittableRecordReader<K, V>, ScannableReader<K, V> {
    private static final Log LOG = LogFactory.getLog(SequenceFileRecordReader.class.getName());

    /** Underlying sequence file reader, opened by the constructor and released by {@link #close()}. */
    private SequenceFile.Reader in;
    /** Reader position recorded by the constructor once the initial sync / record skip is done. */
    private long start;
    /** Split start offset plus split length, i.e. the first offset past this split. */
    private long end;
    /** Cleared by the read and scan methods once they decide that no further record should be returned. */
    private boolean more = true;
    /** Job configuration; also used to instantiate key and value objects. */
    protected Configuration conf;

    // SKEWREDUCE
    // monitoring skew
    /** Reader position sampled just before the most recent record was read. */
    private long pos; // current position
    /** Value of {@code split.getLength()} captured by the constructor. */
    private long len;
    /**
     * Smallest split-length / record-length ratio reported so far; seeded from
     * {@link SkewTuneJobConfig#MAP_NUM_SPLITS_ATTR} by the constructor.
     */
    private float minFactor;
    /** The split this reader was created for. */
    private FileSplit split;
    /** Set by {@code tellAndStop}; once set, {@code next} stops as soon as the reader reports a sync mark. */
    private boolean stopped;

    /** Value of {@link #pos} for the most recent record returned by {@code next} for which the reader reported a sync mark. */
    private long lastSyncPos;
    /** Number of records returned by {@code next} since the record tracked by {@link #lastSyncPos}. */
    private int recSinceSync;

    /**
     * Opens the sequence file behind {@code split} and positions the reader on
     * the first record to be returned.
     *
     * <p>If the {@link RelativeOffset} built from {@code conf} asks to override
     * the split start, the reader syncs to that offset and skips the configured
     * number of records; otherwise it syncs to the split start. In both cases
     * the sync is only performed when the target lies beyond the reader's
     * current position.
     *
     * <p>If any step after opening the file fails, the reader is closed before
     * the exception propagates so the underlying stream is not leaked.
     *
     * @param conf  job configuration
     * @param split file split to read
     * @throws IOException if the file cannot be opened or positioned
     */
    public SequenceFileRecordReader(Configuration conf, FileSplit split) throws IOException {
        Path path = split.getPath();
        FileSystem fs = path.getFileSystem(conf);
        this.in = new SequenceFile.Reader(fs, path, conf);

        boolean initialized = false;
        try {
            this.end = split.getStart() + split.getLength();
            this.conf = conf;

            RelativeOffset ro = new RelativeOffset(conf);

            if (ro.shouldOverride(conf, split.getStart())) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("override starting offset using relative offset = " + ro);
                }
                if (ro.getOffset() > in.getPosition()) {
                    in.sync(ro.getOffset());
                    // now skip over
                    more = skipRecs(ro.getRecOffset());
                }
            } else if (split.getStart() > in.getPosition()) {
                in.sync(split.getStart()); // sync to start
            }

            this.start = in.getPosition();
            // Keep a "no more records" verdict from skipRecs() instead of overwriting it.
            more = more && start < end;

            this.len = split.getLength();
            minFactor = conf.getFloat(SkewTuneJobConfig.MAP_NUM_SPLITS_ATTR, 0.0f); // by default, never report to tracker

            this.split = split;

            if (LOG.isDebugEnabled()) {
                LOG.debug("start to reading input stream at " + this.start);
            }
            initialized = true;
        } finally {
            if (!initialized) {
                // Construction failed: nobody else holds a reference that could close the reader.
                IOUtils.closeStream(in);
            }
        }
    }

    /**
     * Reads and discards {@code n + 1} raw records from the current reader position.
     *
     * <p>NOTE(review): the inclusive bound presumably mirrors the zero-based
     * {@code recSinceSync} counter maintained by {@code next} — confirm
     * against {@code RelativeOffset}.
     *
     * @param n record offset relative to the sync point the reader was positioned on
     * @return {@code false} if the end of the file was reached while skipping,
     *         {@code true} otherwise
     * @throws IOException if reading a record fails
     */
    private boolean skipRecs(int n) throws IOException {
        DataOutputBuffer rawKey = new DataOutputBuffer();
        ValueBytes rawVal = in.createValueBytes();
        for (int i = 0; i <= n; ++i) {
            // nextRaw() appends to the key buffer; reset it so skipped keys do not pile up in memory.
            rawKey.reset();
            if (in.nextRaw(rawKey, rawVal) < 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reports the key class recorded in the underlying sequence file. Keys passed
     * to {@link #next(Object, Object)} must be instances of this class.
     */
    public Class getKeyClass() {
        final Class keyType = in.getKeyClass();
        return keyType;
    }

    /**
     * Reports the value class recorded in the underlying sequence file. Values
     * passed to {@link #next(Object, Object)} must be instances of this class.
     */
    public Class getValueClass() {
        final Class valueType = in.getValueClass();
        return valueType;
    }

    /** Instantiates a fresh key object of the file's key class, configured with {@link #conf}. */
    @SuppressWarnings("unchecked")
    public K createKey() {
        final Object instance = ReflectionUtils.newInstance(getKeyClass(), conf);
        return (K) instance;
    }

    /** Instantiates a fresh value object of the file's value class, configured with {@link #conf}. */
    @SuppressWarnings("unchecked")
    public V createValue() {
        final Object instance = ReflectionUtils.newInstance(getValueClass(), conf);
        return (V) instance;
    }

    /**
     * Reads the next key/value pair into the supplied objects.
     *
     * <p>Reading ends when the file is exhausted, or when the reader reports a
     * sync mark while the pre-read position is at or past the split end or a
     * stop has been requested. While records keep flowing, the method also
     * tracks the position of the last sync mark and the number of records
     * returned since then.
     *
     * @param key   object to fill with the key
     * @param value object to fill with the value
     * @return {@code true} if a record was read and should be consumed
     * @throws IOException if the underlying reader fails
     */
    public synchronized boolean next(K key, V value) throws IOException {
        if (more) {
            pos = in.getPosition();
            final boolean gotRecord = in.next(key) != null;
            if (gotRecord) {
                getCurrentValue(value);
            }

            final boolean boundaryReached = pos >= end || stopped;
            more = !(boundaryReached && in.syncSeen()) && gotRecord;

            if (more) {
                if (!in.syncSeen()) {
                    ++recSinceSync;
                } else {
                    lastSyncPos = pos;
                    recSinceSync = 0;
                }
            }
            return more;
        }
        return false;
    }

    /**
     * Reads only the next key, leaving the value to a later
     * {@link #getCurrentValue(Object)} call.
     *
     * <p>Applies the same termination rule and sync bookkeeping as
     * {@link #next(Object, Object)}.
     *
     * @param key object to fill with the key
     * @return {@code true} if a key was read and should be consumed
     * @throws IOException if the underlying reader fails
     */
    protected synchronized boolean next(K key) throws IOException {
        if (more) {
            pos = in.getPosition();
            final boolean gotRecord = in.next(key) != null;

            final boolean boundaryReached = pos >= end || stopped;
            more = !(boundaryReached && in.syncSeen()) && gotRecord;

            if (more) {
                if (!in.syncSeen()) {
                    ++recSinceSync;
                } else {
                    lastSyncPos = pos;
                    recSinceSync = 0;
                }
            }
            return more;
        }
        return false;
    }

    /**
     * Loads the value of the record whose key was just read, then checks the
     * record's size against the split length for skew monitoring.
     *
     * <p>The record length is taken as the distance between the reader position
     * after the value and {@link #pos}. When the ratio of split length to record
     * length drops below the smallest ratio seen so far, the new minimum is
     * registered with {@code SkewNotifier}.
     *
     * @param value object to fill with the value
     * @throws IOException if the underlying reader fails
     */
    protected synchronized void getCurrentValue(V value) throws IOException {
        in.getCurrentValue(value);

        // check record size
        final long afterRecord = in.getPosition();
        final int recordBytes = (int) (afterRecord - pos);
        final float ratio = len / (float) recordBytes;
        if (!(ratio < minFactor)) {
            return;
        }
        minFactor = ratio;
        SkewNotifier.registerNotification(minFactor, pos, recordBytes);
    }

    /**
     * Reports how far the reader has advanced through its byte range.
     *
     * @return the consumed fraction of the range between {@code start} and
     *         {@code end}, capped at 1.0; 0.0 when the range is empty
     */
    public float getProgress() throws IOException {
        if (end != start) {
            final float consumed = in.getPosition() - start;
            final float total = (float) (end - start);
            return Math.min(1.0f, consumed / total);
        }
        return 0.0f;
    }

    /** Returns the current position of the underlying reader. */
    public synchronized long getPos() throws IOException {
        final long current = in.getPosition();
        return current;
    }

    /**
     * Moves the underlying reader to the given position.
     *
     * @param pos target position, passed straight to the reader
     */
    protected synchronized void seek(long pos) throws IOException {
        final long target = pos;
        in.seek(target);
    }

    /** Closes the underlying reader, logging its final position at debug level first. */
    public synchronized void close() throws IOException {
        final boolean debug = LOG.isDebugEnabled();
        if (debug) {
            LOG.debug("closing input stream at " + in.getPosition());
        }
        in.close();
    }

    /*
        @Override
        public synchronized List<InputSplit> split(int n, boolean cont) throws IOException {
    long pos = in.getPosition();
        
    long remain = split.getLength() - (pos+1);
    long perSplit = (long)Math.ceil(remain / (double) ( cont ? n+1 : n ));
        
    long startPos = pos+1;
    end = in.getPosition();
        
    if ( cont ) {
        startPos += perSplit;
        end += perSplit;
        remain -= perSplit;
    }
        
    List<InputSplit> splits = new ArrayList<InputSplit>(n);
    for ( int i = 0; i < n; ++i ) {
        long sz = Math.min(remain, perSplit);
        FileSplit fileSplit = new FileSplit(split.getPath(), startPos, sz, (String[])null);
        splits.add(fileSplit);
        startPos += sz;
        remain -= sz;
    }
        
    return splits;
        }
    */

    /**
     * Requests that this reader stop and reports where it currently stands.
     *
     * <p>Nothing is written and {@code CANNOT_STOP} is returned when reading has
     * already finished, a stop was already requested, or the last pre-read
     * position is at or past the split end.
     *
     * @param out receives the file path, the reader's current position, and the
     *            split's end offset (start + length), in that order
     * @return {@code STOPPED} if the stop request was recorded,
     *         {@code CANNOT_STOP} otherwise
     */
    @Override
    public synchronized StopStatus tellAndStop(DataOutput out) throws IOException {
        final boolean stoppable = more && !stopped && pos < end;
        if (!stoppable) {
            if (LOG.isInfoEnabled()) {
                LOG.info("tellAndStop(): already stopped pos=" + pos + "; end=" + end);
            }
            return StopStatus.CANNOT_STOP;
        }

        final String file = split.getPath().toString();
        final long splitEnd = split.getStart() + split.getLength();
        Text.writeString(out, file);
        out.writeLong(in.getPosition());
        out.writeLong(splitEnd);

        if (LOG.isDebugEnabled()) {
            LOG.debug("tellAndStop(): stop reading input stream at position " + in.getPosition());
        }

        stopped = true;
        return StopStatus.STOPPED;
    }

    /*
    @Override
    public synchronized boolean scanNextKeyValue(LongWritable offset, RecordLength recLen) throws IOException {
    if ( ! more ) return false;
        
    if ( rawScanKey == null ) rawScanKey = new DataOutputBuffer(65536);
    if ( rawScanVal == null ) rawScanVal = in.createValueBytes();
        
    pos = in.getPosition();
    int len = in.nextRaw(rawScanKey, rawScanVal);
    long endOff = in.getPosition();
        
    boolean remaining = len >= 0;
    boolean syncSeen = in.syncSeen();
    if ( (pos >= end) && syncSeen ) {
        more = false;
    } else {
        more = remaining;
    }
        
    offset.set(pos);
    recLen.set(rawScanKey.getLength(), rawScanVal.getSize(), (int)(endOff-pos), syncSeen);
        
    return more;
    }
    */

    // The scan methods keep one record buffered because of the sync mark: when
    // syncSeen() is true after reading a record, it is the previous record that
    // is the boundary (the end of one chunk), not the record just read.

    /** Buffer that receives raw key bytes during a scan; allocated by {@code initScan()}. */
    private DataOutputBuffer rawScanKey;
    /** Holder for raw value bytes during a scan; allocated by {@code initScan()}. */
    private ValueBytes rawScanVal;

    // buffer
    /** Offset of the buffered, not yet reported record; the scan methods set it to -1 when nothing is buffered. */
    private long prevPos;
    /** Lengths of the buffered, not yet reported record. */
    private RecordLength prevRecLen;

    /**
     * Prepares a raw scan: allocates the scan buffers and reads the first record
     * ahead so that {@code scanNextKeyValue} always has one record buffered.
     *
     * <p>If no record is available (end of file, or a sync mark was reported with
     * the pre-read position at or past the split end), nothing is buffered and
     * the scan is already finished.
     *
     * @throws IOException if the underlying reader fails
     */
    @Override
    public synchronized void initScan() throws IOException {
        rawScanKey = new DataOutputBuffer(65536);
        rawScanVal = in.createValueBytes();
        prevRecLen = new RecordLength();

        pos = in.getPosition();
        final int rawLength = in.nextRaw(rawScanKey, rawScanVal);
        final long afterRecord = in.getPosition();

        final boolean sawSync = in.syncSeen();
        final boolean pastSplit = pos >= end && sawSync;
        more = !pastSplit && rawLength >= 0;

        if (!more) {
            prevPos = -1;
            return;
        }

        prevPos = pos;
        // don't care about sync
        prevRecLen.set(rawScanKey.getLength(), rawScanVal.getSize(), (int) (afterRecord - pos), false);
    }

    /**
     * Reports the offset and lengths of the next record without deserializing it.
     *
     * <p>The scan runs one record behind the reader: each call reads one more raw
     * record and reports the previously buffered one. The boundary flag passed to
     * {@code recLen} is {@code true} when the reader saw a sync mark while reading
     * the following record, or when there is no following record because the end
     * of the file was reached — the last record is always a boundary.
     *
     * <p>{@code initScan()} must have been called first.
     *
     * @param offset receives the start offset of the reported record
     * @param recLen receives the key, value and record lengths plus the boundary flag
     * @return {@code true} if a record was reported, {@code false} when the scan is exhausted
     * @throws IOException if the underlying reader fails
     */
    @Override
    public synchronized boolean scanNextKeyValue(LongWritable offset, RecordLength recLen) throws IOException {
        if (!more) {
            if (prevPos < 0) {
                return false;
            } else {
                // last record. must be splittable
                offset.set(prevPos);
                recLen.set(prevRecLen.getKeyLength(), prevRecLen.getValueLength(), prevRecLen.getRecordLength(),
                        true);
                prevPos = -1;
                return true;
            }
        }

        rawScanKey.reset();
        pos = in.getPosition();
        int len = in.nextRaw(rawScanKey, rawScanVal);
        long endOff = in.getPosition();

        boolean remaining = len >= 0;
        boolean syncSeen = in.syncSeen();
        if ((pos >= end) && syncSeen) {
            more = false;
        } else {
            more = remaining;
        }

        // At end of file nothing follows the buffered record, so it is the last
        // record and must be flagged as a boundary, exactly like the branch above.
        // The reader's sync flag is not meaningful after a failed read.
        boolean boundary = syncSeen || !remaining;

        // copy buffered entry
        offset.set(prevPos);
        recLen.set(prevRecLen.getKeyLength(), prevRecLen.getValueLength(), prevRecLen.getRecordLength(), boundary);

        if (more) {
            prevPos = pos;
            prevRecLen.set(rawScanKey.getLength(), rawScanVal.getSize(), (int) (endOff - pos), false);
        } else {
            prevPos = -1; // no more buffered
        }

        return true;
    }

    /** Ends a raw scan by dropping the references to the scan buffers. */
    @Override
    public synchronized void closeScan() throws IOException {
        prevRecLen = null;
        rawScanVal = null;
        rawScanKey = null;
    }

    /**
     * Describes the current read position as the last recorded sync position
     * plus the number of records returned since then.
     */
    @Override
    public RelativeOffset getRelativeOffset() {
        final RelativeOffset current = new RelativeOffset(lastSyncPos, recSinceSync);
        return current;
    }
}