cascading.tap.hadoop.io.MultiRecordReaderIterator.java Source code

Introduction

Here is the source code for cascading.tap.hadoop.io.MultiRecordReaderIterator.java
Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop.io;

import java.io.IOException;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.MapRed;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.tap.Tap;
import cascading.tap.TapException;
import cascading.tuple.Tuple;
import cascading.util.CloseableIterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;

/**
 * Class MultiRecordReaderIterator is an implementation of {@link cascading.util.CloseableIterator} that hands out
 * one Hadoop {@link RecordReader} per {@link InputSplit} computed for the given {@link cascading.tap.Tap}.
 * <p>
 * At most one reader is kept open at a time: advancing to the next split closes the reader that was handed out
 * previously, and {@link #close()} closes the last one. Callers must therefore be finished with a reader before
 * asking for the next one.
 */
public class MultiRecordReaderIterator implements CloseableIterator<RecordReader> {
    /** Field LOG */
    private static final Logger LOG = LoggerFactory.getLogger(MultiRecordReaderIterator.class);

    /** Field flowProcess */
    private final FlowProcess<? extends Configuration> flowProcess;
    /** Field tap */
    private final Tap tap;
    /** Field inputFormat */
    private InputFormat inputFormat;
    /** Field conf, a private copy of the flow process configuration */
    private Configuration conf;
    /** Field splits, may be null or empty if the input format produced no splits */
    private InputSplit[] splits;
    /** Field reader, the reader that has been opened but not yet returned by {@link #next()} */
    private RecordReader reader;

    /** Field lastReader, the most recently opened reader that has not been closed yet */
    private RecordReader lastReader;

    /** Field currentSplit, index of the next split to open */
    private int currentSplit = 0;
    /** Field complete, true once all splits were consumed or the iterator was closed */
    private boolean complete = false;

    /**
     * Constructor MultiRecordReaderIterator creates a new MultiRecordReaderIterator instance.
     *
     * @param flowProcess the current flow process, its configuration is copied
     * @param tap         the tap whose input splits will be iterated
     * @throws IOException when the input format fails to compute the splits
     */
    public MultiRecordReaderIterator(FlowProcess<? extends Configuration> flowProcess, Tap tap) throws IOException {
        this.flowProcess = flowProcess;
        this.tap = tap;
        this.conf = flowProcess.getConfigCopy();

        initialize();
    }

    /**
     * Prepares the configuration, resolves and configures the input format, and computes the splits.
     *
     * @throws IOException when the input format fails to compute the splits
     */
    private void initialize() throws IOException {
        // prevent collisions with configuration properties set client side if now running cluster side
        String property = flowProcess.getStringProperty("cascading.node.accumulated.source.conf." + Tap.id(tap));

        if (property == null) {
            // default behavior is to accumulate paths, so remove any set prior
            conf = HadoopUtil.removePropertiesFrom(conf, "mapred.input.dir",
                    "mapreduce.input.fileinputformat.inputdir"); // hadoop2
            tap.sourceConfInit(flowProcess, conf);
        }

        JobConf jobConf = asJobConfInstance(conf);

        inputFormat = jobConf.getInputFormat();

        if (inputFormat instanceof JobConfigurable)
            ((JobConfigurable) inputFormat).configure(jobConf);

        // do not test for existence, let hadoop decide how to handle the given path
        // this delegates globbing to the inputformat on split generation.
        splits = inputFormat.getSplits(jobConf, 1);

        // a missing split array is treated like an empty one instead of failing with a NullPointerException
        if (splits == null || splits.length == 0)
            complete = true;
    }

    /**
     * Opens a record reader for the split at the given index.
     *
     * @param currentSplit index into the computed splits
     * @return a new RecordReader for that split
     * @throws IOException when the input format cannot open the reader
     */
    private RecordReader makeReader(int currentSplit) throws IOException {
        LOG.debug("reading split: {}", currentSplit);

        Reporter reporter = Reporter.NULL;

        if (flowProcess instanceof MapRed)
            reporter = ((MapRed) flowProcess).getReporter(); // may return Reporter.NULL

        return inputFormat.getRecordReader(splits[currentSplit], asJobConfInstance(conf), reporter);
    }

    /**
     * Method hasNext returns true if there is another {@link RecordReader} available.
     * <p>
     * If no reader is pending, this call closes the previously returned reader and opens the next one.
     *
     * @return boolean
     * @throws TapException when the previous reader cannot be closed or the next reader cannot be opened
     */
    @Override
    public boolean hasNext() {
        getNextReader();

        return !complete;
    }

    /**
     * Method next returns the {@link RecordReader} for the next split.
     *
     * @return RecordReader, never null
     * @throws IllegalStateException when there are no more splits or the iterator was closed
     * @throws TapException          when the previous reader cannot be closed or the next reader cannot be opened
     */
    @Override
    public RecordReader next() {
        // advance first: checking 'complete' before advancing would let an exhausted iterator
        // return null when next() is called without a preceding hasNext()
        getNextReader();

        if (complete)
            throw new IllegalStateException("no more values");

        RecordReader result = reader;

        // hand each reader out exactly once
        reader = null;

        return result;
    }

    /**
     * Opens the reader for the next split unless one is already pending or the iterator is complete.
     * Marks the iterator complete once every split has been handed out.
     */
    private void getNextReader() {
        if (complete || reader != null)
            return;

        try {
            if (currentSplit < splits.length) {
                // only one reader is kept open at a time
                closeLastReader();

                reader = makeReader(currentSplit++);
                lastReader = reader;
            } else {
                complete = true;
            }
        } catch (IOException exception) {
            throw new TapException("could not get next tuple", exception);
        }
    }

    /**
     * Closes the most recently opened reader, if any, and forgets it so that it is never closed twice,
     * even if closing it fails.
     *
     * @throws IOException when the reader fails to close
     */
    private void closeLastReader() throws IOException {
        if (lastReader == null)
            return;

        try {
            lastReader.close();
        } finally {
            lastReader = null;
        }
    }

    /**
     * Method remove is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("unimplemented");
    }

    /**
     * Method close closes the last opened reader and marks this iterator as complete, so no further
     * readers are opened afterwards. Calling close more than once has no additional effect.
     *
     * @throws IOException when the last reader fails to close
     */
    @Override
    public void close() throws IOException {
        // stop iteration so a later hasNext() cannot open (and leak) another reader
        complete = true;
        reader = null;

        closeLastReader();
    }
}