cascading.scheme.TextLine.java Source code

Java tutorial

Introduction

Here is the source code for cascading.scheme.TextLine.java

Source

/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading.  If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.scheme;

import java.beans.ConstructorProperties;
import java.io.IOException;

import cascading.tap.Tap;
import cascading.tap.hadoop.ZipInputFormat;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

/**
 * A TextLine is a type of {@link Scheme} for plain text files. Files are broken into
 * lines. Either a line-feed or a carriage-return is used to signal end of line.
 * <p/>
 * By default, this scheme returns a {@link Tuple} with two fields, "offset" and "line".
 * <p/>
 * Many of the constructors take both "sourceFields" and "sinkFields". sourceFields denote the field names
 * to be used instead of the names "offset" and "line". sinkFields is a selector and is by default {@link Fields#ALL}.
 * Any available field names can be given if only a subset of the incoming fields should be used.
 * <p/>
 * If a {@link Fields} instance is passed on the constructor as sourceFields having only one field, the return tuples
 * will simply be the "line" value using the given field name.
 * <p/>
 * Note that TextLine will concatenate all the Tuple values for the selected fields with a TAB delimiter before
 * writing out the line.
 * <p/>
 * Note sink compression is {@link Compress#DISABLE} by default. If {@code null} is passed to any constructor
 * or to {@link #setSinkCompression(Compress)} for the compression value, it will remain disabled.
 * <p/>
 * If all the input files end with ".zip", the {@link ZipInputFormat} will be used. This is not
 * bi-directional, so zip files cannot be written.
 */
public class TextLine extends Scheme {
    /**
     * Sink compression modes, applied in {@link TextLine#sinkInit(Tap, JobConf)}.
     * <ul>
     * <li>{@code DEFAULT} - the "mapred.output.compress" property is left untouched</li>
     * <li>{@code ENABLE} - the "mapred.output.compress" property is set to {@code true}</li>
     * <li>{@code DISABLE} - the "mapred.output.compress" property is set to {@code false}</li>
     * </ul>
     */
    public enum Compress {
        DEFAULT, ENABLE, DISABLE
    }

    /** Field serialVersionUID */
    private static final long serialVersionUID = 1L;
    /** Field DEFAULT_SOURCE_FIELDS */
    public static final Fields DEFAULT_SOURCE_FIELDS = new Fields("offset", "line");

    /** Field sinkCompression, never set to null through the constructors or the setter */
    Compress sinkCompression = Compress.DISABLE;

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     */
    public TextLine() {
        super(DEFAULT_SOURCE_FIELDS);
    }

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     *
     * @param numSinkParts of type int
     */
    @ConstructorProperties({ "numSinkParts" })
    public TextLine(int numSinkParts) {
        super(DEFAULT_SOURCE_FIELDS, numSinkParts);
    }

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     *
     * @param sinkCompression of type Compress, if null compression remains disabled
     */
    @ConstructorProperties({ "sinkCompression" })
    public TextLine(Compress sinkCompression) {
        super(DEFAULT_SOURCE_FIELDS);

        setSinkCompression(sinkCompression);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @param sinkFields   the sink fields for this scheme
     * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "sinkFields" })
    public TextLine(Fields sourceFields, Fields sinkFields) {
        super(sourceFields, sinkFields);

        requireOneOrTwoSourceFields(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @param sinkFields   the sink fields for this scheme
     * @param numSinkParts of type int
     * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "numSinkParts" })
    public TextLine(Fields sourceFields, Fields sinkFields, int numSinkParts) {
        super(sourceFields, sinkFields, numSinkParts);

        requireOneOrTwoSourceFields(sourceFields);
    }

    /**
     * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields    of type Fields
     * @param sinkFields      of type Fields
     * @param sinkCompression of type Compress, if null compression remains disabled
     * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression" })
    public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression) {
        super(sourceFields, sinkFields);

        // go through the setter so a null value cannot overwrite the DISABLE default
        setSinkCompression(sinkCompression);

        requireOneOrTwoSourceFields(sourceFields);
    }

    /**
     * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields    of type Fields
     * @param sinkFields      of type Fields
     * @param sinkCompression of type Compress, if null compression remains disabled
     * @param numSinkParts    of type int
     * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression", "numSinkParts" })
    public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts) {
        super(sourceFields, sinkFields, numSinkParts);

        setSinkCompression(sinkCompression);

        requireOneOrTwoSourceFields(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields" })
    public TextLine(Fields sourceFields) {
        super(sourceFields);

        requireOneOrTwoSourceFields(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples. The resulting data set will have numSinkParts.
     *
     * @param sourceFields the source fields for this scheme
     * @param numSinkParts of type int
     * @throws IllegalArgumentException if sourceFields does not have exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "numSinkParts" })
    public TextLine(Fields sourceFields, int numSinkParts) {
        super(sourceFields, numSinkParts);

        requireOneOrTwoSourceFields(sourceFields);
    }

    /**
     * Verifies that the given source fields declare either one field (the line) or two fields
     * (the offset and the line), which are the only shapes {@link #source(Object, Object)} can produce.
     *
     * @param sourceFields the source fields handed to a constructor
     * @throws IllegalArgumentException if sourceFields has fewer than one or more than two fields
     */
    private static void requireOneOrTwoSourceFields(Fields sourceFields) {
        int size = sourceFields.size();

        if (size < 1 || size > 2)
            throw new IllegalArgumentException(
                    "this scheme requires either one or two source fields, given [" + sourceFields + "]");
    }

    /**
     * Method getSinkCompression returns the sinkCompression of this TextLine object.
     *
     * @return the sinkCompression (type Compress) of this TextLine object.
     */
    public Compress getSinkCompression() {
        return sinkCompression;
    }

    /**
     * Method setSinkCompression sets the sinkCompression of this TextLine object. If null, the current value is
     * kept, so compression will remain disabled unless it was previously changed.
     *
     * @param sinkCompression the sinkCompression of this TextLine object.
     */
    public void setSinkCompression(Compress sinkCompression) {
        if (sinkCompression != null) // leave disabled if null
            this.sinkCompression = sinkCompression;
    }

    /**
     * Selects the input format for the job: {@link ZipInputFormat} if the configured input paths all end
     * with ".zip", {@link TextInputFormat} otherwise.
     *
     * @param tap  the tap this scheme is bound to (not used here)
     * @param conf the job configuration to read the input paths from and set the input format on
     * @throws IllegalStateException if the input paths mix ".zip" and non-".zip" names
     */
    @Override
    public void sourceInit(Tap tap, JobConf conf) {
        if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
            conf.setInputFormat(ZipInputFormat.class);
        else
            conf.setInputFormat(TextInputFormat.class);
    }

    /**
     * Tests whether the given paths all name ".zip" files.
     *
     * @param paths the input paths to inspect, may be null or empty
     * @return true if every path name ends with ".zip", false if none do or if no paths are given
     * @throws IllegalStateException if some, but not all, path names end with ".zip"
     */
    private boolean hasZippedFiles(Path[] paths) {
        // nothing configured yet: treat as plain text rather than failing on paths[0]
        if (paths == null || paths.length == 0)
            return false;

        boolean isZipped = paths[0].getName().endsWith(".zip");

        for (int i = 1; i < paths.length; i++) {
            if (isZipped != paths[i].getName().endsWith(".zip"))
                throw new IllegalStateException("cannot mix zipped and unzipped files");
        }

        return isZipped;
    }

    /**
     * Configures the job for writing text output: applies the sink compression setting and sets
     * {@link Text} key/value classes with {@link TextOutputFormat}.
     *
     * @param tap  the tap being written to, its qualified path must not end with ".zip"
     * @param conf the job configuration to update
     * @throws IOException           if thrown while resolving the qualified path of the tap
     * @throws IllegalStateException if the qualified path of the tap ends with ".zip"
     */
    @Override
    public void sinkInit(Tap tap, JobConf conf) throws IOException {
        // report the same path that was tested, the job output path may differ or be unset here
        String qualifiedPath = tap.getQualifiedPath(conf).toString();

        if (qualifiedPath.endsWith(".zip"))
            throw new IllegalStateException("cannot write zip files: " + qualifiedPath);

        // Compress.DEFAULT intentionally leaves "mapred.output.compress" as already configured
        if (getSinkCompression() == Compress.DISABLE)
            conf.setBoolean("mapred.output.compress", false);
        else if (getSinkCompression() == Compress.ENABLE)
            conf.setBoolean("mapred.output.compress", true);

        conf.setOutputKeyClass(Text.class); // be explicit
        conf.setOutputValueClass(Text.class); // be explicit
        conf.setOutputFormat(TextOutputFormat.class);
    }

    /**
     * Converts an input key/value pair into a {@link Tuple}. When two source fields are declared the tuple
     * holds the string form of the key followed by the string form of the value; with one source field
     * only the string form of the value is returned.
     *
     * @param key   the input key, only read when two source fields are declared
     * @param value the input value
     * @return a new Tuple with one or two String values
     */
    @Override
    public Tuple source(Object key, Object value) {
        Tuple tuple = new Tuple();

        if (sourceFields.size() == 2)
            tuple.add(key.toString());

        tuple.add(value.toString());

        return tuple;
    }

    /**
     * Writes the sink fields selected from the given entry to the collector, using a null key.
     *
     * @param tupleEntry      the entry to select the sink fields from
     * @param outputCollector the collector to write to
     * @throws IOException if thrown by the collector
     */
    @Override
    public void sink(TupleEntry tupleEntry, OutputCollector outputCollector) throws IOException {
        // it's ok to use NULL here so the collector does not write anything
        outputCollector.collect(null, tupleEntry.selectTuple(sinkFields));
    }

}