cascading.scheme.hadoop.TextLine.java Source code

Java tutorial

Introduction

Here is the source code for cascading.scheme.hadoop.TextLine.java

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.scheme.hadoop;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.management.annotation.Property;
import cascading.management.annotation.PropertyDescription;
import cascading.management.annotation.Visibility;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;

/**
 * A TextLine is a type of {@link cascading.scheme.Scheme} for plain text files. Files are broken into
 * lines. Either line-feed or carriage-return are used to signal end of line.
 * <p/>
 * By default, this scheme returns a {@link Tuple} with two fields, "offset" and "line".
 * <p/>
 * Many of the constructors take both "sourceFields" and "sinkFields". sourceFields denote the field names
 * to be used instead of the names "offset" and "line". sinkFields is a selector and is by default {@link Fields#ALL}.
 * Any available field names can be given if only a subset of the incoming fields should be used.
 * <p/>
 * If a {@link Fields} instance is passed on the constructor as sourceFields having only one field, the return tuples
 * will simply be the "line" value using the given field name.
 * <p/>
 * Note that TextLine will concatenate all the Tuple values for the selected fields with a TAB delimiter before
 * writing out the line.
 * <p/>
 * Note sink compression is {@link Compress#DISABLE} by default. If {@code null} is passed to the constructor
 * for the compression value, it will remain disabled.
 * <p/>
 * If any of the input files end with ".zip", an error will be thrown.
 * <p/>
 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
 * argument.
 */
public class TextLine extends Scheme<Configuration, RecordReader, OutputCollector, Object[], Object[]> {
    /**
     * Sink compression modes. {@link #ENABLE} and {@link #DISABLE} force the "mapred.output.compress"
     * property to {@code true} or {@code false}; {@link #DEFAULT} leaves the property untouched.
     */
    public enum Compress {
        DEFAULT, ENABLE, DISABLE
    }

    /** Field DEFAULT_CHARSET, the charset name used when none is given on the constructor */
    public static final String DEFAULT_CHARSET = "UTF-8";

    /** Field serialVersionUID */
    private static final long serialVersionUID = 1L;
    /** Field DEFAULT_SOURCE_FIELDS */
    public static final Fields DEFAULT_SOURCE_FIELDS = new Fields("offset", "line");

    /** Field sinkCompression */
    Compress sinkCompression = Compress.DISABLE;

    /** Field charsetName, the name of the charset used to decode sourced lines and encode sunk lines */
    String charsetName = DEFAULT_CHARSET;

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     */
    public TextLine() {
        super(DEFAULT_SOURCE_FIELDS);
    }

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     *
     * @param numSinkParts of type int
     */
    @ConstructorProperties({ "numSinkParts" })
    public TextLine(int numSinkParts) {
        super(DEFAULT_SOURCE_FIELDS, numSinkParts);
    }

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     *
     * @param sinkCompression of type Compress, compression remains disabled if {@code null}
     */
    @ConstructorProperties({ "sinkCompression" })
    public TextLine(Compress sinkCompression) {
        super(DEFAULT_SOURCE_FIELDS);

        setSinkCompression(sinkCompression);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @param sinkFields   the sink fields for this scheme
     */
    @ConstructorProperties({ "sourceFields", "sinkFields" })
    public TextLine(Fields sourceFields, Fields sinkFields) {
        super(sourceFields, sinkFields);

        verify(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @param sinkFields   the sink fields for this scheme
     * @param charsetName  of type String, the default charset is kept if {@code null}
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "charsetName" })
    public TextLine(Fields sourceFields, Fields sinkFields, String charsetName) {
        super(sourceFields, sinkFields);

        // throws an exception if not found
        setCharsetName(charsetName);

        verify(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @param sinkFields   the sink fields for this scheme
     * @param numSinkParts of type int
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "numSinkParts" })
    public TextLine(Fields sourceFields, Fields sinkFields, int numSinkParts) {
        super(sourceFields, sinkFields, numSinkParts);

        verify(sourceFields);
    }

    /**
     * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields    of type Fields
     * @param sinkFields      of type Fields
     * @param sinkCompression of type Compress, compression remains disabled if {@code null}
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression" })
    public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression) {
        super(sourceFields, sinkFields);

        setSinkCompression(sinkCompression);

        verify(sourceFields);
    }

    /**
     * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields    of type Fields
     * @param sinkFields      of type Fields
     * @param sinkCompression of type Compress, compression remains disabled if {@code null}
     * @param charsetName     of type String, the default charset is kept if {@code null}
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression", "charsetName" })
    public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression, String charsetName) {
        super(sourceFields, sinkFields);

        setSinkCompression(sinkCompression);

        // throws an exception if not found
        setCharsetName(charsetName);

        verify(sourceFields);
    }

    /**
     * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields    of type Fields
     * @param sinkFields      of type Fields
     * @param sinkCompression of type Compress, compression remains disabled if {@code null}
     * @param numSinkParts    of type int
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression", "numSinkParts" })
    public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts) {
        super(sourceFields, sinkFields, numSinkParts);

        setSinkCompression(sinkCompression);

        verify(sourceFields);
    }

    /**
     * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields    of type Fields
     * @param sinkFields      of type Fields
     * @param sinkCompression of type Compress, compression remains disabled if {@code null}
     * @param numSinkParts    of type int
     * @param charsetName     of type String, the default charset is kept if {@code null}
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression", "numSinkParts", "charsetName" })
    public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts,
            String charsetName) {
        super(sourceFields, sinkFields, numSinkParts);

        setSinkCompression(sinkCompression);

        // throws an exception if not found
        setCharsetName(charsetName);

        verify(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     */
    @ConstructorProperties({ "sourceFields" })
    public TextLine(Fields sourceFields) {
        super(sourceFields);

        verify(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @param charsetName  of type String, the default charset is kept if {@code null}
     */
    @ConstructorProperties({ "sourceFields", "charsetName" })
    public TextLine(Fields sourceFields, String charsetName) {
        super(sourceFields);

        // throws an exception if not found
        setCharsetName(charsetName);

        verify(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples. The resulting data set will have numSinkParts.
     *
     * @param sourceFields the source fields for this scheme
     * @param numSinkParts of type int
     */
    @ConstructorProperties({ "sourceFields", "numSinkParts" })
    public TextLine(Fields sourceFields, int numSinkParts) {
        super(sourceFields, numSinkParts);

        verify(sourceFields);
    }

    /**
     * Method setCharsetName sets the charset name of this TextLine object. If {@code null} is given, the
     * current charset name is kept (and re-validated).
     * <p/>
     * The name is validated before it is stored, so a rejected name never replaces the current value.
     *
     * @param charsetName the name of the charset to use, may be {@code null}
     * @throws IllegalArgumentException if {@link Charset#forName(String)} rejects the name
     */
    protected void setCharsetName(String charsetName) {
        String candidate = charsetName != null ? charsetName : this.charsetName;

        // throws before the field is touched if the name is illegal or unsupported
        Charset.forName(candidate);

        this.charsetName = candidate;
    }

    /**
     * Method getCharsetName returns the charsetName of this TextLine object.
     *
     * @return the charsetName (type String) of this TextLine object.
     */
    @Property(name = "charset", visibility = Visibility.PUBLIC)
    @PropertyDescription(value = "character set used in this scheme.")
    public String getCharsetName() {
        return charsetName;
    }

    /**
     * Method verify checks that the given source fields declare either one or two fields.
     *
     * @param sourceFields the source fields to check
     * @throws IllegalArgumentException if sourceFields has fewer than one or more than two fields
     */
    protected void verify(Fields sourceFields) {
        if (sourceFields.size() < 1 || sourceFields.size() > 2)
            throw new IllegalArgumentException(
                    "this scheme requires either one or two source fields, given [" + sourceFields + "]");
    }

    /**
     * Method getSinkCompression returns the sinkCompression of this TextLine object.
     *
     * @return the sinkCompression (type Compress) of this TextLine object.
     */
    @Property(name = "sinkCompression", visibility = Visibility.PUBLIC)
    @PropertyDescription(value = "The compression of the scheme when used in a sink.")
    public Compress getSinkCompression() {
        return sinkCompression;
    }

    /**
     * Method setSinkCompression sets the sinkCompression of this TextLine object. If null, compression will remain disabled.
     *
     * @param sinkCompression the sinkCompression of this TextLine object.
     */
    public void setSinkCompression(Compress sinkCompression) {
        if (sinkCompression != null) // leave disabled if null
            this.sinkCompression = sinkCompression;
    }

    /**
     * Configures the given configuration for reading with {@link TextInputFormat} through the old
     * ("mapred") API.
     *
     * @throws IllegalStateException if the configured input paths are zip files, or mix zip and non-zip files
     */
    @Override
    public void sourceConfInit(FlowProcess<? extends Configuration> flowProcess,
            Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {
        // resolve the input paths once; they feed both the check and the error message
        Path[] inputPaths = FileInputFormat.getInputPaths(asJobConfInstance(conf));

        if (hasZippedFiles(inputPaths))
            throw new IllegalStateException("cannot read zip files: " + Arrays.toString(inputPaths));

        conf.setBoolean("mapred.mapper.new-api", false);
        conf.setClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class);
    }

    /**
     * Reports whether the given paths name zip files, judged by a ".zip" file name suffix.
     *
     * @param paths the paths to inspect, may be {@code null} or empty
     * @return true if all paths end with ".zip", false if none do or no paths are given
     * @throws IllegalStateException if only some of the paths end with ".zip"
     */
    private boolean hasZippedFiles(Path[] paths) {
        if (paths == null || paths.length == 0)
            return false;

        // the first path decides; every other path must agree with it
        boolean isZipped = paths[0].getName().endsWith(".zip");

        for (int i = 1; i < paths.length; i++) {
            if (isZipped != paths[i].getName().endsWith(".zip"))
                throw new IllegalStateException("cannot mix zipped and unzipped files");
        }

        return isZipped;
    }

    @Override
    public void presentSourceFields(FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields) {
        // do nothing to change TextLine state
    }

    @Override
    public void presentSinkFields(FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields) {
        // do nothing to change TextLine state
    }

    /**
     * Configures the given configuration for writing with {@link TextOutputFormat} through the old
     * ("mapred") API, using {@link Text} as both the output key and value class.
     * <p/>
     * "mapred.output.compress" is forced to {@code false} for {@link Compress#DISABLE} and to {@code true}
     * for {@link Compress#ENABLE}; for any other value the property is left as found.
     *
     * @throws IllegalStateException if the full identifier of the tap ends with ".zip"
     */
    @Override
    public void sinkConfInit(FlowProcess<? extends Configuration> flowProcess,
            Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {
        if (tap.getFullIdentifier(conf).endsWith(".zip"))
            throw new IllegalStateException("cannot write zip files: " + HadoopUtil.getOutputPath(conf));

        conf.setBoolean("mapred.mapper.new-api", false);

        Compress compression = getSinkCompression();

        if (compression == Compress.DISABLE)
            conf.setBoolean("mapred.output.compress", false);
        else if (compression == Compress.ENABLE)
            conf.setBoolean("mapred.output.compress", true);

        conf.setClass("mapred.output.key.class", Text.class, Object.class);
        conf.setClass("mapred.output.value.class", Text.class, Object.class);
        conf.setClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class);
    }

    /**
     * Prepares the source context, a three element array holding the reusable record key at index 0,
     * the reusable record value at index 1, and the resolved {@link Charset} at index 2.
     * An existing context array is reused; a new one is only allocated if none is set.
     */
    @Override
    public void sourcePrepare(FlowProcess<? extends Configuration> flowProcess,
            SourceCall<Object[], RecordReader> sourceCall) {
        if (sourceCall.getContext() == null)
            sourceCall.setContext(new Object[3]);

        Object[] context = sourceCall.getContext();

        context[0] = sourceCall.getInput().createKey();
        context[1] = sourceCall.getInput().createValue();
        context[2] = Charset.forName(charsetName);
    }

    /**
     * Reads the next record from the input and, if one was available, copies it into the incoming entry.
     *
     * @return true if a record was read, false if the input has no more records
     * @throws IOException if reading from the input fails
     */
    @Override
    public boolean source(FlowProcess<? extends Configuration> flowProcess,
            SourceCall<Object[], RecordReader> sourceCall) throws IOException {
        if (!sourceReadInput(sourceCall))
            return false;

        sourceHandleInput(sourceCall);

        return true;
    }

    /**
     * Advances the record reader, filling the key and value objects held in the source context.
     *
     * @return the result of {@link RecordReader#next(Object, Object)}
     * @throws IOException if reading from the input fails
     */
    private boolean sourceReadInput(SourceCall<Object[], RecordReader> sourceCall) throws IOException {
        Object[] context = sourceCall.getContext();

        return sourceCall.getInput().next(context[0], context[1]);
    }

    /**
     * Copies the current key and value from the source context into the incoming entry. When two source
     * fields are declared, the first position receives the key as a long and the second the decoded line;
     * otherwise only the decoded line is set, at position zero.
     *
     * @param sourceCall the current source call
     */
    protected void sourceHandleInput(SourceCall<Object[], RecordReader> sourceCall) {
        TupleEntry result = sourceCall.getIncomingEntry();

        int index = 0;
        Object[] context = sourceCall.getContext();

        // coerce into canonical forms
        if (getSourceFields().size() == 2)
            result.setLong(index++, ((LongWritable) context[0]).get());

        result.setString(index, makeEncodedString(context));
    }

    /**
     * Decodes the {@link Text} value at index 1 of the context into a String, using the {@link Charset}
     * at index 2.
     *
     * @param context the source context as populated by sourcePrepare
     * @return the decoded line
     */
    protected String makeEncodedString(Object[] context) {
        Text text = (Text) context[1];

        // only the first getLength() bytes of the backing array are decoded
        return new String(text.getBytes(), 0, text.getLength(), (Charset) context[2]);
    }

    /**
     * Clears the source context.
     */
    @Override
    public void sourceCleanup(FlowProcess<? extends Configuration> flowProcess,
            SourceCall<Object[], RecordReader> sourceCall) {
        sourceCall.setContext(null);
    }

    /**
     * Prepares the sink context, a two element array holding a reusable {@link Text} at index 0 and the
     * resolved {@link Charset} at index 1.
     */
    @Override
    public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
            SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
        Object[] context = new Object[2];

        context[0] = new Text();
        context[1] = Charset.forName(charsetName);

        sinkCall.setContext(context);
    }

    /**
     * Writes the String form of the outgoing tuple, encoded with the configured charset, as the value of
     * a record with a {@code null} key.
     *
     * @throws IOException if the collector fails to write the record
     */
    @Override
    public void sink(FlowProcess<? extends Configuration> flowProcess, SinkCall<Object[], OutputCollector> sinkCall)
            throws IOException {
        Object[] context = sinkCall.getContext();
        Text text = (Text) context[0];
        Charset charset = (Charset) context[1];
        String line = sinkCall.getOutgoingEntry().getTuple().toString();

        text.set(line.getBytes(charset));

        // it's ok to use NULL here so the collector does not write anything
        sinkCall.getOutput().collect(null, text);
    }
}