Java tutorial
/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.scheme;

import java.beans.ConstructorProperties;
import java.io.IOException;

import cascading.tap.Tap;
import cascading.tap.hadoop.ZipInputFormat;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

/**
 * A TextLine is a type of {@link Scheme} for plain text files. Files are broken into
 * lines. Either line-feed or carriage-return are used to signal end of line.
 * <p/>
 * By default, this scheme returns a {@link Tuple} with two fields, "offset" and "line".
 * <p/>
 * Many of the constructors take both "sourceFields" and "sinkFields". sourceFields denote the field names
 * to be used instead of the names "offset" and "line". sinkFields is a selector and is by default {@link Fields#ALL}.
 * Any available field names can be given if only a subset of the incoming fields should be used.
 * <p/>
 * If a {@link Fields} instance is passed on the constructor as sourceFields having only one field, the return tuples
 * will simply be the "line" value using the given field name.
 * <p/>
 * Note that TextLine will concatenate all the Tuple values for the selected fields with a TAB delimiter before
 * writing out the line.
 * <p/>
 * Note sink compression is {@link Compress#DISABLE} by default. If {@code null} is passed to the constructor
 * for the compression value, it will remain disabled.
 * <p/>
 * If all the input files end with ".zip", the {@link ZipInputFormat} will be used. This is not
 * bi-directional, so zip files cannot be written.
 */
public class TextLine extends Scheme {

    /**
     * Sink compression modes. {@link #ENABLE} and {@link #DISABLE} force the
     * "mapred.output.compress" job property to true or false respectively;
     * {@link #DEFAULT} leaves the property untouched.
     */
    public enum Compress {
        DEFAULT, ENABLE, DISABLE
    }

    /** Field serialVersionUID */
    private static final long serialVersionUID = 1L;

    /** Field DEFAULT_SOURCE_FIELDS */
    public static final Fields DEFAULT_SOURCE_FIELDS = new Fields("offset", "line");

    /** Field sinkCompression */
    Compress sinkCompression = Compress.DISABLE;

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     */
    public TextLine() {
        super(DEFAULT_SOURCE_FIELDS);
    }

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     *
     * @param numSinkParts of type int
     */
    @ConstructorProperties({ "numSinkParts" })
    public TextLine(int numSinkParts) {
        super(DEFAULT_SOURCE_FIELDS, numSinkParts);
    }

    /**
     * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
     * "offset" is the byte offset in the input file.
     *
     * @param sinkCompression of type Compress; if {@code null}, compression remains disabled
     */
    @ConstructorProperties({ "sinkCompression" })
    public TextLine(Compress sinkCompression) {
        super(DEFAULT_SOURCE_FIELDS);
        setSinkCompression(sinkCompression);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @param sinkFields   the sink fields for this scheme
     * @throws IllegalArgumentException if sourceFields does not declare exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "sinkFields" })
    public TextLine(Fields sourceFields, Fields sinkFields) {
        super(sourceFields, sinkFields);
        verifySourceFields(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @param sinkFields   the sink fields for this scheme
     * @param numSinkParts of type int
     * @throws IllegalArgumentException if sourceFields does not declare exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "numSinkParts" })
    public TextLine(Fields sourceFields, Fields sinkFields, int numSinkParts) {
        super(sourceFields, sinkFields, numSinkParts);
        verifySourceFields(sourceFields);
    }

    /**
     * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be
     * returned in the subsequent tuples.
     *
     * @param sourceFields    of type Fields
     * @param sinkFields      of type Fields
     * @param sinkCompression of type Compress; if {@code null}, compression remains disabled
     * @throws IllegalArgumentException if sourceFields does not declare exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression" })
    public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression) {
        super(sourceFields, sinkFields);
        // Go through the setter so a null argument keeps the DISABLE default,
        // matching the documented contract and the other constructors.
        setSinkCompression(sinkCompression);
        verifySourceFields(sourceFields);
    }

    /**
     * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be
     * returned in the subsequent tuples.
     *
     * @param sourceFields    of type Fields
     * @param sinkFields      of type Fields
     * @param sinkCompression of type Compress; if {@code null}, compression remains disabled
     * @param numSinkParts    of type int
     * @throws IllegalArgumentException if sourceFields does not declare exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression", "numSinkParts" })
    public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts) {
        super(sourceFields, sinkFields, numSinkParts);
        setSinkCompression(sinkCompression);
        verifySourceFields(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples.
     *
     * @param sourceFields the source fields for this scheme
     * @throws IllegalArgumentException if sourceFields does not declare exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields" })
    public TextLine(Fields sourceFields) {
        super(sourceFields);
        verifySourceFields(sourceFields);
    }

    /**
     * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
     * subsequent tuples. The resulting data set will have numSinkParts.
     *
     * @param sourceFields the source fields for this scheme
     * @param numSinkParts of type int
     * @throws IllegalArgumentException if sourceFields does not declare exactly one or two fields
     */
    @ConstructorProperties({ "sourceFields", "numSinkParts" })
    public TextLine(Fields sourceFields, int numSinkParts) {
        super(sourceFields, numSinkParts);
        verifySourceFields(sourceFields);
    }

    /**
     * Checks that the given source fields declare exactly one or two fields.
     *
     * @param sourceFields the source fields handed to a constructor
     * @throws IllegalArgumentException if the field count is outside the range [1, 2]
     */
    private static void verifySourceFields(Fields sourceFields) {
        if (sourceFields.size() < 1 || sourceFields.size() > 2) {
            throw new IllegalArgumentException(
                "this scheme requires either one or two source fields, given [" + sourceFields + "]");
        }
    }

    /**
     * Method getSinkCompression returns the sinkCompression of this TextLine object.
     *
     * @return the sinkCompression (type Compress) of this TextLine object.
     */
    public Compress getSinkCompression() {
        return sinkCompression;
    }

    /**
     * Method setSinkCompression sets the sinkCompression of this TextLine object. If null, the current value is
     * left unchanged (compression remains disabled unless it was previously changed).
     *
     * @param sinkCompression the sinkCompression of this TextLine object.
     */
    public void setSinkCompression(Compress sinkCompression) {
        // leave disabled if null
        if (sinkCompression != null) {
            this.sinkCompression = sinkCompression;
        }
    }

    /**
     * Selects the input format on the given job configuration: {@link ZipInputFormat} when the configured input
     * paths all end with ".zip", {@link TextInputFormat} otherwise.
     *
     * @param tap  the tap being sourced (not used by this implementation)
     * @param conf the job configuration to update
     * @throws IllegalStateException if the input paths mix ".zip" and non-".zip" names
     */
    @Override
    public void sourceInit(Tap tap, JobConf conf) {
        if (hasZippedFiles(FileInputFormat.getInputPaths(conf))) {
            conf.setInputFormat(ZipInputFormat.class);
        } else {
            conf.setInputFormat(TextInputFormat.class);
        }
    }

    /**
     * Reports whether the given paths name zip files, judged by a ".zip" suffix on each path name.
     *
     * @param paths the input paths to inspect; may be {@code null} or empty
     * @return true if every path name ends with ".zip"; false if none do or if there are no paths
     * @throws IllegalStateException if some names end with ".zip" and others do not
     */
    private boolean hasZippedFiles(Path[] paths) {
        // Nothing configured: treat as plain text rather than failing on paths[0].
        if (paths == null || paths.length == 0) {
            return false;
        }

        boolean isZipped = paths[0].getName().endsWith(".zip");

        for (int i = 1; i < paths.length; i++) {
            if (isZipped != paths[i].getName().endsWith(".zip")) {
                throw new IllegalStateException("cannot mix zipped and upzippled files");
            }
        }

        return isZipped;
    }

    /**
     * Configures the job for writing text output: rejects ".zip" targets, applies the sink compression setting,
     * and sets the output key/value classes and output format.
     *
     * @param tap  the tap being sunk to; its qualified path is checked for a ".zip" suffix
     * @param conf the job configuration to update
     * @throws IOException           declared by the overridden method
     * @throws IllegalStateException if the tap's qualified path ends with ".zip"
     */
    @Override
    public void sinkInit(Tap tap, JobConf conf) throws IOException {
        if (tap.getQualifiedPath(conf).toString().endsWith(".zip")) {
            throw new IllegalStateException("cannot write zip files: " + FileOutputFormat.getOutputPath(conf));
        }

        // Compress.DEFAULT intentionally matches neither branch, leaving the job's own setting in place.
        Compress compression = getSinkCompression();

        if (compression == Compress.DISABLE) {
            conf.setBoolean("mapred.output.compress", false);
        } else if (compression == Compress.ENABLE) {
            conf.setBoolean("mapred.output.compress", true);
        }

        conf.setOutputKeyClass(Text.class); // be explicit
        conf.setOutputValueClass(Text.class); // be explicit
        conf.setOutputFormat(TextOutputFormat.class);
    }

    /**
     * Builds a {@link Tuple} from an input record. When two source fields are declared, the tuple holds the
     * key's string form followed by the value's string form; otherwise it holds only the value's string form.
     *
     * @param key   the record key
     * @param value the record value
     * @return a new Tuple with one or two String values
     */
    @Override
    public Tuple source(Object key, Object value) {
        Tuple tuple = new Tuple();

        if (sourceFields.size() == 2) {
            tuple.add(key.toString());
        }

        tuple.add(value.toString());

        return tuple;
    }

    /**
     * Emits the sink-field selection of the given entry to the collector with a null key.
     *
     * @param tupleEntry      the entry to write
     * @param outputCollector the collector receiving the selected tuple
     * @throws IOException if the collector fails to accept the value
     */
    @Override
    public void sink(TupleEntry tupleEntry, OutputCollector outputCollector) throws IOException {
        // it's ok to use NULL here so the collector does not write anything
        outputCollector.collect(null, tupleEntry.selectTuple(sinkFields));
    }
}