Java tutorial
/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.scheme.hadoop;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.management.annotation.Property;
import cascading.management.annotation.PropertyDescription;
import cascading.management.annotation.Visibility;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

import static cascading.flow.hadoop.util.HadoopUtil.asJobConfInstance;

/**
 * A TextLine is a type of {@link cascading.scheme.Scheme} for plain text files. Files are broken into
 * lines. Either line-feed or carriage-return is used to signal end of line.
 * <p/>
 * By default, this scheme returns a {@link Tuple} with two fields, "offset" and "line".
 * <p/>
 * Many of the constructors take both "sourceFields" and "sinkFields". sourceFields denote the field names
 * to be used instead of the names "offset" and "line". sinkFields is a selector and is by default {@link Fields#ALL}.
 * Any available field names can be given if only a subset of the incoming fields should be used.
 * <p/>
 * If a {@link Fields} instance is passed on the constructor as sourceFields having only one field, the return tuples
 * will simply be the "line" value using the given field name.
 * <p/>
 * Note that TextLine will concatenate all the Tuple values for the selected fields with a TAB delimiter before
 * writing out the line.
 * <p/>
 * Note sink compression is {@link Compress#DISABLE} by default. If {@code null} is passed to the constructor
 * for the compression value, it will remain disabled.
 * <p/>
 * If any of the input files end with ".zip", an error will be thrown.
 * <p/>
 * By default, all text is encoded/decoded as UTF-8. This can be changed via the {@code charsetName} constructor
 * argument.
 */
public class TextLine extends Scheme<Configuration, RecordReader, OutputCollector, Object[], Object[]> {

  /**
   * Sink compression modes. {@link #ENABLE} and {@link #DISABLE} force the
   * "mapred.output.compress" setting on or off; {@link #DEFAULT} leaves the
   * configuration untouched (see {@link #sinkConfInit}).
   */
  public enum Compress {
    DEFAULT, ENABLE, DISABLE
  }

  /** Name of the charset used when none is supplied to a constructor. */
  public static final String DEFAULT_CHARSET = "UTF-8";

  /** Field serialVersionUID */
  private static final long serialVersionUID = 1L;

  /** Field DEFAULT_SOURCE_FIELDS */
  public static final Fields DEFAULT_SOURCE_FIELDS = new Fields("offset", "line");

  /** Field sinkCompression */
  Compress sinkCompression = Compress.DISABLE;

  /** Name of the charset used to decode sourced lines and encode sunk lines. */
  String charsetName = DEFAULT_CHARSET;

  /**
   * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
   * "offset" is the byte offset in the input file.
   */
  public TextLine() {
    super(DEFAULT_SOURCE_FIELDS);
  }

  /**
   * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
   * "offset" is the byte offset in the input file.
   *
   * @param numSinkParts number of sink parts, passed through to the parent {@link Scheme}
   */
  @ConstructorProperties({ "numSinkParts" })
  public TextLine(int numSinkParts) {
    super(DEFAULT_SOURCE_FIELDS, numSinkParts);
  }

  /**
   * Creates a new TextLine instance that sources "offset" and "line" fields, and sinks all incoming fields, where
   * "offset" is the byte offset in the input file.
   *
   * @param sinkCompression the sink compression mode; {@code null} leaves compression disabled
   */
  @ConstructorProperties({ "sinkCompression" })
  public TextLine(Compress sinkCompression) {
    super(DEFAULT_SOURCE_FIELDS);

    setSinkCompression(sinkCompression);
  }

  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields the source fields for this scheme
   * @param sinkFields   the sink fields for this scheme
   * @throws IllegalArgumentException if sourceFields does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "sinkFields" })
  public TextLine(Fields sourceFields, Fields sinkFields) {
    super(sourceFields, sinkFields);

    verify(sourceFields);
  }

  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields the source fields for this scheme
   * @param sinkFields   the sink fields for this scheme
   * @param charsetName  name of the charset to encode/decode text with; {@code null} keeps the default
   * @throws IllegalArgumentException if the charset name is illegal or unsupported, or if sourceFields
   *                                  does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "sinkFields", "charsetName" })
  public TextLine(Fields sourceFields, Fields sinkFields, String charsetName) {
    super(sourceFields, sinkFields);

    // throws an exception if not found
    setCharsetName(charsetName);

    verify(sourceFields);
  }

  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields the source fields for this scheme
   * @param sinkFields   the sink fields for this scheme
   * @param numSinkParts number of sink parts, passed through to the parent {@link Scheme}
   * @throws IllegalArgumentException if sourceFields does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "sinkFields", "numSinkParts" })
  public TextLine(Fields sourceFields, Fields sinkFields, int numSinkParts) {
    super(sourceFields, sinkFields, numSinkParts);

    verify(sourceFields);
  }

  /**
   * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be
   * returned in the subsequent tuples.
   *
   * @param sourceFields    the source fields for this scheme
   * @param sinkFields      the sink fields for this scheme
   * @param sinkCompression the sink compression mode; {@code null} leaves compression disabled
   * @throws IllegalArgumentException if sourceFields does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression" })
  public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression) {
    super(sourceFields, sinkFields);

    setSinkCompression(sinkCompression);

    verify(sourceFields);
  }

  /**
   * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be
   * returned in the subsequent tuples.
   *
   * @param sourceFields    the source fields for this scheme
   * @param sinkFields      the sink fields for this scheme
   * @param sinkCompression the sink compression mode; {@code null} leaves compression disabled
   * @param charsetName     name of the charset to encode/decode text with; {@code null} keeps the default
   * @throws IllegalArgumentException if the charset name is illegal or unsupported, or if sourceFields
   *                                  does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression", "charsetName" })
  public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression, String charsetName) {
    super(sourceFields, sinkFields);

    setSinkCompression(sinkCompression);

    // throws an exception if not found
    setCharsetName(charsetName);

    verify(sourceFields);
  }

  /**
   * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be
   * returned in the subsequent tuples.
   *
   * @param sourceFields    the source fields for this scheme
   * @param sinkFields      the sink fields for this scheme
   * @param sinkCompression the sink compression mode; {@code null} leaves compression disabled
   * @param numSinkParts    number of sink parts, passed through to the parent {@link Scheme}
   * @throws IllegalArgumentException if sourceFields does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression", "numSinkParts" })
  public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts) {
    super(sourceFields, sinkFields, numSinkParts);

    setSinkCompression(sinkCompression);

    verify(sourceFields);
  }

  /**
   * Constructor TextLine creates a new TextLine instance. If sourceFields has one field, only the text line will be
   * returned in the subsequent tuples.
   *
   * @param sourceFields    the source fields for this scheme
   * @param sinkFields      the sink fields for this scheme
   * @param sinkCompression the sink compression mode; {@code null} leaves compression disabled
   * @param numSinkParts    number of sink parts, passed through to the parent {@link Scheme}
   * @param charsetName     name of the charset to encode/decode text with; {@code null} keeps the default
   * @throws IllegalArgumentException if the charset name is illegal or unsupported, or if sourceFields
   *                                  does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "sinkFields", "sinkCompression", "numSinkParts", "charsetName" })
  public TextLine(Fields sourceFields, Fields sinkFields, Compress sinkCompression, int numSinkParts,
                  String charsetName) {
    super(sourceFields, sinkFields, numSinkParts);

    setSinkCompression(sinkCompression);

    // throws an exception if not found
    setCharsetName(charsetName);

    verify(sourceFields);
  }

  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields the source fields for this scheme
   * @throws IllegalArgumentException if sourceFields does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields" })
  public TextLine(Fields sourceFields) {
    super(sourceFields);

    verify(sourceFields);
  }

  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples.
   *
   * @param sourceFields the source fields for this scheme
   * @param charsetName  name of the charset to encode/decode text with; {@code null} keeps the default
   * @throws IllegalArgumentException if the charset name is illegal or unsupported, or if sourceFields
   *                                  does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "charsetName" })
  public TextLine(Fields sourceFields, String charsetName) {
    super(sourceFields);

    // throws an exception if not found
    setCharsetName(charsetName);

    verify(sourceFields);
  }

  /**
   * Creates a new TextLine instance. If sourceFields has one field, only the text line will be returned in the
   * subsequent tuples. The resulting data set will have numSinkParts.
   *
   * @param sourceFields the source fields for this scheme
   * @param numSinkParts number of sink parts, passed through to the parent {@link Scheme}
   * @throws IllegalArgumentException if sourceFields does not hold exactly one or two fields
   */
  @ConstructorProperties({ "sourceFields", "numSinkParts" })
  public TextLine(Fields sourceFields, int numSinkParts) {
    super(sourceFields, numSinkParts);

    verify(sourceFields);
  }

  /**
   * Sets the charset used by this scheme, keeping the current value when {@code null} is given.
   * <p/>
   * The name is resolved through {@link Charset#forName(String)} before the field is updated, so a
   * rejected name never replaces the previously configured (valid) charset.
   *
   * @param charsetName the charset name to use, or {@code null} to keep the current one
   * @throws java.nio.charset.IllegalCharsetNameException if the given name is illegal
   * @throws java.nio.charset.UnsupportedCharsetException if the named charset is not supported
   */
  protected void setCharsetName(String charsetName) {
    String candidate = charsetName != null ? charsetName : this.charsetName;

    // validate first: forName throws on an illegal or unsupported name
    Charset.forName(candidate);

    this.charsetName = candidate;
  }

  /**
   * Method getCharsetName returns the name of the charset used by this TextLine object.
   *
   * @return the charset name (type String) of this TextLine object.
   */
  @Property(name = "charset", visibility = Visibility.PUBLIC)
  @PropertyDescription(value = "character set used in this scheme.")
  public String getCharsetName() {
    return charsetName;
  }

  /**
   * Checks that the given source fields declare either one field (the line) or two fields
   * (the offset and the line).
   *
   * @param sourceFields the source fields to check
   * @throws IllegalArgumentException if the number of fields is less than one or greater than two
   */
  protected void verify(Fields sourceFields) {
    int size = sourceFields.size();

    if (size < 1 || size > 2) {
      throw new IllegalArgumentException(
        "this scheme requires either one or two source fields, given [" + sourceFields + "]");
    }
  }

  /**
   * Method getSinkCompression returns the sinkCompression of this TextLine object.
   *
   * @return the sinkCompression (type Compress) of this TextLine object.
   */
  @Property(name = "sinkCompression", visibility = Visibility.PUBLIC)
  @PropertyDescription(value = "The compression of the scheme when used in a sink.")
  public Compress getSinkCompression() {
    return sinkCompression;
  }

  /**
   * Method setSinkCompression sets the sinkCompression of this TextLine object. If null, compression will remain
   * disabled.
   *
   * @param sinkCompression the sinkCompression of this TextLine object.
   */
  public void setSinkCompression(Compress sinkCompression) {
    if (sinkCompression != null) { // leave disabled if null
      this.sinkCompression = sinkCompression;
    }
  }

  /**
   * Configures the given configuration for reading: rejects ".zip" input paths, selects the old
   * mapred API and registers {@link TextInputFormat} as the input format.
   *
   * @param flowProcess the current flow process
   * @param tap         the tap being sourced
   * @param conf        the configuration to update
   * @throws IllegalStateException if the input paths are zip files, or mix zip and non-zip files
   */
  @Override
  public void sourceConfInit(FlowProcess<? extends Configuration> flowProcess,
                             Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {
    // resolve the input paths once; they feed both the check and the error message
    Path[] inputPaths = FileInputFormat.getInputPaths(asJobConfInstance(conf));

    if (hasZippedFiles(inputPaths)) {
      throw new IllegalStateException("cannot read zip files: " + Arrays.toString(inputPaths));
    }

    conf.setBoolean("mapred.mapper.new-api", false);
    conf.setClass("mapred.input.format.class", TextInputFormat.class, InputFormat.class);
  }

  /**
   * Reports whether the given paths are zip files, judged by a ".zip" suffix on the path name.
   *
   * @param paths the paths to inspect, may be {@code null} or empty
   * @return {@code true} if every path ends with ".zip", {@code false} if none does or no paths are given
   * @throws IllegalStateException if some paths end with ".zip" and others do not
   */
  private boolean hasZippedFiles(Path[] paths) {
    if (paths == null || paths.length == 0) {
      return false;
    }

    // the first path sets the expectation; every other path must agree with it
    boolean isZipped = paths[0].getName().endsWith(".zip");

    for (int i = 1; i < paths.length; i++) {
      if (isZipped != paths[i].getName().endsWith(".zip")) {
        throw new IllegalStateException("cannot mix zipped and unzipped files");
      }
    }

    return isZipped;
  }

  /** Intentionally a no-op so that presented fields never change the state of this TextLine. */
  @Override
  public void presentSourceFields(FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields) {
    // do nothing to change TextLine state
  }

  /** Intentionally a no-op so that presented fields never change the state of this TextLine. */
  @Override
  public void presentSinkFields(FlowProcess<? extends Configuration> flowProcess, Tap tap, Fields fields) {
    // do nothing to change TextLine state
  }

  /**
   * Configures the given configuration for writing: rejects a ".zip" target, selects the old mapred
   * API, applies the sink compression mode and registers {@link Text} keys/values with
   * {@link TextOutputFormat}.
   *
   * @param flowProcess the current flow process
   * @param tap         the tap being sunk to
   * @param conf        the configuration to update
   * @throws IllegalStateException if the tap's full identifier ends with ".zip"
   */
  @Override
  public void sinkConfInit(FlowProcess<? extends Configuration> flowProcess,
                           Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {
    if (tap.getFullIdentifier(conf).endsWith(".zip")) {
      throw new IllegalStateException("cannot write zip files: " + HadoopUtil.getOutputPath(conf));
    }

    conf.setBoolean("mapred.mapper.new-api", false);

    // Compress.DEFAULT deliberately leaves "mapred.output.compress" as already configured
    if (getSinkCompression() == Compress.DISABLE) {
      conf.setBoolean("mapred.output.compress", false);
    } else if (getSinkCompression() == Compress.ENABLE) {
      conf.setBoolean("mapred.output.compress", true);
    }

    conf.setClass("mapred.output.key.class", Text.class, Object.class);
    conf.setClass("mapred.output.value.class", Text.class, Object.class);
    conf.setClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class);
  }

  /**
   * Prepares the source context, creating it if absent. The context holds, in order: the reusable
   * record key, the reusable record value, and the resolved {@link Charset}.
   *
   * @param flowProcess the current flow process
   * @param sourceCall  the source call whose context is initialized
   */
  @Override
  public void sourcePrepare(FlowProcess<? extends Configuration> flowProcess,
                            SourceCall<Object[], RecordReader> sourceCall) {
    if (sourceCall.getContext() == null) {
      sourceCall.setContext(new Object[3]);
    }

    Object[] context = sourceCall.getContext();
    RecordReader input = sourceCall.getInput();

    context[0] = input.createKey();
    context[1] = input.createValue();
    context[2] = Charset.forName(charsetName);
  }

  /**
   * Reads the next record from the input and, if one was read, copies it into the incoming entry.
   *
   * @param flowProcess the current flow process
   * @param sourceCall  the source call providing the input, context and incoming entry
   * @return {@code true} if a record was read, {@code false} if the input is exhausted
   * @throws IOException if reading from the input fails
   */
  @Override
  public boolean source(FlowProcess<? extends Configuration> flowProcess,
                        SourceCall<Object[], RecordReader> sourceCall) throws IOException {
    if (!sourceReadInput(sourceCall)) {
      return false;
    }

    sourceHandleInput(sourceCall);

    return true;
  }

  /**
   * Advances the record reader, filling the key and value objects held in the context.
   *
   * @param sourceCall the source call providing the input and context
   * @return the result of {@code RecordReader#next}, i.e. whether a record was read
   * @throws IOException if reading from the input fails
   */
  private boolean sourceReadInput(SourceCall<Object[], RecordReader> sourceCall) throws IOException {
    Object[] context = sourceCall.getContext();

    return sourceCall.getInput().next(context[0], context[1]);
  }

  /**
   * Copies the current record from the context into the incoming entry. With two source fields the
   * offset is written first, followed by the decoded line; with one source field only the line is
   * written.
   *
   * @param sourceCall the source call providing the context and incoming entry
   */
  protected void sourceHandleInput(SourceCall<Object[], RecordReader> sourceCall) {
    TupleEntry result = sourceCall.getIncomingEntry();
    Object[] context = sourceCall.getContext();

    int index = 0;

    // coerce into canonical forms
    if (getSourceFields().size() == 2) {
      result.setLong(index++, ((LongWritable) context[0]).get());
    }

    result.setString(index, makeEncodedString(context));
  }

  /**
   * Decodes the current {@link Text} value held in the context into a String.
   *
   * @param context the source context; index 1 holds the {@link Text} value, index 2 the {@link Charset}
   * @return the decoded line
   */
  protected String makeEncodedString(Object[] context) {
    Text text = (Text) context[1];
    Charset charset = (Charset) context[2];

    // only the first getLength() bytes of the backing array are valid
    return new String(text.getBytes(), 0, text.getLength(), charset);
  }

  /**
   * Clears the source context.
   *
   * @param flowProcess the current flow process
   * @param sourceCall  the source call whose context is cleared
   */
  @Override
  public void sourceCleanup(FlowProcess<? extends Configuration> flowProcess,
                            SourceCall<Object[], RecordReader> sourceCall) {
    sourceCall.setContext(null);
  }

  /**
   * Prepares the sink context. The context holds, in order: a reusable {@link Text} instance and
   * the resolved {@link Charset}.
   *
   * @param flowProcess the current flow process
   * @param sinkCall    the sink call whose context is initialized
   * @throws IOException declared by the overridden method
   */
  @Override
  public void sinkPrepare(FlowProcess<? extends Configuration> flowProcess,
                          SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    sinkCall.setContext(new Object[2]);

    Object[] context = sinkCall.getContext();

    context[0] = new Text();
    context[1] = Charset.forName(charsetName);
  }

  /**
   * Writes the outgoing tuple as a single line: the tuple's String form is encoded with the
   * configured charset and collected with a {@code null} key.
   *
   * @param flowProcess the current flow process
   * @param sinkCall    the sink call providing the context, outgoing entry and output collector
   * @throws IOException if the collector fails to write
   */
  @Override
  public void sink(FlowProcess<? extends Configuration> flowProcess,
                   SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
    Object[] context = sinkCall.getContext();
    Text text = (Text) context[0];
    Charset charset = (Charset) context[1];

    String line = sinkCall.getOutgoingEntry().getTuple().toString();

    text.set(line.getBytes(charset));

    // it's ok to use NULL here so the collector does not write anything
    sinkCall.getOutput().collect(null, text);
  }
}