// Java tutorial
/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.scheme.hadoop;

import java.beans.ConstructorProperties;
import java.io.IOException;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;

/**
 * A SequenceFile is a type of {@link cascading.scheme.Scheme} backed by a flat file of
 * binary key/value pairs, giving a space- and time-efficient on-disk representation.
 *
 * <p>Source side: each record's value is read into the incoming {@link TupleEntry};
 * the key is ignored. Sink side: tuples are written with {@link Tuple#NULL} as the key.
 * Both sides force the legacy ("old") {@code org.apache.hadoop.mapred} API via the
 * {@code mapred.mapper.new-api} flag.</p>
 */
public class SequenceFile extends Scheme<Configuration, RecordReader, OutputCollector, Object[], Void>
  {
  /** Protected for use by TempDfs and other subclasses. Not for general consumption. */
  protected SequenceFile()
    {
    super( null );
    }

  /**
   * Creates a new SequenceFile instance that both sources and sinks the given field names.
   *
   * @param fields the fields read from and written to this scheme
   */
  @ConstructorProperties({"fields"})
  public SequenceFile( Fields fields )
    {
    super( fields, fields );
    }

  /**
   * Configures the job to read this scheme with the old-API
   * {@link SequenceFileInputFormat}.
   */
  @Override
  public void sourceConfInit( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf )
    {
    // pin the legacy mapred API; the input format below is an old-API class
    conf.setBoolean( "mapred.mapper.new-api", false );
    conf.setClass( "mapred.input.format.class", SequenceFileInputFormat.class, InputFormat.class );
    }

  /**
   * Configures the job to write this scheme with the old-API
   * {@link SequenceFileOutputFormat}, using {@link Tuple} for both key and value classes.
   */
  @Override
  public void sinkConfInit( FlowProcess<? extends Configuration> flowProcess, Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf )
    {
    // pin the legacy mapred API; the output format below is an old-API class
    conf.setBoolean( "mapred.mapper.new-api", false );
    conf.setClass( "mapred.output.key.class", Tuple.class, Object.class );
    conf.setClass( "mapred.output.value.class", Tuple.class, Object.class );
    conf.setClass( "mapred.output.format.class", SequenceFileOutputFormat.class, OutputFormat.class );
    }

  /**
   * Allocates a reusable key/value pair from the record reader and stashes it in the
   * source context so {@link #source} can read into it without re-allocating per record.
   */
  @Override
  public void sourcePrepare( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall )
    {
    RecordReader reader = sourceCall.getInput();

    sourceCall.setContext( new Object[]{reader.createKey(), reader.createValue()} );
    }

  /**
   * Reads the next record into the context pair and copies its value tuple into the
   * incoming entry.
   *
   * @return true if a record was read, false at end of input
   * @throws IOException if the underlying reader fails
   */
  @Override
  public boolean source( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall ) throws IOException
    {
    Object[] context = sourceCall.getContext();
    Tuple keyTuple = (Tuple) context[ 0 ];
    Tuple valueTuple = (Tuple) context[ 1 ];

    if( !sourceCall.getInput().next( keyTuple, valueTuple ) )
      return false;

    TupleEntry incoming = sourceCall.getIncomingEntry();

    // when field types are declared, coerce the tuple to the canonical types
    if( incoming.hasTypes() )
      incoming.setCanonicalTuple( valueTuple );
    else
      incoming.setTuple( valueTuple );

    return true;
    }

  /** Releases the reusable key/value pair held in the source context. */
  @Override
  public void sourceCleanup( FlowProcess<? extends Configuration> flowProcess, SourceCall<Object[], RecordReader> sourceCall )
    {
    sourceCall.setContext( null );
    }

  /**
   * Writes the outgoing tuple as the record value, with {@link Tuple#NULL} as the key.
   *
   * @throws IOException if the underlying collector fails
   */
  @Override
  public void sink( FlowProcess<? extends Configuration> flowProcess, SinkCall<Void, OutputCollector> sinkCall ) throws IOException
    {
    Tuple outgoing = sinkCall.getOutgoingEntry().getTuple();

    sinkCall.getOutput().collect( Tuple.NULL, outgoing );
    }
  }