Java tutorial: Apache Crunch's in-memory pipeline (MemPipeline)

The class below, org.apache.crunch.impl.mem.MemPipeline, implements Crunch's Pipeline interface entirely in memory. Reads, writes, and runs all execute eagerly in the local JVM with no Hadoop cluster involved, which makes it well suited to small data sets and to unit-testing Crunch code.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.impl.mem;

import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.crunch.CrunchRuntimeException;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineExecution;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.Source;
import org.apache.crunch.TableSource;
import org.apache.crunch.Target;
import org.apache.crunch.impl.mem.collect.MemCollection;
import org.apache.crunch.impl.mem.collect.MemTable;
import org.apache.crunch.io.At;
import org.apache.crunch.io.PathTarget;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.io.avro.AvroFileTarget;
import org.apache.crunch.io.seq.SeqFileTarget;
import org.apache.crunch.types.Converter;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.avro.ReflectDataFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Counters;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.AbstractFuture;

public class MemPipeline implements Pipeline {

  private static final Log LOG = LogFactory.getLog(MemPipeline.class);
  private static Counters COUNTERS = new CountersWrapper();
  private static final MemPipeline INSTANCE = new MemPipeline();

  private int outputIndex = 0;

  public static Counters getCounters() {
    return COUNTERS;
  }

  public static void clearCounters() {
    COUNTERS = new CountersWrapper();
  }

  public static Pipeline getInstance() {
    return INSTANCE;
  }

  public static <T> PCollection<T> collectionOf(T... ts) {
    return new MemCollection<T>(ImmutableList.copyOf(ts));
  }

  public static <T> PCollection<T> collectionOf(Iterable<T> collect) {
    return new MemCollection<T>(collect);
  }

  public static <T> PCollection<T> typedCollectionOf(PType<T> ptype, T... ts) {
    return new MemCollection<T>(ImmutableList.copyOf(ts), ptype, null);
  }

  public static <T> PCollection<T> typedCollectionOf(PType<T> ptype, Iterable<T> collect) {
    return new MemCollection<T>(collect, ptype, null);
  }
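  // A minimal usage sketch for the factory methods above (illustrative test
  // code, not part of this class; Writables.strings() is assumed to come from
  // crunch-core's org.apache.crunch.types.writable.Writables):
  //
  //   PCollection<String> words = MemPipeline.typedCollectionOf(
  //       Writables.strings(), "one", "two", "three");
  //
  // The untyped collectionOf variants pass no PType, so they only suit
  // operations that never need to inspect the element type.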
  public static <S, T> PTable<S, T> tableOf(S s, T t, Object... more) {
    List<Pair<S, T>> pairs = Lists.newArrayList();
    pairs.add(Pair.of(s, t));
    for (int i = 0; i < more.length; i += 2) {
      pairs.add(Pair.of((S) more[i], (T) more[i + 1]));
    }
    return new MemTable<S, T>(pairs);
  }

  public static <S, T> PTable<S, T> typedTableOf(PTableType<S, T> ptype, S s, T t, Object... more) {
    List<Pair<S, T>> pairs = Lists.newArrayList();
    pairs.add(Pair.of(s, t));
    for (int i = 0; i < more.length; i += 2) {
      pairs.add(Pair.of((S) more[i], (T) more[i + 1]));
    }
    return new MemTable<S, T>(pairs, ptype, null);
  }

  public static <S, T> PTable<S, T> tableOf(Iterable<Pair<S, T>> pairs) {
    return new MemTable<S, T>(pairs);
  }

  public static <S, T> PTable<S, T> typedTableOf(PTableType<S, T> ptype, Iterable<Pair<S, T>> pairs) {
    return new MemTable<S, T>(pairs, ptype, null);
  }

  private Configuration conf = new Configuration();
  private Set<Target> activeTargets = Sets.newHashSet();

  private MemPipeline() {
  }

  @Override
  public void setConfiguration(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public Configuration getConfiguration() {
    return conf;
  }

  @Override
  public <T> PCollection<T> read(Source<T> source) {
    if (source instanceof ReadableSource) {
      try {
        Iterable<T> iterable = ((ReadableSource<T>) source).read(conf);
        return new MemCollection<T>(iterable, source.getType(), source.toString());
      } catch (IOException e) {
        LOG.error("Exception reading source: " + source.toString(), e);
        throw new IllegalStateException(e);
      }
    }
    LOG.error("Source " + source + " is not readable");
    throw new IllegalStateException("Source " + source + " is not readable");
  }

  @Override
  public <K, V> PTable<K, V> read(TableSource<K, V> source) {
    if (source instanceof ReadableSource) {
      try {
        Iterable<Pair<K, V>> iterable = ((ReadableSource<Pair<K, V>>) source).read(conf);
        return new MemTable<K, V>(iterable, source.getTableType(), source.toString());
      } catch (IOException e) {
        LOG.error("Exception reading source: " + source.toString(), e);
        throw new IllegalStateException(e);
      }
    }
    LOG.error("Source " + source + " is not readable");
    throw new IllegalStateException("Source " + source + " is not readable");
  }
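  // A usage sketch for the table factories and read methods above
  // (illustrative test code; the PTypes are assumed from crunch-core's
  // Writables, and the input path is a placeholder):
  //
  //   Pipeline pipeline = MemPipeline.getInstance();
  //   PCollection<String> lines = pipeline.readTextFile("/tmp/input.txt");
  //   PTable<String, Long> counts = MemPipeline.typedTableOf(
  //       Writables.tableOf(Writables.strings(), Writables.longs()),
  //       "a", 1L, "b", 2L);
  //
  // Note that read() consumes a ReadableSource eagerly, so the returned
  // collection is fully populated as soon as the call returns.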
  @Override
  public void write(PCollection<?> collection, Target target) {
    write(collection, target, Target.WriteMode.DEFAULT);
  }

  @Override
  public void write(PCollection<?> collection, Target target, Target.WriteMode writeMode) {
    target.handleExisting(writeMode, -1, getConfiguration());
    if (writeMode != Target.WriteMode.APPEND && activeTargets.contains(target)) {
      throw new CrunchRuntimeException("Target " + target + " is already written in the current run."
          + " Use WriteMode.APPEND in order to write additional data to it.");
    }
    activeTargets.add(target);
    if (target instanceof PathTarget) {
      Path path = ((PathTarget) target).getPath();
      try {
        FileSystem fs = path.getFileSystem(conf);
        // Each write within a run goes to a fresh "outN" file under the target path.
        outputIndex++;
        if (target instanceof SeqFileTarget) {
          if (collection instanceof PTable) {
            writeSequenceFileFromPTable(fs, path, (PTable<?, ?>) collection);
          } else {
            writeSequenceFileFromPCollection(fs, path, collection);
          }
        } else {
          FSDataOutputStream os = fs.create(new Path(path, "out" + outputIndex));
          if (target instanceof AvroFileTarget && !(collection instanceof PTable)) {
            writeAvroFile(os, collection.materialize());
          } else {
            LOG.warn("Defaulting to write to a text file from MemPipeline");
            if (collection instanceof PTable) {
              for (Object o : collection.materialize()) {
                Pair p = (Pair) o;
                os.writeBytes(p.first().toString());
                os.writeBytes("\t");
                os.writeBytes(p.second().toString());
                os.writeBytes("\r\n");
              }
            } else {
              for (Object o : collection.materialize()) {
                os.writeBytes(o.toString() + "\r\n");
              }
            }
          }
          os.close();
        }
      } catch (IOException e) {
        LOG.error("Exception writing target: " + target, e);
      }
    } else {
      LOG.error("Target " + target + " is not a PathTarget instance");
    }
  }

  @SuppressWarnings({ "rawtypes", "unchecked" })
  private void writeAvroFile(FSDataOutputStream outputStream, Iterable genericRecords) throws IOException {
    // Infer the schema from the first record: either a GenericContainer that
    // carries its own schema, or a POJO whose schema is derived via reflection.
    Object r = genericRecords.iterator().next();
    Schema schema = null;
    if (r instanceof GenericContainer) {
      schema = ((GenericContainer) r).getSchema();
    } else {
      schema = new ReflectDataFactory().getReflectData().getSchema(r.getClass());
    }
    GenericDatumWriter genericDatumWriter = new GenericDatumWriter(schema);
    DataFileWriter dataFileWriter = new DataFileWriter(genericDatumWriter);
    dataFileWriter.create(schema, outputStream);
    for (Object record : genericRecords) {
      dataFileWriter.append(record);
    }
    dataFileWriter.close();
    outputStream.close();
  }

  @SuppressWarnings({ "rawtypes", "unchecked" })
  private void writeSequenceFileFromPTable(final FileSystem fs, final Path path, final PTable table)
      throws IOException {
    final PTableType pType = table.getPTableType();
    final Class<?> keyClass = pType.getConverter().getKeyClass();
    final Class<?> valueClass = pType.getConverter().getValueClass();
    final SequenceFile.Writer writer = new SequenceFile.Writer(fs, fs.getConf(), path, keyClass, valueClass);
    for (final Object o : table.materialize()) {
      final Pair<?, ?> p = (Pair) o;
      final Object key = pType.getKeyType().getOutputMapFn().map(p.first());
      final Object value = pType.getValueType().getOutputMapFn().map(p.second());
      writer.append(key, value);
    }
    writer.close();
  }

  private void writeSequenceFileFromPCollection(final FileSystem fs, final Path path, final PCollection collection)
      throws IOException {
    final PType pType = collection.getPType();
    final Converter converter = pType.getConverter();
    final Class valueClass = converter.getValueClass();
    final SequenceFile.Writer writer = new SequenceFile.Writer(fs, fs.getConf(), path, NullWritable.class, valueClass);
    for (final Object o : collection.materialize()) {
      final Object value = pType.getOutputMapFn().map(o);
      writer.append(NullWritable.get(), value);
    }
    writer.close();
  }

  @Override
  public PCollection<String> readTextFile(String pathName) {
    return read(At.textFile(pathName));
  }

  @Override
  public <T> void writeTextFile(PCollection<T> collection, String pathName) {
    write(collection, At.textFile(pathName));
  }

  @Override
  public <T> Iterable<T> materialize(PCollection<T> pcollection) {
    return pcollection.materialize();
  }
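  // A usage sketch for the write methods above (illustrative test code;
  // To.sequenceFile is assumed from org.apache.crunch.io.To, and the paths
  // are placeholders). Writes happen immediately, so the output files exist
  // as soon as write() returns:
  //
  //   pipeline.writeTextFile(counts, "/tmp/out-text");
  //   pipeline.write(counts, To.sequenceFile("/tmp/out-seq"),
  //       Target.WriteMode.OVERWRITE);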
  @Override
  public PipelineExecution runAsync() {
    activeTargets.clear();
    return new MemExecution();
  }

  @Override
  public PipelineResult run() {
    try {
      return runAsync().get();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public PipelineResult done() {
    return run();
  }

  @Override
  public void enableDebug() {
    LOG.info("Note: in-memory pipelines do not have debug logging");
  }

  @Override
  public String getName() {
    return "Memory Pipeline";
  }

  // MemPipeline executes every operation eagerly, so by the time runAsync()
  // is called all of the work has already happened; MemExecution just reports
  // a successful result backed by the accumulated counters.
  private static class MemExecution extends AbstractFuture<PipelineResult> implements PipelineExecution {

    private PipelineResult res;

    public MemExecution() {
      this.res = new PipelineResult(
          ImmutableList.of(new PipelineResult.StageResult("MemPipelineStage", COUNTERS)),
          PipelineExecution.Status.SUCCEEDED);
    }

    @Override
    public String getPlanDotFile() {
      return "";
    }

    @Override
    public void waitFor(long timeout, TimeUnit timeUnit) throws InterruptedException {
      set(res);
    }

    @Override
    public void waitUntilDone() throws InterruptedException {
      set(res);
    }

    @Override
    public PipelineResult get() throws ExecutionException, InterruptedException {
      set(res);
      return super.get();
    }

    @Override
    public PipelineResult get(long timeout, TimeUnit timeUnit)
        throws InterruptedException, ExecutionException, TimeoutException {
      set(res);
      return super.get(timeout, timeUnit);
    }

    @Override
    public Status getStatus() {
      return isDone() ? Status.SUCCEEDED : Status.READY;
    }

    @Override
    public PipelineResult getResult() {
      return isDone() ? res : null;
    }

    @Override
    public void kill() {
      // No-op
    }
  }
}
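Putting it together, here is a minimal sketch of an end-to-end test against MemPipeline. The PTypes come from crunch-core's Writables family; the class name, the split logic, and the printed output are illustrative assumptions, not part of the class above.

import java.util.List;

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.types.writable.Writables;

import com.google.common.collect.Lists;

public class MemPipelineExample {

  public static void main(String[] args) {
    // The singleton in-memory pipeline; no cluster or local job runner needed.
    Pipeline pipeline = MemPipeline.getInstance();

    // Build an input collection directly from literals.
    PCollection<String> lines = MemPipeline.typedCollectionOf(
        Writables.strings(), "hello world", "hello crunch");

    // A trivial DoFn that splits each line into words.
    PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
      @Override
      public void process(String line, Emitter<String> emitter) {
        for (String word : line.split("\\s+")) {
          emitter.emit(word);
        }
      }
    }, Writables.strings());

    // materialize() returns results immediately, since everything ran eagerly.
    List<String> result = Lists.newArrayList(pipeline.materialize(words));
    System.out.println(result); // e.g. [hello, world, hello, crunch]

    pipeline.done();
  }
}

Because MemCollection.parallelDo invokes the DoFn on the spot, there is nothing deferred to inspect or schedule: done() simply returns the already-successful PipelineResult described by MemExecution above.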