org.apache.crunch.impl.mem.MemPipeline.java Source code

Introduction

MemPipeline is the in-memory implementation of Crunch's Pipeline interface: operations are evaluated eagerly in the local JVM rather than being planned into MapReduce jobs, which makes it the usual choice for unit-testing Crunch code. Here is the full source of org.apache.crunch.impl.mem.MemPipeline.java.
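
Before reading the implementation, it may help to see how the class is typically driven. The following is a minimal, self-contained sketch; the class name MemPipelineUsage and the sample data are invented for illustration:

import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.types.writable.Writables;

public class MemPipelineUsage {
    public static void main(String[] args) {
        // Build collections directly from in-memory values; no cluster and
        // no input files are needed. (The sample data is made up.)
        PCollection<String> words = MemPipeline.typedCollectionOf(
                Writables.strings(), "apple", "banana", "cherry");

        PTable<String, Long> counts = MemPipeline.typedTableOf(
                Writables.tableOf(Writables.strings(), Writables.longs()),
                "apple", 1L, "banana", 2L);

        // Operations run eagerly, so materialize() simply returns the
        // backing Iterable.
        for (String word : words.materialize()) {
            System.out.println(word);
        }
        for (Pair<String, Long> entry : counts.materialize()) {
            System.out.println(entry.first() + " -> " + entry.second());
        }

        // run()/done() return immediately with a SUCCEEDED result.
        Pipeline pipeline = MemPipeline.getInstance();
        pipeline.done();
    }
}

Note that getInstance() returns a shared singleton, so state such as counters persists across tests unless explicitly cleared with clearCounters().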

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.impl.mem;

import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import com.google.common.util.concurrent.AbstractFuture;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.crunch.CrunchRuntimeException;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineExecution;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.Source;
import org.apache.crunch.TableSource;
import org.apache.crunch.Target;
import org.apache.crunch.impl.mem.collect.MemCollection;
import org.apache.crunch.impl.mem.collect.MemTable;
import org.apache.crunch.io.At;
import org.apache.crunch.io.PathTarget;
import org.apache.crunch.io.ReadableSource;
import org.apache.crunch.io.avro.AvroFileTarget;
import org.apache.crunch.io.seq.SeqFileTarget;
import org.apache.crunch.types.Converter;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.avro.ReflectDataFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Counters;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

public class MemPipeline implements Pipeline {
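    // Everything in MemPipeline runs eagerly in the local JVM: no Hadoop jobs
    // are planned or launched, which is what makes it suitable for unit tests.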

    private static final Log LOG = LogFactory.getLog(MemPipeline.class);
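    // COUNTERS is shared, static state; clearCounters() swaps in a fresh
    // wrapper so tests can start from a clean slate.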
    private static Counters COUNTERS = new CountersWrapper();
    private static final MemPipeline INSTANCE = new MemPipeline();

    private int outputIndex = 0;

    public static Counters getCounters() {
        return COUNTERS;
    }

    public static void clearCounters() {
        COUNTERS = new CountersWrapper();
    }

    public static Pipeline getInstance() {
        return INSTANCE;
    }

    public static <T> PCollection<T> collectionOf(T... ts) {
        return new MemCollection<T>(ImmutableList.copyOf(ts));
    }

    public static <T> PCollection<T> collectionOf(Iterable<T> collect) {
        return new MemCollection<T>(collect);
    }

    public static <T> PCollection<T> typedCollectionOf(PType<T> ptype, T... ts) {
        return new MemCollection<T>(ImmutableList.copyOf(ts), ptype, null);
    }

    public static <T> PCollection<T> typedCollectionOf(PType<T> ptype, Iterable<T> collect) {
        return new MemCollection<T>(collect, ptype, null);
    }

    @SuppressWarnings("unchecked")
    public static <S, T> PTable<S, T> tableOf(S s, T t, Object... more) {
        List<Pair<S, T>> pairs = Lists.newArrayList();
        pairs.add(Pair.of(s, t));
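        // The trailing varargs alternate key, value, key, value, ...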
        for (int i = 0; i < more.length; i += 2) {
            pairs.add(Pair.of((S) more[i], (T) more[i + 1]));
        }
        return new MemTable<S, T>(pairs);
    }

    @SuppressWarnings("unchecked")
    public static <S, T> PTable<S, T> typedTableOf(PTableType<S, T> ptype, S s, T t, Object... more) {
        List<Pair<S, T>> pairs = Lists.newArrayList();
        pairs.add(Pair.of(s, t));
        for (int i = 0; i < more.length; i += 2) {
            pairs.add(Pair.of((S) more[i], (T) more[i + 1]));
        }
        return new MemTable<S, T>(pairs, ptype, null);
    }

    public static <S, T> PTable<S, T> tableOf(Iterable<Pair<S, T>> pairs) {
        return new MemTable<S, T>(pairs);
    }

    public static <S, T> PTable<S, T> typedTableOf(PTableType<S, T> ptype, Iterable<Pair<S, T>> pairs) {
        return new MemTable<S, T>(pairs, ptype, null);
    }

    private Configuration conf = new Configuration();
    private Set<Target> activeTargets = Sets.newHashSet();

    private MemPipeline() {
    }

    @Override
    public void setConfiguration(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConfiguration() {
        return conf;
    }

    @Override
    public <T> PCollection<T> read(Source<T> source) {
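        // In-memory reads only work for sources that implement ReadableSource;
        // anything else fails fast with an IllegalStateException.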
        if (source instanceof ReadableSource) {
            try {
                Iterable<T> iterable = ((ReadableSource<T>) source).read(conf);
                return new MemCollection<T>(iterable, source.getType(), source.toString());
            } catch (IOException e) {
                LOG.error("Exception reading source: " + source.toString(), e);
                throw new IllegalStateException(e);
            }
        }
        LOG.error("Source " + source + " is not readable");
        throw new IllegalStateException("Source " + source + " is not readable");
    }

    @Override
    public <K, V> PTable<K, V> read(TableSource<K, V> source) {
        if (source instanceof ReadableSource) {
            try {
                Iterable<Pair<K, V>> iterable = ((ReadableSource<Pair<K, V>>) source).read(conf);
                return new MemTable<K, V>(iterable, source.getTableType(), source.toString());
            } catch (IOException e) {
                LOG.error("Exception reading source: " + source.toString(), e);
                throw new IllegalStateException(e);
            }
        }
        LOG.error("Source " + source + " is not readable");
        throw new IllegalStateException("Source " + source + " is not readable");
    }

    @Override
    public void write(PCollection<?> collection, Target target) {
        write(collection, target, Target.WriteMode.DEFAULT);
    }

    @Override
    public void write(PCollection<?> collection, Target target, Target.WriteMode writeMode) {
        target.handleExisting(writeMode, -1, getConfiguration());
        if (writeMode != Target.WriteMode.APPEND && activeTargets.contains(target)) {
            throw new CrunchRuntimeException("Target " + target + " is already written in the current run."
                    + " Use WriteMode.APPEND in order to write additional data to it.");
        }
        activeTargets.add(target);
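        // Only path-based targets are supported: SequenceFile targets are
        // written natively, Avro targets (for non-tables) get a real Avro
        // container file, and everything else falls back to tab/newline text.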
        if (target instanceof PathTarget) {
            Path path = ((PathTarget) target).getPath();
            try {
                FileSystem fs = path.getFileSystem(conf);
                outputIndex++;
                if (target instanceof SeqFileTarget) {
                    if (collection instanceof PTable) {
                        writeSequenceFileFromPTable(fs, path, (PTable<?, ?>) collection);
                    } else {
                        writeSequenceFileFromPCollection(fs, path, collection);
                    }
                } else {
                    FSDataOutputStream os = fs.create(new Path(path, "out" + outputIndex));
                    if (target instanceof AvroFileTarget && !(collection instanceof PTable)) {
                        writeAvroFile(os, collection.materialize());
                    } else {
                        LOG.warn("Defaulting to write to a text file from MemPipeline");
                        if (collection instanceof PTable) {
                            for (Object o : collection.materialize()) {
                                Pair p = (Pair) o;
                                os.writeBytes(p.first().toString());
                                os.writeBytes("\t");
                                os.writeBytes(p.second().toString());
                                os.writeBytes("\r\n");
                            }
                        } else {
                            for (Object o : collection.materialize()) {
                                os.writeBytes(o.toString() + "\r\n");
                            }
                        }
                    }
                    os.close();
                }
            } catch (IOException e) {
                LOG.error("Exception writing target: " + target, e);
            }
        } else {
            LOG.error("Target " + target + " is not a PathTarget instance");
        }
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    private void writeAvroFile(FSDataOutputStream outputStream, Iterable genericRecords) throws IOException {
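        // The schema is inferred from the first record, so this method assumes
        // genericRecords is non-empty; an empty Iterable would throw
        // NoSuchElementException below.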

        Object r = genericRecords.iterator().next();

        Schema schema = null;

        if (r instanceof GenericContainer) {
            schema = ((GenericContainer) r).getSchema();
        } else {
            schema = new ReflectDataFactory().getReflectData().getSchema(r.getClass());
        }

        GenericDatumWriter genericDatumWriter = new GenericDatumWriter(schema);

        DataFileWriter dataFileWriter = new DataFileWriter(genericDatumWriter);
        dataFileWriter.create(schema, outputStream);

        for (Object record : genericRecords) {
            dataFileWriter.append(record);
        }

        dataFileWriter.close();
        outputStream.close();
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    private void writeSequenceFileFromPTable(final FileSystem fs, final Path path, final PTable table)
            throws IOException {
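        // Each Pair is passed through the PTableType's output MapFns so the
        // SequenceFile receives keys and values in their Writable forms.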
        final PTableType pType = table.getPTableType();
        final Class<?> keyClass = pType.getConverter().getKeyClass();
        final Class<?> valueClass = pType.getConverter().getValueClass();

        final SequenceFile.Writer writer = new SequenceFile.Writer(fs, fs.getConf(), path, keyClass, valueClass);

        for (final Object o : table.materialize()) {
            final Pair<?, ?> p = (Pair) o;
            final Object key = pType.getKeyType().getOutputMapFn().map(p.first());
            final Object value = pType.getValueType().getOutputMapFn().map(p.second());
            writer.append(key, value);
        }

        writer.close();
    }

    private void writeSequenceFileFromPCollection(final FileSystem fs, final Path path,
            final PCollection collection) throws IOException {
        final PType pType = collection.getPType();
        final Converter converter = pType.getConverter();
        final Class valueClass = converter.getValueClass();

        final SequenceFile.Writer writer = new SequenceFile.Writer(fs, fs.getConf(), path, NullWritable.class,
                valueClass);

        for (final Object o : collection.materialize()) {
            final Object value = pType.getOutputMapFn().map(o);
            writer.append(NullWritable.get(), value);
        }

        writer.close();
    }

    @Override
    public PCollection<String> readTextFile(String pathName) {
        return read(At.textFile(pathName));
    }

    @Override
    public <T> void writeTextFile(PCollection<T> collection, String pathName) {
        write(collection, At.textFile(pathName));
    }

    @Override
    public <T> Iterable<T> materialize(PCollection<T> pcollection) {
        return pcollection.materialize();
    }

    @Override
    public PipelineExecution runAsync() {
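        // All work has already happened eagerly as operations were applied,
        // so there is nothing to launch asynchronously.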
        activeTargets.clear();
        return new MemExecution();
    }

    @Override
    public PipelineResult run() {
        try {
            return runAsync().get();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public PipelineResult done() {
        return run();
    }

    @Override
    public void enableDebug() {
        LOG.info("Note: in-memory pipelines do not have debug logging");
    }

    @Override
    public String getName() {
        return "Memory Pipeline";
    }

    private static class MemExecution extends AbstractFuture<PipelineResult> implements PipelineExecution {
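        // Because execution is eager, this "future" is effectively complete at
        // construction: each accessor just completes it with a canned
        // SUCCEEDED result built from the shared counters.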

        private PipelineResult res;

        public MemExecution() {
            this.res = new PipelineResult(
                    ImmutableList.of(new PipelineResult.StageResult("MemPipelineStage", COUNTERS)),
                    PipelineExecution.Status.SUCCEEDED);
        }

        @Override
        public String getPlanDotFile() {
            return "";
        }

        @Override
        public void waitFor(long timeout, TimeUnit timeUnit) throws InterruptedException {
            set(res);
        }

        @Override
        public void waitUntilDone() throws InterruptedException {
            set(res);
        }

        @Override
        public PipelineResult get() throws ExecutionException, InterruptedException {
            set(res);
            return super.get();
        }

        @Override
        public PipelineResult get(long timeout, TimeUnit timeUnit)
                throws InterruptedException, ExecutionException, TimeoutException {
            set(res);
            return super.get(timeout, timeUnit);
        }

        @Override
        public Status getStatus() {
            return isDone() ? Status.SUCCEEDED : Status.READY;
        }

        @Override
        public PipelineResult getResult() {
            return isDone() ? res : null;
        }

        @Override
        public void kill() {
            // No-op
        }
    }
}