com.cloudera.seismic.crunch.SUPipeline.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.seismic.crunch.SUPipeline.java

Source

/**
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.seismic.crunch;

import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.crunch.PCollection;
import org.apache.crunch.PGroupedTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.TupleN;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.From;
import org.apache.crunch.io.To;
import org.apache.crunch.lib.PTables;
import org.apache.crunch.types.PTypeFamily;
import org.apache.crunch.types.writable.Writables;
import com.cloudera.seismic.su.SUProcess;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;

public class SUPipeline extends Configured implements Tool {

    private static final Set<String> X_COMMANDS = ImmutableSet.of("suxcontour", "suxgraph", "suximage", "suxmax",
            "suxmovie", "suxpicker", "suxwigb", "xcontour", "ximage", "xpicker", "xwigb");

    public PCollection<ByteBuffer> constructPipeline(PCollection<ByteBuffer> input, String cwproot,
            List<String> steps) {
        PTypeFamily ptf = input.getTypeFamily();
        PGroupedTable<TupleN, ByteBuffer> sorted = null;
        for (String step : steps) {
            String[] pieces = step.split("\\s+");
            if ("susort".equals(pieces[0])) {
                if (sorted != null) {
                    throw new IllegalArgumentException("Cannot have susort followed by susort");
                } else {
                    List<String> keys = Lists.newArrayList();
                    for (int i = 1; i < pieces.length; i++) {
                        if (!pieces[i].isEmpty()) {
                            keys.add(pieces[i]);
                        }
                    }
                    if (keys.isEmpty()) {
                        throw new IllegalArgumentException("susort must have at least one key");
                    }
                    sorted = SUSort.apply(input, keys);
                }
            } else {
                SUProcess proc = new SUProcess(cwproot, pieces[0]);
                for (int i = 1; i < pieces.length; i++) {
                    proc.addArg(pieces[i]);
                }
                if (sorted == null) {
                    input = input.parallelDo(pieces[0], new SUDoFn(proc), ptf.bytes());
                } else {
                    input = sorted.parallelDo(pieces[0], new SUPostGroupFn(proc), ptf.bytes());
                    sorted = null;
                }
            }
        }
        if (sorted != null) {
            input = PTables.values(sorted.ungroup());
        }
        return input;
    }

    public int run(String[] args) throws Exception {
        Options options = new Options();
        options.addOption("cwproot", true, "The path to CWPROOT on the cluster machines");
        options.addOption("input", true, "SU files in Hadoop");
        options.addOption("output", true, "The path of the SU files to write out to Hadoop");
        options.addOption("command", true, "A pipeline of SU commands to run on the data");

        // Parse the commandline and check for required arguments.
        CommandLine cmdLine = new PosixParser().parse(options, args, false);
        if (!cmdLine.hasOption("input") || !cmdLine.hasOption("command")) {
            System.out.println("Mising required input/command arguments");
            new HelpFormatter().printHelp("SUPipeline", options);
            System.exit(1);
        }

        String clusterCwproot = null;
        if (cmdLine.hasOption("cwproot")) {
            clusterCwproot = cmdLine.getOptionValue("cwproot");
        }
        if (clusterCwproot == null || clusterCwproot.isEmpty()) {
            System.out.println("Could not determine cluster's CWPROOT value");
            new HelpFormatter().printHelp("SUPipeline", options);
            System.exit(1);
        }

        Pipeline pipeline = new MRPipeline(SUPipeline.class);
        PCollection<ByteBuffer> traces = pipeline
                .read(From.sequenceFile(cmdLine.getOptionValue("input"), Writables.bytes()));
        Pair<List<String>, String> cmd = parse(cmdLine.getOptionValue("command"));
        PCollection<ByteBuffer> result = constructPipeline(traces, clusterCwproot, cmd.first());

        if (cmdLine.hasOption("output")) {
            result.write(To.sequenceFile(cmdLine.getOptionValue("output")));
        }

        if (cmd.second() != null) {
            String localCwproot = System.getenv("CWPROOT");
            if (localCwproot == null) {
                System.out.println("To use local SU commands, the CWPROOT environment variable must be set");
                System.exit(1);
            }
            String[] pieces = cmd.second().split("\\s+");
            SUProcess x = new SUProcess(localCwproot, pieces[0]);
            for (int i = 1; i < pieces.length; i++) {
                x.addArg(pieces[i]);
            }
            x.addEnvironment(ImmutableMap.of("DISPLAY", System.getenv("DISPLAY")));
            Iterator<ByteBuffer> iter = result.materialize().iterator();
            x.start();
            while (iter.hasNext()) {
                ByteBuffer bb = iter.next();
                x.write(bb.array(), bb.arrayOffset(), bb.limit());
            }
            x.closeAndWait();
        }

        if (!cmdLine.hasOption("output") && cmd.second() == null) {
            System.out.println("No output destination specified");
            System.exit(1);
        }

        pipeline.done();
        return 0;
    }

    private Pair<List<String>, String> parse(String command) {
        List<String> hCmds = Lists.newArrayList();
        String xCmd = null;
        for (String arg : command.toLowerCase().split("\\|\\s+")) {
            if (X_COMMANDS.contains(arg.split("\\s+")[0])) {
                xCmd = arg;
                break;
            } else {
                hCmds.add(arg);
            }
        }
        return Pair.of(hCmds, xCmd);
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new SUPipeline(), args);
    }
}