it.crs4.seal.common.SealToolParser.java Source code

Java tutorial

Introduction

Here is the source code for it.crs4.seal.common.SealToolParser.java

Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.common;

import it.crs4.seal.common.ClusterUtils;

import java.io.IOException;
import java.io.File;
import java.io.FileReader;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.cli.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

public class SealToolParser {

    public static final File DefaultConfigFile = new File(System.getProperty("user.home"), ".sealrc");
    public static final int DEFAULT_MIN_REDUCE_TASKS = 0;
    public static final int DEFAULT_REDUCE_TASKS_PER_NODE = 3;

    public static final String INPUT_FORMAT_CONF = "seal.input-format";
    public static final String OUTPUT_FORMAT_CONF = "seal.output-format";

    protected static final String INPUT_FORMAT_DESC = "Input format name";
    protected static final String OUTPUT_FORMAT_DESC = "Output format name";

    public static final String INPUT_FORMAT_ENCODING = "seal.input.base-quality-encoding";

    private int minReduceTasks;

    /**
     * Configuration object used to parse the command line, cached for further queries.
     */
    private Configuration myconf;

    protected Options options;
    private Option opt_nReduceTasks;
    private Option opt_configFileOverride;
    private Option opt_inputFormat;
    private Option opt_outputFormat;
    private Option opt_compressOutput;

    private String[] acceptedInputFormats;
    private String[] acceptedOutputFormats;

    private Integer nReduceTasks;
    private int nReduceTasksPerNode;
    private String configSection;
    protected String toolName;

    protected ArrayList<Path> inputs;
    private Path outputDir;

    /**
     * Construct a SealToolParser instance.
     *
     * The instance is set to read the properties in configuration file's section sectionName,
     * in addition to the default section.  Properties set on the command line will override
     * the file's settings.
     *
     * @param configSection Name of section of configuration to load, in addition to DEFAULT.
     * If null, only DEFAULT is loaded
     * @param toolName Name used in the help message
     */
    @SuppressWarnings("static") // for OptionBuilder
    public SealToolParser(String configSection, String toolName) {
        this.toolName = toolName;

        options = new Options(); // empty
        opt_nReduceTasks = OptionBuilder.withDescription("Number of reduce tasks to use.").hasArg()
                .withArgName("INT").withLongOpt("num-reducers").create("r");
        options.addOption(opt_nReduceTasks);

        opt_configFileOverride = OptionBuilder
                .withDescription("Override default Seal config file (" + DefaultConfigFile + ")").hasArg()
                .withArgName("FILE").withLongOpt("seal-config").create("sc");
        options.addOption(opt_configFileOverride);

        opt_inputFormat = OptionBuilder.withDescription(INPUT_FORMAT_DESC).hasArg().withArgName("FORMAT")
                .withLongOpt("input-format").create("if");
        options.addOption(opt_inputFormat);

        opt_outputFormat = OptionBuilder.withDescription(OUTPUT_FORMAT_DESC).hasArg().withArgName("FORMAT")
                .withLongOpt("output-format").create("of");
        options.addOption(opt_outputFormat);

        opt_compressOutput = OptionBuilder
                .withDescription("Compress output files with CODEC (one of gzip, bzip2, snappy, auto)").hasArg()
                .withArgName("CODEC").withLongOpt("compress-output").create("oc");
        options.addOption(opt_compressOutput);

        nReduceTasks = null;
        inputs = new ArrayList<Path>(10);
        outputDir = null;
        this.configSection = (configSection == null) ? "" : configSection;
        minReduceTasks = DEFAULT_MIN_REDUCE_TASKS;
        myconf = null;
        nReduceTasksPerNode = DEFAULT_REDUCE_TASKS_PER_NODE;
    }

    /**
     * Set the minimum acceptable number of reduce tasks.
     * If a user specifies a number lower than this limit parseOptions will raise
     * an error.
     */
    public void setMinReduceTasks(int x) {
        if (x < 0)
            throw new IllegalArgumentException("minimum number of reduce tasks must be >= 0");
        minReduceTasks = x;
    }

    public int getMinReduceTasks() {
        return minReduceTasks;
    }

    protected void loadConfig(Configuration conf, File fname) throws ParseException, IOException {
        ConfigFileParser parser = new ConfigFileParser();

        try {
            parser.load(new FileReader(fname));

            Iterator<ConfigFileParser.KvPair> it = parser.getSectionIterator(configSection);
            ConfigFileParser.KvPair pair;
            while (it.hasNext()) {
                pair = it.next();
                conf.set(pair.getKey(), pair.getValue());
            }
        } catch (FormatException e) {
            throw new ParseException("Error reading config file " + fname + ". " + e);
        }
    }

    /**
     * Decides whether to use an rc file, and if so which one.
     *
     * This method is necessary only because we'd like the user to be able to override the default
     * location of the seal configuration file ($HOME/.sealrc).  So, it scans
     * the command line arguments looking for a user-specified seal configuration file.
     * If one is specified, it verifies that it exists and is readable.  If none is specified
     * it checks to see whether a configuration file is available at the default location,
     * and if it is the method verifies that it is readable.
     *
     * If a config file is found and is readable, its path is returned as a File object.  On the other
     * hand, if a config file isn't found the method returns null.
     *
     * @param args command line arguments
     * @exception ParseException raise if the file specified on the cmd line doesn't exist or isn't readable.
     */
    protected File getRcFile(String[] args) throws ParseException {
        File fname = null;

        String shortOpt = "--" + opt_configFileOverride.getOpt();
        String longOpt = "--" + opt_configFileOverride.getLongOpt();

        for (int i = 0; i < args.length; ++i) {
            if (args[i].equals(shortOpt) || args[i].equals(longOpt)) {
                if (i + 1 >= args.length)
                    throw new ParseException("Missing file argument to " + args[i]);

                fname = new File(args[i + 1]);
                break;
            }
        }

        if (fname != null) // a seal configuration file was specified
        {
            if (!fname.exists())
                throw new ParseException("Configuration file " + fname + " doesn't exist");
            if (!fname.canRead())
                throw new ParseException("Can't read configuration file " + fname);
            // at this point it should be all good.
        } else // none specified.  Try the default
        {
            // presume that if it exists the user intends to use it
            if (DefaultConfigFile.exists()) {
                if (DefaultConfigFile.canRead())
                    fname = DefaultConfigFile;
                else {
                    // The file exists but it can't be read.  Warn the user.
                    LogFactory.getLog(SealToolParser.class)
                            .warn("Seal configuration file " + DefaultConfigFile + " isn't readable");
                    // leave fname as null so no configuration file will be used
                }
            }
        }

        return fname;
    }

    /**
     * Set properties useful for the whole Seal suite.
     */
    protected void setDefaultProperties(Configuration conf) {
        conf.set("mapred.compress.map.output", "true");
    }

    public void parse(Configuration conf, String[] args) throws IOException {
        try {
            parseOptions(conf, args);
        } catch (ParseException e) {
            defaultUsageError(e.getMessage()); // doesn't return
        }
    }

    /**
     * Parses command line.
     *
     * Override this method to implement additional command line options,
     * but do make sure you call this method to parse the default options.
     */
    protected CommandLine parseOptions(Configuration conf, String[] args) throws ParseException, IOException {
        myconf = conf;

        setDefaultProperties(conf);

        // load settings from configuration file
        // first, parse the command line (in getRcFile) looking for an option overriding the default seal configuration file
        File configFile = getRcFile(args);
        if (configFile != null)
            loadConfig(conf, configFile);

        // now parse the entire command line using the default hadoop parser.  Now
        // the user can override properties specified in the config file with properties
        // specified on the command line.
        CommandLine line = new GenericOptionsParser(conf, options, args).getCommandLine();
        if (line == null)
            throw new ParseException("Error parsing command line"); // getCommandLine returns an null if there was a parsing error

        ////////////////////// input/output formats //////////////////////
        // set the configuration property.  Then, we'll check the property
        // to ensure it has a valid value, regardless of whether we just set it,
        // so that the check will also be valid if the property is set directly.
        if (line.hasOption(opt_inputFormat.getOpt()))
            myconf.set(INPUT_FORMAT_CONF, line.getOptionValue(opt_inputFormat.getOpt()));

        validateIOFormat(INPUT_FORMAT_CONF, acceptedInputFormats);

        if (line.hasOption(opt_outputFormat.getOpt()))
            myconf.set(OUTPUT_FORMAT_CONF, line.getOptionValue(opt_outputFormat.getOpt()));

        validateIOFormat(OUTPUT_FORMAT_CONF, acceptedOutputFormats);

        if (conf.get(INPUT_FORMAT_ENCODING) != null) {
            String value = conf.get(INPUT_FORMAT_ENCODING);
            if (value.equals("sanger") || value.equals("illumina"))
                conf.set(org.seqdoop.hadoop_bam.FormatConstants.CONF_INPUT_BASE_QUALITY_ENCODING, value);
            else
                throw new ParseException("Invalid " + INPUT_FORMAT_ENCODING + ". Expected 'sanger' or 'illumina'");
        }

        /////////////////////// output compression /////////////////////
        if (line.hasOption(opt_compressOutput.getOpt())) {
            myconf.setBoolean("mapred.output.compress", true);
            String codec = line.getOptionValue(opt_compressOutput.getOpt());
            if (codec != null) {
                String codecClass = "org.apache.hadoop.io.compress.GzipCodec"; // default
                if ("auto".equalsIgnoreCase(codec) || "gzip".equalsIgnoreCase(codec)) {
                    // pass.  Already set
                } else if ("bzip2".equalsIgnoreCase(codec))
                    codecClass = "org.apache.hadoop.io.compress.BZip2Codec";
                else if ("snappy".equalsIgnoreCase(codec))
                    codecClass = "org.apache.hadoop.io.compress.SnappyCodec";
                else {
                    throw new ParseException("Unknown codec " + codec
                            + ". Valid values are gzip, bzip2, snappy and auto.\n"
                            + "If you want to use an unsupported codec pass 'auto' and set the property mapred.output.compression.codec directly");
                }

                myconf.set("mapred.output.compression.codec", codecClass);
            }
        }

        ////////////////////// number of reducers //////////////////////
        if (line.hasOption(opt_nReduceTasks.getOpt())) {
            String rString = line.getOptionValue(opt_nReduceTasks.getOpt());
            try {
                int r = Integer.parseInt(rString);
                if (r >= minReduceTasks)
                    nReduceTasks = r;
                else
                    throw new ParseException("Number of reducers must be greater than or equal to " + minReduceTasks
                            + " (got " + rString + ")");
            } catch (NumberFormatException e) {
                throw new ParseException("Invalid number of reduce tasks '" + rString + "'");
            }
        }

        ////////////////////// positional arguments //////////////////////
        String[] otherArgs = line.getArgs();
        if (otherArgs.length < 2) // require at least two:  one input and one output
            throw new ParseException("You must provide input and output paths");
        else {
            //
            FileSystem fs;
            for (int i = 0; i < otherArgs.length - 1; ++i) {
                Path p = new Path(otherArgs[i]);
                fs = p.getFileSystem(conf);
                p = p.makeQualified(fs);
                FileStatus[] files = fs.globStatus(p);
                if (files != null && files.length > 0) {
                    for (FileStatus status : files)
                        inputs.add(status.getPath());
                } else
                    throw new ParseException("Input path " + p.toString() + " doesn't exist");
            }
            // now the last one, should be the output path
            outputDir = new Path(otherArgs[otherArgs.length - 1]);
            fs = outputDir.getFileSystem(conf);
            outputDir = outputDir.makeQualified(fs);
            if (fs.exists(outputDir))
                throw new ParseException(
                        "Output path " + outputDir.toString() + " already exists.  Won't overwrite");
        }

        return line;
    }

    /**
     * Set accepted input format names.
     *
     * @names if null, turns off checking of input format name.  Otherwise,
     *        specifying as a parameter to --input-format any name that is
     *        not in the list will generate an error.
     */
    public void setAcceptedInputFormats(String[] names) {
        acceptedInputFormats = names;

        String inputFormatHelp = INPUT_FORMAT_DESC;
        if (names != null)
            inputFormatHelp += " (" + joinStrings(names, ",") + ")";

        opt_inputFormat.setDescription(inputFormatHelp);
    }

    /**
     * Set accepted output format names.
     *
     * @names if null, turns off checking of output format name.  Otherwise,
     *        specifying as a parameter to --output-format any name that is
     *        not in the list will generate an error.
     */
    public void setAcceptedOutputFormats(String[] names) {
        acceptedOutputFormats = names;

        String help = OUTPUT_FORMAT_DESC;
        if (names != null)
            help += "(" + joinStrings(names, ",") + ")";

        opt_outputFormat.setDescription(help);
    }

    /**
     * Return the input format specified, if any.
     *
     * @param defaultName Return this value if the configuration isn't set.
     */
    public String getInputFormatName(String defaultName) {
        return myconf.get(INPUT_FORMAT_CONF, defaultName);
    }

    /**
     * Return the input format specified, if any.
     */
    public String getInputFormatName() {
        return myconf.get(INPUT_FORMAT_CONF);
    }

    /**
     * Return the output format specified, if any.
     *
     * @param defaultName Return this value if the configuration isn't set.
     */
    public String getOutputFormatName(String defaultName) {
        return myconf.get(OUTPUT_FORMAT_CONF, defaultName);
    }

    /**
     * Return the output format specified, if any.
     */
    public String getOutputFormatName() {
        return myconf.get(OUTPUT_FORMAT_CONF);
    }

    /**
     * Get total number of reduce tasks to run.
     * This option parser must have already parsed the command line.
     */
    public int getNReduceTasks() throws java.io.IOException {
        if (myconf == null)
            throw new IllegalStateException("getNReduceTasks called before parsing the command line.");

        if (nReduceTasksPerNode < 0)
            throw new IllegalArgumentException(
                    "Invalid number of default reduce tasks per node: " + nReduceTasksPerNode);

        if (nReduceTasks == null) {
            // Calculate and cache value
            // To calculate, we get the number of tasktrackers available and multiply by
            // nReduceTasksPerNode.  It can happen that a cluster doesn't yet have any
            // task trackers, so we use a lower bound of 1.
            nReduceTasks = Math.max(ClusterUtils.getNumberTaskTrackers(myconf), 1) * nReduceTasksPerNode;
            return nReduceTasks;
        } else
            return nReduceTasks;
    }

    public void setNReduceTasksPerNode(int value) {
        if (value < 0)
            throw new IllegalArgumentException("number of reduce tasks per node must be >= 0 (got " + value + ")");
        nReduceTasksPerNode = value;
        nReduceTasks = null; // reset cached value
    }

    /**
     * Return the specified output path.
     */
    public Path getOutputPath() {
        return outputDir;
    }

    public List<Path> getInputPaths() {
        ArrayList<Path> retval = new ArrayList<Path>(getNumInputPaths());
        for (Path p : inputs)
            retval.add(p);
        return retval;
    }

    public int getNumInputPaths() {
        return inputs.size();
    }

    public void defaultUsageError() {
        defaultUsageError(null);
    }

    /**
     * Prints help and exits with code 3.
     */
    public void defaultUsageError(String msg) {
        System.err.print("Usage error");
        if (msg != null)
            System.err.println(":  " + msg);
        System.err.print("\n");
        // XXX: redirect System.out to System.err since the simple version of
        // HelpFormatter.printHelp prints to System.out, and we're on a way to
        // a fatal exit.
        System.setOut(System.err);
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("" + toolName + " [options] <in>+ <out>", options);
        System.exit(3);
    }

    protected static String joinStrings(String[] strings, String joinString) {
        if (strings.length == 0)
            return "";
        else if (strings.length == 1)
            return strings[0];
        else {
            StringBuilder builder = new StringBuilder();
            builder.append(strings[0]);
            for (int i = 1; i < strings.length; ++i) {
                builder.append(joinString);
                builder.append(strings[i]);
            }

            return builder.toString();
        }
    }

    protected void validateIOFormat(String ioProperty, String[] acceptedFormats) throws ParseException {
        String selectedFormat = myconf.get(ioProperty);

        if (acceptedFormats != null && selectedFormat != null) {
            for (int i = 0; i < acceptedFormats.length; ++i)
                if (acceptedFormats[i].equals(selectedFormat))
                    return;
            throw new ParseException("Incompatible file format selected. " + ioProperty + " is " + selectedFormat
                    + " but acceptable values are " + joinStrings(acceptedFormats, ", "));
        }
    }
}