it.crs4.seal.read_sort.MergeAlignments.java Source code

Java tutorial

Introduction

Here is the source code for it.crs4.seal.read_sort.MergeAlignments.java

Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.read_sort;

import it.crs4.seal.common.BwaRefAnnotation;
import it.crs4.seal.common.SealToolRunner;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Tool;

public class MergeAlignments extends Configured implements Tool {
    private String userInput;
    private String userOutput;
    private String userReferenceRoot;
    private String userAnnotation;
    private String sortOrder = "coordinate";
    private String genomeAssemblyId;

    private BwaRefAnnotation refAnnotation;

    private Path referenceRootPath;
    private Path annotationPath;
    private Path[] inputPaths;
    private Path outputPath;

    private boolean generatedMd5 = false;
    private boolean headerOnly = false;
    private FastaChecksummer checksums;

    private Map<String, String> readGroupFields;

    private static final Log log = LogFactory.getLog(MergeAlignments.class); // log must not go to standard output.

    private Path getQualifiedPath(String simplePath) throws IOException {
        Path path = new Path(simplePath);
        return path.makeQualified(path.getFileSystem(getConf()));
    }

    @SuppressWarnings("static") // for OptionBuilder
    private Map<String, Option> defineRGOptions() {
        Map<String, Option> readGroupOptions = new HashMap<String, Option>();

        readGroupOptions.put("ID", OptionBuilder.withDescription("Read group id").hasArg().withArgName("ID")
                .withLongOpt("rg-id").create("rgid"));

        readGroupOptions.put("SM", OptionBuilder.withDescription("Read group sample").hasArg().withArgName("sample")
                .withLongOpt("rg-sm").create("rgsm"));

        readGroupOptions.put("LB", OptionBuilder.withDescription("Read group library").hasArg()
                .withArgName("library").withLongOpt("rg-lb").create("rglb"));

        readGroupOptions.put("PU", OptionBuilder.withDescription("Read group platform unit").hasArg()
                .withArgName("pu").withLongOpt("rg-pu").create("rgpu"));

        readGroupOptions.put("CN", OptionBuilder.withDescription("Read group center").hasArg().withArgName("center")
                .withLongOpt("rg-cn").create("rgcn"));

        readGroupOptions.put("DT", OptionBuilder.withDescription("Read group date").hasArg().withArgName("date")
                .withLongOpt("rg-dt").create("rgdt"));

        readGroupOptions.put("PL", OptionBuilder.withDescription("Read group platform").hasArg()
                .withArgName("platform").withLongOpt("rg-pl").create("rgpl"));

        return readGroupOptions;
    }

    private Map<String, String> parseReadGroupOptions(Map<String, Option> readGroupOptions, CommandLine args)
            throws ParseException {
        HashMap<String, String> fields = new HashMap<String, String>();

        for (Map.Entry<String, Option> pair : readGroupOptions.entrySet()) {
            String fieldName = pair.getKey();
            Option opt = pair.getValue();
            if (args.hasOption(opt.getOpt())) {
                fields.put(fieldName, args.getOptionValue(opt.getOpt()));
            }
        }

        if (!fields.isEmpty()) {
            if (!fields.containsKey("ID") || !fields.containsKey("SM"))
                throw new ParseException(
                        "If you specify read group tags (RG) you must specify at least id and sample");
        }
        return fields;
    }

    /**
     * Scan command line and set configuration values appropriately.
     * Calls System.exit in case of a command line error.
     */
    @SuppressWarnings("static") // for OptionBuilder
    private void scanOptions(String[] args) {
        Options options = new Options();

        Option ref = OptionBuilder.withDescription("root path to the reference used to create the SAM data")
                .hasArg().withArgName("REF_PATH").withLongOpt("reference").create("ref");
        options.addOption(ref);

        Option ann = OptionBuilder.withDescription(
                "annotation file (.ann) of the BWA reference used to create the SAM data (not required if you specify "
                        + ref.getOpt() + ")")
                .hasArg().withArgName("ref.ann").withLongOpt("annotations").create("ann");
        options.addOption(ann);

        Option md5 = OptionBuilder.withDescription("generated MD5 checksums for reference contigs")
                .withLongOpt("md5").create("md5");
        options.addOption(md5);

        Option optSortOrder = OptionBuilder.withDescription(
                "Sort order.  Can be one of: unknown, unsorted, queryname, coordinate.  Default:  coordinate")
                .hasArg().withArgName("sort order").withLongOpt("sort-order").create("so");
        options.addOption(optSortOrder);

        Option as = OptionBuilder.withDescription("Genome assembly identifier (@SQ AS:xxxx tag)").hasArg()
                .withArgName("ASSEMBLY_ID").withLongOpt("sq-assembly").create("sqas");
        options.addOption(as);

        Option optHeaderOnly = OptionBuilder.withDescription("Only output the SAM header, then exit.")
                .withLongOpt("header-only").create("ho");
        options.addOption(optHeaderOnly);

        // read group options
        Map<String, Option> readGroupOptions = defineRGOptions();
        for (Option opt : readGroupOptions.values())
            options.addOption(opt);

        CommandLineParser parser = new GnuParser();

        try {
            CommandLine line = parser.parse(options, args);

            if (line.hasOption(ref.getOpt()))
                userReferenceRoot = line.getOptionValue(ref.getOpt());

            if (line.hasOption(md5.getOpt()))
                generatedMd5 = true;

            if (line.hasOption(ann.getOpt()))
                userAnnotation = line.getOptionValue(ann.getOpt());

            if (line.hasOption(as.getOpt())) // TODO: validate this input
                genomeAssemblyId = line.getOptionValue(as.getOpt());

            if (line.hasOption(optSortOrder.getOpt())) {
                String value = line.getOptionValue(optSortOrder.getOpt());
                if ("unknown".equals(value) || "unsorted".equals(value) || "queryname".equals(value)
                        || "coordinate".equals(value))
                    sortOrder = value;
                else
                    throw new ParseException(
                            "Invalid sort order.  Sort order must be one of: unknown, unsorted, queryname, coordinate.");
            }

            if (line.hasOption(optHeaderOnly.getOpt()))
                headerOnly = true;

            // remaining args
            String[] otherArgs = line.getArgs();

            if (headerOnly) {
                // We're only generating the header, so we don't expect any input reads.
                if (otherArgs.length > 1)
                    throw new ParseException(
                            "You can't provide an input path with --header-only. Provide an output path or let the output go to stdout.");

                if (otherArgs.length == 0)
                    userOutput = null;
                else
                    userOutput = otherArgs[0];
            } else // not headerOnly
            {
                if (otherArgs.length <= 0 || otherArgs.length > 2)
                    throw new ParseException(
                            "You must provide an HDFS input path and, optionally, an output path.");

                userOutput = null;
                userInput = otherArgs[0];

                if (otherArgs.length >= 2)
                    userOutput = otherArgs[1];
            }

            readGroupFields = parseReadGroupOptions(readGroupOptions, line);

            // option validation
            if (generatedMd5 && userReferenceRoot == null)
                throw new ParseException(
                        "You must specify the path the reference if you want to generate MD5 checksums");
            if (userReferenceRoot == null && userAnnotation == null)
                throw new ParseException(
                        "You must provide the path to the reference or at least its annotation file (<ref>.ann)");
        } catch (ParseException e) {
            System.err.println("Usage error: " + e.getMessage());
            // XXX: redirect System.out to System.err since the simple version of
            // HelpFormatter.printHelp prints to System.out, and we're on a way to
            // a fatal exit.
            System.setOut(System.err);
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("MergeAlignments [options] -ann <ref>.ann [<in>] [<out>]", options);
            System.exit(1);
        }
    }

    private void configureMerge() throws ParseException, IOException {
        // convert user-provided paths to Path objects and make them qualified
        if (userAnnotation != null)
            annotationPath = getQualifiedPath(userAnnotation);

        if (userReferenceRoot != null) {
            referenceRootPath = getQualifiedPath(userReferenceRoot);

            if (annotationPath == null)
                annotationPath = referenceRootPath.suffix(".ann");
            else {
                if (!annotationPath.equals(referenceRootPath.suffix(".ann")))
                    log.warn(
                            "specified annotation file does not follow the same naming pattern as the specified reference");
            }
        }

        // here we load the reference annotations
        loadAnnotations();
        log.info("Reference annotations read");
    }

    private void calculateChecksums() throws IOException {
        if (generatedMd5) {
            log.info("Calculating reference checksum...");
            log.info("Reference fasta path: " + referenceRootPath.toString());

            checksums = new FastaChecksummer();
            FileReader reader = new FileReader(new File(referenceRootPath.toUri()));
            try {
                checksums.setInput(reader);
                checksums.calculate();
            } finally {
                reader.close();
            }

            log.info("checksum complete");
        }
    }

    private void loadAnnotations() throws IOException {
        FileSystem fs = annotationPath.getFileSystem(getConf());
        log.info("Reading reference annotations from " + annotationPath);
        try {
            FSDataInputStream in = fs.open(annotationPath);
            refAnnotation = new BwaRefAnnotation(new InputStreamReader(in));
        } catch (IOException e) {
            log.fatal("Can't read annotation file " + annotationPath + " on filesystem " + fs.getUri());
            throw e;
        }
    }

    private static class SourcePathFilter implements org.apache.hadoop.fs.PathFilter {
        public boolean accept(Path p) {
            boolean decision = true;
            if (p.getName().toString().startsWith("_"))
                decision = false;

            return decision;
        }
    }

    private Path[] getSourcePaths() throws Exception {
        Path srcPath = new Path(userInput);
        FileSystem srcFs = srcPath.getFileSystem(getConf());
        if (srcFs.exists(srcPath)) {
            FileStatus stat = srcFs.getFileStatus(srcPath);
            if (stat.isDir()) {
                String msg = "source path " + srcPath + " is a directory.  Globbing with ";
                srcPath = new Path(srcPath, "*");
                log.info(msg + srcPath);
            }
        }

        // Glob source path.  The returned paths are already sorted.  We filter out paths starting
        // with '_' (see SourcePathFilter).
        // If the path doesn't contain a glob patter, and it doesn't exist, the function will return null.
        Path[] sources = FileUtil.stat2Paths(srcFs.globStatus(srcPath, new SourcePathFilter()));
        if (sources == null)
            throw new IllegalArgumentException("Source path " + srcPath.makeQualified(srcFs) + " doesn't exist");

        if (log.isDebugEnabled()) {
            log.debug("Sources:");
            for (int i = 0; i < sources.length; ++i)
                log.debug(sources[i]);
        }

        if (sources.length == 0)
            throw new IllegalArgumentException("no source files selected");

        log.info("Merging " + sources.length + " files.");

        return sources;
    }

    private void writeSamHeader(OutputStream rawOut) throws IOException {
        Writer out = new BufferedWriter(new OutputStreamWriter(rawOut));
        out.write("@HD\tVN:1.0\tSO:" + sortOrder + "\n");

        // prep @SQ format string
        StringBuilder formatBuilder = new StringBuilder(50);
        formatBuilder.append("@SQ\tSN:%s\tLN:%d");

        if (genomeAssemblyId != null)
            formatBuilder.append("\tAS:").append(genomeAssemblyId);
        if (referenceRootPath != null)
            formatBuilder.append("\tUR:").append(referenceRootPath.toUri());
        if (generatedMd5)
            formatBuilder.append("\tM5:%s");

        formatBuilder.append("\n");

        String format = formatBuilder.toString();
        if (generatedMd5) {
            for (BwaRefAnnotation.Contig c : refAnnotation)
                out.write(String.format(format, c.getName(), c.getLength(), checksums.getChecksum(c.getName())));
        } else {
            for (BwaRefAnnotation.Contig c : refAnnotation)
                out.write(String.format(format, c.getName(), c.getLength()));
        }

        // @PG:  Seal name and version
        String version = "not available";
        Package pkg = this.getClass().getPackage();
        if (pkg != null && pkg.getImplementationVersion() != null)
            version = pkg.getImplementationVersion();
        else
            log.warn("Could not get package version");

        out.write("@PG\tID:seal\tVN:" + version + "\n");

        // @RG, if provided
        if (!readGroupFields.isEmpty()) {
            StringBuilder rgBuilder = new StringBuilder("@RG");
            for (Map.Entry<String, String> pair : readGroupFields.entrySet())
                rgBuilder.append("\t").append(pair.getKey()).append(":").append(pair.getValue());
            rgBuilder.append("\n");

            out.write(rgBuilder.toString());
        }

        out.flush();
    }

    private void copyMerge(Path[] sources, OutputStream out) throws IOException {
        Configuration conf = getConf();

        for (int i = 0; i < sources.length; ++i) {
            FileSystem fs = sources[i].getFileSystem(conf);
            InputStream in = fs.open(sources[i]);
            try {
                IOUtils.copyBytes(in, out, conf, false);
            } finally {
                in.close();
            }
        }
    }

    /**
     * Make an OutputStream to write to the user selected userOutput.
     */
    private OutputStream getOutputStream() throws IOException {
        OutputStream out;
        if (userOutput == null) // Write to stdout
        {
            log.info("writing to standard out");
            out = System.out;
        } else {
            Path destPath = getQualifiedPath(userOutput);
            FileSystem destFs = destPath.getFileSystem(getConf());
            if (destFs.exists(destPath))
                throw new RuntimeException(
                        "Output destination " + destPath + " exists.  Please remove it or change the output path.");

            log.info("Merging sources to " + destPath);
            out = destFs.create(destPath);
        }
        return out;
    }

    public int run(String[] args) throws Exception {
        scanOptions(args);

        // ensure that annotation options make sense and load the reference annotations
        configureMerge();

        // Get the output stream.  This make create a new output file.
        // Do this before calculateChecksums so that any errors wrt to the output
        // path are found before spending minutes checksumming the reference.
        OutputStream destFile = getOutputStream();
        try {
            Path[] sources = null;
            if (!headerOnly) {
                // Get input paths.  Check for paths that don't exist.
                // As for getting the output stream, we verify the inputs now to fail early
                // in case of an invocation error.
                sources = getSourcePaths();
            }

            // calculate the reference checksums, if necessary
            calculateChecksums();
            writeSamHeader(destFile);
            if (!headerOnly)
                copyMerge(sources, destFile);
        } finally {
            destFile.close();
        }
        log.info("Finished");

        return 0;
    }

    public static void main(String[] args) {
        int res = 0;
        try {
            res = new SealToolRunner().run(new MergeAlignments(), args);
        } catch (Exception e) {
            System.err.println("Error executing MergeAlignments: " + e.getMessage());
            res = 1;
        }
        System.exit(res);
    }
}