com.philiphubbard.sabe.MRAssembler.java Source code

Introduction

Here is the source code for com.philiphubbard.sabe.MRAssembler.java
Source

// Copyright (c) 2014 Philip M. Hubbard
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// 
// http://opensource.org/licenses/MIT

package com.philiphubbard.sabe;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;

import com.philiphubbard.digraph.BasicDigraph;
import com.philiphubbard.digraph.Digraph;
import com.philiphubbard.digraph.EulerPaths;
import com.philiphubbard.digraph.MRBuildVertices;
import com.philiphubbard.digraph.MRCompressChains;
import com.philiphubbard.digraph.MRVertex;

// A class to assemble genomic sequences from a list of "reads" using
// MapReduce (MR) algorithms implemented in Hadoop to improve performance.
//
// The algorithm uses one MapReduce pass to break each read string into
// substrings of length k-1.  These (k-1)-mers are the vertices of a
// De Bruijn graph, and the edges correspond to the k-mers that are the
// overlap of the adjacent substrings of length k-1.  The graph is then
// reduced in size using a sequence of MapReduce passes that compress 
// linear chains of edges, which are expected to be very long in practice.  
// The final sequence is assembled from the reduced graph by a sequential
// algorithm to find the Euler tour, the tour that visits each edge exactly
// once.  (Hence the name of the package, "Sequence Assembly By Euler tours.")
//
// The implementation includes basic techniques to handle errors in the
// initial reads and repeated substrings or "repeates" in the sequence.  Both 
// techniques depend on the initial reads containing duplicate coverage for 
// the entire sequence.  Duplicate coverage causes the graph to contain edge 
// multiples.  Error handling involves discarding vertices with fewer edges 
// than would be expected given the coverage.  Repeat handling involves making 
// two extra sequential passes over the reduced graph to identify chains whose 
// edge multiples indicate they must be repeats given the coverage.

public class MRAssembler {

    // Construct the assembler.  The vertexMerLength is the k-1 in the 
    // (k-1)-mers that define the graph vertices.  The coverage is the number
    // of times each part of the final sequence will be covered by the union
    // of all the reads; coverage is assumed to be constant and uniform over
    // the sequence.

    public MRAssembler(int vertexMerLength, int coverage) {
        this.vertexMerLength = vertexMerLength;
        this.coverage = coverage;
    }

    // Run the MapReduce passes and sequential algorithms that perform the 
    // sequence assembly.  The inputPath is a directory, all of whose files
    // contain read strings, one read per line (ending with "\n" character).
    // The outputPath is a directory in which a file with the final assembled
    // sequence will be created.  A temporary directory named "sabe.MRAssemblerTmp"
    // will be created in the current working directory to hold intermediate
    // results from the MapReduce passes.

    public boolean run(Path inputPath, Path outputPath)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();

        // Job.getInstance() copies the Configuration argument, so set its properties first.

        conf.setBoolean(MRVertex.CONFIG_ALLOW_EDGE_MULTIPLES, true);
        conf.setBoolean(MRVertex.CONFIG_COMPRESS_CHAIN_MULTIPLES_MUST_MATCH, false);
        conf.setInt(MRMerVertex.CONFIG_MER_LENGTH, vertexMerLength);
        conf.setBoolean(MRBuildVertices.CONFIG_PARTITION_BRANCHES_CHAINS, true);
        conf.setInt(MRBuildVertices.CONFIG_COVERAGE, coverage);
        conf.setInt(MRCompressChains.CONFIG_TERMINATION_COUNT, 1);

        Job buildJob = Job.getInstance(conf);
        buildJob.setJobName("mrassemblerbuild");

        Path buildInputPath = inputPath;
        Path buildOutputPath = new Path("sabe.MRAssemblerTmp");

        System.out.println("sabe.MRAssembler starting vertex construction");

        MRBuildMerVertices.setupJob(buildJob, buildInputPath, buildOutputPath);

        if (!buildJob.waitForCompletion(true))
            return false;

        //

        Path compressInputPath = new Path(buildOutputPath.toString() + "/chain");
        Path compressOutputPath = new Path(buildOutputPath.toString() + "/chainCompress");

        int iter = 0;
        boolean keepGoing = true;
        MRCompressChains.beginIteration();
        while (keepGoing) {
            Job compressJob = Job.getInstance(conf);
            compressJob.setJobName("mrassemblercompress");

            System.out.println("sabe.MRAssembler starting compression iteration " + iter);

            MRCompressMerChains.setupIterationJob(compressJob, compressInputPath, compressOutputPath);

            if (!compressJob.waitForCompletion(true))
                System.exit(1);

            iter++;
            keepGoing = MRCompressChains.continueIteration(compressJob, compressInputPath, compressOutputPath);
        }

        System.out.println("sabe.MRAssembler made " + iter + " compression iterations");

        //

        Path branchPath = new Path(buildOutputPath.toString() + "/branch");
        Path chainPath = compressOutputPath;

        FileSystem fileSystem = FileSystem.get(conf);

        Graph graph = buildCompressedGraph(conf, fileSystem, branchPath, chainPath);
        if (graph != null) {
            ArrayList<String> result = graph.assemble();

            FSDataOutputStream out = fileSystem.create(outputPath);
            for (String seq : result) {
                out.writeBytes(seq);
                out.writeBytes("\n");
            }
        }

        //

        fileSystem.delete(buildOutputPath, true);

        fileSystem.close();

        return true;
    }

    // Build the graph after compressing chains.

    protected Graph buildCompressedGraph(Configuration conf, FileSystem fileSystem, Path branchPath, Path chainPath)
            throws IOException, InterruptedException {
        System.out.println("sabe.MRAssembler starting graph construction");

        ArrayList<MRMerVertex> vertices = new ArrayList<MRMerVertex>();

        FileStatus[] branchFiles = fileSystem.listStatus(branchPath);
        for (FileStatus status : branchFiles)
            readVertices(status, vertices, conf);

        FileStatus[] chainFiles = fileSystem.listStatus(chainPath);
        for (FileStatus status : chainFiles)
            readVertices(status, vertices, conf);

        // Check for a malformed graph, that does not have exactly one source and sink.
        // Return null in this case.

        int numSources = 0;
        int numSinks = 0;
        for (MRMerVertex vertex : vertices) {
            if (vertex.getIsSource())
                numSources++;
            if (vertex.getIsSink())
                numSinks++;
        }
        if ((numSources != 1) || (numSinks != 1)) {
            System.out.println(
                    "Malformed graph: number of sources = " + numSources + ", number of sinks = " + numSinks);
            return null;
        }

        return new Graph(vertices);
    }

    // Read values from the specified FileStatus, create MRMerVertex instances from the values
    // and place them in the ArrayList.

    private void readVertices(FileStatus status, ArrayList<MRMerVertex> vertices, Configuration conf)
            throws IOException {
        Path path = status.getPath();
        if (path.getName().startsWith("part")) {
            SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
            IntWritable key = new IntWritable();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value))
                vertices.add(new MRMerVertex(value, conf));
            reader.close();
        }
    }

    // A directed graph built from MRMerVertex instances.  The underlying representation
    // is a digraph.BasicDigraph.

    protected class Graph {

        // Construct the graph from the vertices.

        public Graph(ArrayList<MRMerVertex> vertices) {

            // The MRMerVertices have a wide range of IDs, since each ID is an encoding
            // of a (k-1)-mer.  For the digraph.BasicDigraph, vertices will have IDs
            // that are consecutive integers starting at 0.  These arrays map from
            // those integers to the Mers and MerStrings (resulting from chain compression)
            // for the vertices.

            mers = new int[vertices.size()];
            merStrings = new MerString[vertices.size()];

            // The vertices need to have their edges remapped, too.  This HashMap maps
            // from (k-1)-mer IDs to new digraph.BasicDigraph IDs to facilitate the
            // new edges.

            HashMap<Integer, Integer> merToIndex = new HashMap<Integer, Integer>(vertices.size());

            // The Repeats class needs an array indicating what vertices are branches 
            // (so it can avoid computing that status itself).

            boolean[] isBranch = new boolean[vertices.size()];

            MRMerVertex source = null;
            MRMerVertex sink = null;

            int i = 1;
            for (MRMerVertex vertex : vertices) {
                if (vertex.getIsSource()) {

                    if (source == null)
                        source = vertex;
                }

                if (vertex.getIsSink()) {

                    if (sink == null)
                        sink = vertex;
                }

                // The digraph.EulerPaths class expects the source to have index 0.
                int j = (vertex == source) ? 0 : i++;

                mers[j] = vertex.getId();
                merStrings[j] = vertex.getMerString();
                merToIndex.put(mers[j], j);
                isBranch[j] = vertex.getIsBranch();
            }

            // Create the graph of the vertices with the new IDs.

            graph = new BasicDigraph(vertices.size(), Digraph.EdgeMultiples.ENABLED);

            for (MRMerVertex vertex : vertices) {
                int j = merToIndex.get(vertex.getId());

                // Add the edges with the new IDs.

                MRVertex.AdjacencyIterator it = vertex.createToAdjacencyIterator();
                for (int to = it.begin(); !it.done(); to = it.next()) {
                    BasicDigraph.Edge edge = new BasicDigraph.Edge(merToIndex.get(to).intValue());
                    graph.addEdge(j, edge);
                }
            }

            System.out.println("sabe.MRAssembler starting rectification of repeats");

            Repeats.rectifySingle(graph, coverage, isBranch);

            // There must be an edge from the sink to the source for
            // digraph.EulerPaths to work correctly.

            addedSinkSourceEdge = false;
            if ((source != null) && (sink != null)) {
                int from = merToIndex.get(sink.getId()).intValue();
                int to = merToIndex.get(source.getId()).intValue();
                graph.addEdge(from, new BasicDigraph.Edge(to));
                addedSinkSourceEdge = true;
            }
        }

        // Assemble the final sequence and return it as a String.  There ought to be
        // jus one String, but the return value allows for the possibility of more
        // than one, just in case.

        public ArrayList<String> assemble() {
            System.out.println("sabe.MRAssembler starting final assembly");

            ArrayList<String> result = new ArrayList<String>();

            EulerPaths<BasicDigraph.Edge> euler = new EulerPaths<BasicDigraph.Edge>(graph);
            ArrayList<ArrayDeque<Integer>> paths = euler.getPaths();

            for (ArrayDeque<Integer> path : paths) {
                if (addedSinkSourceEdge)
                    path.pollLast();

                String seq = new String();
                for (int i : path) {
                    if (seq.length() == 0) {
                        if (merStrings[i] != null)
                            seq = merStrings[i].toDisplayString();
                        else
                            seq = Mer.fromInt(mers[i], vertexMerLength);
                    } else {
                        if (merStrings[i] != null)
                            seq += merStrings[i].toDisplayString().substring(vertexMerLength - 1);
                        else
                            seq += Mer.fromInt(mers[i], 1);
                    }
                }

                result.add(seq);
            }

            return result;
        }

        private int[] mers;
        private MerString[] merStrings;
        private BasicDigraph graph;
        private boolean addedSinkSourceEdge;
    }

    private int vertexMerLength;
    private int coverage;

}