// Source listing: it.crs4.seal.usort.USort.java

/*
 * Copyright (C) 2011-2012 CRS4.
 *
 * This file is part of Seal.
 *
 * Seal is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * Seal is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Seal.  If not, see <http://www.gnu.org/licenses/>.
 */
package it.crs4.seal.usort;

import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import it.crs4.seal.common.FormatNameMap;
import it.crs4.seal.common.SealToolParser;
import it.crs4.seal.common.SealToolRunner;
import it.crs4.seal.common.SequenceId;
import it.crs4.seal.demux.Demux;

import fi.tkk.ics.hadoop.bam.SequencedFragment;

public class USort extends Configured implements Tool {
    private static final Log LOG = LogFactory.getLog(USort.class);

    public static final int NUM_REDUCE_TASKS = USortPartitioner.EXPECTED_NUM_PARTITIONS;

    /**
     * Partitioner that sends every read from a given (lane, tile, read number)
     * combination to its own dedicated reducer.
     *
     * In an Illumina flowcell we currently have 768 tiles split over 8 lanes.
     * Though the actual number of these is specified in the xml files generated by
     * the sequencer, they should be pretty constant so, given the nature of this
     * tool, we hard-code them.
     *
     * We create 768 x 2 reduce tasks -- i.e., one per tile per read.
     *
     * Reads 1 are sent to odd partitions; reads 2 to even partitions.
     * Specifically, for a tile whose even slot is x, read 2 goes to x and
     * read 1 goes to x + 1.
     *
     * Tiles are numbered in blocks of 16:
     *   block 1: 1101, 1102, ..., 1116
     *   block 2: 1201, 1202, ..., 1216
     *   block 3: 1301, 1302, ..., 1316
     *   block 4: 2101, 2102, ..., 2116
     *   block 5: 2201, 2202, ..., 2216
     *   block 6: 2301, 2302, ..., 2316
     *
     * Lanes range 1-8.  Each lane contains 16*6 = 96 tiles, and therefore
     * occupies 2*96 = 192 consecutive partitions (two per tile).
     *
     * Within each tile there are many read 1 and read 2 sequences.
     */
    public static class USortPartitioner extends Partitioner<SequenceId, SequencedFragment> {
        public static final int EXPECTED_NUM_PARTITIONS = 768 * 2;

        protected static final int NUM_LANES = 8;
        protected static final int BLOCKS_PER_LANE = 6;
        protected static final int TILES_PER_BLOCK = 16;
        protected static final int TILES_PER_LANE = BLOCKS_PER_LANE * TILES_PER_BLOCK;

        // Each lane spans two partitions per tile (read 1 and read 2), so
        // consecutive lanes are 2 * TILES_PER_LANE partitions apart.
        protected static final int PARTITIONS_PER_LANE = 2 * TILES_PER_LANE;

        // flow cell composition
        protected static final int NUM_SURFACES = 2;
        protected static final int NUM_SWATHS = 3; // per surface
        protected static final int NUM_TILES = 16; // per swath

        // Maps a tile id (e.g. 1101) to its even partition offset within a lane.
        protected static HashMap<Integer, Integer> Lookup;
        static {
            // Fill the look-up table that maps a tile id to a partition number.  We
            // multiply the partition number by 2, thus leaving a gap between
            // consecutive tiles.  We'll stick the odd reads in those gaps; i.e.,
            //
            //   tile 1101 -> element 0
            //   read2 from tile 1101 will go to partition 0
            //   read1 from tile 1101 will go to partition 0+1
            Lookup = new HashMap<Integer, Integer>(EXPECTED_NUM_PARTITIONS);

            int elementNumber = 0;
            for (int surface = 1; surface <= NUM_SURFACES; ++surface) {
                for (int swath = 1; swath <= NUM_SWATHS; ++swath) {
                    for (int tile = 1; tile <= NUM_TILES; ++tile) {
                        Lookup.put(surface * 1000 + swath * 100 + tile, 2 * elementNumber);
                        elementNumber += 1;
                    }
                }
            }
        }

        /**
         * Compute the partition for a read.
         *
         * @param key the map output key (unused; partitioning is by read metadata)
         * @param read the fragment whose lane, tile and read number determine the partition
         * @param numPartitions must be exactly EXPECTED_NUM_PARTITIONS
         * @return a partition index in [0, EXPECTED_NUM_PARTITIONS)
         * @throws RuntimeException if the job is configured with the wrong number of
         *         reduce tasks or the read carries an invalid lane, tile or read number
         */
        @Override
        public int getPartition(SequenceId key, SequencedFragment read, int numPartitions) {
            if (numPartitions != EXPECTED_NUM_PARTITIONS)
                throw new RuntimeException(
                        "Expecting " + EXPECTED_NUM_PARTITIONS + " reduce tasks but we have " + numPartitions);

            Integer lane = read.getLane();
            Integer tile = read.getTile();
            Integer read_num = read.getRead();

            if (lane == null || lane < 1 || lane > NUM_LANES)
                throw new RuntimeException("Invalid lane number '" + lane + "' in partitioner!");
            // Look the tile up before validating: an unknown tile id (not just a
            // null one) must be rejected, or Lookup.get would NPE on unboxing.
            Integer tileOffset = (tile == null) ? null : Lookup.get(tile);
            if (tileOffset == null)
                throw new RuntimeException("Invalid tile number '" + tile + "' in partitioner!");
            if (read_num == null || read_num < 1 || read_num > 2)
                throw new RuntimeException("Invalid read number '" + read_num + "' in partitioner!");

            // Lane stride is PARTITIONS_PER_LANE (192), not TILES_PER_LANE (96):
            // tileOffset already ranges over 0..2*(TILES_PER_LANE-1), so a stride
            // of TILES_PER_LANE would make consecutive lanes overlap and leave the
            // upper half of the partition space unused.
            int tileIndex = (lane - 1) * PARTITIONS_PER_LANE + tileOffset;
            if (read_num == 1)
                tileIndex += 1; // read 1 occupies the odd slot of the tile's pair

            if (tileIndex >= EXPECTED_NUM_PARTITIONS)
                throw new RuntimeException("BUG!  Calculated partition index " + tileIndex
                        + " but the maximum expected is " + (EXPECTED_NUM_PARTITIONS - 1));

            return tileIndex;
        }
    }

    /**
     * Pass-through reducer: writes every incoming SequencedFragment unchanged,
     * paired with a null key so only the fragment itself reaches the output.
     */
    public static class Reduce extends Reducer<SequenceId, SequencedFragment, Text, SequencedFragment> {
        @Override
        public void reduce(SequenceId key, Iterable<SequencedFragment> values, Context context)
                throws IOException, InterruptedException {
            java.util.Iterator<SequencedFragment> it = values.iterator();
            while (it.hasNext())
                context.write(null, it.next());
        }
    }

    /**
     * Configure and run the usort MapReduce job.
     *
     * @param args command-line arguments, handed to USortOptionParser
     * @return 0 on success, 1 if the job fails
     * @throws Exception on configuration or job-submission errors
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        // defaults
        conf.set(SealToolParser.INPUT_FORMAT_CONF, USortOptionParser.InputFormatDefault);
        conf.set(SealToolParser.OUTPUT_FORMAT_CONF, USortOptionParser.OutputFormatDefault);

        // parse command line
        USortOptionParser parser = new USortOptionParser();
        parser.parse(conf, args);

        Job job = new Job(conf, "USort " + parser.getInputPaths().get(0));
        job.setJarByClass(USort.class);

        job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
        job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName()));

        job.setMapperClass(Demux.Map.class);
        job.setMapOutputKeyClass(SequenceId.class);
        job.setMapOutputValueClass(SequencedFragment.class);

        job.setPartitionerClass(USortPartitioner.class);
        // USortPartitioner.getPartition throws unless the job runs with exactly
        // this many reducers, so set it explicitly instead of relying on the
        // cluster default (typically 1).
        job.setNumReduceTasks(NUM_REDUCE_TASKS);

        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SequencedFragment.class);

        for (Path p : parser.getInputPaths())
            FileInputFormat.addInputPath(job, p);

        FileOutputFormat.setOutputPath(job, parser.getOutputPath());

        boolean result = job.waitForCompletion(true);

        if (!result) {
            LOG.fatal(this.getClass().getName() + " failed!");
            return 1;
        } else
            return 0;
    }

    /**
     * Program entry point: delegates execution to SealToolRunner and exits
     * with the tool's return code.
     */
    public static void main(String[] args) throws Exception {
        System.exit(new SealToolRunner().run(new USort(), args));
    }
}