it.crs4.seal.recab.RecabTable.java Source code

Java tutorial

Introduction

Here is the source code for it.crs4.seal.recab.RecabTable.java

Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.recab;

import it.crs4.seal.common.ClusterUtils;
import it.crs4.seal.common.ContextAdapter;
import it.crs4.seal.common.FormatNameMap;
import it.crs4.seal.common.IMRContext;
import it.crs4.seal.common.ReadPair;
import it.crs4.seal.common.SealToolRunner;

import org.apache.commons.cli.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;
import java.net.URISyntaxException;
import java.io.IOException;
import java.io.FileReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Collection;

public class RecabTable extends Configured implements Tool {
    // Logger shared by the driver and by the nested task classes declared in this file.
    private static final Log LOG = LogFactory.getLog(RecabTable.class);

    // Not referenced anywhere else in this file.
    // NOTE(review): presumably the per-tracker multiplier used elsewhere to pick a
    // default number of reduce tasks -- confirm against the option parser.
    public static final int DEFAULT_RED_TASKS_PER_TRACKER = 3;

    /** Name of the character set used to encode {@link #TableDelim} into bytes. */
    public static final String ASCII = "US-ASCII";

    /** Field delimiter, in string form. */
    public static final String TableDelim = ",";
    /** {@link #TableDelim} encoded with the {@link #ASCII} character set. */
    public static final byte[] TableDelimBytes;

    static {
        try {
            TableDelimBytes = TableDelim.getBytes(ASCII);
        } catch (UnsupportedEncodingException e) {
            // Keep the original exception as the cause so the stack trace of the
            // underlying failure isn't lost when class initialization fails.
            throw new RuntimeException(ASCII + " character set not supported!", e);
        }
    }

    // Name under which the variants file is opened by the map tasks; the driver
    // appends it as the URI fragment of the distributed cache entry.
    public static final String LocalVariantsFile = "variants_table.data";

    // Configuration property naming the format of the variants file, and its accepted values.
    public static final String VariantsFileTypeProperty = "seal.recab.variants-table-type";
    public static final String VariantsFileTypeVcf = "vcf";
    public static final String VariantsFileTypeRod = "rod";

    // Whether to only consider SNP variation locations or to consider all variation types.
    public static final String SnpsOnlyProperty = "seal.recab.snps-only";
    // Default:  consider only SNPs.
    // This must be true: Rod variant files are rejected by the mapper whenever the
    // effective value of SnpsOnlyProperty is false, so a false default would make
    // Rod files unusable unless the property were set explicitly.
    public static final boolean SnpsOnlyDefault = true;

    /**
     * Hadoop mapper adapter: builds the variant reader from the job configuration and
     * forwards every record to a {@link RecabTableMapper}.
     */
    public static class Map extends Mapper<LongWritable, ReadPair, Text, ObservationCount> {
        private RecabTableMapper impl;
        private IMRContext<Text, ObservationCount> contextAdapter;

        /**
         * Creates a reader over the known-variation-sites file named {@link #LocalVariantsFile}.
         *
         * The file format is taken from {@link #VariantsFileTypeProperty}; when that
         * property is unset a warning is logged and VCF is assumed.
         *
         * The configuration is validated <em>before</em> the file is opened, and the
         * file is closed again if wrapping it in a variant reader fails, so no file
         * handle is leaked on any error path.
         *
         * @param conf job configuration to read the variants file settings from
         * @return a reader for the variants file
         * @throws IOException if the variants file cannot be opened
         * @throws IllegalArgumentException if the configured file type is not recognized
         * @throws RuntimeException if a Rod file is requested while
         *         {@link #SnpsOnlyProperty} evaluates to false
         */
        protected VariantReader getVariantFileReader(Configuration conf) throws IOException {
            String variantsFileType = conf.get(VariantsFileTypeProperty);
            if (variantsFileType == null) {
                LOG.warn("Configuration property " + VariantsFileTypeProperty
                        + " isn't set.  Assuming variants file is VCF");
                variantsFileType = VariantsFileTypeVcf;
            }

            final boolean isVcf = variantsFileType.equals(VariantsFileTypeVcf);
            final boolean isRod = variantsFileType.equals(VariantsFileTypeRod);
            if (!isVcf && !isRod) {
                throw new IllegalArgumentException("unrecognized variants file type set in "
                        + VariantsFileTypeProperty + " (accepted values are " + VariantsFileTypeVcf + " and "
                        + VariantsFileTypeRod + ")");
            }

            final boolean snpsOnly = conf.getBoolean(SnpsOnlyProperty, SnpsOnlyDefault);
            if (isRod && !snpsOnly) {
                throw new RuntimeException(
                        "Sorry.  Using all variant types is currently not supported for Rod files.  Please let the Seal developers know if this is important to you.");
            }

            // Only touch the file system once the configuration is known to be usable.
            final Reader in = new FileReader(LocalVariantsFile);
            boolean handedOff = false;
            try {
                final VariantReader reader;
                if (isVcf) {
                    VcfVariantReader vcfReader = new VcfVariantReader(in);
                    vcfReader.setReadSnpsOnly(snpsOnly);
                    reader = vcfReader;
                } else {
                    reader = new RodFileVariantReader(in);
                }
                handedOff = true;
                return reader;
            } finally {
                if (!handedOff) {
                    try {
                        in.close();
                    } catch (IOException ignored) {
                        // Best effort: the exception already propagating is the one worth reporting.
                    }
                }
            }
        }

        /**
         * Instantiates the mapper implementation and hands it the variant reader,
         * the context adapter and the job configuration.
         *
         * @param context Hadoop task context
         * @throws IOException if the variants file cannot be opened
         */
        @Override
        public void setup(Context context) throws IOException {
            impl = new RecabTableMapper();
            contextAdapter = new ContextAdapter<Text, ObservationCount>(context);

            Configuration conf = context.getConfiguration();
            VariantReader reader = getVariantFileReader(conf);
            impl.setup(reader, contextAdapter, conf);
        }

        /**
         * Forwards one input record to the mapper implementation; output goes through
         * the context adapter created in {@link #setup}.
         */
        @Override
        public void map(LongWritable pos, ReadPair pair, Context context)
                throws IOException, InterruptedException {
            impl.map(pos, pair, contextAdapter);
        }
    }

    /**
     * Hadoop combiner adapter: every call is forwarded to a {@link RecabTableCombiner},
     * which writes its output through an {@link IMRContext} wrapping the task context.
     */
    public static class Combiner extends Reducer<Text, ObservationCount, Text, ObservationCount> {
        private RecabTableCombiner impl;
        private IMRContext<Text, ObservationCount> contextAdapter;

        /**
         * Wraps the task context and configures a fresh combiner implementation
         * from the job configuration.
         */
        @Override
        public void setup(Context context) throws IOException {
            this.contextAdapter = new ContextAdapter<Text, ObservationCount>(context);

            final RecabTableCombiner combiner = new RecabTableCombiner();
            this.impl = combiner;
            combiner.setup(context.getConfiguration());
        }

        /** Delegates the values collected for {@code key} to the combiner implementation. */
        @Override
        public void reduce(Text key, Iterable<ObservationCount> values, Context context)
                throws IOException, InterruptedException {
            this.impl.reduce(key, values, this.contextAdapter);
        }
    }

    /**
     * Hadoop reducer adapter: every call is forwarded to a {@link RecabTableReducer},
     * which writes its {@code Text}/{@code Text} output through an {@link IMRContext}
     * wrapping the task context.
     */
    public static class Red extends Reducer<Text, ObservationCount, Text, Text> {
        private RecabTableReducer impl;
        private IMRContext<Text, Text> contextAdapter;

        /**
         * Wraps the task context and configures a fresh reducer implementation
         * from the job configuration.
         */
        @Override
        public void setup(Context context) throws IOException {
            this.contextAdapter = new ContextAdapter<Text, Text>(context);

            final RecabTableReducer reducer = new RecabTableReducer();
            this.impl = reducer;
            reducer.setup(context.getConfiguration());
        }

        /** Delegates the values collected for {@code key} to the reducer implementation. */
        @Override
        public void reduce(Text key, Iterable<ObservationCount> values, Context context)
                throws IOException, InterruptedException {
            this.impl.reduce(key, values, this.contextAdapter);
        }
    }

    /**
     * Registers the variants file with the distributed cache and records its format
     * in the configuration under {@link #VariantsFileTypeProperty}.
     *
     * The cache URI carries {@link #LocalVariantsFile} as its fragment, which is the
     * name the map tasks use to open the file.
     *
     * @param parser parsed command line; exactly one of its VCF / Rod file accessors
     *        is expected to return a non-null value (VCF wins if both do)
     * @throws RuntimeException if neither file is set, or if the resulting path is
     *         not a valid URI (the {@link URISyntaxException} is attached as cause)
     */
    private void distributeVariantsFile(RecabTableOptionParser parser) {
        Configuration conf = getConf();
        DistributedCache.createSymlink(conf); // create symlinks in each task's working directory for the distributed files

        final String variantsFileType;
        final String distPath;

        if (parser.getVcfFile() != null) {
            variantsFileType = VariantsFileTypeVcf;
            distPath = parser.getVcfFile().toString();
        } else if (parser.getRodFile() != null) {
            variantsFileType = VariantsFileTypeRod;
            distPath = parser.getRodFile().toString();
        } else {
            throw new RuntimeException(
                    "BUG!! RecabTableOptionParser defined with getRodFile and getVcfFile both null!");
        }

        conf.set(VariantsFileTypeProperty, variantsFileType);

        final String cacheUri = distPath + "#" + LocalVariantsFile;
        try {
            DistributedCache.addCacheFile(new URI(cacheUri), conf);
        } catch (URISyntaxException e) {
            // Same message as before, but keep the cause so the offending input is traceable.
            throw new RuntimeException("Invalid syntax in path to variants file. " + e, e);
        }
    }

    /**
     * Parses the command line, configures the map/combine/reduce job and blocks
     * until the job has finished.
     *
     * @param args command line arguments, handed to {@link RecabTableOptionParser}
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws Exception propagated from option parsing, job setup or job execution
     */
    @Override
    public int run(String[] args) throws Exception {
        LOG.info("starting");

        final RecabTableOptionParser options = new RecabTableOptionParser();
        options.parse(getConf(), args);

        LOG.info("Using " + options.getNReduceTasks() + " reduce tasks");

        // The Job constructor takes a *copy* of the Configuration, so the variants
        // file has to be registered before the job object exists.
        distributeVariantsFile(options);

        final String jobName = "RecabTable " + options.getInputPaths().get(0);
        final Job job = new Job(getConf(), jobName);
        job.setJarByClass(RecabTable.class);

        // Input format: looked up by name, falling back to "sam".
        final String inputFormatName = job.getConfiguration().get(RecabTableOptionParser.INPUT_FORMAT_CONF, "sam");
        job.setInputFormatClass(FormatNameMap.getInputFormat(inputFormatName));
        LOG.info("Using input format " + job.getInputFormatClass().getName());

        for (Path inputPath : options.getInputPaths()) {
            FileInputFormat.addInputPath(job, inputPath);
        }

        // Map stage
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ObservationCount.class);

        // Combine stage
        job.setCombinerClass(Combiner.class);

        // Reduce stage
        job.setReducerClass(Red.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileOutputFormat.setOutputPath(job, options.getOutputPath());

        // Submit and wait, reporting progress while the job runs.
        final boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            LOG.fatal(getClass().getName() + " failed!");
            return 1;
        }

        LOG.info("done");
        return 0;
    }

    /**
     * Command line entry point: runs this tool through {@link SealToolRunner} and
     * exits the JVM with the status it returns.
     */
    public static void main(String[] args) throws Exception {
        final SealToolRunner runner = new SealToolRunner();
        final int exitStatus = runner.run(new RecabTable(), args);
        System.exit(exitStatus);
    }
}