Brush.AdjustMateEdge.java Source code

Java tutorial

Introduction

Here is the source code for Brush.AdjustMateEdge.java

Source

/*
AdjustMateEdge.java
2012  CloudBrush, developed by Chien-Chih Chen (rocky@iis.sinica.edu.tw), 
released under Apache License 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 
at: https://github.com/ice91/CloudBrush
*/

package Brush;

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class AdjustMateEdge extends Configured implements Tool {
    private static final Logger sLogger = Logger.getLogger(AdjustMateEdge.class);

    // AdjustMateEdgeMapper
    ///////////////////////////////////////////////////////////////////////////

    public static class AdjustMateEdgeMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {
        private static int K = 0;
        static public float EXPCOV = 20f;
        static public float READLEN = 36;
        static public long READS = 0;
        static public long CTG_SUM = 0;

        public void configure(JobConf job) {
            K = Integer.parseInt(job.get("K"));
            EXPCOV = Float.parseFloat(job.get("EXPCOV"));
            READLEN = (float) Long.parseLong(job.get("READLENGTH"));
            READS = Long.parseLong(job.get("READS"));
            CTG_SUM = Long.parseLong(job.get("CTG_SUM"));
        }

        public void map(LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Node node = new Node();
            node.fromNodeMsg(nodetxt.toString());
            //\\define unique
            double A_statistic = 0;
            //A_statistic = node.str().length()*EXPCOV/(READLEN-K+1)-(float)((node.str().length()*node.cov()/READLEN)*Math.log(2));
            A_statistic = node.len() * ((float) READS / (float) CTG_SUM) - node.getPairEnds().size() * Math.log(2);
            if (A_statistic > 10) {
                node.setisUnique(true);
                reporter.incrCounter("Brush", "unique", 1);
            } else {
                node.setisUnique(false);
                reporter.incrCounter("Brush", "repeat", 1);
            }
            for (String key : Node.edgetypes) {
                List<String> edges = node.getEdges(key);
                if (edges != null) {
                    for (int i = 0; i < edges.size(); i++) {
                        String[] vals = edges.get(i).split("!");
                        String edge_id = vals[0];
                        String oval_size = vals[1];
                        String con = Node.flip_link(key);
                        output.collect(new Text(edge_id), new Text(
                                Node.DARKMSG + "\t" + con + "\t" + oval_size + "\t" + node.toNodeMsg(true)));
                    }
                }
            }
            output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
            reporter.incrCounter("Brush", "nodes", 1);

        }
    }

    // AdjustMateEdgeReducer
    ///////////////////////////////////////////////////////////////////////////

    public static class AdjustMateEdgeReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
        private static int K = 0;
        static public long INSLEN = 200;
        static public long INSLEN_SD = 20;

        public void configure(JobConf job) {
            K = Integer.parseInt(job.get("K"));
            INSLEN = Long.parseLong(job.get("INSLENGTH"));
            INSLEN_SD = Long.parseLong(job.get("INSLENGTH_SD"));
        }

        class NeighborInfo {
            String type;
            String oval_size;
            Node node;
        }

        public void reduce(Text nodeid, Iterator<Text> iter, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Node node = new Node(nodeid.toString());
            List<NeighborInfo> flist = new ArrayList<NeighborInfo>();
            List<NeighborInfo> rlist = new ArrayList<NeighborInfo>();

            int sawnode = 0;

            while (iter.hasNext()) {
                String msg = iter.next().toString();

                //System.err.println(key.toString() + "\t" + msg);

                String[] vals = msg.split("\t");

                if (vals[0].equals(Node.NODEMSG)) {
                    node.parseNodeMsg(vals, 0);
                    sawnode++;
                } else if (vals[0].equals(Node.DARKMSG)) {
                    NeighborInfo ni = new NeighborInfo();

                    ni.type = vals[1];
                    ni.oval_size = vals[2];
                    ni.node = new Node(vals[3]);
                    ni.node.parseNodeMsg(vals, 4);

                    if (ni.type.charAt(0) == 'f') {
                        flist.add(ni);
                    } else if (ni.type.charAt(0) == 'r') {
                        rlist.add(ni);
                    }
                } else {
                    throw new IOException("Unknown msgtype: " + msg);
                }
            }
            if (sawnode != 1) {
                throw new IOException(
                        "ERROR: Didn't see exactly 1 nodemsg (" + sawnode + ") for " + nodeid.toString());
            }

            int pairs_support;
            if (node.isUnique()) {
                //\\\\\\\\\\\\ 
                boolean keep_edge = false;
                int keep_edge_idx = -1;
                int current_support = 0;
                for (int i = 0; i < flist.size(); i++) {
                    pairs_support = 0;
                    for (int j = 0; j < rlist.size(); j++) {
                        if (!flist.get(i).node.getNodeId().equals(rlist.get(j).node.getNodeId())) {
                            if ((flist.get(i).node.len() + rlist.get(j).node.len() + node.len()
                                    - Integer.parseInt(flist.get(i).oval_size)
                                    - Integer.parseInt(rlist.get(j).oval_size)) < (INSLEN + 2 * INSLEN_SD)) {
                                keep_edge = true; //edge distance small than insert size, do not use pair end information
                                break;
                                //continue;
                            }
                        }
                    }
                    if (keep_edge) {
                        break;
                    }
                    if (!flist.get(i).node.getNodeId().equals(node.getNodeId())) {
                        pairs_support = Node.Count_PairEnd(node.getPairEnds(), flist.get(i).node.getPairEnds());
                    }
                    int tmp;
                    for (int j = 0; j < rlist.size(); j++) {
                        if (!flist.get(i).node.getNodeId().equals(rlist.get(j).node.getNodeId())) {
                            tmp = Node.Count_PairEnd(flist.get(i).node.getPairEnds(),
                                    rlist.get(j).node.getPairEnds());
                            if (tmp > pairs_support) {
                                pairs_support = tmp;
                            }
                        }
                    }

                    if (pairs_support > current_support) {
                        keep_edge_idx = i;
                        current_support = pairs_support;
                    }
                    /*if (pairs_support < 1 ) {
                        node.addRemovalEdge(node.getNodeId(), flist.get(i).type, flist.get(i).node.getNodeId(), Integer.parseInt(flist.get(i).oval_size));
                        node.addRemovalEdge(flist.get(i).node.getNodeId(), Node.flip_link(flist.get(i).type), node.getNodeId(), Integer.parseInt(flist.get(i).oval_size));
                        reporter.incrCounter("Brush", "edge_removal", 1); 
                    }
                    if (pairs_support > 5 || !node.isUnique()) {
                        reporter.incrCounter("Brush", "5_edges", 1); 
                    }
                    if (pairs_support > 10 || !node.isUnique()) {
                        reporter.incrCounter("Brush", "10_edges", 1); 
                    }
                    if ((float)pairs_support/node.cov() > 0.5f || !node.isUnique()) {
                        reporter.incrCounter("Brush", "05_edges", 1); 
                    }
                    if ((float)pairs_support/node.cov() > 0.1f || !node.isUnique()) {
                        reporter.incrCounter("Brush", "01_edges", 1); 
                    }
                    reporter.incrCounter("Brush", "edges", 1); */
                }
                // remove f_edge
                // keep max pair support edge, remove other edge
                if (!keep_edge && current_support > 0) {
                    for (int i = 0; i < flist.size(); i++) {
                        if (i != keep_edge_idx) {
                            node.addRemovalEdge(node.getNodeId(), flist.get(i).type, flist.get(i).node.getNodeId(),
                                    Integer.parseInt(flist.get(i).oval_size));
                            node.addRemovalEdge(flist.get(i).node.getNodeId(), Node.flip_link(flist.get(i).type),
                                    node.getNodeId(), Integer.parseInt(flist.get(i).oval_size));
                            reporter.incrCounter("Brush", "edge_removal", 1);
                        }
                    }
                }

                keep_edge = false;
                keep_edge_idx = -1;
                current_support = 0;
                for (int i = 0; i < rlist.size(); i++) {
                    pairs_support = 0;
                    for (int j = 0; j < flist.size(); j++) {
                        if (!rlist.get(i).node.getNodeId().equals(flist.get(j).node.getNodeId())) {
                            if ((rlist.get(i).node.len() + flist.get(j).node.len() + node.len()
                                    - Integer.parseInt(rlist.get(i).oval_size)
                                    - Integer.parseInt(flist.get(j).oval_size)) < (INSLEN + 2 * INSLEN_SD)) {
                                keep_edge = true;
                                break;
                                //continue;
                            }
                        }
                    }
                    if (keep_edge) {
                        break;
                    }
                    if (!rlist.get(i).node.getNodeId().equals(node.getNodeId())) {
                        pairs_support = Node.Count_PairEnd(node.getPairEnds(), rlist.get(i).node.getPairEnds());
                    }
                    int tmp;
                    for (int j = 0; j < flist.size(); j++) {
                        if (!rlist.get(i).node.getNodeId().equals(flist.get(j).node.getNodeId())) {
                            tmp = Node.Count_PairEnd(rlist.get(i).node.getPairEnds(),
                                    flist.get(j).node.getPairEnds());
                            if (tmp > pairs_support) {
                                pairs_support = tmp;
                            }
                        }
                    }
                    if (pairs_support > current_support) {
                        keep_edge_idx = i;
                        current_support = pairs_support;
                    }
                    /*if (pairs_support < 1) {
                        node.addRemovalEdge(node.getNodeId(), rlist.get(i).type, rlist.get(i).node.getNodeId(), Integer.parseInt(rlist.get(i).oval_size));
                        node.addRemovalEdge(rlist.get(i).node.getNodeId(), Node.flip_link(rlist.get(i).type), node.getNodeId(), Integer.parseInt(rlist.get(i).oval_size));
                        reporter.incrCounter("Brush", "edge_removal", 1); 
                    }
                    if (pairs_support > 5 || !node.isUnique()) {
                        reporter.incrCounter("Brush", "5_edges", 1); 
                    }
                    if (pairs_support > 10 || !node.isUnique()) {
                        reporter.incrCounter("Brush", "10_edges", 1); 
                    }
                    if ((float)pairs_support/node.cov() > 0.5f || !node.isUnique() ) {
                        reporter.incrCounter("Brush", "05_edges", 1); 
                    }
                    if ((float)pairs_support/node.cov() > 0.1f || !node.isUnique()) {
                        reporter.incrCounter("Brush", "01_edges", 1); 
                    }*/
                    reporter.incrCounter("Brush", "edges", 1);
                }
                // remove r_edge
                if (!keep_edge && current_support > 0) {
                    for (int i = 0; i < rlist.size(); i++) {
                        if (i != keep_edge_idx) {
                            node.addRemovalEdge(node.getNodeId(), rlist.get(i).type, rlist.get(i).node.getNodeId(),
                                    Integer.parseInt(rlist.get(i).oval_size));
                            node.addRemovalEdge(rlist.get(i).node.getNodeId(), Node.flip_link(rlist.get(i).type),
                                    node.getNodeId(), Integer.parseInt(rlist.get(i).oval_size));
                            reporter.incrCounter("Brush", "edge_removal", 1);
                        }
                    }
                }
                //\\\\\\\\\\\\
            } // node is unique
            output.collect(nodeid, new Text(node.toNodeMsg()));
            //output.collect(new Text(node.getNodeId()), new Text(choices + ""));
        }
    }

    public RunningJob run(String inputPath, String outputPath, long reads, long ctg_sum) throws Exception {
        sLogger.info("Tool name: AdjustMateEdge");
        sLogger.info(" - input: " + inputPath);
        sLogger.info(" - output: " + outputPath);

        //JobConf conf = new JobConf(Stats.class);
        JobConf conf = new JobConf(AdjustMateEdge.class);
        conf.setJobName("AdjustMateEdge " + inputPath);

        conf.setLong("READS", reads);
        conf.setLong("CTG_SUM", ctg_sum);
        BrushConfig.initializeConfiguration(conf);

        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(AdjustMateEdgeMapper.class);
        conf.setReducerClass(AdjustMateEdgeReducer.class);

        //delete the output directory if it exists already
        FileSystem.get(conf).delete(new Path(outputPath), true);

        return JobClient.runJob(conf);
    }

    public int run(String[] args) throws Exception {
        String inputPath = "";
        String outputPath = "";

        run(inputPath, outputPath, 0, 0);

        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new AdjustMateEdge(), args);
        System.exit(res);
    }
}