Brush.CutRepeatBoundary.java Source code

Java tutorial

Introduction

Here is the source code for Brush.CutRepeatBoundary.java

Source

/*
CutRepeatBoundary.java
2012  CloudBrush, developed by Chien-Chih Chen (rocky@iis.sinica.edu.tw), 
released under Apache License 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 
at: https://github.com/ice91/CloudBrush
*/

package Brush;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Collections;
import java.util.Comparator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class CutRepeatBoundary extends Configured implements Tool {
    private static final Logger sLogger = Logger.getLogger(CutRepeatBoundary.class);

    // CutRepeatBoundaryMapper
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Map phase of the CutRepeatBoundary job.
     *
     * For every input node it re-emits the node itself under its own id and,
     * for each of its edges, sends one DARKMSG record to the node at the other
     * end of that edge.
     */
    public static class CutRepeatBoundaryMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {
        private static int K = 0;

        static public float KMERCOV = 30f;
        static public float READLEN = 36;

        /**
         * Reads the "K", "KMERCOV" and "READLENGTH" job properties into the
         * static fields of this mapper.
         */
        public void configure(JobConf job) {
            K = Integer.parseInt(job.get("K"));
            KMERCOV = Float.parseFloat(job.get("KMERCOV"));
            READLEN = (float) Long.parseLong(job.get("READLENGTH"));
        }

        /**
         * Parses one serialized node and emits its outgoing messages.
         *
         * Each DARKMSG value is tab separated and carries, in order: the
         * message tag, the flipped link type, this node's id, this node's raw
         * sequence, the overlap size taken from the edge entry, this node's
         * coverage and a 0/1 "unique" flag.
         *
         * @param lineid   input key (not used)
         * @param nodetxt  serialized node message
         * @param output   collector receiving the neighbour messages and the node itself
         * @param reporter used to bump the "Brush"/"nodes" counter
         */
        public void map(LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Node node = new Node();
            node.fromNodeMsg(nodetxt.toString());

            // The node is flagged unique when this score exceeds 10.
            float aStatistic = node.str().length() * KMERCOV / (READLEN - K + 1)
                    - (float) ((node.str().length() * node.cov() / READLEN) * Math.log(2));
            int uniqueFlag = (aStatistic > 10) ? 1 : 0;

            for (String linkType : Node.edgetypes) {
                List<String> neighbours = node.getEdges(linkType);
                if (neighbours == null) {
                    continue;
                }
                for (String neighbour : neighbours) {
                    // Edge entries have the form "<neighbour id>!<overlap size>".
                    String[] parts = neighbour.split("!");
                    String targetId = parts[0];
                    String overlap = parts[1];
                    String reverseLink = Node.flip_link(linkType);
                    output.collect(new Text(targetId),
                            new Text(Node.DARKMSG + "\t" + reverseLink + "\t" + node.getNodeId() + "\t"
                                    + node.str_raw() + "\t" + overlap + "\t" + node.cov() + "\t" + uniqueFlag));
                }
            }

            // Always forward the node itself so the reducer sees exactly one node message.
            output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
            reporter.incrCounter("Brush", "nodes", 1);
        }
    }

    // CutRepeatBoundaryReducer
    ///////////////////////////////////////////////////////////////////////////

    public static class CutRepeatBoundaryReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
        private static int K = 0;
        private static int READLEN = 0;
        private static float majority = 0.6f;
        private static float PWM_N = 0.1f;
        static public long READS = 0;
        static public long CTG_SUM = 0;
        static public float KMERCOV = 30f;

        /**
         * Loads the reducer settings from the job configuration: "K",
         * "READLENGTH", "MAJORITY", "PWM_N" and "KMERCOV". The values are
         * stored in the static fields of this class.
         *
         * @param job configuration of the running job
         */
        public void configure(JobConf job) {
            final String kmerSize = job.get("K");
            K = Integer.parseInt(kmerSize);

            final String readLength = job.get("READLENGTH");
            READLEN = (int) Long.parseLong(readLength);

            final String majorityRatio = job.get("MAJORITY");
            majority = Float.parseFloat(majorityRatio);

            final String pwmN = job.get("PWM_N");
            PWM_N = Float.parseFloat(pwmN);

            final String kmerCoverage = job.get("KMERCOV");
            KMERCOV = Float.parseFloat(kmerCoverage);
        }

        public class EdgeInfo {
            public String id;
            public String str;
            public String edge_type;
            public int overlap_size;
            public float cov;
            public int unique;

            // for gray edge
            public EdgeInfo(String[] vals, int offset) throws IOException {
                id = vals[offset + 1];
                edge_type = vals[offset + 0];
                str = vals[offset + 2];
                overlap_size = Integer.parseInt(vals[offset + 3]);
                cov = Float.parseFloat(vals[offset + 4]);
                unique = Integer.parseInt(vals[offset + 5]);
                // for black edge's end tag
            }

            // for black edge
            public EdgeInfo(String con1, String node_id1, String str_raw1, int oval_size1, float cov1, int tips1) {
                id = node_id1;
                edge_type = con1;
                str = str_raw1;
                overlap_size = oval_size1;
                cov = cov1;
                unique = tips1;
            }

            public String toString() {
                return edge_type + " " + id + " " + overlap_size + " " + str + " " + cov;
            }
        }

        /**
         * Orders edges by the length of the decoded neighbour sequence minus
         * the overlap size, largest first.
         *
         * Edges with the same value compare as equal (0). The earlier version
         * returned -1 for ties in both directions, which breaks the
         * {@link Comparator} contract and can make sorting throw
         * "Comparison method violates its general contract!".
         */
        class EdgeComparator implements Comparator<EdgeInfo> {
            public int compare(EdgeInfo obj1, EdgeInfo obj2) {
                int extension1 = Node.dna2str(obj1.str).length() - obj1.overlap_size;
                int extension2 = Node.dna2str(obj2.str).length() - obj2.overlap_size;
                if (extension1 > extension2) {
                    return -1;
                }
                if (extension1 < extension2) {
                    return 1;
                }
                return 0;
            }
        }

        /**
         * Orders edges by overlap size (largest first), then by length of the
         * stored sequence string (shortest first), then by id (ascending).
         *
         * Two edges that agree on all three keys compare as equal (0). The
         * earlier version returned 1 in that case for both argument orders,
         * which violates the {@link Comparator} contract.
         */
        class OvelapSizeComparator implements Comparator<EdgeInfo> {
            public int compare(EdgeInfo obj1, EdgeInfo obj2) {
                if (obj1.overlap_size != obj2.overlap_size) {
                    // Larger overlap sorts first.
                    return obj1.overlap_size > obj2.overlap_size ? -1 : 1;
                }
                int length1 = obj1.str.length();
                int length2 = obj2.str.length();
                if (length1 != length2) {
                    // Shorter stored sequence sorts first.
                    return length1 < length2 ? -1 : 1;
                }
                return obj1.id.compareTo(obj2.id);
            }
        }

        /**
         * Ordering used for the forward edge list: overlap size (largest
         * first), then length of the stored sequence string (shortest first),
         * then id (ascending).
         *
         * Two edges that agree on all three keys compare as equal (0). The
         * earlier version returned 1 in that case for both argument orders,
         * which violates the {@link Comparator} contract.
         */
        class OvelapSizeComparator_f implements Comparator<EdgeInfo> {
            public int compare(EdgeInfo obj1, EdgeInfo obj2) {
                if (obj1.overlap_size != obj2.overlap_size) {
                    // Larger overlap sorts first.
                    return obj1.overlap_size > obj2.overlap_size ? -1 : 1;
                }
                int length1 = obj1.str.length();
                int length2 = obj2.str.length();
                if (length1 != length2) {
                    // Shorter stored sequence sorts first.
                    return length1 < length2 ? -1 : 1;
                }
                return obj1.id.compareTo(obj2.id);
            }
        }

        /**
         * Ordering used for the reverse edge list: overlap size (largest
         * first), then length of the stored sequence string (shortest first),
         * then id in descending order.
         *
         * Two edges that agree on all three keys compare as equal (0). The
         * earlier version returned -1 in that case for both argument orders,
         * which violates the {@link Comparator} contract.
         */
        class OvelapSizeComparator_r implements Comparator<EdgeInfo> {
            public int compare(EdgeInfo obj1, EdgeInfo obj2) {
                if (obj1.overlap_size != obj2.overlap_size) {
                    // Larger overlap sorts first.
                    return obj1.overlap_size > obj2.overlap_size ? -1 : 1;
                }
                int length1 = obj1.str.length();
                int length2 = obj2.str.length();
                if (length1 != length2) {
                    // Shorter stored sequence sorts first.
                    return length1 < length2 ? -1 : 1;
                }
                // Arguments swapped on purpose: ids are ordered descending here.
                return obj2.id.compareTo(obj1.id);
            }
        }

        /**
         * Rebuilds one node from its messages, classifies it as unique or
         * repeat, marks edges for removal and writes the node back out.
         *
         * Expects exactly one NODEMSG for the key plus any number of DARKMSG
         * records. DARKMSG records are split by the first character of their
         * link type into a forward ('f') and a reverse ('r') list, and each
         * list is pruned independently with the same procedure.
         *
         * @param nodeid   id of the node being reduced
         * @param iter     messages addressed to this node
         * @param output   receives the updated node message
         * @param reporter receives the "Brush" counters "unique", "repeat" and "edge_removal"
         * @throws IOException on an unknown message type or when the number of
         *                     node messages is not exactly one
         */
        public void reduce(Text nodeid, Iterator<Text> iter, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Node node = new Node(nodeid.toString());
            List<EdgeInfo> flist = new ArrayList<EdgeInfo>();
            List<EdgeInfo> rlist = new ArrayList<EdgeInfo>();

            int sawnode = 0;

            while (iter.hasNext()) {
                String msg = iter.next().toString();
                String[] vals = msg.split("\t");

                if (vals[0].equals(Node.NODEMSG)) {
                    node.parseNodeMsg(vals, 0);
                    sawnode++;
                } else if (vals[0].equals(Node.DARKMSG)) {
                    EdgeInfo ei = new EdgeInfo(vals[1], vals[2], vals[3], Integer.parseInt(vals[4]),
                            Float.parseFloat(vals[5]), Integer.parseInt(vals[6]));
                    char side = ei.edge_type.charAt(0);
                    if (side == 'f') {
                        flist.add(ei);
                    } else if (side == 'r') {
                        rlist.add(ei);
                    }
                } else {
                    throw new IOException("Unknown msgtype: " + msg);
                }
            }

            if (sawnode != 1) {
                throw new IOException(
                        "ERROR: Didn't see exactly 1 nodemsg (" + sawnode + ") for " + nodeid.toString());
            }

            // Classify the node: it is treated as unique when this score exceeds 10.
            double A_statistic = node.str().length() * KMERCOV / (READLEN - K + 1)
                    - (float) ((node.str().length() * node.cov() / READLEN) * Math.log(2));
            if (A_statistic > 10) {
                node.setisUnique(true);
                reporter.incrCounter("Brush", "unique", 1);
            } else {
                node.setisUnique(false);
                reporter.incrCounter("Brush", "repeat", 1);
            }

            pruneEdges(node, flist, new OvelapSizeComparator_f(), reporter);
            pruneEdges(node, rlist, new OvelapSizeComparator_r(), reporter);

            output.collect(nodeid, new Text(node.toNodeMsg()));
        }

        /**
         * Applies the three pruning steps to one side (forward or reverse) of
         * the node. Lists with fewer than two edges are left untouched.
         *
         * 1. Unique nodes only: sort the list and drop edges whose extension
         *    disagrees with the consensus of all extensions.
         * 2. Drop edges that point back to the node itself.
         * 3. If an overlap size shared by consecutive entries is smaller than
         *    the overlap of the first entry, drop every edge whose overlap is
         *    at or below that shared size.
         *
         * NOTE(review): the list is only sorted when the node is unique, yet
         * step 3 reads the first entry as the reference overlap and looks for
         * runs of consecutive equal overlaps. For non-unique nodes it
         * therefore works on message arrival order - confirm this is intended.
         */
        private void pruneEdges(Node node, List<EdgeInfo> edges, Comparator<? super EdgeInfo> order,
                Reporter reporter) throws IOException {
            if (edges.size() <= 1) {
                return;
            }

            if (node.isUnique()) {
                Collections.sort(edges, order);
                removeConsensusMismatches(node, edges, reporter);
            }

            // Self-loops are always removed.
            for (EdgeInfo edge : edges) {
                if (edge.id.equals(node.getNodeId())) {
                    markForRemoval(node, edge, reporter);
                }
            }

            int boundary_overlap = findBoundaryOverlap(edges);
            boolean cut_boundary = boundary_overlap != 0 && boundary_overlap < edges.get(0).overlap_size;
            if (cut_boundary) {
                for (EdgeInfo edge : edges) {
                    if (edge.overlap_size <= boundary_overlap) {
                        markForRemoval(node, edge, reporter);
                    }
                }
            }
        }

        /**
         * Builds a consensus over the extensions of all edges (each weighted
         * by its coverage) and marks every edge whose extension contradicts it.
         * Nothing is removed when no consensus is returned.
         */
        private void removeConsensusMismatches(Node node, List<EdgeInfo> edges, Reporter reporter)
                throws IOException {
            List<String> weighted = new ArrayList<String>();
            for (EdgeInfo edge : edges) {
                // Entry format handed to Node.Consensus: "<extension>!<coverage>".
                weighted.add(extensionOf(edge) + "!" + edge.cov);
            }
            String consensus = Node.Consensus(weighted, majority, PWM_N);
            if (consensus == null) {
                return;
            }
            for (EdgeInfo edge : edges) {
                if (contradicts(extensionOf(edge), consensus)) {
                    markForRemoval(node, edge, reporter);
                }
            }
        }

        /**
         * Returns the part of the neighbour's decoded sequence that lies
         * beyond the overlap; the sequence is reverse-complemented first when
         * the second character of the link type is 'r'.
         */
        private String extensionOf(EdgeInfo edge) throws IOException {
            String str = Node.dna2str(edge.str);
            if (edge.edge_type.charAt(1) == 'r') {
                str = Node.rc(str);
            }
            return str.substring(edge.overlap_size);
        }

        /**
         * True when, within the common prefix length, some position differs
         * from the consensus and the consensus character there is not 'N'.
         */
        private static boolean contradicts(String extension, String consensus) {
            int limit = Math.min(extension.length(), consensus.length());
            for (int j = 0; j < limit; j++) {
                char expected = consensus.charAt(j);
                if (extension.charAt(j) != expected && expected != 'N') {
                    return true;
                }
            }
            return false;
        }

        /**
         * Scans the list in its current order for the first overlap size that
         * accumulates a count above 1 over consecutive entries and returns it,
         * or 0 when there is none. A new run starts with a count of 1; each
         * further entry of the same overlap adds its coverage.
         */
        private int findBoundaryOverlap(List<EdgeInfo> edges) {
            int current_overlap = 0;
            float overlap_count = 0;
            for (EdgeInfo edge : edges) {
                if (edge.overlap_size == current_overlap) {
                    overlap_count = overlap_count + edge.cov;
                    if (overlap_count > 1) {
                        return edge.overlap_size;
                    }
                } else {
                    overlap_count = 1;
                    current_overlap = edge.overlap_size;
                }
            }
            return 0;
        }

        /**
         * Registers the removal of an edge in both directions on the node and
         * bumps the "edge_removal" counter.
         */
        private void markForRemoval(Node node, EdgeInfo edge, Reporter reporter) throws IOException {
            node.addRemovalEdge(node.getNodeId(), edge.edge_type, edge.id, edge.overlap_size);
            node.addRemovalEdge(edge.id, Node.flip_link(edge.edge_type), node.getNodeId(),
                    edge.overlap_size);
            reporter.incrCounter("Brush", "edge_removal", 1);
        }
    }

    // Run Tool
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Configures and runs the CutRepeatBoundary MapReduce job, blocking until
     * it finishes. Any existing output directory is deleted first.
     *
     * @param inputPath  path of the serialized graph to read
     * @param outputPath path the updated graph is written to
     * @param reads      stored in the job configuration under "READS"
     * @param ctg_sum    stored in the job configuration under "CTG_SUM"
     * @return handle of the job that was run
     */
    public RunningJob run(String inputPath, String outputPath, long reads, long ctg_sum) throws Exception {
        sLogger.info("Tool name: CutRepeatBoundary");
        sLogger.info(" - input: " + inputPath);
        sLogger.info(" - output: " + outputPath);

        final JobConf job = new JobConf(CutRepeatBoundary.class);
        job.setJobName("CutRepeatBoundary " + inputPath + " " + BrushConfig.K);
        job.setLong("READS", reads);
        job.setLong("CTG_SUM", ctg_sum);

        BrushConfig.initializeConfiguration(job);

        // Input and output locations.
        final Path source = new Path(inputPath);
        FileInputFormat.addInputPath(job, source);
        final Path destination = new Path(outputPath);
        FileOutputFormat.setOutputPath(job, destination);

        // Plain text in, plain text out; every key and value is a Text.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(CutRepeatBoundaryMapper.class);
        job.setReducerClass(CutRepeatBoundaryReducer.class);

        // Remove a stale output directory, otherwise the job refuses to start.
        FileSystem.get(job).delete(destination, true);

        return JobClient.runJob(job);
    }

    // Parse Arguments and run
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Command line entry point used by {@link ToolRunner}.
     *
     * Expects the input path as the first argument and the output path as the
     * second. The earlier version ignored {@code args} and passed empty
     * strings as paths, so the job could never be started from the command
     * line.
     *
     * @param args {@code <input path> <output path>}
     * @return 0 when the job was run, -1 when the arguments are missing
     */
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: CutRepeatBoundary <input path> <output path>");
            return -1;
        }

        String inputPath = args[0];
        String outputPath = args[1];
        // Same fixed k-mer size the stand-alone entry point has always used.
        BrushConfig.K = 21;

        run(inputPath, outputPath, 0, 0);
        return 0;
    }

    // Main
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Launches the tool through {@link ToolRunner} and exits the JVM with the
     * tool's return code.
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new CutRepeatBoundary(), args));
    }
}