Brush.TransitiveReduction.java Source code

Java tutorial

Introduction

Here is the source code for Brush.TransitiveReduction.java

Source

/*
TransitiveReduction.java
2012  CloudBrush, developed by Chien-Chih Chen (rocky@iis.sinica.edu.tw), 
released under Apache License 2.0 (http://www.apache.org/licenses/LICENSE-2.0) 
at: https://github.com/ice91/CloudBrush
*/

package Brush;

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

public class TransitiveReduction extends Configured implements Tool {
    private static final Logger sLogger = Logger.getLogger(TransitiveReduction.class);

    /**
     * Returns the smaller of two ints.
     *
     * @param a first value
     * @param b second value
     * @return {@code a} if it is smaller than {@code b}, otherwise {@code b}
     */
    public static int _min2(int a, int b) {
        return Math.min(a, b);
    }

    /**
     * Returns the larger of two ints.
     *
     * @param a first value
     * @param b second value
     * @return {@code a} if it is larger than {@code b}, otherwise {@code b}
     */
    public static int _max2(int a, int b) {
        return Math.max(a, b);
    }

    /**
     * Returns the smallest of three ints.
     *
     * @param a first value
     * @param b second value
     * @param c third value
     * @return the minimum of {@code a}, {@code b} and {@code c}
     */
    public static int _min3(int a, int b, int c) {
        int smallerOfLastTwo = Math.min(b, c);
        return Math.min(a, smallerOfLastTwo);
    }

    /**
     * Computes the Levenshtein edit distance between two strings, where an
     * insertion, a deletion and a substitution each cost 1.
     *
     * <p>Only the previous and the current row of the dynamic-programming
     * table are kept, so memory use is proportional to {@code word2.length()}
     * rather than to the product of both lengths.
     *
     * @param word1 first string (must not be null)
     * @param word2 second string (must not be null)
     * @return the minimum number of single-character edits turning
     *         {@code word1} into {@code word2}
     */
    public static int fastdistance(String word1, String word2) {
        int len1 = word1.length();
        int len2 = word2.length();

        // previous[j] is the distance between the first (i - 1) chars of word1
        // and the first j chars of word2; row 0 is "insert j characters".
        int[] previous = new int[len2 + 1];
        int[] current = new int[len2 + 1];
        for (int j = 0; j <= len2; j++) {
            previous[j] = j;
        }

        for (int i = 1; i <= len1; i++) {
            char w1 = word1.charAt(i - 1);
            // Column 0 is "delete i characters".
            current[0] = i;
            for (int j = 1; j <= len2; j++) {
                int e = (w1 == word2.charAt(j - 1)) ? 0 : 1;
                int deletion = previous[j] + 1;
                int insertion = current[j - 1] + 1;
                int substitution = previous[j - 1] + e;
                current[j] = Math.min(Math.min(deletion, insertion), substitution);
            }
            // Recycle the two buffers instead of allocating a new row.
            int[] swap = previous;
            previous = current;
            current = swap;
        }

        // After the final swap the last computed row lives in 'previous'
        // (and for len1 == 0 it is still the initial row).
        return previous[len2];
    }
    // TransitiveReductionMapper
    ///////////////////////////////////////////////////////////////////////////

    /**
     * Map phase of the transitive reduction job.
     *
     * <p>Each input line is parsed as a serialized {@code Node}. For every
     * edge of that node, one overlap message ({@code Node.OVALMSG}) is sent
     * to the neighbor named by the edge, carrying this node's id, its raw
     * sequence, the flipped link type and the overlap size. Finally the node
     * itself is re-emitted under its own id so the reducer can rebuild it.
     */
    public static class TransitiveReductionMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, Text> {
        /**
         * Emits one overlap message per edge plus the node message itself.
         *
         * @param lineid   input key supplied by the input format (unused)
         * @param nodetxt  serialized node, parsed with {@code Node.fromNodeMsg}
         * @param output   collector receiving (neighbor id, overlap message)
         *                 pairs and the (node id, node message) pair
         * @param reporter used to count processed nodes in counter
         *                 "Brush"/"nodes"
         * @throws IOException if an edge entry is not of the form
         *                     {@code id!overlapSize}, or if collecting fails
         */
        public void map(LongWritable lineid, Text nodetxt, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Node node = new Node();
            node.fromNodeMsg(nodetxt.toString());

            for (String key : Node.edgetypes) {
                List<String> edges = node.getEdges(key);
                if (edges == null) {
                    continue;
                }
                for (String edge : edges) {
                    // Edge entries are "<neighbor id>!<overlap size>".
                    String[] vals = edge.split("!");
                    if (vals.length < 2) {
                        throw new IOException("Malformed edge '" + edge + "' (type " + key + ") on node "
                                + node.getNodeId() + ": expected <id>!<overlap size>");
                    }
                    String edge_id = vals[0];
                    String oval_size = vals[1];
                    // The neighbor receives the link as seen from its own side.
                    String con = Node.flip_link(key);
                    output.collect(new Text(edge_id), new Text(Node.OVALMSG + "\t" + node.getNodeId() + "\t"
                            + node.str_raw() + "\t" + con + "\t" + oval_size));
                }
            }
            output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
            reporter.incrCounter("Brush", "nodes", 1);
        }
    }

    // TransitiveReductionReducer
    ///////////////////////////////////////////////////////////////////////////

    public static class TransitiveReductionReducer extends MapReduceBase
            implements Reducer<Text, Text, Text, Text> {
        private static int K = 0;
        static public float ERRORRATE = 0.00f;

        /**
         * Loads the job property {@code "K"} into the static field {@code K}.
         *
         * <p>{@code ERRORRATE} is not read from the job here and therefore
         * keeps its compiled-in value.
         *
         * @param job the job configuration; must define an integer property "K"
         */
        public void configure(JobConf job) {
            String configuredK = job.get("K");
            K = Integer.parseInt(configuredK);
        }

        /**
         * One overlap message received by the reducer, i.e. one candidate edge
         * between the node being reduced and a neighbor.
         *
         * <p>The field layout mirrors the message the mapper of this job
         * emits: message tag, sender id, sender raw sequence, link type,
         * overlap size.
         */
        public class OverlapInfo {
            /** Id of the node that sent the message. */
            public String id;
            /** Sender's sequence in raw form (the reducer decodes it with {@code Node.dna2str}). */
            public String str;
            /** Link type of the edge, as sent by the mapper. */
            public String edge_type;
            /** Overlap size carried by the message. */
            public int overlap_size;

            /**
             * Parses an overlap message out of a tab-split record.
             *
             * @param vals   the split fields of the message
             * @param offset index in {@code vals} of the message-type tag
             * @throws IOException if the tag at {@code offset} is not
             *                     {@code Node.OVALMSG}
             */
            public OverlapInfo(String[] vals, int offset) throws IOException {
                boolean isOverlapMsg = vals[offset].equals(Node.OVALMSG);
                if (!isOverlapMsg) {
                    throw new IOException("Unknown message type");
                }

                // Payload fields follow the tag in fixed order.
                int cursor = offset;
                id = vals[++cursor];
                str = vals[++cursor];
                edge_type = vals[++cursor];
                overlap_size = Integer.parseInt(vals[++cursor]);
            }

            /** Renders the fields as "edge_type id overlap_size str". */
            public String toString() {
                StringBuilder text = new StringBuilder();
                text.append(edge_type).append(" ");
                text.append(id).append(" ");
                text.append(overlap_size).append(" ");
                text.append(str);
                return text.toString();
            }
        }

        /**
         * Orders overlaps for the "f" pass: larger overlap size first, then
         * shorter raw sequence first, then ascending id.
         *
         * <p>Unlike a version that always answers -1 or 1, this comparator
         * returns 0 for entries that tie on every key and is antisymmetric,
         * as the {@link Comparator} contract requires; otherwise
         * {@code Collections.sort} may throw or produce an arbitrary order
         * when two entries share the same id (for example the same neighbor
         * reached through two different link types). Such entries are
         * further ordered by link type so the result does not depend on the
         * arrival order of the messages.
         */
        class OvelapSizeComparator_f implements Comparator<OverlapInfo> {
            public int compare(OverlapInfo obj1, OverlapInfo obj2) {
                // Primary key: overlap size, descending. Compared directly
                // rather than by subtraction to stay overflow-safe.
                if (obj1.overlap_size != obj2.overlap_size) {
                    return (obj1.overlap_size > obj2.overlap_size) ? -1 : 1;
                }

                // Secondary key: raw sequence length, ascending.
                int len1 = obj1.str.length();
                int len2 = obj2.str.length();
                if (len1 != len2) {
                    return (len1 < len2) ? -1 : 1;
                }

                // Tertiary key: id, ascending.
                int byId = obj1.id.compareTo(obj2.id);
                if (byId != 0) {
                    return (byId < 0) ? -1 : 1;
                }

                // Final tie-break: link type, ascending; 0 only if identical.
                int byType = obj1.edge_type.compareTo(obj2.edge_type);
                if (byType != 0) {
                    return (byType < 0) ? -1 : 1;
                }
                return 0;
            }
        }

        /**
         * Orders overlaps for the "r" pass: larger overlap size first, then
         * shorter raw sequence first, then descending id.
         *
         * <p>Unlike a version that always answers -1 or 1, this comparator
         * returns 0 for entries that tie on every key and is antisymmetric,
         * as the {@link Comparator} contract requires; otherwise
         * {@code Collections.sort} may throw or produce an arbitrary order
         * when two entries share the same id (for example the same neighbor
         * reached through two different link types). Such entries are
         * further ordered by link type so the result does not depend on the
         * arrival order of the messages.
         */
        class OvelapSizeComparator_r implements Comparator<OverlapInfo> {
            public int compare(OverlapInfo obj1, OverlapInfo obj2) {
                // Primary key: overlap size, descending. Compared directly
                // rather than by subtraction to stay overflow-safe.
                if (obj1.overlap_size != obj2.overlap_size) {
                    return (obj1.overlap_size > obj2.overlap_size) ? -1 : 1;
                }

                // Secondary key: raw sequence length, ascending.
                int len1 = obj1.str.length();
                int len2 = obj2.str.length();
                if (len1 != len2) {
                    return (len1 < len2) ? -1 : 1;
                }

                // Tertiary key: id, descending (mirror of the "f" ordering).
                int byId = obj1.id.compareTo(obj2.id);
                if (byId != 0) {
                    return (byId < 0) ? 1 : -1;
                }

                // Final tie-break: link type, descending; 0 only if identical.
                int byType = obj1.edge_type.compareTo(obj2.edge_type);
                if (byType != 0) {
                    return (byType < 0) ? 1 : -1;
                }
                return 0;
            }
        }

        /**
         * Record of an accepted overlap edge, kept so that later candidates
         * can be compared against the part of the neighbor's sequence that
         * lies beyond the overlap.
         */
        public class Prefix {
            /** Id of the neighbor node. */
            public String id;
            /** {@code str} with its first {@code oval_size} characters removed. */
            public String suffix;
            /** Neighbor sequence as passed to the constructor. */
            public String str;
            /** Link type of the edge. */
            public String edge_type;
            /** Size of the overlap. */
            public int oval_size;

            /**
             * @param id1        neighbor id
             * @param edge_type1 link type of the edge
             * @param str1       neighbor sequence; must be at least
             *                   {@code oval_size1} characters long
             * @param oval_size1 overlap size, used as the start index of
             *                   {@link #suffix}
             */
            public Prefix(String id1, String edge_type1, String str1, int oval_size1) {
                this.id = id1;
                this.str = str1;
                this.edge_type = edge_type1;
                this.oval_size = oval_size1;
                // Everything past the overlapping region.
                this.suffix = str1.substring(oval_size1);
            }
        }

        /**
         * Rebuilds one node from its messages and keeps only the overlap
         * edges that survive the maximal-overlap and transitive-reduction
         * filters.
         *
         * <p>Every value is a tab-separated message whose first field is
         * either {@code Node.NODEMSG} (the node itself; exactly one is
         * required) or {@code Node.OVALMSG} (a candidate overlap edge).
         * Overlaps are split by the first character of their link type into
         * an 'f' group and an 'r' group; overlaps whose type starts with any
         * other character are ignored. Each group is filtered independently
         * by {@link #filterOverlapGroup}. Afterwards all edges of the node
         * are cleared and replaced by the surviving ones, and the node is
         * emitted.
         *
         * @param nodeid   id of the node being reduced
         * @param iter     the node message and the overlap messages for this id
         * @param output   receives the rewritten (node id, node message) pair
         * @param reporter receives the "contained_edge" and "trans_edge"
         *                 counters of group "Brush"
         * @throws IOException on an unknown message type or when the number
         *                     of node messages is not exactly one
         */
        public void reduce(Text nodeid, Iterator<Text> iter, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            Node node = new Node(nodeid.toString());
            List<OverlapInfo> o_flist = new ArrayList<OverlapInfo>();
            List<OverlapInfo> o_rlist = new ArrayList<OverlapInfo>();

            int sawnode = 0;

            while (iter.hasNext()) {
                String msg = iter.next().toString();
                String[] vals = msg.split("\t");

                if (vals[0].equals(Node.NODEMSG)) {
                    node.parseNodeMsg(vals, 0);
                    sawnode++;
                } else if (vals[0].equals(Node.OVALMSG)) {
                    OverlapInfo oi = new OverlapInfo(vals, 0);
                    if (oi.edge_type.charAt(0) == 'f') {
                        o_flist.add(oi);
                    } else if (oi.edge_type.charAt(0) == 'r') {
                        o_rlist.add(oi);
                    }
                } else {
                    throw new IOException("Unknown msgtype: " + msg);
                }
            }

            if (sawnode != 1) {
                throw new IOException(
                        "ERROR: Didn't see exactly 1 nodemsg (" + sawnode + ") for " + nodeid.toString());
            }

            // Surviving edges, keyed by link type; filled by both passes.
            // The two passes never share a key because their link types
            // start with different characters.
            Map<String, List<String>> edges_list = new HashMap<String, List<String>>();
            filterOverlapGroup(node, o_flist, new OvelapSizeComparator_f(), edges_list, reporter);
            filterOverlapGroup(node, o_rlist, new OvelapSizeComparator_r(), edges_list, reporter);

            // Replace the node's edges with the surviving ones.
            for (String con : Node.edgetypes) {
                node.clearEdges(con);
                List<String> edges = edges_list.get(con);
                if (edges != null) {
                    node.setEdges(con, edges);
                }
            }
            output.collect(new Text(node.getNodeId()), new Text(node.toNodeMsg()));
        }

        /**
         * Filters one group of candidate overlaps (all 'f' or all 'r') and
         * appends the accepted edges to {@code edges_list}.
         *
         * <p>Candidates are visited in the order given by {@code order}. For
         * each candidate:
         * <ol>
         * <li>its sequence is decoded with {@code Node.dna2str} and reverse
         *     complemented with {@code Node.rc} when the part of the link
         *     type after the first character equals "r";</li>
         * <li>if the overlap spans both the whole candidate sequence and the
         *     whole node sequence, the "contained_edge" counter is
         *     incremented (the edge is not filtered for that reason);</li>
         * <li>maximal-overlap filter: if an edge with the same link type and
         *     the same neighbor id was already accepted, the candidate is
         *     recorded via {@code node.addRemovalEdge} and skipped;</li>
         * <li>transitive filter (only while {@code ERRORRATE <= 0}): if the
         *     candidate's sequence beyond the overlap starts with the
         *     beyond-overlap part of an already accepted neighbor, the
         *     candidate is recorded via {@code node.addRemovalEdge}, counted
         *     as "trans_edge" and skipped. Accepted neighbors with the same
         *     overlap size and the same sequence length as the candidate are
         *     not considered;</li>
         * <li>otherwise the edge is accepted and stored as
         *     {@code "<id>!<overlap size>"} under its link type.</li>
         * </ol>
         *
         * <p>When {@code ERRORRATE > 0} no transitive edges are removed: an
         * error-tolerant comparison is not implemented.
         *
         * @param node       the node being reduced; receives removal edges
         * @param candidates overlaps of one group; sorted in place
         * @param order      visiting order for the candidates
         * @param edges_list accepted edges by link type; updated in place
         * @param reporter   counter sink
         * @throws IOException if a {@code Node} helper reports a failure
         */
        private void filterOverlapGroup(Node node, List<OverlapInfo> candidates, Comparator<OverlapInfo> order,
                Map<String, List<String>> edges_list, Reporter reporter) throws IOException {
            if (candidates.isEmpty()) {
                return;
            }
            Collections.sort(candidates, order);

            // Ids of accepted neighbors per link type (maximal-overlap filter).
            Map<String, Set<String>> acceptedIds = new HashMap<String, Set<String>>();
            // Accepted neighbors of this group (transitive filter).
            List<Prefix> accepted = new ArrayList<Prefix>();

            for (OverlapInfo candidate : candidates) {
                String oval_id = candidate.id;
                String oval_type = candidate.edge_type;
                String oval_dir = oval_type.substring(1);
                int oval_size = candidate.overlap_size;

                String oval_seq = Node.dna2str(candidate.str);
                if (oval_dir.equals("r")) {
                    oval_seq = Node.rc(oval_seq);
                }

                // Contained overlaps are only counted, never dropped here.
                if (oval_size == oval_seq.length() && oval_size == node.str().length()) {
                    reporter.incrCounter("Brush", "contained_edge", 1);
                }

                // Maximal-overlap filter: candidates arrive largest overlap
                // first, so a repeated (type, id) pair is a smaller overlap.
                Set<String> idsForType = acceptedIds.get(oval_type);
                if (idsForType != null && idsForType.contains(oval_id)) {
                    node.addRemovalEdge(oval_id, Node.flip_link(oval_type), node.getNodeId(), oval_size);
                    continue;
                }

                // Transitive-reduction filter.
                String extension = oval_seq.substring(oval_size);
                boolean has_trans_edge = false;
                for (Prefix kept : accepted) {
                    if (kept.oval_size == oval_size && kept.str.length() == oval_seq.length()) {
                        continue;
                    }
                    if (ERRORRATE <= 0 && extension.startsWith(kept.suffix)) {
                        reporter.incrCounter("Brush", "trans_edge", 1);
                        has_trans_edge = true;
                        node.addRemovalEdge(oval_id, Node.flip_link(oval_type), node.getNodeId(), oval_size);
                        break;
                    }
                }
                if (has_trans_edge) {
                    continue;
                }

                // Accept the edge.
                accepted.add(new Prefix(oval_id, oval_type, oval_seq, oval_size));

                List<String> edgesForType = edges_list.get(oval_type);
                if (edgesForType == null) {
                    edgesForType = new ArrayList<String>();
                    edges_list.put(oval_type, edgesForType);
                }
                edgesForType.add(oval_id + "!" + oval_size);

                if (idsForType == null) {
                    idsForType = new HashSet<String>();
                    acceptedIds.put(oval_type, idsForType);
                }
                idsForType.add(oval_id);
            }
        }
    }

    /**
     * Configures and runs the transitive reduction MapReduce job, blocking
     * until it finishes.
     *
     * <p>If this tool carries a configuration (for example the one
     * {@code ToolRunner} injects after parsing generic options such as
     * {@code -D key=value}), the job configuration is derived from it;
     * otherwise a fresh default configuration is used. Any existing output
     * directory is deleted before the job is submitted.
     *
     * @param inputPath  path of the serialized graph to read
     * @param outputPath path to write the reduced graph to; removed
     *                   recursively first if it already exists
     * @return the handle of the completed job
     * @throws Exception if the job cannot be set up or fails
     */
    public RunningJob run(String inputPath, String outputPath) throws Exception {
        sLogger.info("Tool name: TransitiveReduction");
        sLogger.info(" - input: " + inputPath);
        sLogger.info(" - output: " + outputPath);

        // getConf() is null when the tool was instantiated directly rather
        // than through ToolRunner; JobConf cannot copy a null configuration.
        Configuration base = getConf();
        JobConf conf = (base == null)
                ? new JobConf(TransitiveReduction.class)
                : new JobConf(base, TransitiveReduction.class);
        conf.setJobName("TransitiveReduction " + inputPath);

        BrushConfig.initializeConfiguration(conf);

        Path outputDir = new Path(outputPath);
        FileInputFormat.addInputPath(conf, new Path(inputPath));
        FileOutputFormat.setOutputPath(conf, outputDir);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        conf.setMapperClass(TransitiveReductionMapper.class);
        conf.setReducerClass(TransitiveReductionReducer.class);

        // Delete the output directory if it exists already. Resolve the
        // filesystem from the path itself so that an output location outside
        // the default filesystem is handled as well.
        FileSystem outputFs = outputDir.getFileSystem(conf);
        outputFs.delete(outputDir, true);

        return JobClient.runJob(conf);
    }

    /**
     * Command-line entry point used by {@code ToolRunner}.
     *
     * <p>Expects two arguments (after generic Hadoop options have been
     * stripped by {@code ToolRunner}): the input path and the output path.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *             output path
     * @return 0 when the job ran, -1 when the arguments are missing
     * @throws Exception if the job cannot be set up or fails
     */
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: TransitiveReduction <inputPath> <outputPath>");
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        String inputPath = args[0];
        String outputPath = args[1];

        run(inputPath, outputPath);

        return 0;
    }

    /**
     * Launches the tool through {@code ToolRunner} and exits the JVM with the
     * tool's return code.
     *
     * @param args command-line arguments, forwarded to {@code ToolRunner}
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        Configuration cliConf = new Configuration();
        Tool tool = new TransitiveReduction();
        int exitCode = ToolRunner.run(cliConf, tool, args);
        System.exit(exitCode);
    }
}