edu.msu.cme.rdp.framebot.stat.TaxonAbundance.java Source code

Introduction

Here is the source code for edu.msu.cme.rdp.framebot.stat.TaxonAbundance.java
Source

/*
 * Copyright (C) 2015 Michigan State University <rdpstaff at msu.edu>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package edu.msu.cme.rdp.framebot.stat;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;

/**
 *
 * @author wangqion @msu.edu
 */
public class TaxonAbundance {
    private static final String dformat = "%1$.4f";
    private static Options options = new Options();
    static {
        options.addOption(
                new Option("c", "seqCoverage", true, "contains the ID and coverage separated by space or tab."
                        + " Used to adjust the sequence abundance"));
        options.addOption(
                new Option("e", "identity", true, "the minimum protein identity, default is 0, range [0-100]"));
    }

    public static HashMap<String, Double> parseKmerCoverage(String infile) throws IOException {
        HashMap<String, Double> countMap = new HashMap<String, Double>();
        BufferedReader reader = new BufferedReader(new FileReader(new File(infile)));
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith("#"))
                continue;
            String[] val = line.split("\\s+");
            countMap.put(val[0], Double.parseDouble(val[1]));
        }
        reader.close();
        return countMap;
    }

    public static void mapAbundance(File framebotResult, File lineagefile, String outfile,
            HashMap<String, Double> coveragetMap, double identity) throws IOException {
        HashMap<String, String> lineageMap = GetFrameBotStatMain.readDesc(lineagefile);
        HashMap<String, Double> rankMatchMap = new HashMap<String, Double>(); // taxon rank, 
        HashMap<String, Double> matchMap = new HashMap<String, Double>(); // match name

        double totalCount = 0.0;
        File[] files;
        if (framebotResult.isDirectory()) {
            files = framebotResult.listFiles();
        } else {
            files = new File[1];
            files[0] = framebotResult;
        }
        String line;
        BufferedReader reader;
        for (File f : files) {
            reader = new BufferedReader(new FileReader(f));
            while ((line = reader.readLine()) != null) {
                if (!line.startsWith("STAT"))
                    continue;
                String[] tokens = line.trim().split("\\t");
                double thisIdentity = Double.parseDouble(tokens[5]);
                if (thisIdentity < identity) {
                    continue;
                }

                String match = tokens[1];

                String[] lineage = lineageMap.get(match).split(";");
                String taxonname = lineage[0].trim();
                if (lineage.length > 1) {
                    taxonname = lineage[1];
                    if (lineage[1].equalsIgnoreCase("Proteobacteria")) {// || lineage[1].equalsIgnoreCase("Bacteroidetes")){
                        taxonname = lineage[2].trim();
                    }
                }
                if (taxonname.equals("")) {
                    taxonname = "NA";
                }

                double count = 1.0;
                if (coveragetMap != null) {
                    count = coveragetMap.get(tokens[2]);
                }
                Double prevCount = rankMatchMap.get(taxonname);
                if (prevCount != null) {
                    rankMatchMap.put(taxonname, count + prevCount);
                } else {
                    rankMatchMap.put(taxonname, count);
                }
                Double prevMatchCount = matchMap.get(match);
                if (prevMatchCount != null) {
                    matchMap.put(match, count + prevMatchCount);
                } else {
                    matchMap.put(match, count);
                }
                totalCount += count;
            }
            reader.close();
        }

        PrintStream out = new PrintStream(new File(outfile));
        // print out abundance group by phylum or class (within Proteobacteria)
        out.println("Taxon\tAbundance\tFraction Abundance");
        for (String taxonname : rankMatchMap.keySet()) {
            out.println(taxonname + "\t" + rankMatchMap.get(taxonname) + "\t"
                    + String.format(dformat, rankMatchMap.get(taxonname) / totalCount));
        }

        // print out abundance by closest match   
        out.println("\n\nLineage\tMatchName\tAbundance\tFraction Abundance");
        for (String match : matchMap.keySet()) {
            String lineage = lineageMap.get(match);
            if (lineage.equals("")) {
                lineage = "NA";
            }
            out.println(lineage + "\t" + match + "\t" + matchMap.get(match) + "\t"
                    + String.format(dformat, matchMap.get(match) / totalCount));
        }

        out.close();
    }

    /**
     * this class group the nearest matches by phylum/class, or by match 
     * @param args
     * @throws Exception 
     */
    public static void main(String[] args) throws Exception {
        HashMap<String, Double> coveragetMap = null;
        double identity = 0.0;
        try {
            CommandLine line = new PosixParser().parse(options, args);
            if (line.hasOption("seqCoverage")) {
                String coveragefile = line.getOptionValue("seqCoverage");
                coveragetMap = parseKmerCoverage(coveragefile);
            }
            if (line.hasOption("identity")) {
                identity = Double.parseDouble(line.getOptionValue("identity"));
                if (identity < 0 || identity > 100) {
                    throw new IllegalArgumentException("identity cutoff should be in the range of 0 and 100");
                }
            }

            args = line.getArgs();
            if (args.length != 3) {
                throw new Exception("");
            }

        } catch (Exception e) {
            System.out.println("Command Error: " + e.getMessage());
            new HelpFormatter().printHelp(80, "[options] <FrameBot Alignment file or Dir> <seqLineage> <out file> ",
                    "", options,
                    "seqLineage: a tab-delimited file with ref seqID and lineage, or fasta of ref seq with lineage as the descrption"
                            + "\nframeBot alignment file or Dir: frameBot alignment files "
                            + "\noutfile: output with the nearest match count group by phylum/class; and by match name");
        }
        TaxonAbundance.mapAbundance(new File(args[0]), new File(args[1]), args[2], coveragetMap, identity);
    }

}