popgenutils.dfcp.PrepareVCF4DFCP.java Source code

Java tutorial

Introduction

Here is the source code for popgenutils.dfcp.PrepareVCF4DFCP.java

Source

package popgenutils.dfcp;
/**
 * Classname
 * 
 * version 0.1
 *
 * Jul 6, 2015
 * 
 * The contents of this file are subject to the terms of the GNU
 * General Public License Version 3. For more information visit 
 * http://www.gnu.org/licenses/gpl.html
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

/**
 * @author Ouroboros
 *
 */
public class PrepareVCF4DFCP {

    static String filename;
    static int window_size;
    static String command;
    static int overlap_size;
    static String output_dir;
    static double pmask = -1, pgeno = -1;
    static int amask = -1, ageno = -1, sgeno = -1;
    static int hetindex = -1;
    static String popmappingfile;
    static String popstokeep;

    String[] gt_parts;
    StringBuilder gt_string_builder;

    /**
     * @param args
     */
    public static void main(String[] args) {
        CommandLineParser parser = new PosixParser();
        final Options options = new Options();

        options.addOption("help", false, "print this help.");

        // SPLIT COMMAND
        options.addOption("window", true, "used with [split] command; size of window");
        options.addOption("overlap", true, "used with [split] command; size of overlap in adjacent windows");

        // MASK COMMAND
        options.addOption("pmask", true, "used with [mask] command; percentage of alleles masked (made unknown)");
        options.addOption("amask", true,
                "used with [mask] command; absolute number of alleles masked (made unknown)");

        // MAKEGENO COMMAND
        options.addOption("pgeno", true,
                "used with [makegeno] command; percentage of individuals turned into genotypes");
        options.addOption("ageno", true,
                "used with [makegeno] command; absolute number of individuals turned into genotypes");
        options.addOption("sgeno", true, "used with [makegeno] command; specific individual turned into genotypes");

        // HET FILTER COMMAND
        options.addOption("hetindex", true,
                "used with [hetfilter] command; index of individual to filter the VCF's heterozygous sites");

        // POP FILTER COMMAND
        options.addOption("pops", true, "used with [popfilter] command; populations seperated by , to keep");
        options.addOption("mapfile", true, "used with [popfilter] command; sample to population map file");

        CommandLine cmdline = null;
        try {
            cmdline = parser.parse(options, args);
            args = cmdline.getArgs();
        } catch (ParseException e) {
            System.err.println(e.getMessage());
            HelpFormatter help = new HelpFormatter();
            help.printHelp("preparevcf.jar [options] [command] [filename] [output_dir]", options);
            System.exit(-1);
        }

        command = args[0];
        filename = args[1];
        output_dir = args[2] + "/";

        (new File(output_dir)).mkdirs();

        if (cmdline.hasOption("window")) {
            window_size = Integer.parseInt(cmdline.getOptionValue("window"));
        }
        if (cmdline.hasOption("overlap")) {
            overlap_size = Integer.parseInt(cmdline.getOptionValue("overlap"));
        }
        if (cmdline.hasOption("pmask")) {
            pmask = Double.parseDouble(cmdline.getOptionValue("pmask"));
        }
        if (cmdline.hasOption("amask")) {
            amask = Integer.parseInt(cmdline.getOptionValue("amask"));
        }
        if (cmdline.hasOption("pgeno")) {
            pgeno = Double.parseDouble(cmdline.getOptionValue("pgeno"));
        }
        if (cmdline.hasOption("ageno")) {
            ageno = Integer.parseInt(cmdline.getOptionValue("ageno"));
        }
        if (cmdline.hasOption("hetindex")) {
            hetindex = Integer.parseInt(cmdline.getOptionValue("hetindex"));
        }

        if (cmdline.hasOption("pops")) {
            popstokeep = cmdline.getOptionValue("pops");
        }
        if (cmdline.hasOption("mapfile")) {
            popmappingfile = cmdline.getOptionValue("mapfile");
        }

        PrepareVCF4DFCP preparer = new PrepareVCF4DFCP();

        switch (command.toLowerCase()) {
        case "split":
            preparer.split();
            break;
        case "mask":
            preparer.mask();
            break;
        case "makegeno":
            preparer.makegeno();
            break;

        case "hetfilter":
            preparer.filterhet();
            break;

        case "popfilter":
            preparer.filterpop();
            break;

        default:
            HelpFormatter help = new HelpFormatter();
            help.printHelp("preparevcf.jar [options] [command] [filename] [output_dir]", options);
            System.exit(-1);
            break;
        }

    }

    /**
     * 
     */
    private void filterpop() {
        Set<Integer> indices_to_keep = new HashSet<Integer>();
        Map<String, String> sample_to_pop = new HashMap<String, String>();
        Map<String, String> sample_to_superpop = new HashMap<String, String>();
        Set<String> pops_to_keep = new HashSet<String>();

        for (int i = 0; i < 9; i++) {
            indices_to_keep.add(i);
        }

        String[] popsparts = popstokeep.split(",");
        for (String pop : popsparts) {
            pops_to_keep.add(pop);
        }

        try (BufferedReader in = Files.newBufferedReader(Paths.get(popmappingfile), Charset.forName("UTF-8"))) {
            String line = null;
            while ((line = in.readLine()) != null) {
                String[] parts = line.split("\t");
                sample_to_pop.put(parts[0], parts[1]);
                sample_to_superpop.put(parts[0], parts[2]);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        StringBuilder header = new StringBuilder();
        try (BufferedReader in = Files.newBufferedReader(Paths.get(filename), Charset.forName("UTF-8"))) {
            BufferedWriter out = null;

            String line = null;
            while ((line = in.readLine()) != null) {

                if (line.startsWith("#CHROM")) {
                    //samples begin at 9
                    out = Files.newBufferedWriter(
                            Paths.get(output_dir + "/" + "popfilter_" + Paths.get(filename).getFileName()),
                            Charset.forName("UTF-8"));
                    out.write(header.toString());
                    String[] parts = line.split("\t");
                    for (int i = 9; i < parts.length; i++) {
                        if (pops_to_keep.contains(sample_to_superpop.get(parts[i])))
                            indices_to_keep.add(i);
                    }
                    out.write(parts[0]);
                    for (int i = 1; i < parts.length; i++) {
                        if (indices_to_keep.contains(i))
                            out.write("\t" + parts[i]);
                    }
                    out.write(System.getProperty("line.separator"));
                } else if (line.startsWith("#")) {
                    header.append(line + System.getProperty("line.separator"));
                } else {
                    // format at 8
                    String[] parts = line.split("\t");
                    out.write(parts[0]);
                    for (int i = 1; i < parts.length; i++) {
                        if (indices_to_keep.contains(i))
                            out.write("\t" + parts[i]);
                    }
                    out.write(System.getProperty("line.separator"));

                }
            }
            out.close();
        } catch (IOException e) {
            System.err.println("could not read from file " + filename);
            e.printStackTrace();
        }
    }

    /**
     * 
     */
    private void filterhet() {
        StringBuilder header = new StringBuilder();
        try (BufferedReader in = Files.newBufferedReader(Paths.get(filename), Charset.forName("UTF-8"))) {
            BufferedWriter out = null;

            String line = null;
            while ((line = in.readLine()) != null) {

                if (line.startsWith("#CHROM")) {
                    //samples begin at 9
                    out = Files.newBufferedWriter(
                            Paths.get(
                                    output_dir + "/" + hetindex + "_hetfilter" + Paths.get(filename).getFileName()),
                            Charset.forName("UTF-8"));
                    out.write(header.toString());
                    out.write(line + System.getProperty("line.separator"));
                } else if (line.startsWith("#")) {
                    header.append(line + System.getProperty("line.separator"));
                } else {
                    // format at 8
                    String[] parts = line.split("\t");
                    String[] parts2 = parts[8].split(":");
                    int gt_pos = 0;
                    for (int i = 1; i < parts2.length; i++) {
                        if (parts2[i].equals("GT"))
                            gt_pos = i;
                    }
                    parts2 = parts[9 + hetindex].split("\\||/");
                    if (!parts2[0].equals(parts2[1]))
                        out.write(line + System.getProperty("line.separator"));
                }
            }
            out.close();
        } catch (IOException e) {
            System.err.println("could not read from file " + filename);
            e.printStackTrace();
        }
    }

    /**
     * 
     */
    private void makegeno() {
        gt_string_builder = new StringBuilder();
        StringBuilder header = new StringBuilder();
        try (BufferedReader in = Files.newBufferedReader(Paths.get(filename), Charset.forName("UTF-8"))) {
            BufferedWriter out = null;
            BufferedWriter out_bgl_ref = null;
            BufferedWriter out_bgl_geno = null;

            String line = null;
            int number_of_individuals = 0;
            Set<Integer> individuals_to_genotype = new HashSet<Integer>();
            while ((line = in.readLine()) != null) {

                if (line.startsWith("#CHROM")) {
                    StringBuilder bglref_header = new StringBuilder();
                    StringBuilder bglgeno_header = new StringBuilder();
                    String[] parts = line.split("\t");
                    //samples begin at 9
                    number_of_individuals = parts.length - 9;
                    List<Integer> index_of_individuals = new ArrayList<Integer>();
                    for (int i = 0; i < number_of_individuals; i++) {
                        index_of_individuals.add(i);
                    }
                    Collections.shuffle(index_of_individuals);
                    if (pgeno >= 0) {
                        // convert pgeno into ageno
                        ageno = (int) Math.ceil(number_of_individuals * pgeno);
                    }
                    for (int i = 0; i < ageno; i++) {
                        individuals_to_genotype.add(index_of_individuals.get(i));
                    }

                    bglref_header.append(parts[0]);
                    bglgeno_header.append(parts[0]);
                    for (int i = 1; i < 9; i++) {
                        bglref_header.append("\t" + parts[i]);
                        bglgeno_header.append("\t" + parts[i]);
                    }
                    for (int i = 9; i < parts.length; i++) {
                        int index = i - 9;
                        if (individuals_to_genotype.contains(index)) {
                            bglgeno_header.append("\t" + parts[i]);
                        } else {
                            bglref_header.append("\t" + parts[i]);
                        }
                    }
                    out = Files.newBufferedWriter(
                            Paths.get(output_dir + "/" + ageno + "_geno" + Paths.get(filename).getFileName()),
                            Charset.forName("UTF-8"));
                    out_bgl_ref = Files.newBufferedWriter(
                            Paths.get(
                                    output_dir + "/" + ageno + "_geno_bglref" + Paths.get(filename).getFileName()),
                            Charset.forName("UTF-8"));
                    out_bgl_geno = Files.newBufferedWriter(
                            Paths.get(output_dir + "/" + ageno + "_geno_bgl" + Paths.get(filename).getFileName()),
                            Charset.forName("UTF-8"));
                    out.write(header.toString());
                    out_bgl_ref.write(header.toString());
                    out_bgl_geno.write(header.toString());
                    out.write(line + System.getProperty("line.separator"));
                    out_bgl_geno.write(bglgeno_header + System.getProperty("line.separator"));
                    out_bgl_ref.write(bglref_header + System.getProperty("line.separator"));
                } else if (line.startsWith("#")) {
                    header.append(line + System.getProperty("line.separator"));
                } else {
                    // format at 8
                    String[] parts = line.split("\t");
                    String[] parts2 = parts[8].split(":");
                    int gt_pos = 0;
                    for (int i = 1; i < parts2.length; i++) {
                        if (parts2[i].equals("GT"))
                            gt_pos = i;
                    }
                    out.write(parts[0]);
                    out_bgl_geno.write(parts[0]);
                    out_bgl_ref.write(parts[0]);
                    for (int i = 1; i < parts.length; i++) {
                        if (i > 8) {
                            parts2 = parts[i].split(":");
                            if (gt_pos == 0 && individuals_to_genotype.contains(i - 9)) {
                                out.write("\t" + convertToGT(parts2[0]));
                                out_bgl_geno.write("\t" + convertToGT(parts2[0]));
                            } else {
                                out.write("\t" + parts2[0]);
                                out_bgl_ref.write("\t" + parts2[0]);
                            }
                            for (int j = 1; j < parts2.length; j++) {
                                if (gt_pos == j && individuals_to_genotype.contains(i - 9)) {
                                    out.write(":" + convertToGT(parts2[i]));
                                    out_bgl_geno.write(":" + convertToGT(parts2[i]));
                                } else {
                                    out.write(":" + parts2[i]);
                                    out_bgl_ref.write(":" + parts2[i]);
                                }
                            }
                        } else {
                            out.write("\t" + parts[i]);
                            out_bgl_ref.write("\t" + parts[i]);
                            out_bgl_geno.write("\t" + parts[i]);
                        }
                    }
                    out.write(System.getProperty("line.separator"));
                    out_bgl_ref.write(System.getProperty("line.separator"));
                    out_bgl_geno.write(System.getProperty("line.separator"));
                }
            }
            out.close();
            out_bgl_ref.close();
            out_bgl_geno.close();
        } catch (IOException e) {
            System.err.println("could not read from file " + filename);
            e.printStackTrace();
        }
    }

    /**
     * @param string
     * @return
     */
    private String convertToGT(String gt_field) {
        gt_string_builder.setLength(0);
        gt_parts = gt_field.split("\\||/");
        gt_string_builder.append(gt_parts[0]);
        for (int i = 1; i < gt_parts.length; i++) {
            gt_string_builder.append("/" + gt_parts[i]);
        }
        return gt_string_builder.toString();
    }

    /**
     * 
     */
    private void mask() {
        StringBuilder header = new StringBuilder();
        try (BufferedReader in = Files.newBufferedReader(Paths.get(filename), Charset.forName("UTF-8"))) {
            BufferedWriter out = null;

            String line = null;
            int number_of_individuals = 0;
            Set<Integer> individuals_to_genotype = new HashSet<Integer>();
            while ((line = in.readLine()) != null) {

                if (line.startsWith("#CHROM")) {
                    //samples begin at 9
                    number_of_individuals = line.split("\t").length - 9;
                    List<Integer> index_of_individuals = new ArrayList<Integer>();
                    for (int i = 0; i < number_of_individuals; i++) {
                        index_of_individuals.add(i);
                    }
                    Collections.shuffle(index_of_individuals);
                    if (pmask >= 0) {
                        // convert pgeno into ageno
                        amask = (int) Math.ceil(number_of_individuals * pmask);
                    }
                    for (int i = 0; i < amask; i++) {
                        individuals_to_genotype.add(index_of_individuals.get(i));
                    }
                    out = Files.newBufferedWriter(
                            Paths.get(output_dir + "/" + amask + "_mask" + Paths.get(filename).getFileName()),
                            Charset.forName("UTF-8"));
                    out.write(header.toString());
                    out.write(line + System.getProperty("line.separator"));
                } else if (line.startsWith("#")) {
                    header.append(line + System.getProperty("line.separator"));
                } else {
                    // format at 8
                    String[] parts = line.split("\t");
                    String[] parts2 = parts[8].split(":");
                    int gt_pos = 0;
                    for (int i = 1; i < parts2.length; i++) {
                        if (parts2[i].equals("GT"))
                            gt_pos = i;
                    }
                    out.write(parts[0]);
                    for (int i = 1; i < parts.length; i++) {
                        if (i > 8) {
                            parts2 = parts[i].split(":");
                            if (gt_pos == 0 && individuals_to_genotype.contains(i - 8)) {
                                out.write("\t" + maskAlleles(parts2[0]));
                            } else
                                out.write(parts2[0]);
                            for (int j = 1; j < parts2.length; j++) {
                                if (gt_pos == j && individuals_to_genotype.contains(i - 8)) {
                                    out.write(":" + maskAlleles(parts2[i]));
                                } else
                                    out.write(":" + parts2[i]);
                            }
                        } else {
                            out.write("\t" + parts[i]);
                        }
                    }
                    out.write(System.getProperty("line.separator"));
                }
            }
        } catch (IOException e) {
            System.err.println("could not read from file " + filename);
            e.printStackTrace();
        }
    }

    /**
     * @param string
     * @return
     */
    private String maskAlleles(String gt_field) {
        gt_string_builder.setLength(0);
        gt_parts = gt_field.split("\\||/");
        gt_string_builder.append(".");
        for (int i = 1; i < gt_parts.length; i++) {
            gt_string_builder.append("/.");
        }
        return gt_string_builder.toString();
    }

    /**
     * 
     */
    private void split() {
        StringBuilder header = new StringBuilder();
        LineBuilder lines = new LineBuilder(window_size);
        try (BufferedReader in = Files.newBufferedReader(Paths.get(filename), Charset.forName("UTF-8"))) {
            int cnt = 0;
            int filecnt = 0;
            String line = null;
            while ((line = in.readLine()) != null) {
                if (line.startsWith("#")) {
                    header.append(line + System.getProperty("line.separator"));
                } else {
                    cnt++;
                    lines.addLine(line);
                    if (cnt == window_size) {
                        cnt = overlap_size;
                        lines.printFile(header.toString(), output_dir + filename.substring(0, filename.indexOf('.'))
                                + "_" + window_size + "_" + overlap_size + "_" + (filecnt++) + ".vcf");
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("could not read from file " + filename);
            e.printStackTrace();
        }
    }

    private class LineBuilder {
        String[] lines;
        int startPos = 0;

        public LineBuilder(int line_count) {
            super();
            lines = new String[line_count];
        }

        public void addLine(String line) {
            lines[startPos] = line;
            startPos++;
            startPos = startPos % window_size;
        }

        public void printFile(String header, String filename) {

            try (BufferedWriter out = Files.newBufferedWriter(Paths.get(filename), Charset.forName("UTF-8"))) {
                out.write(header);

                for (int i = startPos; i < startPos + lines.length; i++) {
                    out.write(lines[i % lines.length] + System.getProperty("line.separator"));
                }
            } catch (IOException e) {
                System.err.println("could not write to file " + filename);
                e.printStackTrace();
            }
        }
    }

}