nl.systemsgenetics.cellTypeSpecificAlleleSpecificExpression.ReadGenoAndAsFromIndividual.java Source code

Introduction

Here is the source code for nl.systemsgenetics.cellTypeSpecificAlleleSpecificExpression.ReadGenoAndAsFromIndividual.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package nl.systemsgenetics.cellTypeSpecificAlleleSpecificExpression;

import htsjdk.samtools.CigarElement;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;

import java.util.HashMap;

import java.util.Set;
import org.molgenis.genotype.Alleles;
import org.molgenis.genotype.RandomAccessGenotypeData;
import org.molgenis.genotype.trityper.TriTyperGenotypeData;
import org.molgenis.genotype.variant.GeneticVariant;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import java.io.PrintWriter;
import java.util.ArrayList;
import org.apache.commons.io.FilenameUtils;
import org.jdom.IllegalDataException;
import org.molgenis.genotype.vcf.VcfGenotypeData;

/**
 *
 * @author Adriaan van der Graaf
 */
public class ReadGenoAndAsFromIndividual {

    public static void readGenoAndAsFromIndividual(String loc_of_bam1, String genotype_loc,
            String coupling_location, String outputLocation, String snpLocation) throws IOException, Exception {

        if (GlobalVariables.verbosity >= 1) {
            //Print ASREADS header
            System.out.println("---- Starting ASREADS for the following settings: ----");
            System.out.println("\t input bam:         " + loc_of_bam1);
            System.out.println("\t genotype location: " + genotype_loc);
            System.out.println("\t coupling file:     " + coupling_location);
            System.out.println("\t output location:   " + outputLocation);
            if (!snpLocation.equals("")) {
                System.out.println("\t snp Location:      " + snpLocation);
            } else {
                System.out.println("\t snp Location:      " + "NONE");
            }

            System.out.println("------------------------------------------------------");
        }

        //parse command line arguments
        String loc_of_bam;
        loc_of_bam = loc_of_bam1;
        System.out.println("Location of bam file: ");
        System.out.println(loc_of_bam);

        if (!new File(loc_of_bam).exists()) {
            throw new IllegalArgumentException("ERROR! Location of bam file is not an existing file. Exitting.");
        } else {
            if (GlobalVariables.verbosity >= 10) {
                System.out.println("Location of bam file is an existing file, will continue.");
            }
        }

        RandomAccessGenotypeData TTdataSet;
        VcfGenotypeData VCFdataSet;
        HashMap<String, GeneticVariant> variantIdMap;
        String[] individual_names;

        String tabixLoc = genotype_loc + ".tbi";

        //open vcf dataset
        //based on extention and existance of both files. 
        if (FilenameUtils.isExtension(genotype_loc, "gz") && new File(tabixLoc).exists()
                && new File(genotype_loc).exists()) {
            try {
                VCFdataSet = new VcfGenotypeData(new File(genotype_loc), new File(tabixLoc), 0.99);
                variantIdMap = VCFdataSet.getVariantIdMap();
                individual_names = VCFdataSet.getSampleNames();
            } catch (IOException ex) {
                System.err.println("Error reading vcf dataset: " + genotype_loc);
                throw new IllegalArgumentException();
            }

        } else if (new File(genotype_loc + "/GenotypeMatrix.dat").exists()) {
            //assuming trityper dataset based on the genotype matrix
            try {
                TTdataSet = new TriTyperGenotypeData(new File(genotype_loc));
                variantIdMap = TTdataSet.getVariantIdMap();
                individual_names = TTdataSet.getSampleNames();
            } catch (IOException ex) {
                System.err.println("Error reading trityper dataset: " + genotype_loc);
                throw new IllegalArgumentException();
            }

        } else {
            throw new IllegalDataException("could not find a Trityper or vcf file in the genotype location");
        }

        //get the variants in the variantIdMAP

        Set<String> snpNames = variantIdMap.keySet();

        ArrayList<String> SNPsToAnalyze;
        SNPsToAnalyze = new ArrayList<String>();

        //If available, read the file with rs numbers.
        if (!snpLocation.equals("")) {

            ArrayList<String> includeSNPs = UtilityMethods.readFileIntoStringArrayList(snpLocation);

            int snpsNotFound = 0;

            for (String snp_to_include : includeSNPs) {
                if (snpNames.contains(snp_to_include)) {
                    SNPsToAnalyze.add(snp_to_include);
                } else {
                    snpsNotFound++;
                }
            }

            if (GlobalVariables.verbosity >= 1) {
                System.out.println("WARNING: Did not find " + Integer.toString(snpsNotFound) + " out of "
                        + Integer.toString(includeSNPs.size()) + " SNPs in the include file.");
            }
        } else {
            for (String snp_to_include : snpNames) {
                SNPsToAnalyze.add(snp_to_include);
            }
        }

        //String path = "/gcc/groups/lld/tmp01/projects/bamFiles/";
        //sample_map contains all the individuals that are in the sample file.
        HashMap sample_map = convert_individual_names(individual_names, coupling_location);

        if (GlobalVariables.verbosity >= 10) {
            System.out.println("Sample names were loaded.");
        }
        if (GlobalVariables.verbosity >= 100) {
            System.out.println(sample_map.toString());
        }

        //Twice because my files have the .MERGED.sorted.bam suffix attached to them.
        String sample_name = FilenameUtils
                .getBaseName(FilenameUtils.getBaseName(FilenameUtils.getBaseName(loc_of_bam)));

        if (GlobalVariables.verbosity >= 10) {
            System.out.println("sample_name: " + sample_name);
            System.out.println("sample_map:  " + sample_map.toString());
        }

        Object sample_idx = sample_map.get(sample_name);

        if (sample_idx == null) {
            throw new IllegalArgumentException("Couldn't find the filename in the sample names. Quitting.");
        }

        int sample_index = Integer.parseInt(sample_idx.toString());

        if (GlobalVariables.verbosity >= 10) {
            System.out.println("sample_index: " + sample_index);
        }

        //bam file path and filename
        String path_and_filename = loc_of_bam;
        File sample_file = new File(path_and_filename);

        SamReader bam_file = SamReaderFactory.makeDefault().open(sample_file);

        if (GlobalVariables.verbosity >= 10) {
            System.out.println("Initialized for reading bam file");
        }

        PrintWriter writer = new PrintWriter(outputLocation, "UTF-8");

        int i = 0;
        for (String i_snp : SNPsToAnalyze) {

            //System.out.println(i_snp);

            GeneticVariant this_variant = variantIdMap.get(i_snp);

            String chromosome = this_variant.getSequenceName();
            String position = String.valueOf(this_variant.getStartPos());

            // We only do analyses if we find a SNP and it is biallelic
            // However this is trityper data, so if we use
            // the allele count is used for the check of something. 

            //DO NOT ENTER A SEPARATED GENOMIC DATASET OTHERWISE THIS WILL BREAK.
            if (this_variant.isSnp() & this_variant.isBiallelic()) {

                String row_of_table = get_allele_specific_overlap_at_snp(this_variant, sample_index, chromosome,
                        position, bam_file);

                //commented out the phasing part.

                writer.println(chromosome + "\t" + position + "\t" + i_snp + "\t"
                        + this_variant.getVariantAlleles().getAllelesAsChars()[0] + "\t"
                        + this_variant.getVariantAlleles().getAllelesAsChars()[1] + "\t" + row_of_table + "\t"
                        + Arrays.toString(this_variant.getSampleVariants().get(sample_index).getAllelesAsChars()) //+ "\t" +
                //Boolean.toString(this_variant.getSamplePhasing().get(sample_index))
                );
            }

            i++;

            if ((i % 10000 == 0) && (GlobalVariables.verbosity >= 10)) {

                System.out.println("Finished " + Integer.toString(i) + " SNPs");

            }

        }
        writer.close();
    }

    public static String get_allele_specific_overlap_at_snp(GeneticVariant this_variant, int sample_index,
            String chromosome, String position, SamReader bam_file) {

        int pos_int = Integer.parseInt(position);

        Alleles all_variants = this_variant.getVariantAlleles();
        Character ref_allele_char = all_variants.getAllelesAsChars()[0];
        String ref_allele = ref_allele_char.toString();
        //System.out.println("ref_allele: " + ref_allele);
        Character alt_allele_char = all_variants.getAllelesAsChars()[1];
        String alt_allele = alt_allele_char.toString();
        //System.out.println("alt_allele: " + alt_allele);

        int ref_overlap = 0;
        int alt_overlap = 0;
        int no__overlap = 0;

        // now determine per individual the sample variants.
        // I'm assuming the ordering is the same as the individual names created 
        // by the  getSampleNames() method. 
        // Otherwise the data will be nicely permuted, and I will have to convert some stuff.

        int i = 0;

        String temp_string;

        int position_of_snp = Integer.parseInt(position);

        SAMRecordIterator all_reads_in_region;
        all_reads_in_region = bam_file.queryOverlapping(chromosome, position_of_snp, position_of_snp);

        // Right now assuming the above iterator provides me with reads.
        // Otherwise, I don't know.

        String bases = "";

        while (all_reads_in_region.hasNext()) {

            SAMRecord read_in_region = all_reads_in_region.next();

            Character base_in_read = get_base_at_position(read_in_region, pos_int);
            //System.out.println("base_in_read: " + base_in_read);

            if (base_in_read == ref_allele.charAt(0)) {
                ref_overlap++;
            } else if (base_in_read == alt_allele.charAt(0)) {
                alt_overlap++;
            } else if (base_in_read == '!' || base_in_read == 'N') {
                continue;
            } else {
                no__overlap++;

            }

        }

        //This line below cost me a day to figure out the error.
        all_reads_in_region.close();

        String string_for_output;
        string_for_output = Integer.toString(ref_overlap) + "\t" + Integer.toString(alt_overlap) + "\t"
                + Integer.toString(no__overlap);

        return (string_for_output);
    }

    public static Character get_base_at_position(SAMRecord sam_read, int pos_int) {

        int curr_pos = sam_read.getAlignmentStart();
        int end_pos = sam_read.getAlignmentEnd();

        String read = sam_read.getReadString();
        int idx_of_read = 0;
        int idx_of_cigar = 0;

        if (GlobalVariables.verbosity >= 100) {
            System.out.println("positions");
            System.out.println("curr_pos: " + Integer.toString(curr_pos));
            System.out.println("end_pos: " + Integer.toString(end_pos));
            System.out.println("SNP pos: " + Integer.toString(pos_int));
        }

        while (curr_pos <= end_pos) {

            CigarElement cigar = sam_read.getCigar().getCigarElement(idx_of_cigar);
            int final_pos;
            switch (cigar.getOperator()) {
            case D:
                curr_pos += cigar.getLength();
                if (GlobalVariables.verbosity >= 100) {
                    System.out.println("Found D in cigar: ");
                    System.out.println("current_pos + " + Integer.toString(cigar.getLength()));
                    System.out.println("final curr_pos: " + Integer.toString(curr_pos));

                }
                break;
            case EQ:
                final_pos = curr_pos + cigar.getLength();
                if (GlobalVariables.verbosity >= 100) {
                    System.out.println("Found EQ in cigar: ");
                    System.out.println();
                    System.out.println("index_of_read + " + Integer.toString(cigar.getLength()));
                    System.out.println("final idx_of_read: " + Integer.toString(idx_of_read));
                    System.out.println();
                    System.out.println("current_pos + " + Integer.toString(cigar.getLength()));
                    System.out.println("final curr_pos: " + Integer.toString(curr_pos));
                    System.out.println("\nStarting iteration:");

                }
                while (curr_pos < final_pos) {
                    if (GlobalVariables.verbosity >= 100) {
                        System.out.println("index is now:" + Integer.toString(idx_of_read));
                        System.out.println("ref pos is now:" + Integer.toString(curr_pos));
                        System.out.println("The base at this pos is:" + read.charAt(idx_of_read));
                    }

                    if (curr_pos == pos_int) {
                        //                            System.out.println("index is now:" + Integer.toString(idx_of_read));
                        //                            System.out.println("ref pos is now:" + Integer.toString(curr_pos));
                        //                            System.out.println("The base at this pos is:" + read.charAt(idx_of_read));
                        return (read.charAt(idx_of_read));
                    }

                    curr_pos++;
                    idx_of_read++;
                }
                break;
            case H:
                if (GlobalVariables.verbosity >= 10) {
                    System.out.println("found H line in Cigar, skipping read");
                }

                return ('!');
            case I:
                idx_of_read += cigar.getLength();
                if (GlobalVariables.verbosity >= 100) {
                    System.out.println("Found I in cigar: ");
                    System.out.println("index_of_read + " + Integer.toString(cigar.getLength()));
                    System.out.println("final idx_of_read: " + Integer.toString(idx_of_read));

                }
                break;
            case M:
                final_pos = curr_pos + cigar.getLength();
                if (GlobalVariables.verbosity >= 100) {
                    System.out.println("Found M in cigar: ");
                    System.out.println();
                    System.out.println("index_of_read + " + Integer.toString(cigar.getLength()));
                    System.out.println("final idx_of_read: " + Integer.toString(idx_of_read));
                    System.out.println();
                    System.out.println("current_pos + " + Integer.toString(cigar.getLength()));
                    System.out.println("final curr_pos: " + Integer.toString(curr_pos));
                    System.out.println("\nStarting iteration:");

                }
                while (curr_pos < final_pos) {
                    if (GlobalVariables.verbosity >= 100) {
                        System.out.println("index is now:" + Integer.toString(idx_of_read));
                        System.out.println("ref pos is now:" + Integer.toString(curr_pos));
                        System.out.println("The base at this pos is:" + read.charAt(idx_of_read));
                    }

                    if (curr_pos == pos_int) {
                        //                            System.out.println("index is now:" + Integer.toString(idx_of_read));
                        //                            System.out.println("ref pos is now:" + Integer.toString(curr_pos));
                        //                            System.out.println("The base at this pos is:" + read.charAt(idx_of_read));
                        return (read.charAt(idx_of_read));
                    }
                    curr_pos++;
                    idx_of_read++;
                }
                break;
            case N:
                curr_pos += cigar.getLength();
                if (GlobalVariables.verbosity >= 100) {
                    System.out.println("Found N in cigar: ");
                    System.out.println("current_pos + " + Integer.toString(cigar.getLength()));
                    System.out.println("final curr_pos: " + Integer.toString(curr_pos));

                }
                break;
            case P:
                curr_pos += cigar.getLength();
                if (GlobalVariables.verbosity >= 100) {
                    System.out.println("Found P in cigar: ");
                    System.out.println("current_pos + " + Integer.toString(cigar.getLength()));
                    System.out.println("final curr_pos: " + Integer.toString(curr_pos));

                }
                break;
            case S:
                idx_of_read += cigar.getLength();
                if (GlobalVariables.verbosity >= 100) {
                    System.out.println("Found S in cigar: ");
                    System.out.println("index_of_read + " + Integer.toString(cigar.getLength()));
                    System.out.println("final idx_of_read: " + Integer.toString(idx_of_read));

                }
                break;
            case X:
                curr_pos += cigar.getLength();
                idx_of_read += cigar.getLength();
                if (GlobalVariables.verbosity >= 100) {

                    System.out.println("Found X in cigar: ");
                    System.out.println();
                    System.out.println("index_of_read + " + Integer.toString(cigar.getLength()));
                    System.out.println("final idx_of_read: " + Integer.toString(idx_of_read));
                    System.out.println();
                    System.out.println("current_pos + " + Integer.toString(cigar.getLength()));
                    System.out.println("final curr_pos: " + Integer.toString(curr_pos));

                }
                break;

            }
            idx_of_cigar++;

            if (curr_pos - 1 > end_pos) {
                System.out.println("Something goes wrong when bases based on cigar.");
                System.out.println(Integer.toString(curr_pos) + ">" + Integer.toString(end_pos));
            }

        }
        //couldn't find anything, returning: '!'
        if (GlobalVariables.verbosity >= 100) {
            //System.out.println("### coulnd't find overlapping base in this read. ###");
        }
        return ('!');
    }

    /**
     *
     * @param individual_names
     * @param coupling_location
     * @return
     * @throws FileNotFoundException
     * @throws IOException
     * 
     */

    public static HashMap convert_individual_names(String[] individual_names, String coupling_location)
            throws FileNotFoundException, IOException {

        String coupling_loc = coupling_location;

        //This will be filled while reading the file
        ArrayList<String> sample_names_in_file = new ArrayList<String>();
        ArrayList<String> individual_names_in_file = new ArrayList<String>();

        //This will be the one that is later returned
        HashMap ordered_sample_names = new HashMap();

        File coupling_file = new File(coupling_loc);

        FileInputStream fis;
        BufferedInputStream bis;
        DataInputStream dis;

        fis = new FileInputStream(coupling_file);

        // Here BufferedInputStream is added for fast reading.
        bis = new BufferedInputStream(fis);
        dis = new DataInputStream(bis);

        // dis.available() returns 0 if the file does not have more lines.
        int i = 0;
        while (dis.available() != 0) {

            String[] curr_line = dis.readLine().split("\t");
            individual_names_in_file.add(i, curr_line[0]);
            sample_names_in_file.add(i, curr_line[1]);

            //print the individual al line for checking.
            //System.out.println("line: " + " " + curr_line[0] +" " + curr_line[1] );

        }

        // dispose all the resources after using them.
        fis.close();
        bis.close();
        dis.close();

        i = 0;
        for (String i_name : individual_names) {
            int index = individual_names_in_file.indexOf(i_name);

            if (index >= 0) {
                //We find a name in the genotype folder in the individual names stuff.
                ordered_sample_names.put(sample_names_in_file.get(index), i);
            }
            i++;

        }
        return (ordered_sample_names);

    }

}