org.ut.medsavant.annotation.format.MedSavantAnnotationFormatter.java Source code

Introduction

Here is the source code for org.ut.medsavant.annotation.format.MedSavantAnnotationFormatter.java
Source

/**
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * This is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this software; if not, write to the Free
 * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
 */
package org.ut.medsavant.annotation.format;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.regex.Pattern;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import net.sf.samtools.util.BlockCompressedInputStream;
import net.sf.samtools.util.BlockCompressedOutputStream;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.commons.lang3.ArrayUtils;
import org.broad.tabix.TabixWriter;
import org.ut.biolab.medsavant.shared.util.IOUtils;

/**
 * Command-line tool that converts annotation source files found in the current directory into
 * per-file output directories (data file plus XML summary) and zips each directory.
 * <p>
 * Three input kinds are recognised by file extension:
 * <ul>
 * <li>{@code .idf} - opened as an SQLite database and exported to a {@code .tab} file;</li>
 * <li>{@code .tsv} - block-compressed, tabix-indexed, with field names taken from {@code #} header lines;</li>
 * <li>{@code .txt} - block-compressed, tabix-indexed, with the column layout detected by matching the first
 * line against the patterns in {@link TXT_FORMATS}.</li>
 * </ul>
 *
 * @author mfiume
 */
public class MedSavantAnnotationFormatter {
    private static final String IDF_EXT = ".idf";

    private static final String TSV_EXT = ".tsv";

    private static final String TXT_EXT = ".txt";

    private static final Charset LATIN1 = Charset.forName("ISO-8859-1");

    /** First character of a header line in a {@code .tsv} file. */
    private static final char HEADER_CHAR = '#';

    /** Column separator used by every supported text format. */
    private static final String DELIM = "\t";

    /**
     * Maximum number of rows exported from the {@code interval} table of an IDF database.
     * <p>
     * NOTE(review): the original loop broke out as soon as it reached the second row, so only the first
     * row was ever exported. That looks like a debugging leftover, but the intent cannot be confirmed
     * from this file, so the limit is preserved here as an explicit constant. Raise it (e.g. to
     * {@code Integer.MAX_VALUE}) once confirmed.
     */
    private static final int IDF_MAX_ROWS = 1;

    /**
     * Known column layouts for header-less {@code .txt} annotation files. Each constant pairs a regular
     * expression that an entire line must match with the column names assigned to files in that layout.
     */
    private static enum TXT_FORMATS {
        F1("(?:[0-9]{1,2}|X|Y)\t\\d++\t[ACGTN]++\t[ACGTN0-9]++\t[0-9.]++\t[.;\\d\\w]++",
                new String[] { "Chr", "Start", "Ref", "Obs", "Freq", "Name" }),

        F2("(?:[0-9]{1,2}|X|Y)\t\\d++\t\\d++\t[ACGT]\t[ACGT]\t[-\\d.E]++\t(?:\\w|NA)",
                new String[] { "Chr", "Start", "End", "Ref", "Obs", "Freq", "C" }),

        F3("(?:[0-9]{1,2}|X|Y)\t\\d++\t\\d++\t[ACGT]*+\t[ACGT]++\t[\\d.]++(?:\t[\\w*]){2}",
                new String[] { "Chr", "Start", "End", "Ref", "Obs", "Freq", "O1", "O2" }),

        F4("(?:[0-9]{1,2}|X|Y|M)\t\\d++\t\\d++\t[-ACGT]++\t[-ACGT]++\t[\\d.]++",
                new String[] { "Chr", "Start", "End", "Ref", "Obs", "Freq" }),

        F5("(?:[0-9]{1,2}|X|Y)\t\\d++\t\\d++\t[-ACGTR0-9]*+\t[-A-Y]*+\t[^\t]++",
                new String[] { "Chr", "Start", "End", "Ref", "Obs", "Comment" }),

        F6("[0-9]++\tchr[^\t]++\t\\d++\t\\d++\tlod=\\d++\t\\d++",
                new String[] { "Bin", "Chr", "Start", "End", "LOD", "N" }),

        F7("[0-9]++\tchr[^\t]++\t\\d++\t\\d++\tchr[^\t]++\t0\t[+-]\tchr[^\t]++(\t\\d++){5}(\tN/A){4}"
                + "\t[^\t]++(\t\\d++){8}(\t[-\\d.e]++){4}",
                new String[] { "Bin", "Chr", "Start", "End", "OtherRef", "0", "Strand", "OrthoChr", "OrthoStart",
                        "OrthoEnd", "Pos?", "N0", "1000", "NA", "NA", "NA", "NA", "AlignFile", "N1", "N2", "N3",
                        "N4", "N5", "N6", "N7", "N8", "F1", "F2", "F3", "F4" }),

        F8("chr[^\t]++\t\\d++\t\\d++\t[^\t]++\t1000\t[+-](\t\\d++){4}(?:\t[\\d,]++){2}\t(?:PF\\d{5};)?",
                new String[] { "Chr", "Start", "End", "Name", "1000", "Strand", "Start2", "End2", "N1", "N2",
                        "Array1", "Array2", "Pfam" }),

        F9("\\d++\tN[MR]_\\d++\tchr[^\t]++\t[+-](?:\t\\d++){5}(?:\t[\\d,]++){2}\t0\t[^\t]++(?:\t(unk|cmpl|incmpl)){2}\t[\\d,-]++",
                new String[] { "N1", "Idx", "Chr", "Strand", "Start", "End", "Start2?", "End2?", "N2", "Array1",
                        "Array2", "0", "Name", "Status1", "Status2", "Array3" }),

        F10("[0-9]++\tchr[^\t]++\t\\d++\t\\d++\trs\\d++\t0\t[+-](\t[^\t]++){3}\t"
                + "(genomic|cDNA|unknown)\t(single|insertion|deletion|in-del|mnp|mixed|named|microsatellite|het)\t(unknown|by-[\\d\\w-]++|,)++"
                + "(\t[\\d.]++){2}\t"
                + "((unknown|near-gene-\\d++|splice-\\d++|untranslated-\\d++|ncRNA|intron|coding-synon|missense|frameshift|nonsense|cds-indel|stop-loss|intron),?)++\t"
                + "(between|exact|range|rangeDeletion|rangeInsertion|rangeSubstitution|fuzzy)"
                + "(\t\\d*+\t[^\t]*+){3}(\t[\\d.,]*+){2}\t[^\t]*+",
                new String[] { "Bin", "Chr", "Start", "End", "RefSNP", "Score", "Strand", "Allele1", "Allele2",
                        "Observed", "DNAType", "ChangeType", "Source", "F1", "F2", "Effect", "Location", "N1",
                        "Obs", "N2", "Dataset", "N3", "C", "Array1", "Array2", "Obs" }),

        F11("uc[0-9]{3}[a-z]{3}\\.[1-4]\tchr[^\t]++\t[+-](\t\\d++){5}(\t(\\d++,)++){2}\t[^\t]*+\tuc[0-9]{3}[a-z]{3}\\.[1-4]",
                new String[] { "UCSC", "Chr", "Strand", "Start", "End", "Pos1", "Pos2", "N1", "Array1", "Array2",
                        "UniProtID", "UCSC2" });

        // F12("uc[0-9]{3}[a-z]{3}\\.[1-4](\t[^\t]*+){6}\t[^\t]++\t(RF\\d{5})?\t(chr[^\t]++)?", new String[] {"UCSC",
        // "ID1", "ID2","ID3", "ID4", "ID5", "ID6", "Desc", "Rfam", "Chr"});

        /** Compiled pattern that a full line must match for a file to be considered in this layout. */
        final Pattern pattern;

        /** Column names, in file order, reported as the field names for this layout. */
        final String[] columns;

        /**
         * Tabix configuration derived from {@link #columns}: the positions passed are the index of the
         * first "Chr", "Start" and "End" entry plus one, so a layout lacking one of them passes 0 there.
         */
        final TabixWriter.Conf tabixConf;

        private TXT_FORMATS(String pattern, String[] columns) {
            this.pattern = Pattern.compile(pattern);
            this.columns = columns;
            this.tabixConf = new TabixWriter.Conf(0, ArrayUtils.indexOf(columns, "Chr") + 1,
                    ArrayUtils.indexOf(columns, "Start") + 1, ArrayUtils.indexOf(columns, "End") + 1, '#', 0);
        }
    }

    /**
     * Processes every {@code .idf}, {@code .tsv} and {@code .txt} file in the current directory. A failure
     * on one file is reported on standard error and does not stop the remaining files from being processed.
     *
     * @param args the command line arguments (not used)
     */
    public static void main(String[] args) throws Exception {
        String rootpath = "./";

        String[] children = new File(rootpath)
                .list(new SuffixFileFilter(new String[] { IDF_EXT, TSV_EXT, TXT_EXT }));
        if (children == null) {
            // File.list returns null when the path is not a directory or an I/O error occurs.
            System.err.println("Unable to list files in " + rootpath);
            return;
        }

        for (String child : children) {
            System.out.println("Processing " + child);
            try {
                processFile(new File(rootpath + child));
            } catch (Exception e) {
                System.err.println("Error processing " + child);
                e.printStackTrace();
            }
        }
    }

    /**
     * Prints the given values on one line, each followed by a tab (including the last), then ends the line.
     *
     * @param string the values to print
     * @param os the stream to print to
     */
    public static void output(Object[] string, PrintStream os) {
        for (Object s : string) {
            os.print(s + DELIM);
        }
        os.println();
    }

    /**
     * Converts a single input file: creates a sibling directory named after the file (minus extension),
     * writes the data file and XML summary into it, then zips that directory to a sibling {@code .zip}.
     *
     * @param file the input file; its extension selects the conversion
     * @throws IllegalArgumentException if the extension is not one of the supported ones
     * @throws IOException if the output directory cannot be created
     * @throws Exception if the conversion or zipping fails
     */
    private static void processFile(File file) throws Exception {
        String fileName = file.getName();

        boolean isIDF = fileName.endsWith(IDF_EXT);
        boolean isTSV = !isIDF && fileName.endsWith(TSV_EXT);
        boolean isTXT = !isIDF && !isTSV && fileName.endsWith(TXT_EXT);
        if (!isIDF && !isTSV && !isTXT) {
            // Fail before creating anything rather than zipping an empty directory.
            throw new IllegalArgumentException("Unsupported annotation file type: " + fileName);
        }

        String truncatedFileName = FilenameUtils.removeExtension(fileName);

        File outDir = new File(file.getParent(), truncatedFileName);
        if (!outDir.isDirectory() && !outDir.mkdir()) {
            throw new IOException("Unable to create output directory " + outDir);
        }

        File xmlFile = new File(outDir, truncatedFileName + ".xml");

        if (isIDF) {
            processIDFFile(file, new File(outDir, truncatedFileName + ".tab"), xmlFile);
        } else if (isTSV) {
            processTSVFile(file, new File(outDir, truncatedFileName + ".gz"), xmlFile);
        } else {
            processTXTFile(file, new File(outDir, truncatedFileName + ".gz"), xmlFile);
        }

        IOUtils.zipDirectory(outDir, new File(file.getParent(), truncatedFileName + ".zip"));
    }

    /**
     * Converts a {@code .tsv} file: block-compresses it into {@code dataFile}, builds a tabix index using
     * columns 1, 2 and 3, then feeds every line to an {@link AnnotationWriter}. Lines starting with
     * {@code #} supply the field names (the {@code #} is stripped); all other lines are data lines.
     *
     * @param file the input file
     * @param dataFile the block-compressed output file
     * @param xmlFile the XML summary output file
     * @throws Exception if the file name does not contain {@code GRCh_37} (the only recognised
     *             reference marker), or if any I/O, indexing or summary step fails
     */
    private static void processTSVFile(File file, File dataFile, File xmlFile) throws ClassNotFoundException,
            IOException, SQLException, ParserConfigurationException, TransformerException, Exception {
        bgZipFile(file, dataFile);
        createTabixIndex(dataFile, new TabixWriter.Conf(0, 1, 2, 3, '#', 0));

        String annotationName = FilenameUtils.removeExtension(file.getName());

        String reference;
        if (annotationName.contains("GRCh_37")) {
            reference = "hg19";
        } else {
            throw new Exception("Unknown reference from " + annotationName);
        }

        AnnotationWriter aw = new AnnotationWriter(dataFile, xmlFile);
        aw.setName(annotationName);
        aw.setReference(reference);
        aw.setVersion("1.0");

        BlockCompressedInputStream bcis = new BlockCompressedInputStream(dataFile);
        try {
            String line;
            while ((line = bcis.readLine()) != null) {
                // The length check keeps an empty line from crashing charAt(0); it is passed on as data.
                if (line.length() > 0 && line.charAt(0) == HEADER_CHAR) {
                    aw.setFieldNames(line.substring(1).split(DELIM, -1));
                } else {
                    aw.addLine(line.split(DELIM, -1));
                }
            }
        } finally {
            bcis.close();
        }

        aw.writeXMLSummary();
    }

    /**
     * Converts a header-less {@code .txt} file: block-compresses it into {@code dataFile} (unless that
     * file already exists), detects the column layout from the first line, builds the matching tabix
     * index and feeds every line to an {@link AnnotationWriter}. If the file is empty or its first line
     * matches no known layout, a message is printed and nothing further is done.
     *
     * @param file the input file; a name starting with {@code hg18} selects reference hg18, anything
     *            else hg19
     * @param dataFile the block-compressed output file
     * @param xmlFile the XML summary output file
     * @throws Exception if any I/O, indexing or summary step fails
     */
    private static void processTXTFile(File file, File dataFile, File xmlFile) throws Exception {
        if (!dataFile.exists()) {
            bgZipFile(file, dataFile);
        }

        String firstLine = readFirstLine(dataFile);
        TXT_FORMATS matchingFormat = (firstLine == null) ? null : detectFormat(firstLine);
        if (matchingFormat == null) {
            System.out.println("Unknown file format for [" + file.getCanonicalPath() + "]");
            return;
        }

        createTabixIndex(dataFile, matchingFormat.tabixConf);

        String fileName = file.getName();
        String reference = (fileName.startsWith("hg18") ? "hg18" : "hg19");

        AnnotationWriter aw = new AnnotationWriter(dataFile, xmlFile);
        aw.setName(FilenameUtils.removeExtension(fileName));
        aw.setReference(reference);
        aw.setVersion("1.0");
        aw.setFieldNames(matchingFormat.columns);

        BlockCompressedInputStream bcis = new BlockCompressedInputStream(dataFile);
        try {
            String line;
            while ((line = bcis.readLine()) != null) {
                aw.addLine(line.split(DELIM, -1));
            }
        } finally {
            bcis.close();
        }

        aw.writeXMLSummary();
    }

    /**
     * Converts an {@code .idf} file, which is opened as an SQLite database. The annotation name and
     * reference come from the first row of table {@code source}; the version is
     * {@code <version>.<minor_version>} from table {@code meta_data} (each defaulting to "0"); rows of
     * table {@code interval} are written through an {@link AnnotationWriter}, with the {@code data}
     * bytes decoded as ISO-8859-1. At most {@link #IDF_MAX_ROWS} rows are exported.
     *
     * @param file the SQLite input file
     * @param dataFile the data output file
     * @param xmlFile the XML summary output file
     * @throws ClassNotFoundException if the SQLite JDBC driver class is not available
     * @throws SQLException if a query fails or table {@code source} has no rows
     * @throws Exception if writing the data or the summary fails
     */
    private static void processIDFFile(File file, File dataFile, File xmlFile) throws ClassNotFoundException,
            IOException, SQLException, ParserConfigurationException, TransformerException, Exception {

        String path = file.getAbsolutePath();

        Class.forName("org.sqlite.JDBC");
        Connection conn = DriverManager.getConnection("jdbc:sqlite:" + path);
        try {
            String name;
            String build;
            // Closing a Statement also closes the ResultSet it produced.
            Statement nameStatement = conn.createStatement();
            try {
                ResultSet nameResultSet = nameStatement.executeQuery("SELECT name,build FROM source");
                if (!nameResultSet.next()) {
                    throw new SQLException("No rows in table 'source' of " + path);
                }
                name = nameResultSet.getString("name");
                build = nameResultSet.getString("build");
            } finally {
                nameStatement.close();
            }

            String version = "0";
            String minorVersion = "0";
            Statement versionStatement = conn.createStatement();
            try {
                ResultSet versionResultSet = versionStatement.executeQuery("SELECT name,value FROM meta_data");
                while (versionResultSet.next()) {
                    String key = versionResultSet.getString("name");
                    String value = versionResultSet.getString("value");
                    // Constant-first equals tolerates a NULL name column.
                    if ("version".equals(key)) {
                        version = value;
                    } else if ("minor_version".equals(key)) {
                        minorVersion = value;
                    }
                }
            } finally {
                versionStatement.close();
            }

            AnnotationWriter aw = new AnnotationWriter(dataFile, xmlFile);
            aw.setName(name);
            aw.setReference(build);
            aw.setVersion(version + "." + minorVersion);
            aw.setFieldNames(new String[] { "query_id", "start", "stop", "data" });

            Statement dataStatement = conn.createStatement();
            try {
                ResultSet dataResultSet = dataStatement.executeQuery("SELECT * FROM interval");

                aw.openWriter();
                try {
                    int exported = 0;
                    while (exported < IDF_MAX_ROWS && dataResultSet.next()) {
                        exported++;

                        String queryId = String.valueOf(dataResultSet.getInt("query_id"));
                        String start = String.valueOf(dataResultSet.getInt("start"));
                        String stop = String.valueOf(dataResultSet.getInt("stop"));
                        byte[] data = dataResultSet.getBytes("data");

                        // A NULL blob is written as an empty field instead of failing.
                        aw.addLine(new String[] { queryId, start, stop,
                                data == null ? "" : new String(data, LATIN1) });
                    }
                } finally {
                    aw.closeWriter();
                }
            } finally {
                dataStatement.close();
            }

            aw.writeXMLSummary();
        } finally {
            conn.close();
        }
    }

    /**
     * Copies a text file line by line into a block-compressed file. Every line is terminated with a
     * single {@code \n} and encoded as ISO-8859-1.
     *
     * @param infile the text file to read
     * @param outFile the block-compressed file to write
     * @throws IOException if reading or writing fails
     */
    private static void bgZipFile(File infile, File outFile) throws IOException {
        // Open the input first so that a missing input does not leave a stub output file behind.
        // NOTE(review): FileReader decodes with the platform default charset while the output is
        // encoded as ISO-8859-1 - confirm that this transcoding is intended.
        BufferedReader br = new BufferedReader(new FileReader(infile));
        try {
            BlockCompressedOutputStream bcos = new BlockCompressedOutputStream(outFile);
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    bcos.write((line + "\n").getBytes(LATIN1));
                }
            } finally {
                bcos.close();
            }
        } finally {
            br.close();
        }
    }

    /** Builds a tabix index for the given block-compressed file using the given configuration. */
    private static void createTabixIndex(File dataFile, TabixWriter.Conf conf) throws Exception {
        TabixWriter writer = new TabixWriter(dataFile, conf);
        writer.createIndex(dataFile);
    }

    /** Returns the first layout whose pattern matches the whole line, or {@code null} if none does. */
    private static TXT_FORMATS detectFormat(String line) {
        for (TXT_FORMATS format : TXT_FORMATS.values()) {
            if (format.pattern.matcher(line).matches()) {
                return format;
            }
        }
        return null;
    }

    /** Returns the first line of a block-compressed file, or {@code null} if the file has no lines. */
    private static String readFirstLine(File dataFile) throws IOException {
        BlockCompressedInputStream bcis = new BlockCompressedInputStream(dataFile);
        try {
            return bcis.readLine();
        } finally {
            bcis.close();
        }
    }
}