MSUmpire.DIA.TargetHitPepXMLWriter.java Source code

Java tutorial

Introduction

Here is the source code for MSUmpire.DIA.TargetHitPepXMLWriter.java

Source

/* 
 * Author: Chih-Chiang Tsou <chihchiang.tsou@gmail.com>
 *             Nesvizhskii Lab, Department of Computational Medicine and Bioinformatics, 
 *             University of Michigan, Ann Arbor
 *
 * Copyright 2014 University of Michigan, Ann Arbor, MI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package MSUmpire.DIA;

import MSUmpire.BaseDataStructure.UmpireInfo;
import MSUmpire.SeqUtility.FastaParser;
import MSUmpire.PSMDataStructure.PepIonID;
import MSUmpire.Utility.DateTimeTag;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FilenameUtils;
import org.apache.log4j.Logger;
import org.xmlpull.v1.XmlPullParserException;

/**
 * pepXML writer for targeted re-extraction
 * @author Chih-Chiang Tsou
 */
public class TargetHitPepXMLWriter {

    /** Scoring results; every entry of its {@code libTargetMatches} is a candidate for output. */
    TargetMatchScoring Tscoring;
    /** Buffer holding the complete document; emptied at the start of every {@link #write()}. */
    StringBuilder sb = new StringBuilder();
    /** Path of the pepXML file to create. */
    String Filename;
    /** Path of the FASTA database used to map peptides to their parent proteins. */
    String Fasta;
    /** Decoy tag handed to the FASTA parser and used as prefix ({@code tag_}) for decoy protein names. */
    String Decoytag;

    /**
     * A protein whose sequence contains a reported peptide, together with the
     * residues flanking the first occurrence of that peptide.
     */
    public class ParentProtein {

        /** Protein accession (key of the FASTA parser's protein list). */
        String Acc;
        /** Residue directly before the peptide, or '-' when the peptide starts the protein. */
        char PreAA = '-';
        /** Residue directly after the peptide, or '-' when the peptide ends the protein. */
        char NextAA = '-';
        /** Protein description as stored by the FASTA parser. */
        String Des;
    }

    /**
     * Stores the arguments and immediately writes the pepXML file.
     *
     * @param Filename path of the pepXML file to create
     * @param Fasta    path of the FASTA database
     * @param Decoytab decoy tag (stored as {@code Decoytag})
     * @param Tscoring scoring results to export
     * @throws IOException            if the output file cannot be written
     * @throws XmlPullParserException propagated from {@link #write()}
     */
    public TargetHitPepXMLWriter(String Filename, String Fasta, String Decoytab, TargetMatchScoring Tscoring)
            throws IOException, XmlPullParserException {
        this.Filename = Filename;
        this.Fasta = Fasta;
        this.Decoytag = Decoytab;
        this.Tscoring = Tscoring;
        write();
    }

    /**
     * Builds the pepXML document for all library matches and writes it to {@code Filename}.
     * <p>
     * For every match with a best target hit one {@code spectrum_query} is written; if the
     * peptide is also present in the digested peptide list and has a best decoy hit, a
     * second {@code spectrum_query} for the decoy is written. Matches whose peptide cannot
     * be located in any protein of the FASTA file are skipped with a warning.
     *
     * @throws IOException            if the output file cannot be written
     * @throws XmlPullParserException declared for the FASTA parsing step
     */
    public void write() throws IOException, XmlPullParserException {

        Logger.getRootLogger().info("Writing " + Filename);
        // The constructor already calls write(); start from an empty buffer so that a
        // later explicit call does not append a second document to the first one.
        sb.setLength(0);

        // Digestion limits are derived from the peptides that will actually be reported.
        int minlength = Integer.MAX_VALUE;
        int maxlength = 0;
        int maxmiss = 0;
        for (UmpireSpecLibMatch match : Tscoring.libTargetMatches) {
            PepIonID pepIonID = match.pepIonID;
            int length = pepIonID.Sequence.length();
            maxlength = Math.max(maxlength, length);
            minlength = Math.min(minlength, length);
            maxmiss = Math.max(maxmiss, pepIonID.getNMissedCleavages());
        }

        FastaParser fastaparser = new FastaParser(Fasta);
        fastaparser.RemoveDecoy(Decoytag);
        fastaparser.digestion(maxmiss, minlength, maxlength, Decoytag);

        String baseName = escapeXml(FilenameUtils.getBaseName(Filename));

        Header();
        sb.append("<msms_run_summary base_name=\"").append(baseName)
                .append("\" search_engine=\"interprophet\" >\n");
        SearchSummary();

        // Running index shared by target and decoy queries; advanced only when a query is written.
        int index = 0;
        for (UmpireSpecLibMatch match : Tscoring.libTargetMatches) {
            String sequence = match.pepIonID.Sequence;

            PeakGroupScore targetHit = match.BestHit;
            if (targetHit != null) {
                List<ParentProtein> parents = findParentProteins(fastaparser, sequence);
                if (parents.isEmpty()) {
                    Logger.getRootLogger().warn(sequence + " is not found in " + Fasta);
                } else {
                    appendTargetQuery(baseName, index, match, targetHit, parents);
                    index++;
                }
            }

            // The decoy peptide is read from the digested-peptide entry, so there is
            // nothing to write for sequences that have no such entry.
            if (!fastaparser.PeptideList.containsKey(sequence)) {
                continue;
            }
            PeakGroupScore decoyHit = match.BestDecoyHit;
            if (decoyHit != null) {
                List<ParentProtein> parents = findParentProteins(fastaparser, sequence);
                if (parents.isEmpty()) {
                    Logger.getRootLogger().warn(sequence + " is not found in " + Fasta);
                } else {
                    String decoySeq = fastaparser.PeptideList.get(sequence).Decoy;
                    appendDecoyQuery(index, match, decoyHit, decoySeq, parents);
                    index++;
                }
            }
        }
        sb.append("</msms_run_summary>\n");
        sb.append("</msms_pipeline_analysis>\n");

        // Write with the charset announced in the XML declaration and always release the file handle.
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(Filename),
                StandardCharsets.ISO_8859_1)) {
            writer.write(sb.toString());
        }
    }

    /**
     * Collects the proteins that contain {@code sequence}. Peptides known to the digestion
     * are checked against their recorded proteins only; all other peptides are searched
     * for in every protein sequence.
     */
    private List<ParentProtein> findParentProteins(FastaParser fastaparser, String sequence) {
        List<ParentProtein> parents = new ArrayList<>();
        if (fastaparser.PeptideList.containsKey(sequence)) {
            for (String acc : fastaparser.PeptideList.get(sequence).Proteins) {
                addParentIfContained(parents, acc, fastaparser.ProteinList.get(acc).Seq,
                        fastaparser.ProteinList.get(acc).Des, sequence);
            }
        } else {
            for (String acc : fastaparser.ProteinList.keySet()) {
                addParentIfContained(parents, acc, fastaparser.ProteinList.get(acc).Seq,
                        fastaparser.ProteinList.get(acc).Des, sequence);
            }
        }
        return parents;
    }

    /**
     * Adds a {@link ParentProtein} for {@code acc} to {@code parents} when {@code protSeq}
     * contains {@code sequence}; flanking residues refer to the first occurrence.
     */
    private void addParentIfContained(List<ParentProtein> parents, String acc, String protSeq,
            String des, String sequence) {
        int start = protSeq.indexOf(sequence);
        if (start < 0) {
            return;
        }
        ParentProtein prot = new ParentProtein();
        prot.Acc = acc;
        prot.Des = des;
        if (start > 0) {
            prot.PreAA = protSeq.charAt(start - 1);
        }
        int end = start + sequence.length();
        if (end < protSeq.length()) {
            prot.NextAA = protSeq.charAt(end);
        }
        parents.add(prot);
    }

    /** Appends the complete {@code spectrum_query} element of a target (forward) hit. */
    private void appendTargetQuery(String baseName, int index, UmpireSpecLibMatch match,
            PeakGroupScore hit, List<ParentProtein> parents) {
        // The running index doubles as scan number: spectrum="base.index.index.charge".
        sb.append("<spectrum_query spectrum=\"").append(baseName)
                .append('.').append(index)
                .append('.').append(index)
                .append('.').append(match.pepIonID.Charge)
                .append("\" start_scan=\"").append(index)
                .append("\" end_scan=\"").append(index);
        appendQueryAttributes(index, match, hit);
        appendSearchResult(match, hit, match.pepIonID.Sequence, "", true, parents);
    }

    /**
     * Appends the complete {@code spectrum_query} element of a decoy hit. Protein names
     * and descriptions are those of the target peptide's parents, prefixed with the decoy tag.
     * NOTE(review): unlike target queries, decoy queries carry no start_scan/end_scan and no
     * num_tol_term on the search hit; kept as in the original output - confirm this is intended.
     */
    private void appendDecoyQuery(int index, UmpireSpecLibMatch match, PeakGroupScore hit,
            String decoySeq, List<ParentProtein> parents) {
        sb.append("<spectrum_query spectrum=\"")
                .append(escapeXml(Decoytag + "_" + match.pepIonID.Sequence));
        appendQueryAttributes(index, match, hit);
        appendSearchResult(match, hit, decoySeq, Decoytag + "_", false, parents);
    }

    /**
     * Closes the preceding attribute and appends the {@code spectrum_query} attributes shared
     * by target and decoy queries, ending the start tag.
     */
    private void appendQueryAttributes(int index, UmpireSpecLibMatch match, PeakGroupScore hit) {
        sb.append("\" precursor_neutral_mass=\"").append(hit.PrecursorNeutralMass)
                .append("\" assumed_charge=\"").append(match.pepIonID.Charge)
                .append("\" index=\"").append(index)
                // PrecursorRT is multiplied by 60 to fill the *_sec attribute.
                .append("\" retention_time_sec=\"").append(hit.PrecursorRT * 60f)
                .append("\">\n");
    }

    /**
     * Appends {@code search_result} with its single {@code search_hit}, the alternative
     * proteins and the interprophet result, then closes the enclosing {@code spectrum_query}.
     *
     * @param peptide         peptide sequence to report (target or decoy sequence)
     * @param proteinPrefix   text prepended to every protein accession and description
     * @param writeNumTolTerm whether the search hit carries {@code num_tol_term="2"}
     * @param parents         non-empty list of parent proteins; the first one is the main protein
     */
    private void appendSearchResult(UmpireSpecLibMatch match, PeakGroupScore hit, String peptide,
            String proteinPrefix, boolean writeNumTolTerm, List<ParentProtein> parents) {
        PepIonID pepIonID = match.pepIonID;
        ParentProtein first = parents.get(0);
        sb.append("<search_result>\n")
                .append("<search_hit hit_rank=\"1\" peptide=\"").append(escapeXml(peptide))
                .append("\" peptide_prev_aa=\"").append(escapeXml(String.valueOf(first.PreAA)))
                .append("\" peptide_next_aa=\"").append(escapeXml(String.valueOf(first.NextAA)))
                .append("\" protein=\"").append(escapeXml(proteinPrefix + first.Acc))
                .append("\" protein_descr=\"").append(escapeXml(proteinPrefix + first.Des))
                .append("\" num_tot_proteins=\"").append(parents.size())
                .append("\" num_matched_ions=\"").append(hit.NoMatchB + hit.NoMatchY)
                // Total is taken as one b and one y ion per peptide bond of the target sequence.
                .append("\" tot_num_ions=\"").append(2 * (pepIonID.Sequence.length() - 1))
                .append("\" calc_neutral_pep_mass=\"").append(pepIonID.CalcNeutralPepMass())
                .append("\" massdiff=\"")
                .append(pepIonID.CalcNeutralPepMass() - hit.PrecursorNeutralMass)
                .append('"');
        if (writeNumTolTerm) {
            sb.append("  num_tol_term=\"2\"");
        }
        sb.append(" num_missed_cleavages=\"").append(pepIonID.getNMissedCleavages())
                .append("\" is_rejected=\"0\">\n");

        for (int i = 1; i < parents.size(); i++) {
            ParentProtein alternative = parents.get(i);
            sb.append("<alternative_protein protein=\"")
                    .append(escapeXml(proteinPrefix + alternative.Acc))
                    .append("\" protein_descr=\"").append(escapeXml(proteinPrefix + alternative.Des))
                    .append("\" num_tol_term=\"2\" peptide_prev_aa=\"")
                    .append(escapeXml(String.valueOf(alternative.PreAA)))
                    .append("\" peptide_next_aa=\"")
                    .append(escapeXml(String.valueOf(alternative.NextAA)))
                    .append("\"/>");
        }

        // The same probability is reported for every entry of all_ntt_prob.
        sb.append("<analysis_result analysis=\"interprophet\">\n")
                .append("<interprophet_result probability=\"").append(hit.MixtureModelLocalProb)
                .append("\" all_ntt_prob=\"(").append(hit.MixtureModelLocalProb)
                .append(',').append(hit.MixtureModelLocalProb)
                .append(',').append(hit.MixtureModelLocalProb)
                .append(")\">\n")
                .append("</interprophet_result>\n")
                .append("</analysis_result>\n")
                .append("</search_hit>\n")
                .append("</search_result>\n")
                .append("</spectrum_query>\n");
    }

    /** Appends the fixed trypsin enzyme definition and the {@code search_summary} element. */
    private void SearchSummary() {
        sb.append("<sample_enzyme name=\"trypsin\">\n")
                .append("<specificity cut=\"KR\" no_cut=\"P\" sense=\"C\"/>\n")
                .append("</sample_enzyme>\n")
                .append("<search_summary base_name=\"")
                .append(escapeXml(FilenameUtils.getBaseName(Filename)))
                .append("\" search_engine=\"Umpire\" precursor_mass_type=\"monoisotopic\" fragment_mass_type=\"monoisotopic\" search_id=\"1\">\n")
                .append("<search_database local_path=\"").append(escapeXml(Fasta))
                .append("\" type=\"AA\"/>\n")
                .append("<enzymatic_search_constraint enzyme=\"trypsin\" max_num_internal_cleavages=\"1\" min_number_termini=\"2\"/>\n")
                .append("    </search_summary>\n");
    }

    /** Appends the XML declaration, the opening root element and the analysis summary. */
    private void Header() {
        sb.append("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n");
        sb.append("<msms_pipeline_analysis date=\"").append(DateTimeTag.GetTag())
                .append("\" summary_xml=\"").append(escapeXml(Filename))
                .append("\" xmlns=\"http://regis-web.systemsbiology.net/pepXML\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://sashimi.sourceforge.net/schema_revision/pepXML/pepXML_v18.xsd\">\n");
        sb.append("<analysis_summary analysis=\"Umpire\" version=\"")
                .append(UmpireInfo.GetInstance().Version)
                .append("\" time=\"").append(DateTimeTag.GetTag()).append("\"/>\n");
    }

    /**
     * Escapes the five XML metacharacters so that {@code value} can be placed inside a
     * double-quoted attribute. Strings without such characters are returned unchanged.
     *
     * @param value text to escape; {@code null} yields the literal {@code "null"}, which is
     *              what plain string concatenation produced before escaping was introduced
     * @return the escaped text
     */
    static String escapeXml(String value) {
        if (value == null) {
            return "null";
        }
        StringBuilder escaped = null;
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            String replacement;
            switch (c) {
                case '&':
                    replacement = "&amp;";
                    break;
                case '<':
                    replacement = "&lt;";
                    break;
                case '>':
                    replacement = "&gt;";
                    break;
                case '"':
                    replacement = "&quot;";
                    break;
                case '\'':
                    replacement = "&apos;";
                    break;
                default:
                    replacement = null;
                    break;
            }
            if (replacement != null) {
                if (escaped == null) {
                    // First metacharacter found: copy the clean prefix once.
                    escaped = new StringBuilder(value.length() + 16);
                    escaped.append(value, 0, i);
                }
                escaped.append(replacement);
            } else if (escaped != null) {
                escaped.append(c);
            }
        }
        return escaped == null ? value : escaped.toString();
    }
}