edu.monash.merc.system.parser.xml.GPMWSXmlParser.java Source code

Java tutorial

Introduction

Here is the source code for edu.monash.merc.system.parser.xml.GPMWSXmlParser.java

Source

/*
 * Copyright (c) 2011-2013, Monash e-Research Centre
 * (Monash University, Australia)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *    * Neither the name of the Monash University nor the names of its
 *      contributors may be used to endorse or promote products derived from
 *      this software without specific prior written permission.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * You should have received a copy of the GNU Affero General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */

package edu.monash.merc.system.parser.xml;

import edu.monash.merc.common.name.ColorType;
import edu.monash.merc.common.name.DataType;
import edu.monash.merc.common.name.DbAcType;
import edu.monash.merc.common.name.TLLevel;
import edu.monash.merc.dto.*;
import edu.monash.merc.dto.gpm.GPMEntryBean;
import edu.monash.merc.exception.DMXMLParserException;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.codehaus.stax2.XMLEventReader2;
import org.codehaus.stax2.XMLInputFactory2;

import javax.xml.namespace.QName;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;

/**
 * GPMWSXmlParser class parses the gpm xml
 *
 * @author Simon Yu
 *         <p/>
 *         Email: xiaoming.yu@monash.edu
 * @version 1.0
 * @since 1.0
 *        <p/>
 *        Date: 12/12/12 4:34 PM
 */
public class GPMWSXmlParser {

    static String PE_MS_PROB_EVIDENCE_TEXT = "GPMDB best probability (log(e)) = ";

    static String PE_MS_SAMPLE_EVIDENCE_TEXT_PART1 = "Observed in ";

    static String PE_MS_SAMPLE_EVIDENCE_TEXT_PART2 = " samples in GPMDB";

    static String GPM_HYPERLINK_PROB_SAM_BASE = "http://gpmdb.thegpm.org/thegpm-cgi/dblist_label.pl?label=";

    static String GPM_HYPERLINK_PROB_SAM_BASE_EXT = "&proex=-1";

    private static String ELE_GPM_PROTEIN = "protein";

    private static String ATTR_GPM_ENSP = "ensp";

    private static String ATTR_GPM_ENSG = "ensg";

    private static String ELE_GPM_IDENTIFICATION = "identification";

    private static String ATTR_GPM_IDEN_BESTE = "beste";

    private static String ATTR_GPM_IDEN_SAMPLES = "samples";

    private XMLInputFactory2 xmlif2;

    private static final Logger logger = Logger.getLogger(GPMWSXmlParser.class.getName());

    public List<GPMEntryBean> parseGPMXml(String fileName, XMLInputFactory2 factory2) {

        xmlif2 = factory2;
        logger.info("Starting to parse " + fileName);

        XMLEventReader2 xmlEventReader = null;
        List<GPMEntryBean> gpmEntryBeans = new ArrayList<GPMEntryBean>();
        try {
            xmlEventReader = (XMLEventReader2) xmlif2.createXMLEventReader(new FileInputStream(fileName));

            QName proteinQN = new QName(ELE_GPM_PROTEIN);
            QName enspQN = new QName(ATTR_GPM_ENSP);
            QName ensgQN = new QName(ATTR_GPM_ENSG);
            QName identiQN = new QName(ELE_GPM_IDENTIFICATION);
            QName besteQN = new QName(ATTR_GPM_IDEN_BESTE);
            QName samplesQN = new QName(ATTR_GPM_IDEN_SAMPLES);
            String ensg = null;
            String ensp = null;
            String beste = null;
            String samples = null;

            GPMEntryBean gpmEntryBean = null;

            while (xmlEventReader.hasNextEvent()) {
                //eventType = reader.next();
                XMLEvent event = xmlEventReader.nextEvent();
                if (event.isStartElement()) {
                    StartElement element = event.asStartElement();

                    //protein entry
                    if (element.getName().equals(proteinQN)) {
                        //get the version attribute
                        Attribute enspAttr = element.getAttributeByName(enspQN);
                        Attribute ensgAttr = element.getAttributeByName(ensgQN);
                        if (enspAttr != null) {
                            ensp = enspAttr.getValue();
                        }
                        if (ensgAttr != null) {
                            ensg = ensgAttr.getValue();
                        }
                        //create gpn entry bean
                        gpmEntryBean = new GPMEntryBean();
                        //create gpm dbsource
                        DBSourceBean gpmDbSourceBean = new DBSourceBean();
                        gpmDbSourceBean.setDbName(DbAcType.GPM.type());
                        gpmDbSourceBean.setPrimaryEvidences(true);
                        //set the gpm dbsource
                        gpmEntryBean.setPrimaryDbSourceBean(gpmDbSourceBean);

                        //create a gene bean
                        if (StringUtils.isNotBlank(ensg)) {
                            GeneBean geneBean = new GeneBean();
                            geneBean.setEnsgAccession(ensg);
                            gpmEntryBean.setGeneBean(geneBean);
                        }

                        //create an identified accession bean
                        AccessionBean identAccessionBean = parseIdentifiedAccessionBean(ensp);
                        gpmEntryBean.setIdentifiedAccessionBean(identAccessionBean);
                        //create dbsource and accession entry bean
                        List<DbSourceAcEntryBean> dbSourceAcEntryBeanList = parseDBSourceAcEntryBeans(ensp, ensg);
                        gpmEntryBean.setDbSourceAcEntryBeans(dbSourceAcEntryBeanList);

                    }

                    //protein entry
                    if (element.getName().equals(identiQN)) {
                        Attribute besteAttr = element.getAttributeByName(besteQN);
                        Attribute samplesAttr = element.getAttributeByName(samplesQN);
                        if (besteAttr != null) {
                            beste = besteAttr.getValue();
                        }
                        if (samplesAttr != null) {
                            samples = samplesAttr.getValue();
                        }

                        //create pe ms prob evidence based on beste and ensp accesion
                        PEEvidenceBean peMsProbEvidence = createMsProbEvidence(beste, ensp);
                        //parse pe ms samples evidence
                        PEEvidenceBean peMsSamplesEvidence = createMsSamplesEvidence(samples, ensp);

                        if (peMsProbEvidence != null) {
                            gpmEntryBean.setPeMsProbEvidenceBean(peMsProbEvidence);
                        }
                        if (peMsSamplesEvidence != null) {
                            gpmEntryBean.setPeMsSamplesEvidenceBean(peMsSamplesEvidence);
                        }
                    }
                }
                //End of element
                if (event.isEndElement()) {
                    EndElement endElement = event.asEndElement();
                    //hpa entry end
                    if (endElement.getName().equals(proteinQN)) {
                        //if gene is not null, the  accession is not null and get some evidences for this gpm entry, then we add it to the list
                        GeneBean geneBean = gpmEntryBean.getGeneBean();
                        AccessionBean identifiedAcBean = gpmEntryBean.getIdentifiedAccessionBean();
                        PEEvidenceBean peMsProbEvidence = gpmEntryBean.getPeMsProbEvidenceBean();
                        PEEvidenceBean peMsSampEvidence = gpmEntryBean.getPeMsProbEvidenceBean();
                        if (geneBean != null && identifiedAcBean != null) {
                            if (peMsProbEvidence != null || peMsSampEvidence != null) {
                                gpmEntryBeans.add(gpmEntryBean);
                            }
                        }
                    }
                }
            }
            return gpmEntryBeans;
        } catch (Exception ex) {
            logger.error(ex);
            throw new DMXMLParserException(ex);
        } finally {
            if (xmlEventReader != null) {
                try {
                    xmlEventReader.close();
                } catch (Exception e) {
                    //ignore whatever caught.
                }
            }
        }

    }

    private AccessionBean parseIdentifiedAccessionBean(String enspAc) {
        if (StringUtils.isNotBlank(enspAc)) {
            AccessionBean accessionBean = new AccessionBean();
            accessionBean.setAccession(enspAc);
            accessionBean.setAcType(DbAcType.Protein.type());
            return accessionBean;
        }
        return null;
    }

    private List<DbSourceAcEntryBean> parseDBSourceAcEntryBeans(String enspAc, String ensgAc) {
        List<DbSourceAcEntryBean> dbSourceAcEntryBeans = new ArrayList<DbSourceAcEntryBean>();
        DbSourceAcEntryBean dbSourceAcEntryBeanEnsp = generateDbSourceAcEntry(enspAc, DbAcType.Protein);
        if (dbSourceAcEntryBeanEnsp != null) {
            dbSourceAcEntryBeans.add(dbSourceAcEntryBeanEnsp);
        }
        DbSourceAcEntryBean dbSourceAcEntryBeanEnsg = generateDbSourceAcEntry(ensgAc, DbAcType.Gene);
        if (dbSourceAcEntryBeanEnsg != null) {
            dbSourceAcEntryBeans.add(dbSourceAcEntryBeanEnsg);
        }
        return dbSourceAcEntryBeans;
    }

    private DbSourceAcEntryBean generateDbSourceAcEntry(String accession, DbAcType dbAcType) {
        if (StringUtils.isNotBlank(accession)) {
            DbSourceAcEntryBean dbSourceAcEntryBean = new DbSourceAcEntryBean();
            AccessionBean accessionBean = new AccessionBean();
            accessionBean.setAccession(accession);
            if (dbAcType.equals(DbAcType.Gene)) {
                accessionBean.setAcType(DbAcType.Gene.type());
            }
            if (dbAcType.equals(DbAcType.Protein)) {
                accessionBean.setAcType(DbAcType.Protein.type());
            }
            DBSourceBean dbSourceBean = new DBSourceBean();
            dbSourceBean.setDbName(DbAcType.Ensembl.type());

            dbSourceAcEntryBean.setAccessionBean(accessionBean);
            dbSourceAcEntryBean.setDbSourceBean(dbSourceBean);
            return dbSourceAcEntryBean;
        }
        return null;
    }

    private PEEvidenceBean createMsProbEvidence(String beste, String enspAc) {

        if (StringUtils.isNotBlank(beste) && StringUtils.isNotBlank(enspAc)) {
            PEEvidenceBean peMsProbEvidence = new PEEvidenceBean();
            //create a TPBDataTypeBean
            TPBDataTypeBean tpbDataTypeBean = new TPBDataTypeBean();
            //set the data type
            tpbDataTypeBean.setDataType(DataType.PE_MS_PROB.type());
            //set the traffic lights level to 3
            tpbDataTypeBean.setLevel(TLLevel.TL3.level());
            peMsProbEvidence.setTpbDataTypeBean(tpbDataTypeBean);
            peMsProbEvidence.setEvidenceValue(PE_MS_PROB_EVIDENCE_TEXT + beste);
            peMsProbEvidence.setHyperlink(GPM_HYPERLINK_PROB_SAM_BASE + enspAc + GPM_HYPERLINK_PROB_SAM_BASE_EXT);
            double besteValue = Double.valueOf(beste).doubleValue();

            if (besteValue <= -10) {
                peMsProbEvidence.setColorLevel(ColorType.GREEN.color());
            }
            if (besteValue > -10 && besteValue <= -3) {
                peMsProbEvidence.setColorLevel(ColorType.YELLOW.color());
            }
            if (besteValue > -3 && besteValue < 0) {
                peMsProbEvidence.setColorLevel(ColorType.RED.color());
            }
            if (besteValue >= 0) {
                peMsProbEvidence.setColorLevel(ColorType.BLACK.color());
            }
            return peMsProbEvidence;
        }

        return null;
    }

    private PEEvidenceBean createMsSamplesEvidence(String samples, String enspAc) {

        if (StringUtils.isNotBlank(samples) && StringUtils.isNotBlank(enspAc)) {

            PEEvidenceBean peMsSamplesEvidence = new PEEvidenceBean();
            //create a TPBDataTypeBean
            TPBDataTypeBean tpbDataTypeBean = new TPBDataTypeBean();
            //set the data type
            tpbDataTypeBean.setDataType(DataType.PE_MS_SAM.type());
            //set the traffic lights level to 3
            tpbDataTypeBean.setLevel(TLLevel.TL3.level());
            peMsSamplesEvidence.setTpbDataTypeBean(tpbDataTypeBean);

            peMsSamplesEvidence.setEvidenceValue(
                    PE_MS_SAMPLE_EVIDENCE_TEXT_PART1 + samples + PE_MS_SAMPLE_EVIDENCE_TEXT_PART2);
            peMsSamplesEvidence
                    .setHyperlink(GPM_HYPERLINK_PROB_SAM_BASE + enspAc + GPM_HYPERLINK_PROB_SAM_BASE_EXT);
            int samplesValue = Integer.valueOf(samples).intValue();

            if (samplesValue >= 100) {
                peMsSamplesEvidence.setColorLevel(ColorType.GREEN.color());
            }
            if (samplesValue >= 20 && samplesValue < 100) {
                peMsSamplesEvidence.setColorLevel(ColorType.YELLOW.color());
            }
            if (samplesValue > 0 && samplesValue < 20) {
                peMsSamplesEvidence.setColorLevel(ColorType.RED.color());
            }
            if (samplesValue <= 0) {
                peMsSamplesEvidence.setColorLevel(ColorType.BLACK.color());
            }
            return peMsSamplesEvidence;
        }

        return null;
    }

    public static void main(String[] art) throws Exception {
        GPMWSXmlParser parser = new GPMWSXmlParser();
        String gpmFileName = "./testData/2012.9.23.gpm2tpb.xml";
        List<GPMEntryBean> gpmEntryBeans = parser.parseGPMXml(gpmFileName,
                WSXmlInputFactory.getInputFactoryConfiguredForXmlConformance());

        for (GPMEntryBean gpmEntryBean : gpmEntryBeans) {
            System.out.println("Db name: " + gpmEntryBean.getPrimaryDbSourceBean().getDbName() + " - ensg: "
                    + gpmEntryBean.getGeneBean().getEnsgAccession());
            PEEvidenceBean pemsProbEvidence = gpmEntryBean.getPeMsProbEvidenceBean();
            System.out.println("PE MS Prob evidence: " + pemsProbEvidence.getEvidenceValue() + " color level: "
                    + pemsProbEvidence.getColorLevel());
            PEEvidenceBean pemsSamplesEvidence = gpmEntryBean.getPeMsSamplesEvidenceBean();
            System.out.println("PE MS Samples Evidence: " + pemsSamplesEvidence.getEvidenceValue()
                    + " color level: " + pemsSamplesEvidence.getColorLevel());
        }
        System.out.println("================== total size: " + gpmEntryBeans.size());
    }
}