musite.io.xml.UniProtXMLReader.java Source code

Java tutorial

Introduction

Here is the source code for musite.io.xml.UniProtXMLReader.java

Source

/**
 * Musite
 * Copyright (C) 2010 Digital Biology Laboratory, University Of Missouri
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package musite.io.xml;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.dom4j.Element;
import org.dom4j.ElementPath;
import org.dom4j.ElementHandler;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;

import musite.ProteinImpl;
import musite.Proteins;
import musite.ProteinsImpl;

import musite.io.Reader;

import musite.PTM;
import musite.PTMAnnotationUtil;

/**
 * Reads proteins and their PTM site annotations from a UniProt XML stream.
 *
 * <p>The document is parsed with dom4j element handlers. While an
 * {@code /uniprot/entry} element is open, its accession(s), name, full name,
 * scientific organism name, sequence and PTM features are collected; when the
 * entry closes it is turned into a {@link ProteinImpl}, added to the result
 * and detached from the dom4j tree so the tree does not keep growing.
 *
 * <p>Only features whose {@code type} is one of {@link #UNIPROT_TYPES} are
 * inspected; every other feature type is ignored.
 *
 * @author Jianjiong Gao
 */
public class UniProtXMLReader implements Reader<Proteins> {
    /** Optional container that parsed proteins are added to; a new one is created when null. */
    private Proteins data;
    /** When non-null, only sites whose PTM is contained in this set are kept. */
    private Set<PTM> ptmFilter;
    /** When non-null, only entries whose scientific organism name is contained in this set are kept. */
    private Set<String> organismFilter;
    private boolean includeBySimilarity = false, includeProbable = false, includePotential = false;

    private boolean keepAllIds = false;
    /** Secondary accession -&gt; primary accession; only created once keepAllIds is switched on. */
    private Map<String, String> mapIdMainId = null;

    /** Lower-case UniProt feature types whose descriptions are scanned for PTM keywords. */
    private static final Set<String> UNIPROT_TYPES = musite.util.CollectionUtil.getSet("modified residue",
            "lipid moiety-binding region");

    /** One accepted PTM site collected while an entry is being parsed. */
    private static final class SiteRecord {
        /** Zero-based position in the sequence. */
        final int site;
        final PTM ptm;
        /** Enzyme taken from a "by ..." description part, or null. */
        final String enzyme;
        /** Holds the "keyword", "description" and "status" values that were present. */
        final Map<String, Object> annotations;

        SiteRecord(int site, PTM ptm, String enzyme, Map<String, Object> annotations) {
            this.site = site;
            this.ptm = ptm;
            this.enzyme = enzyme;
            this.annotations = annotations;
        }
    }

    /** Base class for handlers that only care about the end of an element. */
    private abstract static class EndHandler implements ElementHandler {
        public void onStart(ElementPath path) {
            // nothing to do when the element opens
        }
    }

    /** Creates a reader that returns a fresh {@link ProteinsImpl} from each call to {@link #read}. */
    public UniProtXMLReader() {
        this(null);
    }

    /**
     * Creates a reader that adds parsed proteins to an existing container.
     *
     * @param data container to add proteins to; if null a new {@link ProteinsImpl}
     *             is created on every {@link #read} call
     */
    public UniProtXMLReader(Proteins data) {
        this.data = data;
    }

    /**
     * Restricts the collected sites to a single PTM type.
     *
     * @param ptm the only PTM type to keep
     */
    public void setPTMFilter(PTM ptm) {
        ptmFilter = new HashSet<PTM>(1);
        ptmFilter.add(ptm);
    }

    /**
     * Restricts the collected sites to the given PTM types.
     *
     * @param types PTM types to keep; null disables the filter. The set is
     *              stored by reference, not copied.
     */
    public void setPTMFilter(Set<PTM> types) {
        this.ptmFilter = types;
    }

    /**
     * Restricts the parsed entries to the given organisms.
     *
     * @param organisms scientific organism names to keep; null disables the
     *                  filter. The set is stored by reference, not copied.
     */
    public void setOrganismFilter(Set<String> organisms) {
        this.organismFilter = organisms;
    }

    /**
     * @param includeBySimilarity whether sites with status "By similarity" are kept
     */
    public void setIncludeBySimilarity(boolean includeBySimilarity) {
        this.includeBySimilarity = includeBySimilarity;
    }

    /**
     * @param includeProbable whether sites with status "Probable" are kept
     */
    public void setIncludeProbable(boolean includeProbable) {
        this.includeProbable = includeProbable;
    }

    /**
     * @param includePotential whether sites with status "Potential" are kept
     */
    public void setIncludePotential(boolean includePotential) {
        this.includePotential = includePotential;
    }

    /**
     * Switches the recording of secondary accessions on or off.
     *
     * <p>Switching it on replaces the id map with a new, empty one.
     *
     * @param keepAllIds true to record secondary accessions of each entry
     */
    public void setKeepAllIds(boolean keepAllIds) {
        this.keepAllIds = keepAllIds;
        if (keepAllIds) {
            mapIdMainId = new HashMap<String, String>();
        }
    }

    /**
     * Returns the map from secondary accession to primary accession.
     *
     * @return the id map (String keys and String values), or null if
     *         {@link #setKeepAllIds(boolean)} has never been called with true
     */
    public Map getIDMap() {
        return mapIdMainId;
    }

    /**
     * Parses a UniProt XML document and returns the proteins it contains.
     *
     * <p>An entry is only converted into a protein when it has a scientific
     * organism name (accepted by the organism filter, if any), an accession
     * and a sequence. The stream is wrapped in a buffer but is not closed
     * here.
     *
     * @param is the XML input
     * @return the container passed to the constructor, or a new one, with the
     *         parsed proteins added
     * @throws IllegalArgumentException if {@code is} is null
     * @throws IOException if dom4j fails to read the document; the
     *         {@link DocumentException} is attached as the cause
     */
    public Proteins read(InputStream is) throws IOException {
        if (is == null) {
            throw new IllegalArgumentException("Input stream must not be null.");
        }

        final Proteins result = data == null ? new ProteinsImpl() : data;

        // NOTE(review): the SAXReader is used with its default configuration.
        // If the XML can come from an untrusted source, consider restricting
        // DTDs / external entities - confirm against the dom4j version in use.
        final SAXReader saxReader = new SAXReader();

        // Per-entry state; reset whenever a new <entry> starts.
        final StringBuilder acc = new StringBuilder(30);
        final StringBuilder name = new StringBuilder(30);
        final StringBuilder fullName = new StringBuilder(200);
        final StringBuilder org = new StringBuilder(30);
        final StringBuilder seq = new StringBuilder(2000);
        final List<SiteRecord> sites = new ArrayList<SiteRecord>(4);
        final Set<String> accs = new HashSet<String>();

        // entry
        saxReader.addHandler("/uniprot/entry", new ElementHandler() {
            public void onStart(ElementPath path) {
                acc.setLength(0);
                fullName.setLength(0);
                seq.setLength(0);
                org.setLength(0);
                name.setLength(0);
                sites.clear();
                accs.clear();
            }

            public void onEnd(ElementPath path) {
                if (org.length() > 0 && (organismFilter == null || organismFilter.contains(org.toString()))
                        && acc.length() > 0 && seq.length() > 0) {
                    String accession = acc.toString();

                    ProteinImpl protein = new ProteinImpl(accession, seq.toString(),
                            name.length() == 0 ? null : name.toString(),
                            fullName.length() == 0 ? null : fullName.toString(),
                            org.toString());
                    result.addProtein(protein);

                    for (SiteRecord record : sites) {
                        String enzyme = record.enzyme;
                        // "autocatalysis" means the protein modifies itself,
                        // so the entry's own name is used as the enzyme.
                        if (enzyme != null && enzyme.equalsIgnoreCase("autocatalysis")) {
                            enzyme = name.toString();
                        }

                        try {
                            PTMAnnotationUtil.annotate(protein, record.site, record.ptm, enzyme,
                                    record.annotations);
                        } catch (Exception e) {
                            // Best effort: one bad site must not abort the whole file.
                            System.err.println("Failed to annotate " + accession
                                    + " at zero-based site " + record.site);
                            e.printStackTrace();
                        }
                    }

                    if (keepAllIds) {
                        for (String ac : accs) {
                            mapIdMainId.put(ac, accession);
                        }
                        if (!accs.isEmpty()) {
                            protein.putInfo("other-accessions", new HashSet<String>(accs));
                        }
                    }
                }

                // prune the tree so finished entries do not accumulate in memory
                Element row = path.getCurrent();
                row.detach();
            }
        });

        // accession: the first one is the primary accession, later ones are secondary
        saxReader.addHandler("/uniprot/entry/accession", new EndHandler() {
            public void onEnd(ElementPath path) {
                String text = path.getCurrent().getText();
                if (acc.length() == 0) {
                    acc.append(text);
                } else if (keepAllIds) {
                    accs.add(text);
                }
            }
        });

        // name: only the first one of an entry is used
        saxReader.addHandler("/uniprot/entry/name", new EndHandler() {
            public void onEnd(ElementPath path) {
                if (name.length() > 0) {
                    return;
                }
                name.append(path.getCurrent().getText());
            }
        });

        // full name: only the first recommended full name is used
        saxReader.addHandler("/uniprot/entry/protein/recommendedName/fullName", new EndHandler() {
            public void onEnd(ElementPath path) {
                if (fullName.length() > 0) {
                    return;
                }
                fullName.append(path.getCurrent().getTextTrim());
            }
        });

        // organism: only the first name with type "scientific" is used
        saxReader.addHandler("/uniprot/entry/organism/name", new EndHandler() {
            public void onEnd(ElementPath path) {
                if (org.length() > 0) {
                    return;
                }

                Element el = path.getCurrent();
                String attr = el.attributeValue("type");
                if (attr == null || !attr.equalsIgnoreCase("scientific")) {
                    return;
                }

                org.append(el.getText());
            }
        });

        // sequence: whitespace inside the element text is stripped
        saxReader.addHandler("/uniprot/entry/sequence", new EndHandler() {
            public void onEnd(ElementPath path) {
                if (seq.length() > 0) {
                    return;
                }
                seq.append(path.getCurrent().getText().replaceAll("\\p{Space}", ""));
            }
        });

        // feature: collect PTM sites
        saxReader.addHandler("/uniprot/entry/feature", new EndHandler() {
            public void onEnd(ElementPath path) {
                Element el = path.getCurrent();
                String type = el.attributeValue("type");
                // Explicit locale so the lower-casing does not depend on the
                // JVM default locale. Other feature types (for example
                // "glycosylation site") are not handled.
                if (type == null || !UNIPROT_TYPES.contains(type.toLowerCase(java.util.Locale.ENGLISH))) {
                    return;
                }

                String description = el.attributeValue("description");
                if (description == null) {
                    return;
                }

                // The description is a "; "-separated list; a part is either a
                // PTM keyword or, when it starts with "by ", the enzyme.
                PTM ptm = null;
                String enzyme = null;
                String keyword = null;
                for (String desc : description.split("; ")) {
                    PTM tmp = PTM.ofKeyword(desc);
                    if (tmp != null) {
                        ptm = tmp;
                        keyword = desc;
                    } else if (desc.startsWith("by ")) {
                        enzyme = desc.substring(3);
                    }
                }

                if (ptm == null || (ptmFilter != null && !ptmFilter.contains(ptm))) {
                    return;
                }

                String status = el.attributeValue("status");
                if (!acceptsStatus(status)) {
                    return;
                }

                int site = lastPosition(el);
                if (site == -1) {
                    return;
                }

                Map<String, Object> annotations = new HashMap<String, Object>();
                if (keyword != null) {
                    annotations.put("keyword", keyword);
                }
                annotations.put("description", description);
                if (status != null) {
                    annotations.put("status", status);
                }
                sites.add(new SiteRecord(site, ptm, enzyme, annotations));
            }
        });

        BufferedInputStream bis = new BufferedInputStream(is);

        try {
            saxReader.read(bis);
        } catch (DocumentException e) {
            // Keep the parser failure as the cause so its stack trace is not lost.
            IOException ioe = new IOException(e.getMessage());
            ioe.initCause(e);
            throw ioe;
        }

        return result;
    }

    /** Tells whether a feature with the given evidence status passes the include flags. */
    private boolean acceptsStatus(String status) {
        if (status == null) {
            return true;
        }
        if (!includeBySimilarity && status.equalsIgnoreCase("By similarity")) {
            return false;
        }
        if (!includeProbable && status.equalsIgnoreCase("Probable")) {
            return false;
        }
        if (!includePotential && status.equalsIgnoreCase("Potential")) {
            return false;
        }
        return true;
    }

    /** Returns the zero-based value of the last parsable location/position of a feature, or -1 if there is none. */
    private static int lastPosition(Element feature) {
        int site = -1;

        List<Element> locs = feature.elements("location");
        for (Element loc : locs) {
            List<Element> poss = loc.elements("position");
            for (Element pos : poss) {
                String str = pos.attributeValue("position");
                if (str == null) {
                    continue;
                }

                try {
                    site = Integer.parseInt(str) - 1; // UniProt positions start from 1
                } catch (NumberFormatException e) {
                    // not a plain number: keep whatever was found so far
                    continue;
                }
            }
        }

        return site;
    }
}