org.mskcc.cbio.cgds.scripts.ImportUniProtIdMapping.java Source code

Java tutorial

Introduction

Here is the source code for org.mskcc.cbio.cgds.scripts.ImportUniProtIdMapping.java

Source

/** Copyright (c) 2012 Memorial Sloan-Kettering Cancer Center.
**
** This library is free software; you can redistribute it and/or modify it
** under the terms of the GNU Lesser General Public License as published
** by the Free Software Foundation; either version 2.1 of the License, or
** any later version.
**
** This library is distributed in the hope that it will be useful, but
** WITHOUT ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF
** MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.  The software and
** documentation provided hereunder is on an "as is" basis, and
** Memorial Sloan-Kettering Cancer Center 
** has no obligations to provide maintenance, support,
** updates, enhancements or modifications.  In no event shall
** Memorial Sloan-Kettering Cancer Center
** be liable to any party for direct, indirect, special,
** incidental or consequential damages, including lost profits, arising
** out of the use of this software and its documentation, even if
** Memorial Sloan-Kettering Cancer Center 
** has been advised of the possibility of such damage.  See
** the GNU Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public License
** along with this library; if not, write to the Free Software Foundation,
** Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
**/

package org.mskcc.cbio.cgds.scripts;

import static com.google.common.base.Preconditions.checkNotNull;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.mskcc.cbio.cgds.dao.DaoException;
import org.mskcc.cbio.cgds.dao.DaoUniProtIdMapping;
import org.mskcc.cbio.cgds.util.ConsoleUtil;
import org.mskcc.cbio.cgds.util.FileUtil;
import org.mskcc.cbio.cgds.util.ProgressMonitor;
import org.mskcc.cbio.portal.remote.ConnectionManager;

/**
 * Import data into the uniprot_id_mapping table.
 *
 * <p>The input is a tab-delimited text file whose first column is an Entrez gene id
 * (integer) and whose second column is a UniProt id. For every Entrez gene id exactly
 * one UniProt id is selected and handed to {@link DaoUniProtIdMapping}:
 * <ul>
 *   <li>ids that appear in the list of reviewed human accessions downloaded from
 *       uniprot.org take precedence over all other ids of the same gene;</li>
 *   <li>if several candidates remain, the one with the longest FASTA sequence (as
 *       served by uniprot.org) is chosen.</li>
 * </ul>
 *
 * <p>NOTE(review): the uniprot.org URLs used here belong to the legacy UniProt web
 * API; confirm that they are still served before relying on this importer.
 */
public final class ImportUniProtIdMapping {
    private final File uniProtIdMapping;
    private final ProgressMonitor progressMonitor;

    /**
     * Creates an importer for the given mapping file.
     *
     * @param uniProtIdMapping tab-delimited mapping file, must not be null
     * @param progressMonitor monitor that is incremented once per line read, must not be null
     * @throws NullPointerException if either argument is null
     */
    public ImportUniProtIdMapping(final File uniProtIdMapping, final ProgressMonitor progressMonitor) {
        checkNotNull(uniProtIdMapping, "uniProtIdMapping must not be null");
        checkNotNull(progressMonitor, "progressMonitor must not be null");
        this.uniProtIdMapping = uniProtIdMapping;
        this.progressMonitor = progressMonitor;
    }

    /**
     * Reads the mapping file, selects one UniProt id per Entrez gene id and stores the
     * selected pairs through {@link DaoUniProtIdMapping#addUniProtIdMapping}.
     *
     * <p>Blank lines are skipped. Any other line must contain at least two tab-separated
     * columns with an integer in the first one; otherwise the unchecked exception raised
     * by the parsing code propagates to the caller.
     *
     * @throws DaoException if storing a mapping fails
     * @throws IOException if the file cannot be read or a request to uniprot.org fails
     */
    public void importData() throws DaoException, IOException {
        Set<String> swissProtAccs = getSwissProtAccessionHuman();
        Map<Integer, Set<String>> mapEntrezSwissProt = new HashMap<Integer, Set<String>>();
        Map<Integer, Set<String>> mapEntrezUniprot = new HashMap<Integer, Set<String>>();

        // Opened before the try so the finally block never sees a null reader.
        BufferedReader reader = new BufferedReader(new FileReader(uniProtIdMapping));
        try {
            // readLine() returning null is the only reliable end-of-file signal;
            // ready() merely reports whether a read would block.
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.trim().length() > 0) {
                    String[] tokens = line.split("\t");
                    int entrezGeneId = Integer.parseInt(tokens[0]);
                    String uniProtId = tokens[1];
                    if (swissProtAccs.contains(uniProtId)) {
                        addToGroup(mapEntrezSwissProt, entrezGeneId, uniProtId);
                    } else {
                        addToGroup(mapEntrezUniprot, entrezGeneId, uniProtId);
                    }
                }
                // Progress counts every line read, including skipped blank ones.
                progressMonitor.incrementCurValue();
                ConsoleUtil.showProgress(progressMonitor);
            }
        } finally {
            try {
                reader.close();
            } catch (IOException ignored) {
                // Best-effort close: the data has already been read (or a more
                // relevant exception is already propagating).
            }
        }

        // Reviewed ids win: drop the non-reviewed candidates of every gene that has
        // at least one reviewed id, then merge the reviewed candidates in.
        mapEntrezUniprot.keySet().removeAll(mapEntrezSwissProt.keySet());
        mapEntrezUniprot.putAll(mapEntrezSwissProt);

        int rows = 0;
        for (Map.Entry<Integer, Set<String>> entry : mapEntrezUniprot.entrySet()) {
            int entrezGeneId = entry.getKey();
            String uniprot = pickOneUniprot(entry.getValue());
            if (uniprot != null) {
                // The DAO's return value is accumulated as the saved-row count.
                rows += DaoUniProtIdMapping.addUniProtIdMapping(entrezGeneId, uniprot);
            }
        }
        System.out.println("Total number of uniprot id mappings saved: " + rows);
    }

    /** Adds {@code uniProtId} to the id set of {@code entrezGeneId}, creating the set on first use. */
    private static void addToGroup(Map<Integer, Set<String>> groups, int entrezGeneId, String uniProtId) {
        Set<String> ids = groups.get(entrezGeneId);
        if (ids == null) {
            ids = new HashSet<String>();
            groups.put(entrezGeneId, ids);
        }
        ids.add(uniProtId);
    }

    /**
     * Selects a single id out of the candidates of one gene.
     *
     * <p>A sole candidate is returned as is, without any network access. Otherwise the
     * candidate with the longest sequence wins; candidates of equal length are ordered
     * lexicographically so that the result does not depend on set iteration order.
     *
     * @param uniprotIds candidate ids of one gene
     * @return the selected id, or null if no candidate has a sequence of positive length
     * @throws IOException if a sequence cannot be downloaded
     */
    private String pickOneUniprot(Set<String> uniprotIds) throws IOException {
        if (uniprotIds.size() == 1) {
            return uniprotIds.iterator().next();
        }

        int maxLength = 0;
        String ret = null;
        for (String id : uniprotIds) {
            int len = getLengthOfUniprotEntry(id);
            boolean longer = len > maxLength;
            boolean tieWithSmallerId = len == maxLength && ret != null && id.compareTo(ret) < 0;
            if (longer || tieWithSmallerId) {
                ret = id;
                maxLength = len;
            }
        }

        return ret;
    }

    /**
     * Downloads the FASTA record of a UniProt entry and returns its sequence length.
     *
     * @param uniprotId id of the entry
     * @return total number of characters on all lines after the header line, or 0 if the
     *         response is empty or does not start with a {@code >} header line
     * @throws IOException if the request fails or the server does not answer with 200 OK
     */
    private int getLengthOfUniprotEntry(String uniprotId) throws IOException {
        String strURL = "http://www.uniprot.org/uniprot/" + uniprotId + ".fasta";
        GetMethod method = new GetMethod(strURL);

        try {
            BufferedReader bufReader = openResponseReader(method, strURL);
            String line = bufReader.readLine();
            if (line == null || !line.startsWith(">")) {
                return 0;
            }

            int len = 0;
            for (line = bufReader.readLine(); line != null; line = bufReader.readLine()) {
                len += line.length();
            }
            return len;
        } finally {
            //  Must release connection back to Apache Commons Connection Pool
            method.releaseConnection();
        }
    }

    /**
     * Downloads the list of accessions that uniprot.org returns for the query
     * {@code taxonomy:human AND reviewed:yes}, one accession per line.
     *
     * @return the accessions, trimmed, without blank entries
     * @throws IOException if the request fails or the server does not answer with 200 OK
     */
    private Set<String> getSwissProtAccessionHuman() throws IOException {
        String strURL = "http://www.uniprot.org/uniprot/?query="
                + "taxonomy%3ahuman+AND+reviewed%3ayes&force=yes&format=list";
        GetMethod method = new GetMethod(strURL);

        try {
            BufferedReader bufReader = openResponseReader(method, strURL);
            Set<String> accs = new HashSet<String>();
            for (String line = bufReader.readLine(); line != null; line = bufReader.readLine()) {
                // Stray whitespace would silently break the contains() lookup later on.
                String acc = line.trim();
                if (acc.length() > 0) {
                    accs.add(acc);
                }
            }
            return accs;
        } finally {
            //  Must release connection back to Apache Commons Connection Pool
            method.releaseConnection();
        }
    }

    /**
     * Executes {@code method} and returns a reader over the response body.
     *
     * <p>The caller remains responsible for calling {@code method.releaseConnection()},
     * also when this method throws.
     *
     * @param method the GET request to execute
     * @param strURL the requested URL, used in the error message only
     * @return reader over the response body
     * @throws HttpException if the status code is not 200 OK
     * @throws IOException if the request fails on the transport level
     */
    private static BufferedReader openResponseReader(GetMethod method, String strURL) throws IOException {
        MultiThreadedHttpConnectionManager connectionManager = ConnectionManager.getConnectionManager();
        HttpClient client = new HttpClient(connectionManager);
        int statusCode = client.executeMethod(method);
        if (statusCode != HttpStatus.SC_OK) {
            throw new HttpException(
                    statusCode + ": " + HttpStatus.getStatusText(statusCode) + " Base URL:  " + strURL);
        }
        return new BufferedReader(new InputStreamReader(method.getResponseBodyAsStream()));
    }

    /**
     * Command line entry point.
     *
     * @param args {@code args[0]} is the path of the mapping file; with no arguments the
     *        usage text is printed and the JVM exits with status 1
     */
    public static void main(final String[] args) {
        if (args.length < 1) {
            System.out.println("command line usage: importUniProtIdMapping.pl <uniprot_id_mapping.txt>");
            System.exit(1);
        }
        ProgressMonitor progressMonitor = new ProgressMonitor();
        progressMonitor.setConsoleMode(true);
        try {
            File uniProtIdMapping = new File(args[0]);
            System.out.println("Reading uniprot id mappings from:  " + uniProtIdMapping.getAbsolutePath());
            // The line count is used as the upper bound of the progress monitor.
            int lines = FileUtil.getNumLines(uniProtIdMapping);
            System.out.println(" --> total number of lines:  " + lines);
            progressMonitor.setMaxValue(lines);
            ImportUniProtIdMapping importUniProtIdMapping = new ImportUniProtIdMapping(uniProtIdMapping,
                    progressMonitor);
            importUniProtIdMapping.importData();
        } catch (DaoException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            ConsoleUtil.showWarnings(progressMonitor);
            System.err.println("Done.");
        }
    }
}