edu.scripps.fl.pubchem.web.session.PCWebSession.java Source code

Introduction

Here is the source code for edu.scripps.fl.pubchem.web.session.PCWebSession.java
Source

/*
 * Copyright 2011 The Scripps Research Institute
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.scripps.fl.pubchem.web.session;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.client.utils.URLEncodedUtils;
import org.apache.http.entity.mime.MultipartEntity;
import org.dom4j.Document;
import org.dom4j.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.csvreader.CsvReader;

import edu.scripps.fl.dom4j.util.AppendingVisitorSupport;
import edu.scripps.fl.pubchem.web.PCBioActivityAssaySummary;
import edu.scripps.fl.pubchem.web.PCBioActivityCompoundSummary;
import edu.scripps.fl.pubchem.web.PCOutcomeCounts;
import edu.scripps.fl.util.CollectionUtils;

public class PCWebSession extends WebSessionBase {

    private static final Logger log = LoggerFactory.getLogger(PCWebSession.class);
    public static final String SITE = "pubchem.ncbi.nlm.nih.gov";

    /**
     * Follows PubChem "request id" pages until a page without a request id is
     * reached, then checks that final page for a reported error.
     * <p>
     * While the current page contains a <code>reqid=&lt;digits&gt;</code> token
     * (preceded by a non-word character), the constructor sleeps two seconds
     * and fetches <code>http://SITE/assay/assay.cgi?reqid=...</code>, replacing
     * the current page with the response. There is no upper bound on the number
     * of polls.
     */
    protected class WaitOnRequestId {
        private final Pattern errorPattern = Pattern.compile("<pre>Error: Caught CException: (.+)</pre>");
        // NOTE(review): the logger category is PCDepositionSystemSession, not
        // this class -- looks like a copy/paste leftover; confirm before
        // changing, since logging configuration may depend on the name.
        private final Logger log = LoggerFactory.getLogger(PCDepositionSystemSession.class);
        private String page;
        private final Pattern reqIdPattern = Pattern.compile("\\W(reqid=\\d+)");

        /**
         * Polls starting from an already fetched page.
         * 
         * @param page
         *            the initial page content
         * @throws Exception
         *             if fetching a follow-up page fails, the thread is
         *             interrupted while sleeping, or the final page contains a
         *             PubChem error message (the final page is then saved to a
         *             temporary file named in the exception message)
         */
        public WaitOnRequestId(String page) throws Exception {
            this.page = page;
            Matcher reqIdMatcher = reqIdPattern.matcher(this.page);
            while (reqIdMatcher.find()) {
                String reqid = reqIdMatcher.group(1);
                Thread.sleep(2000);
                String url = "http://" + SITE + "/assay/assay.cgi?" + reqid;
                log.info("found requestId in page. Next uri: {}", url);
                this.page = getPage(url);
                reqIdMatcher = reqIdPattern.matcher(this.page);
            }

            // The constructor parameter shadows the field, so the error check
            // must read this.page (the final polled page). Checking the bare
            // name would inspect the initial page and miss errors reported
            // after polling.
            Matcher errorMatcher = errorPattern.matcher(this.page);
            if (errorMatcher.find()) {
                File file = File.createTempFile(getClass().getName(), ".html");
                FileUtils.writeStringToFile(file, this.page);
                String error = String.format("PubChem problem: %s. Please check %s", errorMatcher.group(1), file);
                throw new Exception(error);
            }
        }

        /**
         * Fetches the given URI and then polls as in
         * {@link #WaitOnRequestId(String)}.
         * 
         * @param uri
         *            the URI of the initial page
         * @throws Exception
         *             see {@link #WaitOnRequestId(String)}
         */
        public WaitOnRequestId(URI uri) throws Exception {
            this(getPage(uri));
        }

        /**
         * Parses the final page with <code>getDocumentFromHtml</code>.
         * 
         * @return the final page as a dom4j document
         * @throws Exception
         *             if parsing fails
         */
        public Document asDocument() throws Exception {
            // NOTE(review): getBytes() uses the platform default charset;
            // confirm that matches what getDocumentFromHtml expects.
            return getDocumentFromHtml(new ByteArrayInputStream(page.getBytes()));
        }

        /**
         * @return the final page content as text
         */
        public String asPage() {
            return page;
        }
    }

    /**
     * Fetches the result data of an assay as CSV.
     * <p>
     * Delegates to {@link #getAssayFile(String, String, int)} with export type
     * <code>alldata</code> and extension <code>csv</code>.
     * 
     * @param aid
     *            the PubChem assay id
     * @return the stream produced by <code>getAssayFile</code>
     * @throws Exception
     *             if <code>getAssayFile</code> fails
     */
    public InputStream getAssayCSV(int aid) throws Exception {
        final String exportType = "alldata";
        final String extension = "csv";
        return getAssayFile(exportType, extension, aid);
    }

    /**
     * Requests an assay export from <code>/assay/assay.cgi</code>, waits for
     * the request to complete and opens the FTP link found on the final page.
     * 
     * @param format
     *            value sent as the <code>exptype</code> query parameter
     * @param ext
     *            not used by this implementation
     * @param aid
     *            the PubChem assay id
     * @return the stream returned by <code>getFtpLinkAsStream</code>
     * @throws Exception
     *             if the request, the polling or the FTP lookup fails
     */
    protected InputStream getAssayFile(String format, String ext, int aid) throws Exception {
        List<NameValuePair> params = addParameters(new ArrayList<NameValuePair>(), "aid", aid, "q", "r", "exptype",
                format, "zip", "gzip", "resultsummary", "detail", "releasehold", "t");
        String query = URLEncodedUtils.format(params, "UTF-8");
        URI uri = URIUtils.createURI("http", SITE, 80, "/assay/assay.cgi", query, null);
        WaitOnRequestId waiter = new WaitOnRequestId(uri);
        return getFtpLinkAsStream(waiter.asDocument());
    }

    /**
     * Fetches the description XML of an assay.
     * <p>
     * Issues a GET on <code>/assay/assay.cgi</code> with
     * <code>q=expdesc_xmldisplay</code> and returns the response body as-is.
     * 
     * @param aid
     *            the PubChem assay id
     * @return the content stream of the HTTP response entity
     * @throws Exception
     *             if building the URI or executing the request fails
     */
    public InputStream getDescrXML(int aid) throws Exception {
        // No version parameter is sent; without it the latest version is
        // returned anyway. Example with an explicit version:
        // http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=2551&version=1.1&q=expdesc_xmldisplay
        List<NameValuePair> params = addParameters(new ArrayList<NameValuePair>(), "aid", aid, "q",
                "expdesc_xmldisplay");
        String query = URLEncodedUtils.format(params, "UTF-8");
        URI uri = URIUtils.createURI("http", SITE, 80, "/assay/assay.cgi", query, null);
        HttpGet request = new HttpGet(uri);
        return getHttpClient().execute(request).getEntity().getContent();
    }

    /**
     * Downloads the substances of an assay as an SDF file.<br>
     * It only works if on-hold SIDs are from your own center. If you submit
     * AIDs for other centers, you cannot download their SIDs!
     * <p>
     * Steps: load the data table page, extract the "Substance Structure" link
     * from its JavaScript, load that page, post its first form to
     * <code>/pc_fetch/pc_fetch.cgi</code>, wait for the request to complete and
     * open the FTP link on the final page.
     * 
     * @param aid
     *            the PubChem assay id
     * @return the stream returned by <code>getFtpLinkAsStream</code>
     * @throws Exception
     *             if any of the page requests or lookups fails
     */
    @SuppressWarnings("unchecked")
    public InputStream getAssaySDF(int aid) throws Exception {
        String structureUrl = getLinkFromJavaScript(getDataTablePage(aid), "Substance Structure");
        Document formPage = getDocument(structureUrl);
        Node form = formPage.selectSingleNode("//form[1]");
        MultipartEntity parts = new MultipartEntity();
        // "savejob" is passed to addFormParts as a set of names
        // (presumably form fields to leave out -- confirm in WebSessionBase).
        addFormParts(form, parts, new HashSet(Arrays.asList("savejob")));
        WaitOnRequestId waiter = new WaitOnRequestId(postPage("http://" + SITE + "/pc_fetch/pc_fetch.cgi", parts));
        return getFtpLinkAsStream(waiter.asDocument());
    }

    /**
     * Returns the SIDs that are present in an AID.<br>
     * This is a convenience method. It downloads the assay CSV via
     * {@link #getAssayCSV(int)} (the link on the DataTable page could not be
     * made to work) and reads the SID from the second column of every data
     * row.
     * 
     * @param aid
     *            the PubChem assay id
     * @return the SIDs in file order; empty if the CSV has no data rows
     * @throws Exception
     *             if the download fails or a row does not carry a numeric SID
     *             in its second column
     */
    public List<Long> getAssaySIDs(int aid) throws Exception {
        // Deliberately simple: no CSV library, just a split on commas.
        BufferedReader br = new BufferedReader(new InputStreamReader(getAssayCSV(aid)));
        try {
            List<Long> list = new ArrayList<Long>();
            br.readLine(); // skip the header line
            String line;
            while (null != (line = br.readLine())) {
                if (line.trim().length() == 0)
                    continue; // tolerate blank lines, e.g. at the end of the file
                String[] items = line.split(",");
                list.add(Long.parseLong(items[1])); // SID in 2nd column
            }
            return list;
        } finally {
            // Always release the download stream, also when parsing fails.
            br.close();
        }
    }

    /**
     * Returns the SIDs of an AID whose CSV row is marked <code>Active</code>.
     * <p>
     * Downloads the assay CSV via {@link #getAssayCSV(int)}, skips the header
     * line and, for every row whose sixth column equals <code>Active</code>,
     * collects the SID from the second column. Rows with fewer than six
     * columns are ignored.
     * 
     * @param aid
     *            the PubChem assay id
     * @return the active SIDs in file order; empty if there are none
     * @throws Exception
     *             if the download fails or an active row does not carry a
     *             numeric SID in its second column
     */
    public List<Long> getActiveSIDs(int aid) throws Exception {
        BufferedReader br = new BufferedReader(new InputStreamReader(getAssayCSV(aid)));
        try {
            List<Long> list = new ArrayList<Long>();
            br.readLine(); // skip the header line
            String line;
            while (null != (line = br.readLine())) {
                String[] items = line.split(",");
                // Guard against blank or truncated rows instead of failing
                // with an ArrayIndexOutOfBoundsException.
                if (items.length > 5 && items[5].equals("Active"))
                    list.add(Long.valueOf(Long.parseLong(items[1])));
            }
            return list;
        } finally {
            // Always release the download stream, also when parsing fails.
            br.close();
        }
    }

    /**
     * Fetches the full XML export of an assay.
     * <p>
     * Delegates to {@link #getAssayFile(String, String, int)} with both the
     * export type and the extension set to <code>xml</code>.
     * 
     * @param aid
     *            the PubChem assay id
     * @return the stream produced by <code>getAssayFile</code>
     * @throws Exception
     *             if <code>getAssayFile</code> fails
     */
    public InputStream getAssayXML(int aid) throws Exception {
        final String xml = "xml";
        return getAssayFile(xml, xml, aid);
    }

    /**
     * Loads the data table page of an assay from <code>/assay/assay.cgi</code>
     * and waits until PubChem has finished processing the request.
     * 
     * @param aid
     *            the PubChem assay id
     * @return the final page content as text
     * @throws Exception
     *             if the request or the polling fails
     */
    protected String getDataTablePage(int aid) throws Exception {
        List<NameValuePair> params = addParameters(new ArrayList<NameValuePair>(), "aid", aid, "q", "r", "pagefrom",
                "BioAssaySummary", "acount", "0", "releasehold", "t", "activity" + aid, "");
        String query = URLEncodedUtils.format(params, "UTF-8");
        URI uri = URIUtils.createURI("http", SITE, 80, "/assay/assay.cgi", query, null);
        WaitOnRequestId waiter = new WaitOnRequestId(uri);
        return waiter.asPage();
    }

    /**
     * Locates the FTP download link in a PubChem result page and opens it as a
     * gzip-decompressing stream.
     * <p>
     * If no link is found, the document is written to a temporary file and an
     * exception naming that file is thrown.
     * 
     * @param document
     *            the parsed PubChem result page
     * @return a {@link GZIPInputStream} over the content of the linked URL
     * @throws Exception
     *             if the page has no matching link, or the link cannot be
     *             opened or is not gzip data
     */
    protected InputStream getFtpLinkAsStream(Document document) throws Exception {
        // NOTE(review): this XPath compares @href with the boolean result of
        // starts-with(., 'ftp://'), i.e. it tests the link *text*, not the
        // href value. "//a[starts-with(@href,'ftp://')]" may be what was
        // intended -- confirm against real PubChem pages before changing.
        Node linkNode = document.selectSingleNode("//a[@href=starts-with(.,'ftp://')]");
        if (null != linkNode) {
            String url = linkNode.valueOf("@href");
            log.info("Found ftp link: " + url);
            InputStream raw = new URL(url).openStream();
            boolean wrapped = false;
            try {
                InputStream gzip = new GZIPInputStream(raw);
                wrapped = true;
                return gzip;
            } finally {
                // The GZIP header is read in the constructor; if that fails,
                // do not leak the underlying connection.
                if (!wrapped)
                    raw.close();
            }
        }
        File file = File.createTempFile(getClass().getName(), ".html");
        FileWriter writer = new FileWriter(file);
        try {
            document.write(writer);
        } finally {
            // Closing flushes the writer; without it the diagnostic file
            // referenced in the exception could be empty or truncated.
            writer.close();
        }
        throw new Exception("Received a PubChem web page without an ftp link. Please check " + file);
    }

    /**
     * Extracts a link from a JavaScript array literal embedded in a page.
     * <p>
     * Searches for <code>["&lt;linkName&gt;", "&lt;url&gt;</code> and returns
     * the url part, i.e. everything up to the next double quote. If nothing
     * matches, the page is saved to a temporary file and an exception naming
     * that file is thrown.
     * 
     * @param page
     *            the page content to search
     * @param linkName
     *            the literal label of the link; matched verbatim, not as a
     *            regular expression
     * @return the url that follows the label
     * @throws Exception
     *             if the label is not found on the page
     */
    protected String getLinkFromJavaScript(String page, String linkName) throws Exception {
        // Quote the label so characters such as '(' or '.' in a link name are
        // matched literally instead of being interpreted as regex syntax.
        Pattern pattern = Pattern.compile("\\[\"" + Pattern.quote(linkName) + "\"\\s*,\\s*\"([^\\\"]+)");
        Matcher matcher = pattern.matcher(page);
        if (!matcher.find()) {
            File file = File.createTempFile(getClass().getName(), ".html");
            FileUtils.writeStringToFile(file, page);
            String error = "Could not determine " + linkName + " link on PubChem page. Please check " + file;
            throw new Exception(error);
        }
        String url = matcher.group(1);
        log.info("Found javascript link: " + url);
        return url;
    }

    /**
     * Downloads the SDF record of a single compound or substance.
     * <p>
     * Issues a GET on <code>summary.cgi</code> with
     * <code>disopt=SaveSDF</code> and returns the response body as-is.
     * 
     * @param type
     *            "cid" or "sid"; used as the name of the query parameter
     * @param id
     *            the cid or sid
     * @return the content stream of the HTTP response entity
     * @throws Exception
     *             if executing the request fails
     */
    public InputStream getSDF(String type, int id) throws Exception {
        String template = "http://pubchem.ncbi.nlm.nih.gov/summary/summary.cgi?%s=%s&disopt=SaveSDF";
        HttpGet request = new HttpGet(String.format(template, type, id));
        return getHttpClient().execute(request).getEntity().getContent();
    }

    /**
     * Scrapes the substance outcome counts (all, active, inactive,
     * inconclusive, probe) from the summary page of an assay.
     * <p>
     * The text under the parent of <code>div#uptsub</code> is collected, the
     * part after "Tested Substances" is isolated, and every
     * <code>Name(123)</code> or <code>Name: 123</code> pair in it is mapped to
     * the matching field of the result.
     * 
     * @param aid
     *            the PubChem assay id
     * @return the counts, or <code>null</code> if the page has no
     *         <code>div#uptsub</code> element or no "Tested Substances" section
     * @throws Exception
     *             if the page cannot be loaded or parsed
     */
    public PCOutcomeCounts getSubstanceOutcomeCounts(int aid) throws Exception {
        Document doc = getDocument("http://" + SITE + "/assay/assay.cgi?aid=" + aid);
        Node anchor = doc.selectSingleNode("//div[@id='uptsub']");
        if (anchor == null)
            return null;

        // Flatten everything under the anchor's parent into plain text.
        AppendingVisitorSupport visitor = new AppendingVisitorSupport();
        anchor.getParent().accept(visitor);
        Matcher section = Pattern.compile("Tested Substances(.+)", Pattern.DOTALL | Pattern.MULTILINE)
                .matcher(visitor.getText());
        if (!section.find())
            return null;

        // Two layouts are handled: "Name(123)" and "Name: 123".
        String counts = section.group(1);
        String countRegex = counts.contains("(") ? "([\\w]+)\\((\\d+)\\)" : "([\\w]+):\\s+(\\d+)";
        Matcher entry = Pattern.compile(countRegex).matcher(counts);

        PCOutcomeCounts oc = new PCOutcomeCounts();
        while (entry.find()) {
            String outcome = entry.group(1);
            int count = Integer.parseInt(entry.group(2));
            if ("All".equalsIgnoreCase(outcome) || outcome.contains("All"))
                oc.all = count;
            else if ("Active".equalsIgnoreCase(outcome))
                oc.active = count;
            else if ("Inactive".equalsIgnoreCase(outcome))
                oc.inactive = count;
            else if ("Inconclusive".equalsIgnoreCase(outcome))
                oc.inconclusive = count;
            else if ("Probe".equalsIgnoreCase(outcome))
                oc.probe = count;
        }
        return oc;
    }

    /**
     * Requests the compound bioactivity summary CSV for the given CIDs from
     * <code>/assay/assaytool.cgi</code>, waits for the request to complete and
     * opens the FTP link found on the final page.
     * 
     * @param cids
     *            the compound ids, sent comma-separated in the
     *            <code>cid</code> parameter
     * @return the stream returned by <code>getFtpLinkAsStream</code>
     * @throws Exception
     *             if the request, the polling or the FTP lookup fails
     */
    protected InputStream getBioActivityCompoundSummaryAsStream(List<Long> cids) throws Exception {
        List<NameValuePair> params = addParameters(new ArrayList<NameValuePair>(), "cid",
                StringUtils.join(cids, ","), "q", "cmp", "exptype", "csv");
        String query = URLEncodedUtils.format(params, "UTF-8");
        URI uri = URIUtils.createURI("http", SITE, 80, "/assay/assaytool.cgi", query, null);
        WaitOnRequestId waiter = new WaitOnRequestId(uri);
        return getFtpLinkAsStream(waiter.asDocument());
    }

    /**
     * Returns the compound bioactivity summaries as a map built by
     * <code>CollectionUtils.toMap</code> on the <code>CID</code> property.
     * 
     * @param cids
     *            the compound ids to look up
     * @return the summaries from
     *         {@link #getBioActivityCompoundSummary(List)} as a map
     * @throws Exception
     *             if fetching or parsing the summaries fails
     */
    public Map<Long, PCBioActivityCompoundSummary> getBioActivityCompoundSummaryAsMap(List<Long> cids)
            throws Exception {
        return CollectionUtils.toMap("CID", getBioActivityCompoundSummary(cids));
    }

    /**
     * Downloads and parses the compound bioactivity summary CSV for the given
     * CIDs.
     * <p>
     * The header row is skipped; the first twelve columns of every record are
     * copied, in order, into the bean properties listed in the method body.
     * 
     * @param cids
     *            the compound ids to look up
     * @return one summary per CSV record, in file order
     * @throws Exception
     *             if the download fails, a record has fewer than twelve
     *             columns, or a value cannot be set on the bean
     */
    public List<PCBioActivityCompoundSummary> getBioActivityCompoundSummary(List<Long> cids) throws Exception {
        // Column i of the CSV is stored in property COLUMN_PROPERTIES[i].
        // The spelling of the names is kept as-is: they are bean property
        // names resolved at runtime.
        final String[] columnProperties = { "CID", "IUPAC", "synonyms", "bioAssayProbes", "bioAssayActives",
                "bioAssayTested", "activeContentrationBelow1uM", "activeContentrationBelow1nM", "activeProteins",
                "testedProteins", "activeContentrationLowerBound", "activeContentrationUpperBound" };

        List<PCBioActivityCompoundSummary> list = new ArrayList<PCBioActivityCompoundSummary>(cids.size());
        CsvReader reader = new CsvReader(new InputStreamReader(getBioActivityCompoundSummaryAsStream(cids)), ',');
        try {
            reader.readHeaders();
            while (reader.readRecord()) {
                String line[] = reader.getValues();
                PCBioActivityCompoundSummary sum = new PCBioActivityCompoundSummary();
                for (int col = 0; col < columnProperties.length; col++)
                    BeanUtils.setProperty(sum, columnProperties[col], line[col]);
                list.add(sum);
            }
            return list;
        } finally {
            // Release the reader and the download stream it wraps, also when
            // parsing fails.
            reader.close();
        }
    }

    /**
     * Requests the assay bioactivity summary CSV for the given CIDs from
     * <code>/assay/assay.cgi</code>, waits for the request to complete and
     * opens the FTP link found on the final page.
     * 
     * @param cids
     *            the compound ids, sent comma-separated in the
     *            <code>cid</code> parameter
     * @return the stream returned by <code>getFtpLinkAsStream</code>
     * @throws Exception
     *             if the request, the polling or the FTP lookup fails
     */
    protected InputStream getBioActivityAssaySummaryAsStream(List<Long> cids) throws Exception {
        List<NameValuePair> params = addParameters(new ArrayList<NameValuePair>(), "cid",
                StringUtils.join(cids, ","), "q", "cids", "exptype", "bioactivitycsv");
        String query = URLEncodedUtils.format(params, "UTF-8");
        URI uri = URIUtils.createURI("http", SITE, 80, "/assay/assay.cgi", query, null);
        WaitOnRequestId waiter = new WaitOnRequestId(uri);
        return getFtpLinkAsStream(waiter.asDocument());
    }

    /**
     * Returns the assay bioactivity summaries as a map built by
     * <code>CollectionUtils.toMap</code> on the <code>AID</code> property.
     * 
     * @param cids
     *            the compound ids to look up
     * @return the summaries from {@link #getBioActivityAssaySummary(List)} as
     *         a map
     * @throws Exception
     *             if fetching or parsing the summaries fails
     */
    public Map<Long, PCBioActivityAssaySummary> getBioActivityAssaySummaryAsMap(List<Long> cids) throws Exception {
        return CollectionUtils.toMap("AID", getBioActivityAssaySummary(cids));
    }

    /**
     * Downloads and parses the assay bioactivity summary CSV for the given
     * CIDs.
     * <p>
     * Only the first record after the header is read. Its values are consumed
     * in groups of ten columns, one group per assay; a trailing group with
     * fewer than ten values is ignored.
     * 
     * @param cids
     *            the compound ids to look up
     * @return one summary per complete ten-column group; empty if the file has
     *         no data record
     * @throws Exception
     *             if the download fails or a value cannot be set on the bean
     */
    public List<PCBioActivityAssaySummary> getBioActivityAssaySummary(List<Long> cids) throws Exception {
        final int columnsPerAssay = 10;
        List<PCBioActivityAssaySummary> list = new ArrayList<PCBioActivityAssaySummary>(cids.size());
        CsvReader reader = new CsvReader(new InputStreamReader(getBioActivityAssaySummaryAsStream(cids)), ',');
        try {
            reader.readHeaders();
            if (!reader.readRecord())
                return list; // header only, nothing to parse
            String line[] = reader.getValues();
            // csv parsing written this way b/c of pubchem pub (1 row csv,
            // 1000's columns!)
            for (int ii = 0; ii + columnsPerAssay <= line.length; ii += columnsPerAssay) {
                PCBioActivityAssaySummary act = new PCBioActivityAssaySummary();
                BeanUtils.setProperty(act, "AID", line[ii]);
                BeanUtils.setProperty(act, "probeCount", line[ii + 1]);
                BeanUtils.setProperty(act, "activeCount", line[ii + 2]);
                BeanUtils.setProperty(act, "inactiveCount", line[ii + 3]);
                BeanUtils.setProperty(act, "testedCount", line[ii + 4]);
                BeanUtils.setProperty(act, "activesLessThan1uM", line[ii + 5]);
                BeanUtils.setProperty(act, "activesLessThan1nM", line[ii + 6]);
                if (!"".equals(line[ii + 7])) {
                    String[] actRange = line[ii + 7].split(" - ");
                    // A cell holding a single value (no " - " separator) is
                    // treated as a range whose bounds coincide, instead of
                    // failing with an ArrayIndexOutOfBoundsException.
                    String min = actRange[0];
                    String max = actRange.length > 1 ? actRange[1] : actRange[0];
                    BeanUtils.setProperty(act, "activityConcentrationMin", min);
                    BeanUtils.setProperty(act, "activityConcentrationMax", max);
                }
                BeanUtils.setProperty(act, "bioAssayName", line[ii + 8]);
                BeanUtils.setProperty(act, "proteinTargetName", line[ii + 9]);

                list.add(act);
            }
            return list;
        } finally {
            // Release the reader and the download stream it wraps, also when
            // parsing fails.
            reader.close();
        }
    }
}