nl.ru.cmbi.pisa.wrapper.PisaCachedWebDao.java Source code

Introduction

Here is the source code for nl.ru.cmbi.pisa.wrapper.PisaCachedWebDao.java
Source

/**
 * Copyright 2010 CMBI (contact: <Gerrit.Vriend@radboudumc.nl>)
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package nl.ru.cmbi.pisa.wrapper;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import lombok.Cleanup;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.Setter;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Wrapper around EBI's PISA webservice that caches retrieved results locally.
 * <p>
 * This class supplies a java interface to the PISA RESTful interface described at the <a
 * href="https://www.ebi.ac.uk/msd-srv/prot_int/pi_download.html">PISA download page</a>, adding some level of
 * abstraction and error handling.
 * <p>
 * To avoid overloading the EBI-service, the returned result pages are stored locally for future reference. This cache
 * does not expire automatically (yet), so it should be maintained manually; the commands listed here may be
 * useful.
 * <p>
 * <h1>useful (linux) commands to maintain the cache:</h1>
 * - Clean up all error-message cache-files, and empty directories resulting from that clean-up.<br>
 * # find ./ -size -90c | xargs rm ; find -empty | xargs rmdir; find -empty | xargs rmdir<br>
 * <p>
 * - find all assembly files that have an error message other than "unavailable (out of range)"<br>
 * # find ./ -name *.pdb -size -200c | xargs cat | grep -v ^$ | grep -v "unavailable (out of range)"<br>
 * 
 * @author jkerssem
 */
//@Service
@EqualsAndHashCode(of = { "cacheDir", "pisaBaseUrl" })
public class PisaCachedWebDao {
    /** the obligatory logger */
    private static final Logger log = LoggerFactory.getLogger(PisaCachedWebDao.class);

    /**
     * Convenience constant: the URL the PISA REST-service runs at, at the EBI:<br/>
     * currently: {@value}
     */
    public static final String defaultPisaBaseUrl = "http://www.ebi.ac.uk/pdbe/pisa/cgi-bin/";

    /**
     * Convenience constant: a default local directory where the PISA cache is maintained<br>
     * currently: {@value}
     */
    public static final String defaultPisaCacheDir = "/data/PisaCache/";

    // Cache-specific variables;
    /**
     * disk-location of the cache, must be a directory; <code>null</code> means caching is disabled.
     * <p>
     * It should not be changed after initialisation, or you 'lose' the existing cache.
     * <p>
     * It is used together with the {@link #multimerFileSuffix}, {@link #assemblyFileSuffix} and
     * {@link #interfaceFileSuffix} to generate the actual file paths for the cache.
     */
    @Getter
    private File cacheDir;
    /** File-name suffix of cached multimer XML files. */
    private static final String multimerFileSuffix = "_multimer.xml";
    /** File-name suffix of cached interface XML files. */
    private static final String interfaceFileSuffix = "_interface.xml";
    /** File-name suffix of cached assembly coordinate (pdb-format) files. */
    private static final String assemblyFileSuffix = "_assembly.pdb";

    // PISA-web URLs
    /**
     * The base URL where the pisa scripts are hosted.
     * <p>
     * From this are derived the {@link #multimerBaseUrl}, {@link #interfaceBaseUrl} and {@link #assemblyBaseUrl}. Set
     * it to <code>null</code> to disable webfetching altogether.
     */
    @Getter
    private String pisaBaseUrl;
    /** {@link #pisaBaseUrl} + "multimers.pisa?", or <code>null</code> when webfetching is disabled. */
    @Getter
    private String multimerBaseUrl;
    /** {@link #pisaBaseUrl} + "interfaces.pisa?", or <code>null</code> when webfetching is disabled. */
    @Getter
    private String interfaceBaseUrl;
    /** {@link #pisaBaseUrl} + "multimer.pdb?", or <code>null</code> when webfetching is disabled. */
    @Getter
    private String assemblyBaseUrl;

    /**
     * Amount of pdbIDs to get in one webrequest, defaults to 50.
     * <p>
     * PISA suggests between 20 to 50 to avoid overloading the server.
     */
    @Getter
    @Setter
    private int batchSize = 50;

    /**
     * Amount of milliseconds to wait between sending out web-requests, defaults to 1000.
     * <p>
     * This is the throttle delay to ensure PISA servers are not overloaded. Setting this too low/fast will cause PISA
     * to silently drop further requests made to it. A good indication is 1000 milliseconds.
     */
    @Getter
    @Setter
    private int webFetchThrottleMillis = 1000;

    /**
     * Internal tracker for time of last web-request.
     * 
     * @see #webFetchThrottleMillis
     * @see #throttledOpenStream(URL)
     */
    private long lastWebRequestTime = 0;

    /**
     * parser for PISA XML files, stored here and re-used to avoid instantiation overhead; created lazily on first
     * use, so it is <code>null</code> until then
     */
    private SAXReader saxReader;

    /*
     * Constructors & other housekeeping ===============================================================================
     */

    /**
     * Constructor that loads its configuration from the default "pisa-wrapper.properties" classpath resource.
     * <p>
     * Recognised keys:
     * <ul>
     * <li><code>pisaurl</code> (required): base URL of the PISA scripts
     * <li><code>cachedir</code> (required): path of the cache directory
     * <li><code>webfetchdelay</code> (optional, default 1000): throttle delay in milliseconds
     * <li><code>webfetchbatchsize</code> (optional, default 50): number of pdb-ids per web-request
     * </ul>
     * Unparsable optional values are replaced by their default, with a warning in the log.
     * 
     * @throws IllegalStateException
     *             if the properties resource is missing or unreadable, a required key is absent, or the cache
     *             directory cannot be initialised
     */
    public PisaCachedWebDao() {
        try {
            // Load properties specified in pisa-wrapper.properties
            final Properties props = new Properties();
            final InputStream inStream = this.getClass().getResourceAsStream("/pisa-wrapper.properties");
            if (inStream == null) {
                // getResourceAsStream signals "not found" with null; fail with a clear message instead of an NPE
                throw new IllegalStateException(
                        "Couldn't instantiate pisa-wrapper: resource \"/pisa-wrapper.properties\" not found");
            }
            try {
                props.load(inStream);
            } finally {
                // release the resource stream regardless of load() succeeding
                inStream.close();
            }

            final String pisaUrl = props.getProperty("pisaurl");
            final String cacheDirPath = props.getProperty("cachedir");
            if (pisaUrl == null || cacheDirPath == null) {
                throw new IllegalStateException("Couldn't instantiate pisa-wrapper: "
                        + "properties \"pisaurl\" and \"cachedir\" are both required in pisa-wrapper.properties");
            }

            initURLs(pisaUrl.trim());
            initCache(new File(cacheDirPath.trim()));

            String webdelayString = null;
            try {
                webdelayString = props.getProperty("webfetchdelay", "1000").trim();
                setWebFetchThrottleMillis(Integer.parseInt(webdelayString));
            } catch (final NumberFormatException nfe) {
                setWebFetchThrottleMillis(1000);
                log.warn("Couldn't parse webfetchdelay parameter from config file: \"{}\"", webdelayString);
            }
            String webbatchString = null;
            try {
                webbatchString = props.getProperty("webfetchbatchsize", "50").trim();
                setBatchSize(Integer.parseInt(webbatchString));
            } catch (final NumberFormatException nfe) {
                log.warn("Couldn't parse webfetchbatchsize parameter from config file: \"{}\"", webbatchString);
                setBatchSize(50);
            }
        } catch (final IOException e) {
            // also covers the FileNotFoundException raised by initCache
            throw new IllegalStateException("Couldn't instantiate pisa-wrapper", e);
        }

        log.info("Created caching PISA DAO from properties; cache @ \"{}\", PISA @ \"{}\"",
                cacheDir.getAbsolutePath(), pisaBaseUrl);
    }

    /**
     * pass-through constructor for cache-location-as-string, see {@link #PisaCachedWebDao(String, File)}
     * 
     * @param pisaBaseUrl
     *            the base URL where PISA is listening, handed on unchanged
     * @param cacheDirPath
     *            path of the cache directory; it is wrapped in a {@link File}, so it must not be <code>null</code>
     *            ({@link File#File(String)} rejects a null path)
     * @throws FileNotFoundException
     *             propagated from {@link #PisaCachedWebDao(String, File)}
     */
    public PisaCachedWebDao(final String pisaBaseUrl, final String cacheDirPath) throws FileNotFoundException {
        this(pisaBaseUrl, new File(cacheDirPath));
    }

    /**
     * Constructor, builds the cache-dir and PISA URL's.
     * 
     * @param pisaBaseUrl
     *            the base URL where PISA is listening. This DAO will append the necessary parameters to
     *            get multimers, interfaces and assemblies. Set this to <code>null</code> to disable webfetching.
     * @param cacheDir
     *            the location of a readable/writable directory where this DAO will write its cache files.
     *            Set this to <code>null</code> to disable caching entirely.
     * @throws FileNotFoundException
     *             in case the provided cache-directory is unwritable or not a directory
     * @see #defaultPisaBaseUrl
     */
    public PisaCachedWebDao(final String pisaBaseUrl, final File cacheDir) throws FileNotFoundException {
        initURLs(pisaBaseUrl);
        initCache(cacheDir);

        // cacheDir may legitimately be null (caching disabled), so don't dereference it blindly for the log line
        log.info("Created caching PISA DAO; cache @ \"{}\", PISA @ \"{}\"",
                cacheDir == null ? null : cacheDir.getAbsolutePath(), pisaBaseUrl);
    }

    /**
     * Validates the requested cache location and, when acceptable, stores it in {@link #cacheDir}.
     * 
     * @param newCacheDir
     *            desired cache location, or <code>null</code> to run without a cache. A non-null value may be:
     *            <ul>
     *            <li>an existing, writable directory (empty or already holding a cache)
     *            <li>a path that does not exist yet; it is created, including missing parent directories
     *            </ul>
     * @throws FileNotFoundException
     *             if the location cannot be created, exists but is not a directory, or exists but is not writable
     */
    private void initCache(final File newCacheDir) throws FileNotFoundException {
        // Caching disabled: nothing to validate
        if (newCacheDir == null) {
            cacheDir = null;
            return;
        }

        if (newCacheDir.exists()) {
            if (!newCacheDir.isDirectory()) {
                // exists, but is something other than a directory
                log.error("Suggested PISA cache location {} exists, but is not a directory",
                        newCacheDir.getAbsolutePath());
                throw new FileNotFoundException("Suggested cache location exists, but is no directory");
            }
            if (!newCacheDir.canWrite()) {
                // a directory we are not allowed to write into
                log.error("Suggested cache directory {} exists, but is not writable",
                        newCacheDir.getAbsolutePath());
                throw new FileNotFoundException("Suggested cachedirectory exists, but is not writable");
            }
        } else {
            // nothing there yet: try to create the directory tree
            log.info("Creating PISA cache directory at \"{}\"", newCacheDir.getAbsolutePath());
            final boolean created = newCacheDir.mkdirs();
            if (!created) {
                log.error("Could not create PISA cache directory at {}", newCacheDir.getAbsolutePath());
                throw new FileNotFoundException("Could not create PISA cache directory");
            }
        }

        // Location passed every check
        cacheDir = newCacheDir;
    }

    /**
     * Sets {@link #pisaBaseUrl} and the three script URLs derived from it.
     * 
     * @param newPisaBaseUrl
     *            base URL of the PISA scripts, or <code>null</code> to disable webfetching (all four URL fields
     *            then become <code>null</code>)
     */
    private void initURLs(final String newPisaBaseUrl) {
        final boolean webFetchEnabled = newPisaBaseUrl != null;

        pisaBaseUrl = newPisaBaseUrl;
        // explicit null-check needed: string concatenation would otherwise yield "null..." URLs
        multimerBaseUrl = webFetchEnabled ? newPisaBaseUrl + "multimers.pisa?" : null;
        interfaceBaseUrl = webFetchEnabled ? newPisaBaseUrl + "interfaces.pisa?" : null;
        assemblyBaseUrl = webFetchEnabled ? newPisaBaseUrl + "multimer.pdb?" : null;
    }

    /**
     * @return a short description naming the PISA base URL and the cache location; the cache location is printed
     *         as <code>null</code> when caching is disabled
     */
    @Override
    public String toString() {
        // cacheDir is null when caching is disabled; guard against the resulting NullPointerException
        final String cachePath = (cacheDir == null) ? null : cacheDir.getAbsolutePath();
        return "PisaCachedWebDao{pisa@\"" + pisaBaseUrl + "\", cache@\"" + cachePath + "\"}";
    }

    /*
     * END Constructors ================================================================================================
     */

    /*
     * Data Methods ====================================================================================================
     */

    /**
     * Collects the PISA multimer XML for each requested PDB-id, keyed by that id.
     * <p>
     * Every id is first looked up in the local cache. Ids that are not cached are gathered and, when webfetching is
     * enabled, requested from PISA in batches; how long that takes depends on the number of missing ids and on
     * {@link #webFetchThrottleMillis} and {@link #batchSize}.<br>
     * When webfetching is disabled ({@link #pisaBaseUrl} is <code>null</code>) only the cached entries carry a value;
     * every other requested id maps to <code>null</code>.
     * <p>
     * Each requested id gets its map entry in the order of <code>pdbIDs</code>.
     * 
     * @param pdbIDs
     *            the PDB id's for which to get the multimer info.
     * @return a map from PDB-id to multimer XML; the requested ids are inserted in the order of <code>pdbIDs</code>.
     * @throws IOException
     *             in case web-fetching fails.
     */
    public LinkedHashMap<String, String> getRawMultimerInfoMap(final String... pdbIDs) throws IOException {
        final List<String> missingFromCache = new ArrayList<String>();
        final LinkedHashMap<String, String> results = new LinkedHashMap<String, String>();

        // First pass: serve what we can from cache, remember the rest.
        for (final String id : pdbIDs) {
            final String cached = getFromCache(fetchType.Multimer, id);
            // a null value acts as placeholder, keeping the output order equal to the input order
            results.put(id, cached);
            if (cached == null) {
                missingFromCache.add(id);
            }
        }

        // Second pass: unless webfetching is disabled, fill in the placeholders from the web.
        final boolean webFetchDisabled = (pisaBaseUrl == null);
        if (webFetchDisabled) {
            return results;
        }
        results.putAll(getMultipleFromWeb(fetchType.Multimer, missingFromCache));
        return results;
    }

    /**
     * Single-id convenience variant of {@link #getRawMultimerInfoMap(String...)}.
     * 
     * @param pdbID
     *            A pdb code
     * @return the XML-formatted PISA multimer result for that code, or <code>null</code> (with a warning in the
     *         log) when no result was obtained.
     * @throws IOException
     *             propagated from {@link #getRawMultimerInfoMap(String...)}
     */
    public String getRawMultimerInfoSingle(final String pdbID) throws IOException {
        final String xml = getRawMultimerInfoMap(pdbID).get(pdbID);
        if (null == xml) {
            log.warn("No multimer results for {}", pdbID);
        }
        return xml;
    }

    /**
     * Collects the PISA interface XML for each requested PDB-id, keyed by that id.
     * <p>
     * Every id is first looked up in the local cache. Ids that are not cached are gathered and, when webfetching is
     * enabled, requested from PISA in batches; how long that takes depends on the number of missing ids and on
     * {@link #webFetchThrottleMillis} and {@link #batchSize}.<br>
     * When webfetching is disabled ({@link #pisaBaseUrl} is <code>null</code>) only the cached entries carry a value;
     * every other requested id maps to <code>null</code>.
     * <p>
     * Each requested id gets its map entry in the order of <code>pdbIDs</code>.
     * 
     * @param pdbIDs
     *            the PDB id's for which to get the interface info.
     * @return a map from PDB-id to interface XML; the requested ids are inserted in the order of <code>pdbIDs</code>.
     * @throws IOException
     *             in case web-fetching fails.
     */
    public LinkedHashMap<String, String> getRawInterfaceInfoMap(final String... pdbIDs) throws IOException {
        final List<String> missingFromCache = new ArrayList<String>();
        final LinkedHashMap<String, String> results = new LinkedHashMap<String, String>();

        // First pass: serve what we can from cache, remember the rest.
        for (final String id : pdbIDs) {
            final String cached = getFromCache(fetchType.Interface, id);
            // a null value acts as placeholder, keeping the output order equal to the input order
            results.put(id, cached);
            if (cached == null) {
                missingFromCache.add(id);
            }
        }

        // Second pass: unless webfetching is disabled, fill in the placeholders from the web.
        final boolean webFetchDisabled = (pisaBaseUrl == null);
        if (webFetchDisabled) {
            return results;
        }
        results.putAll(getMultipleFromWeb(fetchType.Interface, missingFromCache));
        return results;
    }

    /**
     * Single-id convenience variant of {@link #getRawInterfaceInfoMap(String...)}.
     * 
     * @param pdbID
     *            A pdb code
     * @return the XML-formatted PISA interface result for that code, or <code>null</code> (with a warning in the
     *         log) when no result was obtained.
     * @throws IOException
     *             propagated from {@link #getRawInterfaceInfoMap(String...)}
     */
    public String getRawInterfaceInfoSingle(final String pdbID) throws IOException {
        final String xml = getRawInterfaceInfoMap(pdbID).get(pdbID);
        if (null == xml) {
            log.warn("No interface results for {}", pdbID);
        }
        return xml;
    }

    /**
     * Fetches the pdb-format coordinates file for the specified assembly.
     * <p>
     * First, the entry is looked up in cache, if present, the cached version is served, if not present and webfetching
     * is enabled, the file is fetched from web and returned (and put to cache too). If webfetching is disabled and the
     * entry was not in cache, <code>null</code> is returned.
     * <p>
     * PISA reports problems in-band, as a body starting with " *** ". Such bodies are never returned: they yield
     * <code>null</code>, with a warning in the log unless the message merely says the assembly does not exist.
     * 
     * @param rawPdbId
     *            The pdb id (case insensitive); it is trimmed and lower-cased before use
     * @param setNr
     *            the pisa major number indicating the assembly set
     * @param assemblyNr
     *            the pisa minor number indicating the assembly subgroup
     * @return pdb-style ATOM-coordinates, or <code>null</code> if the assembly does not exist/could not be fetched
     *         from web
     * @throws IOException
     *             when something goes awry reading from cache or from web.
     */
    public String getAssembly(final String rawPdbId, final int setNr, final int assemblyNr) throws IOException {
        // Locale.ROOT: lower-casing must not depend on the JVM default locale (e.g. Turkish maps "I" to dotless i)
        final String pdbId = rawPdbId.trim().toLowerCase(Locale.ROOT);

        String pisaResult = getFromCacheAssembly(pdbId, setNr, assemblyNr);
        if (null == pisaResult) { // not in cache..
            if (null == pisaBaseUrl) { // web-fetching disabled
                return null;
            }
            // get from web.
            try {
                pisaResult = getAssemblyFromWeb(pdbId, setNr, assemblyNr);
            } catch (final IOException ioex) {
                log.warn("Problem getting PISA assembly for {}", pdbId);
                throw new IOException("Problem getting PISA assembly", ioex);
            }
        }

        if (null == pisaResult) {
            // defensive: the web-fetch yielded nothing, so there is no content to inspect
            return null;
        }

        // Check if output indicates errors (meaning either "no such assembly", or actual error)
        if (pisaResult.startsWith(" *** ")) {
            final String message = pisaResult.trim();
            final boolean assemblyAbsent = message.endsWith("unavailable (out of range)")
                    || message.endsWith("not found in PISA database");
            if (!assemblyAbsent) {
                // actual error, probably URL formatting..
                log.warn("No PISA assembly found, PISA says: \"{}\"", pisaResult.substring(5).trim());
            }
            // either way there is no assembly to hand out
            return null;
        }

        return pisaResult;
    }

    /**
     * Gets the 3D rotation-translation biomolecule matrices from PISA.
     * <p>
     * These matrices detail how chains in the specified PDB entry should be manipulated to obtain the desired PISA
     * assembly.
     * 
     * @param rawPdbId
     *            the 4-letter PDB code we want the assembly of (e.g. "1crn"); it is trimmed and lower-cased before
     *            use
     * @param setNr
     *            the PISA assembly set. "1" is most stable in solution, higher numbers increasingly unstable.
     * @param assemblyNr
     *            the substructure within an assembly. Note that structurally identical substructures get the
     *            same assemblyNr. This DAO ignores all but the first substructure. Ignored chains (those chains not
     *            needed to make this assembly) are listed in the {@link MatricesResult#ignoredChains} property of the
     *            returned result.
     * @return A compound result of the transformation matrices to apply and the ignored chains (I.E. those
     *         chains whose transformation lead to an identical substructure to the one returned). This result is
     *         empty when no multimer information is available for the entry or nothing matches the requested
     *         assembly.
     * @throws IOException
     *             when web-fetching fails, or the multimer XML is malformed.
     */
    @SuppressWarnings("unchecked")
    public MatricesResult getMatrices(final String rawPdbId, final int setNr, final int assemblyNr)
            throws IOException {

        // Locale.ROOT: lower-casing must not depend on the JVM default locale (e.g. Turkish maps "I" to dotless i)
        final String pdbId = rawPdbId.trim().toLowerCase(Locale.ROOT);
        Document mmrInfo;
        try {
            mmrInfo = parseDocumentOf(getRawMultimerInfoSingle(pdbId));
        } catch (final DocumentException docex) {
            log.error("Problem parsing Pisa raw multimer info for {}, see debug level for details", pdbId);
            log.debug("Exception: ", docex);
            throw new IOException("Problem parsing multimer XML document", docex);
        }

        final List<ChainTransform> output = new ArrayList<ChainTransform>();
        final Set<String> ignoredChainIds = new HashSet<String>();

        if (mmrInfo == null) {
            // No multimer XML could be obtained (parseDocumentOf maps null input to a null document):
            // hand back the documented empty result instead of failing with a NullPointerException below.
            log.debug("No multimer document for {}, returning empty matrices result", pdbId);
            return new MatricesResult(output, ignoredChainIds);
        }

        try {
            // Unfortunately, multiple assemblies share the same <ID> subnode if they are structurally identical
            // So, we need indexing ("[1]") to get only the first of an identical set of assemblies
            // ('identical' meaning "structurally identical except for ChainIDs")
            // Sigh... reusing IDs for different things!
            final String matricesXpath = String.format(
                    "(/pisa_multimers/pdb_entry[ pdb_code='%s' ]/asm_set[ ser_no=%d ]/assembly[ id=%d ])[1]/molecule",
                    pdbId, setNr, assemblyNr);

            final List<Node> matrices = mmrInfo.selectNodes(matricesXpath);

            for (final Node node : matrices) {
                final ChainTransform ct = new ChainTransform(node.valueOf("chain_id"),
                        // X-row
                        (Double) node.numberValueOf("rxx"), (Double) node.numberValueOf("rxy"),
                        (Double) node.numberValueOf("rxz"), (Double) node.numberValueOf("tx"),
                        // Y-row
                        (Double) node.numberValueOf("ryx"), (Double) node.numberValueOf("ryy"),
                        (Double) node.numberValueOf("ryz"), (Double) node.numberValueOf("ty"),
                        // Z-row
                        (Double) node.numberValueOf("rzx"), (Double) node.numberValueOf("rzy"),
                        (Double) node.numberValueOf("rzz"), (Double) node.numberValueOf("tz"));
                // only a transform that is both unity and stationary is NOT flagged as a duplication
                ct.setDuplication(!(ct.isUnity() && ct.isStationary()));

                output.add(ct);
            }

            // every assembly after the first one with this id is skipped; report its chains as ignored
            final String ignoredMatricesXpath = String.format(
                    "(/pisa_multimers/pdb_entry[ pdb_code='%s' ]/asm_set[ ser_no=%d ]/assembly[ id=%d ])[position() > 1]/molecule/chain_id",
                    pdbId, setNr, assemblyNr);
            final List<Node> ignoredChains = mmrInfo.selectNodes(ignoredMatricesXpath);

            for (final Node ignoredChain : ignoredChains) {
                ignoredChainIds.add(ignoredChain.getText());
            }

        } catch (final ClassCastException ccex) {
            log.error(
                    "malformed XML response from PISA when getting matrices for assembly {}:{}.{}: expected list of matrix nodes, but obtained something else",
                    new Object[] { pdbId, setNr, assemblyNr });
            throw new IOException("malformed XML response from PISA when getting matrices for assembly of " + pdbId
                    + ": expected list of matrix nodes, but obtained something else", ccex);
        }

        return new MatricesResult(output, ignoredChainIds);
    }

    /**
     * Parses a raw XML string with the shared, lazily created {@link #saxReader}.
     * 
     * @param rawXml
     *            A string containing an XML file, may be <code>null</code>
     * @return the parsed {@link Document}, or <code>null</code> if rawXml is <code>null</code>.
     * @throws DocumentException
     *             whenever parsing goes wrong.
     */
    private Document parseDocumentOf(final String rawXml) throws DocumentException {
        if (null == rawXml) {
            log.debug("Attempt to read 'null' xml");
            return null;
        }

        // create the parser on first use only, later calls re-use it
        if (null == saxReader) {
            saxReader = new SAXReader();
        }

        return saxReader.read(new StringReader(rawXml));
    }

    /**
     * The two kinds of PISA XML information that are fetched (and cached) per PDB entry: multimer info and
     * interface info.
     */
    private enum fetchType {
        Multimer, Interface
    }

    /**
     * Maps a fetch type onto the URL prefix of the matching PISA script.
     * 
     * @param type
     *            the kind of information wanted; must not be <code>null</code> (the switch below would throw a
     *            {@link NullPointerException})
     * @return {@link #multimerBaseUrl} or {@link #interfaceBaseUrl}; these are <code>null</code> while webfetching
     *         is disabled
     */
    private String getBaseUrlFor(final fetchType type) {
        final String baseUrl;
        switch (type) {
        case Interface:
            baseUrl = interfaceBaseUrl;
            break;
        case Multimer:
            baseUrl = multimerBaseUrl;
            break;
        default:
            // not reachable with the current two constants; kept as fallback for future additions
            baseUrl = null;
            break;
        }
        return baseUrl;
    }

    /**
     * Retrieves multiple Pisa entries, in batches with delays to not-stress the PISA servers.
     * <p>
     * Each batch response is split per "&lt;pdb_entry&gt;" block; every block is wrapped in the response's own
     * header (root-tag line and status line) and footer, written to cache and added to the result under the
     * accession found in its "&lt;pdb_code&gt;" line. Blocks without a recognisable accession are skipped with a
     * warning, as are batches whose response is empty.
     * 
     * @param type
     *            To indicate if you want multimer info or interface info.
     * @param pdbIds
     *            the list of PDB id's for which you want info, the order is maintained in the output.
     * @return a map, keyed by pdb-id as reported by PISA, containing the PISA XML file for that ID.
     *         (iteration order follows the order of the entries in the responses, through use of
     *         {@link LinkedHashMap})
     *         or an empty {@link Map} if web-fetch is disabled ({@link #pisaBaseUrl} set to <code>null</code>)
     * @throws IOException
     *             whenever webstream reading goes wrong.
     * @see #webFetchThrottleMillis
     * @see #batchSize
     */
    private LinkedHashMap<String, String> getMultipleFromWeb(final fetchType type, final List<String> pdbIds)
            throws IOException {
        if (null == pisaBaseUrl) {
            return new LinkedHashMap<String, String>(0);
        }

        // linked map to preserve iteration-order of the fetched entries
        final LinkedHashMap<String, String> output = new LinkedHashMap<String, String>();

        final String baseUrl = getBaseUrlFor(type);

        // The batchSize setter accepts any int: a value of 0 would make this loop spin forever (firing requests
        // at PISA each round) and a negative value would break subList(), so always advance by at least one id.
        final int step = Math.max(1, batchSize);

        // loop through PDB-ids, processing them in <step>-sized blocks
        final int size = pdbIds.size();
        for (int i = 0; i < size; i += step) {
            // make a url for a subset of the pdb-IDs
            final List<String> subBlock = pdbIds.subList(i, Math.min(size, i + step));
            final URL fetchUrl = new URL(baseUrl + Util.join(subBlock, ","));

            // read the subset XML results
            log.debug("Getting {}-block from web: {}", type, subBlock);
            @Cleanup
            final BufferedReader pisaResult = new BufferedReader(throttledOpenStream(fetchUrl));

            // Get xml header and footer.
            final String rootLine = pisaResult.readLine();
            final String statusLine = (rootLine == null) ? null : pisaResult.readLine();
            if (statusLine == null) {
                // response ended before any entry could follow: nothing to split, move on to the next batch
                log.warn("Empty or truncated {}-response from PISA for {}", type, subBlock);
                continue;
            }
            // the footer is the root-tag line turned into its closing tag
            final String footer = (rootLine + "\n").replace("<", "</");
            final String header = rootLine + "\n" + statusLine + "\n";

            // get XML-result contents, split them per "<pdb_entry>"-block
            String line;
            String pdbAc = null;
            StringBuilder buf = new StringBuilder();
            while ((line = pisaResult.readLine()) != null) {

                // encountered start of entry, start a new buffer with header
                if ("  <pdb_entry>".equals(line)) {
                    buf = new StringBuilder();
                    pdbAc = null;
                    buf.append(header);
                }
                // trap accession codes we encounter (4 characters directly after the 14-character opening tag)
                if (line.startsWith("    <pdb_code>") && line.length() >= 18) {
                    pdbAc = line.substring(14, 18);
                }

                // store line
                buf.append(line);
                buf.append('\n');

                // at end of an entry, process it
                if ("  </pdb_entry>".equals(line)) {
                    if (pdbAc == null) {
                        // without accession there is no cache-file name nor map key for this entry
                        log.warn("Skipping {}-entry without pdb_code in PISA response for {}", type, subBlock);
                        continue;
                    }
                    buf.append(footer);
                    final String fullXml = buf.toString();
                    putToCache(fullXml, getCacheFileFor(type, pdbAc));
                    output.put(pdbAc, fullXml);
                }
            }
        }
        return output;
    }

    /**
     * Downloads one assembly from the PISA website and stores the response in the cache.
     * 
     * @param pdbId
     *            PDB accession to get assembly for
     * @param setNr
     *            Assembly set identifier
     * @param assemblyNr
     *            Assembly sub-structure identifier
     * @return the raw response body as sent by PISA,
     *         or <code>null</code> if webfetching is disabled ({@link #pisaBaseUrl} set to <code>null</code>)
     * @throws IOException
     *             if the assembly URL is invalid, or opening/reading the web-connection fails.
     */
    private String getAssemblyFromWeb(final String pdbId, final int setNr, final int assemblyNr)
            throws IOException {
        if (pisaBaseUrl == null) {
            return null;
        }

        log.debug("Fetching assembly {}:{},{} from web.", new Object[] { pdbId, setNr, assemblyNr });

        try {
            // request format: <assemblyBaseUrl><pdbId>:<setNr>,<assemblyNr>
            final URL asmUrl = new URL(assemblyBaseUrl + pdbId + ":" + setNr + "," + assemblyNr);
            log.debug("Reading assembly from {}", asmUrl);

            final String pisaResult = Util.readAllFrom(throttledOpenStream(asmUrl));
            putToCache(pisaResult, getCacheFileForAssembly(pdbId, setNr, assemblyNr));
            return pisaResult;
        } catch (final MalformedURLException muex) {
            throw new IOException("Couldn't read PISA Assembly from web: URL invalid", muex);
        } catch (final IOException ioex) {
            throw new IOException("Couldn't read PISA Assembly from web: Problem reading stream", ioex);
        }
    }

    /**
     * Looks up the cached multimer or interface XML of one entry.
     * <p>
     * Read problems are not propagated: they are logged (warning, with details at debug level) and treated as a
     * cache miss.
     * 
     * @param type
     *            what type of info should be gotten from cache: interface or multimer information?
     * @param pdbId
     *            for which accession should info be gotten from cache?
     * @return The cache contents, or null if caching is disabled, the entry is not cached (yet), or its cache file
     *         could not be read.
     */
    private String getFromCache(final fetchType type, final String pdbId) {
        if (null == cacheDir) {
            return null;
        }

        final File cacheFile = getCacheFileFor(type, pdbId);
        if (!cacheFile.exists()) {
            return null;
        }

        log.trace("{} info for {} found in cache", type, pdbId);
        try {
            return Util.readAllFrom(new FileReader(cacheFile));
        } catch (final IOException ioex) {
            // problem reading from cache
            log.warn("Couldn't read {}-cachefile for {}, see debug level for details", type, pdbId);
            log.debug("Exception details:", ioex);
            return null;
        }
    }

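    /**
     * Reads the cached assembly for an entry, if it is present in the cache.
     *
     * @param pdbId
     *            as for {@link #getAssembly(String, int, int)}
     * @param setNr
     *            as for {@link #getAssembly(String, int, int)}
     * @param assemblyNr
     *            as for {@link #getAssembly(String, int, int)}
     * @return The cache contents, or null if there was no cache.
     *         (Either because caching was disabled, because this specific entry wasn't present,
     *         or because the cache file could not be read; the latter is logged, not thrown)
     */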
    private String getFromCacheAssembly(final String pdbId, final int setNr, final int assemblyNr) {
        // No cache directory configured: caching is switched off.
        if (cacheDir == null) {
            return null;
        }

        final File cachedAssembly = getCacheFileForAssembly(pdbId, setNr, assemblyNr);
        if (!cachedAssembly.exists()) {
            // This assembly has not been stored in the cache.
            return null;
        }

        log.debug("Assembly for {} found in cache", pdbId);
        try {
            return Util.readAllFrom(new FileReader(cachedAssembly));
        } catch (final IOException readFailure) {
            // An unreadable cache file is handled like a missing one: report it and return null.
            log.warn("Couldn't read assembly cachefile for {}, see debug level for details", pdbId);
            log.debug("Exception details:", readFailure);
            return null;
        }
    }

    /**
     * Puts the raw PISA info to the specified cache file.
     * <p>
     * NB: this swallows any {@link IOException} thrown during the writing of said cachefile.<br>
     * Reasoning:<br>
     * A cache is non-essential, so exceptions aren't that bad. Log statements ARE emitted at Warn-level when this
     * happens. The full exception stack-trace is available at Debug log-level
     * 
     * @param pisaContent
     *            The pisa information to store in cache
     * @param cacheFile
     *            The cache location, should be obtained from {@link #getInterfaceFile(String)},
     *            {@link #getMultimerFile(String)} or {@link #getCacheFileForAssembly(String, int, int)}
     */
    private void putToCache(final String pisaContent, final File cacheFile) {
        // A null cacheDir means caching is disabled (the getFromCache* methods treat it that way).
        // Callers hand in a File that was built on top of cacheDir; with a null parent that File is
        // resolved against the working directory, so without this guard the content would still be
        // written to disk even though caching is off.
        if (cacheDir == null) {
            return;
        }

        // Store web-result in cache
        try {
            // The mkdirs() result is not checked: if the directory is missing afterwards,
            // opening the writer below fails with an IOException, which is handled here.
            cacheFile.getParentFile().mkdirs();
            log.trace("Writing cache file {}", cacheFile.getAbsolutePath());
            // @Cleanup closes (and thereby flushes) the writer when this scope ends, also when
            // write() throws, so no explicit close() call is needed.
            @Cleanup
            final FileWriter writer = new FileWriter(cacheFile);
            writer.write(pisaContent);
        } catch (final IOException ioex) {
            // not writing to cache is inconvenient, but not lethal.
            // Log it, but don't let it fail the entire get-operation
            log.warn("Couldn't write cache file {}", cacheFile.getAbsolutePath());
            log.debug("Exception: ", ioex);
        }
    }

    /**
     * Gets the base cache directory for the entry with this PDB-id; the directory may not exist yet.
     */
    private File getCacheBaseDirFor(final String pdbId) {
        // Entries are grouped in a sub-directory named after the 2nd and 3rd character of the id,
        // with one directory per entry below that.
        final String groupDirName = pdbId.substring(1, 3);
        final String relativeEntryPath = groupDirName + File.separator + pdbId + File.separator;
        return new File(cacheDir, relativeEntryPath);
    }

    /**
     * Gets the file-path for an entry's cache file. File may not exist yet.
     * 
     * <p>
     * This gets the file reference to what may be the cache-file for the specified interface or multimer
     * (determined by type parameter). If the file.exists(), then the entry is in cache, otherwise it doesn't exist in
     * cache, but should be stored to this location after fetching.
     */
    private File getCacheFileFor(final fetchType type, final String rawPdbId) {
        // Normalise the id with a fixed locale. The no-argument toLowerCase() uses the JVM default
        // locale, and under e.g. a Turkish locale 'I' is lower-cased to a dotless i, which would
        // give ids containing an 'I' a different cache path than on other machines.
        final String pdbId = rawPdbId.trim().toLowerCase(java.util.Locale.ROOT);

        switch (type) {
        case Multimer:
            return new File(getCacheBaseDirFor(pdbId), pdbId + multimerFileSuffix);
        case Interface:
            return new File(getCacheBaseDirFor(pdbId), pdbId + interfaceFileSuffix);
        default:
            // Reached only for a fetchType constant that has no case above; the caller gets null.
            // A null type never arrives here: switching on null throws a NullPointerException.
            return null;
        }
    }

    /**
     * Gets the file-path for an entry's assembly file. File may not exist yet.
     * 
     * <p>
     * This gets the file reference to what may be the cache-file for the specified assembly. If the file.exists(),
     * then the entry is in cache, otherwise it doesn't exist in cache, but should be stored to this location after
     * fetching.
     * 
     * @param rawPdbId
     *            as for {@link #getAssembly(String, int, int)}
     * @param setNr
     *            as for {@link #getAssembly(String, int, int)}
     * @param assemblyNr
     *            as for {@link #getAssembly(String, int, int)}
     * @return the file reference, whether it exists or not.
     * @see #getAssembly(String, int, int)
     * @see #getCacheFileFor(fetchType, String)
     */
    private File getCacheFileForAssembly(final String rawPdbId, final int setNr, final int assemblyNr) {
        // Normalise the id with a fixed locale. The no-argument toLowerCase() uses the JVM default
        // locale, and under e.g. a Turkish locale 'I' is lower-cased to a dotless i, which would
        // give ids containing an 'I' a different cache path than on other machines.
        final String pdbId = rawPdbId.trim().toLowerCase(java.util.Locale.ROOT);

        // File name layout: <pdbId>_<setNr>_<assemblyNr><assemblyFileSuffix>
        final String fileName = pdbId + "_" + Integer.toString(setNr) + "_" + Integer.toString(assemblyNr)
                + assemblyFileSuffix;
        return new File(getCacheBaseDirFor(pdbId), fileName);
    }

    /**
     * Opens a stream to the URL, delaying so that only one request
     * is made per {@link #webFetchThrottleMillis} milliseconds.
     * 
     * @see #throttleDelay()
     * @see #webFetchThrottleMillis
     * @see #lastWebRequestTime
     */
    private InputStreamReader throttledOpenStream(final URL fetchUrl) throws IOException {
        // Wait out whatever remains of the throttle window before touching the network.
        throttleDelay();
        // Stamp the request time before opening, so a failed open still counts as a request.
        lastWebRequestTime = System.currentTimeMillis();

        log.trace("Opening URL stream: {}", fetchUrl);
        final InputStream rawStream = fetchUrl.openStream();
        return new InputStreamReader(rawStream);
    }

    /**
     * Delays execution if last web request was less than {@link #webFetchThrottleMillis} milliseconds ago.
     */
    private void throttleDelay() {
        final long currentTime = System.currentTimeMillis();
        final long earliestAllowed = lastWebRequestTime + webFetchThrottleMillis;

        // The throttle window has already passed: no need to wait.
        if (currentTime >= earliestAllowed) {
            return;
        }

        // Sleep for exactly the part of the window that is still left.
        final long remainingMillis = earliestAllowed - currentTime;
        Util.delay(remainingMillis);
    }
}