opendap.metacat.DDXRetriever.java Source code

Java tutorial

Introduction

Here is the source code for opendap.metacat.DDXRetriever.java

Source

/////////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2010 OPeNDAP, Inc.
// Author: James Gallagher <jgallagher@opendap.org>
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
// You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
/////////////////////////////////////////////////////////////////////////////

package opendap.metacat;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Date;
import java.util.Enumeration;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Use this with crawlers to manage a collection of DDX responses and the URLs
 * that reference them. The backing {@link ResponseCachePostgres} stores, for
 * each DDX URL, the time the URL was last visited and (optionally) the DDX
 * text itself, so this class can cache just the URLs or both the URLs and the
 * matching responses. {@link #getDDXDoc(String)} uses those stored times to
 * perform conditional GETs, which makes the cache behave like a simple
 * HTTP/1.1 cache.
 *
 * @author jimg
 */
public class DDXRetriever {

    private static final Logger log = LoggerFactory.getLogger(DDXRetriever.class);

    /** When true, the methods of this class never update or save the cache. */
    private final boolean readOnly;

    /**
     * The cache that holds both the last-visited time and the XML/text of
     * each DDX. The DDX URLs are the keys.
     */
    private final ResponseCachePostgres ddxCache;

    /**
     * Build a retriever that only reads from the named cache.
     *
     * @param cacheName Prefix used to name the cache
     * @throws Exception If the underlying cache cannot be created
     */
    public DDXRetriever(String cacheName) throws Exception {
        this(true, cacheName);
    }

    /**
     * Build a retriever on top of the cache named {@code namePrefix + "_DDX"}.
     *
     * @param readOnly If true, fetched responses are returned but never
     * written to the cache
     * @param namePrefix Prefix used to name the cache
     * @throws Exception If the underlying cache cannot be created
     */
    public DDXRetriever(boolean readOnly, String namePrefix) throws Exception {
        this.readOnly = readOnly;
        this.ddxCache = new ResponseCachePostgres(readOnly, namePrefix + "_DDX", "ddx_responses");
    }

    /**
     * Command line driver. With {@code --ddx-url} the DDX is fetched (using
     * the cache for a conditional GET) and printed; without it, every DDX URL
     * in the named cache is printed, along with the cached DDX text when
     * {@code --verbose} is given. Unless {@code --read-only} is given the
     * cache is saved before exiting.
     *
     * This 'main' has many conflicting features/options. I've changed the
     * function of this class considerably since it was written.
     *
     * @param args Command line arguments; {@code --cache-name} is required
     */
    public static void main(String[] args) {
        // create the command line parser
        CommandLineParser parser = new PosixParser();

        // create the Options
        Options options = new Options();

        // The default action is to read from the net, checking the cache and
        // print the document to standard output.
        options.addOption("v", "verbose", false, "Be verbose");
        options.addOption("r", "read-only", false, "Only read from the cache; no updates");
        options.addOption("n", "cache-name", true, "Use this to set a prefix for the cache name.");
        options.addOption("d", "ddx-url", true,
                "Get and print the DDX using the referenced URL. If this is not given, print all of the DDX URLs in the named cache.");

        try {
            // parse the command line arguments
            CommandLine line = parser.parse(options, args);

            boolean verbose = line.hasOption("verbose");

            String ddxURL = line.getOptionValue("ddx-url");
            boolean haveDdxUrl = ddxURL != null && !ddxURL.isEmpty();
            if (verbose && haveDdxUrl) {
                System.out.println("DDX URL: " + ddxURL);
            }

            // getOptionValue() returns null when the option is absent, so
            // both the missing and the empty case must be rejected here;
            // otherwise the cache would be opened under the name "null_DDX".
            String cacheName = line.getOptionValue("cache-name");
            if (cacheName == null || cacheName.isEmpty()) {
                throw new IllegalArgumentException("--cache-name is required.");
            }

            if (verbose) {
                System.out.println("cacheName: " + cacheName);
            }

            boolean readOnly = line.hasOption("read-only");

            DDXRetriever retriever = new DDXRetriever(readOnly, cacheName);

            if (haveDdxUrl) {
                System.out.println("DDX: " + retriever.getDDXDoc(ddxURL));
            } else {
                Enumeration<String> ddxs = retriever.getCachedDDXURLs();
                int count = 0;
                while (ddxs.hasMoreElements()) {
                    ++count;
                    String cachedURL = ddxs.nextElement();
                    System.out.println("DDX URL: " + cachedURL);
                    if (verbose) {
                        System.out.println("DDX: " + retriever.getCachedDDXDoc(cachedURL));
                    }
                }
                System.out.println("Found " + count + " URLs");
            }

            // Save the cache if not read-only
            if (!readOnly) {
                retriever.saveDDXCache();
            }

        } catch (ParseException exp) {
            System.err.println("Unexpected exception:" + exp.getMessage());
        } catch (Exception e) {
            System.err.println("Error : " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Simple method to test if the DDX will parse. Generally there's no
     * need to call this but it'll be useful when developing the crawler.
     *
     * Note: this method must be called by client code; it is not used by
     * any of the methods here.
     *
     * @param ddxString The DDX to test
     * @return true if the DDX parses; false if it is null or if the parser
     * throws an exception
     */
    public boolean isWellFormedDDX(String ddxString) {
        if (ddxString == null) {
            return false;
        }

        try {
            // Parse the characters directly. Going through getBytes() would
            // re-encode the text with the platform default charset, which
            // may not match the encoding the parser assumes for a byte
            // stream and can make a good document look malformed.
            // NOTE(review): DDX text comes from remote servers; consider
            // restricting DTD/external entity processing on this builder.
            org.jdom.input.SAXBuilder sb = new org.jdom.input.SAXBuilder();
            sb.build(new java.io.StringReader(ddxString));
            return true;
        } catch (Exception e) {
            log.debug("DDX did not parse: {}", e.getMessage());
            return false;
        }
    }

    /**
     * Read all of the given stream, decoded as UTF-8, into a String. The
     * text is read a line at a time and each line is appended followed by a
     * single "\n", so the result always ends with a newline when the stream
     * held any lines. The stream is closed before this method returns.
     *
     * @param is The stream to read; may be null
     * @return The text read from the stream, or "" if the stream is null
     * @throws IOException If the stream cannot be read or closed
     */
    private String convertStreamToString(InputStream is) throws IOException {
        if (is == null) {
            return "";
        }

        StringBuilder sb = new StringBuilder();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\n");
            }
        } finally {
            is.close();
        }

        return sb.toString();
    }

    /**
     * Given a URL to a DDX, get the DDX document. This uses a poor man's
     * HTTP/1.1 cache based on last-visited times: if the cache holds both a
     * last-visited time and a response for the URL, a conditional GET
     * (If-Modified-Since) is sent and, on a 304 reply, the cached response
     * is returned.
     *
     * Unless this retriever is read-only, a 200 reply stores the new
     * response and last-visited time in the cache and a 304 reply updates
     * the last-visited time, so calling this on a series of DDX URLs will
     * fill the cache. If the cache is saved and later used again it is
     * possible to re-read the URLs straight from the cache.
     *
     * @see #getCachedDDXDoc(String)
     * @param DDXURL Get the DDX referenced by this URL
     * @return The DDX document, in a String, or null if the server replied
     * with a status other than 200 or 304 (the status is logged)
     * @throws Exception If the URL is not an HTTP URL, or on I/O or cache
     * errors
     */
    public String getDDXDoc(String DDXURL) throws Exception {
        URLConnection connection = new URL(DDXURL).openConnection();

        // Reject non-HTTP URLs before any network or file access happens.
        if (!(connection instanceof HttpURLConnection)) {
            throw new MalformedURLException("Expected a HTTP URL (" + DDXURL + ").");
        }
        HttpURLConnection httpConnection = (HttpURLConnection) connection;

        // Only ask for a conditional GET when there is a cached response to
        // fall back on if the server answers 304.
        long lastVisited = ddxCache.getLastVisited(DDXURL);
        if (lastVisited != 0 && ddxCache.getCachedResponse(DDXURL) != null) {
            httpConnection.setIfModifiedSince(lastVisited);
        }

        // Here's where we'd poke in a header to ask for the DAP3.2 DDX

        httpConnection.connect();

        String ddx = null;
        int code = httpConnection.getResponseCode();

        // If we have something, process. Since a conditional get was
        // used, the response might be empty (code == 304) and the cached
        // copy is used in that case.
        switch (code) {
        case HttpURLConnection.HTTP_OK:
            ddx = convertStreamToString(httpConnection.getInputStream());
            // Update the last visited and document caches
            if (!readOnly) {
                ddxCache.setLastVisited(DDXURL, System.currentTimeMillis());
                ddxCache.setCachedResponse(DDXURL, ddx);
            }
            break;

        case HttpURLConnection.HTTP_NOT_MODIFIED:
            ddx = ddxCache.getCachedResponse(DDXURL);
            if (!readOnly) {
                // Update the last visited cache to now
                ddxCache.setLastVisited(DDXURL, System.currentTimeMillis());
            }
            break;

        default:
            log.error("Expected a 200 or 304 HTTP return code. Got: {}", code);
            break;
        }

        return ddx;
    }

    /**
     * Record the given DDX URL in the cache, with the current time as its
     * last-visited time. No response text is stored. This is done whether
     * or not this retriever is read-only.
     *
     * @param ddxUrl The DDX URL to record
     * @throws Exception If the cache cannot be updated
     */
    public void cacheDDXURL(String ddxUrl) throws Exception {
        ddxCache.setLastVisited(ddxUrl, System.currentTimeMillis());
    }

    /**
     * Get the DDX URLs that have a last-visited time in the cache.
     *
     * @return An Enumeration of the cached DDX URLs
     */
    public Enumeration<String> getCachedDDXURLs() {
        return ddxCache.getLastVisitedKeys();
    }

    /**
     * Get the DDX for the given URL straight from the cache, without
     * contacting the server.
     *
     * @param DDXURL The DDX URL to look up
     * @return The response the cache holds for the URL
     * @throws Exception If the cache cannot be read
     */
    public String getCachedDDXDoc(String DDXURL) throws Exception {
        return ddxCache.getCachedResponse(DDXURL);
    }

    /**
     * Get the last-visited time the cache holds for the given DDX URL.
     *
     * @param DDXURL The DDX URL to look up
     * @return The last-visited time recorded for the URL
     * @throws Exception If the cache cannot be read
     */
    public long getCachedDDXLMT(String DDXURL) throws Exception {
        return ddxCache.getLastVisited(DDXURL);
    }

    /**
     * Save the state of the cache.
     *
     * @throws Exception If this retriever is read-only, or if the cache
     * cannot be saved
     */
    public void saveDDXCache() throws Exception {
        if (readOnly) {
            throw new IllegalStateException("I was asked to save a read-only cache.");
        }
        ddxCache.saveState();
    }
}