com.spinn3r.api.BaseClient.java Source code

Java tutorial

Introduction

Here is the source code for com.spinn3r.api.BaseClient.java

Source

/*
 * Copyright 2007 Tailrank, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License.  You may obtain a copy
 * of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 * For more information see:
 * 
 * <a href="http://tailrank.com">http://tailrank.com</a>
 * <a href="http://spinn3r.com">http://spinn3r.com</a>
 * <a href="http://feedblog.org">http://feedblog.org</a>
 */

package com.spinn3r.api;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import com.google.protobuf.CodedInputStream;
import com.spinn3r.api.Config.Format;
import com.spinn3r.api.protobuf.ContentApi;
import com.spinn3r.io.protostream.ProtoStreamDecoder;

/**
 * Generic client support that needs to be available in all APIs.
 *
 * See Main.java for usage example.
 * 
 * All implementations need to catch and handle Exceptions.
 *
 * <h2>Restarting</h2>
 *
 * When stopping/starting the API you need to persist an 'after' point where to
 * start the URL from again.  This is just the timestamp of the most recent URL
 * you found minus a buffer.
 *
 * 
 * 
 */
public abstract class BaseClient<ResultType extends BaseResult> implements Client<ResultType> {

    /**
     * Size limit, in bytes, applied to the protobuf CodedInputStream in
     * doProtobufFetch().  Deliberately non-final so it can be adjusted.
     */
    public static int PROTOBUF_SIZE_LIMIT = 256 * 1024 * 1024;

    /**
     * Name of the response header that states whether more results are
     * available; read in setMoreResults().
     */
    private static final String X_MORE_RESULTS = "X-More-Results";

    /**
     * Maximum number of retries performed by partialFetch() before the failure
     * is propagated to the caller.
     */
    public static final long RETRY_MAX = 1;

    /** Request header names set on every connection by getConnection(). */
    public static final String USER_AGENT_HEADER = "User-Agent";
    public static final String ACCEPT_ENCODING_HEADER = "Accept-Encoding";

    // Handler names.  They are not referenced inside this class.
    public static final String FEED_HANDLER = "feed";
    public static final String PERMALINK_HANDLER = "permalink3";
    public static final String COMMENT_HANDLER = "comment3";
    public static final String LINK_HANDLER = "link3";

    /**
     * Content encoding requested via Accept-Encoding and compared against the
     * response's Content-Encoding in completeFetch().
     */
    public static final String GZIP_ENCODING = "gzip";

    // User-Agent value sent with every request.  It is built with
    // String.format(), so Java 1.5 or later is required.
    //
    // TODO: include OS name, kernel version, etc.

    public static final String USER_AGENT = String.format("Spinn3r API Reference Client %s (Java %s, maxMemory=%s)",
            Config.DEFAULT_VERSION, System.getProperty("java.version"), Runtime.getRuntime().maxMemory());

    /**
     * Specified in java.security to indicate the caching policy for successful
     * name lookups from the name service.  The value is specified as an integer
     * to indicate the number of seconds to cache the successful lookup.
     *
     * sun.net.inetaddr.ttl:
     *
     * This is a Sun private system property which corresponds to
     * networkaddress.cache.ttl.  It takes the same value and has the same
     * meaning, but can be set as a command-line option.  However, the preferred
     * way is to use the security property mentioned above.
     *
     * A value of -1 indicates "cache forever".
     *
     * The value here is five minutes, expressed in seconds.
     */
    public static final int NETWORKADDRESS_CACHE_TTL = 5 * 60;

    /**
     * These properties specify the default connect and read timeout (resp.) for
     * the protocol handler used by java.net.URLConnection.
     *
     * sun.net.client.defaultConnectTimeout specifies the timeout (in
     * milliseconds) to establish the connection to the host.  For example, for
     * http connections it is the timeout when establishing the connection to
     * the http server.  For ftp connections it is the timeout when establishing
     * the connection to ftp servers.
     *
     * sun.net.client.defaultReadTimeout specifies the timeout (in milliseconds)
     * when reading from an input stream when a connection is established to a
     * resource.
     *
     * The value here is five minutes.  Note that getConnection() additionally
     * sets an explicit 20 second connect timeout on each connection it opens.
     */
    public static final int DEFAULT_CONNECT_TIMEOUT = 5 * 60 * 1000;

    /**
     * Default read timeout in milliseconds.
     *
     * NOTE(review): this was originally described as a *lower* read timeout
     * ("makes NO sense to wait for five minutes to read a byte from spinn3r"),
     * but the value is currently identical to DEFAULT_CONNECT_TIMEOUT, i.e.
     * five minutes.  Confirm which of the two is intended.
     */
    public static final int DEFAULT_READ_TIMEOUT = DEFAULT_CONNECT_TIMEOUT;

    /**
     * Specify the maximum number of redirects to use.
     */
    public static final int DEFAULT_MAX_REDIRECTS = 5;

    /**
     * Whether we should use HTTP Keep Alive in java.net.URL.  We default to
     * true here because MOST of our TCP connections WILL be used again since
     * everything is to spinn3r.com.
     */
    public static final boolean DEFAULT_HTTP_KEEPALIVE = true;

    /**
     * Keeps track of the number of connections that this client has used.
     * Incremented in addConnectionNumber() each time a vendor parameter is
     * rewritten; the increment is not synchronized.
     */
    private long connectionCount = 0;

    /**
     * Implemented by subclasses.  Not called anywhere inside this class.
     */
    abstract public boolean getIsCompressed();

    // **** fetching support ****************************************************

    /**
     * Runs a complete request in two phases: partialFetch() opens the
     * connection and reads the response headers, completeFetch() downloads and
     * parses the body.  The connection's input stream is closed afterwards
     * whether or not the second phase succeeded.
     *
     * @param config the request configuration.
     * @return the fully populated result.
     */
    public BaseClientResult<ResultType> fetch(Config<ResultType> config)
            throws IOException, ParseException, InterruptedException {

        final PartialBaseClientResult<ResultType> pending = partialFetch(config);

        try {
            BaseClientResult<ResultType> completed = completeFetch(pending);
            return completed;
        } finally {
            URLConnection conn = pending.getConnection();
            closeQuietly(conn);
        }
    }

    /**
     * Closes the input stream of the given connection on a best-effort basis.
     * A null connection is ignored, and so is any IOException raised while
     * obtaining or closing the stream.
     *
     * @param conn the connection to release; may be null.
     */
    public static void closeQuietly(URLConnection conn) {

        if (conn == null) {
            return;
        }

        try {
            InputStream body = conn.getInputStream();
            if (body != null) {
                body.close();
            }
        } catch (IOException ignored) {
            // deliberate: there is nothing useful to do if cleanup fails.
        }
    }

    /**
     * Opens the connection for the next request, retrying up to RETRY_MAX
     * times when the attempt fails.
     *
     * The first attempt uses the limit returned by getLimit(); every retry
     * falls back to the config's conservative limit.
     *
     * An InterruptedException is never retried: it is propagated immediately
     * so that a thread which has been asked to stop does not issue another
     * request first.
     *
     * @param config the request configuration.
     * @return the partially populated result holding the open connection.
     * @throws IOException if the last attempt failed with an IOException, or
     *         with any other exception that is not a ParseException (the
     *         original exception is then attached as the cause).
     * @throws ParseException if the last attempt failed with a ParseException.
     * @throws InterruptedException if the fetch was interrupted.
     */
    public PartialBaseClientResult<ResultType> partialFetch(Config<ResultType> config)
            throws IOException, ParseException, InterruptedException {

        int retry_ctr = 0;
        int limit = getLimit(config);

        while (true) {

            try {

                // after a failure, revert the limit to the conservative value.
                if (retry_ctr > 0)
                    limit = config.getConservativeLimit();

                return startFetch(config, limit);

            } catch (InterruptedException e) {

                // do not swallow an interrupt by retrying.
                throw e;

            } catch (Exception e) {

                if (retry_ctr < RETRY_MAX) {
                    ++retry_ctr;
                    continue;
                }

                //this is slightly ugly but prevents nested exceptions.
                if (e instanceof IOException)
                    throw (IOException) e;

                if (e instanceof ParseException)
                    throw (ParseException) e;

                throw new IOException(e);

            }

        }
    }

    /**
     * Opens the connection for the next API request and captures the response
     * headers, without reading the response body.
     *
     * The request URL is the config's next request URL, else its first request
     * URL, else a freshly generated first request URL.  The limit parameter is
     * then rewritten to the (capped) request limit and the vendor parameter
     * gets a connection number appended.
     *
     * @param config the request configuration; its vendor must be set.
     * @param request_limit the desired limit; capped at config.getMaxLimit().
     * @return a result holding the open connection, the requested URL, the
     *         limit used, the more-results flags and the next request URL.
     * @throws IOException if there's an error with network transport, or the
     *         server answered with an HTTP status of 400 or above (the message
     *         is the response body when one is available).
     * @throws RuntimeException if no vendor is configured.
     */
    private PartialBaseClientResult<ResultType> startFetch(Config<ResultType> config, int request_limit)
            throws IOException, InterruptedException {

        PartialBaseClientResult<ResultType> result = new PartialBaseClientResult<ResultType>(config);

        if (config.getVendor() == null)
            throw new RuntimeException("Vendor not specified");

        String resource = config.getNextRequestURL();

        //enforce max limit so that we don't generate runtime exceptions.
        if (request_limit > config.getMaxLimit())
            request_limit = config.getMaxLimit();

        if (resource == null) {

            resource = config.getFirstRequestURL();

            // if the API has NEVER been used before then generate the first
            // request URL from the config parameters.
            if (resource == null)
                resource = config.generateFirstRequestURL(request_limit);

        }

        //apply the request_limit to the current URL.  This needs to be done so
        //that we can change the limit at runtime.
        resource = setParam(resource, "limit", request_limit);

        // add a connection number to the vendor code
        resource = addConnectionNumber(resource);

        // store the last requested URL so we can expose this to the caller for
        // debug purposes.
        result.setLastRequestURL(resource);
        result.setRequestLimit(request_limit);

        URLConnection conn = getConnection(resource);

        // If this is an http connection and there is an error code, throw an
        // exception carrying the response body as its message.
        if (conn instanceof HttpURLConnection) {
            HttpURLConnection httpConn = (HttpURLConnection) conn;
            int responseCode = httpConn.getResponseCode();

            if (responseCode >= 400) {
                String body = readErrorBody(httpConn);

                if (body == null || body.length() == 0)
                    throw new IOException(String.format("Response code %d received", responseCode));

                throw new IOException(body);
            }
        }

        result.setConnection(conn);

        setMoreResults(conn, result);

        result.setNextRequestURL(conn.getHeaderField("X-Next-Request-URL"));

        return result;
    }

    /**
     * Reads the error body of a failed HTTP response into a string, with the
     * lines concatenated without separators.
     *
     * The body is only gunzipped when the response declares a gzip content
     * encoding; a plain-text error page is read as-is instead of failing with
     * a "Not in GZIP format" error that would hide the real message.
     *
     * @return the error body, or null if the connection has no error stream.
     */
    private static String readErrorBody(HttpURLConnection httpConn) throws IOException {

        InputStream errorStream = httpConn.getErrorStream();

        if (errorStream == null)
            return null;

        try {

            InputStream in = errorStream;

            if (GZIP_ENCODING.equalsIgnoreCase(httpConn.getContentEncoding()))
                in = new GZIPInputStream(errorStream);

            // NOTE(review): assumes error bodies are UTF-8 encoded; confirm
            // against the API.
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));

            StringBuilder message = new StringBuilder();

            String line;
            while ((line = reader.readLine()) != null)
                message.append(line);

            return message.toString();

        } finally {
            // always release the underlying stream, even if decoding failed.
            IOUtils.closeQuietly(errorStream);
        }
    }

    /**
     * Rewrites the vendor query parameter of the given URL to
     * "vendor=&lt;configured vendor&gt;-&lt;n&gt;", where n is the running connection
     * count modulo LegacyWrapperClient.PARALLELISM.  All other parameters are
     * kept as they are, in their original order.
     *
     * The URL is split at the FIRST question mark only, so a literal '?'
     * inside the query string is preserved rather than truncating the URL.
     *
     * @param url the request URL.
     * @return the rewritten URL, or the URL unchanged if it has no query
     *         string.
     */
    private String addConnectionNumber(String url) {

        String[] parts = url.split("\\?", 2);

        if (parts.length < 2) {
            return url;
        }

        String newURL = parts[0] + "?";
        List<String> newParams = new ArrayList<String>();

        for (String param : parts[1].split("&")) {
            String[] paramParts = param.split("=");

            // only a vendor parameter that actually carries a value is rewritten.
            if (paramParts.length >= 2 && paramParts[0].equals("vendor")) {
                newParams.add(String.format("vendor=%s-%d", getConfig().getVendor(),
                        connectionCount++ % LegacyWrapperClient.PARALLELISM));
            } else {
                newParams.add(param);
            }
        }

        return newURL + StringUtils.join(newParams, "&");
    }

    /**
     * Second phase of a fetch: downloads the response body of an already open
     * connection into memory and, unless parsing is disabled in the config,
     * parses it according to the configured format.
     *
     * The request metadata collected during the first phase is copied onto the
     * returned result.  When the server did not send a more-results header,
     * the flag is derived from whether the number of results equals the
     * request limit.
     *
     * NOTE(review): the recorded "parse duration" is measured from before the
     * download starts, so it includes the call duration.
     *
     * @param partial_result the outcome of partialFetch().
     * @return the fully populated result.
     * @throws ParseException if anything goes wrong while downloading or
     *         parsing; every exception raised in that phase is wrapped.
     */
    public BaseClientResult<ResultType> completeFetch(PartialBaseClientResult<ResultType> partial_result)
            throws IOException, ParseException, InterruptedException {

        // snapshot everything the first phase recorded.
        final Config<ResultType> config = partial_result.getConfig();
        final int request_limit = partial_result.getRequestLimit();
        final String resource = partial_result.getLastRequestURL();
        final boolean headerPresent = partial_result.getHasMoreResultsHeader();
        final boolean moreResults = partial_result.getHasMoreResults();
        final String nextRequest = partial_result.getNextRequestURL();

        final BaseClientResult<ResultType> result = new BaseClientResult<ResultType>(config);

        result.setLastRequestURL(resource);
        result.setRequestLimit(request_limit);
        result.setHasMoreResultsHeadder(headerPresent);
        result.setHasMoreResults(moreResults);
        result.setNextRequestURL(nextRequest);

        try {

            final long parseStart = System.currentTimeMillis();
            final long callStart = System.currentTimeMillis();

            final URLConnection conn = partial_result.getConnection();

            // despite its name, getLocalInputStream reads the whole body into
            // a byte array in memory.
            final InputStream buffered = getLocalInputStream(conn.getInputStream(), result);
            result.setLocalInputStream(buffered);

            final boolean gzipped = GZIP_ENCODING.equals(conn.getContentEncoding());
            if (gzipped) {
                result.setIsCompressed(true);
            }

            final InputStream decoded = result.getInputStream();

            result.setCallDuration(System.currentTimeMillis() - callStart);

            final boolean parseEnabled = !config.getDisableParse();

            if (parseEnabled) {

                final Format format = config.getFormat();

                if (format == Format.PROTOSTREAM) {
                    result.setResults(protobufParse(doProtoStreamFetch(buffered, config), config));
                } else if (format == Format.PROTOBUF) {
                    result.setResults(protobufParse(doProtobufFetch(buffered, config), config));
                } else {
                    // every other format is treated as XML.
                    final Document doc = doXmlFetch(decoded, config);
                    if (doc != null) {
                        result.setResults(xmlParse(doc, config));
                    }
                }
            }

            result.setParseDuration(System.currentTimeMillis() - parseStart);

        } catch (Exception e) {
            throw new ParseException(e, "Unable to handle request: " + resource);
        }

        // without an explicit header, a full page implies there may be more.
        if (!result.getHasMoreResultsHeadder()) {
            final boolean fullPage = result.getResults().size() == request_limit;
            result.setHasMoreResults(fullPage);
        }

        return result;
    }

    /**
     * Opens and connects a URLConnection for the given resource, sending the
     * client's User-Agent (including the config's command line) and requesting
     * gzip encoding.  The connect timeout is 20 seconds.
     *
     * @param resource the URL to request.
     * @return the connected URLConnection.
     * @throws IOException if the URL is malformed or the connection cannot be
     *         established.  The message is the value of the connection's null
     *         header field when one is available, otherwise the message of
     *         the underlying exception, which is attached as the cause.
     */
    protected URLConnection getConnection(String resource) throws IOException {

        URLConnection conn = null;

        try {

            // create the HTTP connection.
            URL request = new URL(resource);
            conn = request.openConnection();

            // set the UserAgent so Spinn3r know which client lib is calling.
            conn.setRequestProperty(USER_AGENT_HEADER, USER_AGENT + "; " + getConfig().getCommandLine());
            conn.setRequestProperty(ACCEPT_ENCODING_HEADER, GZIP_ENCODING);
            conn.setConnectTimeout(20000);
            conn.connect();

            return conn;

        } catch (IOException ioe) {

            //create a custom exception message with the right error.
            String message = null;

            // conn is still null when the URL itself was malformed or
            // openConnection() failed.
            if (conn != null)
                message = conn.getHeaderField(null);

            // never lose the original failure description.
            if (message == null)
                message = ioe.getMessage();

            IOException ce = new IOException(message, ioe);
            ce.setStackTrace(ioe.getStackTrace());

            throw ce;
        }
    }

    /**
     * Copies the X-More-Results response header onto the partial result.
     * Whether the header was present is always recorded; the more-results flag
     * itself is only touched when the header is present, and is true only for
     * the exact value "true".
     */
    private void setMoreResults(URLConnection conn, PartialBaseClientResult<ResultType> result) {

        final String header = conn.getHeaderField(X_MORE_RESULTS);
        final boolean present = header != null;

        result.setHasMoreResultsHeader(present);

        if (present) {
            result.setHasMoreResults("true".equals(header));
        }
    }

    /**
     * Resolve the limit to request: the limit configured by the user, or the
     * config's optimal limit when the configured value is -1.
     */
    public int getLimit(Config<ResultType> config) {

        final int configured = config.getLimit();

        return configured == -1 ? config.getOptimalLimit() : configured;
    }

    /**
     * Decodes every entry from a protostream-encoded input stream, reading
     * until the decoder returns null.
     *
     * @param inputStream the stream to decode.
     * @param config not used by this implementation.
     * @return the decoded entries, in stream order.
     */
    public List<ContentApi.Entry> doProtoStreamFetch(InputStream inputStream, Config<?> config) throws IOException {

        final ContentApi.Entry.Builder prototype = ContentApi.Entry.newBuilder();
        final ProtoStreamDecoder<ContentApi.Entry> decoder = ProtoStreamDecoder.newDecoder(inputStream, prototype);

        final List<ContentApi.Entry> entries = new ArrayList<ContentApi.Entry>();

        ContentApi.Entry next;
        while ((next = decoder.read()) != null) {
            entries.add(next);
        }

        return entries;
    }

    /**
     * Parses a single protobuf response message from the given stream, with
     * the coded stream's size limit raised to PROTOBUF_SIZE_LIMIT.
     *
     * @param inputStream the stream to parse.
     * @param config not used by this implementation.
     * @return the parsed response.
     */
    public ContentApi.Response doProtobufFetch(InputStream inputStream, Config<?> config)
            throws IOException, InterruptedException {

        final CodedInputStream coded = CodedInputStream.newInstance(inputStream);
        coded.setSizeLimit(PROTOBUF_SIZE_LIMIT);

        final ContentApi.Response response = ContentApi.Response.parseFrom(coded);
        return response;
    }

    /**
     * Parses the given stream into a DOM document using the system JAXP
     * parser, with namespace awareness enabled.
     *
     * NOTE(review): the parser factory is used with its default entity
     * handling; consider whether external entities / DTDs should be disabled
     * for responses received over the network.
     *
     * @param inputStream the XML to parse.
     * @param config not used by this implementation.
     * @return the parsed document.
     * @throws IOException if reading the stream fails.
     * @throws ParseException for any other failure; the original exception is
     *         attached as the cause.
     */
    public Document doXmlFetch(InputStream inputStream, Config<ResultType> config)
            throws IOException, ParseException, InterruptedException {

        try {

            final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

            // namespaces won't work at ALL if this isn't enabled.
            factory.setNamespaceAware(true);

            // plain DOM keeps the client free of additional dependencies.
            return factory.newDocumentBuilder().parse(inputStream);

        } catch (IOException ioe) {

            // transport problems are passed through untouched.
            throw ioe;

        } catch (Exception e) {

            final ParseException failure = new ParseException(
                    String.format("Unable to parse %s: %s", getLastRequestURL(), e.getMessage()));
            failure.initCause(e);

            throw failure;
        }
    }

    /**
     * Read the whole input stream into memory and hand back a stream over the
     * in-memory copy, so that download time can be benchmarked separately.
     */
    private InputStream getLocalInputStream(InputStream is, BaseClientResult<ResultType> result)
            throws IOException {

        final byte[] buffered = getInputStreamAsByteArray(is, result);

        return new ByteArrayInputStream(buffered);
    }

    /**
     * Drain the input stream into a byte array.
     *
     * The stream is always closed, and the number of bytes read so far is
     * always reported to the result's three samplers (getBs1, getBs5,
     * getBs15), even when reading fails part way through.
     */
    private byte[] getInputStreamAsByteArray(InputStream is, BaseClientResult<ResultType> result)
            throws IOException {

        final ByteArrayOutputStream sink = new ByteArrayOutputStream(500000);
        final byte[] chunk = new byte[2048];

        int total = 0;

        try {
            for (int n = is.read(chunk); n >= 0; n = is.read(chunk)) {
                sink.write(chunk, 0, n);
                total += n;
            }
        } finally {
            IOUtils.closeQuietly(is);

            result.getBs1().sample(total);
            result.getBs5().sample(total);
            result.getBs15().sample(total);
        }

        sink.close();

        return sink.toByteArray();
    }

    /**
     * We've received an XML response from the API so parse it out: every
     * "item" element below the document element is converted into a result
     * object by the config.
     *
     * The root is obtained with getDocumentElement() rather than
     * getFirstChild(): the first child of a document may be a comment,
     * processing instruction or doctype, which is not an Element.
     *
     * @param doc the parsed response document.
     * @param config used to create one result object per item element.
     * @return the result objects, in document order.
     */
    protected List<ResultType> xmlParse(Document doc, Config<ResultType> config) throws Exception {

        Element root = doc.getDocumentElement();

        NodeList items = root.getElementsByTagName("item");

        List<ResultType> result = new ArrayList<ResultType>(items.getLength());

        for (int i = 0; i < items.getLength(); ++i) {

            Element current = (Element) items.item(i);

            result.add(config.createResultObject(current));

        }

        return result;

    }

    /**
     * Convert a protobuf response into result objects by delegating its entry
     * list to the list-based overload.
     */
    protected List<ResultType> protobufParse(ContentApi.Response response, Config<ResultType> config)
            throws Exception {

        final List<ContentApi.Entry> entries = response.getEntryList();

        return protobufParse(entries, config);
    }

    /**
     * Convert protobuf entries into result objects, one per entry and in the
     * same order, using the config as the factory.
     */
    protected List<ResultType> protobufParse(List<ContentApi.Entry> entries, Config<ResultType> config)
            throws Exception {

        final List<ResultType> converted = new ArrayList<ResultType>(entries.size());

        for (ContentApi.Entry current : entries) {
            ResultType item = config.createResultObject(current);
            converted.add(item);
        }

        return converted;
    }

    /**
     * Set a parameter in the HTTP URL.
     *
     * Only an existing occurrence of the parameter is rewritten; if the URL
     * does not contain it, the URL is returned unchanged.  The key is matched
     * on a parameter boundary only (start of the string, or directly after '?'
     * or '&amp;'), so setting "limit" does not corrupt a parameter such as
     * "max_limit".
     *
     * @param v the URL (or bare query string) to modify.
     * @param key the parameter name.
     * @param value the new value; inserted via its string form, without URL
     *        encoding.
     * @return the URL with the first matching parameter replaced.
     */
    protected String setParam(String v, String key, Object value) {

        String needle = String.format("%s=", key);

        int start = v.indexOf(needle);

        while (start != -1) {

            boolean atBoundary = start == 0
                    || v.charAt(start - 1) == '?'
                    || v.charAt(start - 1) == '&';

            if (atBoundary) {

                int end = v.indexOf("&", start);

                if (end == -1)
                    end = v.length();

                return v.substring(0, start) + String.format("%s=%s", key, value) + v.substring(end);
            }

            // matched inside a longer parameter name; keep looking.
            start = v.indexOf(needle, start + 1);
        }

        return v;

    }

    /**
     * Parse command line arguments like --foo=bar where foo is the key and bar
     * is the value.
     *
     * Each argument is split at its FIRST '=' only, so values that contain
     * '=' themselves (for example URLs with a query string) are preserved.  An
     * argument without a value ("--flag" or "--flag=") is mapped to the empty
     * string instead of raising an ArrayIndexOutOfBoundsException.  A leading
     * "--" is stripped from the key.
     *
     * @param args the raw command line arguments; null is treated as empty
     *        and null elements are skipped.
     * @return a map from option name to option value; on duplicate keys the
     *         last occurrence wins.
     */
    public static Map<String, String> getopt(String[] args) {

        Map<String, String> result = new HashMap<String, String>();

        if (args == null)
            return result;

        for (String arg : args) {

            if (arg == null)
                continue;

            // limit of 2: everything after the first '=' belongs to the value.
            String[] split = arg.split("=", 2);

            String key = split[0];

            if (key.startsWith("--"))
                key = key.substring(2);

            String value = split.length > 1 ? split[1] : "";

            result.put(key, value);

        }

        return result;

    }

    /**
     * Set reasonable HTTP timeouts and DNS caching settings via a static
     * constructor BEFORE any HTTP calls are used.
     */
    static {

        // A full list of properties is available here:

        // http://java.sun.com/j2se/1.4.2/docs/guide/net/properties.html

        //NOTE: Thu Aug 10 2006 05:17 PM (burton@tailrank.com): its not a good
        //idea to set these values since they modify the defaults for ALL IO
        //applications. It woould be BETTER to select more realistic values
        //instead of infinity though.

        System.setProperty("sun.net.inetaddr.ttl", Integer.toString(NETWORKADDRESS_CACHE_TTL));

        System.setProperty("networkaddress.cache.ttl", Integer.toString(NETWORKADDRESS_CACHE_TTL));

        System.setProperty("sun.net.client.defaultReadTimeout", Integer.toString(DEFAULT_READ_TIMEOUT));

        System.setProperty("sun.net.client.defaultConnectTimeout", Integer.toString(DEFAULT_CONNECT_TIMEOUT));

        System.setProperty("http.maxRedirects", Integer.toString(DEFAULT_MAX_REDIRECTS));

        System.setProperty("http.keepAlive", Boolean.toString(DEFAULT_HTTP_KEEPALIVE));

    }

}