org.apache.manifoldcf.crawler.connectors.webcrawler.DataCache.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.manifoldcf.crawler.connectors.webcrawler.DataCache.java

Source

/* $Id: DataCache.java 988245 2010-08-23 18:39:35Z kwright $ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
import java.util.*;
import java.io.*;

import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.client.RedirectException;
import org.apache.http.client.CircularRedirectException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.HttpException;

/** This class is a cache of a specific URL's data.  It's fetched early and kept,
* so that (1) an accurate data length can be found, and (2) we can compute a version
* checksum.
*
* Each cached response body is stored in a temporary file; the map held by this class
* associates a document identifier with that file plus the response code, content type,
* and referral URI captured at fetch time.  All access to the map goes through
* methods or blocks synchronized on this instance.
*/
public class DataCache {
    public static final String _rcsid = "@(#)$Id: DataCache.java 988245 2010-08-23 18:39:35Z kwright $";

    /** Five minutes, in milliseconds; offset added to "now" for the first time argument of a read-failure ServiceInterruption. */
    private static final long FIVE_MINUTES_MS = 300000L;
    /** Twelve hours, in milliseconds; offset added to "now" for the second time argument of a read-failure ServiceInterruption. */
    private static final long TWELVE_HOURS_MS = 12 * 60 * 60000L;
    /** Size of the buffer used to copy the response body to the temporary file. */
    private static final int READ_BUFFER_SIZE = 65536;

    // Hashmap containing the cache of files.
    // This is keyed by document identifier, and contains DocumentData objects.
    protected Map<String, DocumentData> cacheData = new HashMap<String, DocumentData>();

    /** Constructor.
    */
    public DataCache() {
    }

    /** Add a data entry into the cache.
    * This method is called whenever the data from a fetch is considered interesting or useful, and will
    * be thus passed on from getDocumentVersions() to the processDocuments() phase.  At the moment that's
    * usually a 200 or a 302 response.
    * The response body is copied into a temporary file.  If anything goes wrong before the entry has been
    * stored in the cache (including an unchecked exception), the temporary file is deleted again.
    *@param activities is used to check, after each chunk is written, that the job is still active.
    *@param documentIdentifier is the document identifier (url).
    *@param connection is the connection, upon which a fetch has been done that needs to be
    * cached.
    *@return a "checksum" value, to use as a version string, or null if the connection
    * has no response body stream.
    *@throws ManifoldCFException if the temporary file cannot be created or written, or if the
    * operation is interrupted.
    *@throws ServiceInterruption if reading the response body fails or times out.
    */
    public String addData(IProcessActivity activities, String documentIdentifier, IThrottledConnection connection)
            throws ManifoldCFException, ServiceInterruption {
        // Grab the response code, and the content-type and location headers
        int responseCode = connection.getResponseCode();
        String contentType = connection.getResponseHeader("Content-Type");
        String referralURI = connection.getResponseHeader("Location");

        // Create a temporary file; that's what we will cache
        try {
            // First, get the stream.
            InputStream dataStream = connection.getResponseBodyStream();
            if (dataStream == null) {
                return null;
            }
            try {
                File tempFile = File.createTempFile("_webcache_", "tmp");
                // Stays false until the file is owned by the cache map; drives cleanup below.
                boolean cached = false;
                try {
                    // tempFile.deleteOnExit() is deliberately NOT used: it causes memory leaks if left
                    // around, since there's no way to release the record specifying that the file should
                    // be deleted, even after it's removed.  So live with the occasional dangling file
                    // left as a result of shutdown or error. :-(
                    // NOTE(review): addFile presumably registers the file for later cleanup - confirm.
                    ManifoldCF.addFile(tempFile);

                    long checkSum = copyToFile(activities, dataStream, tempFile);

                    synchronized (this) {
                        // Replace any previous entry (and its file) for this identifier.
                        deleteData(documentIdentifier);
                        cacheData.put(documentIdentifier,
                                new DocumentData(tempFile, responseCode, contentType, referralURI));
                        cached = true;
                    }
                    return Long.toString(checkSum);
                } finally {
                    // Covers checked exceptions, Errors, and RuntimeExceptions alike.
                    if (!cached) {
                        ManifoldCF.deleteFile(tempFile);
                    }
                }
            } finally {
                closeResponseStream(dataStream);
            }
        } catch (java.net.SocketTimeoutException e) {
            throw new ManifoldCFException("Socket timeout exception creating temporary file: " + e.getMessage(), e);
        } catch (ConnectTimeoutException e) {
            throw new ManifoldCFException(
                    "Socket connect timeout exception creating temporary file: " + e.getMessage(), e);
        } catch (InterruptedIOException e) {
            //Logging.connectors.warn("IO interruption seen",e);
            throw new ManifoldCFException("Interrupted: " + e.getMessage(), ManifoldCFException.INTERRUPTED);
        } catch (IOException e) {
            throw new ManifoldCFException("IO exception creating temporary file: " + e.getMessage(), e);
        }
    }

    /** Copy the whole response stream into the given file, computing the checksum on the way.
    *@param activities is used to check that the job is still active after each chunk.
    *@param dataStream is the response body stream to read.
    *@param tempFile is the file to write to.
    *@return the checksum of all bytes copied.
    */
    private static long copyToFile(IProcessActivity activities, InputStream dataStream, File tempFile)
            throws IOException, ManifoldCFException, ServiceInterruption {
        long checkSum = 0L;
        OutputStream os = new FileOutputStream(tempFile);
        try {
            byte[] buffer = new byte[READ_BUFFER_SIZE];
            while (true) {
                int amt = readChunk(dataStream, buffer);
                if (amt == -1) {
                    break;
                }
                checkSum = updateChecksum(checkSum, buffer, amt);
                os.write(buffer, 0, amt);
                // Check if job is alive before looping
                activities.checkJobStillActive();
            }
        } finally {
            os.close();
        }
        return checkSum;
    }

    /** Read one chunk from the response stream, translating read failures into crawler exceptions.
    *@param dataStream is the stream to read from.
    *@param buffer receives the data.
    *@return the number of bytes read, or -1 at end of stream.
    */
    private static int readChunk(InputStream dataStream, byte[] buffer)
            throws ManifoldCFException, ServiceInterruption {
        // The catch order matters: the timeout types are tested before InterruptedIOException
        // so that they are reported as service interruptions rather than as interrupts.
        try {
            return dataStream.read(buffer, 0, buffer.length);
        } catch (java.net.SocketTimeoutException e) {
            Logging.connectors.warn(
                    "Socket timeout exception reading socket stream: " + e.getMessage(), e);
            throw readInterruption("Socket timeout: ", e);
        } catch (ConnectTimeoutException e) {
            Logging.connectors.warn(
                    "Socket connect timeout exception reading socket stream: " + e.getMessage(),
                    e);
            throw readInterruption("Socket timeout: ", e);
        } catch (InterruptedIOException e) {
            //Logging.connectors.warn("IO interruption seen",e);
            throw new ManifoldCFException("Interrupted: " + e.getMessage(),
                    ManifoldCFException.INTERRUPTED);
        } catch (IOException e) {
            Logging.connectors.warn("IO exception reading socket stream: " + e.getMessage(), e);
            throw readInterruption("Read timeout: ", e);
        }
    }

    /** Build the ServiceInterruption used for every kind of read failure.
    * NOTE(review): the two time arguments are presumably the retry time and the give-up time,
    * and the trailing -1/false the retry count and abort flag - confirm against ServiceInterruption.
    *@param messagePrefix is prepended to the exception's message.
    *@param e is the underlying I/O failure.
    *@return the exception to throw.
    */
    private static ServiceInterruption readInterruption(String messagePrefix, IOException e) {
        long currentTime = System.currentTimeMillis();
        return new ServiceInterruption(messagePrefix + e.getMessage(), e,
                currentTime + FIVE_MINUTES_MS, currentTime + TWELVE_HOURS_MS, -1, false);
    }

    /** Fold a run of bytes into the running checksum.
    * The arithmetic (including the sign extension of each byte) is kept exactly as it has always
    * been, because the result is handed out as a version string.
    *@param checkSum is the checksum so far.
    *@param buffer holds the bytes.
    *@param length is the number of valid bytes at the start of the buffer.
    *@return the updated checksum.
    */
    private static long updateChecksum(long checkSum, byte[] buffer, int length) {
        for (int i = 0; i < length; i++) {
            long bytevalue = (long) buffer[i];
            checkSum = (checkSum << 5) ^ (checkSum >> 3) ^ (bytevalue << 2) ^ (bytevalue >> 3);
        }
        return checkSum;
    }

    /** Close the response stream, logging and ignoring every failure except a plain interruption.
    *@param dataStream is the stream to close.
    *@throws InterruptedIOException if the close was interrupted (and was not a timeout).
    */
    private static void closeResponseStream(InputStream dataStream) throws InterruptedIOException {
        try {
            dataStream.close();
        } catch (java.net.SocketTimeoutException e) {
            Logging.connectors.warn(
                    "WEB: Socket timeout exception closing data stream, ignoring: " + e.getMessage(), e);
        } catch (ConnectTimeoutException e) {
            Logging.connectors.warn("WEB: Socket connect timeout exception closing data stream, ignoring: "
                    + e.getMessage(), e);
        } catch (InterruptedIOException e) {
            throw e;
        } catch (IOException e) {
            // We can get this if the socket was unexpectedly closed by the server.
            // Generally, this is ok - warn but don't do anything else.
            Logging.connectors.warn("WEB: IO exception closing data stream, ignoring: " + e.getMessage(),
                    e);
        }
    }

    /** Get the response code.
    *@param documentIdentifier is the document identifier.
    *@return the code, or IThrottledConnection.FETCH_NOT_TRIED if nothing is cached for the identifier.
    */
    public synchronized int getResponseCode(String documentIdentifier) {
        DocumentData dd = cacheData.get(documentIdentifier);
        if (dd == null) {
            return IThrottledConnection.FETCH_NOT_TRIED;
        }
        return dd.getResponseCode();
    }

    /** Get the content type.
    *@param documentIdentifier is the document identifier.
    *@return the content type, or null if there is none (or nothing is cached for the identifier).
    */
    public synchronized String getContentType(String documentIdentifier) {
        DocumentData dd = cacheData.get(documentIdentifier);
        if (dd == null) {
            return null;
        }
        return dd.getContentType();
    }

    /** Get the referral URI.
    *@param documentIdentifier is the document identifier.
    *@return the referral URI, or null if none (or nothing is cached for the identifier).
    */
    public synchronized String getReferralURI(String documentIdentifier) {
        DocumentData dd = cacheData.get(documentIdentifier);
        if (dd == null) {
            return null;
        }
        return dd.getReferralURI();
    }

    /** Fetch binary data length.
    *@param documentIdentifier is the document identifier.
    *@return the length of the cached file, or 0 if nothing is cached for the identifier.
    */
    public synchronized long getDataLength(String documentIdentifier) {
        DocumentData dd = cacheData.get(documentIdentifier);
        if (dd == null) {
            return 0L;
        }
        return dd.getData().length();
    }

    /** Fetch binary data entry from the cache.
    * A new stream over the cached file is opened on every call; the caller is responsible for closing it.
    *@param documentIdentifier is the document identifier (url).
    *@return a binary data stream, or null if nothing is cached for the identifier.
    *@throws ManifoldCFException if the cached file can no longer be opened.
    */
    public synchronized InputStream getData(String documentIdentifier) throws ManifoldCFException {
        DocumentData dd = cacheData.get(documentIdentifier);
        if (dd == null) {
            return null;
        }
        try {
            return new FileInputStream(dd.getData());
        } catch (FileNotFoundException e) {
            throw new ManifoldCFException("File not found exception opening data: " + e.getMessage(), e);
        }
    }

    /** Delete specified item of data.
    * Removes the cache entry, if there is one, and deletes its backing file.
    *@param documentIdentifier is the document identifier (url).
    */
    public synchronized void deleteData(String documentIdentifier) {
        DocumentData dd = cacheData.remove(documentIdentifier);
        if (dd != null) {
            ManifoldCF.deleteFile(dd.getData());
        }
    }

    // Protected classes

    /** This class represents everything we need to know about a document that's getting passed from the
    * getDocumentVersions() phase to the processDocuments() phase.
    */
    protected static class DocumentData {
        /** The cache file for the data */
        protected File data;
        /** The response code */
        protected int responseCode;
        /** The content-type header value */
        protected String contentType;
        /** The referral URI */
        protected String referralURI;

        // More will probably go here later, but I can't think of much else at the moment.

        /** Constructor.
        *@param data is the file holding the cached response body.
        *@param responseCode is the response code of the fetch.
        *@param contentType is the value of the Content-Type response header, or null.
        *@param referralURI is the value of the Location response header, or null.
        */
        public DocumentData(File data, int responseCode, String contentType, String referralURI) {
            this.data = data;
            this.responseCode = responseCode;
            this.contentType = contentType;
            this.referralURI = referralURI;
        }

        /** Get the data */
        public File getData() {
            return data;
        }

        /** Get the response code */
        public int getResponseCode() {
            return responseCode;
        }

        /** Get the contentType */
        public String getContentType() {
            return contentType;
        }

        /** Get the referral URI */
        public String getReferralURI() {
            return referralURI;
        }

    }

}