org.paxle.crawler.proxy.impl.ProxyDataProviderCallable.java Source code

Java tutorial

Introduction

Here is the source code for org.paxle.crawler.proxy.impl.ProxyDataProviderCallable.java

Source

/**
 * This file is part of the Paxle project.
 * Visit http://www.paxle.net for more information.
 * Copyright 2007-2010 the original author or authors.
 *
 * Licensed under the terms of the Common Public License 1.0 ("CPL 1.0").
 * Any use, reproduction or distribution of this program constitutes the recipient's acceptance of this agreement.
 * The full license text is available under http://www.opensource.org/licenses/cpl1.0.txt
 * or in the file LICENSE.txt in the root directory of the Paxle distribution.
 *
 * Unless required by applicable law or agreed to in writing, this software is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

package org.paxle.crawler.proxy.impl;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import java.util.concurrent.Callable;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.paxle.core.doc.ICrawlerDocument;
import org.paxle.core.doc.IDocumentFactory;
import org.paxle.crawler.ICrawlerTools;
import org.xsocket.connection.http.HttpResponseHeader;

/**
 * A {@link Callable} that converts one HTTP response (headers plus body stream) into an
 * {@link ICrawlerDocument}.
 * <p>
 * {@link #call()} creates a new document via the {@link IDocumentFactory}, copies the location and
 * the header-derived metadata into it, stores the body via {@link ICrawlerTools#saveInto} and
 * finally closes the body stream. Processing errors are reported through the status of the
 * returned document instead of being thrown.
 */
public class ProxyDataProviderCallable implements Callable<ICrawlerDocument> {
    /**
     * The date formats tried (in this order) when parsing an HTTP date header:
     * RFC 1123, RFC 850 and the ANSI C <code>asctime()</code> format.
     */
    private static final String[] HTTP_DATE_PATTERNS = {
            "EEE, dd MMM yyyy HH:mm:ss zzz",
            "EEEE, dd-MMM-yy HH:mm:ss zzz",
            "EEE MMM d HH:mm:ss yyyy"
    };

    /**
     * For logging
     */
    private final Log logger = LogFactory.getLog(this.getClass());

    /**
     * Used to store the response body into the document. This reference is not validated by the
     * constructor; a missing value is reported by {@link #call()} as a failed document.
     */
    private final ICrawlerTools crawlerTools;

    /**
     * Factory used to create the {@link ICrawlerDocument} that is returned by {@link #call()}
     */
    private final IDocumentFactory crawlerDocFactory;

    /**
     * The location that is set on the created document
     */
    private final URI location;

    /**
     * The response headers the document metadata is extracted from
     */
    private final HttpResponseHeader responseHeaders;

    /**
     * The response body. This stream is always closed at the end of {@link #call()}.
     */
    private final InputStream bodyInputStream;

    /**
     * @param location the location to set on the created document, must not be <code>null</code>
     * @param responseHeaders the response headers to read the metadata from, must not be <code>null</code>
     * @param bodyInputStream the response body, must not be <code>null</code>. It is closed by {@link #call()}
     * @param crawlerDocFactory the factory used to create the document, must not be <code>null</code>
     * @param crawlerTools the tools used to store the body into the document
     * @throws NullPointerException if one of the first four arguments is <code>null</code>
     */
    public ProxyDataProviderCallable(URI location, HttpResponseHeader responseHeaders, InputStream bodyInputStream,
            IDocumentFactory crawlerDocFactory, ICrawlerTools crawlerTools) {
        if (location == null) {
            throw new NullPointerException("The location was null.");
        }
        if (responseHeaders == null) {
            throw new NullPointerException("The response-headers were null.");
        }
        if (bodyInputStream == null) {
            throw new NullPointerException("The response-body was null.");
        }
        if (crawlerDocFactory == null) {
            throw new NullPointerException("No doc-factory found");
        }

        this.location = location;
        this.responseHeaders = responseHeaders;
        this.bodyInputStream = bodyInputStream;
        this.crawlerDocFactory = crawlerDocFactory;
        this.crawlerTools = crawlerTools;
    }

    /**
     * Creates the crawler-document for the response passed to the constructor.
     * <p>
     * Errors that occur after the document was created are logged and reported via
     * {@link ICrawlerDocument.Status#UNKNOWN_FAILURE} on the returned document. The body stream is
     * closed in any case.
     *
     * @return the created document, having either status <code>OK</code> or <code>UNKNOWN_FAILURE</code>
     * @throws Exception if the document itself could not be created by the doc-factory
     */
    public ICrawlerDocument call() throws Exception {
        try {
            // The document is created outside of the error-reporting block below: without a document
            // there is nothing a failure status could be attached to, so the original error is propagated.
            final ICrawlerDocument doc = this.crawlerDocFactory.createDocument(ICrawlerDocument.class);
            if (doc == null) {
                throw new IllegalStateException("The doc-factory returned no crawler-document.");
            }

            try {
                // getting the content location
                doc.setLocation(this.location);

                // extract headers
                this.extractHeaderData(this.responseHeaders, doc);

                // copy the content to file
                if (this.crawlerTools == null) {
                    throw new IllegalStateException("No crawler-tools available.");
                }
                this.crawlerTools.saveInto(doc, this.bodyInputStream);

                doc.setStatus(ICrawlerDocument.Status.OK);
                this.logger.debug(String.format("Processing of URL '%s' finished.", this.location));
            } catch (Throwable e) {
                // TODO: better error handling required
                this.logger.error(String.format("Unexpected '%s' while processing URL '%s'.",
                        e.getClass().getName(), this.location), e);

                // many exceptions (e.g. NullPointerException) carry no message at all
                String msg = e.getMessage();
                if (msg == null) {
                    msg = e.toString();
                }
                doc.setStatus(ICrawlerDocument.Status.UNKNOWN_FAILURE, msg);
            }
            return doc;
        } finally {
            try {
                this.bodyInputStream.close();
            } catch (IOException e) {
                // closing is best-effort only; the result of the processing is not affected
                this.logger.debug(String.format("Unable to close the body-stream of URL '%s'.", this.location), e);
            }
        }
    }

    /**
     * Copies the metadata found in the response headers into the document: mime-type, charset,
     * content languages, crawler date and last-modification date.
     *
     * @param resHdr the response headers to read from
     * @param doc the document to write the extracted values to
     * @throws IOException declared for compatibility with the previous signature
     */
    private void extractHeaderData(final HttpResponseHeader resHdr, final ICrawlerDocument doc) throws IOException {
        // content-type and charset
        String contentType = resHdr.getContentType();
        if (contentType == null) {
            contentType = "application/octet-stream";
        }
        // cut off parameters such as "; charset=..."
        final int idx = contentType.indexOf(";");
        if (idx != -1) {
            contentType = contentType.substring(0, idx).trim();
        }

        doc.setMimeType(contentType);
        doc.setCharset(resHdr.getCharacterEncoding());

        // getting the content language
        final String contentLanguage = resHdr.getHeader(ProxyResponseHandler.HTTPHEADER_CONTENT_LANGUAGE);
        if (contentLanguage != null) {
            // the list may contain whitespace around the commas, e.g. "en, de"
            final String[] languages = contentLanguage.trim().split("\\s*,\\s*");
            doc.setLanguages(languages);
        }

        // crawling Date: the date sent by the server or, if missing or unparsable, the current time
        final String crawlingDateHeader = resHdr.getHeader(ProxyResponseHandler.HTTPHEADER_DATE);
        Date crawlingDate = parseHttpDate(crawlingDateHeader);
        if (crawlingDate == null) {
            if (crawlingDateHeader != null && this.logger.isDebugEnabled()) {
                this.logger.debug(String.format("Unable to parse date-header '%s' of URL '%s'.",
                        crawlingDateHeader, this.location));
            }
            crawlingDate = new Date();
        }
        doc.setCrawlerDate(crawlingDate);

        // last mod date: stays null if the header is missing or unparsable
        final String lastModDateHeader = resHdr.getHeader(ProxyResponseHandler.HTTPHEADER_LAST_MODIFIED);
        final Date lastModDate = parseHttpDate(lastModDateHeader);
        if (lastModDate == null && lastModDateHeader != null && this.logger.isDebugEnabled()) {
            this.logger.debug(String.format("Unable to parse last-modified-header '%s' of URL '%s'.",
                    lastModDateHeader, this.location));
        }
        doc.setLastModDate(lastModDate);

        // TODO: the ETag header (ProxyResponseHandler.HTTPHEADER_ETAG) is not extracted yet
    }

    /**
     * Parses the value of an HTTP date header using the formats listed in {@link #HTTP_DATE_PATTERNS}.
     *
     * @param value the header value, may be <code>null</code>
     * @return the parsed date or <code>null</code> if the value was <code>null</code>, empty or
     *         does not match any of the known formats
     */
    private static Date parseHttpDate(final String value) {
        if (value == null) {
            return null;
        }

        final String text = value.trim();
        if (text.length() == 0) {
            return null;
        }

        for (String pattern : HTTP_DATE_PATTERNS) {
            // SimpleDateFormat is not thread-safe, therefore a new instance is used for each attempt
            final SimpleDateFormat format = new SimpleDateFormat(pattern, Locale.US);
            format.setTimeZone(TimeZone.getTimeZone("GMT"));
            try {
                return format.parse(text);
            } catch (ParseException e) {
                // not in this format, try the next one
            }
        }
        return null;
    }
}