org.paxle.crawler.impl.CrawlerTools.java Source code

Introduction

Here is the source code for org.paxle.crawler.impl.CrawlerTools.java
Source

/**
 * This file is part of the Paxle project.
 * Visit http://www.paxle.net for more information.
 * Copyright 2007-2010 the original author or authors.
 *
 * Licensed under the terms of the Common Public License 1.0 ("CPL 1.0").
 * Any use, reproduction or distribution of this program constitutes the recipient's acceptance of this agreement.
 * The full license text is available under http://www.opensource.org/licenses/cpl1.0.txt
 * or in the file LICENSE.txt in the root directory of the Paxle distribution.
 *
 * Unless required by applicable law or agreed to in writing, this software is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

package org.paxle.crawler.impl;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.util.Formatter;
import java.util.Iterator;
import java.util.concurrent.Semaphore;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Reference;
import org.apache.felix.scr.annotations.ReferenceCardinality;
import org.apache.felix.scr.annotations.ReferencePolicy;
import org.apache.felix.scr.annotations.Service;
import org.paxle.core.charset.ACharsetDetectorOutputStream;
import org.paxle.core.charset.ICharsetDetector;
import org.paxle.core.crypt.ACryptOutputStream;
import org.paxle.core.crypt.ICrypt;
import org.paxle.core.crypt.ICryptManager;
import org.paxle.core.doc.ICrawlerDocument;
import org.paxle.core.io.IIOTools;
import org.paxle.core.io.temp.ITempFileManager;
import org.paxle.core.mimetype.IMimeTypeDetector;
import org.paxle.crawler.ContentLengthLimitExceededException;
import org.paxle.crawler.ICrawlerTools;

@Component(immediate = true)
@Service(ICrawlerTools.class)
public class CrawlerTools implements ICrawlerTools {

    /**
     * For logging (warnings and debug messages emitted while saving content)
     */
    private Log logger = LogFactory.getLog(CrawlerTools.class);

    /**
     * A component to create temp-files. The crawled content and generated
     * dir-listings are written into files obtained from this manager.
     */
    @Reference
    protected ITempFileManager tfm;

    /**
     * A copy tool, used to copy a content stream when no rate-limiting copier is given
     */
    @Reference
    protected IIOTools ioTools;

    /**
     * A tool to detect the charset encoding of a text.
     * The reference is optional and dynamic, so this field may be <code>null</code>.
     */
    @Reference(policy = ReferencePolicy.DYNAMIC, cardinality = ReferenceCardinality.OPTIONAL_UNARY)
    protected ICharsetDetector chardet;

    /**
     * A tool to detect the mime-type of a content.
     * The reference is optional and dynamic, so this field may be <code>null</code>.
     */
    @Reference(policy = ReferencePolicy.DYNAMIC, cardinality = ReferenceCardinality.OPTIONAL_UNARY)
    protected IMimeTypeDetector mimeTypeDetector;

    /**
     * A manager providing {@link ICrypt} implementations; queried for the "md5" crypt
     * when content is saved.
     * The reference is optional and dynamic, so this field may be <code>null</code>.
     */
    @Reference(policy = ReferencePolicy.DYNAMIC, cardinality = ReferenceCardinality.OPTIONAL_UNARY)
    protected ICryptManager cryptManager;

    /**
     * Size in bytes of the chunk buffer used by the rate-limited copier. The copier
     * also derives its per-chunk delay from this value.
     */
    public static final int DEFAULT_BUFFER_SIZE_BYTES = 1024;

    /**
     * An {@link ILimitedRateCopier} that copies streams in chunks of
     * {@link CrawlerTools#DEFAULT_BUFFER_SIZE_BYTES} bytes and sleeps between chunks
     * so that the configured transfer rate is not exceeded.
     * <p>
     * One instance may be shared by several threads. The per-chunk delay of every
     * running {@link #copy(InputStream, OutputStream, long)} call is the single-worker
     * delay multiplied by the number of copies currently in progress, so the limit
     * applies to the sum of all concurrent copies.
     */
    private static class LimitedRateCopier implements ILimitedRateCopier {

        private static final long NANOS_PER_SECOND = 1000000000L;
        private static final long NANOS_PER_MILLI = 1000000L;

        /** Guards {@link #working} and the value of {@link #curDelta} derived from it. */
        private final Semaphore sem = new Semaphore(1);

        /**
         * Minimum time in nanoseconds a single worker has to spend per copied chunk,
         * <code>0</code> if copying is not throttled.
         */
        private final long minDelta;

        /** Number of {@link #copy(InputStream, OutputStream, long)} calls currently in progress. */
        private int working = 0;

        /** Per-chunk delay in nanoseconds currently applied to every worker. */
        private volatile long curDelta;

        /**
         * @param maxKBps the maximum transfer rate in KB/s. A value &lt;= 0 disables
         *        throttling instead of failing with a division by zero.
         */
        public LimitedRateCopier(final int maxKBps) {
            if (maxKBps <= 0) {
                this.minDelta = 0L;
            } else {
                // Nanosecond resolution and long arithmetic: with millisecond integer math
                // every limit above 1000 KB/s truncated to a delay of 0, i.e. no limit at all.
                this.minDelta = NANOS_PER_SECOND * DEFAULT_BUFFER_SIZE_BYTES / (maxKBps * 1024L);
            }
        }

        /**
         * Registers a running copy and updates the shared per-chunk delay.
         * The lock is acquired uninterruptibly so the bookkeeping can never be skipped.
         */
        private void addWorker() {
            sem.acquireUninterruptibly();
            try {
                curDelta = minDelta * ++working;
            } finally {
                sem.release();
            }
        }

        /**
         * Unregisters a running copy and updates the shared per-chunk delay.
         * Must always succeed, even if the calling thread has been interrupted; otherwise
         * the worker count would leak and slow down all future copies.
         */
        private void delWorker() {
            sem.acquireUninterruptibly();
            try {
                curDelta = minDelta * --working;
            } finally {
                sem.release();
            }
        }

        /**
         * Copies data from <code>is</code> to <code>os</code>, throttled to the configured rate.
         *
         * @param is the stream to read from
         * @param os the stream to write to; it is flushed after a complete copy
         * @param bytes the maximum number of bytes to copy, or a value &lt;= 0 to copy
         *        until the end of the input stream
         * @return the number of bytes copied. If the thread is interrupted while waiting,
         *         copying stops early, the interrupt status of the thread is restored and the
         *         number of bytes copied so far is returned.
         * @throws IOException if reading or writing fails
         */
        public long copy(final InputStream is, final OutputStream os, final long bytes) throws IOException {
            long rt = 0;
            final byte[] buf = new byte[DEFAULT_BUFFER_SIZE_BYTES];
            int cs = (int) ((bytes > 0 && bytes < DEFAULT_BUFFER_SIZE_BYTES) ? bytes
                    : DEFAULT_BUFFER_SIZE_BYTES);

            // earliest point in time at which the next chunk may be copied
            long deadline = System.nanoTime();
            addWorker();
            try {
                int rn;
                while ((rn = is.read(buf, 0, cs)) > 0) {
                    os.write(buf, 0, rn);
                    rt += rn;

                    // The deadline is cumulative, so sub-millisecond delays add up until
                    // they are large enough to sleep on.
                    deadline += curDelta;
                    final long sleepMillis = (deadline - System.nanoTime()) / NANOS_PER_MILLI;
                    if (sleepMillis > 0) {
                        Thread.sleep(sleepMillis);
                    }

                    if (bytes > 0) {
                        cs = (int) Math.min(bytes - rt, DEFAULT_BUFFER_SIZE_BYTES);
                        if (cs == 0) {
                            break;
                        }
                    }
                }
                os.flush();
            } catch (InterruptedException e) {
                // simply quit copying, but keep the interrupt visible to the caller
                Thread.currentThread().interrupt();
            } finally {
                delWorker();
            }
            return rt;
        }
    }

    /**
     * Creates a new rate-limiting copier.
     *
     * @param maxKBps the maximum transfer rate in KB/s handed to the copier
     * @return a new {@link ILimitedRateCopier} instance
     */
    public ILimitedRateCopier createLimitedRateCopier(final int maxKBps) {
        final ILimitedRateCopier copier = new LimitedRateCopier(maxKBps);
        return copier;
    }

    /**
     * Saves the given file-listing into the crawler-document. The listing includes a
     * link to the parent directory and is stored compressed.
     *
     * @param cdoc the {@link ICrawlerDocument} to save the dir-listing to
     * @param fileListIt the entries of the listing
     * @throws IOException if writing the listing fails
     * @see #saveListing(ICrawlerDocument, Iterator, boolean, boolean)
     */
    public void saveListing(final ICrawlerDocument cdoc, final Iterator<DirlistEntry> fileListIt)
            throws IOException {
        final boolean inclParent = true;
        final boolean compress = true;
        saveListing(cdoc, fileListIt, inclParent, compress);
    }

    /**
     * Generates a file-listing in a standard format understood by Paxle. Currently this format
     * consists of a rudimentary HTML-page linking to the files in the list given by
     * <code>fileListIt</code>. The resulting format of this list is not yet finalized and subject
     * to change.
     * <p>
     * If <code>cdoc</code> does not already contain a {@link ICrawlerDocument#getContent() content-file},
     * a new temp-file is created and set as content. On success the status of the document is set
     * to {@link ICrawlerDocument.Status#OK}.
     * <p>
     * File names and URLs are HTML-escaped before they are written, so that names containing
     * characters such as <code>&lt;</code>, <code>&quot;</code> or <code>&amp;</code> can neither break
     * the markup nor inject additional links. File names are <em>not</em> URL-encoded when they are
     * appended to the base URL.
     * 
     * @param cdoc the {@link ICrawlerDocument} to save the dir-listing to
     * @param fileListIt the file-listing providing the required information to include in the result
     * @param inclParent determines whether a link to the parent directory should be included
     * @param compress determines whether the content should be compressed transparently (via GZip)
     *        to save space. Compression reduces the size of the representation of large directories
     *        up to a sixth.
     * @throws NullPointerException if <code>cdoc</code> is <code>null</code>
     * @throws IOException if the listing could not be written completely
     */
    public void saveListing(final ICrawlerDocument cdoc, final Iterator<DirlistEntry> fileListIt,
            boolean inclParent, boolean compress) throws IOException {
        if (cdoc == null)
            throw new NullPointerException("The crawler-document is null.");

        File content = cdoc.getContent();
        if (content == null) {
            content = this.tfm.createTempFile();
            cdoc.setContent(content);
        }

        final String charset = "UTF-8";

        if (compress) {
            cdoc.setMimeType("application/x-gzip");
        } else {
            cdoc.setMimeType("text/html");
            cdoc.setCharset(charset);
        }

        OutputStream writerOut = null;
        Formatter writer = null;
        try {
            // no need to buffer here, the Formatter uses a buffer internally
            writerOut = new FileOutputStream(content);

            /* Since the generated format is plain text, contains much redundant information and can potentially
             * become quite large (several hundred KB), a parameter offering compression can be specified. */
            if (compress)
                writerOut = new GZIPOutputStream(writerOut);

            writer = new Formatter(writerOut, charset);

            // getting the base dir
            String baseURL = cdoc.getLocation().toASCIIString();
            if (!baseURL.endsWith("/"))
                baseURL += "/";

            writer.format("<html><head><title>Index of %s</title></head><body><hr><table><tbody>\r\n",
                    escapeHtml(cdoc.getLocation().toString()));
            if (inclParent) {
                // getting the parent dir
                String parentDir = "/";
                if (baseURL.length() > 1) {
                    parentDir = baseURL.substring(0, baseURL.length() - 1);
                    parentDir = parentDir.substring(0, parentDir.lastIndexOf('/') + 1);

                    // If only "scheme://" is left, baseURL already is the top-most directory
                    // of its host. Link to it instead of to an invalid URL.
                    final int authorityMark = parentDir.indexOf("://");
                    if (authorityMark >= 0 && authorityMark == parentDir.length() - 3)
                        parentDir = baseURL;
                }

                writer.format("<tr><td colspan=\"3\"><a href=\"%s\">Up to higher level directory</a></td></tr>\r\n",
                        escapeHtml(parentDir));
            }

            // generate directory listing
            // FIXME: file names appended to the base URL still need to be URL-encoded.
            while (fileListIt.hasNext()) {
                final DirlistEntry entry = fileListIt.next();
                final String fileName = String.valueOf(entry.getFileName());
                final URI entryuri = entry.getFileURI();
                final String nexturi = (entryuri == null) ? baseURL + fileName : entryuri.toASCIIString();

                writer.format(
                        "<tr>" + "<td><a href=\"%1$s\">%2$s</a></td>" + "<td>%3$d Bytes</td>"
                                + "<td>%4$tY-%4$tm-%4$td %4$tT</td>" + "</tr>\r\n",
                        escapeHtml(nexturi), escapeHtml(fileName), Long.valueOf(entry.getSize()),
                        Long.valueOf(entry.getLastModified()));
            }
            writer.format("</tbody></table><hr></body></html>");

            /* A Formatter never throws the IOExceptions of its underlying stream, it only records
             * the last one. Close it here (this also finishes the GZip stream) and surface a
             * recorded error, otherwise a truncated listing would be reported as OK. */
            writer.close();
            final IOException writeError = writer.ioException();
            if (writeError != null)
                throw writeError;

            cdoc.setStatus(ICrawlerDocument.Status.OK);
        } catch (UnsupportedEncodingException e) {
            // XXX: shouldn't this be an Error?
            throw new RuntimeException(charset + " not supported", e);
        } finally {
            // closing an already closed Formatter has no effect
            if (writer != null)
                writer.close();
            else if (writerOut != null)
                writerOut.close();
        }
    }

    /**
     * Escapes the characters that have a special meaning in HTML text and in
     * quoted attribute values.
     * 
     * @param text the text to escape, must not be <code>null</code>
     * @return the escaped text
     */
    private static String escapeHtml(final String text) {
        final StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            final char c = text.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Checks whether the java runtime is unable to handle the given charset.
     * A <code>null</code> name and a syntactically illegal name both count as unsupported.
     * 
     * @param contentCharset the name of the charset to test, may be <code>null</code>
     * @return <code>true</code> if the charset is not supported, <code>false</code> if it is
     */
    boolean isUnsupportedCharset(String contentCharset) {
        if (contentCharset == null) {
            return true;
        }

        try {
            return !Charset.isSupported(contentCharset);
        } catch (IllegalCharsetNameException e) {
            // a name that is not even legal can not be supported
            return true;
        }
    }

    /**
     * Copies all data from the given stream into the crawler-document without
     * using a rate-limiting copier.
     * 
     * @param doc the crawler-document the content belongs to
     * @param is the stream to read from
     * @return the number of copied bytes
     * @throws IOException if an I/O-error occurs
     */
    public long saveInto(ICrawlerDocument doc, InputStream is) throws IOException {
        final ILimitedRateCopier noRateLimit = null;
        return saveInto(doc, is, noRateLimit);
    }

    /**
     * Copies all data from the given stream into the crawler-document without
     * limiting the size of the content (a max file size of <code>-1</code> is used).
     * 
     * @param doc the crawler-document the content belongs to
     * @param is the stream to read from
     * @param lrc the copier to use to copy the data, if any, otherwise <code>null</code>
     * @return the number of copied bytes
     * @throws IOException if an I/O-error occurs
     */
    public long saveInto(ICrawlerDocument doc, InputStream is, final ILimitedRateCopier lrc) throws IOException {
        final int noSizeLimit = -1;
        return saveInto(doc, is, lrc, noSizeLimit);
    }

    /**
     * Copies all data from the given {@link InputStream} to the given {@link ICrawlerDocument crawler-document}.<br />
     * The data is written into a newly created temp-file which is set as content of the document
     * once everything succeeded. Additionally this function ...
     * <ul>
     *  <li>resets a charset already set on the document if it is not supported by the java runtime</li>
     *  <li>detects the charset-encoding of the content using an {@link ICharsetDetector}</li>
     *  <li>generates a MD5 checksum of the content using an {@link ICryptManager}</li>
     *  <li>detects the mime-type of the content using an {@link IMimeTypeDetector}, if the
     *      document has no mime-type yet</li>
     * </ul>
     * Each of the three detection steps is skipped if the corresponding service is not available.
     * If anything fails, the output stream is closed and the temp-file is released.
     * 
     * @param doc the crawler-document the content belongs to
     * @param is the stream to read from
     * @param lrc the {@link ILimitedRateCopier} to use to copy the data, if any, otherwise <code>null</code>
     * @param maxFileSize max allowed content-size to copy in bytes or <code>-1</code>
     * @return the number of copied bytes
     * 
     * @throws NullPointerException if <code>doc</code> or <code>is</code> is <code>null</code>
     * @throws IOException if an I/O-error occurs. Any other kind of failure is wrapped into
     *    an {@link IOException}, too.
     * @throws ContentLengthLimitExceededException if the content-length read via the input stream exceeds the
     *    limit defined via maxFileSize
     */
    public long saveInto(ICrawlerDocument doc, InputStream is, final ILimitedRateCopier lrc, final int maxFileSize)
            throws IOException, ContentLengthLimitExceededException {
        if (doc == null)
            throw new NullPointerException("The crawler-document is null.");
        if (is == null)
            throw new NullPointerException("The content inputstream is null.");

        // testing if the charset is supported by java
        final String declaredCharset = doc.getCharset();
        if (declaredCharset != null && isUnsupportedCharset(declaredCharset)) {
            logger.warn(String.format("The resource '%s' has an unsupported charset '%s'. Resetting charset ...",
                    doc.getLocation(), declaredCharset));
            doc.setCharset(null);
        }

        File file = null;
        OutputStream os = null;
        try {
            // init file output-stream
            file = this.tfm.createTempFile();
            os = new BufferedOutputStream(new FileOutputStream(file));

            // limit file size
            if (maxFileSize != -1) {
                os = new ContentLengthLimitOutputStream(maxFileSize, os);
            }

            /* The optional services are bound dynamically and may disappear at any time.
             * Each one is therefore read exactly once into a local variable before use. */

            // init charset detection stream
            ACharsetDetectorOutputStream chardetos = null;
            final ICharsetDetector charsetDetector = this.chardet;
            if (charsetDetector != null) {
                chardetos = charsetDetector.createOutputStream(os);
                os = chardetos;
            }

            // init md5-calculation stream
            ACryptOutputStream md5os = null;
            final ICryptManager cryptMgr = this.cryptManager;
            final ICrypt md5 = (cryptMgr == null) ? null : cryptMgr.getCrypt("md5");
            if (md5 != null) {
                md5os = md5.createOutputStream(os);
                os = md5os;
            }

            /* ================================================================
             * COPY DATA
             * ================================================================ */
            final long copied = (lrc == null) ? ioTools.copy(is, os) : lrc.copy(is, os, -1);

            // make sure all streams in the chain have seen the complete content
            os.flush();

            /* ================================================================
             * CHARSET DETECTION
             * ================================================================ */
            if (chardetos != null) {
                final String charset = chardetos.getCharset();
                if (charset == null || charset.length() == 0) {
                    logger.debug(String.format(
                            "The CharsetDetector was unable to determine the charset for resource '%s'.",
                            doc.getLocation()));
                } else if (isUnsupportedCharset(charset)) {
                    logger.warn(String.format(
                            "The CharsetDetector detected an unsupported charset '%s' for resource '%s'.", charset,
                            doc.getLocation()));
                } else {
                    doc.setCharset(charset);
                    logger.debug(String.format("The CharsetDetector detected charset '%s' for resource '%s'.",
                            charset, doc.getLocation()));
                }
            }

            /* ================================================================
             * MD5-HASH
             * ================================================================ */
            if (md5os != null) {
                final byte[] md5sum = md5os.getHash();
                if (md5sum != null)
                    doc.setMD5Sum(md5sum);
            }

            /* The mime-type detector reads the temp-file itself, therefore the file must be
             * completely written and closed first. Closing here also ensures that a failing
             * close is handled like every other error (the temp-file is released and the
             * document keeps its old content). */
            os.close();
            os = null;

            /* ================================================================
             * MIME-TYPE DETECTION
             * ================================================================ */
            if (doc.getMimeType() == null) {
                final IMimeTypeDetector mimeDetector = this.mimeTypeDetector;
                if (mimeDetector != null) {
                    String mimeType = null;
                    try {
                        mimeType = mimeDetector.getMimeType(file);
                    } catch (Exception e) {
                        logger.warn(String.format(
                                "Unexpected '%s' while trying to determine the mime-type of resource '%s'.",
                                e.getClass().getName(), doc.getLocation()), e);
                    }
                    logger.debug(String.format("MimeType of resource '%s' was detected as '%s'", doc.getLocation(),
                            mimeType));
                    if (mimeType != null)
                        doc.setMimeType(mimeType);
                }
            }

            doc.setContent(file);
            return copied;
        } catch (Throwable e) {
            // closing stream, if it was opened and is not closed already
            if (os != null) {
                try {
                    os.close();
                } catch (Exception e2) {
                    /* ignore this, the original error is more important */
                }
            }

            // releasing temp file
            if (file != null)
                this.tfm.releaseTempFile(file);

            // re-throw IO-Exceptions
            if (e instanceof IOException)
                throw (IOException) e;

            // convert other exceptions
            IOException ioe = new IOException(e.getMessage());
            ioe.initCause(e);
            throw ioe;
        }
    }
}