/* **************************************************************************
 * Copyright (C) 2008 BJoRFUAN. All Rights Reserved
 * **************************************************************************
 * This module, including source code, binaries and documentation, is
 * licensed under the BSD License, and comes with NO WARRANTY.
 *
 * torao <torao@bjorfuan.com>
 * http://www.moyo.biz/
 * $Id:$
 */
package org.koiroha.jyro.workers.crawler;

import java.io.*;
import java.net.URI;
import java.nio.charset.Charset;
import java.sql.*;
import java.util.*;

import javax.xml.parsers.*;
import javax.xml.xpath.*;

import org.apache.http.*;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.log4j.Logger;
import org.koiroha.jyro.*;
import org.koiroha.jyro.filters.Transaction;
import org.koiroha.jyro.util.IO;
import org.koiroha.xml.Xml;
import org.koiroha.xml.parser.HTMLDocumentBuilderFactory;
import org.w3c.dom.*;
import org.xml.sax.InputSource;

// ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// Crawler: Crawler Worker
// ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
/**
 * The worker class to crawl web sites.
 *
 * @version $Revision:$
 * @author torao
 * @since 2011/08/06 Java SE 6
 */
public class Crawler extends Worker {

    // ======================================================================
    // Log Output
    // ======================================================================
    /**
     * Log output of this class.
     */
    private static final Logger logger = Logger.getLogger(Crawler.class);

    // ======================================================================
    // Max Content Length
    // ======================================================================
    /**
     * Maximum content length, in bytes, to read from a single response.
     */
    private long maxContentLength = 1 * 1024 * 1024;

    // ======================================================================
    // User-Agent
    // ======================================================================
    /**
     * The value of the User-Agent header.
     */
    private String userAgent = "Mozilla/5.0 (compatible; Jyrobot/"
        + Jyro.VERSION + "; +http://www.koiroha.org/jyro.html)";

    // ======================================================================
    // Constructor
    // ======================================================================
    /**
     * Default constructor.
     */
    public Crawler() {
        return;
    }

    // ======================================================================
    // Receive Specified Job
    // ======================================================================
    /**
     * Crawl the specified URL: extract its links, register the ones not
     * seen before, and queue a follow-up crawl job for each of them.
     *
     * @param url URL to crawl
     * @param referer referer URL of the request (not used by this stage yet)
     * @throws SQLException if fail to access the database
     * @throws JyroException if fail to queue follow-up jobs
     */
    @Distribute(name = "crawl", params = { "url", "referer" })
    public void crawl(String url, String referer) throws SQLException, JyroException {
        List<URI> urls = retrieveLink(url);
        WorkerContext context = getContext();
        Connection con = Transaction.getConnection();
        con.setAutoCommit(false);

        // prepare one statement to register a URL and one to test whether
        // it is already registered
        PreparedStatement insert = con.prepareStatement(
            "INSERT INTO jyro_urls(scheme,host,port,path,retrieved_at,created_at) VALUES(?,?,?,?,?,?)");
        PreparedStatement select = con.prepareStatement(
            "SELECT EXISTS(SELECT * FROM jyro_urls WHERE scheme=? AND host=? AND port=? AND path=?)");
        for (URI uri : urls) {
            int port = getPort(uri);
            if (port < 0) {
                logger.debug(uri + " is not supported");
                continue;
            }

            // skip this URL if it has been registered already
            select.setString(1, uri.getScheme());
            select.setString(2, uri.getHost());
            select.setInt(3, port);
            select.setString(4, uri.getPath());
            ResultSet rs = select.executeQuery();
            rs.next();
            boolean exists = rs.getBoolean(1);
            rs.close();

            if (!exists) {
                // register the URL and queue a follow-up crawl job
                Timestamp now = new Timestamp(System.currentTimeMillis());
                insert.setString(1, uri.getScheme());
                insert.setString(2, uri.getHost());
                insert.setInt(3, port);
                insert.setString(4, uri.getPath());
                insert.setTimestamp(5, now);
                insert.setTimestamp(6, now);
                insert.executeUpdate();
                con.commit();
                context.call("crawl", uri.toString());
            } else {
                logger.info("URL " + uri + " already retrieved");
            }
        }
        select.close();
        insert.close();
        con.commit();
        return;
    }
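    // The SQL above assumes a jyro_urls table roughly like the following.
    // This DDL is a sketch inferred from the columns referenced here; the
    // actual schema ships with Jyro and may differ:
    //
    //   CREATE TABLE jyro_urls (
    //     scheme       VARCHAR(16)  NOT NULL,
    //     host         VARCHAR(255) NOT NULL,
    //     port         INT          NOT NULL,
    //     path         TEXT         NOT NULL,
    //     retrieved_at TIMESTAMP,
    //     created_at   TIMESTAMP    NOT NULL
    //   );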
    // ======================================================================
    // URL Content Retrieve Stage
    // ======================================================================
    /**
     * Retrieve the content of the specified URI.
     *
     * @param uri URI of the content to retrieve
     * @param referer referer URI, or null to omit the Referer header
     * @return retrieved content
     * @throws IOException if fail to retrieve
     */
    private Content retrieveContent(URI uri, URI referer) throws IOException {

        // build request
        HttpClient client = new DefaultHttpClient();
        HttpGet request = new HttpGet(uri);
        request.setHeader("User-Agent", userAgent);
        if (referer != null) {
            request.setHeader("Referer", referer.toASCIIString());
        }

        HttpResponse response = null;
        String contentType = null;
        byte[] binary = null;
        InputStream in = null;
        try {

            // execute request and determine the content type
            response = client.execute(request);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                Header header = entity.getContentType();
                if (header != null) {
                    contentType = header.getValue();
                }

                // read the body, truncating at maxContentLength bytes; the
                // loop ends when the budget is exhausted or EOF is reached
                in = entity.getContent();
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                byte[] buffer = new byte[1024];
                long remaining = maxContentLength;
                while (remaining > 0) {
                    int len = (int) Math.min(buffer.length, remaining);
                    len = in.read(buffer, 0, len);
                    if (len < 0) {
                        break;
                    }
                    out.write(buffer, 0, len);
                    remaining -= len;
                }
                binary = out.toByteArray();
            }
        } finally {
            IO.close(in);
        }

        Content content = new Content(uri);
        Content.Request req = new Content.Request(
            request.getRequestLine().getMethod(),
            URI.create(request.getRequestLine().getUri()),
            request.getRequestLine().getProtocolVersion().toString());
        Content.Response res = new Content.Response(
            response.getStatusLine().getProtocolVersion().toString(),
            response.getStatusLine().getStatusCode(),
            response.getStatusLine().getReasonPhrase());
        // NOTE: req, res and binary are collected but not yet attached to
        // content; this stage is still incomplete and returns null.
        return null;
    }

    // ======================================================================
    // Content Analyzer Stage
    // ======================================================================
    /**
     * Analyze the specified content and extract the links it contains.
     * Only HTML content is analyzed; other types yield an empty list.
     *
     * @param uri URI of the content
     * @param type Content-Type of the content
     * @param content content to analyze
     * @return list of extracted links
     */
    private List<URI> analyzeContent(URI uri, String type, byte[] content) {
        List<URI> urls = Collections.emptyList();
        if (type != null && type.toLowerCase().startsWith("text/html")) {
            urls = parseHtmlContent(uri, type, content);
        }
        return urls;
    }
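    // How the two stages above are intended to compose once
    // retrieveContent() is finished (an illustrative sketch only; the
    // accessors getType() and getBinary() are hypothetical, not confirmed
    // Content API):
    //
    //   Content c = retrieveContent(uri, referer);
    //   List<URI> links = analyzeContent(uri, c.getType(), c.getBinary());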
    // ======================================================================
    // Content Analyzer Stage
    // ======================================================================
    /**
     * Parse the specified HTML content and extract its anchor links.
     *
     * @param uri URI of the content
     * @param type Content-Type of the content
     * @param content content to analyze
     * @return list of extracted links
     */
    private List<URI> parseHtmlContent(URI uri, String type, byte[] content) {
        List<URI> urls = new ArrayList<URI>();
        try {

            // parse html document, guessing the charset from the content
            Charset def = Xml.getCharset(type);
            HTMLDocumentBuilderFactory factory = new HTMLDocumentBuilderFactory();
            factory.setNamespaceAware(false);
            factory.setXIncludeAware(false);
            InputSource is = factory.guessInputSource(
                new ByteArrayInputStream(content), def.name(), content.length);
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(is);

            // extract links and resolve them against the base URI
            XPath xpath = XPathFactory.newInstance().newXPath();
            NodeList nl = (NodeList) xpath.evaluate("//a/@href", doc, XPathConstants.NODESET);
            for (int i = 0; i < nl.getLength(); i++) {
                String href = ((Attr) nl.item(i)).getValue();
                urls.add(uri.resolve(href));
            }
        } catch (Exception ex) {
            throw new IllegalStateException(ex);
        }
        return urls;
    }

    // ======================================================================
    // Determine Port
    // ======================================================================
    /**
     * Determine the port number of the specified URI. Returns a negative
     * value if the URI scheme is not supported.
     *
     * @param uri URI
     * @return port number, or -1 if the scheme is not supported
     */
    private static int getPort(URI uri) {
        int port = uri.getPort();
        if (port >= 0) {
            return port;
        }
        String scheme = uri.getScheme();
        if (scheme == null) {
            return -1;
        }
        if (scheme.equals("http")) {
            return 80;
        } else if (scheme.equals("https")) {
            return 443;
        } else if (scheme.equals("ftp")) {
            return 21;
        }
        return -1;
    }

    // ======================================================================
    // Retrieve Links
    // ======================================================================
    /**
     * Retrieve the HTML page at the specified URL and return the links it
     * contains. Non-HTML responses yield an empty list.
     *
     * @param url URL of the page to scan
     * @return list of links found in the page
     * @throws WorkerException if fail to retrieve or parse the page
     */
    public List<URI> retrieveLink(String url) throws WorkerException {
        logger.debug("running crawler to: " + url);
        List<URI> urls = new ArrayList<URI>();
        try {
            HttpClient client = new DefaultHttpClient();
            HttpGet request = new HttpGet(url);
            request.setHeader("User-Agent", userAgent);
            HttpResponse response = client.execute(request);

            // determine the content type; guard against a missing header
            Header header = response.getFirstHeader("Content-Type");
            String contentType = (header == null) ? "" : header.getValue();
            logger.debug("Content-Type: " + contentType);
            if (contentType.toLowerCase().startsWith("text/html")) {

                // parse the response as html and extract anchor links
                DocumentBuilderFactory factory = new HTMLDocumentBuilderFactory();
                DocumentBuilder builder = factory.newDocumentBuilder();
                Document doc = builder.parse(response.getEntity().getContent());
                XPath xpath = XPathFactory.newInstance().newXPath();
                NodeList nl = (NodeList) xpath.evaluate("//a/@href", doc, XPathConstants.NODESET);
                for (int i = 0; i < nl.getLength(); i++) {
                    String href = ((Attr) nl.item(i)).getValue();
                    urls.add(URI.create(url).resolve(href));
                }
            }
        } catch (Exception ex) {
            throw new WorkerException(ex);
        }
        return urls;
    }
}
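// ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
// CrawlerExample: Usage Sketch
// ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
/**
 * A hypothetical usage sketch, not part of the Jyro distribution. In
 * production, Jyro dispatches "crawl" jobs to Crawler through the
 * {@code @Distribute} annotation; this class only exercises the public
 * retrieveLink() stage directly.
 */
class CrawlerExample {
    public static void main(String[] args) throws Exception {
        Crawler crawler = new Crawler();
        // print every link found on the page (requires network access)
        for (URI uri : crawler.retrieveLink("http://www.example.com/")) {
            System.out.println(uri);
        }
    }
}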