edu.uci.mhlee.BasicCrawler.java Source code

Introduction

Here is the source code for edu.uci.mhlee.BasicCrawler.java
Source

package edu.uci.mhlee;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.http.Header;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * @author Yasser Ganjisaffar [lastname at gmail dot com]
 */
public class BasicCrawler extends WebCrawler {
    static Connection connection = null;
    static PropertyReader pr = null;
    static {
        try {
            pr = new PropertyReader();
            Class.forName("org.sqlite.JDBC");
            connection = DriverManager.getConnection("jdbc:sqlite:" + pr.getDbPath());
            //connection.setAutoCommit(false);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private final static Pattern BINARY_FILES_EXTENSIONS = Pattern
            .compile(".*\\.(bmp|gif|jpe?g|png|tiff?|pdf|ico|xaml|pict|rif|pptx?|ps"
                    + "|mid|mp2|mp3|mp4|wav|wma|au|aiff|flac|ogg|3gp|aac|amr|au|vox"
                    + "|avi|mov|mpe?g|ra?m|m4v|smil|wm?v|swf|aaf|asf|flv|mkv"
                    + "|zip|rar|gz|7z|aac|ace|alz|apk|arc|arj|dmg|jar|lzip|lha)" + "(\\?.*)?$"); // For url Query parts ( URL?q=... )

    /**
     * You should implement this function to specify whether the given url
     * should be crawled or not (based on your crawling logic).
     */
    @Override
    public boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase();
        //logger.info(url+" is... "+(!BINARY_FILES_EXTENSIONS.matcher(href).matches() && href.contains("ics.uci.edu")));
        return !BINARY_FILES_EXTENSIONS.matcher(href).matches() //Exclude media or other invalid extensions 
                && href.contains("ics.uci.edu") //Include only ics.uci.edu
                && !isAlreadyVisited(url) //Exclude already visited URL
                && !(href.contains("?") && href.contains("=")); //exclude url query
    }

    /**
     * This function is called when a page is fetched and ready to be processed
     * by your program.
     */
    @Override
    public void visit(Page page) {
        int docid = page.getWebURL().getDocid();
        String url = page.getWebURL().getURL();
        String domain = page.getWebURL().getDomain();
        String path = page.getWebURL().getPath();
        String subDomain = page.getWebURL().getSubDomain();
        String parentUrl = page.getWebURL().getParentUrl();
        String anchor = page.getWebURL().getAnchor();

        WebContent webContent = new WebContent();
        logger.debug("Docid: " + docid);
        logger.info("URL: " + url);
        logger.debug("Domain: " + domain);
        logger.debug("Sub-domain: " + subDomain);
        logger.debug("Path: " + path);
        logger.debug("Parent page: " + parentUrl);
        logger.debug("Anchor text: " + anchor);
        webContent.setDocid(docid);
        webContent.setUrl(url);
        webContent.setDomain(domain);
        webContent.setSubDomain(subDomain);
        webContent.setPath(path);
        webContent.setParentUrl(parentUrl);
        webContent.setAnchor(anchor);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            String title = htmlParseData.getTitle();
            List<WebURL> links = htmlParseData.getOutgoingUrls();

            logger.debug("Text length: " + text.length());
            logger.debug("Html length: " + html.length());
            logger.debug("Number of outgoing links: " + links.size());

            webContent.setTextLength(text.length());
            webContent.setHtmlLength(html.length());

            webContent.setText(text);
            webContent.setHtml(html);
            webContent.setTitle(title);

            String[] stParts = Utils.mySplit(text);
            webContent.setWordcount(stParts.length);
            webContent.setOutgoingLink(links.size());
            webContent.setLinks(links);

        }

        Header[] responseHeaders = page.getFetchResponseHeaders();
        if (responseHeaders != null) {
            logger.debug("Response headers:");
            for (Header header : responseHeaders) {
                logger.debug(header.getName() + ": " + header.getValue());
            }
        }

        logger.debug("=============");
        write2DB(webContent);
    }

    public int write2DB(WebContent content) {
        logger.info("docid:" + content.getDocid() + " added.");

        String insertSQL = null;
        try {
            Statement statement = connection.createStatement();
            statement.setQueryTimeout(30); // set timeout to 30 sec.

            insertSQL = "insert into webContents values(" + "null," + //auto increment
                    content.getDocid() + "," + "'" + content.getUrl() + "'," + "'" + content.getSubDomain() + "',"
                    + "'" + content.getPath() + "'," + "'" + content.getParentUrl() + "'," + "'"
                    + content.getAnchor() + "'," + "" + content.getTextLength() + "," + "" + content.getHtmlLength()
                    + "," + "" + content.getWordcount() + "," + "'" + content.getText() + "'," + "'"
                    + content.getHtml() + "'," + "" + content.getOutgoingLink() + "," + "'" + content.getTitle()
                    + "'" + ")";

            statement.executeUpdate(insertSQL);

            for (WebURL url : content.getLinks()) {
                insertSQL = "insert into webLinks values(" + "null," + content.getDocid() + "," + "'" + url.getURL()
                        + "'" + ")";
                statement.executeUpdate(insertSQL);
            }

        } catch (SQLException e) {
            // if the error message is "out of memory", 
            // it probably means no database file is found
            System.err.println(e.getMessage());
            System.err.println("========================");
            System.err.println(insertSQL);
            System.err.println("========================");
        } finally {
            //         try
            //         {
            //            if(connection != null)
            //               connection.close();
            //         }
            //         catch(SQLException e)
            //         {
            //            // connection close failed.
            //            System.err.println(e);
            //         }
        }

        return 0;
    }

    public boolean isAlreadyVisited(WebURL url) {
        boolean isAlreadyVisited = false;
        try {
            // create a database connection

            Statement statement = connection.createStatement();
            statement.setQueryTimeout(30); // set timeout to 30 sec.
            String urlString = url.getURL().toLowerCase();
            //url.get

            //System.out.println("select count(*) as cnt from webContents where url = '"+urlString+"'");
            ResultSet rs = statement
                    .executeQuery("select count(*) as cnt from webContents where url = '" + urlString + "'");
            while (rs.next()) {
                // read the result set
                //System.out.println("cnt = " + rs.getString("name"));
                //System.out.println("id = " + rs.getInt("id"));
                if (Integer.parseInt(rs.getString("cnt")) > 0) {
                    isAlreadyVisited = true;
                }
            }
            //System.out.println("Is already visited?"+isAlreadyVisited);
        } catch (SQLException e) {
            // if the error message is "out of memory", 
            // it probably means no database file is found
            System.err.println(e.getMessage());
        } finally {
            //         try
            //         {
            //            if(connection != null)
            //               connection.close();
            //         }
            //         catch(SQLException e)
            //         {
            //            // connection close failed.
            //            System.err.println(e);
            //         }
        }

        return isAlreadyVisited;
    }

    public static void main(String[] args) {
        String href = "http://aaaa/?alskdjfalk=10";
        String[] aaa = href.split("(\\?.*)?$");
        System.out.println(aaa[0]);
        System.out.println(aaa.length);
        Pattern URL_QUERY = Pattern.compile(".*\\.(\\?.*)?$");
        System.out.println(URL_QUERY.matcher(href).matches());
        System.out.println(href.contains("?"));
    }
}