com.sjsu.crawler.Crawler.java Source code

Introduction

Here is the source code for com.sjsu.crawler.Crawler.java, a single-threaded web crawler that repeatedly pops links from a crawl model, loads and parses each page, and feeds the newly discovered links back into the model.

Source

/*
 * 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.sjsu.crawler;

import java.util.Collection;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.sjsu.crawler.core.AbstractCrawler;
import com.sjsu.crawler.link.Link;
import com.sjsu.crawler.parser.PageData;
import com.sjsu.crawler.util.StopWatch;

/**
 * Project: Crawler
 *
 * A single-threaded crawler that pops links from its model, loads and
 * parses each page, and feeds the new outgoing links back into the
 * model until no unvisited links remain.
 */
public class Crawler extends AbstractCrawler {

    private static final Log LOG = LogFactory.getLog(Crawler.class);

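    // stop watches for the crawl phases: total wall time, page loading,
    // page parsing, and listener notification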
    private StopWatch total = new StopWatch();
    private StopWatch loading = new StopWatch();
    private StopWatch parsing = new StopWatch();
    private StopWatch listener = new StopWatch();

    /**
     * Constructor for Crawler.
     */
    public Crawler() {
        super();
    }

    /**
     * Starts the crawling process in a single thread.
     *
     * The parser and the model must be set before this method is called;
     * otherwise an {@link IllegalStateException} is thrown.
     *
     * @param server
     *            the starting server of the crawl
     * @param start
     *            the starting path of the crawl
     */
    public final void start(final String server, final String start) {

        // the parser and the model must be configured before crawling;
        // no defaults are provided, so fail fast rather than failing
        // later with a NullPointerException
        if (parser == null) {
            throw new IllegalStateException("No parser set.");
        }
        if (model == null) {
            throw new IllegalStateException("No model set.");
        }

        // reset the stop watches
        total.reset();
        loading.reset();
        parsing.reset();
        listener.reset();

        total.start();

        // add at least one link to the list
        model.add(null, server + start);

        // starts the crawling process
        start();

        total.stop();

        // output some statistics
        if (LOG.isInfoEnabled()) {

            Collection<?> visitedURIs = model.getVisitedURIs();
            Collection<?> toVisitURIs = model.getToVisitURIs();

            LOG.info("Visited URIs: " + visitedURIs.size());

            if (!toVisitURIs.isEmpty()) {
                LOG.warn("still URIs to be visited, at least: " + toVisitURIs.size());
            }

            // output stop watch data
            LOG.info("Total time: " + total.getTime() + " ms");
            LOG.info("- loading:  " + loading.getTime() + " ms");
            LOG.info("- parsing:  " + parsing.getTime() + " ms");
            LOG.info("- listener: " + listener.getTime() + " ms");
        }
    }

    /**
     * Starts the crawling process in a single thread.
     *
     * Before starting the crawling process, the model and the parser have to be
     * set.
     *
     * @see com.sjsu.crawler.core.ICrawler#start()
     */
    public final void start() {
        // loop until the model has no more URIs to visit
        while (!model.isEmpty()) {

            // remove a link from the stack
            Link link = model.pop();

            // time the page load
            loading.start();
            PageData pageData = parser.load(link);
            loading.stop();

            if (isPageDataOK(pageData)) {
                // time the parsing of the page for outgoing links
                parsing.start();
                final Collection outLinks = parser.parse(pageData, linkFilter);
                parsing.stop();

                // log the outgoing (child) links
                if (LOG.isDebugEnabled()) {
                    for (Object outLink : outLinks) {
                        LOG.debug("Outlink: " + outLink);
                    }
                }

                // time the listener notification
                listener.start();
                fireParserEvent(link, pageData, outLinks);
                listener.stop();

                // remove already visited URIs from the outgoing links list
                outLinks.removeAll(model.getVisitedURIs());

                // the rest of the URIs are new and can be visited
                model.add(link, outLinks);
            }
        }
    }

}
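
Usage

For reference, here is a minimal sketch of how the class might be driven. The SimpleHttpClientParser and MaxIterationsModel classes and the setParser/setModel setters are assumptions about the surrounding crawler library, not APIs confirmed by this file.

import com.sjsu.crawler.Crawler;
// hypothetical types; the concrete parser and model implementations
// are not shown in this listing
import com.sjsu.crawler.model.MaxIterationsModel;
import com.sjsu.crawler.parser.httpclient.SimpleHttpClientParser;

public class CrawlerDemo {

    public static void main(String[] args) {
        Crawler crawler = new Crawler();

        // both must be set, or start() throws IllegalStateException
        crawler.setParser(new SimpleHttpClientParser()); // assumed setter
        crawler.setModel(new MaxIterationsModel());      // assumed setter

        // crawl starting at http://example.com/index.html
        crawler.start("http://example.com", "/index.html");
    }
}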