org.berlin.crawl.bom.StartURLSeedReader.java Source code

Java tutorial

Introduction

Here is the source code for org.berlin.crawl.bom.StartURLSeedReader.java

Source

/* Copyright (c) 2013 Berlin Brown (berlin2research.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.berlin.crawl.bom;

/*
 * Octane crawler is a simple web crawler in Java.  All open with a liberal license.
 * 
 * http://code.google.com/p/octane-crawler/
 * http://berlin2research.com/
 * 
 * Author: Berlin Brown (berlin dot brown at gmail.com)
 * 
 * Libraries used:
 * ---------------- 
 * dom4j-1.6.1.jar, hibernate-core-4.0.1.Final.jar, hsqldb-1.8.0.10.jar, httpclient-4.2.3.jar, jackson-core-asl-1.9.12.jar, 
 * log4j-1.2.16.jar, mysql-connector-java-5.1.23.jar, opennlp-maxent-3.0.2-incubating.jar
 * opennlp-tools-1.5.2-incubating.jar, spring-core-3.1.1.RELEASE.jar, spring-web-3.1.1.RELEASE.jar, 
 * struts-core-1.3.10.jar, tagsoup-1.2.1.jar, tika-core-1.3.jar
 */

import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.berlin.crawl.bean.BotSeed;
import org.berlin.crawl.dao.BotCrawlerDAO;
import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

public class StartURLSeedReader {

    private Queue<Thread> activeThreads = new LinkedList<Thread>();

    /**
     * Max active threads for processing links.
     * Default (60)
     */
    private int maxActiveThreads = 400;

    private static final Logger logger = LoggerFactory.getLogger(StartURLSeedReader.class);

    public void launch() {
        final ApplicationContext ctx = new ClassPathXmlApplicationContext(
                "/org/berlin/batch/batch-databot-context.xml");
        final BotCrawlerDAO dao = new BotCrawlerDAO();
        final SessionFactory sf = (SessionFactory) ctx.getBean("sessionFactory");
        final Session session = sf.openSession();
        final List<BotSeed> seeds = dao.findSeedRequests(session);
        int numberOfSeeds = 0;

        // Randomize the seed list //      
        Collections.shuffle(seeds);
        for (final BotSeed seed : seeds) {
            if ("Y".equalsIgnoreCase(seed.getEnabled())) {
                numberOfSeeds++;
                if (this.activeThreads.size() < maxActiveThreads) {
                    logger.info("At Start URL Seeder, seeding / num=" + numberOfSeeds + " / " + seed.getId()
                            + " sz=" + this.activeThreads.size());
                    // Only launch so many threads               
                    final BotTrueCrawler crawl = new BotTrueCrawler(this, ctx, seed.toLink());
                    crawl.launch();
                    // Slight delay after producing the links for processing //
                    try {
                        Thread.sleep(BotTrueCrawler.DELAY_FROM_PRODUCER + (4 * 1000));
                    } catch (final InterruptedException e) {
                        e.printStackTrace();
                    } /// End of try catch //
                } else {
                    // Wait for the threads to finish, using join
                    for (final Thread wt : this.activeThreads) {
                        try {
                            logger.info("At Start URL Seeder, waiting on seeds to complete");
                            wt.join();
                        } catch (final InterruptedException e) {
                            e.printStackTrace();
                        }
                    } // End of the for //
                      // If we reached this point, remove all the threads , they finished processing
                    if (this.activeThreads.size() > 0) {
                        this.activeThreads.clear();
                    }
                } // End of the if - else //            
                logger.info("!/=! At END OF seed launching, seeding / num=" + numberOfSeeds + " / " + seed.getId());
            } // End of the if //
        } // End of the for //

        // Wait for the threads to finish, using join
        for (final Thread wt : this.activeThreads) {
            try {
                logger.info("At Start URL Seeder, waiting on seeds to complete [f66x0");
                wt.join();
            } catch (final InterruptedException e) {
                e.printStackTrace();
            }
        } // End of the for //

        logger.info("!/=! At END OF seed seeding, launch complete");
        if (session != null) {
            // May not need to close the session
            session.close();
        } // End of the if //
    } // End of the method //

    public synchronized void addThread(final Thread t) {
        if (this.activeThreads.size() < maxActiveThreads) {
            // Only allow so many threads         
            this.activeThreads.add(t);
        }
    } // End of the method //

} // End of the class //