edu.xiyou.fruits.WebCrawler.fetcher.Fetcher.java Source code

Java tutorial

Introduction

Here is the source code for edu.xiyou.fruits.WebCrawler.fetcher.Fetcher.java

Source

/*
 *Copyright (c) 2015 Andrew-Wang. 
 *
 *Licensed under the Apache License, Version 2.0 (the "License");
 *you may not use this file except in compliance with the License.
 *You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *Unless required by applicable law or agreed to in writing, software
 *distributed under the License is distributed on an "AS IS" BASIS,
 *WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *See the License for the specific language governing permissions and
 *limitations under the License.
 */
package edu.xiyou.fruits.WebCrawler.fetcher;

import edu.xiyou.fruits.WebCrawler.net.HttpRequest;
import edu.xiyou.fruits.WebCrawler.parser.*;
import edu.xiyou.fruits.WebCrawler.scheduler.BloomDepthScheduler;
import edu.xiyou.fruits.WebCrawler.scheduler.BloomScheduler;
import edu.xiyou.fruits.WebCrawler.scheduler.Scheduler;
import edu.xiyou.fruits.WebCrawler.utils.Config;
import edu.xiyou.fruits.WebCrawler.utils.FileUtils;
import edu.xiyou.fruits.WebCrawler.utils.RegexRule;
import org.apache.http.HttpStatus;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

/**
 * Created by andrew on 15-6-7.
 */
public class Fetcher {
    private static Logger logger = LoggerFactory.getLogger(Fetcher.class);

    private static int RUNNING = 0;
    private static int STOP = 1;
    private static String CURRENT_THREAD_NAME = Thread.currentThread().getName();
    private static String FETCH_OUTPUT_STRING = CURRENT_THREAD_NAME + " fetch: ";

    protected Scheduler scheduler;
    private ThreadPool threadPool = new ThreadPool(Config.THREAD_COUNT);
    protected Handler handler;
    protected int depth;

    private final AtomicInteger activeThread = new AtomicInteger(0);
    private AtomicInteger waitThread = new AtomicInteger(0);
    private AtomicInteger fetchCount = new AtomicInteger(0);

    private volatile int runState = RUNNING;

    public Fetcher(Scheduler scheduler, Handler handler, int depth) {
        this.handler = handler;
        this.scheduler = scheduler;
        this.depth = depth;
    }

    class FetcherThread implements Runnable {

        //        private Scheduler scheduler;
        private AtomicInteger fetchCount;
        private HttpRequest request = new HttpRequest();
        private Html html;
        private ReentrantLock threadLock = new ReentrantLock();
        private Condition product = threadLock.newCondition();
        private Condition consume = threadLock.newCondition();

        public FetcherThread(AtomicInteger fetchCount) {
            this.fetchCount = fetchCount;
        }

        int runState = RUNNING;

        //        private String getTask(){
        //            String url = null;
        //            threadLock.lock();
        //            try {
        //                while ((url = scheduler.takeTasks()) == null){
        //                    Thread.sleep(500);
        //                }
        //            } catch (InterruptedException e) {
        //                logger.error(e.getMessage());
        //            }finally{
        //                thread.unlock;
        //            }
        //            return null;
        //        }

        @Override
        public void run() {

            logger.info(CURRENT_THREAD_NAME + " is running \n");

            try {
                activeThread.incrementAndGet();
                while ((runState == RUNNING) && (fetchCount.get() < Config.FETCH_COUNT)) {
                    String url = null;
                    url = scheduler.takeTasks();
                    if (url != null) {

                        logger.info(FETCH_OUTPUT_STRING + url);
                        CrawlDatum datum = new CrawlDatum(url);
                        html = (Html) request.getResponse(datum);
                        datum.setFetchTime(System.currentTimeMillis());
                        if (html == null) {
                            continue;
                        }

                        int status = html.getStatusLine().getStatusCode();
                        if (status != HttpStatus.SC_OK) {
                            handler.onFail(html);
                            logger.info(FETCH_OUTPUT_STRING + url + " fail", "StatusCode: " + status);
                            continue;
                        }

                        handler.onSuccess(html);
                        List<String> urlList = handler.handleAndGetLinks(html);
                        //                        System.out.println(urlList.size());
                        scheduler.putTasks(urlList);
                        fetchCount.incrementAndGet();
                    }
                }
            } catch (InterruptedException e) {
                logger.error(e.getMessage());
            } finally {
                activeThread.decrementAndGet();
            }
        }

        public void stop() {
            runState = STOP;
        }
    }

    public void before() {
        if (scheduler instanceof BloomDepthScheduler) {
            ((BloomDepthScheduler) scheduler).merge();
        }
    }

    public void after() {
        threadPool.close();
    }

    public void fetch() {
        before();
        FetcherThread[] threads = new FetcherThread[Config.THREAD_COUNT];

        for (int i = 0; i < Config.THREAD_COUNT; i++) {
            threads[i] = new FetcherThread(fetchCount);
            threadPool.execute(threads[i]);
        }

        if (runState == STOP) {
            for (int i = 0; i < Config.THREAD_COUNT; i++) {
                threads[i].stop();
            }
        }

        if (activeThread.get() > 0) {
            try {
                Thread.sleep(20 * 60 * 1000);
            } catch (InterruptedException e) {
                logger.error(e.getMessage());
            }
        }
    }

    public void stop() {
        runState = STOP;
    }

    public static void main(String[] args) {
        Scheduler scheduler1 = new BloomScheduler();
        scheduler1.putTasks(Arrays.asList("http://www.importnew.com/all-posts", "http://blog.csdn.net/"));
        Fetcher fetcher = new Fetcher(scheduler1, new Handler() {
            @Override
            public void onSuccess(Response response) {
                //                System.out.println(((Html) response).getUrl() + response.getStatusLine());
                //                System.out.println(new String(((Html)response).getContent()));
                String path = "/home/andrew/Data/";
                String fileName = path + System.currentTimeMillis();
                try {
                    FileUtils.write2File(new File(fileName), ((Html) response).getContent());
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            @Override
            public void onFail(Response response) {

            }

            @Override
            public List<String> handleAndGetLinks(Response response) {
                LinksList list = new LinksList();
                RegexRule regexRule = new RegexRule();
                regexRule.addPositive(Arrays.asList("http://blog.csdn.net/\\w+/article/details/\\d+",
                        "http://www.importnew.com/\\d+.html", "http://blog.csdn.net/?&page=\\d+"));
                list.getLinkByA(Jsoup.parse(new String(response.getContent())), regexRule);
                return list;
            }
        }, 0);

        //        try {
        //            System.out.println(scheduler1.takeTasks());
        //        } catch (InterruptedException e) {
        //            logger.error(e.getMessage() + "bbbbbbbbbbbbbb");
        //        }
        fetcher.fetch();
    }
}