com.camel.crawler.WebCrawler.java Source code

Java tutorial

Introduction

Here is the source code for com.camel.crawler.WebCrawler.java

Source

/*
 * Copyright (c) 2013, FPX and/or its affiliates. All rights reserved.
 * Use, Copy is subject to authorized license.
 */
package com.camel.crawler;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;

import com.camel.utils.FileUtils;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;

/**
 * Sequentially downloads numbered pages below {@link #URL_PRE}, extracts the
 * text found between a {@code <span>} and a {@code </span>} tag on each page
 * and appends it to {@code emailout.txt}.
 *
 * @author dengqb
 * @date 2015313
 */
public class WebCrawler {
    // Previous crawl target, kept for reference:
    //   URL_PRE  = "http://www.88152.com/shop/"
    //   pageInit = 1845289
    //   pageEnd  = 1937300
    private static final String URL_PRE = "http://b2b.88152.com/show-";

    /** Page number the crawl resumes after; the first page fetched is {@code PAGE_INIT + 1}. */
    private static final int PAGE_INIT = 1;

    /** Last page number that is fetched (inclusive). */
    private static final int PAGE_END = 1887909;

    /** Pause between two consecutive requests. */
    private static final long REQUEST_INTERVAL_MILLIS = 700L;

    /** Extra pause before a page is retried after an {@link IOException}. */
    private static final long ERROR_BACKOFF_MILLIS = 60000L;

    /**
     * Group 2 captures the text between {@code <span>} and {@code </span>}.
     * {@code .*} is greedy and does not cross line terminators, so when a line
     * holds several spans the capture runs up to the last {@code </span>} on it.
     */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("(<span>)(.*)(</span>)");

    private final OkHttpClient client;
    private final File emailFile;

    /**
     * Creates a crawler with a 2 second connect timeout that writes its
     * results to {@code emailout.txt} in the working directory.
     */
    public WebCrawler() {
        client = new OkHttpClient();
        // Configure the shared client once instead of on every request.
        client.setConnectTimeout(2, TimeUnit.SECONDS);
        emailFile = new File("emailout.txt");
    }

    /**
     * Downloads {@code url} and, when the server answers with HTTP 200, passes
     * the body to {@link #extraInfo(String)}. Any other status is only reported
     * on standard output.
     *
     * @param url absolute URL of the page to download
     * @throws IOException if the request fails or the body cannot be read
     */
    public void fetchWeb(String url) throws IOException {
        Request request = new Request.Builder().url(url).build();
        Response response = client.newCall(request).execute();
        try {
            if (response.code() == 200) {
                extraInfo(response.body().string());
            } else {
                System.out.println("got error page");
            }
        } finally {
            // The body must be closed on every path (not only when it was read),
            // otherwise the underlying connection is leaked.
            response.body().close();
        }
    }

    /**
     * Searches {@code webpage} for the first {@code <span>...</span>} match and
     * appends the trimmed captured text, followed by a line separator, to the
     * output file. Nothing is written when there is no match or the captured
     * text is blank.
     *
     * @param webpage HTML of the downloaded page; {@code null} is ignored
     */
    public void extraInfo(String webpage) {
        if (webpage == null) {
            return;
        }
        Matcher matcher = EMAIL_PATTERN.matcher(webpage);
        if (!matcher.find()) {
            return;
        }
        // Trim before the emptiness check so a whitespace-only capture is not
        // written to the file as a blank line.
        String email = StringUtils.trim(matcher.group(2));
        if (StringUtils.isNotEmpty(email)) {
            System.out.println("find email=" + email);
            FileUtils.writeAppendFile(emailFile, email + System.lineSeparator());
        } else {
            System.out.println("empty email address");
        }
    }

    /**
     * Crawls pages {@code PAGE_INIT + 1} through {@code PAGE_END} in order.
     * <p>
     * A page that fails with an {@link IOException} is retried after a one
     * minute back-off (there is no retry limit); a page that fails with any
     * other exception is reported and skipped. The crawl stops early when the
     * thread is interrupted.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        WebCrawler crawler = new WebCrawler();
        int page = PAGE_INIT;

        // "<" rather than "==" so the loop also ends when the start value is
        // already at or past the end, or when the last page is skipped.
        while (page < PAGE_END) {
            int next = page + 1;
            long pauseMillis = REQUEST_INTERVAL_MILLIS;

            System.out.println("url num:=" + next);
            try {
                crawler.fetchWeb(URL_PRE + next + "/");
                page = next;
            } catch (IOException e) {
                // Network problem: leave "page" untouched so the same page is
                // requested again after waiting one minute.
                e.printStackTrace();
                System.out.println("exception sleep 1 min");
                pauseMillis += ERROR_BACKOFF_MILLIS;
            } catch (RuntimeException e) {
                // Unexpected failure while handling this page: report it and
                // move on so one bad page cannot stall the whole crawl.
                System.out.println("unknow exception");
                e.printStackTrace();
                page = next;
            }

            try {
                Thread.sleep(pauseMillis);
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop; continuing would make
                // every following sleep fail immediately.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }
}