Java tutorial
/*
 * Copyright (c) 2013, FPX and/or its affiliates. All rights reserved.
 * Use, Copy is subject to authorized license.
 */
package com.camel.crawler;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.DefaultHttpClient;

import com.camel.utils.FileUtils;
import com.squareup.okhttp.OkHttpClient;
import com.squareup.okhttp.Request;
import com.squareup.okhttp.Response;

/**
 * Sequentially fetches numbered pages below {@link #URL_PRE}, extracts the text of a
 * {@code <span>} element from each page and appends it to {@code emailout.txt}.
 *
 * @author dengqb
 * @date 2015313
 */
public class WebCrawler {

    // Previous crawl target, kept for reference:
    //   URL_PRE = "http://www.88152.com/shop/", pageInit = 1845289, pageEnd = 1937300
    private static final String URL_PRE = "http://b2b.88152.com/show-";

    /** Page counter start value; the first page actually requested is {@code PAGE_INIT + 1}. */
    private static final int PAGE_INIT = 1;

    /** Number of the last page to request. */
    private static final int PAGE_END = 1887909;

    /** Delay between two consecutive page requests, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 700;

    /** Back-off after an I/O failure before the same page is retried, in milliseconds. */
    private static final long ERROR_DELAY_MILLIS = 60000;

    // NOTE(review): ".*" is greedy, so when a line holds several <span> elements group 2 runs
    // up to the LAST "</span>" on that line - confirm against the real page markup.
    private static final Pattern EMAIL_PATTERN = Pattern.compile("(<span>)(.*)(</span>)");

    private final OkHttpClient client;
    private final File emailFile;

    /**
     * Creates a crawler that writes its findings to {@code emailout.txt} (relative path)
     * and uses a 2 second connect timeout.
     */
    public WebCrawler() {
        client = new OkHttpClient();
        // Configure the timeout once instead of on every request.
        client.setConnectTimeout(2, TimeUnit.SECONDS);
        emailFile = new File("emailout.txt");
    }

    /**
     * Downloads {@code url} and, when the server answers with HTTP 200, passes the body to
     * {@link #extraInfo(String)}. Any other status code is only reported on stdout.
     *
     * @param url absolute URL of the page to fetch
     * @throws IOException if the request fails or the body cannot be read
     */
    public void fetchWeb(String url) throws IOException {
        Request request = new Request.Builder().url(url).build();
        Response response = client.newCall(request).execute();
        try {
            if (response.code() == 200) {
                extraInfo(response.body().string());
            } else {
                System.out.println("got error page");
            }
        } finally {
            // Release the connection on every path; without this, non-200 responses leak it.
            response.body().close();
        }
    }

    /**
     * Searches {@code webpage} for the first match of the span pattern and, when the captured
     * text is not empty, appends it (trimmed, followed by a line separator) to the output file.
     *
     * @param webpage page content to scan
     */
    public void extraInfo(String webpage) {
        Matcher matcher = EMAIL_PATTERN.matcher(webpage);
        if (!matcher.find()) {
            return;
        }
        String email = matcher.group(2);
        if (StringUtils.isEmpty(email)) {
            System.out.println("empty email address");
            return;
        }
        System.out.println("find email=" + email);
        FileUtils.writeAppendFile(emailFile, email.trim() + System.lineSeparator());
    }

    /**
     * Crawls pages {@code PAGE_INIT + 1} through {@code PAGE_END}. A page that fails with an
     * {@link IOException} is retried after a one minute pause; any other exception is reported
     * and the page is skipped. The crawl stops early when the thread is interrupted.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        WebCrawler crawler = new WebCrawler();
        int page = PAGE_INIT;
        // Bounded by "<" rather than an equality test after the fetch, so that an unexpected
        // exception on the last page cannot push the counter past PAGE_END and loop forever.
        while (page < PAGE_END) {
            page++;
            System.out.println("url num:=" + page);
            try {
                crawler.fetchWeb(URL_PRE + page + "/");
            } catch (IOException e) {
                // Step back so the same page is requested again after the back-off.
                page--;
                e.printStackTrace();
                System.out.println("exception sleep 1 min");
                if (!pause(ERROR_DELAY_MILLIS)) {
                    return;
                }
            } catch (Exception e) {
                System.out.println("unknow exception");
                e.printStackTrace();
                // As before, an unexpected failure skips the regular delay and moves on.
                continue;
            }
            if (!pause(REQUEST_DELAY_MILLIS)) {
                return;
            }
        }
    }

    /**
     * Sleeps for {@code millis} milliseconds.
     *
     * @return {@code true} if the full pause elapsed, {@code false} if the thread was
     *         interrupted (the interrupt flag is restored so callers can stop)
     */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }
}