cn.austin.crawle.Crawler.java Source code

Java tutorial

Introduction

Here is the source code for cn.austin.crawle.Crawler.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package cn.austin.crawle;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.concurrent.BlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

/**
 * Worker that downloads Douban book-search result pages and puts the raw HTML of
 * each page onto a shared queue.
 *
 * <p>Each loop iteration of {@link #run()} asks the {@code PageGenerator} for a
 * page number, fetches that page with {@link #crawleTheContent(int)} and puts
 * the result on the message queue. The loop ends once the page number exceeds
 * {@code PageGenerator.getMaxPage()} or the thread is interrupted.
 *
 * <p>An instance created with the no-argument constructor has no queue and no
 * page generator; only {@link #crawleTheContent(int)} is usable on it.
 *
 * @author Berton
 */
public class Crawler implements Runnable {

    private static final Logger LOGGER = Logger.getLogger(Crawler.class.getName());

    /**
     * Multiplier that turns a page number into the {@code start} query parameter
     * (presumably the number of results per search page -- confirm against the site).
     */
    private static final int RESULTS_PER_PAGE = 14;

    /**
     * Search URL with a single {@code %s} placeholder for the start offset.
     * The placeholder is filled with {@link String#replace} rather than
     * {@link String#format}, because the percent-encoded search text
     * ({@code %E7%BC...}) would be misread as format specifiers.
     */
    private static final String SEARCH_URL_TEMPLATE =
            "https://book.douban.com/subject_search?start=%s&search_text=%E7%BC%96%E7%A8%8B&cat=1001";

    BlockingQueue<String> messageQueue = null;
    private PageGenerator pageGenerator = null;

    /**
     * Creates a crawler without a queue or page generator. {@link #run()} must
     * not be called on such an instance.
     */
    public Crawler() {
    }

    /**
     * Creates a crawler that is ready to be run as a worker.
     *
     * @param messageQueue  queue that receives the content of every fetched page
     * @param pageGenerator source of the page numbers to fetch
     */
    public Crawler(BlockingQueue<String> messageQueue, PageGenerator pageGenerator) {
        this.messageQueue = messageQueue;
        this.pageGenerator = pageGenerator;
    }

    /**
     * Fetches one search-result page and returns its body as a single string.
     *
     * <p>The body is decoded as UTF-8 and read line by line; line terminators
     * are dropped, so the returned string contains no line breaks.
     *
     * @param currpage 1-based page number; page 1 maps to {@code start=1}
     * @return the page body with line terminators removed
     * @throws IOException if the request fails, the response carries no body,
     *                     or the body cannot be read
     */
    public String crawleTheContent(int currpage) throws IOException {
        int start = (currpage - 1) * RESULTS_PER_PAGE + 1;
        String url = SEARCH_URL_TEMPLATE.replace("%s", String.valueOf(start));
        HttpGet httpGet = new HttpGet(url);

        httpGet.addHeader("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        // NOTE(review): this advertises sdch/br; confirm the HTTP client in use can
        // actually decode those encodings if the server chooses them.
        httpGet.addHeader("Accept-Encoding", "gzip, deflate, sdch, br");
        httpGet.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
        httpGet.addHeader("Connection", "keep-alive");
        httpGet.addHeader("Cookie",
                "gr_user_id=f2ae551f-d8be-472d-a17c-fb427123b958; viewed=\"1470240_11386364\"; ll=\"108296\"; bid=DT2w_1e4hnY; _vwo_uuid_v2=A0ADC053693BC0BAB806B37B77D3EE2E|58055d5c7afc6b50c4901fc93157dfdd; _pk_id.100001.3ac3=69b82fcb3d85b9cb.1481024949.4.1481087406.1481074949.; __utma=30149280.413198661.1441366971.1481074927.1481087106.26; __utmc=30149280; __utmz=30149280.1480767548.22.22.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=81379588.563439035.1481024949.1481074927.1481087107.4; __utmc=81379588; __utmz=81379588.1481024949.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)");
        httpGet.addHeader("Host", "book.douban.com");
        httpGet.addHeader("Referer", "https://book.douban.com/");
        httpGet.addHeader("Upgrade-Insecure-Requests", "1");
        httpGet.addHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36");

        // try-with-resources closes the response and then the client even when
        // execute() or the read fails, without masking the original exception.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
                CloseableHttpResponse response = httpclient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                throw new IOException("No response body for page " + currpage + " (" + url + ")");
            }
            // Closing the reader also closes the entity's content stream.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(entity.getContent(), "UTF-8"))) {
                StringBuilder result = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    result.append(line);
                }
                return result.toString();
            }
        }
    }

    /**
     * Fetches pages until the page generator reports a page number beyond its
     * maximum, putting each page's content on the message queue.
     *
     * <p>A page that fails with an {@link IOException} is logged and skipped.
     * If the thread is interrupted while waiting to enqueue a page, the
     * interrupt status is restored and the method returns.
     */
    @Override
    public void run() {
        while (true) {
            int currIndex = pageGenerator.getCurr();
            if (currIndex > pageGenerator.getMaxPage()) {
                return;
            }
            try {
                messageQueue.put(crawleTheContent(currIndex));
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, "Failed to crawl page " + currIndex, ex);
            } catch (InterruptedException ex) {
                // Restore the flag so the owner of this thread can see the
                // interruption, then stop: continuing would only fail again.
                Thread.currentThread().interrupt();
                LOGGER.log(Level.SEVERE, "Interrupted while queueing page " + currIndex, ex);
                return;
            }
        }
    }

}