cn.edu.bit.whitesail.crawl.Crawler.java Source code

Java tutorial

Introduction

Here is the source code for cn.edu.bit.whitesail.crawl.Crawler.java

Source

/**
 * @(#)Crawler.java Mar 7, 2009
 *
 *Copyright 2009 BaiFan
 *
 *Licensed under the Apache License, Version 2.0 (the "License");
 *you may not use this file except in compliance with the License.
 *You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 *Unless required by applicable law or agreed to in writing, software
 *distributed under the License is distributed on an "AS IS" BASIS,
 *WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *See the License for the specific language governing permissions and
 *limitations under the License.
 */
package cn.edu.bit.whitesail.crawl;

import cn.edu.bit.whitesail.WhiteSail;
import cn.edu.bit.whitesail.page.Page;
import cn.edu.bit.whitesail.page.URL;
import cn.edu.bit.whitesail.parser.HtmlParser;
import cn.edu.bit.whitesail.parser.Parser;

import cn.edu.bit.whitesail.utils.MD5Signature;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpParams;

/**
 *
 *
 * @version 0.1
 * @author baifan
 * @since 0.1
 */
public class Crawler extends Thread {

    private final static Log LOG = LogFactory.getLog(Crawler.class);
    private HttpClient httpClient;
    private static transient boolean isFinished = false;
    private Parser parser;
    private String fileName;
    private long block = 1;
    private File file;
    private FileOutputStream fos;
    private String detectedEncoding;

    public Crawler() {
        httpClient = new DefaultHttpClient();
        parser = new HtmlParser();
        HttpParams params = httpClient.getParams();

        params.setParameter("http.useragent", "BaiFan Robot");
        // combine cookie
        params.setParameter("http.protocol.single-cookie-header", Boolean.TRUE);
        // socket time out
        params.setParameter("http.socket.timeout", 20000);
        // limit the redirect times
        params.setParameter("http.protocol.max-redirects", 3);
        // cookie policy,
        params.setParameter("http.protocol.cookie-policy", "compatibility");
    }

    @Override
    public void run() {
        while (!isFinished) {

            URL u = fetchURL();
            if (u == null || u.to == null || u.to.equals("")) {
                continue;
            }
            if (!WhiteSail.VISITIED_URL_TABLE.add(u.to)) {
                continue;
            }

            if (LOG.isInfoEnabled()) {
                LOG.info("download " + u.to);
            }
            byte[] contents = download(u.to);
            if (contents == null) {
                continue;
            }
            if (!WhiteSail.VISITIED_PAGE_TABLE.add(MD5Signature.calculate(contents))) {
                continue;
            }
            Page page = new Page(contents, detectedEncoding);
            page.fromURL = u.from;
            page.URL = u.to;

            WhiteSail.UNVISITED_URL_TABLE.add(parser.extractURLFromContent(page));

            appendData(page);
        }
        httpClient.getConnectionManager().shutdown();
        try {
            fos.close();
        } catch (IOException ex) {
            LOG.error("fileoutputstream can not close");
        }
    }

    private URL fetchURL() {
        URL result = null;
        while ((result = WhiteSail.UNVISITED_URL_TABLE.get()) == null) {
            try {
                LOG.info("unvisitable is empty , crawler sleeps for a while");
                Thread.sleep(10000);
            } catch (InterruptedException ex) {
                LOG.warn("crawler therad catch InterruptedException");
                continue;
            }
        }
        return result;
    }

    private byte[] download(String URL) {
        byte[] result = null;
        HttpEntity httpEntity = null;
        try {

            HttpGet httpGet = new HttpGet(URL);

            httpGet.addHeader("Accept-Language", "zh-cn,zh,en");
            httpGet.addHeader("Accept-Encoding", "gzip,deflate");

            HttpResponse response = httpClient.execute(httpGet);

            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }

            Header header = response.getFirstHeader("content-type");

            if (header == null || header.getValue().indexOf("text/html") < 0) {
                return null;
            }

            int pos = header.getValue().indexOf("charset=");
            if (pos >= 0) {
                detectedEncoding = header.getValue().substring(pos + 8);
            }

            httpEntity = response.getEntity();
            InputStream in = null;
            in = httpEntity.getContent();

            header = response.getFirstHeader("Content-Encoding");
            if (null != header) {
                if (header.getValue().indexOf("gzip") >= 0) {
                    in = new GZIPInputStream(in);
                } else if (header.getValue().indexOf("deflate") >= 0) {
                    in = new InflaterInputStream(in, new Inflater(true));
                }
            }

            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len = 0;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            result = out.toByteArray();

        } catch (IOException ex) {
            LOG.warn("downloading error,abandon");
            result = null;
        }
        return result;
    }

    private void appendData(Page page) {
        if (file == null) {
            fileName = WhiteSail.DATA_DIRECTORY + "/" + getId() + "_" + block;
            file = new File(fileName);
        }
        if (fos == null) {
            try {
                fos = new FileOutputStream(file, true);
            } catch (FileNotFoundException ex) {
                LOG.error("can not open file");
            }
        }
        try {
            fos.write(page.toByteArray());
            fos.write(page.rawContent);
            fos.flush();
        } catch (IOException ex) {
            LOG.error("can not write file");
        }

    }
}