wuit.crawler.CrawlerHtml.java Source code

Java tutorial

Introduction

Here is the source code for wuit.crawler.CrawlerHtml.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package wuit.crawler;

import wuit.crawler.main.MainServer;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.ExecutionContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import wuit.common.crawler.CrawlerUtiles;
import wuit.common.crawler.db.KeyValue;

/**
 * Worker thread that repeatedly asks {@code MainServer.DBCrawler} for a URL, downloads the
 * page, stores it and hands the links found on it back to {@code MainServer.DBCrawler}.
 *
 * <p>The thread keeps running while {@link #state} is {@code 1}. It stops on its own after
 * {@code waiting} consecutive polls that yield no URL, or when it is interrupted.
 *
 * @author lxl
 */
public class CrawlerHtml extends Thread {
    private static final Logger LOGGER = Logger.getLogger(CrawlerHtml.class.getName());

    /** Pause after a poll that returned no URL, in milliseconds. */
    private static final long EMPTY_POLL_SLEEP_MS = 2000L;

    /** Pause after a page has been processed, in milliseconds. */
    private static final long AFTER_PAGE_SLEEP_MS = 1000L;

    /** Charsets tried, in this order, by {@link #getEncoding(String)}. */
    private static final String[] ENCODING_CANDIDATES = { "ISO-8859-1", "UTF-8", "GB2312", "GBK", "BIG5" };

    /**
     * Finds the first {@code charset=...} attribute inside a {@code <meta ...>} tag, whatever
     * the quoting style, attribute order or spacing. Group 1 is the bare charset name.
     */
    private static final Pattern META_CHARSET = Pattern.compile(
            "<meta\\b[^>]*?\\bcharset\\s*=\\s*[\"']?\\s*([A-Za-z0-9._:+-]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Run flag: the main loop continues while this is {@code 1}; any other value stops it.
     * Volatile because it is written by {@link #setState(int)} from other threads and read
     * without synchronization in {@link #run()}.
     */
    public volatile int state = 1;

    /** Passed through unchanged to {@code MainServer.DBCrawler.filterUrls}. */
    private final int crawlerMode;

    /** Links are only extracted from pages whose depth is below this value. */
    private final int maxDeepth;

    /** Passed through unchanged to {@code MainServer.DBCrawler.filterUrls}. */
    private final Map<String, KeyValue> mapFilterUrl;

    /** Number of consecutive empty polls after which the thread gives up. */
    private final int waiting = 5;

    public CrawlerHtml(int crawlerMode, int maxDeepth, Map<String, KeyValue> mapFilterUrl) {
        this.crawlerMode = crawlerMode;
        this.maxDeepth = maxDeepth;
        this.mapFilterUrl = mapFilterUrl;
    }

    /**
     * Main loop: poll for a URL, crawl it, pause, repeat.
     *
     * <p>The give-up check happens only after an empty poll, so a URL that has been obtained
     * is always processed and never discarded.
     */
    @Override
    public void run() {
        int waitCount = 0;
        while (state == 1) {
            try {
                DSCrawlerUrl url = MainServer.DBCrawler.getCrawlerUrl();
                if (url == null || url.url == null || url.url.equals("")) {
                    Thread.sleep(EMPTY_POLL_SLEEP_MS);
                    waitCount = waitCount + 1;
                    if (waitCount >= waiting) {
                        setState(0);
                        break;
                    }
                    continue;
                }
                crawlePage(url);
                waitCount = 0;
                Thread.sleep(AFTER_PAGE_SLEEP_MS);
            } catch (InterruptedException ex) {
                // Restore the flag for whoever interrupted us and stop instead of looping on.
                Thread.currentThread().interrupt();
                LOGGER.log(Level.INFO, "crawler thread interrupted, stopping", ex);
                setState(0);
                break;
            }
        }
    }

    /**
     * Sets the run flag.
     *
     * @param val {@code 1} to keep the loop running, anything else to stop it
     */
    public synchronized void setState(int val) {
        state = val;
    }

    /**
     * Downloads one page, stores it, and - if the page is above the depth limit - extracts,
     * filters and queues its links.
     *
     * @param crawlerUrl the URL to crawl together with its crawl depth
     */
    public void crawlePage(DSCrawlerUrl crawlerUrl) {
        String html = doGetHttp(crawlerUrl.url);

        // A fresh descriptor is used for storage so only url + derived host data are recorded.
        DSCrawlerUrl pageUrl = new DSCrawlerUrl();
        pageUrl.url = crawlerUrl.url;
        getUrlInfo(pageUrl);
        MainServer.DBCrawler.putHtmlPage(pageUrl, html);

        if (crawlerUrl.crawlerDeepth < maxDeepth) {
            Map<String, DSCrawlerUrl> mapPageUrls = extractLinks(html, crawlerUrl);
            MainServer.DBCrawler.filterUrls(crawlerMode, mapPageUrls, pageUrl, mapFilterUrl);
            MainServer.DBCrawler.putCrawlerUrls(mapPageUrls, crawlerUrl.crawlerDeepth + 1);
        }
    }

    /**
     * Collects the targets of all {@code <a href>} tags in a page.
     *
     * <p>Links are skipped when the href is missing or empty, contains {@code #}, contains
     * the text "script" (case-insensitive), or cannot be turned into an HTTP(S) URL.
     *
     * @param html        page markup; {@code null} or empty yields an empty map
     * @param currPageUrl the page the markup came from; used to resolve relative links and
     *                    to compute the depth of the links
     * @return map from the raw href text to its parsed descriptor; never {@code null}
     */
    public Map<String, DSCrawlerUrl> extractLinks(String html, DSCrawlerUrl currPageUrl) {
        Map<String, DSCrawlerUrl> mapPageUrls = new HashMap<String, DSCrawlerUrl>();
        if (html == null || html.equals("")) {
            return mapPageUrls;
        }
        try {
            Parser parser = Parser.createParser(html, "utf-8");
            NodeList nodelist = parser.parse(new TagNameFilter("a"));
            NodeIterator it = nodelist.elements();
            while (it.hasMoreNodes()) {
                Object node = it.nextNode();
                if (!(node instanceof LinkTag)) {
                    continue;
                }
                String href = ((LinkTag) node).getAttribute("href");
                // Null/empty must be tested first: the other checks dereference the value.
                if (href == null || href.equals("")) {
                    continue;
                }
                if (href.indexOf("#") >= 0
                        || href.toLowerCase(java.util.Locale.ROOT).indexOf("script") >= 0) {
                    continue;
                }
                DSCrawlerUrl linkUrl = parsePageUrl(href, currPageUrl.url);
                if (linkUrl == null) {
                    // One unusable link must not abort extraction of the remaining ones.
                    continue;
                }
                linkUrl.crawlerDeepth = currPageUrl.crawlerDeepth + 1;
                mapPageUrls.put(href, linkUrl);
            }
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, " composite Convert extractorUrl :" + e.getMessage(), e);
        }
        return mapPageUrls;
    }

    /**
     * Turns an href into an absolute URL descriptor.
     *
     * <p>Absolute {@code http:}/{@code https:} links are used as they are. Anything else is
     * resolved against {@code pageUrl} with {@link URL#URL(URL, String)}, so {@code ../x},
     * {@code x.html}, {@code /x} and {@code //host/x} all resolve the way a browser would.
     *
     * @param url     the href text
     * @param pageUrl absolute URL of the page containing the link
     * @return the descriptor, or {@code null} if the link is {@code null}, cannot be
     *         resolved, or does not resolve to an HTTP(S) URL
     */
    public DSCrawlerUrl parsePageUrl(String url, String pageUrl) {
        if (url == null) {
            return null;
        }
        String link = url.trim();
        DSCrawlerUrl info = new DSCrawlerUrl();
        try {
            boolean absolute = link.regionMatches(true, 0, "http:", 0, 5)
                    || link.regionMatches(true, 0, "https:", 0, 6);
            if (absolute) {
                info.url = link;
            } else {
                URL resolved = new URL(new URL(pageUrl), link);
                String protocol = resolved.getProtocol();
                if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
                    // mailto:, ftp: etc. cannot be fetched by doGetHttp.
                    return null;
                }
                info.url = resolved.toString();
            }
            getUrlInfo(info);
        } catch (MalformedURLException e) {
            return null;
        }
        return info;
    }

    /**
     * Fills {@code dns}, {@code path}, {@code file} and {@code IP} of the descriptor from its
     * {@code url}. Failures are logged and leave the remaining fields untouched.
     */
    private void getUrlInfo(DSCrawlerUrl info) {
        if (info == null || info.url == null || info.url.equals("")) {
            return;
        }
        try {
            URL parsed = new URL(info.url);
            String host = parsed.getHost() + "";
            info.dns = host;
            info.path = parsed.getPath();
            // NOTE(review): "file" receives the protocol, not URL.getFile() - kept as is
            // because downstream use is not visible here; confirm this is intended.
            info.file = parsed.getProtocol();
            // getByName("") would answer with the loopback address, so skip host-less URLs.
            if (!host.equals("")) {
                InetAddress address = InetAddress.getByName(host);
                if (address != null) {
                    info.IP = address.getHostAddress();
                }
            }
        } catch (MalformedURLException e) {
            LOGGER.log(Level.WARNING, " crawlerHtml   " + e.getMessage(), e);
        } catch (UnknownHostException e) {
            LOGGER.log(Level.WARNING, " crawlerHtml   " + e.getMessage(), e);
        }
    }

    /**
     * Builds the start descriptor for a Baidu search on the given key words.
     *
     * @param keyWords search terms; URL-encoded as UTF-8
     * @return descriptor with depth 0 and title "start Baidu"
     */
    public DSCrawlerUrl getBaiDuKeyWordURL(String keyWords) {
        String encoded = "";
        try {
            encoded = java.net.URLEncoder.encode(keyWords, "utf-8");
        } catch (UnsupportedEncodingException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }

        DSCrawlerUrl pageUrl = new DSCrawlerUrl();
        pageUrl.url = "http://www.baidu.com/s?wd=" + encoded + "&cl=3&pn=10&tn=site888_pg&rn=100";
        pageUrl.crawlerDeepth = 0;
        pageUrl.title = "start Baidu";
        return pageUrl;
    }

    /**
     * Downloads a page with HTTP GET and returns its text.
     *
     * <p>An empty string is returned when the request fails, the status is not 200, there is
     * no body, or the body carries no {@code <meta>} charset declaration (see
     * {@link #getHtmlEncode(String)}). This method never throws; failures are logged.
     *
     * @param url the address to fetch
     * @return the decoded page text, or {@code ""}
     */
    public String doGetHttp(String url) {
        HttpParams params = new BasicHttpParams();
        HttpConnectionParams.setConnectionTimeout(params, 12000);
        HttpConnectionParams.setSoTimeout(params, 9000);
        HttpClient httpclient = new DefaultHttpClient(params);
        try {
            HttpGet httpget = new HttpGet(url);
            System.out.println("executing request " + url);
            httpget.addHeader("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7; .NET CLR 1.1.4322; CIBA; .NET CLR 2.0.50727)");

            HttpResponse response = httpclient.execute(httpget);
            if (response.getStatusLine().getStatusCode() != 200) {
                return "";
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return "";
            }
            // Work from the raw bytes: decoding twice (once by the client, once here)
            // corrupts text whenever the first guess was not ISO-8859-1.
            String headerCharset = EntityUtils.getContentCharSet(entity);
            byte[] body = EntityUtils.toByteArray(entity);
            return decodePage(body, headerCharset);
        } catch (Exception e) {
            // Boundary catch: callers rely on "" rather than an exception for a failed fetch.
            LOGGER.log(Level.WARNING, "request failed for " + url, e);
            return "";
        } finally {
            // Releases all connections held by this single-use client.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Decodes a response body.
     *
     * <p>Bodies without a {@code <meta>} charset are rejected with {@code ""}. Otherwise the
     * Content-Type header charset is preferred, except when it is absent or ISO-8859-1
     * (frequently just a server default), in which case the {@code <meta>} charset wins.
     * If the preferred charset is unsupported the other one is tried, then ISO-8859-1.
     */
    private String decodePage(byte[] body, String headerCharset) {
        // ISO-8859-1 maps every byte to one char, so the markup can be scanned losslessly.
        String latin1 = decode(body, "ISO-8859-1");
        String metaCharset = getHtmlEncode(latin1);
        if (metaCharset.isEmpty()) {
            return "";
        }
        boolean trustHeader = headerCharset != null
                && headerCharset.trim().length() > 0
                && !"ISO-8859-1".equalsIgnoreCase(headerCharset.trim());
        String preferred = trustHeader ? headerCharset : metaCharset;
        String fallback = trustHeader ? metaCharset : headerCharset;

        String text = decode(body, preferred);
        if (text == null) {
            text = decode(body, fallback);
        }
        return text != null ? text : latin1;
    }

    /** Decodes bytes with the named charset; {@code null} if the name is blank or unsupported. */
    private static String decode(byte[] body, String charsetName) {
        if (charsetName == null || charsetName.trim().length() == 0) {
            return null;
        }
        try {
            return new String(body, charsetName.trim());
        } catch (UnsupportedEncodingException e) {
            return null;
        }
    }

    /**
     * Extracts the charset declared in the first {@code <meta>} tag that carries one, e.g.
     * {@code <meta charset="utf-8">} or
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=gb2312">}.
     *
     * @param s page markup; may be {@code null}
     * @return the charset name as written in the page, or {@code ""} if none is declared
     */
    public String getHtmlEncode(final String s) {
        if (s == null) {
            return "";
        }
        Matcher m = META_CHARSET.matcher(s);
        return m.find() ? m.group(1) : "";
    }

    /**
     * Returns the first of ISO-8859-1, UTF-8, GB2312, GBK, BIG5 through which the string
     * survives an encode/decode round trip unchanged.
     *
     * <p>Note that any text made only of characters up to U+00FF yields "ISO-8859-1", so the
     * result says which charset <em>can represent</em> the text, not which one it came from.
     *
     * @param str text to test; may be {@code null}
     * @return the charset name, or {@code ""} if none fits or {@code str} is {@code null}
     */
    public String getEncoding(String str) {
        if (str == null) {
            return "";
        }
        for (int i = 0; i < ENCODING_CANDIDATES.length; i++) {
            String candidate = ENCODING_CANDIDATES[i];
            try {
                if (str.equals(new String(str.getBytes(candidate), candidate))) {
                    return candidate;
                }
            } catch (UnsupportedEncodingException e) {
                // This JVM lacks the charset; the remaining candidates are still worth trying.
                LOGGER.log(Level.FINE, "charset not available: " + candidate, e);
            }
        }
        return "";
    }

    /**
     * Returns the first match of a regular expression, compiled case-insensitively and in
     * multi-line mode.
     *
     * @param content text to search; may be {@code null}
     * @param filter  regular expression; may be {@code null}
     * @return the matched text, or {@code ""} if there is no match, an argument is
     *         {@code null}, or the expression is invalid
     */
    public String match(String content, String filter) {
        if (content == null || filter == null) {
            return "";
        }
        try {
            Matcher m = Pattern.compile(filter, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE).matcher(content);
            return m.find() ? m.group() : "";
        } catch (IllegalArgumentException e) {
            // PatternSyntaxException is an IllegalArgumentException.
            LOGGER.log(Level.WARNING, "Composite Parse match " + e.getMessage(), e);
            return "";
        }
    }
}