wuit.common.crawler.WebSit.CrawlerAPIBaiDu.java Source code

Java tutorial

Introduction

Here is the source code for wuit.common.crawler.WebSit.CrawlerAPIBaiDu.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package wuit.common.crawler.WebSit;

//import org.apache.commons;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.util.URIUtil;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import wuit.common.crawler.composite.DSComposite;
import wuit.common.crawler.db.KeyValue;

/**
 * Minimal client for the Baidu Maps place-search HTTP endpoint, together with a
 * regex helper that pulls individual fields out of the text the endpoint returns.
 *
 * @author lxl
 */
public class CrawlerAPIBaiDu {
    private static final Log log = LogFactory.getLog(CrawlerAPIBaiDu.class);

    /**
     * Performs an HTTP GET and returns the response body as text.
     *
     * @param url
     *            address to request
     * @param queryString
     *            raw query string; when it is null or blank the query of {@code url}
     *            is left untouched
     * @param charset
     *            name of the charset used to decode the response body
     * @param pretty
     *            when {@code true} the platform line separator is appended after every
     *            line; when {@code false} the lines are concatenated without separators
     * @return the decoded body for a 200 response; an empty string for any other status
     *         or when the request fails before a body is read; whatever was read so far
     *         if reading fails part-way through
     */
    public static String doGet(String url, String queryString, String charset, boolean pretty) {
        StringBuilder response = new StringBuilder();
        HttpClient client = new HttpClient();
        HttpMethod method = new GetMethod(url);
        try {
            if (StringUtils.isNotBlank(queryString)) {
                // The query is escaped/encoded by URIUtil before it is attached to the request.
                method.setQueryString(URIUtil.encodeQuery(queryString));
            }
            client.executeMethod(method);
            int status = method.getStatusCode();
            if (status == HttpStatus.SC_OK) {
                appendBody(response, method.getResponseBodyAsStream(), charset, pretty);
            } else {
                // The URL is deliberately not logged: callers in this class embed an API key in it.
                log.warn("HTTP GET returned status " + status + "; response body ignored");
            }
        } catch (URIException e) {
            log.error("HTTP Get?" + queryString + "???", e);
        } catch (IOException ex) {
            log.error("HTTP GET failed", ex);
        } finally {
            method.releaseConnection();
        }
        return response.toString();
    }

    /**
     * Decodes {@code body} line by line and appends it to {@code out}.
     * The reader is always closed, including when reading fails.
     */
    private static void appendBody(StringBuilder out, InputStream body, String charset, boolean pretty)
            throws IOException {
        if (body == null) {
            // No body stream to read; leave the buffer as it is.
            return;
        }
        String lineSeparator = System.getProperty("line.separator");
        BufferedReader reader = new BufferedReader(new InputStreamReader(body, charset));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                out.append(line);
                if (pretty) {
                    out.append(lineSeparator);
                }
            }
        } finally {
            reader.close();
        }
    }

    /*
     * Disabled (commented-out) HTTP POST counterpart of doGet.
     *
     * Parameters as written: url - address to post to; params - name/value pairs,
     * may be null; charset - charset used to decode the response; pretty - whether
     * a line separator is appended after every line. Returns the response text.
     *
     * NOTE(review): PostMethod, HttpMethodParams and Map are not among this file's
     * visible imports, so this would not compile if re-enabled as is.
     *
    public static String doPost(String url, Map<String, String> params,
        String charset, boolean pretty) {
    StringBuffer response = new StringBuffer();
    HttpClient client = new HttpClient();
    HttpMethod method = new PostMethod(url);
    // Copy the supplied parameters onto the POST method
    if (params != null) {
        HttpMethodParams p = new HttpMethodParams();
        for (Map.Entry<String, String> entry : params.entrySet()) {
            p.setParameter(entry.getKey(), entry.getValue());
        }
        method.setParams(p);
    }
    try {
        client.executeMethod(method);
        if (method.getStatusCode() == HttpStatus.SC_OK) {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(method.getResponseBodyAsStream(),
                            charset));
            String line;
            while ((line = reader.readLine()) != null) {
                if (pretty)
                    response.append(line).append(
                            System.getProperty("line.separator"));
                else
                    response.append(line);
            }
            reader.close();
        }
    } catch (IOException e) {
        log.error("HTTP Post" + url + "??", e);
    } finally {
        method.releaseConnection();
    }
    return response.toString();
    }*/

    /**
     * Appends one {@link KeyValue} to {@code list} for every non-empty match of
     * {@code filter} in {@code content}. Each entry carries the matched text and its
     * start and end offsets. The pattern is compiled case-insensitively and in
     * multi-line mode.
     *
     * <p>Failures (for example an invalid pattern or a null {@code content}) are logged
     * and otherwise ignored, so {@code list} may be left partially filled.
     *
     * @param content
     *            text to search
     * @param filter
     *            regular expression to look for
     * @param list
     *            receives the matches; when null the call does nothing, because there
     *            would be no way to hand the matches back to the caller
     */
    public static void matchValues(String content, String filter, List<KeyValue> list) {
        if (list == null) {
            return;
        }
        try {
            Matcher m = Pattern.compile(filter, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE).matcher(content);
            while (m.find()) {
                String text = m.group();
                if (text.isEmpty()) {
                    continue;
                }
                KeyValue value = new KeyValue();
                value.value = text;
                value.start = m.start();
                value.end = m.end();
                list.add(value);
            }
        } catch (RuntimeException e) {
            // Best-effort by design: report the problem (with stack trace) and keep going.
            log.error("Crawler Utitles  matchValues :" + e.getMessage(), e);
        }
    }

    /** Returns every non-empty match of {@code filter} in {@code content} as a new list. */
    private static List<KeyValue> findAll(String content, String filter) {
        List<KeyValue> matches = new ArrayList<KeyValue>();
        matchValues(content, filter, matches);
        return matches;
    }

    /**
     * Manual smoke test: runs one place search, extracts name, address, latitude,
     * longitude, telephone and detail URL with regular expressions, and prints one
     * colon-separated line per place followed by the raw response.
     *
     * <p>Fields are paired with places purely by position in the response text, so a
     * place that lacks a field shifts the values of the places after it.
     *
     * @param args
     *            ignored
     * @throws UnsupportedEncodingException
     *             if UTF-8 is not supported when encoding the request parameters
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        String city = "";
        String key = "1?";

        String results = new CrawlerAPIBaiDu().query(city, key);

        // One record per matched name; every other field is attached to these by index.
        List<DSComposite> places = new ArrayList<DSComposite>();
        for (KeyValue name : findAll(results, "(?<=name\":\")[^\"].+?(?=\")")) {
            DSComposite info = new DSComposite();
            info.name = name.value;
            places.add(info);
        }

        // Each loop is capped at places.size() so that a field matching more often than
        // "name" cannot index past the end of the record list.
        List<KeyValue> addresses = findAll(results, "(?<=address\":\")[^}].+?(?=\")");
        for (int i = 0, n = Math.min(addresses.size(), places.size()); i < n; i++) {
            places.get(i).local.address = addresses.get(i).value;
        }

        List<KeyValue> latitudes = findAll(results, "(?<=\"lat\":)\\d{1,}\\.\\d{1,}");
        for (int i = 0, n = Math.min(latitudes.size(), places.size()); i < n; i++) {
            places.get(i).lat = latitudes.get(i).value;
        }

        List<KeyValue> longitudes = findAll(results, "(?<=\"lng\":)\\d{1,}\\.\\d{1,}");
        for (int i = 0, n = Math.min(longitudes.size(), places.size()); i < n; i++) {
            places.get(i).lng = longitudes.get(i).value;
        }

        List<KeyValue> phones = findAll(results, "(?<=telephone\":\")[^}].+?(?=\",)");
        for (int i = 0, n = Math.min(phones.size(), places.size()); i < n; i++) {
            places.get(i).phone = phones.get(i).value;
        }

        List<KeyValue> detailUrls = findAll(results, "(?<=\"detail_url\":\")[^\"].+?(?=\")");
        for (int i = 0, n = Math.min(detailUrls.size(), places.size()); i < n; i++) {
            places.get(i).collection.url = detailUrls.get(i).value;
        }

        for (DSComposite place : places) {
            System.out.println(place.name + ":" + place.local.address + ":" + place.phone + ":"
                    + place.lat + ":" + place.lng + ":" + place.collection.url);
        }

        System.out.println(results);

        /*
         * Disabled interactive variant (reads city and key from standard input):
         *
        CrawlerAPIBaiDu httpTookit = new CrawlerAPIBaiDu();
        System.out.println("");
        Scanner sc = new Scanner(System.in);
        //       String city = sc.next();
        System.out.println("");
        //       String key = sc.next();
        String result = httpTookit.query(city,key);
        System.out.println(result);
        */

    }

    /**
     * Runs a place search and returns the raw response text.
     *
     * @param city
     *            value sent as the {@code region} request parameter
     * @param key
     *            value sent as the {@code query} request parameter
     * @return the response text as produced by
     *         {@link #doGet(String, String, String, boolean)} with {@code pretty} set;
     *         empty when the request fails
     * @throws UnsupportedEncodingException
     *             if UTF-8 is not supported when encoding the parameters
     */
    public String query(String city, String key) throws UnsupportedEncodingException {
        // NOTE(review): the URL literal starts with a space and embeds the API key; both are
        // reproduced unchanged. Confirm the space is tolerated and consider externalising the key.
        return doGet(" http://api.map.baidu.com/place/search?&query="
                + java.net.URLEncoder.encode(key, "utf-8") + "&region=" + java.net.URLEncoder.encode(city, "utf-8")
                + "&output=json&key=bcb9b248df88de9cb49ff5ceab7c784e", null, "utf-8", true);
    }

}