Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package wuit.common.crawler.WebSit; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.HttpConnectionParams; import org.apache.http.params.HttpParams; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.ExecutionContext; import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import wuit.common.crawler.composite.DSCrawlerUrl; import wuit.common.crawler.db.KeyValue; /** * * @author lxl */ public class Crawler { public static String doGetHttp(DSCrawlerUrl pageUrl) { HttpParams params = new BasicHttpParams(); HttpConnectionParams.setConnectionTimeout(params, 12000); HttpConnectionParams.setSoTimeout(params, 9000); HttpClient httpclient = new DefaultHttpClient(params); String rs = ""; try { HttpGet httpget = new HttpGet(pageUrl.url); // System.out.println("executing request " + pageUrl.url); HttpContext httpContext = new BasicHttpContext(); // httpget.addHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"); httpget.addHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 1.7; .NET CLR 1.1.4322; CIBA; .NET CLR 2.0.50727)"); HttpResponse response = httpclient.execute(httpget, httpContext); HttpUriRequest realRequest = (HttpUriRequest) httpContext.getAttribute(ExecutionContext.HTTP_REQUEST); HttpHost targetHost = (HttpHost) httpContext.getAttribute(ExecutionContext.HTTP_TARGET_HOST); pageUrl.url = targetHost.toString() + realRequest.getURI(); int resStatu = response.getStatusLine().getStatusCode();//? if (resStatu == 200) { HttpEntity entity = response.getEntity(); if (entity != null) { rs = EntityUtils.toString(entity, "iso-8859-1"); String in_code = getEncoding(rs); String encode = getHtmlEncode(rs); if (encode.isEmpty()) { httpclient.getConnectionManager().shutdown(); return ""; } else { if (!in_code.toLowerCase().equals("utf-8") && !in_code.toLowerCase().equals(encode.toLowerCase())) { if (!in_code.toLowerCase().equals("iso-8859-1")) rs = new String(rs.getBytes("iso-8859-1"), in_code); if (!encode.toLowerCase().equals(in_code.toLowerCase())) rs = new String(rs.getBytes(in_code), encode); } } try { } catch (RuntimeException ex) { httpget.abort(); throw ex; } finally { // Closing the input stream will trigger connection release // try { instream.close(); } catch (Exception ignore) {} } } } } finally { // When HttpClient instance is no longer needed, // shut down the connection manager to ensure // immediate deallocation of all system resources httpclient.getConnectionManager().shutdown(); return rs; } } public static String getEncoding(String str) { String encode = ""; try { encode = "ISO-8859-1"; if (str.equals(new String(str.getBytes(encode), encode))) { String s1 = encode; return s1; } encode = "UTF-8"; if (str.equals(new String(str.getBytes(encode), encode))) { String s2 = encode; return s2; } encode = "GB2312"; if (str.equals(new String(str.getBytes(encode), encode))) { String s = encode; return s; } encode = "GBK"; if (str.equals(new String(str.getBytes(encode), encode))) { String s3 = encode; return s3; } encode = "BIG5"; if (str.equals(new String(str.getBytes(encode), encode))) { String s3 = encode; return s3; } } catch (Exception e) { e.printStackTrace(); } return ""; } public static String getHtmlEncode(final String s) { String val = match(s, "(?<=\\<meta http-equiv='content-type' content=\"text/html;charset=)[^>]+?(?=\")"); if (val.isEmpty()) val = match(s, "(?<=\\<meta http-equiv=\"Content-Type\" content=\"text/html; charset=)[^>]+?(?=\")"); if (val.isEmpty()) val = match(s, "(?<=\\<meta http-equiv=\"Content-Type\" content=\"text/html;charset=)[^>]+?(?=\")"); if (val.isEmpty()) val = match(s, "(?<=\\<meta charset=\")[^>]+?(?=\")"); return val; } public static String match(String content, String filter) { String val = ""; try { Matcher m = Pattern.compile(filter, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE).matcher(content); while (m.find()) { val = m.group(); break; } } catch (Exception e) { System.out.println("Composite Parse match " + e.getMessage()); } return val; } public static void matchValues(String content, String filter, List<KeyValue> list) { if (list == null) list = new ArrayList<KeyValue>(); try { Matcher m = Pattern.compile(filter, Pattern.CASE_INSENSITIVE | Pattern.MULTILINE).matcher(content); while (m.find()) { if (m.group().isEmpty()) continue; KeyValue value = new KeyValue(); value.value = m.group(); value.start = m.start(); value.end = m.end(); list.add(value); } } catch (Exception e) { System.out.println("Crawler Utitles matchValues :" + e.getMessage()); } } }