Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.eweb4j.spiderman.plugin.util; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.Hashtable; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.zip.GZIPInputStream; import org.apache.http.Header; import org.apache.http.HeaderElement; import org.apache.http.HttpEntity; import org.apache.http.HttpException; import org.apache.http.HttpResponse; import org.apache.http.HttpResponseInterceptor; import org.apache.http.HttpStatus; import org.apache.http.HttpVersion; import org.apache.http.NameValuePair; import org.apache.http.client.HttpClient; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpDelete; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpHead; import org.apache.http.client.methods.HttpOptions; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpPut; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.CookiePolicy; import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.entity.HttpEntityWrapper; import org.apache.http.entity.mime.HttpMultipartMode; import org.apache.http.entity.mime.MultipartEntity; import org.apache.http.entity.mime.content.FileBody; import org.apache.http.entity.mime.content.StringBody; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager; import org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.message.BasicNameValuePair; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.params.HttpParams; import org.apache.http.params.HttpProtocolParamBean; import org.apache.http.protocol.HTTP; import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import org.eweb4j.mvc.Http; import org.eweb4j.spiderman.fetcher.FetchRequest; import org.eweb4j.spiderman.fetcher.FetchResult; import org.eweb4j.spiderman.fetcher.Page; import org.eweb4j.spiderman.fetcher.PageFetcher; import org.eweb4j.spiderman.fetcher.Status; import org.eweb4j.spiderman.xml.Site; import org.eweb4j.util.CommonUtil; /** * Web ?? * @author weiwei l.weiwei@163.com * @date 2013-1-7 ?11:04:50 */ public class PageFetcherImpl implements PageFetcher { private ThreadSafeClientConnManager connectionManager; private DefaultHttpClient httpClient; private final Object mutex = new Object(); private long lastFetchTime = 0; private SpiderConfig config; private Map<String, String> headers = new Hashtable<String, String>(); private Map<String, List<String>> cookies = new Hashtable<String, List<String>>(); private Site site; public PageFetcherImpl() { } /** * ?GZIP * @author weiwei l.weiwei@163.com * @date 2013-1-7 ?11:26:24 */ private static class GzipDecompressingEntity extends HttpEntityWrapper { public GzipDecompressingEntity(final HttpEntity entity) { super(entity); } public InputStream getContent() throws IOException, IllegalStateException { InputStream wrappedin = wrappedEntity.getContent(); return new GZIPInputStream(wrappedin); } public long getContentLength() { return -1; } } public void setConfig(SpiderConfig config) { this.config = config; } public void addCookie(String key, String val, String host, String path) { Cookie c = new Cookie(key, val, host, path); //Cookie String name = c.name(); String value = c.value(); List<String> vals = this.cookies.get(name); if (vals == null) vals = new ArrayList<String>(); vals.add(value); this.cookies.put(key, vals); BasicClientCookie clientCookie = new BasicClientCookie(name, value); clientCookie.setPath(c.path()); clientCookie.setDomain(c.domain()); httpClient.getCookieStore().addCookie(clientCookie); } public void addHeader(String key, String val) { if (this.headers.containsKey(key)) this.headers.put(key, this.headers.get(key) + "; " + val); else this.headers.put(key, val); } /** * client?Header?Cookie * @param aconfig * @param cookies */ public void init(Site _site) { //HTTP? HttpParams params = new BasicHttpParams(); params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString()); params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout()); params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout()); HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params); paramsBean.setVersion(HttpVersion.HTTP_1_1); paramsBean.setContentCharset("UTF-8"); paramsBean.setUseExpectContinue(false); SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); if (config.isIncludeHttpsPages()) schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory())); connectionManager = new ThreadSafeClientConnManager(schemeRegistry); connectionManager.setMaxTotal(config.getMaxTotalConnections()); connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost()); httpClient = new DefaultHttpClient(connectionManager, params); httpClient.getParams().setIntParameter("http.socket.timeout", 60000); httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH); httpClient.getParams().setParameter(ClientPNames.HANDLE_REDIRECTS, config.isFollowRedirects()); // HttpClientParams.setCookiePolicy(httpClient.getParams(),CookiePolicy.BEST_MATCH); //? httpClient.addResponseInterceptor(new HttpResponseInterceptor() { public void process(final HttpResponse response, final HttpContext context) throws HttpException, IOException { HttpEntity entity = response.getEntity(); Header contentEncoding = entity.getContentEncoding(); if (contentEncoding != null) { HeaderElement[] codecs = contentEncoding.getElements(); for (HeaderElement codec : codecs) { //?GZIP if (codec.getName().equalsIgnoreCase("gzip")) { response.setEntity(new GzipDecompressingEntity(response.getEntity())); return; } } } } }); if (_site != null) { this.site = _site; if (this.site.getHeaders() != null && this.site.getHeaders().getHeader() != null) { for (org.eweb4j.spiderman.xml.Header header : this.site.getHeaders().getHeader()) { this.addHeader(header.getName(), header.getValue()); } } if (this.site.getCookies() != null && this.site.getCookies().getCookie() != null) { for (org.eweb4j.spiderman.xml.Cookie cookie : this.site.getCookies().getCookie()) { this.addCookie(cookie.getName(), cookie.getValue(), cookie.getHost(), cookie.getPath()); } } } } /** * ?url * @date 2013-1-7 ?11:08:54 * @param toFetchURL * @return */ public FetchResult fetch(FetchRequest req) throws Exception { if (req.getHttpMethod() != null && !Http.Method.GET.equals(req.getHttpMethod())) { //?URL??QueryParam String query = new URL(req.getUrl()).getQuery(); for (String q : query.split("\\&")) { String[] qv = q.split("="); String name = qv[0]; String val = qv[1]; List<Object> vals = req.getParams().get(name); if (vals == null) { vals = new ArrayList<Object>(); req.getParams().put(name, vals); } vals.add(val); } return request(req); } FetchResult fetchResult = new FetchResult(); HttpGet get = null; HttpEntity entity = null; String toFetchURL = req.getUrl(); try { get = new HttpGet(toFetchURL); //GZIP???GZIP? get.addHeader("Accept-Encoding", "gzip"); for (Iterator<Entry<String, String>> it = headers.entrySet().iterator(); it.hasNext();) { Entry<String, String> entry = it.next(); get.addHeader(entry.getKey(), entry.getValue()); } //???,?? // TODO ?delay? synchronized (mutex) { //?? long now = (new Date()).getTime(); //?Host?? if (now - lastFetchTime < config.getPolitenessDelay()) Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime)); //????HOST??URL lastFetchTime = (new Date()).getTime(); } //get? Header[] headers = get.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = req.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } req.getCookies().putAll(this.cookies); fetchResult.setReq(req); //get?? HttpResponse response = httpClient.execute(get); headers = response.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = fetchResult.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } //URL fetchResult.setFetchedUrl(toFetchURL); String uri = get.getURI().toString(); if (!uri.equals(toFetchURL)) if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) fetchResult.setFetchedUrl(uri); entity = response.getEntity(); //??? int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { if (statusCode != HttpStatus.SC_NOT_FOUND) { Header locationHeader = response.getFirstHeader("Location"); //301?302?URL?? if (locationHeader != null && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY)) fetchResult.setMovedToUrl( URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL)); } //???OKURLstatusCode?? //??? if (this.site.getSkipStatusCode() != null && this.site.getSkipStatusCode().trim().length() > 0) { String[] scs = this.site.getSkipStatusCode().split(","); for (String code : scs) { int c = CommonUtil.toInt(code); //????entity if (statusCode == c) { assemPage(fetchResult, entity); break; } } } fetchResult.setStatusCode(statusCode); return fetchResult; } //?? if (entity != null) { fetchResult.setStatusCode(statusCode); assemPage(fetchResult, entity); return fetchResult; } } catch (Throwable e) { fetchResult.setFetchedUrl(e.toString()); fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal()); return fetchResult; } finally { try { if (entity == null && get != null) get.abort(); } catch (Exception e) { throw e; } } fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal()); return fetchResult; } /** * * @date 2013-1-7 ?11:08:54 * @param toFetchURL * @return */ public FetchResult request(FetchRequest req) throws Exception { FetchResult fetchResult = new FetchResult(); HttpUriRequest request = null; HttpEntity entity = null; String toFetchURL = req.getUrl(); boolean isPost = false; try { if (Http.Method.GET.equalsIgnoreCase(req.getHttpMethod())) request = new HttpGet(toFetchURL); else if (Http.Method.POST.equalsIgnoreCase(req.getHttpMethod())) { request = new HttpPost(toFetchURL); isPost = true; } else if (Http.Method.PUT.equalsIgnoreCase(req.getHttpMethod())) request = new HttpPut(toFetchURL); else if (Http.Method.HEAD.equalsIgnoreCase(req.getHttpMethod())) request = new HttpHead(toFetchURL); else if (Http.Method.OPTIONS.equalsIgnoreCase(req.getHttpMethod())) request = new HttpOptions(toFetchURL); else if (Http.Method.DELETE.equalsIgnoreCase(req.getHttpMethod())) request = new HttpDelete(toFetchURL); else throw new Exception("Unknown http method name"); //???,?? // TODO ?delay? synchronized (mutex) { //?? long now = (new Date()).getTime(); //?Host?? if (now - lastFetchTime < config.getPolitenessDelay()) Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime)); //????HOST??URL lastFetchTime = (new Date()).getTime(); } //GZIP???GZIP? request.addHeader("Accept-Encoding", "gzip"); for (Iterator<Entry<String, String>> it = headers.entrySet().iterator(); it.hasNext();) { Entry<String, String> entry = it.next(); request.addHeader(entry.getKey(), entry.getValue()); } //? Header[] headers = request.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = req.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } req.getCookies().putAll(this.cookies); fetchResult.setReq(req); HttpEntity reqEntity = null; if (Http.Method.POST.equalsIgnoreCase(req.getHttpMethod()) || Http.Method.PUT.equalsIgnoreCase(req.getHttpMethod())) { if (!req.getFiles().isEmpty()) { reqEntity = new MultipartEntity(HttpMultipartMode.BROWSER_COMPATIBLE); for (Iterator<Entry<String, List<File>>> it = req.getFiles().entrySet().iterator(); it .hasNext();) { Entry<String, List<File>> e = it.next(); String paramName = e.getKey(); for (File file : e.getValue()) { // For File parameters ((MultipartEntity) reqEntity).addPart(paramName, new FileBody(file)); } } for (Iterator<Entry<String, List<Object>>> it = req.getParams().entrySet().iterator(); it .hasNext();) { Entry<String, List<Object>> e = it.next(); String paramName = e.getKey(); for (Object paramValue : e.getValue()) { // For usual String parameters ((MultipartEntity) reqEntity).addPart(paramName, new StringBody( String.valueOf(paramValue), "text/plain", Charset.forName("UTF-8"))); } } } else { List<NameValuePair> params = new ArrayList<NameValuePair>(req.getParams().size()); for (Iterator<Entry<String, List<Object>>> it = req.getParams().entrySet().iterator(); it .hasNext();) { Entry<String, List<Object>> e = it.next(); String paramName = e.getKey(); for (Object paramValue : e.getValue()) { params.add(new BasicNameValuePair(paramName, String.valueOf(paramValue))); } } reqEntity = new UrlEncodedFormEntity(params, HTTP.UTF_8); } if (isPost) ((HttpPost) request).setEntity(reqEntity); else ((HttpPut) request).setEntity(reqEntity); } //?? HttpResponse response = httpClient.execute(request); headers = response.getAllHeaders(); for (Header h : headers) { Map<String, List<String>> hs = fetchResult.getHeaders(); String key = h.getName(); List<String> val = hs.get(key); if (val == null) val = new ArrayList<String>(); val.add(h.getValue()); hs.put(key, val); } //URL fetchResult.setFetchedUrl(toFetchURL); String uri = request.getURI().toString(); if (!uri.equals(toFetchURL)) if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) fetchResult.setFetchedUrl(uri); entity = response.getEntity(); //??? int statusCode = response.getStatusLine().getStatusCode(); if (statusCode != HttpStatus.SC_OK) { if (statusCode != HttpStatus.SC_NOT_FOUND) { Header locationHeader = response.getFirstHeader("Location"); //301?302?URL?? if (locationHeader != null && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY)) fetchResult.setMovedToUrl( URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL)); } //???OKURLstatusCode?? //??? if (this.site.getSkipStatusCode() != null && this.site.getSkipStatusCode().trim().length() > 0) { String[] scs = this.site.getSkipStatusCode().split(","); for (String code : scs) { int c = CommonUtil.toInt(code); //????entity if (statusCode == c) { assemPage(fetchResult, entity); break; } } } fetchResult.setStatusCode(statusCode); return fetchResult; } //?? if (entity != null) { fetchResult.setStatusCode(statusCode); assemPage(fetchResult, entity); return fetchResult; } } catch (Throwable e) { fetchResult.setFetchedUrl(e.toString()); fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal()); return fetchResult; } finally { try { if (entity == null && request != null) request.abort(); } catch (Exception e) { throw e; } } fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal()); return fetchResult; } private void assemPage(FetchResult fetchResult, HttpEntity entity) throws Exception { Page page = load(entity); page.setUrl(fetchResult.getFetchedUrl()); fetchResult.setPage(page); } /** * EntityPage * @date 2013-1-7 ?11:22:06 * @param entity * @return * @throws Exception */ private Page load(HttpEntity entity) throws Exception { Page page = new Page(); //ContentType String contentType = null; Header type = entity.getContentType(); if (type != null) contentType = type.getValue(); page.setContentType(contentType); //? String contentEncoding = null; Header encoding = entity.getContentEncoding(); if (encoding != null) contentEncoding = encoding.getValue(); page.setEncoding(contentEncoding); // String contentCharset = EntityUtils.getContentCharSet(entity); page.setCharset(contentCharset); //???? String charset = config.getCharset(); String content = this.read(entity.getContent(), charset); page.setContent(content); // if (charset == null || charset.trim().length() == 0) // page.setContentData(content.getBytes()); // else // page.setContentData(content.getBytes(charset)); return page; } /** * ????String * @date 2013-1-7 ?11:25:04 * @param inputStream * @param charset * @return */ private String read(final InputStream inputStream, String charset) { StringBuilder sb = new StringBuilder(); BufferedReader reader = null; try { if (charset == null || charset.trim().length() == 0) reader = new BufferedReader(new InputStreamReader(inputStream)); else reader = new BufferedReader(new InputStreamReader(inputStream, charset)); String line = null; while ((line = reader.readLine()) != null) { sb.append(line); } } catch (IOException e) { } return sb.toString(); } /** * ??? * @date 2013-1-7 ?11:25:38 * @param inputStream * @return * @throws Exception */ private byte[] read(final InputStream inputStream) throws Exception { byte[] bytes = new byte[1000]; int i = 0; int b; try { while ((b = inputStream.read()) != -1) { bytes[i++] = (byte) b; if (bytes.length == i) { byte[] newBytes = new byte[(bytes.length * 3) / 2 + 1]; for (int j = 0; j < bytes.length; j++) { newBytes[j] = bytes[j]; } bytes = newBytes; } } } catch (IOException e) { throw new Exception("There was a problem reading stream.", e); } byte[] copy = Arrays.copyOf(bytes, i); return copy; } public HttpClient getHttpClient() { return httpClient; } /** * Proxy * if (config.getProxyHost() != null) { if (config.getProxyUsername() != null) { httpClient.getCredentialsProvider().setCredentials( new AuthScope(config.getProxyHost(), config.getProxyPort()), new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword())); } HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort()); httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy); } */ }