Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.tiaoin.crawl.plugin.util;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Date;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

import com.tiaoin.crawl.core.fetcher.FetchResult;
import com.tiaoin.crawl.core.fetcher.Page;
import com.tiaoin.crawl.core.fetcher.PageFetcher;
import com.tiaoin.crawl.core.fetcher.Status;
import com.tiaoin.crawl.core.xml.Site;

/**
 * {@link PageFetcher} implementation that downloads web pages with Apache HttpClient.
 *
 * <p>Expected call order: {@link #setConfig(SpiderConfig)}, then {@link #init(Site)},
 * then any number of {@link #fetch(String)} calls. Consecutive fetches made through
 * the same instance are spaced by the configured politeness delay.
 *
 * @author weiwei l.weiwei@163.com
 * @date 2013-1-7 11:04:50
 */
public class PageFetcherImpl implements PageFetcher {

    private ThreadSafeClientConnManager connectionManager;

    private DefaultHttpClient httpClient;

    /** Guards {@link #lastFetchTime} and serializes the politeness wait. */
    private final Object mutex = new Object();

    /** Wall-clock time (ms) at which the most recent fetch was allowed to start. */
    private long lastFetchTime = 0;

    private SpiderConfig config;

    /** Extra request headers sent with every GET issued by {@link #fetch(String)}. */
    private Map<String, String> headers = new Hashtable<String, String>();

    public PageFetcherImpl() {
    }

    /**
     * Entity wrapper that transparently decompresses a GZIP-encoded response body.
     *
     * @author weiwei l.weiwei@163.com
     * @date 2013-1-7 11:26:24
     */
    private static class GzipDecompressingEntity extends HttpEntityWrapper {

        public GzipDecompressingEntity(final HttpEntity entity) {
            super(entity);
        }

        @Override
        public InputStream getContent() throws IOException, IllegalStateException {
            InputStream wrappedin = wrappedEntity.getContent();
            return new GZIPInputStream(wrappedin);
        }

        @Override
        public long getContentLength() {
            // The decompressed length is not known in advance.
            return -1;
        }
    }

    /**
     * Sets the crawler configuration read by {@link #init(Site)}, {@link #fetch(String)}
     * and page loading.
     *
     * @param config the configuration to use
     */
    public void setConfig(SpiderConfig config) {
        this.config = config;
    }

    /**
     * Adds a cookie to the HTTP client's cookie store.
     *
     * @param key  cookie name
     * @param val  cookie value
     * @param host cookie domain
     * @param path cookie path
     * @throws IllegalStateException if the HTTP client has not been created by
     *         {@link #init(Site)} yet (there is no cookie store to add to)
     */
    public void addCookie(String key, String val, String host, String path) {
        if (httpClient == null) {
            throw new IllegalStateException("init(Site) must be called before addCookie");
        }
        Cookie c = new Cookie(key, val, host, path);
        BasicClientCookie clientCookie = new BasicClientCookie(c.name(), c.value());
        clientCookie.setPath(c.path());
        clientCookie.setDomain(c.domain());
        httpClient.getCookieStore().addCookie(clientCookie);
    }

    /**
     * Registers a request header that will be set on every subsequent fetch.
     *
     * @param key header name
     * @param val header value
     */
    public void addHeader(String key, String val) {
        this.headers.put(key, val);
    }

    /**
     * Builds the HTTP client from the current configuration and applies the headers
     * and cookies declared by the given site.
     *
     * @param site site definition supplying optional headers and cookies
     * @throws IllegalStateException if {@link #setConfig(SpiderConfig)} has not been called
     */
    public void init(Site site) {
        if (config == null) {
            throw new IllegalStateException("setConfig(SpiderConfig) must be called before init(Site)");
        }

        if (null != site.getHeaders() && site.getHeaders().getHeader() != null) {
            for (com.tiaoin.crawl.core.xml.Header header : site.getHeaders().getHeader()) {
                this.addHeader(header.getName(), header.getValue());
            }
        }

        // Protocol and connection parameters.
        HttpParams params = new BasicHttpParams();
        params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
        params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
        params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());

        HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
        paramsBean.setVersion(HttpVersion.HTTP_1_1);
        paramsBean.setContentCharset("UTF-8");
        paramsBean.setUseExpectContinue(false);

        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
        if (config.isIncludeHttpsPages()) {
            schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
        }

        connectionManager = new ThreadSafeClientConnManager(schemeRegistry);
        connectionManager.setMaxTotal(config.getMaxTotalConnections());
        connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());

        httpClient = new DefaultHttpClient(connectionManager, params);
        // NOTE(review): this sets the same parameter as CoreConnectionPNames.SO_TIMEOUT above,
        // so the configured socket timeout appears to be replaced by 15000 ms. Kept as in the
        // original; confirm whether the override is intentional.
        httpClient.getParams().setIntParameter("http.socket.timeout", 15000);
        httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);

        // Transparently decompress responses whose Content-Encoding lists gzip.
        httpClient.addResponseInterceptor(new HttpResponseInterceptor() {
            public void process(final HttpResponse response, final HttpContext context)
                    throws HttpException, IOException {
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    // Responses without a body have nothing to decode.
                    return;
                }
                Header contentEncoding = entity.getContentEncoding();
                if (contentEncoding == null) {
                    return;
                }
                for (HeaderElement codec : contentEncoding.getElements()) {
                    if ("gzip".equalsIgnoreCase(codec.getName())) {
                        response.setEntity(new GzipDecompressingEntity(entity));
                        return;
                    }
                }
            }
        });

        // Cookies live in the client's cookie store, so they can only be added once the
        // client exists.
        if (null != site.getCookies() && site.getCookies().getCookie() != null) {
            for (com.tiaoin.crawl.core.xml.Cookie cookie : site.getCookies().getCookie()) {
                this.addCookie(cookie.getName(), cookie.getValue(), cookie.getHost(), cookie.getPath());
            }
        }
    }

    /**
     * Fetches the given URL with an HTTP GET.
     *
     * <p>On a 200 response with a body the result carries the loaded {@link Page}. On any
     * other status the result carries that status code and, for 301/302 responses with a
     * {@code Location} header, the canonicalized redirect target. Exceptions raised while
     * fetching are not propagated; they are reported as
     * {@code Status.INTERNAL_SERVER_ERROR.ordinal()} in the result.
     *
     * @date 2013-1-7 11:08:54
     * @param toFetchURL the URL to download
     * @return the fetch outcome, never {@code null}
     */
    public FetchResult fetch(String toFetchURL) throws Exception {
        FetchResult fetchResult = new FetchResult();
        HttpGet get = null;
        HttpEntity entity = null;
        boolean loaded = false;
        try {
            get = new HttpGet(toFetchURL);
            // Ask for a gzip-compressed body; the response interceptor decompresses it.
            get.addHeader("Accept-Encoding", "gzip");
            for (Entry<String, String> header : headers.entrySet()) {
                get.setHeader(header.getKey(), header.getValue());
            }

            awaitPolitenessDelay();

            HttpResponse response = httpClient.execute(get);

            fetchResult.setFetchedUrl(toFetchURL);
            String uri = get.getURI().toString();
            if (!uri.equals(toFetchURL)
                    && !URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
                fetchResult.setFetchedUrl(uri);
            }

            entity = response.getEntity();

            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                if (statusCode != HttpStatus.SC_NOT_FOUND) {
                    Header locationHeader = response.getFirstHeader("Location");
                    // For 301/302 record where the resource moved to.
                    if (locationHeader != null
                            && (statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                                    || statusCode == HttpStatus.SC_MOVED_TEMPORARILY)) {
                        fetchResult.setMovedToUrl(
                            URLCanonicalizer.getCanonicalURL(locationHeader.getValue(), toFetchURL));
                    }
                }
                fetchResult.setStatusCode(statusCode);
                return fetchResult;
            }

            if (entity != null) {
                fetchResult.setStatusCode(HttpStatus.SC_OK);
                Page page = load(entity);
                loaded = true;
                page.setUrl(fetchResult.getFetchedUrl());
                fetchResult.setPage(page);
                return fetchResult;
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller's thread management.
            Thread.currentThread().interrupt();
            fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal());
            return fetchResult;
        } catch (Exception e) {
            fetchResult.setStatusCode(Status.INTERNAL_SERVER_ERROR.ordinal());
            return fetchResult;
        } finally {
            releaseConnection(get, entity, loaded);
        }

        fetchResult.setStatusCode(Status.UNSPECIFIED_ERROR.ordinal());
        return fetchResult;
    }

    /**
     * Blocks until at least the configured politeness delay has elapsed since the previous
     * fetch started, then records the current time as the start of this fetch.
     */
    private void awaitPolitenessDelay() throws InterruptedException {
        synchronized (mutex) {
            long delay = config.getPolitenessDelay();
            long elapsed = System.currentTimeMillis() - lastFetchTime;
            if (elapsed < delay) {
                Thread.sleep(delay - elapsed);
            }
            lastFetchTime = System.currentTimeMillis();
        }
    }

    /**
     * Makes sure the connection used by a request goes back to the connection manager.
     * A successfully loaded page has already closed its content stream; otherwise the
     * body is drained, and if that is not possible the request is aborted.
     */
    private static void releaseConnection(HttpGet get, HttpEntity entity, boolean loaded) {
        if (get == null || loaded) {
            return;
        }
        if (entity == null) {
            get.abort();
            return;
        }
        try {
            EntityUtils.consume(entity);
        } catch (Exception e) {
            // The body could not be drained (e.g. already partially read or corrupt gzip).
            get.abort();
        }
    }

    /**
     * Builds a {@link Page} from a response entity: content type, content encoding,
     * declared charset, and the body decoded with the configured charset.
     *
     * @date 2013-1-7 11:22:06
     * @param entity the response entity to read
     * @return the populated page
     * @throws Exception if the content stream cannot be obtained or the configured
     *         charset is not supported
     */
    private Page load(HttpEntity entity) throws Exception {
        Page page = new Page();

        Header type = entity.getContentType();
        page.setContentType(type != null ? type.getValue() : null);

        Header encoding = entity.getContentEncoding();
        page.setEncoding(encoding != null ? encoding.getValue() : null);

        page.setCharset(EntityUtils.getContentCharSet(entity));

        // The body is decoded with the configured charset (platform default when blank),
        // not with the charset declared by the response.
        String charset = config.getCharset();
        String content = this.read(entity.getContent(), charset);
        page.setContent(content);
        if (isBlank(charset)) {
            page.setContentData(content.getBytes());
        } else {
            page.setContentData(content.getBytes(charset));
        }
        return page;
    }

    /**
     * Reads the stream as text and closes it.
     *
     * <p>Lines are concatenated without their line terminators. Reading is best effort:
     * if an {@link IOException} occurs, whatever was read up to that point is returned.
     *
     * @date 2013-1-7 11:25:04
     * @param inputStream the stream to read; closed before this method returns
     * @param charset charset name, or {@code null}/blank for the platform default
     * @return the text read from the stream
     */
    private String read(final InputStream inputStream, String charset) {
        StringBuilder sb = new StringBuilder();
        BufferedReader reader = null;
        try {
            if (isBlank(charset)) {
                reader = new BufferedReader(new InputStreamReader(inputStream));
            } else {
                reader = new BufferedReader(new InputStreamReader(inputStream, charset));
            }
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException ignored) {
            // Best effort: keep the content read so far.
        } finally {
            // Closing the stream is what hands the connection back to the pool.
            closeQuietly(reader != null ? reader : inputStream);
        }
        return sb.toString();
    }

    /**
     * Reads all remaining bytes of the stream. The stream is not closed.
     *
     * @date 2013-1-7 11:25:38
     * @param inputStream the stream to read
     * @return the bytes read
     * @throws Exception if reading fails; the cause is the underlying {@link IOException}
     */
    private byte[] read(final InputStream inputStream) throws Exception {
        byte[] bytes = new byte[1000];
        int size = 0;
        try {
            int n;
            while ((n = inputStream.read(bytes, size, bytes.length - size)) != -1) {
                size += n;
                if (size == bytes.length) {
                    bytes = Arrays.copyOf(bytes, (bytes.length * 3) / 2 + 1);
                }
            }
        } catch (IOException e) {
            throw new Exception("There was a problem reading stream.", e);
        }
        return Arrays.copyOf(bytes, size);
    }

    /** Returns {@code true} when the string is {@code null}, empty or only whitespace. */
    private static boolean isBlank(String s) {
        return s == null || s.trim().length() == 0;
    }

    /** Closes the resource, ignoring a failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if close fails.
        }
    }

    /**
     * Returns the underlying HTTP client, or {@code null} before {@link #init(Site)}.
     */
    public HttpClient getHttpClient() {
        return httpClient;
    }

    /*
     * Proxy support (currently disabled):
     *
     * if (config.getProxyHost() != null) {
     *     if (config.getProxyUsername() != null) {
     *         httpClient.getCredentialsProvider().setCredentials(
     *             new AuthScope(config.getProxyHost(), config.getProxyPort()),
     *             new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
     *     }
     *     HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
     *     httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
     * }
     */
}