// Java tutorial
/******************************************************
 * Web crawler
 *
 *
 * Copyright (C) 2012 by Peter Hedenskog (http://peterhedenskog.com)
 *
 ******************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 *******************************************************
 */
package com.soulgalore.crawler.guice;

import java.util.Set;
import java.util.StringTokenizer;

import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.HttpClient;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.protocol.RequestAcceptEncoding;
import org.apache.http.client.protocol.ResponseContentEncoding;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.cookie.CookieSpecFactory;
import org.apache.http.cookie.MalformedCookieException;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.cookie.CookieOrigin;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.impl.cookie.BestMatchSpec;
import org.apache.http.impl.cookie.BrowserCompatSpec;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;

import com.google.inject.Inject;
import com.google.inject.Provider;
import com.google.inject.name.Named;
import com.soulgalore.crawler.core.CrawlerConfiguration;
import com.soulgalore.crawler.util.Auth;
import com.soulgalore.crawler.util.AuthUtil;
import com.soulgalore.crawler.util.HTTPSFaker;
import com.soulgalore.crawler.util.HeaderUtil;

/**
 * Provide a HTTPClient.
 *
 * <p>
 * Every call to {@link #get()} builds a new client with its own connection manager, configured
 * with the timeouts, proxy and credentials this provider was created with.
 */
public class HttpClientProvider implements Provider<HttpClient> {

  /**
   * The name the URL-logging cookie spec is registered under, and selected as cookie policy.
   */
  private static final String COOKIE_SPEC_NAME = "bestmatchwithurl";

  /**
   * The separator between the protocol, host and port parts of the proxy configuration.
   */
  private static final String PROXY_SEPARATOR = ":";

  /**
   * The number of parts (protocol, host, port) a valid proxy configuration consists of.
   */
  private static final int PROXY_PARTS = 3;

  /**
   * The number of threads used in the HTTP Client Manager, meaning we can have this number of HTTP
   * connections open at the same time.
   */
  private final int nrOfThreads;

  /**
   * The number of connections that can be open to the same route. Setting this to the same number
   * as the number of HTTP threads, will ensure that we will use all the thread, even if we only are
   * using one route.
   */
  private final int maxToRoute;

  /**
   * The number in ms before a socket timeout.
   */
  private final int socketTimeout;

  /**
   * The number in ms before a connection timeout.
   */
  private final int connectionTimeout;

  /**
   * The credentials registered on every client, as parsed from the auth string by
   * {@link AuthUtil}.
   */
  private final Set<Auth> auths;

  /**
   * The proxy configuration as three colon separated parts, in the order protocol, host and port.
   * An empty (or missing) value means that no proxy is used.
   */
  private final String proxy;

  /**
   * Create a provider.
   *
   * @param maxNrOfThreads the max number of threads in the client, also used as the max number of
   *        connections per route
   * @param theSocketTimeout the socket timeout time
   * @param theConnectionTimeout the connection timeout time
   * @param authAsString the auth string
   * @param theProxy the proxy
   */
  @Inject
  public HttpClientProvider(
      @Named(CrawlerConfiguration.MAX_THREADS_PROPERTY_NAME) int maxNrOfThreads,
      @Named(CrawlerConfiguration.SOCKET_TIMEOUT_PROPERTY_NAME) int theSocketTimeout,
      @Named(CrawlerConfiguration.CONNECTION_TIMEOUT_PROPERTY_NAME) int theConnectionTimeout,
      @Named(CrawlerConfiguration.AUTH_PROPERTY_NAME) String authAsString,
      @Named(CrawlerConfiguration.PROXY_PROPERTY_NAME) String theProxy) {
    nrOfThreads = maxNrOfThreads;
    maxToRoute = maxNrOfThreads;
    connectionTimeout = theConnectionTimeout;
    socketTimeout = theSocketTimeout;
    auths = AuthUtil.getInstance().createAuthsFromString(authAsString);
    proxy = theProxy;
  }

  /**
   * Get the client.
   *
   * @return the client
   */
  @Override
  public HttpClient get() {
    final ThreadSafeClientConnManager cm = new ThreadSafeClientConnManager();
    cm.setMaxTotal(nrOfThreads);
    cm.setDefaultMaxPerRoute(maxToRoute);

    // NOTE(review): judging by the helper's name the client accepts any HTTPS certificate -
    // confirm against HTTPSFaker.
    final DefaultHttpClient client = HTTPSFaker.getClientThatAllowAnyHTTPS(cm);

    client.getParams().setParameter("http.socket.timeout", socketTimeout);
    client.getParams().setParameter("http.connection.timeout", connectionTimeout);

    client.addRequestInterceptor(new RequestAcceptEncoding());
    client.addResponseInterceptor(new ResponseContentEncoding());

    registerCookieSpec(client);
    configureProxy(client);
    registerCredentials(client);

    return client;
  }

  /**
   * Register the cookie spec that logs rejected cookies and select it as the cookie policy.
   *
   * @param client the client to configure
   */
  private static void registerCookieSpec(DefaultHttpClient client) {
    // Created in a static context, so the factory keeps no reference to the provider.
    final CookieSpecFactory factory = new CookieSpecFactory() {
      public CookieSpec newInstance(HttpParams params) {
        return new BestMatchSpecWithURLErrorLog();
      }
    };
    client.getCookieSpecs().register(COOKIE_SPEC_NAME, factory);
    client.getParams().setParameter(ClientPNames.COOKIE_POLICY, COOKIE_SPEC_NAME);
  }

  /**
   * Set the default proxy of the client if one is configured. A configuration that does not
   * consist of exactly three colon separated parts, or whose port is not a number, is reported
   * on standard error and otherwise ignored.
   *
   * @param client the client to configure
   */
  private void configureProxy(DefaultHttpClient client) {
    // A null proxy is treated like an empty one, instead of failing inside the tokenizer.
    if (proxy == null || "".equals(proxy)) {
      return;
    }

    final StringTokenizer token = new StringTokenizer(proxy, PROXY_SEPARATOR);
    if (token.countTokens() != PROXY_PARTS) {
      reportInvalidProxy();
      return;
    }

    final String proxyProtocol = token.nextToken();
    final String proxyHost = token.nextToken();
    final int proxyPort;
    try {
      proxyPort = Integer.parseInt(token.nextToken());
    } catch (NumberFormatException e) {
      // A port that is not a number is a configuration error, handled like any other.
      reportInvalidProxy();
      return;
    }

    final HttpHost proxyAddress = new HttpHost(proxyHost, proxyPort, proxyProtocol);
    client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxyAddress);
  }

  /**
   * Report on standard error that the configured proxy could not be used.
   */
  private void reportInvalidProxy() {
    System.err.println("Invalid proxy configuration: " + proxy);
  }

  /**
   * Register the user name and password of every configured auth on the client, scoped to the
   * host and port of that auth.
   *
   * @param client the client to configure
   */
  private void registerCredentials(DefaultHttpClient client) {
    for (Auth authObject : auths) {
      client.getCredentialsProvider().setCredentials(
          new AuthScope(authObject.getScope(), authObject.getPort()),
          new UsernamePasswordCredentials(authObject.getUserName(), authObject.getPassword()));
    }
  }

  /**
   * A best match cookie spec that writes the URL, the error and the cookie to standard error
   * when a cookie fails validation. The exception is rethrown, so the cookie is still rejected.
   */
  private static final class BestMatchSpecWithURLErrorLog extends BestMatchSpec {

    @Override
    public void validate(Cookie cookie, CookieOrigin origin) throws MalformedCookieException {
      try {
        super.validate(cookie, origin);
      } catch (MalformedCookieException e) {
        // The port is left out of the logged URL when it is 80.
        System.err.println("Cookie rejected for url: " + origin.getHost()
            + (origin.getPort() != 80 ? ":" + origin.getPort() : "") + origin.getPath()
            + " the error:" + e.getMessage() + " for cookie:" + cookie.toString());
        throw e;
      }
    }
  }
}