// Java tutorial
/* Copyright 2010-2013 Norconex Inc. * * This file is part of Norconex HTTP Collector. * * Norconex HTTP Collector is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Norconex HTTP Collector is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Norconex HTTP Collector. If not, * see <http://www.gnu.org/licenses/>. */ package com.norconex.collector.http.client.impl; import java.io.IOException; import java.io.Reader; import java.io.Writer; import java.util.ArrayList; import java.util.List; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamWriter; import org.apache.commons.configuration.XMLConfiguration; import org.apache.commons.lang.StringUtils; import org.apache.http.HttpHost; import org.apache.http.NameValuePair; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.params.ClientPNames; import org.apache.http.client.params.CookiePolicy; import org.apache.http.conn.params.ConnRoutePNames; import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.message.BasicNameValuePair; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.CoreProtocolPNames; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import 
com.norconex.collector.http.HttpCollectorException; import com.norconex.collector.http.client.IHttpClientInitializer; import com.norconex.commons.lang.config.ConfigurationLoader; import com.norconex.commons.lang.config.IXMLConfigurable; /** * Default implementation of {@link IHttpClientInitializer}. * <p> * XML configuration usage: * </p> * <pre> * <httpClientInitializer class="com.norconex.collector.http.client.impl.DefaultHttpClientInitializer"> * <cookiesDisabled>[false|true]</cookiesDisabled> * <userAgent>...</userAgent> * <proxyHost>...</proxyHost> * <proxyPort>...</proxyPort> * <proxyUsername>...</proxyUsername> * <proxyPassword>...</proxyPassword> * <proxyRealm>...</proxyRealm> * * <authMethod>[form|basic|digest]</authMethod> * * <!-- These apply to any authentication mechanism --> * <authUsername>...</authUsername> * <authPassword>...</authPassword> * * <!-- These apply to FORM authentication --> * <authUsernameField>...</authUsernameField> * <authPasswordField>...</authPasswordField> * <authURL>...</authURL> * * <!-- These apply to both BASIC and DIGEST authentication --> * <authHostname>...</authHostname> * <authPort>...</authPort> * <authRealm>...</authRealm> * </httpClientInitializer> * </pre> * @author Pascal Essiembre */ public class DefaultHttpClientInitializer implements IHttpClientInitializer, IXMLConfigurable { private static final long serialVersionUID = 8489434479618081974L; @SuppressWarnings("unused") private static final Logger LOG = LogManager.getLogger(DefaultHttpClientInitializer.class); //TODO use enum? 
public static final String AUTH_METHOD_FORM = "form"; public static final String AUTH_METHOD_BASIC = "basic"; public static final String AUTH_METHOD_DIGEST = "digest"; private static final int CONNECTION_TIMEOUT = 30000; private static final int FTP_PORT = 80; private String authMethod; private String authURL; private String authUsernameField; private String authUsername; private String authPasswordField; private String authPassword; private String authHostname; private int authPort = -1; private String authRealm; private boolean cookiesDisabled; private String userAgent; private String proxyHost; private int proxyPort; private String proxyUsername; private String proxyPassword; private String proxyRealm; @Override public void initializeHTTPClient(DefaultHttpClient httpClient) { // Time out after 30 seconds. //TODO Make configurable. httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECTION_TIMEOUT); // Add support for FTP websites (FTP served by HTTP server). Scheme ftp = new Scheme("ftp", FTP_PORT, new PlainSocketFactory()); httpClient.getConnectionManager().getSchemeRegistry().register(ftp); //TODO make charset configurable instead since UTF-8 is not right // charset for URL specifications. It is used here to overcome // so invalid redirect errors, where the redirect target URL is not // URL-Encoded and has non-ascii values, and fails // (e.g. like ja.wikipedia.org). // Can consider a custom RedirectStrategy too if need be. 
httpClient.getParams().setParameter(CoreProtocolPNames.HTTP_ELEMENT_CHARSET, "UTF-8"); if (StringUtils.isNotBlank(proxyHost)) { httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, new HttpHost(proxyHost, proxyPort)); if (StringUtils.isNotBlank(proxyUsername)) { httpClient.getCredentialsProvider().setCredentials(new AuthScope(proxyHost, proxyPort), new UsernamePasswordCredentials(proxyUsername, proxyPassword)); } } if (!cookiesDisabled) { httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BROWSER_COMPATIBILITY); } if (AUTH_METHOD_FORM.equalsIgnoreCase(authMethod)) { authenticateUsingForm(httpClient); } else if (AUTH_METHOD_BASIC.equalsIgnoreCase(authMethod)) { setupBasicDigestAuth(httpClient); } else if (AUTH_METHOD_DIGEST.equalsIgnoreCase(authMethod)) { setupBasicDigestAuth(httpClient); } if (userAgent != null) { httpClient.getParams().setParameter(CoreProtocolPNames.USER_AGENT, userAgent); } } @Override public void loadFromXML(Reader in) { XMLConfiguration xml = ConfigurationLoader.loadXML(in); cookiesDisabled = xml.getBoolean("cookiesDisabled", cookiesDisabled); authMethod = xml.getString("authMethod", authMethod); authUsernameField = xml.getString("authUsernameField", authUsernameField); authUsername = xml.getString("authUsername", authUsername); authPasswordField = xml.getString("authPasswordField", authPasswordField); authPassword = xml.getString("authPassword", authPassword); authURL = xml.getString("authURL", authURL); authHostname = xml.getString("authHostname", authHostname); authPort = xml.getInt("authPort", authPort); authRealm = xml.getString("authRealm", authRealm); userAgent = xml.getString("userAgent", userAgent); proxyHost = xml.getString("proxyHost", proxyHost); proxyPort = xml.getInt("proxyPort", proxyPort); proxyUsername = xml.getString("proxyUsername", proxyUsername); proxyPassword = xml.getString("proxyPassword", proxyPassword); proxyRealm = xml.getString("proxyRealm", proxyRealm); } @Override public 
void saveToXML(Writer out) throws IOException { XMLOutputFactory factory = XMLOutputFactory.newInstance(); try { XMLStreamWriter writer = factory.createXMLStreamWriter(out); writer.writeStartElement("httpClientInitializer"); writer.writeAttribute("class", getClass().getCanonicalName()); writeSimpleElement(writer, "cookiesDisabled", Boolean.toString(cookiesDisabled)); writeSimpleElement(writer, "userAgent", userAgent); writeSimpleElement(writer, "authMethod", authMethod); writeSimpleElement(writer, "authUsername", authUsername); writeSimpleElement(writer, "authPassword", authPassword); writeSimpleElement(writer, "authUsernameField", authUsernameField); writeSimpleElement(writer, "authPasswordField", authPasswordField); writeSimpleElement(writer, "authURL", authURL); writeSimpleElement(writer, "authHostname", authHostname); writeSimpleElement(writer, "authPort", Integer.toString(authPort)); writeSimpleElement(writer, "authRealm", authRealm); writeSimpleElement(writer, "proxyHost", proxyHost); writeSimpleElement(writer, "proxyPort", Integer.toString(proxyPort)); writeSimpleElement(writer, "proxyUsername", proxyUsername); writeSimpleElement(writer, "proxyPassword", proxyPassword); writeSimpleElement(writer, "proxyRealm", proxyRealm); writer.writeEndElement(); writer.flush(); writer.close(); } catch (XMLStreamException e) { throw new IOException("Cannot save as XML.", e); } } private void writeSimpleElement(XMLStreamWriter writer, String name, String value) throws XMLStreamException { writer.writeStartElement(name); writer.writeCharacters(value); writer.writeEndElement(); } public String getAuthMethod() { return authMethod; } /** * Sets the authentication method. * * Valid values are "form", "basic" and "digest" (case insensitive). 
* * @param authMethod authentication method */ public void setAuthMethod(String authMethod) { this.authMethod = authMethod; } public String getAuthUsernameField() { return authUsernameField; } /** * Sets the name of the HTML field where the username is set. * * This is used only for "form" authentication. * * @param authUsernameField name of the HTML field */ public void setAuthUsernameField(String authUsernameField) { this.authUsernameField = authUsernameField; } public String getAuthUsername() { return authUsername; } /** * Sets the username. * * Used for all authentication methods. * * @param authUsername username */ public void setAuthUsername(String authUsername) { this.authUsername = authUsername; } public String getAuthPasswordField() { return authPasswordField; } /** * Sets the name of the HTML field where the password is set. * * This is used only for "form" authentication. * * @param authPasswordField name of the HTML field */ public void setAuthPasswordField(String authPasswordField) { this.authPasswordField = authPasswordField; } public String getAuthPassword() { return authPassword; } /** * Sets the password. * * Used for all authentication methods. * * @param authPassword password */ public void setAuthPassword(String authPassword) { this.authPassword = authPassword; } public boolean isCookiesDisabled() { return cookiesDisabled; } public void setCookiesDisabled(boolean cookiesDisabled) { this.cookiesDisabled = cookiesDisabled; } public String getAuthURL() { return authURL; } /** * Sets the URL for "form" authentication. * * The username and password will be POSTed to this URL. * * This is used only for "form" authentication. * * @param authURL "form" authentication URL */ public void setAuthURL(String authURL) { this.authURL = authURL; } public String getAuthHostname() { return authHostname; } /** * Sets the hostname for the current authentication scope. * * Setting this to null (default value) indicates "any hostname" for the * scope. 
* * Used for BASIC and DIGEST authentication. * * @param authHostname hostname for the scope */ public void setAuthHostname(String authHostname) { this.authHostname = authHostname; } public int getAuthPort() { return authPort; } /** * Sets the port for the current authentication scope. * * Setting this to a negative number (default value) indicates "any port" * for the scope. * * Used for BASIC and DIGEST authentication. * * @param authPort port for the scope */ public void setAuthPort(int authPort) { this.authPort = authPort; } public String getAuthRealm() { return authRealm; } /** * Sets the realm name for the current authentication scope. * * Setting this to null (the default value) indicates "any realm" * for the scope. * * Used for BASIC and DIGEST authentication. * * @param authRealm reaml name for the scope */ public void setAuthRealm(String authRealm) { this.authRealm = authRealm; } public String getUserAgent() { return userAgent; } public void setUserAgent(String userAgent) { this.userAgent = userAgent; } public String getProxyHost() { return proxyHost; } public void setProxyHost(String proxyHost) { this.proxyHost = proxyHost; } public int getProxyPort() { return proxyPort; } public void setProxyPort(int proxyPort) { this.proxyPort = proxyPort; } public String getProxyUsername() { return proxyUsername; } public void setProxyUsername(String proxyUsername) { this.proxyUsername = proxyUsername; } public String getProxyPassword() { return proxyPassword; } public void setProxyPassword(String proxyPassword) { this.proxyPassword = proxyPassword; } public String getProxyRealm() { return proxyRealm; } public void setProxyRealm(String proxyRealm) { this.proxyRealm = proxyRealm; } protected void authenticateUsingForm(DefaultHttpClient httpClient) { HttpPost post = new HttpPost(getAuthURL()); List<NameValuePair> formparams = new ArrayList<NameValuePair>(); formparams.add(new BasicNameValuePair(getAuthUsernameField(), getAuthUsername())); formparams.add(new 
BasicNameValuePair(getAuthPasswordField(), getAuthPassword())); try { UrlEncodedFormEntity entity = new UrlEncodedFormEntity(formparams, "UTF-8"); post.setEntity(entity); httpClient.execute(post); } catch (Exception e) { throw new HttpCollectorException(e); } post.releaseConnection(); } protected void setupBasicDigestAuth(DefaultHttpClient httpClient) { UsernamePasswordCredentials creds = new UsernamePasswordCredentials(getAuthUsername(), getAuthPassword()); AuthScope authscope = new AuthScope(getAuthHostname(), getAuthPort(), getAuthRealm(), getAuthMethod()); httpClient.getCredentialsProvider().setCredentials(authscope, creds); } }