Java tutorial
/** * License Agreement for OpenSearchServer * * Copyright (C) 2013-2014 Emmanuel Keller / Jaeksoft * * http://www.open-search-server.com * * This file is part of OpenSearchServer. * * OpenSearchServer is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * OpenSearchServer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with OpenSearchServer. * If not, see <http://www.gnu.org/licenses/>. **/ package com.jaeksoft.searchlib.crawler.web.spider; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.Charset; import java.nio.charset.UnsupportedCharsetException; import java.text.ParseException; import java.util.List; import java.util.Locale; import org.apache.commons.collections.CollectionUtils; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.ProtocolException; import org.apache.http.StatusLine; import org.apache.http.auth.AuthSchemeProvider; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.CredentialsProvider; import org.apache.http.client.RedirectStrategy; import org.apache.http.client.config.AuthSchemes; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.HttpRequestBase; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.protocol.HttpClientContext; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.cookie.Cookie; import org.apache.http.entity.ContentType; import org.apache.http.impl.auth.BasicSchemeFactory; import org.apache.http.impl.auth.DigestSchemeFactory; import org.apache.http.impl.auth.KerberosSchemeFactory; import org.apache.http.impl.auth.SPNegoSchemeFactory; import org.apache.http.impl.client.BasicCookieStore; import org.apache.http.impl.client.BasicCredentialsProvider; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultRedirectStrategy; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import com.jaeksoft.searchlib.Logging; import com.jaeksoft.searchlib.crawler.web.database.CookieItem; import com.jaeksoft.searchlib.crawler.web.database.CredentialItem; import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeDateFormat; import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeSimpleDateFormat; import com.jaeksoft.searchlib.util.IOUtils; import com.jaeksoft.searchlib.util.cifs.NTLMSchemeFactory; public abstract class HttpAbstract { private CloseableHttpClient httpClient = null; private RedirectStrategy redirectStrategy; private HttpResponse httpResponse = null; private HttpClientContext httpClientContext = null; private HttpRequestBase httpBaseRequest = null; private ProxyHandler proxyHandler; private HttpEntity httpEntity = null; private StatusLine statusLine = null; private BasicCookieStore cookieStore; private CredentialsProvider credentialsProvider; public HttpAbstract(String userAgent, boolean bFollowRedirect, ProxyHandler proxyHandler) { HttpClientBuilder builder = HttpClients.custom(); redirectStrategy = new DefaultRedirectStrategy(); if (userAgent != null) { userAgent = userAgent.trim(); if (userAgent.length() > 0) builder.setUserAgent(userAgent); else userAgent = null; } if (!bFollowRedirect) builder.disableRedirectHandling(); this.proxyHandler = proxyHandler; Registry<AuthSchemeProvider> authSchemeRegistry = RegistryBuilder.<AuthSchemeProvider>create() .register(AuthSchemes.NTLM, new NTLMSchemeFactory()) .register(AuthSchemes.BASIC, new BasicSchemeFactory()) .register(AuthSchemes.DIGEST, new DigestSchemeFactory()) .register(AuthSchemes.SPNEGO, new SPNegoSchemeFactory()) .register(AuthSchemes.KERBEROS, new KerberosSchemeFactory()).build(); credentialsProvider = new BasicCredentialsProvider(); builder.setDefaultCredentialsProvider(credentialsProvider); cookieStore = new BasicCookieStore(); builder.setDefaultCookieStore(cookieStore); builder.setDefaultCredentialsProvider(credentialsProvider); builder.setDefaultAuthSchemeRegistry(authSchemeRegistry); httpClient = builder.build(); } protected void reset() { httpResponse = null; httpBaseRequest = null; httpClientContext = null; synchronized (this) { if (httpEntity != null) { try { EntityUtils.consume(httpEntity); } catch (IOException e) { Logging.warn(e.getMessage(), e); } httpEntity = null; } statusLine = null; } } protected void execute(HttpRequestBase httpBaseRequest, CredentialItem credentialItem, List<CookieItem> cookies) throws ClientProtocolException, IOException, URISyntaxException { if (!CollectionUtils.isEmpty(cookies)) { List<Cookie> cookieList = cookieStore.getCookies(); for (CookieItem cookie : cookies) { Cookie newCookie = cookie.getCookie(); if (!cookieList.contains(newCookie)) cookieStore.addCookie(newCookie); } } this.httpBaseRequest = httpBaseRequest; // No more than one 1 minute to establish the connection // No more than 10 minutes to establish the socket // Enable stales connection checking // Cookies uses browser compatibility RequestConfig.Builder configBuilber = RequestConfig.custom().setSocketTimeout(1000 * 60 * 10) .setConnectTimeout(1000 * 60).setStaleConnectionCheckEnabled(true) .setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY); if (credentialItem == null) credentialsProvider.clear(); else credentialItem.setUpCredentials(credentialsProvider); URI uri = httpBaseRequest.getURI(); if (proxyHandler != null) proxyHandler.check(configBuilber, uri, credentialsProvider); httpBaseRequest.setConfig(configBuilber.build()); httpClientContext = new HttpClientContext(); httpResponse = httpClient.execute(httpBaseRequest, httpClientContext); if (httpResponse == null) return; statusLine = httpResponse.getStatusLine(); httpEntity = httpResponse.getEntity(); } public URI getRedirectLocation() { synchronized (this) { if (httpResponse == null) return null; if (httpClientContext == null) return null; try { if (!redirectStrategy.isRedirected(httpBaseRequest, httpResponse, httpClientContext)) return null; HttpUriRequest httpUri = redirectStrategy.getRedirect(httpBaseRequest, httpResponse, httpClientContext); if (httpUri == null) return null; return httpUri.getURI(); } catch (ProtocolException e) { Logging.error(e); return null; } } } final public Long getContentLength() { synchronized (this) { if (httpEntity != null) return httpEntity.getContentLength(); Header header = httpResponse.getFirstHeader("Content-Length"); if (header == null) return null; String value = header.getValue(); if (value == null) return null; return new Long(value); } } public String getContentDispositionFilename() { synchronized (this) { if (httpResponse == null) return null; Header header = httpResponse.getFirstHeader("Content-Disposition"); if (header == null) return null; String s = header.getValue(); int i1 = s.indexOf("filename="); if (i1 == -1) return null; i1 += 9; int i2 = s.indexOf(";", i1); String f = (i2 == -1) ? s.substring(i1) : s.substring(i1, i2); return f.replace("\"", ""); } } public String getContentBaseType() { synchronized (this) { Header header = null; if (httpEntity != null) header = httpEntity.getContentType(); if (header == null) header = httpResponse.getFirstHeader("Content-Type"); if (header == null) return null; String v = header.getValue(); int i = v.indexOf(';'); if (i == -1) return v; return v.substring(0, i); } } // Sun, 06 Nov 1994 08:49:37 GMT ; RFC 822, updated by RFC 1123 // Sunday, 06-Nov-94 08:49:37 GMT ; RFC 850, obsoleted by RFC 1036 // Sun Nov 6 08:49:37 1994 private final static String[] LastModifiedDateFormats = { "EEE, dd MMM yyyy HH:mm:ss zzz", "EEE, dd MMM yyyy HH:mm:ss z", "EEEE, dd-MMM-yy HH:mm:ss z", "EEE MMM d HH:mm:ss yyyy" }; private final static ThreadSafeDateFormat[] httpDatesFormats; static { int i = 0; httpDatesFormats = new ThreadSafeDateFormat[LastModifiedDateFormats.length * 2]; for (String format : LastModifiedDateFormats) { httpDatesFormats[i++] = new ThreadSafeSimpleDateFormat(format, Locale.ENGLISH); httpDatesFormats[i++] = new ThreadSafeSimpleDateFormat(format); } }; public Long getLastModified() { synchronized (this) { Header header = httpResponse.getFirstHeader("Last-Modified"); if (header == null) return null; String v = header.getValue(); if (v == null) return null; ParseException parseException = null; for (ThreadSafeDateFormat dateFormat : httpDatesFormats) { try { return dateFormat.parse(v).getTime(); } catch (ParseException e) { parseException = e; } } if (parseException != null) Logging.warn(parseException); return null; } } public final static void main(String[] argv) { for (ThreadSafeDateFormat dateFormat : httpDatesFormats) { try { System.out.println(dateFormat.parse("Thu, 21 Feb 2013 20:11:52 GMT").getTime()); } catch (ParseException e) { e.printStackTrace(); } } } public String getContentTypeCharset() { synchronized (this) { if (httpEntity == null) return null; try { ContentType ct = ContentType.getOrDefault(httpEntity); if (ct == null) return null; Charset charset = ct.getCharset(); if (charset == null) return null; return charset.name(); } catch (UnsupportedCharsetException e) { Logging.warn(e); return null; } } } public String getContentEncoding() { synchronized (this) { if (httpEntity == null) return null; Header header = httpEntity.getContentEncoding(); if (header == null) return null; return header.getValue(); } } public String getContentLocation() { synchronized (this) { if (httpResponse == null) return null; Header header = httpResponse.getFirstHeader("Content-Location"); if (header == null) return null; return header.getValue(); } } protected InputStream getContent() throws IllegalStateException, IOException { synchronized (this) { if (httpEntity == null) return null; return httpEntity.getContent(); } } public Integer getStatusCode() { synchronized (this) { if (statusLine == null) return null; return statusLine.getStatusCode(); } } public String getReasonPhrase() { synchronized (this) { if (statusLine == null) return null; return statusLine.getReasonPhrase(); } } public void release() { synchronized (this) { try { reset(); IOUtils.close(httpClient); } catch (Exception e) { Logging.warn(e.getMessage(), e); } } } public Header[] getHeaders() { synchronized (this) { if (httpResponse == null) return null; return httpResponse.getAllHeaders(); } } }