com.serphacker.serposcope.scraper.http.ScrapClient.java Source code

Java tutorial

Introduction

Here is the source code for com.serphacker.serposcope.scraper.http.ScrapClient.java

Source

/* 
 * Serposcope - SEO rank checker https://serposcope.serphacker.com/
 * 
 * Copyright (c) 2016 SERP Hacker
 * @author Pierre Nogues <support@serphacker.com>
 * @license https://opensource.org/licenses/MIT MIT License
 */
package com.serphacker.serposcope.scraper.http;

import com.serphacker.serposcope.scraper.http.extensions.CloseableBasicHttpClientConnectionManager;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientPlainConnectionFactory;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientSSLConnectionFactory;
import com.serphacker.serposcope.scraper.http.extensions.ScrapClientSocksAuthenticator;
import com.serphacker.serposcope.scraper.http.proxy.BindProxy;
import com.serphacker.serposcope.scraper.http.proxy.DirectNoProxy;
import com.serphacker.serposcope.scraper.http.proxy.HttpProxy;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.Credentials;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.routing.HttpRoutePlanner;
import org.apache.http.conn.routing.RouteInfo;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.serphacker.serposcope.scraper.http.proxy.ScrapProxy;
import com.serphacker.serposcope.scraper.http.proxy.SocksProxy;
import com.serphacker.serposcope.scraper.utils.EncodeUtils;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.entity.mime.HttpMultipartMode;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.entity.mime.content.ContentBody;
import org.apache.http.impl.DefaultConnectionReuseStrategy;
import org.apache.http.impl.client.RedirectLocations;
import org.apache.http.message.BasicNameValuePair;

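/**
 * HTTP client wrapper used for scraping. It keeps a cookie store, an optional
 * proxy and per-client request headers, and exposes the outcome of the last
 * request (status code, body, exception, last redirect).
 * <p>
 * Not thread safe.
 *
 * @author admin
 */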
public class ScrapClient implements Closeable, CredentialsProvider {

    /**
     * Body encodings accepted by the {@code post} methods.
     */
    public enum PostType {
        /** The fields are sent as an URL-encoded form entity. */
        URL_ENCODED,
        /** The fields are sent as a multipart entity. */
        MULTIPART
    }

    private static final Logger LOG = LoggerFactory.getLogger(ScrapClient.class);

    /** User-Agent header value sent when the request does not already carry one. */
    public final static String DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0";
    /** Default timeout, in milliseconds, applied to socket, connect and connection-request timeouts. */
    public final static int DEFAULT_TIMEOUT_MS = 30000;
    /** Default response size limit; setMaxResponseLength adds 1, giving a 4 MiB read buffer. */
    public final static int DEFAULT_MAX_RESPONSE_LENGTH = (1024 * 1024 * 4) - 1;// 4MB

    // --- HTTP plumbing, wired together in the constructor ---
    CloseableHttpClient client;
    // fallback credentials store used by getCredentials/setCredentials/clear
    BasicCredentialsProvider credentialProvider = new BasicCredentialsProvider();
    BasicCookieStore basicCookieStore = new BasicCookieStore();
    // also used as the monitor guarding setProxy() and request()
    final CloseableBasicHttpClientConnectionManager connManager;
    ScrapClientPlainConnectionFactory plainConnectionFactory = new ScrapClientPlainConnectionFactory();
    ScrapClientSSLConnectionFactory sslConnectionFactory = new ScrapClientSSLConnectionFactory(
            plainConnectionFactory);

    // --- configuration ---
    String useragent = DEFAULT_USER_AGENT;
    // null means "do not set any timeout" (see setTimeout / initializeRequest)
    Integer timeoutMS = DEFAULT_TIMEOUT_MS;
    // null means direct connection; setProxy() maps DirectNoProxy to null
    ScrapProxy proxy;
    // stored as (configured limit + 1), see setMaxResponseLength
    int maxResponseLength;
    // reusable read buffer of maxResponseLength bytes, filled by request()
    byte[] buffer;
    // headers applied to every request by initializeRequest
    List<Header> requestHeaders = new ArrayList<>();
    // target host -> replacement host, consulted by SCliHttpRoutePlanner
    Map<HttpHost, HttpHost> routes = new HashMap<>();
    // set by setProxy(), reset at the end of every request(); disables keep-alive while true
    boolean proxyChangedSinceLastRequest;
    // 0 disables redirect following
    int maxRedirect = 0;

    // --- outcome of the last request, reset by clearPreviousRequest ---
    // start timestamp while a request runs, elapsed milliseconds once it is done
    long executionTimeMS;
    CloseableHttpResponse response;
    byte[] content;
    // HTTP status code, or -1 when the request failed with an exception
    int statusCode;
    Exception exception;
    // last redirect location recorded by the HTTP client context, if any
    String lastRedirect;

    /**
     * Connection reuse strategy that refuses keep-alive when the proxy changed
     * since the last request, or when the current proxy is anything other than
     * "none" or a {@link BindProxy}. Otherwise the default strategy decides.
     */
    class SCliConnectionReuseStrategy extends DefaultConnectionReuseStrategy {

        @Override
        public boolean keepAlive(HttpResponse response, HttpContext context) {
            if (proxyChangedSinceLastRequest) {
                return false;
            }
            boolean reusableProxyKind = proxy == null || proxy instanceof BindProxy;
            return reusableProxyKind && super.keepAlive(response, context);
        }
    }

    /**
     * Route planner that first maps the requested host through {@code routes}
     * and then builds the route according to the kind of proxy currently set
     * on the enclosing client.
     */
    class SCliHttpRoutePlanner implements HttpRoutePlanner {

        /**
         * Builds the route for one request.
         *
         * @param originaltarget host the request was addressed to
         * @param request the request being routed (not inspected)
         * @param context request context; receives the {@code "proxy.socks"}
         *        attribute when a SOCKS proxy is configured
         * @return the route to the (possibly remapped) target
         * @throws HttpException if the bind address of a {@link BindProxy}
         *         cannot be resolved
         * @throws UnsupportedOperationException for an unknown proxy type
         */
        @Override
        public HttpRoute determineRoute(HttpHost originaltarget, HttpRequest request, HttpContext context)
                throws HttpException {
            boolean ssl = "https".equalsIgnoreCase(originaltarget.getSchemeName());
            HttpHost target = routes.getOrDefault(originaltarget, originaltarget);

            if (proxy == null) {
                // Flag https routes as secure, like the bind / HTTP proxy branches
                // below: HttpRoute(target) alone always yields a non-secure route.
                return new HttpRoute(target, (InetAddress) null, ssl);
            }

            if (proxy instanceof SocksProxy) {
                SocksProxy socksProxy = (SocksProxy) proxy;
                // the SOCKS endpoint is handed over through the context, the
                // route itself stays a direct one
                context.setAttribute("proxy.socks",
                        new InetSocketAddress(socksProxy.getIp(), socksProxy.getPort()));
                return new HttpRoute(target, (InetAddress) null, ssl);
            }

            if (proxy instanceof BindProxy) {
                BindProxy bindProxy = (BindProxy) proxy;
                try {
                    return new HttpRoute(target, InetAddress.getByName(bindProxy.ip), ssl);
                } catch (UnknownHostException cause) {
                    throw new HttpException("invalid bind ip", cause);
                }
            }

            if (proxy instanceof HttpProxy) {
                HttpProxy httpProxy = (HttpProxy) proxy;

                // https through an HTTP proxy is tunnelled and layered
                return new HttpRoute(target, null, new HttpHost(httpProxy.getIp(), httpProxy.getPort()), ssl,
                        ssl ? RouteInfo.TunnelType.TUNNELLED : RouteInfo.TunnelType.PLAIN,
                        ssl ? RouteInfo.LayerType.LAYERED : RouteInfo.LayerType.PLAIN);
            }

            throw new UnsupportedOperationException("unsupported proxy type : " + proxy);
        }

    }

    public ScrapClient() {
        setMaxResponseLength(DEFAULT_MAX_RESPONSE_LENGTH);

        sslConnectionFactory.setInsecure(false);

        connManager = new CloseableBasicHttpClientConnectionManager(
                RegistryBuilder.<ConnectionSocketFactory>create().register("http", plainConnectionFactory)
                        .register("https", sslConnectionFactory).build());

        client = HttpClients.custom().setRoutePlanner(this.new SCliHttpRoutePlanner())
                .setDefaultCredentialsProvider(this).setDefaultCookieStore(basicCookieStore)
                .setConnectionReuseStrategy(this.new SCliConnectionReuseStrategy())
                .setConnectionManager(connManager).build();

        setTimeout(timeoutMS);
    }

    /**
     * Adds one cookie to this client's cookie store.
     *
     * @param cookie cookie to add
     */
    public void addCookie(Cookie cookie) {
        this.basicCookieStore.addCookie(cookie);
    }

    /**
     * Adds an array of cookies to this client's cookie store.
     *
     * @param cookies cookies to add
     */
    public void addCookies(Cookie[] cookies) {
        this.basicCookieStore.addCookies(cookies);
    }

    /**
     * Adds every cookie of the collection to this client's cookie store, in
     * iteration order.
     *
     * @param cookies cookies to add
     */
    public void addCookies(Collection<Cookie> cookies) {
        cookies.forEach(basicCookieStore::addCookie);
    }

    /**
     * @return the cookies currently held by this client's cookie store
     */
    public List<Cookie> getCookies() {
        return this.basicCookieStore.getCookies();
    }

    /**
     * Asks the cookie store to drop the cookies expired as of {@code date}.
     *
     * @param date reference date handed to the cookie store
     * @return the value returned by the cookie store's {@code clearExpired}
     */
    public boolean clearExpiredCookies(Date date) {
        return this.basicCookieStore.clearExpired(date);
    }

    /**
     * Removes every cookie from this client's cookie store.
     */
    public void clearCookies() {
        this.basicCookieStore.clear();
    }

    /**
     * @return the User-Agent value used when a request does not define one
     */
    public String getUseragent() {
        return this.useragent;
    }

    /**
     * Sets the User-Agent value used when a request does not define one.
     *
     * @param useragent new User-Agent value
     */
    public void setUseragent(String useragent) {
        this.useragent = useragent;
    }

    /**
     * Switches the proxy used by subsequent requests. The current connection
     * is closed first and the change is remembered so that keep-alive is
     * refused for the next request.
     *
     * @param proxy new proxy; {@code null} or a {@link DirectNoProxy} both
     *        result in no proxy being used
     */
    public void setProxy(ScrapProxy proxy) {
        synchronized (connManager) {
            connManager.closeConnection();
        }
        proxyChangedSinceLastRequest = true;

        // instanceof is false for null, so null stays null
        this.proxy = (proxy instanceof DirectNoProxy) ? null : proxy;

        if (proxy instanceof SocksProxy) {
            // register the SOCKS proxy with the shared authenticator
            ScrapClientSocksAuthenticator.INSTANCE.addProxy((SocksProxy) proxy);
        }
    }

    /**
     * @return the proxy in use, or {@code null} when connecting directly
     */
    public ScrapProxy getProxy() {
        return this.proxy;
    }

    /**
     * @return the configured timeout in milliseconds, or {@code null} when
     *         none is set
     */
    public Integer getTimeout() {
        return this.timeoutMS;
    }

    /**
     * Stores the timeout and installs a matching socket configuration on the
     * connection manager.
     *
     * @param timeoutMS timeout in milliseconds; {@code null} installs a socket
     *        configuration without an explicit SO_TIMEOUT
     */
    public final void setTimeout(Integer timeoutMS) {
        this.timeoutMS = timeoutMS;
        SocketConfig socketConfig = (timeoutMS == null)
                ? SocketConfig.custom().build()
                : SocketConfig.custom().setSoTimeout(timeoutMS).build();
        connManager.setSocketConfig(socketConfig);
    }

    /**
     * Returns the response size limit as it was passed to
     * {@code setMaxResponseLength}.
     * <p>
     * The field holds the limit plus one (the extra byte lets the reader
     * detect an oversized body), so that byte is subtracted here to keep the
     * getter symmetric with the setter.
     *
     * @return the configured maximum response length in bytes
     */
    public int getMaxResponseLength() {
        return maxResponseLength - 1;
    }

    /**
     * Sets the response size limit and reallocates the read buffer.
     * <p>
     * The stored value and the buffer size are {@code maxResponseLength + 1}:
     * the extra byte is what lets {@code request} notice that a body exceeds
     * the limit.
     *
     * @param maxResponseLength maximum accepted body size in bytes
     */
    public final void setMaxResponseLength(int maxResponseLength) {
        int capacity = maxResponseLength + 1;
        this.maxResponseLength = capacity;
        this.buffer = new byte[capacity];
    }

    /**
     * @return the response object of the last request, or {@code null} if
     *         there is none; {@code request} closes it before returning
     */
    public CloseableHttpResponse getResponse() {
        return this.response;
    }

    /**
     * @return the raw body of the last request, or {@code null} when the
     *         request failed or none was made
     */
    public byte[] getContent() {
        return this.content;
    }

    /**
     * Decodes the last response body using the detected charset, falling back
     * to UTF-8 when none could be detected.
     *
     * @return the decoded body, or {@code null} when there is no response or
     *         no content
     */
    public String getContentAsString() {
        if (response == null || content == null) {
            return null;
        }

        Charset detected = getDetectedCharset();
        Charset effective = (detected != null) ? detected : Charset.forName("UTF-8");
        return new String(content, effective);
    }

    /**
     * Determines the charset of the last response: first from the entity's
     * content type, then, for {@code text/html} bodies only, from a charset
     * declaration found in the content itself.
     *
     * @return the detected charset, or {@code null} when it cannot be
     *         determined
     */
    public Charset getDetectedCharset() {
        ContentType contentType;
        try {
            contentType = ContentType.get(response.getEntity());
        } catch (Exception ignored) {
            // best effort: also reached when there is no response at all
            return null;
        }
        if (contentType == null) {
            return null;
        }

        Charset declared = null;
        try {
            declared = contentType.getCharset();
        } catch (final Exception ignored) {
            // best effort: treated as "no charset declared"
        }
        if (declared != null) {
            return declared;
        }

        if (contentType.getMimeType().contains("text/html")) {
            return detectCharsetFromHtmlMeta();
        }
        return null;
    }

    /** Matches a {@code charset=...} declaration, optionally quoted. */
    final static Pattern pcharset = Pattern.compile("charset=['\"]?([^\"'\\s]+)");

    /**
     * Looks for a {@code charset=} declaration in the first 4096 bytes of the
     * response body.
     *
     * @return the declared charset, or {@code null} when there is no content,
     *         no declaration, or the declared name is not a usable charset
     */
    protected Charset detectCharsetFromHtmlMeta() {
        if (content == null) {
            return null;
        }

        int scanLength = Math.min(content.length, 4096);
        Matcher declaration = pcharset.matcher(new ByteCharSequence(content, 0, scanLength));
        if (!declaration.find()) {
            return null;
        }

        try {
            return Charset.forName(declaration.group(1));
        } catch (Exception ignored) {
            // unknown or illegal charset name: report "not detected"
            return null;
        }
    }

    /**
     * Returns the value of the first response header with the given name.
     *
     * @param key header name
     * @return the header value, or {@code null} when there is no response or
     *         no such header
     */
    public String getResponseHeader(String key) {
        if (response == null) {
            return null;
        }
        Header first = response.getFirstHeader(key);
        return (first == null) ? null : first.getValue();
    }

    /**
     * @return the status code of the last request; {@code -1} when it failed
     *         with an exception, {@code 0} after the state has been cleared
     */
    public int getStatusCode() {
        return this.statusCode;
    }

    /**
     * @return the exception recorded for the last request, or {@code null}
     *         when none was recorded
     */
    public Exception getException() {
        return this.exception;
    }

    /**
     * Performs a GET request without a Referer header.
     *
     * @param url URL to fetch
     * @return the status code, or {@code -1} on failure
     */
    public int get(String url) {
        final String noReferrer = null;
        return get(url, noReferrer);
    }

    /**
     * Performs a GET request.
     *
     * @param url URL to fetch
     * @param referrer value of the Referer header, or {@code null} for none
     * @return the status code, or {@code -1} on failure
     */
    public int get(String url, String referrer) {
        HttpGet httpGet = new HttpGet(url);
        if (referrer != null) {
            httpGet.addHeader("Referer", referrer);
        }
        return request(httpGet);
    }

    /**
     * Performs a POST request with the default charset and no Referer header.
     *
     * @param url URL to post to
     * @param data form fields
     * @param dataType body encoding
     * @return the status code, or {@code -1} on failure
     */
    public int post(String url, Map<String, Object> data, PostType dataType) {
        final String defaultCharset = null;
        return post(url, data, dataType, defaultCharset);
    }

    /**
     * Performs a POST request without a Referer header.
     *
     * @param url URL to post to
     * @param data form fields
     * @param dataType body encoding
     * @param charset charset name, or {@code null} for the default
     * @return the status code, or {@code -1} on failure
     */
    public int post(String url, Map<String, Object> data, PostType dataType, String charset) {
        final String noReferrer = null;
        return post(url, data, dataType, charset, noReferrer);
    }

    /**
     * Performs a POST request.
     * <p>
     * Keys and string values that cannot be encoded in the chosen charset are
     * forced to ASCII first. Problems while building the body are not thrown:
     * they are stored (see {@code getException}) and {@code -1} is returned.
     *
     * @param url URL to post to
     * @param data form fields, {@code null} being treated as no fields; for
     *        {@code URL_ENCODED} non-string values are sent via
     *        {@code toString()} and {@code null} values as a name without
     *        value; for {@code MULTIPART} values must be {@code String},
     *        {@code byte[]} or {@code ContentBody}
     * @param dataType body encoding
     * @param charset charset name; {@code null} or an invalid name selects
     *        utf-8
     * @param referrer value of the Referer header, or {@code null} for none
     * @return the status code, or {@code -1} on failure
     */
    public int post(String url, Map<String, Object> data, PostType dataType, String charset, String referrer) {
        clearPreviousRequest();

        HttpPost request = new HttpPost(url);
        HttpEntity entity = null;

        if (dataType == null) {
            // switching on a null enum would throw instead of reporting
            exception = new UnsupportedOperationException("unsupported PostType " + dataType);
            return statusCode = -1;
        }

        String charsetName = (charset == null) ? "utf-8" : charset;
        Charset detectedCharset;
        try {
            detectedCharset = Charset.forName(charsetName);
        } catch (Exception ex) {
            LOG.warn("invalid charset name {}, switching to utf-8", charsetName);
            detectedCharset = Charset.forName("utf-8");
        }

        Map<String, Object> fields = handleUnsupportedEncoding(
                data == null ? new HashMap<String, Object>() : data, detectedCharset);

        switch (dataType) {
        case URL_ENCODED: {
            List<NameValuePair> formparams = new ArrayList<>();
            for (Map.Entry<String, Object> entry : fields.entrySet()) {
                Object value = entry.getValue();
                if (value == null || value instanceof String) {
                    formparams.add(new BasicNameValuePair(entry.getKey(), (String) value));
                } else {
                    LOG.warn("trying to url encode non string data");
                    formparams.add(new BasicNameValuePair(entry.getKey(), value.toString()));
                }
            }

            try {
                entity = new UrlEncodedFormEntity(formparams, detectedCharset);
            } catch (Exception ex) {
                statusCode = -1;
                exception = ex;
                return statusCode;
            }
            break;
        }

        case MULTIPART: {
            MultipartEntityBuilder builder = MultipartEntityBuilder.create().setCharset(detectedCharset)
                    .setMode(HttpMultipartMode.BROWSER_COMPATIBLE);

            ContentType formDataCT = ContentType.create("form-data", detectedCharset);

            for (Map.Entry<String, Object> entry : fields.entrySet()) {
                String key = entry.getKey();
                Object value = entry.getValue();

                if (value instanceof String) {
                    builder = builder.addTextBody(key, (String) value, formDataCT);
                } else if (value instanceof byte[]) {
                    builder = builder.addBinaryBody(key, (byte[]) value);
                } else if (value instanceof ContentBody) {
                    builder = builder.addPart(key, (ContentBody) value);
                } else {
                    // also reached for null values, which have no class to report
                    exception = new UnsupportedOperationException(
                            "unsupported body type " + (value == null ? "null" : value.getClass().toString()));
                    return statusCode = -1;
                }
            }

            entity = builder.build();
            break;
        }

        default:
            exception = new UnsupportedOperationException("unsupported PostType " + dataType);
            return statusCode = -1;
        }

        request.setEntity(entity);
        if (referrer != null) {
            request.addHeader("Referer", referrer);
        }
        return request(request);
    }

    /**
     * Copies {@code data} into a new map, replacing every key and every string
     * value that cannot be encoded in {@code detectedCharset} by its
     * forced-ASCII form. Non-string values are copied untouched. A single
     * warning is logged when at least one replacement happened.
     *
     * @param data fields to check
     * @param detectedCharset charset the fields will be encoded with
     * @return a new map holding the cleaned fields
     */
    protected Map<String, Object> handleUnsupportedEncoding(Map<String, Object> data, Charset detectedCharset) {
        Map<String, Object> sanitized = new HashMap<>();
        boolean forcedToAscii = false;

        for (Map.Entry<String, Object> field : data.entrySet()) {
            String name = field.getKey();
            if (!EncodeUtils.canEncode(name, detectedCharset.name())) {
                forcedToAscii = true;
                name = EncodeUtils.forceASCII(name);
            }

            Object payload = field.getValue();
            if (payload instanceof String && !EncodeUtils.canEncode((String) payload, detectedCharset.name())) {
                forcedToAscii = true;
                payload = EncodeUtils.forceASCII((String) payload);
            }

            sanitized.put(name, payload);
        }

        if (forcedToAscii) {
            LOG.warn("failed to encode some post data to {} forced to ascii", detectedCharset.name());
        }
        return sanitized;
    }

    /**
     * Resets everything recorded about the previous request: response,
     * content, status code, exception and last redirect.
     */
    protected void clearPreviousRequest() {
        response = null;
        content = null;
        statusCode = 0;
        exception = null;
        lastRedirect = null;
    }

    /**
     * Executes the request and records its outcome on this client: status
     * code, body, last redirect, execution time and, on failure, the
     * exception.
     * <p>
     * The body is read into the shared buffer and copied into {@code content}.
     * A response without an entity yields an empty body. A body larger than
     * the configured limit makes the request fail with a
     * {@code ResponseTooBigException}. The response is always closed before
     * this method returns.
     *
     * @param request request to execute
     * @return the HTTP status code, or {@code -1} when an exception occurred
     *         (available through {@code getException})
     */
    public int request(HttpRequestBase request) {
        synchronized (connManager) {
            try {
                clearPreviousRequest();
                executionTimeMS = System.currentTimeMillis();

                HttpClientContext context = HttpClientContext.create();
                initializeRequest(request, context);

                response = client.execute(request, context);
                statusCode = response.getStatusLine().getStatusCode();
                RedirectLocations redirects = context.getAttribute(HttpClientContext.REDIRECT_LOCATIONS,
                        RedirectLocations.class);
                if (redirects != null && !redirects.isEmpty()) {
                    lastRedirect = redirects.get(redirects.size() - 1).toString();
                }

                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    // no entity on the response: a success with an empty
                    // body, not a failure
                    content = new byte[0];
                } else {
                    long contentLength = entity.getContentLength();

                    // maxResponseLength holds (limit + 1), so ">=" rejects
                    // every announced length above the limit before reading
                    if (contentLength >= maxResponseLength) {
                        throw new ResponseTooBigException("content length (" + contentLength + ") "
                                + "is greater than max response length (" + (maxResponseLength - 1) + ")");
                    }

                    InputStream stream = entity.getContent();
                    int totalRead = 0;
                    int read = 0;

                    while (totalRead < maxResponseLength
                            && (read = stream.read(buffer, totalRead, maxResponseLength - totalRead)) != -1) {
                        totalRead += read;
                    }

                    // filling the (limit + 1) byte buffer means the body exceeds the limit
                    if (totalRead == maxResponseLength && read != 0) {
                        throw new ResponseTooBigException("already read " + totalRead + " bytes");
                    }
                    content = Arrays.copyOfRange(buffer, 0, totalRead);
                }

            } catch (Exception ex) {
                content = null;
                statusCode = -1;
                exception = ex;
            } finally {
                proxyChangedSinceLastRequest = false;
                closeResponse();
                // turn the start timestamp into the elapsed time
                executionTimeMS = System.currentTimeMillis() - executionTimeMS;
            }

            return statusCode;
        }
    }

    /**
     * Prepares a request before execution: default User-Agent, the client's
     * request headers, timeouts and redirect policy. The resulting
     * configuration is set on both the request and the context.
     *
     * @param request request to prepare
     * @param context context that receives the request configuration
     */
    protected void initializeRequest(HttpRequestBase request, HttpClientContext context) {
        if (request.getFirstHeader("user-agent") == null) {
            request.setHeader("User-Agent", useragent);
        }

        // client headers replace same-named headers already on the request
        for (Header clientHeader : requestHeaders) {
            request.setHeader(clientHeader);
        }

        RequestConfig baseConfig = (request.getConfig() != null) ? request.getConfig() : RequestConfig.DEFAULT;
        RequestConfig.Builder configBuilder = RequestConfig.copy(baseConfig);

        if (timeoutMS != null) {
            configBuilder.setConnectTimeout(timeoutMS);
            configBuilder.setConnectionRequestTimeout(timeoutMS);
            configBuilder.setSocketTimeout(timeoutMS);
        }

        if (maxRedirect != 0) {
            configBuilder.setMaxRedirects(maxRedirect);
        } else {
            configBuilder.setRedirectsEnabled(false);
        }

        RequestConfig effectiveConfig = configBuilder.build();
        request.setConfig(effectiveConfig);
        context.setAttribute(HttpClientContext.REQUEST_CONFIG, effectiveConfig);
    }

    /**
     * Closes the current response, if any. A failure to close is logged as a
     * warning and otherwise ignored.
     */
    public void closeResponse() {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (Exception ex) {
            LOG.warn("Exception while closing response", ex);
        }
    }

    /**
     * Closes the current response, then the underlying HTTP client.
     *
     * @throws IOException if closing the HTTP client fails
     */
    @Override
    public void close() throws IOException {
        closeResponse();
        if (client == null) {
            return;
        }
        client.close();
    }

    /**
     * Registers a host mapping: requests addressed to {@code to} are routed to
     * {@code via} instead. An existing mapping for {@code to} is replaced.
     *
     * @param to host requests are addressed to
     * @param via host actually used as the route target
     */
    public void setRoute(HttpHost to, HttpHost via) {
        this.routes.put(to, via);
    }

    /**
     * Removes the mapping whose key is {@code host}.
     * <p>
     * NOTE(review): despite the name, the lookup is done on the map key, i.e.
     * the {@code to} host given to {@code setRoute}, not on the {@code via}
     * host. Confirm against callers before renaming or changing.
     *
     * @param host key of the mapping to remove
     */
    public void removeRouteVia(HttpHost host) {
        this.routes.remove(host);
    }

    /**
     * Removes every mapping whose value has the given host name.
     * <p>
     * NOTE(review): the comparison is done on the mapped value, i.e. the
     * {@code via} host given to {@code setRoute}, not on the {@code to} key.
     * Confirm against callers before renaming or changing.
     *
     * @param host host name compared with each mapped host's name
     */
    public void removeRoutesTo(String host) {
        // removing through the values() view drops the whole mapping
        routes.values().removeIf((HttpHost mapped) -> host.equals(mapped.getHostName()));
    }

    /**
     * Removes every registered host mapping.
     */
    public void removeRoutes() {
        this.routes.clear();
    }

    /**
     * Supplies credentials for an authentication scope. When the current proxy
     * is an {@link HttpProxy} matching the scope's host and port and carrying
     * both a username and a password, those are returned. Otherwise the lookup
     * is delegated to the internal credentials provider.
     *
     * @param authscope scope credentials are requested for
     * @return matching credentials, or whatever the internal provider returns
     */
    @Override
    public Credentials getCredentials(AuthScope authscope) {
        if (proxy instanceof HttpProxy) {
            HttpProxy httpProxy = (HttpProxy) proxy;
            boolean scopeIsProxy = httpProxy.getIp().equals(authscope.getHost())
                    && httpProxy.getPort() == authscope.getPort();
            if (scopeIsProxy && httpProxy.getUsername() != null && httpProxy.getPassword() != null) {
                return new UsernamePasswordCredentials(httpProxy.getUsername(), httpProxy.getPassword());
            }
        }

        return credentialProvider.getCredentials(authscope);
    }

    /**
     * Stores credentials for a scope in the internal credentials provider.
     *
     * @param scope authentication scope
     * @param auth credentials for that scope
     */
    @Override
    public void setCredentials(AuthScope scope, Credentials auth) {
        this.credentialProvider.setCredentials(scope, auth);
    }

    /**
     * Clears the internal credentials provider. Cookies, routes and request
     * headers are not touched.
     */
    @Override
    public void clear() {
        this.credentialProvider.clear();
    }

    /**
     * Sets a header sent with every request, replacing any header previously
     * registered under the same name.
     *
     * @param header header to register
     */
    public void setRequestHeader(Header header) {
        String headerName = header.getName();
        removeRequestHeadersByName(headerName);
        requestHeaders.add(header);
    }

    /**
     * Removes every registered request header whose name equals {@code name},
     * ignoring case.
     * <p>
     * The comparison uses {@code equalsIgnoreCase} instead of lower-casing
     * both sides: {@code toLowerCase()} depends on the default locale and can
     * make ASCII header names compare unequal.
     *
     * @param name header name to remove; {@code null} matches nothing
     */
    public void removeRequestHeadersByName(String name) {
        requestHeaders.removeIf((Header registered) -> registered.getName().equalsIgnoreCase(name));
    }

    /**
     * @return the duration of the last request in milliseconds
     */
    public long getExecutionTimeMS() {
        return this.executionTimeMS;
    }

    /**
     * @return the insecure flag of the SSL connection factory
     */
    public boolean isInsecureSSL() {
        return this.sslConnectionFactory.isInsecure();
    }

    /**
     * Sets the insecure flag of the SSL connection factory.
     *
     * @param insecureSSL new value of the flag
     */
    public void setInsecureSSL(boolean insecureSSL) {
        sslConnectionFactory.setInsecure(insecureSSL);
    }

    /**
     * @return the maximum number of redirects followed; {@code 0} means
     *         redirects are not followed
     */
    public int getMaxRedirect() {
        return this.maxRedirect;
    }

    /**
     * Sets the maximum number of redirects followed.
     *
     * @param maxRedirect redirect limit; {@code 0} disables redirect following
     */
    public void setMaxRedirect(int maxRedirect) {
        this.maxRedirect = maxRedirect;
    }

    /**
     * Enables redirect following with a limit of 10 redirects.
     */
    public void enableFollowRedirect() {
        this.maxRedirect = 10;
    }

    /**
     * Disables redirect following.
     */
    public void disableFollowRedirect() {
        this.maxRedirect = 0;
    }

    /**
     * @return the last redirect location recorded for the last request, or
     *         {@code null} when no redirect was followed
     */
    public String getLastRedirect() {
        return this.lastRedirect;
    }

}