org.tinymediamanager.scraper.util.Url.java Source code

Java tutorial

Introduction

Here is the source code for org.tinymediamanager.scraper.util.Url.java

Source

/*
 * Copyright 2012 - 2015 Manuel Laggner
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tinymediamanager.scraper.util;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.StatusLine;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicHeader;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The Class Url. Used to make simple, blocking URL requests. The request is temporarily streamed into a ByteArrayInputStream, before the InputStream
 * is passed to the caller.
 * <p>
 * The response related getters (status, headers, charset, content length, ...) read state which is filled by {@link #getInputStream()}; before a
 * HTTP request has been executed they return their documented "unknown" values.
 * 
 * @author Manuel Laggner / Myron Boyle
 */
public class Url {
    private static final Logger LOGGER = LoggerFactory.getLogger(Url.class);
    // 'harset' instead of 'charset' in the regexp because some sites send 'Charset'; compiled once since it never changes
    private static final Pattern CHARSET_PATTERN = Pattern.compile("harset *=[ '\"]*([^ ;'\"]+)[ ;'\"]*");
    protected static CloseableHttpClient client;

    protected StatusLine responseStatus = null;
    protected String url = null;
    protected Header[] headersResponse = null;
    protected List<Header> headersRequest = new ArrayList<Header>();
    protected HttpEntity entity = null;
    protected URI uri = null;

    /**
     * gets the specified header value from this connection<br>
     * You need to call this AFTER getInputstream().
     * 
     * @param header
     *          the header you want to know (like Content-Length); compared case insensitive
     * @return the header value, or an empty string if there are no response headers (yet) or the header is not present
     */
    public String getHeader(String header) {
        if (headersResponse == null) {
            return "";
        }
        for (Header h : headersResponse) {
            if (h.getName().equalsIgnoreCase(header)) {
                return h.getValue();
            }
        }
        return "";
    }

    /**
     * get all response headers
     * 
     * @return the response headers, or null if no response has been received yet
     */
    public Header[] getHeadersResponse() {
        return headersResponse;
    }

    /**
     * Instantiates a new url / httpclient with default user-agent.
     * 
     * @param url
     *          the url
     * @throws MalformedURLException
     *           if the given url can not be parsed as URL or converted to an URI; the underlying URISyntaxException (if any) is attached as cause
     */
    public Url(String url) throws MalformedURLException {
        if (client == null) {
            client = TmmHttpClient.getHttpClient();
        }
        this.url = url;

        // morph to URI to check syntax of the url
        try {
            this.uri = morphStringToUri(url);
        } catch (URISyntaxException e) {
            // MalformedURLException has no cause constructor -> attach the cause manually so it is not lost
            MalformedURLException malformed = new MalformedURLException(url);
            malformed.initCause(e);
            throw malformed;
        }

        // default user agent
        addHeader(HttpHeaders.USER_AGENT, UrlUtil.generateUA());
    }

    /**
     * morph the url (string) to an URI to check the syntax and escape the path
     * 
     * @param urlToMorph
     *          the url to morph
     * @return the morphed URI
     * @throws MalformedURLException
     *           if the string can not be parsed as URL
     * @throws URISyntaxException
     *           if the URL components do not form a valid URI
     */
    private URI morphStringToUri(String urlToMorph) throws MalformedURLException, URISyntaxException {
        URL parsed = new URL(urlToMorph);
        // the multi argument URI constructor quotes illegal characters of the single components
        return new URI(parsed.getProtocol(), parsed.getUserInfo(), parsed.getHost(), parsed.getPort(),
                parsed.getPath(), parsed.getQuery(), parsed.getRef());
    }

    /**
     * set a specified User-Agent (replaces the one which has been set before)
     * 
     * @param userAgent
     *          the User-Agent to send
     */
    public void setUserAgent(String userAgent) {
        addHeader(HttpHeaders.USER_AGENT, userAgent);
    }

    /**
     * Gets the url.
     * 
     * @return the url
     * @throws IOException
     *           Signals that an I/O exception has occurred.
     */
    public URL getUrl() throws IOException, InterruptedException {
        return new URL(url);
    }

    /**
     * Adds the header. Any request header with the same name which has been added before is replaced.
     * 
     * @param key
     *          the key; blank keys are ignored
     * @param value
     *          the value
     */
    public void addHeader(String key, String value) {
        if (StringUtils.isBlank(key)) {
            return;
        }

        // check for duplicates; HTTP header names are case insensitive, so "user-agent" has to replace "User-Agent"
        // (iterating backwards to be able to remove by index while looping)
        for (int i = headersRequest.size() - 1; i >= 0; i--) {
            Header header = headersRequest.get(i);
            if (key.equalsIgnoreCase(header.getName())) {
                headersRequest.remove(i);
            }
        }

        // and add the new one
        headersRequest.add(new BasicHeader(key, value));
    }

    /**
     * Adds the header. No duplicate check is done here.
     * 
     * @param header
     *          the header; null is ignored
     */
    public void addHeader(Header header) {
        // a null entry would break the duplicate check of addHeader(String, String)
        if (header == null) {
            return;
        }
        headersRequest.add(header);
    }

    /**
     * Adds the headers. No duplicate check is done here.
     * 
     * @param headers
     *          the headers; null (the list itself as well as single entries) is ignored
     */
    public void addHeaders(List<Header> headers) {
        if (headers == null) {
            return;
        }
        for (Header header : headers) {
            if (header != null) {
                headersRequest.add(header);
            }
        }
    }

    /**
     * Gets the input stream. For "file:" urls the local file is opened directly; for everything else a blocking HTTP GET is executed and the complete
     * response body is buffered in memory.
     * 
     * @return the input stream, or null if the response had no entity or the request failed with an exception other than the ones listed below (such
     *         failures are only logged)
     * @throws IOException
     *           Signals that an I/O exception has occurred (local file not readable, request aborted/timed out, unknown host).
     */
    public InputStream getInputStream() throws IOException, InterruptedException {
        // workaround for local files
        if (url.startsWith("file:")) {
            return new FileInputStream(resolveLocalFile());
        }

        BasicHttpContext localContext = new BasicHttpContext();
        ByteArrayInputStream is = null;

        // replace our API keys for logging...
        String logUrl = url.replaceAll("api_key=\\w+", "api_key=<API_KEY>").replaceAll("api/\\d+\\w+",
                "api/<API_KEY>");
        LOGGER.debug("getting {}", logUrl);
        HttpGet httpget = new HttpGet(uri);
        RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(10000).setConnectTimeout(10000)
                .build();
        httpget.setConfig(requestConfig);

        // set custom headers
        for (Header header : headersRequest) {
            httpget.addHeader(header);
        }

        CloseableHttpResponse response = null;
        try {
            response = client.execute(httpget, localContext);
            headersResponse = response.getAllHeaders();
            entity = response.getEntity();
            responseStatus = response.getStatusLine();
            if (entity != null) {
                // buffer the whole body, so the connection can be released before the caller reads the data
                is = new ByteArrayInputStream(EntityUtils.toByteArray(entity));
            }
            EntityUtils.consume(entity);
        } catch (InterruptedIOException e) {
            LOGGER.info("aborted request ({}): {}", e.getMessage(), logUrl);
            throw e;
        } catch (UnknownHostException e) {
            LOGGER.error("proxy or host not found/reachable", e);
            throw e;
        } catch (Exception e) {
            // best effort: every other failure is logged only and signalled to the caller via a null return value
            LOGGER.error("Exception getting url " + logUrl, e);
        } finally {
            if (response != null) {
                response.close();
            }
        }
        return is;
    }

    /**
     * Resolves the local file a "file:" url points to.
     * 
     * @return the file denoted by the url
     */
    private File resolveLocalFile() {
        if (uri != null) {
            try {
                // handles file:/path as well as file:///path and results in an absolute path on every platform
                return new File(uri);
            } catch (IllegalArgumentException e) {
                // the URI is not convertible (e.g. it has a host, a query or a fragment) -> use the legacy handling below
                LOGGER.debug("could not convert the url to a file, using legacy handling: {}", e.getMessage());
            }
        }
        return new File(url.replace("file:/", ""));
    }

    /**
     * is the HTTP status code a 4xx/5xx?
     * 
     * @return true/false (false if there is no response yet)
     */
    public boolean isFault() {
        return responseStatus != null && responseStatus.getStatusCode() >= 400;
    }

    /**
     * http status code
     * 
     * @return the status code of the last response, or 0 if there is no response yet
     */
    public int getStatusCode() {
        return responseStatus != null ? responseStatus.getStatusCode() : 0;
    }

    /**
     * http status string
     * 
     * @return the status line of the last response, or an empty string if there is no response yet
     */
    public String getStatusLine() {
        return responseStatus != null ? responseStatus.toString() : "";
    }

    /**
     * Gets the bytes.
     * 
     * @return the bytes; an empty array if {@link #getInputStream()} did not deliver a stream (failed request or response without entity)
     * @throws IOException
     *           Signals that an I/O exception has occurred.
     */
    public byte[] getBytes() throws IOException, InterruptedException {
        InputStream is = getInputStream();
        if (is == null) {
            // getInputStream() already logged the reason; avoid a NullPointerException here
            return new byte[0];
        }
        try {
            return IOUtils.toByteArray(is);
        } finally {
            // always release the stream (matters for the FileInputStream of local files)
            is.close();
        }
    }

    /**
     * Gets the charset which is announced in the Content-Type of the response.
     * 
     * @return the charset; the default charset of the JVM if there is no response entity, no Content-Type, no charset parameter or the announced
     *         charset is illegal/unsupported
     */
    public Charset getCharset() {
        Charset charset = null;
        if (entity == null || entity.getContentType() == null) {
            return Charset.defaultCharset();
        }

        String contentType = entity.getContentType().getValue();
        if (contentType != null) {
            Matcher m = CHARSET_PATTERN.matcher(contentType);
            if (m.find()) {
                String encoding = m.group(1);
                try {
                    charset = Charset.forName(encoding);
                } catch (IllegalArgumentException e) {
                    // covers UnsupportedCharsetException as well as IllegalCharsetNameException (the name comes from the server and may be garbage);
                    // there will be used default charset
                }
            }
        }
        if (charset == null) {
            charset = Charset.defaultCharset();
        }

        return charset;
    }

    /**
     * Gets the content encoding.
     * 
     * @return the content encoding, or null if there is no response entity or it has no content encoding
     */
    public String getContentEncoding() {
        if (entity == null || entity.getContentEncoding() == null) {
            return null;
        }

        return entity.getContentEncoding().getValue();
    }

    /**
     * the number of bytes of the content, or a negative number if unknown. If the content length is known but exceeds Long.MAX_VALUE, a negative number
     * is returned.
     * 
     * @return the content length (-1 if there is no response entity)
     */
    public long getContentLength() {
        if (entity == null) {
            return -1;
        }

        return entity.getContentLength();
    }

    /**
     * @return the url as it has been passed to the constructor
     */
    @Override
    public String toString() {
        return url;
    }
}