org.opensextant.xtext.collectors.web.WebClient.java Source code

Introduction

Here is the source code for org.opensextant.xtext.collectors.web.WebClient.java, a simple client that pulls HTML pages and document downloads from a web site. As its own documentation notes, it is not a general-purpose web crawler.

Source

/**
 *
 * Copyright 2013-2014 OpenSextant.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.opensextant.xtext.collectors.web;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.XText;
import org.opensextant.xtext.collectors.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Simple client that pulls down HTML from a web site, acquires files, and crawls sub-folders.
 * This is not a generalized web crawler; it specifically looks for meaningful content, such as HTML pages
 * and document downloads.
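 * <p>
 * A minimal usage sketch (hypothetical site URL and archive path; the archive directory must already exist):
 * <pre>
 * WebClient client = new WebClient("http://example.org/docs/", "/tmp/archive");
 * client.configure();  // verifies the site is reachable and the archive exists
 * HttpResponse page = client.getPage(client.getSite());
 * String html = WebClient.readTextStream(page.getEntity().getContent());
 * Collection&lt;HyperLink&gt; links = client.parseContentPage(html, client.getSite(), client.getSite());
 * </pre>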
 */
public class WebClient {

    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * Prep URL. This ensures that discovered URLs that may contain whitespace
     * are properly escaped into valid URL form.
     *
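     * A one-line sketch of the effect:
     * <pre>
     * URL u = WebClient.prepURL("http://host/a b.pdf"); // yields http://host/a%20b.pdf
     * </pre>
     *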
     * @param u
     *            URL string
     * @return URL object
     * @throws MalformedURLException
     *             the malformed url exception
     */
    public static URL prepURL(String u) throws MalformedURLException {
        /*
         * TODO: require the outside caller to encode the URL properly.
         * For now, whitespace is the only issue handled.
         */
        String encoded = u.replaceAll(" ", "%20");
        return new URL(encoded);
    }

    /**
     * Prep URL path. Escapes whitespace in a URL or path string.
     *
     * @param u
     *            URL or path string
     * @return the string with whitespace escaped
     * @throws MalformedURLException
     *             the malformed url exception
     */
    public static String prepURLPath(String u) throws MalformedURLException {
        /*
         * TODO: require the outside caller to encode the URL properly.
         * For now, whitespace is the only issue handled.
         */
        return u.replaceAll(" ", "%20");
    }

    /**
     * Instantiates a new web client.
     *
     * @param siteUrl
     *            the url to collect.
     * @param archive
     *            the destination archive. Keep in mind, this is the location of downloaded originals.
     *            Use Xtext instance to manage where/how you convert those originals.
     * @throws MalformedURLException
     *             if URL given is bad
     * @throws ConfigException
     *             the config exception
     */
    public WebClient(String siteUrl, String archive) throws MalformedURLException, ConfigException {
        setSite(siteUrl);
        archiveRoot = archive;
    }

    /** The archive root. */
    protected String archiveRoot = null;
    private String proxy = null;

    /** The server. */
    protected String server = null;

    /** The site. */
    protected URL site = null;

    /** The proxy host. */
    protected HttpHost proxyHost = null;

    /** The interval. */
    protected int interval = 100; // milliseconds wait between web requests.

    /** The converter. */
    protected XText converter = null;

    /**
     * Configure.
     *
     * @throws ConfigException
     *             the config exception
     */
    public void configure() throws ConfigException {
        // Test if the site exists and is reachable
        testAvailability();

        // Test if your destination archive exists
        if (archiveRoot != null) {
            File test = new File(archiveRoot);
            if (!(test.isDirectory() && test.exists())) {
                throw new ConfigException(
                        "Destination archive does not exist. Caller must create it prior to collection.");
            }
        }
    }

    /**
     * Caller should construct their own conversionManager and pass that in.
     * NOTE: since the web client can operate without an instance of XText (e.g., just run a crawl with no
     * conversion), the WebClient constructor takes an archive path. If you pass in a conversion manager here,
     * make sure that its archive root matches the one used by this WebClient. If you are using XText in
     * embedded mode, do not worry: the archive is ignored.
     *
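     * A sketch of wiring in a converter (assumes XText's no-argument constructor; conversion settings are
     * configured on the XText instance itself and are omitted here):
     * <pre>
     * XText xtext = new XText();
     * client.setConverter(xtext);
     * </pre>
     *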
     * @param conversionManager
     *            converter, an XText instance
     */
    public void setConverter(XText conversionManager) {
        converter = conversionManager;
    }

    /**
     * Creates the archive file.
     *
     * @param relpath
     *            relative path for this object
     * @param isDir
     *            true if the relative path denotes a directory
     * @return full path
     * @throws IOException
     *             on I/O error
     */
    protected File createArchiveFile(String relpath, boolean isDir) throws IOException {
        String itemArchivedPath = archiveRoot + Collector.PATH_SEP + relpath;
        File itemSaved = new File(itemArchivedPath.replaceAll("//", "/"));
        if (isDir) {
            FileUtility.makeDirectory(itemSaved);
        } else {
            itemSaved.getParentFile().mkdirs();
        }
        return itemSaved;
    }

    /** Distinct links found during the crawl, keyed by link string. */
    protected Map<String, HyperLink> found = new HashMap<String, HyperLink>();

    /** Links saved in this session. */
    protected Set<String> saved = new HashSet<String>();

    /**
     * current depth of the crawl at any time.
     */
    protected int depth = 0;

    /**
     * Maximum number of levels that will be crawled.
     */
    public final static int MAX_DEPTH = 5;

    /**
     * Allow a proxy host to be set given the URL.
     * Assumes port 80, no user/password.
     *
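     * For example (hypothetical proxy host):
     * <pre>
     * client.setProxy("proxy.example.com:8080"); // "host:port" form; bare "host" assumes port 80
     * </pre>
     *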
     * @param hosturl
     *            proxy, either "host" or "host:port"
     */
    public void setProxy(String hosturl) {
        proxy = hosturl;
        int port = 80;
        String host = proxy;
        if (proxy.contains(":")) {
            String[] hp = proxy.split(":");
            host = hp[0];
            port = Integer.parseInt(hp[1]);
        }
        proxyHost = new HttpHost(host, port);
    }

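    /**
     * Sets the proxy with an explicit host and port.
     *
     * @param h
     *            proxy hostname
     * @param port
     *            proxy port
     */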
    public void setProxy(String h, int port) {
        proxyHost = new HttpHost(h, port);
    }

    boolean useSystemProperties = false;

    /**
     * @param b flag to enable use of System Properties to get proxy settings, etc.
     */
    public void enableSystemProperties(boolean b) {
        this.useSystemProperties = b;
    }

    /**
     * Sets the site.
     *
     * @param url
     *            the new site
     * @throws MalformedURLException
     *             the malformed url exception
     */
    public void setSite(String url) throws MalformedURLException {
        site = new URL(url);
        server = site.getHost();
    }

    /**
     * Gets the site.
     *
     * @return the URL object
     */
    public URL getSite() {
        return site;
    }

    /**
     * Gets the server.
     *
     * @return server hostname
     */
    public String getServer() {
        return server;
    }

    /**
     * TODO: Update to use the HttpClients builder ("HttpClients.custom()...build()") to create and tailor the
     * HttpClient, applying the proxy and cookie settings as well as any other tuning.
     *
     * Override if your context requires a different style of HTTP client.
     * 
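     * A hypothetical sketch of that TODO, using the fluent builder with the same proxy and cookie settings:
     * <pre>
     * CloseableHttpClient client = HttpClients.custom()
     *         .setProxy(proxyHost)
     *         .setDefaultRequestConfig(RequestConfig.custom()
     *                 .setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY).build())
     *         .build();
     * </pre>
     *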
     * @return HttpClient 4.x object
     */
    public HttpClient getClient() {
        HttpClientBuilder clientHelper = null;

        if (this.useSystemProperties) {
            clientHelper = HttpClientBuilder.create().useSystemProperties();
        } else {
            clientHelper = HttpClientBuilder.create();
            if (proxyHost != null) {
                clientHelper.setProxy(proxyHost);
            }
        }

        RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY)
                .build();

        HttpClient httpClient = clientHelper.setDefaultRequestConfig(globalConfig).build();

        return httpClient;
    }

    /**
     * Tests the availability of the currently configured source.
     *
     * @throws ConfigException
     *             if the resource is unavailable
     */
    public void testAvailability() throws ConfigException {

        if (site == null) {
            throw new ConfigException("Engineering Error: site was not set.");
        }

        try {
            getPage(site);
            return;
        } catch (Exception err) {
            throw new ConfigException(String.format("%s failed to collect URL %s", getName(), site), err);
        }
    }

    /**
     * Clears the state of the crawl.
     */
    public void reset() {
        // Clear list of distinct items found
        this.found.clear();
        // Clear list of items tracked/saved in this session.
        this.saved.clear();
    }

    /**
     * Sets the interval.
     *
     * @param i
     *            interval in milliseconds between web requests
     */
    public void setInterval(int i) {
        interval = i;
    }

    /**
     * Pause between requests for the configured interval.
     */
    protected void pause() {
        if (interval > 0) {
            try {
                Thread.sleep(interval);
            } catch (Exception err) {
                // Interruption is ignored; the crawl resumes immediately.
            }
        }
    }

    /**
     * Get a web page using a plain HTTP GET with the configured client; no authentication is attempted.
     *
     * @param siteURL
     *            URL
     * @return response for the URL
     * @throws IOException
     *             on error
     */
    public HttpResponse getPage(URL siteURL) throws IOException {
        HttpClient httpClient = getClient();
        HttpGet httpget = new HttpGet();

        try {
            URI address = siteURL.toURI();
            httpget.setURI(address);
            HttpResponse response = httpClient.execute(httpget);

            if (response.getStatusLine().getStatusCode() == 404) {
                throw new IOException("HTTP Page " + siteURL + " not found");
            }

            return response;
        } catch (URISyntaxException ioerr) {
            throw new IOException(ioerr);
        }
    }

    private static final Pattern HREF_MATCH = Pattern.compile("href=[\"']([^\"']+)[\"']", Pattern.CASE_INSENSITIVE);

    /**
     * Parses a site page as part of the recursive crawl, limiting the crawl to local items
     * contained within the current folder/page.
     * This finds only obvious HREF anchors and filters out problematic ones:
     * 
     * <pre>
     *  "/"
     *  "../xxxxxxx/"
     *  "#"
     *  "javascript:xxxxxx"
     * </pre>
     * 
     * TODO: pass in or set an allow-filter. Sometimes the caller knows which content is worth
     * following, e.g., ../abc_folder/morecontent.htm, and such URLs should be resolved to absolute form to
     * avoid repeated recapture.
     *
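     * A usage sketch (html, pageUrl, and siteUrl obtained by the caller):
     * <pre>
     * for (HyperLink l : client.parseContentPage(html, pageUrl, siteUrl)) {
     *     System.out.println(l);
     * }
     * </pre>
     *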
     * @param html
     *            HTML text buffer
     * @param pageUrl
     *            the page url
     * @param siteUrl
     *            the site url
     * @return a list of found links
     */
    public Collection<HyperLink> parseContentPage(String html, URL pageUrl, URL siteUrl) {
        Map<String, HyperLink> contentLinks = new HashMap<String, HyperLink>();
        Matcher matches = HREF_MATCH.matcher(html);
        while (matches.find()) {
            String link = matches.group(1).trim();
            String link_lc = link.toLowerCase();

            if ("/".equals(link) || "#".equals(link)) {
                continue;
            }
            if (link_lc.startsWith("#") || link_lc.startsWith("javascript")) {
                continue;
            }
            if (link_lc.startsWith("mailto:")) {
                log.info("Ignore Mailto {}", link_lc);
                continue;
            }

            if (link.endsWith("/")) {
                link = link.substring(0, link.length() - 1);
            }

            try {
                HyperLink l = new HyperLink(link, pageUrl, siteUrl);
                if (l.isResource()) {
                    continue;
                }
                if (!contentLinks.containsKey(l.toString())) {
                    log.debug("Found link {}", link);
                    contentLinks.put(l.toString(), l);
                }
            } catch (Exception err) {
                log.error("Failed to parse URL {}", link, err);
            }

        }

        return contentLinks.values();
    }

    /**
     * Reads a data stream as text, using the platform default encoding.
     * TODO: test reading website content with different charset encodings to see if the resulting String
     * is properly decoded.
     *
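     * An alternative sketch with an explicit charset, using commons-io (already a dependency of downloadFile):
     * <pre>
     * String text = org.apache.commons.io.IOUtils.toString(io, "UTF-8");
     * </pre>
     *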
     * @param io
     *            IO stream
     * @return content of the stream
     * @throws IOException
     *             I/O error
     */
    public static String readTextStream(InputStream io) throws IOException {
        Reader reader = new InputStreamReader(io);
        StringWriter buf = new StringWriter();

        int ch;
        while ((ch = reader.read()) >= 0) {
            buf.write(ch);
        }
        reader.close();
        io.close();

        return buf.toString();
    }

    /**
     * Reads an HttpEntity object, saving it to the path
     * 
     * REF: http://stackoverflow.com/questions/10960409/how-do-i-save-a-file-
     * downloaded-with-httpclient-into-a-specific-folder
     *
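     * A usage sketch (hypothetical URL and destination path):
     * <pre>
     * HttpResponse resp = client.getPage(new URL("http://example.org/report.pdf"));
     * WebClient.downloadFile(resp.getEntity(), "/tmp/archive/report.pdf");
     * </pre>
     *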
     * @param entity
     *            http entity obj
     * @param destPath
     *            output path
     * @throws IOException
     *             Signals that an I/O exception has occurred.
     */
    public static void downloadFile(HttpEntity entity, String destPath) throws IOException {
        try (InputStream in = entity.getContent();
                FileOutputStream out = new FileOutputStream(destPath)) {
            org.apache.commons.io.IOUtils.copy(in, out);
        }
    }

    private String name = "Unnamed Web crawler";

    /**
     * Set a name for this client, for tracking purposes, e.g., in multiple threads.
     *
     * @param n
     *            the new name
     */
    public void setName(String n) {
        name = n;
    }

    /**
     * Get name of client
     *
     * @return the name
     */
    public String getName() {
        return name;
    }
}