Java tutorial
/*
 * Copyright 2013-2014 OpenSextant.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.opensextant.xtext.collectors.web;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;
import org.opensextant.ConfigException;
import org.opensextant.util.FileUtility;
import org.opensextant.xtext.XText;
import org.opensextant.xtext.collectors.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Simple client that pulls down HTML from a web site, acquires files, and crawls sub-folders.
 * This is not a generalized web crawler. It specifically looks for meaningful content, such as
 * HTML pages, document downloads, etc.
 */
public class WebClient {

    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * Prep URL. This ensures that found URLs that may contain whitespace
     * are properly converted to proper URL format/escaping.
     *
     * @param u URL string
     * @return URL object
     * @throws MalformedURLException the malformed URL exception
     */
    public static URL prepURL(String u) throws MalformedURLException {
        /*
         * TODO: require the outside caller to encode the URL properly.
         * For now, whitespace is the only main issue.
         */
        String encoded = u.replaceAll(" ", "%20");
        return new URL(encoded);
    }

    /**
     * Prep URL path.
     *
     * @param u URL string
     * @return the escaped URL path
     * @throws MalformedURLException the malformed URL exception
     */
    public static String prepURLPath(String u) throws MalformedURLException {
        /*
         * TODO: require the outside caller to encode the URL properly.
         * For now, whitespace is the only main issue.
         */
        return u.replaceAll(" ", "%20");
    }

    /**
     * Instantiates a new web client.
     *
     * @param siteUrl the URL to collect.
     * @param archive the destination archive. Keep in mind, this is the location of downloaded originals.
     *                Use an XText instance to manage where/how you convert those originals.
     * @throws MalformedURLException if the URL given is bad
     * @throws ConfigException the config exception
     */
    public WebClient(String siteUrl, String archive) throws MalformedURLException, ConfigException {
        setSite(siteUrl);
        archiveRoot = archive;
    }

    /** The archive root. */
    protected String archiveRoot = null;
    private String proxy = null;
    /** The server. */
    protected String server = null;
    /** The site. */
    protected URL site = null;
    /** The proxy host. */
    protected HttpHost proxyHost = null;
    /** The interval. */
    protected int interval = 100; // milliseconds wait between web requests.
    /** The converter. */
    protected XText converter = null;

    /**
     * Configure.
     *
     * @throws ConfigException the config exception
     */
    public void configure() throws ConfigException {
        // Test if the site exists and is reachable
        testAvailability();

        // Test if your destination archive exists
        if (archiveRoot != null) {
            File test = new File(archiveRoot);
            if (!(test.isDirectory() && test.exists())) {
                throw new ConfigException(
                        "Destination archive does not exist. Caller must create it prior to collection.");
            }
        }
    }

    /**
     * Caller should construct their own conversionManager and pass that in.
     * NOTE: since the web client can operate without an instance of XText, e.g., just run a crawl with no
     * conversion, the WebClient constructor takes an archive path. As you pass in a conversion manager here,
     * make sure that the archive root there matches what is used here in the WebClient. If you are using
     * XText in embedded mode, then do not worry; the archive is ignored.
     *
     * @param conversionManager converter, an XText instance
     */
    public void setConverter(XText conversionManager) {
        converter = conversionManager;
    }

    /**
     * Creates the archive file.
     *
     * @param relpath relative path for this object
     * @param isDir   whether the item is a directory
     * @return full path
     * @throws IOException on I/O error
     */
    protected File createArchiveFile(String relpath, boolean isDir) throws IOException {
        String itemArchivedPath = archiveRoot + Collector.PATH_SEP + relpath;
        File itemSaved = new File(itemArchivedPath.replaceAll("//", "/"));
        if (isDir) {
            FileUtility.makeDirectory(itemSaved);
        } else {
            itemSaved.getParentFile().mkdirs();
        }
        return itemSaved;
    }

    /** Distinct links found during the crawl. */
    protected Map<String, HyperLink> found = new HashMap<String, HyperLink>();

    /** Items tracked/saved in this session. */
    protected Set<String> saved = new HashSet<String>();

    /**
     * Current depth of the crawl at any time.
     */
    protected int depth = 0;

    /**
     * Maximum number of levels that will be crawled.
     */
    public final static int MAX_DEPTH = 5;

    /**
     * Allow a proxy host to be set given the URL.
     * Assumes port 80, no user/password.
     *
     * @param hosturl proxy URL
     */
    public void setProxy(String hosturl) {
        proxy = hosturl;
        int port = 80;
        String host = proxy;
        if (proxy.contains(":")) {
            String[] hp = proxy.split(":");
            host = hp[0];
            port = Integer.parseInt(hp[1]);
        }
        proxyHost = new HttpHost(host, port);
    }

    public void setProxy(String h, int port) {
        proxyHost = new HttpHost(h, port);
    }

    boolean useSystemProperties = false;

    /**
     * @param b flag to enable use of System Properties to get proxy settings, etc.
     */
    public void enableSystemProperties(boolean b) {
        this.useSystemProperties = b;
    }

    /**
     * Sets the site.
     *
     * @param url the new site
     * @throws MalformedURLException the malformed URL exception
     */
    public void setSite(String url) throws MalformedURLException {
        site = new URL(url);
        server = new URL(url).getHost();
    }

    /**
     * Gets the site.
     *
     * @return the URL object
     */
    public URL getSite() {
        return site;
    }

    /**
     * Gets the server.
     *
     * @return server hostname
     */
    public String getServer() {
        return server;
    }

    /**
     * TODO: Update to use the HTTP client "HttpClients....build()" method of creating and tailoring an
     * HttpClient using the proxy and cookie settings, as well as any other tuning.
     *
     * Override if your context requires a different style of HTTP client.
     *
     * @return HttpClient 4.x object
     */
    public HttpClient getClient() {
        HttpClientBuilder clientHelper = null;

        if (this.useSystemProperties) {
            clientHelper = HttpClientBuilder.create().useSystemProperties();
        } else {
            clientHelper = HttpClientBuilder.create();
            if (proxyHost != null) {
                clientHelper.setProxy(proxyHost);
            }
        }

        RequestConfig globalConfig = RequestConfig.custom()
                .setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY).build();

        HttpClient httpClient = clientHelper.setDefaultRequestConfig(globalConfig).build();

        return httpClient;
    }

    /**
     * Tests the availability of the currently configured source.
     *
     * @throws ConfigException error which means the resource is unavailable.
     */
    public void testAvailability() throws ConfigException {
        if (site == null) {
            throw new ConfigException("Engineering Error: site was not set.");
        }

        try {
            getPage(site);
            return;
        } catch (Exception err) {
            throw new ConfigException(String.format("%s failed to collect URL %s", getName(), site), err);
        }
    }

    /**
     * Clears state of the crawl.
     */
    public void reset() {
        // Clear list of distinct items found
        this.found.clear();
        // Clear list of items tracked/saved in this session.
        this.saved.clear();
    }

    /**
     * Sets the interval.
     *
     * @param i interval
     */
    public void setInterval(int i) {
        interval = i;
    }

    /**
     * Pause.
     */
    protected void pause() {
        if (interval > 0) {
            try {
                Thread.sleep(interval);
            } catch (Exception err) {
            }
        }
    }

    /**
     * Get a web page that requires NTLM authentication.
     *
     * @param siteURL URL
     * @return response for the URL
     * @throws IOException on error
     */
    public HttpResponse getPage(URL siteURL) throws IOException {
        HttpClient httpClient = getClient();
        HttpGet httpget = new HttpGet();

        try {
            URI address = siteURL.toURI();
            httpget.setURI(address);
            HttpResponse response = httpClient.execute(httpget);

            if (response.getStatusLine().getStatusCode() == 404) {
                throw new IOException("HTTP Page " + siteURL + " not found");
            }
            return response;
        } catch (URISyntaxException ioerr) {
            throw new IOException(ioerr);
        }
    }

    private static final Pattern HREF_MATCH = Pattern.compile("href=[\"']([^\"']+)[\"']",
            Pattern.CASE_INSENSITIVE);

    /**
     * Recursively parse a site page, limiting the crawl to local items
     * contained within the current folder/page.
     * This finds only obvious HREF anchors and filters out problematic ones:
     *
     * <pre>
     *  "/"
     *  "../xxxxxxx/"
     *  "#"
     *  "javascript:xxxxxx"
     * </pre>
     *
     * TODO: pass in or set an allow filter. Sometimes the caller knows which content is worth
     * following, e.g., ../abc_folder/morecontent.htm, and such URLs should be resolved absolutely
     * to avoid re-capturing them repeatedly.
     *
     * @param html    HTML text buffer
     * @param pageUrl the page URL
     * @param siteUrl the site URL
     * @return a list of found links
     */
    public Collection<HyperLink> parseContentPage(String html, URL pageUrl, URL siteUrl) {
        Map<String, HyperLink> contentLinks = new HashMap<String, HyperLink>();
        Matcher matches = HREF_MATCH.matcher(html);
        while (matches.find()) {
            String link = matches.group(1).trim();
            String link_lc = link.toLowerCase();
            if ("/".equals(link) || "#".equals(link)) {
                continue;
            }
            if (link_lc.startsWith("#") || link_lc.startsWith("javascript")) {
                continue;
            }
            if (link_lc.startsWith("mailto:")) {
                log.info("Ignore Mailto {}", link_lc);
                continue;
            }
            if (link.endsWith("/")) {
                link = link.substring(0, link.length() - 1);
            }
            try {
                HyperLink l = new HyperLink(link, pageUrl, siteUrl);
                if (l.isResource()) {
                    continue;
                }
                if (!contentLinks.containsKey(l.toString())) {
                    log.debug("Found link {}", link);
                    contentLinks.put(l.toString(), l);
                }
            } catch (Exception err) {
                log.error("Failed to parse URL {}", link, err);
            }
        }

        return contentLinks.values();
    }

    /**
     * Reads a data stream as text using the default encoding.
     * TODO: test reading website content with different charset encodings to see if the resulting String
     * is properly decoded.
     *
     * @param io IO stream
     * @return content of the stream
     * @throws IOException I/O error
     */
    public static String readTextStream(InputStream io) throws IOException {
        Reader reader = new InputStreamReader(io);
        StringWriter buf = new StringWriter();

        int ch;
        while ((ch = reader.read()) >= 0) {
            buf.write(ch);
        }
        reader.close();
        io.close();

        return buf.toString();
    }

    /**
     * Reads an HttpEntity object, saving it to the path.
     *
     * REF: http://stackoverflow.com/questions/10960409/how-do-i-save-a-file-
     * downloaded-with-httpclient-into-a-specific-folder
     *
     * @param entity   HTTP entity object
     * @param destPath output path
     * @throws IOException Signals that an I/O exception has occurred.
     */
    public static void downloadFile(HttpEntity entity, String destPath) throws IOException {
        // Close the output stream when done so the file handle is not leaked.
        try (FileOutputStream out = new FileOutputStream(destPath)) {
            org.apache.commons.io.IOUtils.copy(entity.getContent(), out);
        }
    }

    private String name = "Unnamed Web crawler";

    /**
     * Set a name for this client for tracking purposes, e.g., in multiple threads.
     *
     * @param n the new name
     */
    public void setName(String n) {
        name = n;
    }

    /**
     * Get name of client.
     *
     * @return the name
     */
    public String getName() {
        return name;
    }
}
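
For orientation, here is a minimal usage sketch of the class above. It is not part of the original source: the site URL http://example.org/docs/ and the archive path /tmp/crawl-archive are placeholders, and the archive directory must already exist, since configure() checks for it rather than creating it. It fetches the landing page, reads the HTML with readTextStream, and lists the links that parseContentPage would consider worth following.

package org.opensextant.xtext.collectors.web;

import java.net.URL;
import java.util.Collection;

import org.apache.http.HttpResponse;

public class WebClientDemo {
    public static void main(String[] args) throws Exception {
        // Placeholder site and archive location; adjust to your environment.
        WebClient client = new WebClient("http://example.org/docs/", "/tmp/crawl-archive");
        client.setName("demo-crawler");
        client.setInterval(250);   // be polite: wait 250 ms between requests
        client.configure();        // verifies the site is reachable and the archive exists

        // Fetch the landing page and extract candidate links to follow.
        URL page = client.getSite();
        HttpResponse response = client.getPage(page);
        String html = WebClient.readTextStream(response.getEntity().getContent());

        Collection<HyperLink> links = client.parseContentPage(html, page, client.getSite());
        for (HyperLink link : links) {
            System.out.println("Found: " + link);
        }
    }
}

A real crawl would then loop over the returned HyperLink objects, download each with getPage/downloadFile into paths built by createArchiveFile, and recurse up to MAX_DEPTH, pausing between requests with pause().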