// Java tutorial
/**
 *
 * Copyright 2013-2014 OpenSextant.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.opensextant.xtext.collectors.web;

import java.io.File;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Properties;
import java.util.Set;

import org.apache.commons.io.FilenameUtils;
import static org.apache.commons.lang3.StringUtils.isBlank;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.opensextant.util.FileUtility;
import org.opensextant.util.TextUtils;

/**
 * A representation of a harvested hyperlink. Normalization of found URL attempts to derive:
 * <ul>
 * <li>is item a file or dynamic, generated HTML?</li>
 * <li>is item a folder or a page?</li>
 * <li>what is the relation between this page and its containing folder and hosting site? Is this link
 * resident hosted on the originally crawled site?</li>
 * <li>What is the proper file extension for a found link? A link itself does not always reflect the MIME Type and file
 * "save-as" filename...
 * </li>
 * </ul>
 *
 * @author ubaldino
 */
public class HyperLink {

    /** URL string; replaced by the resolved absolute form when the found link was relative. */
    protected String urlValue = null;

    /** The link exactly as found. */
    protected String urlNominal = null;

    /** URL of the page on which this link was found. */
    protected URL referrerURL = null;

    /** The link resolved to an absolute URL. */
    protected URL absoluteURL = null;

    /** The top-level site URL given at construction. */
    protected URL siteURL = null;

    /** True if the found link started with {@code http:} or {@code https:}. */
    protected boolean isAbsolute = false;

    /** Query parameters, filled by {@link #parseURL()}. */
    protected Properties params = new Properties();

    /** True if the link lives under the referrer's folder (only evaluated for links on the current site). */
    protected boolean isCurrentPage = false;

    /** True if the absolute link starts with the site's folder prefix. */
    protected boolean isCurrentSite = false;

    /** True if link host and site host are equal, ignoring case. */
    protected boolean isCurrentHost = false;

    /** String form of {@link #siteURL}. */
    protected String siteValue = null;

    /** File to which content is archived; see {@link #setFilepath(File)}. */
    protected File archiveFile = null;

    /** Extension of the lower-cased URL path. */
    protected String pathExtension = null;

    /** Extension of {@link #normalizedPath}; null while the link is considered a folder. */
    protected String archiveFileExtension = null;

    /** MIME type as set by {@link #setMIMEType(String)}. */
    protected String mimeType = null;

    /** True if the link is considered a folder rather than a file/page. */
    protected boolean isFolder = false;

    /** Query portion of the absolute URL, possibly null. */
    protected String query = null;

    /** Folder portion of {@link #normalizedPath}. */
    protected String directory = null;

    /** Set when the URL has a query and no common file name was found among its parameter values. */
    private boolean isDynamic = false;

    /** Identifier computed from the absolute URL. */
    private String linkId = null;

    /**
     * a physical path that represents the URL uniquely.
     */
    protected String normalizedPath = null;

    /**
     * URL wrangling, mainly to take a found URL and adapt it so it looks like a file path safe for a file system.
     *
     * @param link
     *            found link
     * @param referringLink
     *            - Normalized, absolute URL string
     * @param site
     *            top level site
     * @throws MalformedURLException
     *             on err
     * @throws NoSuchAlgorithmException
     *             on err
     * @throws UnsupportedEncodingException
     *             on err, when URL contains poorly encoded characters
     */
    public HyperLink(String link, URL referringLink, URL site)
            throws MalformedURLException, NoSuchAlgorithmException, UnsupportedEncodingException {

        urlValue = link;
        urlNominal = link;
        siteURL = site;
        siteValue = site.toString();
        referrerURL = referringLink;

        String url_lc = urlNominal.toLowerCase();
        String site_lc = siteValue.toLowerCase();
        String base_lc = referrerURL.toString().toLowerCase();

        isAbsolute = (url_lc.startsWith("http:") || url_lc.startsWith("https:"));
        if (!isAbsolute) {
            absoluteURL = new URL(referrerURL, urlValue);
            urlValue = absoluteURL.toString();
        } else {
            absoluteURL = new URL(urlValue);
        }

        // Use this to represent the object identity.
        linkId = TextUtils.text_id(getAbsoluteURL());

        query = absoluteURL.getQuery();
        String urlPath = absoluteURL.getPath().toLowerCase();
        pathExtension = FilenameUtils.getExtension(urlPath);
        isFolder = isFolder(url_lc, pathExtension);

        // If referrer, e.g. page containing this link is a folder or file, detect that.
        // "/a/b/c" is a folder but ensure referrer is tracked as "/a/b/c/" with trailing slash below.
        // Otherwise, url is a page.
        // A referrer with no path at all ("http://host") is a folder; without this check the
        // extension lookup on the whole URL yields the host's TLD (e.g. "com") and the referrer
        // is mistaken for a file.
        String referrerPath = referrerURL.getPath();
        String referrerExt = FilenameUtils.getExtension(base_lc);
        boolean isReferrerFolder = isBlank(referrerPath) || isFolder(referrerPath, referrerExt);

        String abs_lc = absoluteURL.toString().toLowerCase();

        String path = absoluteURL.getPath();
        if (isBlank(path)) {
            normalizedPath = "./";
            isFolder = true;
        } else {
            normalizedPath = path;
            if (normalizedPath.endsWith("/")) {
                normalizedPath = normalizedPath.substring(0, normalizedPath.length() - 1);
            }
        }

        // Optional: a query string may provide a better file name.
        boolean derivedPath = deriveFilepathFromQuery();
        if (!derivedPath) {
            String p = FilenameUtils.normalize(normalizedPath);
            if (p == null) {
                throw new MalformedURLException("Unable to parse/normalize path for: " + normalizedPath);
            }
            normalizedPath = p;
        }

        if (isFolder) {
            directory = new File(normalizedPath).getPath();
        } else {
            directory = new File(normalizedPath).getParent();
        }
        if (directory == null) {
            directory = path;
        }

        if (!isFolder) {
            archiveFileExtension = FilenameUtils.getExtension(normalizedPath);
        }

        // If base/referring page is a directory see if it is in same folder
        // as current link
        String dirB = base_lc;
        if (isReferrerFolder) {
            if (!dirB.endsWith("/")) {
                dirB = dirB + "/";
            }
        } else {
            int b = base_lc.lastIndexOf('/');
            if (b >= 0) {
                dirB = base_lc.substring(0, b);
            }
        }

        // Folder prefix of the site. For a host-only site URL ("http://host") the last '/'
        // belongs to "http://", so cutting there would leave "http:/" and match every link.
        String siteDir = site_lc;
        if (!isBlank(siteURL.getPath())) {
            int s = site_lc.lastIndexOf('/');
            if (s >= 0) {
                siteDir = site_lc.substring(0, s);
            }
        }

        isCurrentSite = abs_lc.startsWith(siteDir);
        if (isCurrentSite) {
            if (isFolder) {
                isCurrentPage = abs_lc.startsWith(dirB);
            } else {
                int a = abs_lc.lastIndexOf('/');
                String dirA = abs_lc.substring(0, a) + "/";
                isCurrentPage = dirA.startsWith(dirB);
            }
        }

        String linkHost = absoluteURL.getHost();
        String siteHost = siteURL.getHost();
        isCurrentHost = linkHost.equalsIgnoreCase(siteHost);
    }

    /**
     * get the generated link ID
     *
     * @return the id
     */
    public String getId() {
        return linkId;
    }

    /**
     * A URL is treated as a folder if it ends with "." or "/", or if it has no file extension.
     *
     * @param url
     *            URL or path text
     * @param ext
     *            extension found for that URL, possibly blank
     * @return true if the URL looks like a folder
     */
    private boolean isFolder(String url, String ext) {
        return url.endsWith(".") || url.endsWith("/") || isBlank(ext);
    }

    /**
     * Given a URL a.b/path?param=val&amp;param=val....
     * Derive any meaningful filename from param values in the query.
     * On success {@link #normalizedPath} is extended and the link is no longer a folder.
     *
     * @return true, if successful
     */
    private boolean deriveFilepathFromQuery() {
        if (isBlank(query)) {
            return false;
        }

        /*
         * Obscure means for identifying a better file name + extension
         * under which we save this content.
         */
        isDynamic = true;
        parseURL();
        for (Object p : params.keySet()) {
            String val = params.getProperty(p.toString());
            if (val != null && val.length() > 8 && isCommonFile(val)) {
                // NOTE(review): val comes straight from the URL query and is appended to the
                // save path without normalization (the caller skips FilenameUtils.normalize for
                // derived paths). Values containing "../" should be reviewed for path traversal.
                normalizedPath = String.format("%s/%s", normalizedPath, val);
                isDynamic = false;
                isFolder = false;
                return true;
            }
        }

        /* We have a query, but no other means of naming the file, so we'll use
         * current path + text ID of the query as file name + '.html'
         */
        try {
            normalizedPath = String.format("%s/%s.html", normalizedPath, TextUtils.text_id(query));
            isFolder = false;
            return true;
        } catch (Exception ignore) {
            // NOTE: this never happens. The same ID routine already succeeded on the full URL.
        }
        // And this would also never happen.
        return false;
    }

    /** The default MIME type repository from Tika. */
    private static final MimeTypes defaultMIME = TikaConfig.getDefaultConfig().getMimeRepository();

    /**
     * Set the MIME type of a found link, i.e., once you've downloaded the content you then know the ContentType
     * possibly. Which may differ from your perception of the URL path
     *
     * - reset the file extension,
     * - reset the path
     * - folder vs. file
     *
     * Set the MIME Type, file type, path, etc... prior to saving content to disk.
     *
     * @param t
     *            the new MIME type
     */
    public void setMIMEType(String t) {
        mimeType = t;
        if (mimeType == null) {
            return;
        }

        try {
            /* Isolate the MIME type without parameters; trim so that
             * "text/html ; charset=..." still resolves.
             */
            String bareType = t.split(";", 2)[0].trim();
            MimeType mt = defaultMIME.forName(bareType);
            if (mt != null) {
                fixPathExtension(mt.getExtension());
            }
        } catch (MimeTypeException ignore) {
            // Unrecognized or malformed MIME type: keep path and extension as they are.
        }
    }

    /** Pairs of file extensions treated as the same file type. */
    private static final HashMap<String, String> mimeEquivalences = new HashMap<>();

    static {
        mimeEquivalences.put("htm", "html");
        mimeEquivalences.put("html", "htm");
        mimeEquivalences.put("jpg", "jpeg");
        mimeEquivalences.put("jpeg", "jpg");
    }

    /**
     * Not comparing any null values.
     *
     * Consider if b='x' and a='y', are a and b like MIME types.
     * example: .html ?= .htm
     *
     * @param a
     *            a string
     * @param b
     *            a string
     * @return true, if successful
     */
    private static boolean equivalentFileType(String a, String b) {
        if (isBlank(a)) {
            return false;
        }
        if (a.equals(b)) {
            return true;
        }
        String a1 = mimeEquivalences.get(a);
        if (a1 != null) {
            return a1.equals(b);
        }
        String b1 = mimeEquivalences.get(b);
        if (b1 != null) {
            return b1.equals(a);
        }
        return false;
    }

    /**
     * set the path extension, IFF it is significantly different.
     * The link is marked as a file (not a folder) whenever the extension is changed.
     *
     * @param mimeExt
     *            the mime extension, with or without dots
     */
    private void fixPathExtension(String mimeExt) {
        if (isBlank(mimeExt)) {
            return;
        }
        String ext = mimeExt.replace(".", "");
        if (isBlank(ext)) {
            return;
        }
        if (equivalentFileType(archiveFileExtension, ext)) {
            // Do nothing. new file extension is nothing new.
            return;
        }

        /*
         * Replace the current extension with the mime-based one. A missing or empty current
         * extension, or one that is not actually the suffix of the path, means there is
         * nothing to replace, so the new extension is appended with its dot.
         */
        boolean replaceable = !isBlank(archiveFileExtension)
                && normalizedPath.endsWith("." + archiveFileExtension);
        if (replaceable) {
            int cut = normalizedPath.length() - archiveFileExtension.length();
            normalizedPath = normalizedPath.substring(0, cut) + ext;
        } else {
            normalizedPath = normalizedPath + "." + ext;
        }
        archiveFileExtension = ext;
        isFolder = false;
    }

    /**
     * Checks if is folder.
     *
     * @return true, if is folder
     */
    public boolean isFolder() {
        return isFolder;
    }

    /**
     * Get the referrer link used at creation time.
     *
     * @return the referrer
     */
    public String getReferrer() {
        return referrerURL.toString();
    }

    /**
     * Sets the filepath.
     *
     * @param p
     *            the new filepath
     */
    public void setFilepath(File p) {
        archiveFile = p;
    }

    /**
     * Gets the name, i.e. the last segment of the absolute URL's path.
     *
     * @return the name
     */
    public String getName() {
        File f = new File(absoluteURL.getPath());
        return f.getName();
    }

    /**
     * Get the relative path of the URL within the site hierarchy if possible.
     *
     * @return the normal path
     */
    public String getNormalPath() {
        return normalizedPath;
    }

    /**
     * tests if URL API detected a path, e.g., non-zero string following
     * host:port/(path)
     *
     * @return true, if successful
     */
    public boolean hasPath() {
        return absoluteURL.getPath().length() > 0;
    }

    /* (non-Javadoc)
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return absoluteURL.toString();
    }

    /**
     * trivial test for dynamic content.
     *
     * @return true, if is dynamic
     */
    public boolean isDynamic() {
        // Page is NOT dynamic content as determined by other methods
        if (!isDynamic) {
            return false;
        }
        // Page is Dynamic - yes or no - by look up alone.
        return isDynamic(urlValue, pathExtension);
    }

    /**
     * Checks if is resource.
     *
     * @return true, if is resource
     */
    public boolean isResource() {
        return isResource(urlValue, pathExtension);
    }

    /**
     * list of dynamic pages, e.g., items to avoid.
     */
    private final static Set<String> dynamicPages = new HashSet<String>();

    /** Extensions of page resources (style sheets, icons). */
    private final static Set<String> resourcePages = new HashSet<String>();

    static {
        dynamicPages.add("asp");
        dynamicPages.add("aspx");
        dynamicPages.add("jsp");
        dynamicPages.add("cgi");
        dynamicPages.add("php");
        dynamicPages.add("pl");
        dynamicPages.add("dhtml");
        dynamicPages.add("js");
    }

    static {
        resourcePages.add("css");
        resourcePages.add("ico");
    }

    /**
     * Checks if is dynamic.
     *
     * @param url
     *            the url
     * @return true, if is dynamic
     */
    public static boolean isDynamic(String url) {
        if (isBlank(url)) {
            return false;
        }
        String norm = url.toLowerCase();
        String ext = FilenameUtils.getExtension(norm);
        return isDynamic(norm, ext);
    }

    /**
     * Checks if is resource.
     *
     * @param url
     *            the url
     * @return true, if is resource
     */
    public static boolean isResource(String url) {
        if (isBlank(url)) {
            return false;
        }
        String norm = url.toLowerCase();
        String ext = FilenameUtils.getExtension(norm);
        return isResource(norm, ext);
    }

    /**
     * Checks if is resource.
     *
     * @param url
     *            -- currently unused.
     * @param ext
     *            lower case.
     * @return true, if is resource
     */
    public static boolean isResource(String url, String ext) {
        return resourcePages.contains(ext);
    }

    /**
     * Checks if is dynamic.
     *
     * @param url
     *            -- currently unused.
     * @param ext
     *            lower case.
     * @return true, if is dynamic
     */
    public static boolean isDynamic(String url, String ext) {
        return dynamicPages.contains(ext);
    }

    /**
     * Checks if is web page.
     *
     * @return true, if is web page
     */
    public boolean isWebPage() {
        if (isDynamic()) {
            return true;
        }

        // Compare descriptions by value; identity (==) only works while the utility happens to
        // hand back the very same String instances as its constants.
        final String desc = FileUtility.getFileDescription(urlValue);
        if (FileUtility.WEBPAGE_MIMETYPE.equals(desc)) {
            return true;
        }
        // Test case: http://a.b.com/my/page
        // Not query, no file extension.
        //
        if (urlValue.contains("/") && !urlValue.contains("?")
                && FileUtility.NOT_AVAILABLE.equals(desc)) {
            return true;
        }
        return isDynamic(absoluteURL.getPath());
    }

    /**
     * Checks if is file.
     *
     * @return true, if is file
     */
    public boolean isFile() {
        return isCommonFile(urlValue);
    }

    /**
     * Checks if is common file: a document, archive, spreadsheet or GIS file
     * according to {@link FileUtility}.
     *
     * @param v
     *            a path
     * @return if path is a common type of file.
     */
    public static boolean isCommonFile(String v) {
        // Look the description up once and compare by value, not by identity.
        final String desc = FileUtility.getFileDescription(v);
        if (FileUtility.DOC_MIMETYPE.equals(desc)) {
            return true;
        }
        if (FileUtility.isArchiveFile(v)) {
            return true;
        }
        // Other conditions?
        return FileUtility.SPREADSHEET_MIMETYPE.equals(desc) || FileUtility.GIS_MIMETYPE.equals(desc);
    }

    /**
     * Given this URL, a, found on page, p, determine if a is a local anchor to
     * p itself.
     *
     * <pre>
     *  /x/y.html          a page.
     *  /x/y.html#tag      anchor to y.html
     *  abc.html           link to other page
     *
     *  http://z.z.z/z.html#tag     Hmmm, this is a page anchor to the absolute page in the URL, z.html.
     * </pre>
     *
     * TODO: possibly use the isLocalAnchor() vs. isAnchor() metaphor.
     *
     * NOTE(review): the tests below look for '#' in {@code URL.getPath()}; java.net.URL
     * reports the fragment separately through {@code getRef()}, so these '#' checks
     * presumably never match as written. Behavior is left unchanged here -- confirm the
     * intended semantics with callers before switching to {@code getRef()}.
     *
     * @return true, if is page anchor
     */
    public boolean isPageAnchor() {
        String p = absoluteURL.getPath();
        if (isAbsolute()) {
            return p.contains("#");
        }
        if (isBlank(p) || isBlank(referrerURL.getPath())) {
            return false;
        }

        /*
         * Traditional anchors, but also scripting IDs.
         */
        String file = FileUtility.getBasename(p, "");
        return file.startsWith("#") || file.startsWith("%");
    }

    /**
     * Parses the query of the absolute URL into {@link #params}. Each "key=value" pair is
     * split on the first '='; a parameter without '=' is stored with an empty value.
     * Does nothing when the URL has no query.
     */
    protected void parseURL() {
        /*
         * Fails to parse out param around View={xyz} in sharepoint URL
         * List<NameValuePair> params = URLEncodedUtils.parse(new
         * URI(getAbsoluteURL()), "UTF-8"); for (NameValuePair p : params) { if
         * ("RootFolder".equals(p.getName())) { return p.getValue(); } }
         */
        String qry = absoluteURL.getQuery();
        if (isBlank(qry)) {
            // No query: nothing to parse. Guards subclasses calling this directly.
            return;
        }
        for (String param : qry.split("&")) {
            if (isBlank(param)) {
                continue;
            }
            if (!param.contains("=")) {
                params.put(param, ""); // empty value.
                continue;
            }
            String[] kv = param.split("=", 2);
            params.put(kv[0], kv[1]);
        }
    }

    /**
     * Checks if is absolute.
     *
     * @return true, if is absolute
     */
    public boolean isAbsolute() {
        return isAbsolute;
    }

    /**
     * If a URL is fully-qualified protocol + server, then it is not relative.
     *
     * @return true, if is relative
     */
    public boolean isRelative() {
        return !isAbsolute;
    }

    /**
     * Trivial test to see if this link matches the HTML page/URL from which it
     * came. That is, we want to know if the page contains a relative link to
     * itself.
     *
     * @param test
     *            a URL
     * @return true, if is current page
     */
    public boolean isCurrentPage(String test) {
        if (test == null) {
            return false;
        }
        return test.equalsIgnoreCase(urlValue);
    }

    /**
     * Checks if is current host.
     *
     * @return true, if is current host
     */
    public boolean isCurrentHost() {
        return isCurrentHost;
    }

    /**
     * Checks if is current page.
     *
     * @return true, if is current page
     */
    public boolean isCurrentPage() {
        return isCurrentPage;
    }

    /**
     * Checks if is current site.
     *
     * @return true, if is current site
     */
    public boolean isCurrentSite() {
        return isCurrentSite;
    }

    /**
     * Get absolute URL; limitations -- this is not intended for general use It
     * is a mere concatenation of parent + rel path. "../../....." paths are not
     * supported fully.
     *
     * @return the absolute url
     */
    public String getAbsoluteURL() {
        return absoluteURL.toString();
    }

    /**
     * Gets the url.
     *
     * @return URL object for this link. It is an absolute URL.
     */
    public URL getURL() {
        return absoluteURL;
    }

    /**
     * Gets the directory.
     *
     * @return the directory
     */
    public String getDirectory() {
        return directory;
    }
}