Java tutorial
/** * Copyright 2015-2016 Abola Lee * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.github.abola.crawler; import org.apache.commons.httpclient.Cookie; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.impl.SimpleLog; import org.apache.commons.vfs2.CacheStrategy; import org.apache.commons.vfs2.FileContent; import org.apache.commons.vfs2.FileSystemException; import org.apache.commons.vfs2.FileSystemOptions; import org.apache.commons.vfs2.impl.StandardFileSystemManager; import org.apache.commons.vfs2.provider.http.HttpFileSystemConfigBuilder; import org.json.JSONArray; import org.json.JSONObject; import org.json.XML; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.parser.Parser; import org.mozilla.universalchardet.UniversalDetector; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Date; import java.util.List; /** * * * * @author Abola Lee <abola921@gmail.com> */ public class CrawlerPack { static SimpleLog log = new SimpleLog("simple.logger.com.github.abola.crawler.CrawlerPack"); static StandardFileSystemManager fileSystem; static { // create a Self-signed Server Certificates // for pass SSL XTrustProvider.install(); // Set default logging level "ERROR" log.setLevel(SimpleLog.LOG_LEVEL_WARN); try { fileSystem = new StandardFileSystemManager(); fileSystem.setCacheStrategy(CacheStrategy.ON_CALL); // change default logger to SimpleLog fileSystem.setLogger(log); fileSystem.init(); } catch (FileSystemException fse) { // ignore } } static CrawlerPack defaultCrawler; /** * Create a CrawlerPack instance * * @return CrawlerPack */ public static CrawlerPack start() { if (null == defaultCrawler) defaultCrawler = new CrawlerPack(); return defaultCrawler; } /** * Setting global level logging * * example: * CrawlerPack.setLoggerLevel( SimpleLog.LOG_LEVEL_INFO ); * * @param level */ public static void setLoggerLevel(int level) { log.setLevel(level); fileSystem.setLogger(log); } private String userAgent = "Mozilla/5.0 (CrawlerPack; )"; private List<Cookie> cookies = new ArrayList<>(); /** * Creates a cookie with the given name and value. * * @param name the cookie name * @param value the cookie value * @return CrawlerPack */ public CrawlerPack addCookie(String name, String value) { if (null == name) { log.warn("addCookie: Cookie name null."); return this; } cookies.add(new Cookie("", name, value)); return this; } /** * Creates a cookie with the given name, value, domain attribute, * path attribute, expiration attribute, and secure attribute * * @param name the cookie name * @param value the cookie value * @param domain the domain this cookie can be sent to * @param path the path prefix for which this cookie can be sent * @param expires the {@link Date} at which this cookie expires, * or <tt>null</tt> if the cookie expires at the end * of the session * @param secure if true this cookie can only be sent over secure * connections * */ public CrawlerPack addCookie(String domain, String name, String value, String path, Date expires, boolean secure) { if (null == name) { log.warn("addCookie: Cookie name null."); return this; } cookies.add(new Cookie(domain, name, value, path, expires, secure)); return this; } /** * Return a Cookie array * and auto importing domain and path when domain was empty. * * @param uri required Apache Common VFS supported file systems and response JSON format content. * @return Cookie[] */ Cookie[] getCookies(String uri) { if (null == cookies || 0 == cookies.size()) return null; for (Cookie cookie : cookies) { if ("".equals(cookie.getDomain())) { String domain = uri.replaceAll("^.*:\\/\\/([^\\/]+)[\\/]?.*$", "$1"); cookie.setDomain(domain); cookie.setPath("/"); cookie.setExpiryDate(null); cookie.setSecure(false); } } return cookies.toArray(new Cookie[cookies.size()]); } /** * Clear all cookies */ void clearCookies() { log.trace("clearCookies: clear all cookies."); cookies = new ArrayList<>(); } /** * ??? JSON * * @param uri required Apache Common VFS supported file systems and response JSON format content. * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document getFromJson(String uri) { // ?XML? String json = getFromRemote(uri); // json xml String xml = jsonToXml(json); // Jsoup return xmlToJsoupDoc(xml); } /** * ??? HTML/Html5 * * @param uri required Apache Common VFS supported file systems and response HTML format content. * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document getFromHtml(String uri) { // ? String html = getFromRemote(uri); // Jsoup return htmlToJsoupDoc(html); } /** * ??? XML * * @param uri required Apache Common VFS supported file systems and response XML format content. * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document getFromXml(String uri) { // ?XML? String xml = getFromRemote(uri); // Jsoup return xmlToJsoupDoc(xml); } /** * json XML * * @param json a json format string. * @return XML format string */ public String jsonToXml(String json) { String xml = ""; // ?JSON row tag if ("[".equals(json.substring(0, 1))) { xml = XML.toString(new JSONArray(json), "row"); } else { xml = XML.toString(new JSONObject(json)); } return xml; } /** * ?? Apache Common VFS ??? * * ?? * @see <a href="https://commons.apache.org/proper/commons-vfs/filesystems.html">commons-vfs filesystems</a> */ public String getFromRemote(String uri) { // clear cache fileSystem.getFilesCache().close(); String remoteContent; String remoteEncoding = "utf-8"; log.debug("getFromRemote: Loading remote URI=" + uri); FileContent fileContent; try { FileSystemOptions fsOptions = new FileSystemOptions(); // set userAgent HttpFileSystemConfigBuilder.getInstance().setUserAgent(fsOptions, userAgent); // set cookie if cookies set if (0 < this.cookies.size()) { HttpFileSystemConfigBuilder.getInstance().setCookies(fsOptions, getCookies(uri)); } log.debug("getFromRemote: userAgent=" + userAgent); log.debug("getFromRemote: cookieSize=" + cookies.size()); log.debug("getFromRemote: cookies=" + cookies.toString()); fileContent = fileSystem.resolveFile(uri, fsOptions).getContent(); // 2016-03-22 only pure http/https auto detect encoding if ("http".equalsIgnoreCase(uri.substring(0, 4))) { fileContent.getSize(); // pass a bug {@link https://issues.apache.org/jira/browse/VFS-427} remoteEncoding = fileContent.getContentInfo().getContentEncoding(); } log.debug("getFromRemote: remoteEncoding=" + remoteEncoding + "(auto detect) "); // 2016-03-21 zip file getContentEncoding null if (null == remoteEncoding) remoteEncoding = "utf-8"; if (!"utf".equalsIgnoreCase(remoteEncoding.substring(0, 3))) { log.debug("getFromRemote: remote content encoding=" + remoteEncoding); // force charset encoding if setRemoteEncoding set if (!"utf".equalsIgnoreCase(encoding.substring(0, 3))) { remoteEncoding = encoding; } else { // auto detecting encoding remoteEncoding = detectCharset(IOUtils.toByteArray(fileContent.getInputStream())); log.debug("getFromRemote: real encoding=" + remoteEncoding); } } // ?? Apache VFS ?? // 2016-02-29 fixed remoteContent = IOUtils.toString(fileContent.getInputStream(), remoteEncoding); } catch (FileSystemException fse) { log.warn("getFromRemote: FileSystemException=" + fse.getMessage()); remoteContent = null; } catch (IOException ioe) { // return empty log.warn("getFromRemote: IOException=" + ioe.getMessage()); remoteContent = null; } catch (StringIndexOutOfBoundsException stre) { log.warn("getFromRemote: StringIndexOutOfBoundsException=" + stre.getMessage()); log.warn("getFromRemote: uri=" + uri); log.warn(stre.getMessage()); remoteContent = null; } clearCookies(); log.debug("getFromRemote: remoteContent=\n" + remoteContent); // any exception will return "null" return remoteContent; } /** * HTML Jsoup Document * * HTMLJsoup HTML Parser * * @param html Html document * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document htmlToJsoupDoc(String html) { // html(html/html5) jsoup Document Document jsoupDoc = Jsoup.parse(html, "UTF-8", Parser.htmlParser()); jsoupDoc.charset(StandardCharsets.UTF_8); return jsoupDoc; } // ?? a-zA-Z ? final static String prefix = "all-lower-case-prefix"; /** * XML Jsoup Document * * Jsoup 1.9.1+ supported non-ascii tag * ----- * Tag ??? a-zA-Z jsoup ? * prefix * ?xmlParse prefix * * @param xml XML format string * @return org.jsoup.nodes.Document */ public org.jsoup.nodes.Document xmlToJsoupDoc(String xml) { // Tag ? a-zA-Z ? //xml = xml.replaceAll("<([^A-Za-z\\/! ][^\\/>]*)>", "<"+prefix.toLowerCase()+"$1>") // .replaceAll("<\\/([^A-Za-z\\/ ][^\\/>]*)>", "</"+prefix.toLowerCase()+"$1>"); // xml jsoup Document //Document jsoupDoc = Jsoup.parse(xml, "", new Parser( new PrefixXmlTreeBuilder(prefix.toLowerCase()) ) ); Document jsoupDoc = Jsoup.parse(xml, "", Parser.xmlParser()); jsoupDoc.charset(StandardCharsets.UTF_8); return jsoupDoc; } private String encoding = "utf-8"; /** * ?? * ? get ? * * @return CrawlerPack */ public CrawlerPack setRemoteEncoding(String encoding) { log.debug("setRemoteEncoding: encoding=" + encoding); this.encoding = encoding; return this; } private String detectCharset(byte[] content) { log.debug("detectCharset: "); return detectCharset(content, 0); } final Integer detectBuffer = 1000; /** * Detecting real content encoding * @param content * @param offset * @return real charset encoding */ private String detectCharset(byte[] content, Integer offset) { log.debug("detectCharset: offset=" + offset); // detect failed if (offset > content.length) return null; UniversalDetector detector = new UniversalDetector(null); detector.handleData(content, offset, content.length - offset > detectBuffer ? detectBuffer : content.length - offset); detector.dataEnd(); String detectEncoding = detector.getDetectedCharset(); return null == detectEncoding ? detectCharset(content, offset + detectBuffer) : detectEncoding; } /** * set header userAgent * * @param userAgent * @return CrawlerPack */ public CrawlerPack setUserAgent(String userAgent) { log.debug("setUserAgent: userAgent=\"" + userAgent + "\""); this.userAgent = userAgent; return this; } }