com.github.abola.crawler.CrawlerPack.java Source code

Java tutorial

Introduction

Here is the source code for com.github.abola.crawler.CrawlerPack.java

Source

/**
 * Copyright 2015-2016 Abola Lee
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.abola.crawler;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.impl.SimpleLog;
import org.apache.commons.vfs2.CacheStrategy;
import org.apache.commons.vfs2.FileContent;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.commons.vfs2.FileSystemOptions;
import org.apache.commons.vfs2.impl.StandardFileSystemManager;
import org.apache.commons.vfs2.provider.http.HttpFileSystemConfigBuilder;
import org.json.JSONArray;
import org.json.JSONObject;
import org.json.XML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.mozilla.universalchardet.UniversalDetector;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

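/**
 * Fetches content through Apache Commons VFS and turns JSON, XML or HTML
 * responses into Jsoup documents.
 *
 * @author Abola Lee <abola921@gmail.com>
 */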
public class CrawlerPack {

    /** Logger shared by every CrawlerPack instance; also installed as the VFS manager's logger. */
    static SimpleLog log = new SimpleLog("simple.logger.com.github.abola.crawler.CrawlerPack");

    /** Commons VFS manager through which every URI is resolved. */
    static StandardFileSystemManager fileSystem;

    static {
        // Install the project's XTrustProvider before any connection is opened
        // (original note: self-signed server certificates, to get past SSL checks).
        XTrustProvider.install();

        // Default logging level is WARN.
        log.setLevel(SimpleLog.LOG_LEVEL_WARN);

        try {

            fileSystem = new StandardFileSystemManager();

            fileSystem.setCacheStrategy(CacheStrategy.ON_CALL);

            // change default logger to SimpleLog
            fileSystem.setLogger(log);
            fileSystem.init();
        } catch (FileSystemException fse) {
            // Class loading must not fail here, but the failure has to be visible:
            // every later resolveFile call depends on this manager being initialised.
            log.error("CrawlerPack: unable to initialise the VFS file system manager", fse);
        }
    }

    /** Instance handed out by {@link #start()}; created on the first call. */
    static CrawlerPack defaultCrawler;

    /**
     * Returns the shared CrawlerPack instance, creating it when none exists yet.
     *
     * @return the shared CrawlerPack
     */
    public static CrawlerPack start() {
        if (defaultCrawler != null) {
            return defaultCrawler;
        }
        defaultCrawler = new CrawlerPack();
        return defaultCrawler;
    }

    /**
     * Changes the level of the shared logger and hands that logger to the
     * VFS file system manager again.
     *
     * <p>Example: {@code CrawlerPack.setLoggerLevel(SimpleLog.LOG_LEVEL_INFO);}
     *
     * @param level one of the {@code SimpleLog.LOG_LEVEL_*} constants
     */
    public static void setLoggerLevel(int level) {
        final SimpleLog sharedLog = log;
        sharedLog.setLevel(level);
        fileSystem.setLogger(sharedLog);
    }

    // User-Agent value passed to the VFS HTTP options on every getFromRemote call;
    // replaceable through setUserAgent.
    private String userAgent = "Mozilla/5.0 (CrawlerPack; )";

    // Cookies queued for the next request; getFromRemote clears the list once it has run.
    private List<Cookie> cookies = new ArrayList<>();

    /**
     * Queues a cookie with the given name and value for the next request.
     *
     * <p>The cookie is stored with an empty domain; {@code getCookies} fills in
     * domain and path from the request URI. A {@code null} name is rejected
     * with a warning and nothing is queued.
     *
     * @param name    the cookie name
     * @param value   the cookie value
     * @return this CrawlerPack, for chaining
     */
    public CrawlerPack addCookie(String name, String value) {
        if (name != null) {
            final Cookie queued = new Cookie("", name, value);
            cookies.add(queued);
        } else {
            log.warn("addCookie: Cookie name null.");
        }
        return this;
    }

    /**
     * Queues a fully specified cookie for the next request.
     *
     * <p>A {@code null} name is rejected with a warning and nothing is queued.
     *
     * @param domain  the domain this cookie can be sent to
     * @param name    the cookie name
     * @param value   the cookie value
     * @param path    the path prefix for which this cookie can be sent
     * @param expires the {@link Date} at which this cookie expires,
     *                or {@code null} if the cookie expires at the end
     *                of the session
     * @param secure  if true this cookie can only be sent over secure
     *                connections
     * @return this CrawlerPack, for chaining
     */
    public CrawlerPack addCookie(String domain, String name, String value, String path, Date expires,
            boolean secure) {
        if (name != null) {
            final Cookie queued = new Cookie(domain, name, value, path, expires, secure);
            cookies.add(queued);
        } else {
            log.warn("addCookie: Cookie name null.");
        }
        return this;
    }

    /**
     * Returns the queued cookies as an array, completing every cookie whose
     * domain is empty: its domain becomes the host of {@code uri}, its path
     * {@code "/"}, its expiry date is removed and its secure flag cleared.
     *
     * @param uri the URI about to be requested; its host is used as cookie domain
     * @return the queued cookies, or {@code null} when none are queued
     */
    Cookie[] getCookies(String uri) {
        if (null == cookies || cookies.isEmpty())
            return null;

        // The host is the same for every cookie: derive it at most once.
        String host = null;

        for (Cookie cookie : cookies) {

            if ("".equals(cookie.getDomain())) {
                if (null == host) {
                    host = hostOf(uri);
                }
                cookie.setDomain(host);
                cookie.setPath("/");
                cookie.setExpiryDate(null);
                cookie.setSecure(false);
            }
        }

        return cookies.toArray(new Cookie[0]);
    }

    /**
     * Extracts the host from the first {@code scheme://authority} part of a URI.
     *
     * <p>Only the first {@code "://"} is considered, so a URL embedded in the
     * query string cannot hijack the result. User-info and port are dropped
     * because a cookie domain has to equal the bare host to match a request.
     * When no non-empty authority is present the URI is returned unchanged.
     */
    private static String hostOf(String uri) {
        int schemeEnd = uri.indexOf("://");
        if (schemeEnd < 0) {
            return uri;
        }

        int start = schemeEnd + 3;
        int end = start;
        // the authority ends at the first path, query or fragment delimiter
        while (end < uri.length() && "/?#".indexOf(uri.charAt(end)) < 0) {
            end++;
        }
        if (end == start) {
            // e.g. "file:///tmp/x": nothing usable as a host
            return uri;
        }

        String authority = uri.substring(start, end);

        // strip "user:password@"
        authority = authority.substring(authority.lastIndexOf('@') + 1);

        // bracketed IPv6 literal: keep the brackets, drop a trailing ":port"
        if (authority.startsWith("[")) {
            int close = authority.indexOf(']');
            return close < 0 ? authority : authority.substring(0, close + 1);
        }

        int colon = authority.indexOf(':');
        return colon < 0 ? authority : authority.substring(0, colon);
    }

    /**
     * Drops every queued cookie by replacing the list with a fresh, empty one.
     */
    void clearCookies() {
        log.trace("clearCookies: clear all cookies.");
        final List<Cookie> emptyList = new ArrayList<>();
        cookies = emptyList;
    }

    /**
     * Loads JSON content from a URI and returns it as a Jsoup document.
     *
     * <p>The content is fetched with {@link #getFromRemote(String)}, converted
     * with {@link #jsonToXml(String)} and parsed with
     * {@link #xmlToJsoupDoc(String)}. The fetched value is handed to the
     * conversion step as-is, including the {@code null} that
     * {@code getFromRemote} returns on failure.
     *
     * @param uri a URI in an Apache Commons VFS supported file system whose content is JSON
     * @return the content as org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document getFromJson(String uri) {
        // fetch -> JSON to XML -> Jsoup document
        return xmlToJsoupDoc(jsonToXml(getFromRemote(uri)));
    }

    /**
     * Loads HTML/HTML5 content from a URI and returns it as a Jsoup document.
     *
     * <p>The content is fetched with {@link #getFromRemote(String)} and parsed
     * with {@link #htmlToJsoupDoc(String)}; the fetched value is passed on as-is.
     *
     * @param uri a URI in an Apache Commons VFS supported file system whose content is HTML
     * @return the content as org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document getFromHtml(String uri) {
        // fetch -> Jsoup document
        return htmlToJsoupDoc(getFromRemote(uri));
    }

    /**
     * Loads XML content from a URI and returns it as a Jsoup document.
     *
     * <p>The content is fetched with {@link #getFromRemote(String)} and parsed
     * with {@link #xmlToJsoupDoc(String)}; the fetched value is passed on as-is.
     *
     * @param uri a URI in an Apache Commons VFS supported file system whose content is XML
     * @return the content as org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document getFromXml(String uri) {
        // fetch -> Jsoup document
        return xmlToJsoupDoc(getFromRemote(uri));
    }

    /**
     * Converts a JSON text into an XML string.
     *
     * <p>A text whose first significant character is {@code [} is treated as
     * a JSON array and every element is wrapped in a {@code row} tag; any
     * other text is parsed as a JSON object. Surrounding whitespace and a
     * leading byte-order mark are ignored when deciding between the two.
     *
     * @param json a json format string; {@code null} or blank yields an empty string
     * @return XML format string, {@code ""} when there is nothing to convert
     */
    public String jsonToXml(String json) {
        if (null == json) {
            // e.g. getFromRemote returned null after a failed download
            log.warn("jsonToXml: json content null.");
            return "";
        }

        String text = json.trim();

        // A decoded UTF-8 BOM survives trim() and would hide the leading '[' or '{'.
        if (!text.isEmpty() && '\uFEFF' == text.charAt(0)) {
            text = text.substring(1).trim();
        }

        if (text.isEmpty()) {
            return "";
        }

        // A top-level JSON array has no element name of its own: use "row" as the tag.
        if (text.startsWith("[")) {
            return XML.toString(new JSONArray(text), "row");
        }

        return XML.toString(new JSONObject(text));
    }

    /**
     * Fetches the content behind a URI through Apache Commons VFS and returns
     * it as a string.
     *
     * <p>The configured user agent and any queued cookies are applied to the
     * request. For URIs starting with "http" the encoding reported by the
     * remote side is used. When that encoding is not a UTF variant, the
     * encoding set through {@link #setRemoteEncoding(String)} wins if it is
     * itself not a UTF variant; otherwise the encoding is detected from the
     * downloaded bytes, falling back to UTF-8 when detection finds nothing.
     * The body is downloaded once and the queued cookies are always cleared
     * afterwards.
     *
     * @param uri a URI in an Apache Commons VFS supported file system
     * @return the decoded content, or {@code null} when fetching or decoding failed
     * @see <a href="https://commons.apache.org/proper/commons-vfs/filesystems.html">commons-vfs filesystems</a>
     */
    public String getFromRemote(String uri) {

        // clear cache
        fileSystem.getFilesCache().close();

        String remoteContent;
        String remoteEncoding = "utf-8";

        log.debug("getFromRemote: Loading remote URI=" + uri);
        FileContent fileContent = null;

        try {

            FileSystemOptions fsOptions = new FileSystemOptions();
            // set userAgent
            HttpFileSystemConfigBuilder.getInstance().setUserAgent(fsOptions, userAgent);

            // set cookie if cookies set
            if (0 < this.cookies.size()) {
                HttpFileSystemConfigBuilder.getInstance().setCookies(fsOptions, getCookies(uri));
            }

            log.debug("getFromRemote: userAgent=" + userAgent);
            log.debug("getFromRemote: cookieSize=" + cookies.size());
            log.debug("getFromRemote: cookies=" + cookies.toString());

            fileContent = fileSystem.resolveFile(uri, fsOptions).getContent();

            // 2016-03-22 only pure http/https auto detect encoding
            // (regionMatches never throws on a URI shorter than four characters)
            if (uri.regionMatches(true, 0, "http", 0, 4)) {
                fileContent.getSize(); // pass a bug {@link https://issues.apache.org/jira/browse/VFS-427}
                remoteEncoding = fileContent.getContentInfo().getContentEncoding();
            }

            log.debug("getFromRemote: remoteEncoding=" + remoteEncoding + "(auto detect) ");

            // 2016-03-21 zip file getContentEncoding null
            if (null == remoteEncoding)
                remoteEncoding = "utf-8";

            // Download the body exactly once; detection and decoding share these bytes.
            byte[] rawContent = IOUtils.toByteArray(fileContent.getInputStream());

            if (!startsWithUtf(remoteEncoding)) {
                log.debug("getFromRemote: remote content encoding=" + remoteEncoding);

                // force charset encoding if setRemoteEncoding set
                if (null != encoding && !startsWithUtf(encoding)) {
                    remoteEncoding = encoding;
                } else {
                    // auto detecting encoding
                    remoteEncoding = detectCharset(rawContent);
                    log.debug("getFromRemote: real encoding=" + remoteEncoding);

                    // nothing detected (e.g. plain ASCII): decode with the default
                    if (null == remoteEncoding)
                        remoteEncoding = "utf-8";
                }
            }

            // An unknown charset name raises UnsupportedEncodingException,
            // which the IOException handler below turns into a null result.
            remoteContent = new String(rawContent, remoteEncoding);

        } catch (FileSystemException fse) {
            log.warn("getFromRemote: FileSystemException=" + fse.getMessage());
            remoteContent = null;
        } catch (IOException ioe) {
            // return empty
            log.warn("getFromRemote: IOException=" + ioe.getMessage());
            remoteContent = null;
        } catch (StringIndexOutOfBoundsException stre) {
            log.warn("getFromRemote: StringIndexOutOfBoundsException=" + stre.getMessage());
            log.warn("getFromRemote: uri=" + uri);
            log.warn(stre.getMessage());
            remoteContent = null;
        } finally {
            // cookies are one-shot: drop them even when an unchecked exception escapes
            clearCookies();

            // release the streams/connection held by the content
            if (null != fileContent) {
                try {
                    fileContent.close();
                } catch (FileSystemException closeFailure) {
                    log.debug("getFromRemote: close failed=" + closeFailure.getMessage());
                }
            }
        }

        // avoid building a potentially huge message when debug logging is off
        if (log.isDebugEnabled()) {
            log.debug("getFromRemote: remoteContent=\n" + remoteContent);
        }
        // any exception will return "null"
        return remoteContent;
    }

    /** Tells whether a charset name starts with "utf", ignoring case; safe for short names. */
    private static boolean startsWithUtf(String charsetName) {
        return charsetName.regionMatches(true, 0, "utf", 0, 3);
    }

    /**
     * Parses an HTML (html/html5) string into a Jsoup document using the
     * Jsoup HTML parser and sets the document charset to UTF-8.
     *
     * @param html Html document
     * @return org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document htmlToJsoupDoc(String html) {

        // The second argument of Jsoup.parse is the base URI, not a charset name.
        // No base URI is known here, so pass an empty one (as xmlToJsoupDoc does).
        Document jsoupDoc = Jsoup.parse(html, "", Parser.htmlParser());
        jsoupDoc.charset(StandardCharsets.UTF_8);

        return jsoupDoc;
    }

    // Prefix intended for tag names that do not start with a-zA-Z.
    // NOTE(review): no active code in this file appears to reference it - confirm before removing.
    final static String prefix = "all-lower-case-prefix";

    /**
     * Parses an XML string into a Jsoup document using the Jsoup XML parser
     * and sets the document charset to UTF-8.
     *
     * <p>Jsoup 1.9.1+ supports non-ASCII tag names, so the earlier workaround
     * that put a prefix in front of tag names not starting with a-zA-Z is no
     * longer applied here.
     *
     * @param xml XML format string
     * @return org.jsoup.nodes.Document
     */
    public org.jsoup.nodes.Document xmlToJsoupDoc(String xml) {
        final Parser xmlParser = Parser.xmlParser();

        // an empty base URI: no location is known for the given text
        final Document parsed = Jsoup.parse(xml, "", xmlParser);
        parsed.charset(StandardCharsets.UTF_8);
        return parsed;
    }

    /** Encoding requested through {@link #setRemoteEncoding(String)}; defaults to utf-8. */
    private String encoding = "utf-8";

    /**
     * Sets the encoding that getFromRemote uses in place of auto detection
     * when the remote content is not reported as a UTF variant. A UTF value
     * leaves auto detection in effect.
     *
     * @param encoding the charset name to use
     * @return this CrawlerPack, for chaining
     */
    public CrawlerPack setRemoteEncoding(String encoding) {
        final String requested = encoding;
        log.debug("setRemoteEncoding: encoding=" + requested);
        this.encoding = requested;
        return this;
    }

    /**
     * Detects the charset of the given bytes, starting at the first byte.
     *
     * @param content the raw bytes to inspect
     * @return the detected charset name, or {@code null} when nothing was detected
     */
    private String detectCharset(byte[] content) {
        log.debug("detectCharset: ");
        final Integer startOffset = 0;
        return detectCharset(content, startOffset);
    }

    /** Number of bytes handed to the detector per attempt. */
    final Integer detectBuffer = 1000;

    /**
     * Detects the real content encoding by probing the content in windows of
     * {@code detectBuffer} bytes, starting at {@code offset}, each window with
     * a fresh detector. The first window that yields a charset wins.
     *
     * <p>The windows are walked iteratively, so the stack depth does not grow
     * with the size of the content.
     *
     * @param content the raw bytes to inspect
     * @param offset  index of the first byte to inspect
     * @return real charset encoding, or {@code null} when no window yields one
     */
    private String detectCharset(byte[] content, Integer offset) {
        final int window = detectBuffer;
        int position = offset;

        while (position < content.length) {
            log.debug("detectCharset: offset=" + position);

            // the last window may be shorter than the configured size
            int chunk = Math.min(window, content.length - position);

            UniversalDetector detector = new UniversalDetector(null);
            detector.handleData(content, position, chunk);
            detector.dataEnd();

            String detectEncoding = detector.getDetectedCharset();
            if (null != detectEncoding) {
                return detectEncoding;
            }

            // never moves past content.length, so the index cannot overflow
            position += chunk;
        }

        // detect failed
        return null;
    }

    /**
     * Sets the User-Agent value sent with subsequent requests.
     *
     * @param userAgent the User-Agent header value
     * @return this CrawlerPack, for chaining
     */
    public CrawlerPack setUserAgent(String userAgent) {
        final String requested = userAgent;
        log.debug("setUserAgent: userAgent=\"" + requested + "\"");
        this.userAgent = requested;
        return this;
    }
}