com.jaeksoft.searchlib.crawler.web.spider.HtmlArchiver.java Source code

Java tutorial

Introduction

Here is the source code for com.jaeksoft.searchlib.crawler.web.spider.HtmlArchiver.java

Source

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.spider;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;

import javax.xml.parsers.ParserConfigurationException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.conn.HttpHostConnectException;
import org.htmlcleaner.ContentNode;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.xml.sax.SAXException;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.SearchLibException.WrongStatusCodeException;
import com.jaeksoft.searchlib.crawler.web.browser.BrowserDriver;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSImportRule;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSProperty;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSRule;
import com.jaeksoft.searchlib.crawler.web.spider.NaiveCSSParser.CSSStyleRule;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlCleanerParser;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.LinkUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.ThreadUtils.RecursiveTracker;
import com.jaeksoft.searchlib.util.ThreadUtils.RecursiveTracker.RecursiveEntry;

/**
 * Builds a local, browsable copy of a web page.
 * <p>
 * The page source is taken from a {@link BrowserDriver}, parsed with
 * {@link HtmlCleanerParser}, and every referenced resource that can be
 * downloaded (<code>src</code> attributes, typed <code>href</code> attributes,
 * CSS <code>url(...)</code> values and <code>@import</code> rules, IFRAME
 * contents) is stored in a <code>files</code> sub-directory. The references in
 * the document are rewritten to point to the local copies. The rewritten
 * document is written to <code>index.html</code> and the untouched source to
 * <code>source.html</code>.
 * <p>
 * The instance keeps mutable state (current base URL, file name registry) and
 * performs no synchronization.
 */
public class HtmlArchiver {

    /** Maximum number of characters kept from the base name of a file. */
    private static final int MAX_BASENAME_LENGTH = 160;

    /** Maximum number of characters kept from the extension of a file. */
    private static final int MAX_EXTENSION_LENGTH = 32;

    private final BrowserDriver<?> browserDriver;
    private final File filesDir;
    private final File indexFile;
    private final File sourceFile;
    /** Un-numbered file name -> last sequence number handed out for it. */
    private final Map<String, Integer> fileCountMap;
    /** Absolute URL (or any registration key) -> local file name. */
    private final Map<String, String> urlFileMap;
    private final URL pageUrl;
    private final HttpDownloader downloader;
    private final RecursiveTracker recursiveSecurity;
    /** Base used to resolve relative links; changes on BASE tags and inside IFRAMEs. */
    private URL baseUrl;

    /**
     * @param browserDriver
     *            the driver used to switch into IFRAMEs and read their source
     * @param parentDir
     *            the directory receiving <code>index.html</code>,
     *            <code>source.html</code> and the <code>files</code> directory
     * @param httpDownloader
     *            the downloader used to fetch the linked resources
     * @param url
     *            the URL of the archived page; also the initial base URL
     */
    public HtmlArchiver(BrowserDriver<?> browserDriver, File parentDir, HttpDownloader httpDownloader, URL url) {
        this.browserDriver = browserDriver;
        filesDir = new File(parentDir, "files");
        indexFile = new File(parentDir, "index.html");
        sourceFile = new File(parentDir, "source.html");
        this.pageUrl = url;
        this.baseUrl = url;
        this.downloader = httpDownloader;
        fileCountMap = new TreeMap<String, Integer>();
        urlFileMap = new TreeMap<String, String>();
        // mkdir() returns false both on failure and when the directory already
        // exists; only the first case deserves a warning.
        if (!filesDir.mkdir() && !filesDir.isDirectory())
            Logging.warn("Unable to create the directory " + filesDir.getAbsolutePath());
        // NOTE(review): 20 is presumably the maximum nesting depth accepted by
        // the tracker - confirm against RecursiveTracker.
        recursiveSecurity = new RecursiveTracker(20);
    }

    /**
     * Builds a file name of the form <code>base[_count][.extension]</code>.
     * The base name and the extension are truncated to
     * {@link #MAX_BASENAME_LENGTH} and {@link #MAX_EXTENSION_LENGTH}
     * characters.
     * 
     * @param baseName
     *            the base name; <code>null</code> is treated as empty
     * @param extension
     *            the extension without the dot; <code>null</code> or empty
     *            means no extension
     * @param fileCount
     *            the sequence number; <code>null</code> or a value &lt;= 0
     *            adds no suffix
     * @return the file name
     */
    final private static String buildFileName(String baseName, String extension, Integer fileCount) {
        if (baseName == null)
            baseName = "";
        else if (baseName.length() > MAX_BASENAME_LENGTH)
            baseName = baseName.substring(0, MAX_BASENAME_LENGTH);
        // The extension is optional: test for null before reading its length.
        if (extension != null && extension.length() > MAX_EXTENSION_LENGTH)
            extension = extension.substring(0, MAX_EXTENSION_LENGTH);
        StringBuilder sb = new StringBuilder(baseName);
        if (fileCount != null && fileCount > 0) {
            sb.append('_');
            sb.append(fileCount);
        }
        if (extension != null && extension.length() > 0) {
            sb.append('.');
            sb.append(extension);
        }
        return sb.toString();
    }

    /**
     * Returns the path to use when referencing a local file from the document
     * identified by <code>parentUrl</code>.
     * <p>
     * When the parent has itself been registered as a local file (or when
     * there is no parent) the bare file name is returned, since both files sit
     * in the same directory. Otherwise the reference comes from a document
     * outside the files directory and the path is prefixed with
     * <code>./files/</code>.
     */
    final private String getLocalPath(URL parentUrl, String fileName) {
        if (parentUrl == null || urlFileMap.get(parentUrl.toExternalForm()) != null)
            return fileName;
        StringBuilder sb = new StringBuilder("./");
        sb.append(filesDir.getName());
        sb.append('/');
        sb.append(fileName);
        return sb.toString();
    }

    /**
     * @param fileName
     *            a file name relative to the files directory
     * @return the corresponding file inside the files directory
     */
    final public File getLocalFile(String fileName) {
        return new File(filesDir, fileName);
    }

    /**
     * Looks up the local file name registered for a link.
     * 
     * @param src
     *            the link, resolved against the page URL; when it cannot be
     *            resolved the raw value is used as the lookup key
     * @return the registered file name, or <code>null</code> if none
     */
    final public String getUrlFileName(String src) {
        URL objectURL = LinkUtils.getLink(pageUrl, src, null, false);
        String url = objectURL == null ? src : objectURL.toExternalForm();
        return urlFileMap.get(url);
    }

    /**
     * Reserves a file name in the files directory. If the same name has
     * already been handed out, a numeric suffix (<code>_1</code>,
     * <code>_2</code>, ...) is added.
     * 
     * @param urlString
     *            the key under which the file name is registered; when
     *            <code>null</code> the name is reserved but not registered
     * @param baseName
     *            the base name of the file
     * @param extension
     *            the extension of the file (without dot), may be empty
     * @return the destination file (not created by this method)
     */
    final public File getAndRegisterDestFile(String urlString, String baseName, String extension) {
        String countKey = buildFileName(baseName, extension, null);
        Integer previousCount = fileCountMap.get(countKey);
        int fileCount = previousCount == null ? 0 : previousCount + 1;
        fileCountMap.put(countKey, fileCount);
        String fileName = buildFileName(baseName, extension, fileCount);
        if (urlString != null)
            urlFileMap.put(urlString, fileName);
        return new File(filesDir, fileName);
    }

    /**
     * Maps a few well-known content types to a file extension.
     * 
     * @param contentType
     *            the content type, may be <code>null</code>
     * @param defaultExtension
     *            the value returned when the content type is not recognised
     * @return the extension to use
     */
    final private static String getExtension(String contentType, String defaultExtension) {
        if ("text/html".equalsIgnoreCase(contentType))
            return "html";
        if ("text/javascript".equalsIgnoreCase(contentType))
            return "js";
        if ("text/css".equalsIgnoreCase(contentType))
            return "css";
        if ("application/x-shockwave-flash".equalsIgnoreCase(contentType))
            return "swf";
        if ("image/png".equalsIgnoreCase(contentType))
            return "png";
        if ("image/gif".equalsIgnoreCase(contentType))
            return "gif";
        if ("image/jpeg".equalsIgnoreCase(contentType) || "image/jpg".equalsIgnoreCase(contentType))
            return "jpg";
        return defaultExtension;
    }

    /**
     * Downloads the resource referenced by <code>src</code> and stores it in
     * the files directory. CSS resources are parsed so that their own
     * references are downloaded too, which is why the call is guarded by the
     * recursion tracker.
     * 
     * @param parentUrl
     *            the URL of the document holding the reference
     * @param src
     *            the reference, as found in the document (XML escaped)
     * @param contentType
     *            the expected content type, or <code>null</code> to use the
     *            one reported by the download
     * @return the local path to put in the document, or the reference itself
     *         (unescaped once the processing has started) when the resource
     *         cannot be archived
     */
    final private String downloadObject(URL parentUrl, String src, String contentType)
            throws ClientProtocolException, IllegalStateException, IOException, SearchLibException,
            URISyntaxException {
        RecursiveEntry recursiveEntry = recursiveSecurity.enter();
        if (recursiveEntry == null) {
            Logging.warn("Max recursion reached - " + recursiveSecurity + " src: " + src + " url: " + parentUrl);
            return src;
        }
        try {
            src = StringEscapeUtils.unescapeXml(src);
            URL objectURL = LinkUtils.getLink(parentUrl, src, null, false);
            if (objectURL == null)
                return src;
            // A link to the page itself points to the archived index.
            if (objectURL.equals(pageUrl))
                return "index.html";
            String urlString = objectURL.toExternalForm();
            // Already downloaded: reuse the local copy.
            String knownFileName = urlFileMap.get(urlString);
            if (knownFileName != null)
                return getLocalPath(parentUrl, knownFileName);
            DownloadItem downloadItem;
            try {
                downloadItem = downloader.get(objectURL.toURI(), null);
            } catch (IOException e) {
                // Best effort: the original reference is kept.
                Logging.warn("IO Exception on " + objectURL.toURI(), e);
                return src;
            }
            String fileName = downloadItem.getFileName();
            if (fileName == null || fileName.length() == 0)
                return src;
            downloadItem.checkNoErrorRange(200, 300);
            if (contentType == null)
                contentType = downloadItem.getContentBaseType();
            String baseName = FilenameUtils.getBaseName(fileName);
            String extension = getExtension(contentType, FilenameUtils.getExtension(fileName));
            File destFile = getAndRegisterDestFile(urlString, baseName, extension);
            if ("css".equals(extension)) {
                // Rewrite the references found in the style sheet before
                // storing it; fall back to the raw content otherwise.
                String cssContent = downloadItem.getContentAsString();
                StringBuffer rewritten = checkCSSContent(objectURL, cssContent);
                if (rewritten != null && rewritten.length() > 0)
                    cssContent = rewritten.toString();
                FileUtils.write(destFile, cssContent);
            } else
                downloadItem.writeToFile(destFile);
            return getLocalPath(parentUrl, destFile.getName());
        } catch (HttpHostConnectException e) {
            Logging.warn(e);
            return src;
        } catch (UnknownHostException e) {
            Logging.warn(e);
            return src;
        } catch (WrongStatusCodeException e) {
            Logging.warn(e);
            return src;
        } finally {
            recursiveEntry.release();
        }
    }

    /**
     * Downloads the resource referenced by the first <code>url(...)</code>
     * found in the value of a CSS property, and rewrites the value.
     * 
     * @return <code>true</code> if the property value has been rewritten
     */
    final private boolean handleCssProperty(URL objectUrl, CSSProperty property) throws ClientProtocolException,
            IllegalStateException, IOException, SearchLibException, URISyntaxException {
        if (property == null)
            return false;
        String oldValue = property.getValue();
        if (oldValue == null)
            return false;
        Matcher matcher = NaiveCSSParser.findUrl(oldValue);
        if (!matcher.find())
            return false;
        String url = matcher.group(1);
        if (url == null || url.length() == 0)
            return false;
        String newSrc = downloadObject(objectUrl, url, null);
        if (newSrc == null)
            return false;
        property.setValue(NaiveCSSParser.replaceUrl(oldValue, matcher, newSrc));
        return true;
    }

    /**
     * Applies {@link #handleCssProperty(URL, CSSProperty)} to every property
     * of a style rule.
     * 
     * @return <code>true</code> if at least one property has been rewritten
     */
    final private boolean handleCssStyle(URL objectUrl, CSSStyleRule rule) throws ClientProtocolException,
            IllegalStateException, IOException, SearchLibException, URISyntaxException {
        boolean change = false;
        for (CSSProperty property : rule.getProperties())
            change |= handleCssProperty(objectUrl, property);
        return change;
    }

    /**
     * Parses a style sheet, downloads the resources referenced by its style
     * and import rules, and serializes the rewritten style sheet.
     * 
     * @param objectUrl
     *            the URL used to resolve the references of the style sheet
     * @param css
     *            the content of the style sheet
     * @return the rewritten style sheet, or <code>null</code> when no rule has
     *         been found or when an {@link IOException} occurred (it is
     *         logged, not propagated)
     */
    final private StringBuffer checkCSSContent(URL objectUrl, String css) throws ClientProtocolException,
            IllegalStateException, IOException, SearchLibException, URISyntaxException {
        StringWriter sw = null;
        PrintWriter pw = null;
        try {
            NaiveCSSParser cssParser = new NaiveCSSParser();
            Collection<CSSRule> rules = cssParser.parseStyleSheet(css);
            if (rules == null || rules.size() == 0)
                return null;
            sw = new StringWriter();
            pw = new PrintWriter(sw);
            for (CSSRule rule : rules) {
                if (rule instanceof CSSStyleRule) {
                    handleCssStyle(objectUrl, (CSSStyleRule) rule);
                } else if (rule instanceof CSSImportRule) {
                    CSSImportRule importRule = (CSSImportRule) rule;
                    importRule.setHref(downloadObject(objectUrl, importRule.getHref(), "text/css"));
                }
            }
            cssParser.write(pw);
            return sw.getBuffer();
        } catch (IOException e) {
            Logging.warn("CSS ISSUE", e);
            return null;
        } finally {
            IOUtils.close(pw, sw);
        }
    }

    /**
     * Rewrites the content of a STYLE element. Only elements without type or
     * with the <code>text/css</code> type, and without media or with the
     * <code>screen</code> / <code>all</code> media, are handled.
     */
    final private void checkStyleCSS(TagNode node) throws ClientProtocolException, IllegalStateException,
            IOException, SearchLibException, URISyntaxException {
        if (!("style".equalsIgnoreCase(node.getName())))
            return;
        String type = node.getAttributeByName("type");
        if (!StringUtils.isEmpty(type) && !"text/css".equalsIgnoreCase(type))
            return;
        String media = node.getAttributeByName("media");
        if (!StringUtils.isEmpty(media) && !"screen".equalsIgnoreCase(media) && !"all".equalsIgnoreCase(media))
            return;
        // Read through CharSequence: no assumption on the concrete class
        // returned by the HTML parser.
        CharSequence text = node.getText();
        if (text == null)
            return;
        String content = text.toString();
        String newContent = StringEscapeUtils.unescapeXml(content);
        StringBuffer rewritten = checkCSSContent(baseUrl, newContent);
        if (rewritten != null)
            newContent = rewritten.toString();
        if (newContent.equals(content))
            return;
        node.removeAllChildren();
        node.addChild(new ContentNode(newContent));
    }

    /**
     * Rewrites the <code>style</code> attribute of an element when it
     * references a downloadable resource.
     */
    final private void checkStyleAttribute(TagNode node) throws ClientProtocolException, IllegalStateException,
            IOException, SearchLibException, URISyntaxException {
        String style = node.getAttributeByName("style");
        if (style == null || style.length() == 0)
            return;
        NaiveCSSParser cssParser = new NaiveCSSParser();
        CSSStyleRule cssStyle = cssParser.parseStyleAttribute(style);
        if (handleCssStyle(baseUrl, cssStyle))
            node.addAttribute("style", cssStyle.getPropertyString());
    }

    /**
     * @param ids
     *            the identifiers to look for (case insensitive)
     * @param node
     *            the starting node, may be <code>null</code>
     * @return <code>true</code> if the node or one of its ancestors has an
     *         <code>id</code> attribute matching one of the identifiers
     */
    final boolean hasAncestorId(String[] ids, TagNode node) {
        for (TagNode current = node; current != null; current = current.getParent()) {
            String nodeId = current.getAttributeByName("id");
            if (nodeId == null)
                continue;
            for (String id : ids)
                if (id.equalsIgnoreCase(nodeId))
                    return true;
        }
        return false;
    }

    /**
     * @param xpathSelectorSet
     *            the nodes selected by the XPath expressions
     * @param node
     *            the starting node, may be <code>null</code>
     * @return <code>true</code> if the node or one of its ancestors belongs to
     *         the set
     */
    final boolean hasAncestorXPath(Set<TagNode> xpathSelectorSet, TagNode node) {
        for (TagNode current = node; current != null; current = current.getParent())
            if (xpathSelectorSet.contains(current))
                return true;
        return false;
    }

    /**
     * Handles a SCRIPT element: it is removed from the tree when it is
     * located under one of the disabled nodes, otherwise its content is XML
     * unescaped.
     * 
     * @param node
     *            the element to check
     * @param disableScriptNodeSet
     *            the nodes under which scripts are removed, may be
     *            <code>null</code>
     */
    final private void checkScriptContent(TagNode node, Set<TagNode> disableScriptNodeSet) {
        if (!("script".equalsIgnoreCase(node.getName())))
            return;
        if (disableScriptNodeSet != null && hasAncestorXPath(disableScriptNodeSet, node)) {
            node.removeFromTree();
            return;
        }
        // Read through CharSequence: no assumption on the concrete class
        // returned by the HTML parser.
        CharSequence text = node.getText();
        if (text == null)
            return;
        String content = text.toString();
        String newContent = StringEscapeUtils.unescapeXml(content);
        if (newContent.equals(content))
            return;
        node.removeAllChildren();
        node.addChild(new ContentNode(newContent));
    }

    /**
     * Archives the document displayed in an IFRAME. The source is read from
     * the browser (after switching to the frame), archived recursively with
     * the IFRAME URL as base URL, and written to the files directory.
     * 
     * @param parentUrl
     *            the URL of the document holding the IFRAME
     * @param node
     *            the IFRAME element
     * @param iframeNodeMap
     *            the browser element matching each IFRAME element
     * @return the local path of the archived frame, or <code>null</code> when
     *         the browser element of the IFRAME is not known
     */
    final private String downloadIframe(URL parentUrl, TagNode node, Map<TagNode, WebElement> iframeNodeMap)
            throws IOException, ParserConfigurationException, SAXException, IllegalStateException,
            SearchLibException, URISyntaxException {
        if (iframeNodeMap == null) {
            Logging.warn("Unable to download IFRAME (no iframeNodeNap) " + node);
            return null;
        }
        WebElement webElement = iframeNodeMap.get(node);
        if (webElement == null) {
            Logging.warn("Issue when finding IFRAME for " + node);
            return null;
        }
        URL oldBaseUrl = baseUrl;
        String src = node.getAttributeByName("src");
        baseUrl = LinkUtils.getLink(parentUrl, src, null, false);
        try {
            // The frame is registered under its URL, or under the hash code
            // of the node when there is no usable URL or when the URL is
            // already registered.
            String urlFileMapKey;
            if (baseUrl != null && !urlFileMap.containsKey(baseUrl.toExternalForm()))
                urlFileMapKey = baseUrl.toExternalForm();
            else
                urlFileMapKey = Integer.toString(node.hashCode());
            File destFile = getAndRegisterDestFile(urlFileMapKey, "iframe", "html");
            browserDriver.switchToFrame(webElement);
            try {
                String frameSource = browserDriver.getSourceCode();
                HtmlCleanerParser htmlCleanerParser = new HtmlCleanerParser();
                htmlCleanerParser.init(frameSource);
                recursiveArchive(htmlCleanerParser.getTagNode(), null, iframeNodeMap);
                htmlCleanerParser.writeHtmlToFile(destFile);
            } finally {
                // Whatever happened in the frame, the driver must not stay in
                // it, or the following reads would use the wrong document.
                browserDriver.switchToMain();
            }
            return getLocalPath(parentUrl, destFile.getName());
        } finally {
            // Always restore the base URL of the enclosing document.
            baseUrl = oldBaseUrl;
        }
    }

    /**
     * Archives the content of the node if it is an IFRAME.
     * 
     * @return <code>true</code> if the node is an IFRAME (archived or not)
     */
    final private boolean downloadObjectIframe(TagNode node, Map<TagNode, WebElement> iframeNodeMap)
            throws IllegalStateException, IOException, ParserConfigurationException, SAXException,
            SearchLibException, URISyntaxException {
        if (!"iframe".equalsIgnoreCase(node.getName()))
            return false;
        String src = downloadIframe(baseUrl, node, iframeNodeMap);
        if (src != null)
            node.addAttribute("src", src);
        return true;
    }

    /**
     * Downloads the resource referenced by the <code>src</code> attribute of
     * the node and rewrites the attribute.
     * 
     * @return <code>true</code> if the node has a <code>src</code> attribute
     */
    final private boolean downloadObjectSrc(TagNode node) throws ClientProtocolException, IllegalStateException,
            IOException, SearchLibException, URISyntaxException {
        String src = node.getAttributeByName("src");
        if (src == null)
            return false;
        src = downloadObject(baseUrl, src, null);
        if (src != null)
            node.addAttribute("src", src);
        return true;
    }

    /**
     * Downloads the resource referenced by the <code>href</code> attribute of
     * the node and rewrites the attribute. Only nodes with a known type are
     * handled: an explicit <code>type</code> attribute, a SCRIPT element, or
     * a LINK element with <code>rel="stylesheet"</code>.
     * 
     * @return <code>true</code> if the node has been handled
     */
    final private boolean downloadObjectLink(TagNode node) throws ClientProtocolException, IllegalStateException,
            IOException, SearchLibException, URISyntaxException, ParserConfigurationException, SAXException {
        String src = node.getAttributeByName("href");
        if (src == null)
            return false;
        String type = node.getAttributeByName("type");
        if (type == null && node.getName().equalsIgnoreCase("script"))
            type = "text/javascript";
        if (type == null && node.getName().equalsIgnoreCase("link")
                && "stylesheet".equalsIgnoreCase(node.getAttributeByName("rel")))
            type = "text/css";
        if (type == null)
            return false;
        src = downloadObject(baseUrl, src, type);
        if (src != null)
            node.addAttribute("href", src);
        return true;
    }

    /**
     * Handles a BASE element: its <code>href</code> becomes the current base
     * URL and the element is removed from the tree. When the
     * <code>href</code> is malformed a warning is logged and the element is
     * left in place.
     */
    final private void checkBaseHref(TagNode node) {
        if (node == null)
            return;
        if (!"base".equalsIgnoreCase(node.getName()))
            return;
        String href = node.getAttributeByName("href");
        if (href != null) {
            try {
                // The href may be relative: resolve it against the current
                // base. An absolute href is taken as is.
                baseUrl = new URL(baseUrl, href);
            } catch (MalformedURLException e) {
                Logging.warn(e);
                return;
            }
        }
        node.removeFromTree();
    }

    /**
     * Archives a node and, recursively, its child elements.
     * 
     * @param node
     *            the node to archive, may be <code>null</code>
     * @param disableScriptNodeSet
     *            the nodes under which scripts are removed, may be
     *            <code>null</code>
     * @param iframeNodeMap
     *            the browser element matching each IFRAME element, may be
     *            <code>null</code>
     */
    final private void recursiveArchive(TagNode node, Set<TagNode> disableScriptNodeSet,
            Map<TagNode, WebElement> iframeNodeMap) throws ClientProtocolException, IllegalStateException,
            IOException, SearchLibException, URISyntaxException, ParserConfigurationException, SAXException {
        if (node == null)
            return;
        checkBaseHref(node);
        // At most one of the three handlers applies; the first that
        // recognises the node wins.
        if (!downloadObjectIframe(node, iframeNodeMap) && !downloadObjectSrc(node))
            downloadObjectLink(node);
        checkStyleCSS(node);
        checkScriptContent(node, disableScriptNodeSet);
        checkStyleAttribute(node);
        TagNode[] children = node.getChildTags();
        if (children == null)
            return;
        for (TagNode child : children)
            recursiveArchive(child, disableScriptNodeSet, iframeNodeMap);
    }

    /**
     * Archives the page currently loaded in the browser: writes the rewritten
     * document to <code>index.html</code> and the original source to
     * <code>source.html</code>.
     * 
     * @param browserDriver
     *            the driver holding the page. Note that the IFRAME contents
     *            are read through the driver given to the constructor.
     * @param xPathDisableScriptSet
     *            XPath expressions selecting the nodes under which the SCRIPT
     *            elements are removed, may be <code>null</code>
     */
    final public void archive(BrowserDriver<?> browserDriver, Set<String> xPathDisableScriptSet)
            throws IOException, ParserConfigurationException, SAXException, IllegalStateException,
            SearchLibException, URISyntaxException, XPatherException {
        String pageSource = browserDriver.getSourceCode();
        HtmlCleanerParser htmlCleanerParser = new HtmlCleanerParser();
        htmlCleanerParser.init(pageSource);

        // Match each IFRAME found by the browser with its node in the parsed
        // document, using the XPath of the browser element.
        Set<WebElement> iframeWebElementSet = new HashSet<WebElement>();
        browserDriver.locateBy(By.tagName("iframe"), iframeWebElementSet, true);
        Map<TagNode, WebElement> iframeNodeMap = null;
        if (iframeWebElementSet.size() > 0) {
            iframeNodeMap = new HashMap<TagNode, WebElement>();
            Set<TagNode> tagNodeSet = new HashSet<TagNode>();
            for (WebElement webElement : iframeWebElementSet) {
                String xPath = browserDriver.getXPath(webElement, true);
                if (xPath == null)
                    continue;
                if (htmlCleanerParser.xpath(xPath, tagNodeSet) == 0) {
                    Logging.warn("IFRAME not found using XPath: " + xPath);
                    continue;
                }
                for (TagNode tagNode : tagNodeSet)
                    iframeNodeMap.put(tagNode, webElement);
                tagNodeSet.clear();
            }
        }

        // Collect the nodes under which the scripts must be removed.
        Set<TagNode> disableScriptNodeSet = null;
        if (xPathDisableScriptSet != null && xPathDisableScriptSet.size() > 0) {
            disableScriptNodeSet = new HashSet<TagNode>();
            for (String xPath : xPathDisableScriptSet)
                if (htmlCleanerParser.xpath(xPath, disableScriptNodeSet) == 0)
                    Logging.warn("DisableScript not found using XPath: " + xPath);
        }

        recursiveArchive(htmlCleanerParser.getTagNode(), disableScriptNodeSet, iframeNodeMap);
        htmlCleanerParser.writeHtmlToFile(indexFile);

        // The original source is stored with the charset of the document
        // when one is found.
        String charset = htmlCleanerParser.findCharset();
        if (charset == null)
            FileUtils.write(sourceFile, pageSource);
        else
            FileUtils.write(sourceFile, pageSource, charset);
    }
}