psef.handler.HTMLFilter.java Source code

Java tutorial

Introduction

Here is the source code for psef.handler.HTMLFilter.java

Source

/*
 * This file is part of Psef.
 *
 *  Psef is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  Psef is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with Psef.  If not, see <http://www.gnu.org/licenses/>.
 *  (c) copyright Desmond Schmidt 2015
 */
package psef.handler;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.URL;
import java.net.URLConnection;
import java.io.InputStream;
import java.io.FileOutputStream;
import calliope.core.Utils;
import java.io.File;
import java.io.StringWriter;
import java.io.PrintWriter;
import psef.exception.PsefException;
import org.jsoup.nodes.DataNode;
import java.util.List;
import java.net.MalformedURLException;
import java.security.MessageDigest;
import org.apache.commons.codec.binary.Base64;

/**
 * Create a Web archive from a single web page
 */
public class HTMLFilter {
    /** the source HTML text */
    String src;
    /** base path within web-dir of site */
    String base;
    /** host dns name */
    String host;
    /**  relative path from root */
    String relPath;
    /** the root destination directory */
    File root;

    HTMLFilter(File root, String src, String base, String relPath, String host) {
        this.src = src;
        this.base = base;
        this.relPath = relPath;
        this.host = host;
        this.root = root;
    }

    /**
     * Download a remote file and save it locally
     * @param url the url to fetch it from
     * @param path the local path to store it at
     * @throws PsefException 
     */
    void downloadResource(URL url, String path) throws PsefException {
        try {
            File dst = new File(root, path);
            if (!dst.exists()) {
                URLConnection connection = url.openConnection();
                InputStream is = connection.getInputStream();
                byte[] data = Utils.readStream(is);
                File parent = dst.getParentFile();
                if (!parent.exists())
                    parent.mkdirs();
                dst.createNewFile();
                FileOutputStream out = new FileOutputStream(dst);
                out.write(data);
                out.close();
            }
        } catch (Exception e) {
            throw new PsefException(e);
        }
    }

    /**
     * Convert and download all scripts
     * @param doc the DOM document
     * @throws PsefException 
     */
    private void filterScripts(Document doc) throws PsefException {
        Elements scripts = doc.getElementsByTag("script");
        for (Element script : scripts) {
            String scriptSrc = script.attr("src");
            try {
                if (scriptSrc != null && scriptSrc.length() > 0) {
                    if (!scriptSrc.startsWith("http"))
                        scriptSrc = "http://" + host + base + relPath + "/" + scriptSrc;
                    URL url = new URL(scriptSrc);
                    if (url.getPath().startsWith(base)) {
                        String newPath = url.getPath().substring(base.length());
                        String newUrl = "scripts" + newPath;
                        script.attr("src", newUrl);
                        downloadResource(url, newUrl);
                    }
                    // else what?
                }
            } catch (Exception e) {
                throw new PsefException(e);
            }
        }
    }

    /**
     * See if the string starts with "@import"
     * @param text the text to scan
     * @return the offset of the first character after @import or 0
     */
    int readAtImport(String text) {
        if (text.trim().startsWith("@import"))
            return text.indexOf("@import") + 7;
        else
            return 0;
    }

    /**
     * Revise all "style" statements with @import directives
     * @param doc the DOM document
     * @throws PsefException 
     */
    private void filterStyles(Document doc) throws PsefException {
        try {
            Elements styles = doc.getElementsByTag("style");
            for (Element style : styles) {
                List<DataNode> data = style.dataNodes();
                String styleText = "";
                StringBuilder sb = new StringBuilder();
                for (DataNode node : data) {
                    styleText = node.getWholeData();
                    int pos = readAtImport(styleText);
                    while (pos > 0) {
                        sb.append("@import ");
                        styleText = styleText.substring(pos);
                        CSSUrl cssu = new CSSUrl(styleText);
                        styleText = styleText.substring(cssu.getPos());
                        cssu.revise(host, base + relPath, "styles");
                        URL u = new URL(cssu.getUrl(host, base + relPath));
                        downloadResource(u, cssu.getLocalPath());
                        sb.append(cssu.toString());
                        sb.append("\n");
                        pos = readAtImport(styleText);
                    }
                    node.setWholeData(sb.toString());
                }
                sb.append(styleText);
                //style.( sb.toString() );
            }
        } catch (Exception e) {
            throw new PsefException(e);
        }
    }

    /**
     * Convert a relative href to an absolute one
     * @param href a possibly relative href
     * @return the full http:// href
     */
    private String cleanHref(String href) {
        if (href.startsWith("http://"))
            return href;
        else
            return "http://" + host + relPath + href;
    }

    /**
     * Remove just the base element from the full path
     * @param path a full path 
     * @return the path relative to base
     */
    private String cleanPath(String path) {
        if (path.startsWith(base))
            path = path.substring(base.length());
        else
            path = path;
        if (path.equals("/"))
            path = "index.html";
        return path;
    }

    /**
     * Convert the link hrefs to their local equivalents
     * @param doc the document
     */
    private void filterLinks(Document doc) throws PsefException {
        try {
            Elements links = doc.getElementsByTag("link");
            for (Element link : links) {
                String type = link.attr("type");
                String href = link.attr("href");
                href = cleanHref(href);
                String folder = "other";
                if (type.equals("text/css"))
                    folder = "styles";
                else if (type.equals("text/javascript"))
                    folder = "scripts";
                else if (type.startsWith("image/"))
                    folder = "corpix";
                URL u = new URL(href);
                String localPath = folder + "/" + cleanPath(u.getPath());
                downloadResource(u, localPath);
                link.attr("href", localPath);
            }
        } catch (MalformedURLException e) {
            throw new PsefException(e);
        }
    }

    /**
     * Update all the anchors
     * @param doc the DOM document
     */
    private void filterAnchors(Document doc) throws PsefException {
        try {
            Elements anchors = doc.getElementsByTag("a");
            for (Element anchor : anchors) {
                String href = anchor.attr("href");
                String localPath = "";
                URL u = null;
                if (href.startsWith("#"))
                    continue;
                else if (href.startsWith("http:")) {
                    u = new URL(href);
                    if (u.getHost().equals(host) && u.getPath().startsWith(base))
                        localPath = cleanPath(u.getPath());
                    u = null;
                } else
                    localPath = cleanPath(href);
                if (localPath.length() > 0 && u == null)
                    u = new URL(cleanHref(href));
                if (u != null && localPath.length() > 0) {
                    if (!localPath.equals("index.html")) {
                        localPath = "anchors" + localPath;
                        String[] parts = localPath.split("\\?");
                        if (parts.length == 2) {
                            MessageDigest md = MessageDigest.getInstance("MD5");
                            md.update(parts[1].getBytes());
                            byte[] digest = md.digest();
                            byte[] bytesEncoded = Base64.encodeBase64(digest);
                            localPath = parts[0] + "_" + new String(bytesEncoded);
                        }
                    }
                    downloadResource(u, localPath);
                    anchor.attr("href", localPath);
                }
            }
        } catch (Exception e) {
            throw new PsefException(e);
        }
    }

    /**
     * Filter the entire document
     * @return the filtered document
     * @throws PsefException 
     */
    public String filter() throws PsefException {
        Document doc = Jsoup.parse(src);
        System.out.println("Filtering scripts");
        filterScripts(doc);
        System.out.println("Filtering styles");
        filterStyles(doc);
        System.out.println("Filtering links");
        filterLinks(doc);
        System.out.println("Filtering anchors");
        filterAnchors(doc);
        // write converted dom back to a string
        StringWriter sw = new StringWriter(src.length());
        PrintWriter writer = new PrintWriter(sw);
        writer.write(doc.html());
        writer.flush();
        writer.close();
        return sw.toString();
    }
}