openthinks.others.htmlunit.PageKeeper.java Source code

Java tutorial

Introduction

Here is the source code for openthinks.others.htmlunit.PageKeeper.java

Source

/**   
 *  Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
* @Title: PageKeeper.java 
* @Package openthinks.others.htmlunit 
* @Description: Saves an HtmlUnit page and the stylesheets, scripts and images it references to a local directory
* @author dailey.yet@outlook.com  
* @date Aug 6, 2015
* @version V1.0   
*/
package openthinks.others.htmlunit;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;

import org.apache.commons.codec.binary.Base64;

import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlImage;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Saves an {@link HtmlPage} together with the stylesheets, scripts and images
 * it references into a local directory, and rewrites the page so that it
 * points at the local copies.
 * <p>
 * Layout below the keep directory:
 * <ul>
 * <li>{@value #RESOURCE_STYLE_DIR} - stylesheets</li>
 * <li>{@value #RESOURCE_STYLE_REFERENCE_DIR} - resources referenced from
 * stylesheets through {@code url(...)}</li>
 * <li>{@value #RESOURCE_SCRIPT_DIR} - scripts</li>
 * <li>{@value #RESOURCE_IMAGE_DIR} - images</li>
 * </ul>
 * Downloading is best effort: a resource that cannot be fetched or written is
 * reported on standard error and its reference in the page is left untouched.
 * A resource whose target file already exists is not downloaded again.
 *
 * @author dailey.yet@outlook.com
 *
 */
public class PageKeeper {
    private HtmlPage htmlPage;
    private File keepDir;

    public static final String RESOURCE_STYLE_DIR = "style";
    public static final String RESOURCE_SCRIPT_DIR = "js";
    public static final String RESOURCE_IMAGE_DIR = "images";
    // Forward slash on purpose: java.io.File accepts it on every platform,
    // whereas a backslash is an ordinary file-name character outside Windows.
    public static final String RESOURCE_STYLE_REFERENCE_DIR = "style/styleref";
    public static final String RESOURCE_STYLE_REFERENCE_URL = "styleref";

    public static final String RESOURCE_STYLE_REFERENCE_REGEX = "url\\(['\"]?([^\\(\\)'\"]+)['\"]?\\)";
    public static final Pattern RESOURCE_STYLE_REFERENCE_PATTERN = Pattern.compile(RESOURCE_STYLE_REFERENCE_REGEX);

    /** File name used for the page when its URL has no usable last path segment. */
    private static final String DEFAULT_PAGE_NAME = "index.html";

    private PageKeeper(HtmlPage htmlPage, File keepDir) {
        super();
        this.htmlPage = htmlPage;
        this.keepDir = keepDir;
    }

    private PageKeeper(HtmlPage htmlPage) {
        this(htmlPage, new File("."));
    }

    public void setHtmlPage(HtmlPage htmlPage) {
        this.htmlPage = htmlPage;
    }

    public void setKeepDir(File keepDir) {
        this.keepDir = keepDir;
    }

    /**
     * Writes the (already rewritten) page as XML into the keep directory.
     * The file name is the last segment of the page URL's path; when the path
     * is empty or ends with a slash, {@code index.html} is used instead.
     */
    private void keepPage() throws FileNotFoundException, IOException {
        // getPath() excludes query and fragment, so neither can leak into the name.
        String fileName = getResourceName(this.htmlPage.getUrl().getPath());
        if (fileName.isEmpty()) {
            fileName = DEFAULT_PAGE_NAME;
        }
        store(this.htmlPage.asXml(), new File(keepDir, fileName));
    }

    /** Downloads stylesheets, scripts and images referenced by the page. */
    private void keepResources() {
        keepStylesheets(htmlPage.getElementsByTagName("link"));
        keepScripts(htmlPage.getElementsByTagName("script"));
        keepImages(htmlPage.getElementsByTagName("img"));
    }

    /**
     * Saves every {@code <link rel="stylesheet">} target below
     * {@link #RESOURCE_STYLE_DIR} and points the element at the local copy.
     *
     * @param domElements the {@code link} elements of the page
     */
    private void keepStylesheets(DomNodeList<DomElement> domElements) {
        File styleDir = ensureDirectory(RESOURCE_STYLE_DIR);
        for (DomElement element : domElements) {
            if (!"stylesheet".equalsIgnoreCase(element.getAttribute("rel"))) {
                continue;
            }
            String href = element.getAttribute("href");
            if (href.isEmpty()) {
                continue;
            }
            String styleUrl = getFullyQualifiedUrl(href);
            if (styleUrl == null) {
                continue;
            }
            String styleName = getResourceName(styleUrl);
            if (styleName.isEmpty()) {
                continue;
            }
            File keepFile = new File(styleDir, styleName);
            try {
                if (!keepFile.exists()) {
                    WebResponse wrp = getResourceResponse(styleUrl);
                    String styleCtx = wrp.getContentAsString("UTF-8");
                    // url(...) references are relative to the stylesheet, not the page.
                    styleCtx = keepStyleReference(styleCtx, styleUrl).toString();
                    store(styleCtx, keepFile);
                }
                element.setAttribute("href", RESOURCE_STYLE_DIR + "/" + styleName);
            } catch (IOException | RuntimeException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads the resources a stylesheet references through {@code url(...)}
     * into {@link #RESOURCE_STYLE_REFERENCE_DIR} and rewrites each reference to
     * {@code url(styleref/<name>)}.
     * <p>
     * References that are empty, start with {@code #}, use the {@code data:}
     * scheme, cannot be resolved, or fail to download are copied through
     * unchanged.
     *
     * @param styleCtx the stylesheet text
     * @param styleUrl absolute URL of the stylesheet, used as base for relative references
     * @return the stylesheet text with rewritten references
     */
    private StringBuffer keepStyleReference(String styleCtx, String styleUrl) {
        File refDir = ensureDirectory(RESOURCE_STYLE_REFERENCE_DIR);
        Matcher matcher = RESOURCE_STYLE_REFERENCE_PATTERN.matcher(styleCtx);
        // StringBuffer because Matcher.appendReplacement requires it on Java 8.
        StringBuffer sb = new StringBuffer();
        while (matcher.find()) {
            // Skipping a match without appendReplacement keeps its text verbatim:
            // the next appendReplacement/appendTail copies it through.
            String reference = matcher.group(1).trim();
            if (reference.isEmpty() || reference.startsWith("#")
                    || reference.regionMatches(true, 0, "data:", 0, 5)) {
                continue;
            }
            String styleRefUrl = resolveUrl(styleUrl, reference);
            if (styleRefUrl == null) {
                continue;
            }
            String styleRefName = getResourceName(styleRefUrl);
            if (styleRefName.isEmpty()) {
                continue;
            }
            File keepFile = new File(refDir, styleRefName);
            try {
                if (!keepFile.exists()) {
                    // Byte-for-byte copy: these are usually images or fonts, which
                    // must not be decoded as text or re-encoded.
                    storeBinary(getResourceResponse(styleRefUrl), keepFile);
                }
                // quoteReplacement: '$' and '\' in a file name are special to appendReplacement.
                matcher.appendReplacement(sb, Matcher
                        .quoteReplacement("url(" + RESOURCE_STYLE_REFERENCE_URL + "/" + styleRefName + ")"));
            } catch (IOException | RuntimeException e) {
                e.printStackTrace();
            }
        }
        matcher.appendTail(sb);
        return sb;
    }

    /**
     * Saves every {@code <script src="...">} target below
     * {@link #RESOURCE_SCRIPT_DIR} and points the element at the local copy.
     *
     * @param elementsByTag the {@code script} elements of the page
     */
    private void keepScripts(DomNodeList<DomElement> elementsByTag) {
        File tagDir = ensureDirectory(RESOURCE_SCRIPT_DIR);
        for (DomElement element : elementsByTag) {
            String src = element.getAttribute("src");
            if (src.isEmpty()) {
                continue;
            }
            String tagUrl = getFullyQualifiedUrl(src);
            if (tagUrl == null) {
                continue;
            }
            String tagName = getResourceName(tagUrl);
            if (tagName.isEmpty()) {
                continue;
            }
            File keepFile = new File(tagDir, tagName);
            try {
                if (!keepFile.exists()) {
                    WebResponse wrp = getResourceResponse(tagUrl);
                    store(wrp.getContentAsString("UTF-8"), keepFile);
                }
                element.setAttribute("src", RESOURCE_SCRIPT_DIR + "/" + tagName);
            } catch (IOException | RuntimeException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Saves every {@code <img>} below {@link #RESOURCE_IMAGE_DIR} and points
     * the element at the local copy. The local file name is the URL-safe
     * Base64 encoding of the last URL segment (query included) plus
     * {@code .jpg}. Images without a {@code src}, with an unresolvable
     * {@code src}, or with an inline {@code data:} source are left untouched.
     *
     * @param elementsByTag the {@code img} elements of the page
     */
    private void keepImages(DomNodeList<DomElement> elementsByTag) {
        File imgeDir = ensureDirectory(RESOURCE_IMAGE_DIR);
        for (DomElement element : elementsByTag) {
            if (!(element instanceof HtmlImage)) {
                continue;
            }
            HtmlImage htmlImage = (HtmlImage) element;
            String src = htmlImage.getAttribute("src");
            if (src.isEmpty() || src.regionMatches(true, 0, "data:", 0, 5)) {
                continue;
            }
            String srcUrl = getFullyQualifiedUrl(src);
            if (srcUrl == null) {
                continue;
            }
            String lastSegment = srcUrl.substring(srcUrl.lastIndexOf('/') + 1);
            // URL-safe alphabet: the standard one contains '/', which would be
            // interpreted as a path separator in the file name.
            String imgName = Base64.encodeBase64URLSafeString(lastSegment.getBytes(StandardCharsets.UTF_8))
                    + ".jpg";
            File keepFile = new File(imgeDir, imgName);
            try {
                if (!keepFile.exists()) {
                    htmlImage.saveAs(keepFile);
                }
                htmlImage.setAttribute("src", RESOURCE_IMAGE_DIR + "/" + imgName);
            } catch (IOException | RuntimeException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Loads a resource with the page's web client, copying the additional
     * headers of the request that produced the page and setting the page URL
     * as {@code Referer}.
     *
     * @param tagUrl absolute URL of the resource
     * @return the response returned by the web client
     * @throws MalformedURLException if {@code tagUrl} is not a valid URL
     * @throws IOException if loading the response fails
     */
    protected WebResponse getResourceResponse(String tagUrl) throws MalformedURLException, IOException {
        final WebRequest referringRequest = this.htmlPage.getWebResponse().getWebRequest();
        final WebRequest wrq = new WebRequest(new URL(tagUrl),
                this.htmlPage.getWebClient().getBrowserVersion().getScriptAcceptHeader());
        wrq.setAdditionalHeaders(new HashMap<>(referringRequest.getAdditionalHeaders()));
        wrq.setAdditionalHeader("Referer", referringRequest.getUrl().toString());
        wrq.setAdditionalHeader("Accept", this.htmlPage.getWebClient().getBrowserVersion().getScriptAcceptHeader());
        return this.htmlPage.getWebClient().loadWebResponse(wrq);
    }

    /**
     * Resolves a reference found in the page against the page.
     *
     * @return the absolute URL, or {@code null} when it cannot be resolved
     */
    private String getFullyQualifiedUrl(String relative) {
        try {
            return this.htmlPage.getFullyQualifiedUrl(relative).toString();
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Resolves {@code relative} against {@code baseUrl}.
     *
     * @return the absolute URL, or {@code null} when either part is malformed
     */
    private String resolveUrl(String baseUrl, String relative) {
        try {
            return new URL(new URL(baseUrl), relative).toString();
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Derives a local file name from a URL or URL path: the text after the
     * last {@code /}, with query string and fragment removed first so that a
     * slash inside either of them cannot be mistaken for a path separator.
     *
     * @return the file name; empty when there is no usable last segment
     */
    private String getResourceName(String url) {
        String name = url;
        int query = name.indexOf('?');
        if (query != -1) {
            name = name.substring(0, query);
        }
        int fragment = name.indexOf('#');
        if (fragment != -1) {
            name = name.substring(0, fragment);
        }
        name = name.substring(name.lastIndexOf('/') + 1);
        // "." and ".." would address a directory, not a file, below the target directory.
        if (".".equals(name) || "..".equals(name)) {
            return "";
        }
        return name;
    }

    /** Returns the named sub-directory of the keep directory, creating it when missing (best effort). */
    private File ensureDirectory(String relativePath) {
        File dir = new File(keepDir, relativePath);
        if (!dir.isDirectory()) {
            dir.mkdirs();
        }
        return dir;
    }

    /**
     * Makes {@code keepDirectory} the keep directory, creating it if needed.
     *
     * @throws IllegalArgumentException if the path exists but is not a
     *         directory, or if the directory cannot be created
     */
    private void keepTo(File keepDirectory) {
        this.keepDir = keepDirectory;
        if (!keepDir.isDirectory() && !keepDir.mkdirs()) {
            throw new IllegalArgumentException("Cann't save to the directory:[" + keepDirectory + "]");
        }
    }

    /**
     * Writes text to a file as UTF-8, the charset the text resources are
     * decoded with. Failures are reported on standard error and not
     * propagated.
     *
     * @param htmlContent the text to write
     * @param keepFile the target file; overwritten when it exists
     */
    protected void store(String htmlContent, File keepFile) {
        try (PrintWriter writer = new PrintWriter(keepFile, StandardCharsets.UTF_8.name())) {
            writer.write(htmlContent);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Copies the response body unchanged into {@code keepFile}, which must not exist yet. */
    private void storeBinary(WebResponse response, File keepFile) throws IOException {
        try (InputStream in = response.getContentAsStream()) {
            Files.copy(in, keepFile.toPath());
        }
    }

    /** Prints the page's response body, decoded as UTF-8, to standard output. */
    static void print(HtmlPage htmlPage) {
        String htmlContent = htmlPage.getWebResponse().getContentAsString("UTF-8");
        System.out.println(htmlContent);
    }

    /**
     * Creates a keeper bound to {@code saveDir} with no page yet; the page is
     * supplied later through {@link #setHtmlPage(HtmlPage)}.
     */
    static PageKeeper keeper(File saveDir) {
        return new PageKeeper(null, saveDir);
    }

    /**
     * Saves {@code htmPage} and its resources into {@code saveDir}: resources
     * first, then anchor rewriting, then the page itself.
     *
     * @throws NullPointerException if either argument is {@code null}
     * @throws IllegalArgumentException if {@code saveDir} is not usable as a directory
     */
    static void save(HtmlPage htmPage, File saveDir) {
        Objects.requireNonNull(htmPage);
        Objects.requireNonNull(saveDir);
        PageKeeper keeper = new PageKeeper(htmPage, saveDir);
        keeper.keepTo(saveDir);
        try {
            keeper.keepResources();
            keeper.localAnchorLink();
            keeper.keepPage();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Rewrites every non-empty anchor {@code href} that contains a {@code /}
     * to the text after its last {@code /}; other anchors are left as they are.
     */
    private void localAnchorLink() {
        this.htmlPage.getAnchors().stream().filter((HtmlAnchor anchor) -> {
            return anchor.hasAttribute("href") && !anchor.getAttribute("href").isEmpty();
        }).forEach((HtmlAnchor anchor) -> {
            String hrefURL = anchor.getHrefAttribute();
            int start = hrefURL.lastIndexOf("/");
            if (start != -1) {
                anchor.setAttribute("href", hrefURL.substring(start + 1));
            }
        });
    }

}