net.yacy.cora.util.Html2Image.java Source code

Java tutorial

Introduction

Here is the source code for net.yacy.cora.util.Html2Image.java

Source

/**
 *  Html2Image
 *  Copyright 2014 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
 *  First published 26.11.2014 on http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.cora.util;

import java.awt.Container;
import java.awt.Dimension;
import java.awt.Graphics;
import java.awt.Image;
import java.awt.MediaTracker;
import java.awt.image.BufferedImage;
import java.beans.PropertyChangeEvent;
import java.beans.PropertyChangeListener;
import java.io.File;
import java.io.IOException;
import java.util.List;

import javax.imageio.ImageIO;
import javax.swing.JEditorPane;
import javax.swing.text.Document;
import javax.swing.text.Element;
import javax.swing.text.View;
import javax.swing.text.ViewFactory;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.ImageView;

import net.yacy.document.ImageParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.OS;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
 * Convert html to an copy on disk-image in a other file format
 * currently (pdf and/or jpg)
 */
public class Html2Image {

    // Mac
    // to install wkhtmltopdf, download wkhtmltox-0.12.1_osx-cocoa-x86-64.pkg from http://wkhtmltopdf.org/downloads.html
    // to install imagemagick, download from http://cactuslab.com/imagemagick/assets/ImageMagick-6.8.9-9.pkg.zip
    // the convert command from imagemagick needs ghostscript, if not present on older macs, download a version of gs from http://pages.uoregon.edu/koch/
    private final static File wkhtmltopdfMac = new File("/usr/local/bin/wkhtmltopdf"); // sometimes this is also the path on debian
    private final static File convertMac1 = new File("/opt/local/bin/convert");
    private final static File convertMac2 = new File("/opt/ImageMagick/bin/convert");

    // debian
    // to install: apt-get install wkhtmltopdf imagemagick xvfb
    private final static File wkhtmltopdfDebian = new File("/usr/bin/wkhtmltopdf"); // there is no wkhtmltoimage, use convert to create images
    private final static File convertDebian = new File("/usr/bin/convert");

    private static boolean usexvfb = false;

    public static boolean wkhtmltopdfAvailable() {
        return wkhtmltopdfMac.exists() || wkhtmltopdfDebian.exists();
    }

    public static boolean convertAvailable() {
        return convertMac1.exists() || convertMac2.exists() || convertDebian.exists();
    }

    /**
     * write a pdf of a web page
     * @param url
     * @param proxy must be of the form http://host:port; use YaCy here as proxy which is mostly http://localhost:8090
     * @param destination
     * @return
     */
    public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage,
            File destination) {
        boolean success = false;
        for (boolean ignoreErrors : new boolean[] { false, true }) {
            success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors);
            if (success)
                break;
            if (!success && proxy != null) {
                ConcurrentLog.warn("Html2Image", "trying to load without proxy: " + url);
                success = writeWkhtmltopdfInternal(url, null, destination, userAgent, acceptLanguage, ignoreErrors);
                if (success)
                    break;
            }
        }
        if (success) {
            ConcurrentLog.info("Html2Image", "wrote " + destination.toString() + " for " + url);
        } else {
            ConcurrentLog.warn("Html2Image", "could not generate snapshot for " + url);
        }
        return success;
    }

    private static boolean writeWkhtmltopdfInternal(final String url, final String proxy, final File destination,
            final String userAgent, final String acceptLanguage, final boolean ignoreErrors) {
        final File wkhtmltopdf = wkhtmltopdfMac.exists() ? wkhtmltopdfMac : wkhtmltopdfDebian;
        String commandline = wkhtmltopdf.getAbsolutePath() + " -q --title '" + url + "' " +
        //acceptLanguage == null ? "" : "--custom-header 'Accept-Language' '" + acceptLanguage + "' " + 
        //(userAgent == null ? "" : "--custom-header \"User-Agent\" \"" + userAgent + "\" --custom-header-propagation ") + 
                (proxy == null ? "" : "--proxy " + proxy + " ")
                + (ignoreErrors ? (OS.isMacArchitecture ? "--load-error-handling ignore " : "--ignore-load-errors ")
                        : "")
                + // some versions do not have that flag and fail if attempting to use it...
                //"--footer-font-name 'Courier' --footer-font-size 9 --footer-left [webpage] --footer-right [date]/[time]([page]/[topage]) " +
                "--footer-left [webpage] --footer-right '[date]/[time]([page]/[topage])' --footer-font-size 7 "
                + url + " " + destination.getAbsolutePath();
        try {
            ConcurrentLog.info("Html2Pdf", "creating pdf from url " + url + " with command: " + commandline);
            List<String> message;
            if (!usexvfb) {
                message = OS.execSynchronous(commandline);
                if (destination.exists())
                    return true;
                ConcurrentLog.warn("Html2Image", "failed to create pdf "
                        + (proxy == null ? "" : "using proxy " + proxy) + " with command: " + commandline);
                for (String m : message)
                    ConcurrentLog.warn("Html2Image", ">> " + m);
            }
            // if this fails, we should try to wrap the X server with a virtual screen using xvfb, this works on headless servers
            commandline = "xvfb-run -a " + commandline;
            message = OS.execSynchronous(commandline);
            if (destination.exists()) {
                usexvfb = true;
                return true;
            }
            ConcurrentLog.warn("Html2Pdf", "failed to create pdf " + (proxy == null ? "" : "using proxy " + proxy)
                    + " and xvfb with command: " + commandline);
            for (String m : message)
                ConcurrentLog.warn("Html2Image", ">> " + m);
            return false;
        } catch (IOException e) {
            e.printStackTrace();
            ConcurrentLog.warn("Html2Pdf", "exception while creation of pdf with command: " + commandline);
            return false;
        }
    }

    /**
     * convert a pdf (first page) to an image. proper values are i.e. width = 1024, height = 1024, density = 300, quality = 75
     * using internal pdf library or external command line tool on linux or mac
     * @param pdf input pdf file
     * @param image output jpg file
     * @param width
     * @param height
     * @param density (dpi)
     * @param quality
     * @return
     */
    public static boolean pdf2image(File pdf, File image, int width, int height, int density, int quality) {
        final File convert = convertMac1.exists() ? convertMac1
                : convertMac2.exists() ? convertMac2 : convertDebian;

        // convert pdf to jpg using internal pdfbox capability
        if (OS.isWindows || !convert.exists()) {
            try {
                PDDocument pdoc = PDDocument.load(pdf);
                BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB);

                return ImageIO.write(bi, "jpg", image);

            } catch (IOException ex) {
            }
        }

        // convert on mac or linux using external command line utility
        try {
            // i.e. convert -density 300 -trim yacy.pdf[0] -trim -resize 1024x -crop x1024+0+0 -quality 75% yacy-convert-300.jpg
            // note: both -trim are necessary, otherwise it is trimmed only on one side. The [0] selects the first page of the pdf
            String command = convert.getAbsolutePath() + " -density " + density + " -trim " + pdf.getAbsolutePath()
                    + "[0] -trim -resize " + width + "x -crop x" + height + "+0+0 -quality " + quality + "% "
                    + image.getAbsolutePath();
            List<String> message = OS.execSynchronous(command);
            if (image.exists())
                return true;
            ConcurrentLog.warn("Html2Image", "failed to create image with command: " + command);
            for (String m : message)
                ConcurrentLog.warn("Html2Image", ">> " + m);

            // another try for mac: use Image Events using AppleScript in osacript commands...
            // the following command overwrites a pdf with an png, so we must make a copy first
            if (!OS.isMacArchitecture)
                return false;
            File pngFile = new File(pdf.getAbsolutePath() + ".tmp.pdf");
            org.apache.commons.io.FileUtils.copyFile(pdf, pngFile);
            String[] commandx = { "osascript", "-e", "set ImgFile to \"" + pngFile.getAbsolutePath() + "\"", "-e",
                    "tell application \"Image Events\"", "-e", "set Img to open file ImgFile", "-e",
                    "save Img as PNG", "-e", "end tell" };
            //ConcurrentLog.warn("Html2Image", "failed to create image with command: " + commandx);
            message = OS.execSynchronous(commandx);
            for (String m : message)
                ConcurrentLog.warn("Html2Image", ">> " + m);
            // now we must read and convert this file to a jpg with the target size 1024x1024
            try {
                File newPngFile = new File(pngFile.getAbsolutePath() + ".png");
                pngFile.renameTo(newPngFile);
                Image img = ImageParser.parse(pngFile.getAbsolutePath(), FileUtils.read(newPngFile));
                final Image scaled = img.getScaledInstance(width, height, Image.SCALE_AREA_AVERAGING);
                final MediaTracker mediaTracker = new MediaTracker(new Container());
                mediaTracker.addImage(scaled, 0);
                try {
                    mediaTracker.waitForID(0);
                } catch (final InterruptedException e) {
                }
                // finally write the image
                final BufferedImage bi = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
                bi.createGraphics().drawImage(scaled, 0, 0, width, height, null);
                ImageIO.write(bi, "jpg", image);
                newPngFile.delete();
                return image.exists();
            } catch (IOException e) {
                ConcurrentLog.logException(e);
                return false;
            }
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * render a html page with a JEditorPane, which can do html up to html v 3.2. No CSS supported!
     * @param url
     * @param size
     * @throws IOException 
     */
    public static void writeSwingImage(String url, Dimension size, File destination) throws IOException {

        // set up a pane for rendering
        final JEditorPane htmlPane = new JEditorPane();
        htmlPane.setSize(size);
        htmlPane.setEditable(false);
        final HTMLEditorKit kit = new HTMLEditorKit() {

            private static final long serialVersionUID = 1L;

            @Override
            public Document createDefaultDocument() {
                HTMLDocument doc = (HTMLDocument) super.createDefaultDocument();
                doc.setAsynchronousLoadPriority(-1);
                return doc;
            }

            @Override
            public ViewFactory getViewFactory() {
                return new HTMLFactory() {
                    @Override
                    public View create(Element elem) {
                        View view = super.create(elem);
                        if (view instanceof ImageView) {
                            ((ImageView) view).setLoadsSynchronously(true);
                        }
                        return view;
                    }
                };
            }
        };
        htmlPane.setEditorKitForContentType("text/html", kit);
        htmlPane.setContentType("text/html");
        htmlPane.addPropertyChangeListener(new PropertyChangeListener() {
            @Override
            public void propertyChange(PropertyChangeEvent evt) {
            }
        });

        // load the page
        try {
            htmlPane.setPage(url);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // render the page
        Dimension prefSize = htmlPane.getPreferredSize();
        BufferedImage img = new BufferedImage(prefSize.width, htmlPane.getPreferredSize().height,
                BufferedImage.TYPE_INT_ARGB);
        Graphics graphics = img.getGraphics();
        htmlPane.setSize(prefSize);
        htmlPane.paint(graphics);
        ImageIO.write(img, destination.getName().endsWith("jpg") ? "jpg" : "png", destination);
    }

    public static void main(String[] args) {
        try {
            Html2Image.writeSwingImage(args[0], new Dimension(1200, 2000), new File(args[1]));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}