org.b3log.symphony.util.Markdowns.java Source code

Java tutorial

Introduction

Here is the source code for org.b3log.symphony.util.Markdowns.java

Source

/*
 * Symphony - A modern community (forum/SNS/blog) platform written in Java.
 * Copyright (C) 2012-2018,  b3log.org & hacpai.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.b3log.symphony.util;

import com.vladsch.flexmark.html.HtmlRenderer;
import com.vladsch.flexmark.profiles.pegdown.Extensions;
import com.vladsch.flexmark.profiles.pegdown.PegdownOptionsAdapter;
import com.vladsch.flexmark.util.options.DataHolder;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.b3log.latke.Latkes;
import org.b3log.latke.cache.Cache;
import org.b3log.latke.cache.CacheFactory;
import org.b3log.latke.ioc.LatkeBeanManager;
import org.b3log.latke.ioc.LatkeBeanManagerImpl;
import org.b3log.latke.ioc.Lifecycle;
import org.b3log.latke.logging.Level;
import org.b3log.latke.logging.Logger;
import org.b3log.latke.repository.jdbc.JdbcRepository;
import org.b3log.latke.service.LangPropsService;
import org.b3log.latke.service.LangPropsServiceImpl;
import org.b3log.latke.util.Callstacks;
import org.b3log.latke.util.Stopwatchs;
import org.b3log.latke.util.Strings;
import org.b3log.symphony.model.Common;
import org.b3log.symphony.service.UserQueryService;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;

import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.*;

/**
 * <a href="http://en.wikipedia.org/wiki/Markdown">Markdown</a> utilities.
 * <p>
 * Uses the <a href="https://github.com/chjj/marked">marked</a> as the processor, if not found this command, try
 * built-in <a href="https://github.com/vsch/flexmark-java">flexmark</a> instead.
 * </p>
 *
 * @author <a href="http://88250.b3log.org">Liang Ding</a>
 * @author <a href="http://zephyr.b3log.org">Zephyr</a>
 * @author <a href="http://vanessa.b3log.org">Vanessa</a>
 * @version 1.11.20.0, Dec 21, 2017
 * @since 0.2.0
 */
public final class Markdowns {

    /**
     * Logger.
     */
    private static final Logger LOGGER = Logger.getLogger(Markdowns.class);

    /**
     * Language service.
     */
    private static final LangPropsService LANG_PROPS_SERVICE = LatkeBeanManagerImpl.getInstance()
            .getReference(LangPropsServiceImpl.class);

    /**
     * Bean manager.
     */
    private static final LatkeBeanManager beanManager = Lifecycle.getBeanManager();

    /**
     * User query service.
     */
    private static final UserQueryService userQueryService;

    /**
     * Markdown cache.
     */
    private static final Cache MD_CACHE = CacheFactory.getCache("markdown");

    /**
     * Markdown to HTML timeout.
     */
    private static final int MD_TIMEOUT = 2000;

    /**
     * Marked engine serve path.
     */
    private static final String MARKED_ENGINE_URL = "http://localhost:8250";

    /**
     * Built-in MD engine options.
     */
    private static final DataHolder OPTIONS = PegdownOptionsAdapter.flexmarkOptions(Extensions.ALL_WITH_OPTIONALS);

    /**
     * Built-in MD engine parser.
     */
    private static final com.vladsch.flexmark.parser.Parser PARSER = com.vladsch.flexmark.parser.Parser
            .builder(OPTIONS).build();

    /**
     * Built-in MD engine HTML renderer.
     */
    private static final HtmlRenderer RENDERER = HtmlRenderer.builder(OPTIONS).build();

    /**
     * Whether marked is available.
     */
    public static boolean MARKED_AVAILABLE;

    static {
        MD_CACHE.setMaxCount(1024 * 10 * 4);

        if (null != beanManager) {
            userQueryService = beanManager.getReference(UserQueryService.class);
        } else {
            userQueryService = null;
        }
    }

    static {
        try {
            final URL url = new URL(MARKED_ENGINE_URL);
            final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setDoOutput(true);

            final OutputStream outputStream = conn.getOutputStream();
            IOUtils.write("Symphony ", outputStream, "UTF-8");
            IOUtils.closeQuietly(outputStream);

            final InputStream inputStream = conn.getInputStream();
            final String html = IOUtils.toString(inputStream, "UTF-8");
            IOUtils.closeQuietly(inputStream);

            conn.disconnect();

            MARKED_AVAILABLE = StringUtils.contains(html, "<p>Symphony </p>");

            if (MARKED_AVAILABLE) {
                LOGGER.log(Level.INFO, "[marked] is available, uses it for markdown processing");
            } else {
                LOGGER.log(Level.INFO,
                        "[marked] is not available, uses built-in [flexmark] for markdown processing");
            }
        } catch (final Exception e) {
            LOGGER.log(Level.INFO, "[marked] is not available caused by [" + e.getMessage()
                    + "], uses built-in [flexmark] for markdown processing");
        }
    }

    /**
     * Private constructor.
     */
    private Markdowns() {
    }

    /**
     * Gets the safe HTML content of the specified content.
     *
     * @param content the specified content
     * @param baseURI the specified base URI, the relative path value of href will starts with this URL
     * @return safe HTML content
     */
    public static String clean(final String content, final String baseURI) {
        final Document.OutputSettings outputSettings = new Document.OutputSettings();
        outputSettings.prettyPrint(false);

        final String tmp = Jsoup.clean(content, baseURI,
                Whitelist.relaxed().addAttributes(":all", "id", "target", "class")
                        .addTags("span", "hr", "kbd", "samp", "tt", "del", "s", "strike", "u")
                        .addAttributes("iframe", "src", "width", "height", "border", "marginwidth", "marginheight")
                        .addAttributes("audio", "controls", "src")
                        .addAttributes("video", "controls", "src", "width", "height")
                        .addAttributes("source", "src", "media", "type")
                        .addAttributes("object", "width", "height", "data", "type")
                        .addAttributes("param", "name", "value")
                        .addAttributes("input", "type", "disabled", "checked").addAttributes("embed", "src", "type",
                                "width", "height", "wmode", "allowNetworking"),
                outputSettings);
        final Document doc = Jsoup.parse(tmp, baseURI, Parser.htmlParser());

        final Elements ps = doc.getElementsByTag("p");
        for (final Element p : ps) {
            p.removeAttr("style");
        }

        final Elements iframes = doc.getElementsByTag("iframe");
        for (final Element iframe : iframes) {
            final String src = StringUtils.deleteWhitespace(iframe.attr("src"));
            if (StringUtils.startsWithIgnoreCase(src, "javascript")
                    || StringUtils.startsWithIgnoreCase(src, "data:")) {
                iframe.remove();
            }
        }

        final Elements objs = doc.getElementsByTag("object");
        for (final Element obj : objs) {
            final String data = StringUtils.deleteWhitespace(obj.attr("data"));
            if (StringUtils.startsWithIgnoreCase(data, "data:")
                    || StringUtils.startsWithIgnoreCase(data, "javascript")) {
                obj.remove();

                continue;
            }

            final String type = StringUtils.deleteWhitespace(obj.attr("type"));
            if (StringUtils.containsIgnoreCase(type, "script")) {
                obj.remove();
            }
        }

        final Elements embeds = doc.getElementsByTag("embed");
        for (final Element embed : embeds) {
            final String data = StringUtils.deleteWhitespace(embed.attr("src"));
            if (StringUtils.startsWithIgnoreCase(data, "data:")
                    || StringUtils.startsWithIgnoreCase(data, "javascript")) {
                embed.remove();

                continue;
            }
        }

        final Elements as = doc.getElementsByTag("a");
        for (final Element a : as) {
            a.attr("rel", "nofollow");

            final String href = a.attr("href");
            if (href.startsWith(Latkes.getServePath())) {
                continue;
            }

            a.attr("target", "_blank");
        }

        final Elements audios = doc.getElementsByTag("audio");
        for (final Element audio : audios) {
            audio.attr("preload", "none");
        }

        final Elements videos = doc.getElementsByTag("video");
        for (final Element video : videos) {
            video.attr("preload", "none");
        }

        String ret = doc.body().html();
        ret = ret.replaceAll("(</?br\\s*/?>\\s*)+", "<br>"); // patch for Jsoup issue

        return ret;
    }

    /**
     * Converts the specified markdown text to HTML.
     *
     * @param markdownText the specified markdown text
     * @return converted HTML, returns an empty string "" if the specified markdown text is "" or {@code null}, returns
     * 'markdownErrorLabel' if exception
     */
    public static String toHTML(final String markdownText) {
        if (Strings.isEmptyOrNull(markdownText)) {
            return "";
        }

        final String cachedHTML = getHTML(markdownText);
        if (null != cachedHTML) {
            return cachedHTML;
        }

        final ExecutorService pool = Executors.newSingleThreadExecutor();
        final long[] threadId = new long[1];

        final Callable<String> call = () -> {
            threadId[0] = Thread.currentThread().getId();

            String html = LANG_PROPS_SERVICE.get("contentRenderFailedLabel");

            if (MARKED_AVAILABLE) {
                html = toHtmlByMarked(markdownText);
                if (!StringUtils.startsWith(html, "<p>")) {
                    html = "<p>" + html + "</p>";
                }
            } else {
                com.vladsch.flexmark.ast.Node document = PARSER.parse(markdownText);
                html = RENDERER.render(document);
                if (!StringUtils.startsWith(html, "<p>")) {
                    html = "<p>" + html + "</p>";
                }
            }

            final Document doc = Jsoup.parse(html);
            final List<org.jsoup.nodes.Node> toRemove = new ArrayList<>();
            doc.traverse(new NodeVisitor() {
                @Override
                public void head(final org.jsoup.nodes.Node node, int depth) {
                    if (node instanceof org.jsoup.nodes.TextNode) {
                        final org.jsoup.nodes.TextNode textNode = (org.jsoup.nodes.TextNode) node;
                        final org.jsoup.nodes.Node parent = textNode.parent();

                        if (parent instanceof Element) {
                            final Element parentElem = (Element) parent;

                            if (!parentElem.tagName().equals("code")) {
                                String text = textNode.getWholeText();
                                boolean nextIsBr = false;
                                final org.jsoup.nodes.Node nextSibling = textNode.nextSibling();
                                if (nextSibling instanceof Element) {
                                    nextIsBr = "br".equalsIgnoreCase(((Element) nextSibling).tagName());
                                }

                                if (null != userQueryService) {
                                    try {
                                        final Set<String> userNames = userQueryService.getUserNames(text);
                                        for (final String userName : userNames) {
                                            text = text.replace('@' + userName + (nextIsBr ? "" : " "),
                                                    "@<a href='" + Latkes.getServePath() + "/member/" + userName
                                                            + "'>" + userName + "</a> ");
                                        }
                                        text = text.replace("@participants ",
                                                "@<a href='https://hacpai.com/article/1458053458339' class='ft-red'>participants</a> ");
                                    } finally {
                                        JdbcRepository.dispose();
                                    }
                                }

                                if (text.contains("@<a href=")) {
                                    final List<org.jsoup.nodes.Node> nodes = Parser.parseFragment(text, parentElem,
                                            "");
                                    final int index = textNode.siblingIndex();

                                    parentElem.insertChildren(index, nodes);
                                    toRemove.add(node);
                                } else {
                                    textNode.text(Pangu.spacingText(text));
                                }
                            }
                        }
                    }
                }

                @Override
                public void tail(org.jsoup.nodes.Node node, int depth) {
                }
            });

            toRemove.forEach(node -> node.remove());

            doc.select("pre>code").addClass("hljs");
            doc.select("a").forEach(a -> {
                String src = a.attr("href");
                if (!StringUtils.startsWithIgnoreCase(src, Latkes.getServePath())) {
                    try {
                        src = URLEncoder.encode(src, "UTF-8");
                    } catch (final Exception e) {
                    }
                    a.attr("href", Latkes.getServePath() + "/forward?goto=" + src);
                    a.attr("target", "_blank");
                }
            });
            doc.outputSettings().prettyPrint(false);

            String ret = doc.select("body").html();
            ret = StringUtils.trim(ret);

            // cache it
            putHTML(markdownText, ret);

            return ret;
        };

        Stopwatchs.start("Md to HTML");
        try {
            final Future<String> future = pool.submit(call);

            return future.get(MD_TIMEOUT, TimeUnit.MILLISECONDS);
        } catch (final TimeoutException e) {
            LOGGER.log(Level.ERROR, "Markdown timeout [md=" + markdownText + "]");
            Callstacks.printCallstack(Level.ERROR, new String[] { "org.b3log" }, null);

            final Set<Thread> threads = Thread.getAllStackTraces().keySet();
            for (final Thread thread : threads) {
                if (thread.getId() == threadId[0]) {
                    thread.stop();

                    break;
                }
            }
        } catch (final Exception e) {
            LOGGER.log(Level.ERROR, "Markdown failed [md=" + markdownText + "]", e);
        } finally {
            pool.shutdownNow();

            Stopwatchs.end();
        }

        return LANG_PROPS_SERVICE.get("contentRenderFailedLabel");
    }

    private static String toHtmlByMarked(final String markdownText) throws Exception {
        final URL url = new URL(MARKED_ENGINE_URL);
        final HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setDoOutput(true);

        final OutputStream outputStream = conn.getOutputStream();
        IOUtils.write(markdownText, outputStream, "UTF-8");
        IOUtils.closeQuietly(outputStream);

        final InputStream inputStream = conn.getInputStream();
        final String html = IOUtils.toString(inputStream, "UTF-8");
        IOUtils.closeQuietly(inputStream);

        //conn.disconnect();

        return html;
    }

    /**
     * Gets HTML for the specified markdown text.
     *
     * @param markdownText the specified markdown text
     * @return HTML
     */
    private static String getHTML(final String markdownText) {
        final String hash = DigestUtils.md5Hex(markdownText);
        final JSONObject value = MD_CACHE.get(hash);
        if (null == value) {
            return null;
        }

        return value.optString(Common.DATA);
    }

    /**
     * Puts the specified HTML into cache.
     *
     * @param markdownText the specified markdown text
     * @param html         the specified HTML
     */
    private static void putHTML(final String markdownText, final String html) {
        final String hash = DigestUtils.md5Hex(markdownText);
        final JSONObject value = new JSONObject();
        value.put(Common.DATA, html);
        MD_CACHE.put(hash, value);
    }
}