no.dusken.momus.service.drive.GoogleDocsTextConverter.java Source code

Java tutorial

Introduction

Here is the source code for no.dusken.momus.service.drive.GoogleDocsTextConverter.java

Source

/*
 * Copyright 2014 Studentmediene i Trondheim AS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package no.dusken.momus.service.drive;

import org.apache.commons.lang3.StringEscapeUtils;
import org.springframework.stereotype.Service;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Converts the HTML export of a Google Drive document to our own,
 * much simpler, HTML representation.
 *
 * The conversion is purely regex based: the body is extracted, CSS-driven
 * bold/italic spans are turned into real tags, and markup we do not want
 * (empty anchors, classes, spans, comments, tables, images, list attributes,
 * empty paragraphs) is stripped before HTML entities are decoded.
 *
 * The class holds no mutable state; all patterns are compiled once.
 */
@Service
public class GoogleDocsTextConverter {

    // Patterns whose '.' has to span element content use DOTALL so that
    // documents containing line breaks are handled like single-line ones.
    private static final Pattern BODY = Pattern.compile("<body.*?>(.*)</body>", Pattern.DOTALL);
    private static final Pattern CSS = Pattern.compile("<style type=\"text/css\">(.*)</style>", Pattern.DOTALL);

    // The selector name is restricted to identifier characters so a '.' inside
    // a preceding declaration value (e.g. "0.5pt}") cannot start a match.
    private static final Pattern ITALIC_STYLE_NAME = Pattern.compile("\\.([\\w-]+)\\{font-style:italic\\}");
    private static final Pattern BOLD_STYLE_NAME = Pattern.compile("\\.([\\w-]+)\\{font-weight:bold\\}");

    private static final Pattern A_TAGS = Pattern.compile("<a[^>]*?></a>");
    private static final Pattern CLASSES = Pattern.compile(" class=\".*?\"");
    private static final Pattern SPANS = Pattern.compile("</?span.*?>");
    private static final Pattern EMPTY_P = Pattern.compile("<p>\\s?</p>");

    private static final Pattern INLINE_COMMENTS = Pattern.compile("<sup>.*?</sup>", Pattern.DOTALL);
    private static final Pattern SPACES = Pattern.compile("&nbsp;");
    private static final Pattern COMMENTS = Pattern.compile("<div><p>.*?</p></div>", Pattern.DOTALL);

    private static final Pattern LISTS = Pattern.compile(" start=\".*?\"");

    private static final Pattern TABLE = Pattern.compile("<table[^>]*?>.*?</table>", Pattern.DOTALL);
    private static final Pattern IMG = Pattern.compile("<img.*?>");

    /**
     * Converts a Google Docs HTML export to our representation.
     *
     * @param input the HTML as exported by Google Drive
     * @return the cleaned-up HTML; {@code &lt;} and {@code &gt;} entities are
     *         kept escaped, all other entities are decoded
     */
    public String convert(String input) {
        String bodyHtml = extractBody(input);
        String styleSheet = extractCss(input);

        String out;

        // Must run before classes and spans are stripped, it relies on both.
        out = findItalicsAndBold(bodyHtml, styleSheet);

        out = removeEmptyATags(out);
        out = removeClasses(out);
        out = removeSpans(out);
        out = removeComments(out);
        out = removeInvalidContent(out);
        out = removeListAttributes(out);
        out = removeEmptyPTags(out);
        out = unescapeHtml(out);

        return out;
    }

    /**
     * Only interested in the stuff inside &lt;body&gt;&lt;/body&gt;.
     *
     * @return the body content, or the input unchanged if no body element is found
     */
    private String extractBody(String in) {
        Matcher m = BODY.matcher(in);

        if (m.find()) {
            return m.group(1);
        }
        return in;
    }

    /**
     * Extracts the content of the document's style element.
     *
     * @return the CSS text, or the input unchanged if no style element is found
     */
    private String extractCss(String in) {
        Matcher m = CSS.matcher(in);

        if (m.find()) {
            return m.group(1);
        }
        return in;
    }

    /**
     * Bold and italics are not marked with tags in GDocs, instead it is applied with CSS.
     * For instance:
     * .c1{font-weight:bold}
     * lalala &lt;span class="c1"&gt;bold&lt;/span&gt;
     *
     * The class names change each time, so we need to find them dynamically
     * and change the span to &lt;i&gt; or &lt;b&gt;. Every class carrying the style is
     * handled, not only the first one found in the CSS.
     */
    private String findItalicsAndBold(String body, String css) {
        String out = replaceStyledSpans(body, css, ITALIC_STYLE_NAME, "i");
        return replaceStyledSpans(out, css, BOLD_STYLE_NAME, "b");
    }

    /**
     * Replaces every span whose class is matched by {@code styleName} in the
     * CSS with the given tag, keeping the span's content.
     */
    private static String replaceStyledSpans(String body, String css, Pattern styleName, String tag) {
        Matcher styleMatcher = styleName.matcher(css);
        String out = body;

        while (styleMatcher.find()) {
            // Quote the class name so it is matched literally, never as regex syntax.
            String className = Pattern.quote(styleMatcher.group(1));
            Pattern styledSpan = Pattern.compile("<span class=\"" + className + "\">(.*?)</span>", Pattern.DOTALL);

            // $1 means what is matched inside the parentheses in the pattern
            out = styledSpan.matcher(out).replaceAll("<" + tag + ">$1</" + tag + ">");
        }
        return out;
    }

    /**
     * Remove &lt;a name=*&gt;&lt;/a&gt; stuff google inserts everywhere
     */
    private String removeEmptyATags(String in) {
        return A_TAGS.matcher(in).replaceAll("");
    }

    /**
     * Removes all class attributes.
     */
    private String removeClasses(String in) {
        return CLASSES.matcher(in).replaceAll("");
    }

    /**
     * Removes opening and closing span tags, keeping their content.
     */
    private String removeSpans(String in) {
        return SPANS.matcher(in).replaceAll("");
    }

    /**
     * In case someone likes to have much space between their paragraphs..
     */
    private String removeEmptyPTags(String in) {
        return EMPTY_P.matcher(in).replaceAll("");
    }

    /**
     * Comments inserted should be removed as they don't belong to the text.
     * A comment adds a &lt;sup&gt;-reference to the text, and then the comment
     * itself at the bottom.
     */
    private String removeComments(String in) {
        String out = INLINE_COMMENTS.matcher(in).replaceAll("");

        // Spaces inside a marked text are written as &nbsp;
        out = SPACES.matcher(out).replaceAll(" ");

        return COMMENTS.matcher(out).replaceAll("");
    }

    /**
     * Removes the start attribute from the lists
     */
    private String removeListAttributes(String in) {
        return LISTS.matcher(in).replaceAll("");
    }

    /**
     * Removes images and tables, should possibly remove more stuff
     * but try to keep the contents, not just the formatting.
     * Tables are dropped entirely, images are replaced by a single space.
     */
    private String removeInvalidContent(String in) {
        String out = TABLE.matcher(in).replaceAll("");
        return IMG.matcher(out).replaceAll(" ");
    }

    /**
     * Converts HTML entities to "normal characters", for instance
     * it converts &amp;aring; to the letter a-with-ring.
     *
     * But &amp;lt; (&lt;) and &amp;gt; (&gt;) are kept as entities, to avoid
     * &lt; and &gt; in the written text affecting our HTML.
     *
     * Curly double quotes (&amp;ldquo; / &amp;rdquo;) are converted to
     * left/right guillemets.
     */
    private String unescapeHtml(String in) {
        // Escape the ampersand of &lt; and &gt; once more; the decoding pass
        // below then turns "&amp;lt;" back into "&lt;" instead of a raw '<'.
        // This needs no placeholder characters that could collide with real text.
        String out = in.replace("&lt;", "&amp;lt;").replace("&gt;", "&amp;gt;");

        // Convert quotes to guillemets (written as escapes to be independent
        // of the source file encoding)
        out = out.replace("&ldquo;", "\u00AB");
        out = out.replace("&rdquo;", "\u00BB");

        // convert everything else
        return StringEscapeUtils.unescapeHtml4(out);
    }
}