// Java tutorial
/*
 * Copyright 2014 Studentmediene i Trondheim AS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package no.dusken.momus.service.drive;

import org.apache.commons.lang3.StringEscapeUtils;
import org.springframework.stereotype.Service;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Converts the HTML export of a Google Drive document to our own, much simpler,
 * HTML representation.
 *
 * <p>The conversion is a fixed pipeline of regular-expression rewrites, see
 * {@link #convert(String)} for the order in which they are applied.
 */
@Service
public class GoogleDocsTextConverter {

    // DOTALL so that extraction still works when the exported HTML contains line breaks.
    final Pattern body = Pattern.compile("<body.*?>(.*)</body>", Pattern.DOTALL);
    final Pattern css = Pattern.compile("<style type=\"text/css\">(.*)</style>", Pattern.DOTALL);

    // The class name is restricted to identifier characters. A looser "anything but {" capture
    // can start at a decimal point of the preceding rule (e.g. "line-height:1.15}.c3{...")
    // and swallow the tail of that rule into the "class name".
    final Pattern italicStyleName = Pattern.compile("\\.([\\w-]+)\\{font-style:italic\\}");
    final Pattern boldStyleName = Pattern.compile("\\.([\\w-]+)\\{font-weight:bold\\}");

    final Pattern aTags = Pattern.compile("<a[^>]*?></a>");
    final Pattern classes = Pattern.compile(" class=\".*?\"");
    final Pattern spans = Pattern.compile("</?span.*?>");
    final Pattern emptyP = Pattern.compile("<p>\\s?</p>");
    final Pattern inlineComments = Pattern.compile("<sup>.*?</sup>");
    // Non-breaking spaces, both as the HTML entity and as the raw character (U+00A0).
    final Pattern spaces = Pattern.compile("&nbsp;|\\u00A0");
    final Pattern comments = Pattern.compile("<div><p>.*?</p></div>");
    final Pattern lists = Pattern.compile(" start=\".*?\"");
    final Pattern table = Pattern.compile("<table[^>]*?>.*?</table>");
    final Pattern img = Pattern.compile("<img.*?>");

    // Placeholder characters that temporarily stand in for the &lt; / &gt; entities.
    // NOTE(review): code points 44000/44001 are ordinary letters, not reserved code points;
    // this assumes they never occur in real document text - confirm.
    final String ltUnicode = Character.toString((char) 44000);
    final String gtUnicode = Character.toString((char) 44001);
    final Pattern ltToUnicode = Pattern.compile("&lt;");
    final Pattern gtToUnicode = Pattern.compile("&gt;");
    final Pattern unicodeToLt = Pattern.compile(ltUnicode);
    final Pattern unicodeToGt = Pattern.compile(gtUnicode);

    /**
     * Runs the full conversion pipeline on a Google Docs HTML export.
     *
     * <p>Steps, in order: turn CSS-styled italic/bold spans into {@code <i>}/{@code <b>},
     * drop empty anchors, class attributes and remaining spans, drop comments, tables and
     * images, drop list {@code start} attributes and empty paragraphs, and finally
     * unescape HTML entities.
     *
     * @param input the complete HTML document; must not be {@code null}
     * @return the cleaned-up HTML fragment
     */
    public String convert(String input) {
        String bodyHtml = extractBody(input);
        String styleSheet = extractCss(input);

        String out = findItalicsAndBold(bodyHtml, styleSheet);
        out = removeEmptyATags(out);
        out = removeClasses(out);
        out = removeSpans(out);
        out = removeComments(out);
        out = removeInvalidContent(out);
        out = removeListAttributes(out);
        out = removeEmptyPTags(out);
        out = unescapeHtml(out);
        return out;
    }

    /**
     * Only interested in the stuff inside {@code <body></body>}.
     *
     * @return the body content, or the input unchanged when no body element is found
     */
    private String extractBody(String in) {
        Matcher m = body.matcher(in);
        return m.find() ? m.group(1) : in;
    }

    /**
     * Extracts the content of the {@code <style type="text/css">} element.
     *
     * @return the style sheet text, or the input unchanged when no style element is found
     */
    private String extractCss(String in) {
        Matcher m = css.matcher(in);
        return m.find() ? m.group(1) : in;
    }

    /**
     * Bold and italics are not marked with tags in GDocs, instead they are applied with CSS.
     * For instance:
     * <pre>
     * .c1{font-weight:bold}
     * lalala &lt;span class="c1"&gt;bold&lt;/span&gt;
     * </pre>
     *
     * The class names change each time, so we need to find them dynamically and change
     * the span to {@code <i>} or {@code <b>}. Italics are handled before bold.
     */
    private String findItalicsAndBold(String body, String css) {
        String out = wrapStyledSpans(body, italicStyleName.matcher(css), "i");
        return wrapStyledSpans(out, boldStyleName.matcher(css), "b");
    }

    /**
     * Replaces every {@code <span class="NAME">...</span>} with {@code <tag>...</tag>},
     * for each class NAME the given matcher finds in the style sheet.
     */
    private String wrapStyledSpans(String html, Matcher classNames, String tag) {
        String out = html;
        while (classNames.find()) {
            // Quote the class name so it can never be interpreted as regex syntax.
            Pattern styledSpan = Pattern.compile(
                    "<span class=\"" + Pattern.quote(classNames.group(1)) + "\">(.*?)</span>");
            // $1 is the span content captured by the parentheses in the pattern.
            out = styledSpan.matcher(out).replaceAll("<" + tag + ">$1</" + tag + ">");
        }
        return out;
    }

    /**
     * Removes the empty {@code <a name=*></a>} anchors Google inserts everywhere.
     */
    private String removeEmptyATags(String in) {
        return aTags.matcher(in).replaceAll("");
    }

    /** Removes all {@code class="..."} attributes. */
    private String removeClasses(String in) {
        return classes.matcher(in).replaceAll("");
    }

    /** Removes opening and closing span tags, keeping their content. */
    private String removeSpans(String in) {
        return spans.matcher(in).replaceAll("");
    }

    /**
     * In case someone likes to have much space between their paragraphs..
     */
    private String removeEmptyPTags(String in) {
        return emptyP.matcher(in).replaceAll("");
    }

    /**
     * Comments inserted should be removed as they don't belong to the text.
     * A comment adds a {@code <sup>} reference to the text, and then the comment
     * itself at the bottom.
     */
    private String removeComments(String in) {
        String out = inlineComments.matcher(in).replaceAll("");

        // Spaces inside a marked text are written as non-breaking spaces; make them
        // ordinary spaces again.
        out = spaces.matcher(out).replaceAll(" ");

        return comments.matcher(out).replaceAll("");
    }

    /**
     * Removes the {@code start="..."} attribute from lists.
     */
    private String removeListAttributes(String in) {
        return lists.matcher(in).replaceAll("");
    }

    /**
     * Removes images and tables, should possibly remove more stuff
     * but try to keep the contents, not just the formatting.
     * Tables are dropped entirely, each image is replaced by a single space.
     */
    private String removeInvalidContent(String in) {
        String out = table.matcher(in).replaceAll("");
        return img.matcher(out).replaceAll(" ");
    }

    /**
     * Converts HTML entities to "normal characters", for instance the entity for
     * a-with-ring becomes the actual letter.
     *
     * <p>The entities for {@code <} and {@code >} are deliberately left escaped, so that
     * less-than and greater-than signs in the written text can never affect our HTML.
     * Curly double quotes are converted to guillemets.
     */
    private String unescapeHtml(String in) {
        // Hide the lt/gt entities behind placeholders so the unescaping leaves them alone.
        String out = ltToUnicode.matcher(in).replaceAll(ltUnicode);
        out = gtToUnicode.matcher(out).replaceAll(gtUnicode);

        // Convert all remaining entities.
        out = StringEscapeUtils.unescapeHtml4(out);

        // Convert curly double quotes to guillemets. Done after unescaping so that the
        // named entity, numeric references and the raw characters are all covered.
        out = out.replace('\u201C', '\u00AB').replace('\u201D', '\u00BB');

        // Put the lt/gt entities back, still escaped.
        out = unicodeToLt.matcher(out).replaceAll("&lt;");
        out = unicodeToGt.matcher(out).replaceAll("&gt;");
        return out;
    }
}