Java tutorial
/*
 * Copyright 2011 OverZealous Creations, LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package by.heap.remark.convert;

import by.heap.remark.Options;
import by.heap.remark.util.StringUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;

import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This class is used to clean up plain text fields based on the selected set of options.
 * It optionally escapes certain special characters, as well as replacing various
 * HTML and Unicode entities with their plaintext equivalents.
 *
 * @author Phil DeJarnett
 */
public class TextCleaner {

    /**
     * Simple holder pairing a compiled escape regex with its replacement string.
     * Static because it never needs access to the enclosing {@code TextCleaner}.
     */
    private static final class Escape {
        final Pattern pattern;
        final String replacement;

        Escape(String pattern, String replacement) {
            this.pattern = Pattern.compile(pattern);
            this.replacement = replacement;
        }
    }

    /**
     * Maps a matched token (a full HTML entity such as {@code &amp;}, or a single
     * Unicode character) to the plain text that replaces it.
     */
    private Map<String, String> replacements;

    /** Compiled entity replacement pattern. */
    private Pattern entityReplacementsPattern;

    /** Compiled unicode replacement pattern; {@code null} when no Unicode reversal is enabled. */
    private Pattern unicodeReplacementsPattern = null;

    /** Escapes applied, in order, to normal (non-code) text. */
    private List<Escape> escapes;

    /** Pattern that matches an escaped leading character, used to undo that escape. */
    private Pattern unescapeLeadingChars;

    /** Matches strings that consist solely of (one or more) whitespace characters. */
    private static final Pattern EMPTY_MATCHER = Pattern.compile("\\s+", Pattern.DOTALL);

    /** Matches one or more runs of optional whitespace followed by a line break. */
    private static final Pattern LINEBREAK_REMOVER = Pattern.compile("(\\s*\\r?+\\n)+");

    /** Matches the characters that are percent-encoded in URLs: parentheses and spaces. */
    private static final Pattern URL_CLEANER = Pattern.compile("([\\(\\) ])");

    /**
     * Create a new TextCleaner based on the configured options.
     *
     * @param options Options that will affect what is cleaned.
     */
    public TextCleaner(Options options) {
        setupReplacements(options);
        setupEscapes(options);
    }

    /**
     * Configures the basic replacements based on the configured options.
     *
     * @param options Options that will affect what is replaced.
     */
    @SuppressWarnings({ "OverlyLongMethod" })
    private void setupReplacements(Options options) {
        this.replacements = new HashMap<String, String>();

        // Build the entity regex. The first alternative is a special case for
        // double-encoded HTML entities (e.g. "&amp;lt;"); the second is the list of
        // known entity names, each appended by addRepl() followed by a '|'.
        StringBuilder entities = new StringBuilder();
        entities.append("&(?>amp;([#a-z0-9]++;)|(?>");

        addRepl(entities, "&amp;", "&");
        addRepl(entities, "&lt;", "<");
        addRepl(entities, "&gt;", ">");
        addRepl(entities, "&quot;", "\"");
        if (options.reverseHtmlSmartQuotes) {
            addRepl(entities, "&ldquo;", "\"");
            addRepl(entities, "&rdquo;", "\"");
            addRepl(entities, "&lsquo;", "\'");
            addRepl(entities, "&rsquo;", "\'");
            addRepl(entities, "&apos;", "\'");
            addRepl(entities, "&laquo;", "<<");
            addRepl(entities, "&raquo;", ">>");
        }
        if (options.reverseHtmlSmartPunctuation) {
            addRepl(entities, "&ndash;", "--");
            addRepl(entities, "&mdash;", "---");
            addRepl(entities, "&hellip;", "...");
        }

        // Swap the trailing '|' left by the last addRepl() for the closing of both groups.
        entities.replace(entities.length() - 1, entities.length(), ");)");

        entityReplacementsPattern = Pattern.compile(entities.toString(), Pattern.CASE_INSENSITIVE);

        if (options.reverseUnicodeSmartPunctuation || options.reverseUnicodeSmartQuotes) {
            // A single character class; the characters are quoted so none is treated specially.
            StringBuilder unicode = new StringBuilder("[\\Q");
            if (options.reverseUnicodeSmartQuotes) {
                addRepl(unicode, "\u201c", "\""); // left double quote
                addRepl(unicode, "\u201d", "\""); // right double quote
                addRepl(unicode, "\u2018", "\'"); // left single quote
                addRepl(unicode, "\u2019", "\'"); // right single quote
                addRepl(unicode, "\u00ab", "<<"); // left angle quote
                addRepl(unicode, "\u00bb", ">>"); // right angle quote
            }
            if (options.reverseUnicodeSmartPunctuation) {
                addRepl(unicode, "\u2013", "--");  // en-dash
                addRepl(unicode, "\u2014", "---"); // em-dash
                addRepl(unicode, "\u2026", "..."); // ellipsis
            }
            unicode.append("\\E]");
            unicodeReplacementsPattern = Pattern.compile(unicode.toString());
        }
    }

    /**
     * Registers a replacement and appends the matching fragment to the regex being built.
     * An entity (anything starting with {@code '&'}) contributes its bare name, without the
     * leading {@code '&'} and trailing {@code ';'}, followed by {@code '|'}. Anything else is
     * appended verbatim.
     *
     * @param regex       A character buffer to append the replacement to
     * @param original    Original character or string.
     * @param replacement Replacement character or string.
     */
    private void addRepl(StringBuilder regex, String original, String replacement) {
        replacements.put(original, replacement);
        if (original.charAt(0) == '&') {
            // add entity
            regex.append(original.substring(1, original.length() - 1));
            regex.append('|');
        } else {
            // add single character
            regex.append(original);
        }
    }

    /**
     * Configures the basic escapes based on the configured options.
     *
     * @param options Options that will affect what is escaped.
     */
    private void setupEscapes(Options options) {
        escapes = new ArrayList<Escape>();

        // Confusingly, this replaces single backslashes with double backslashes:
        // the regex matches one literal backslash, the replacement emits two.
        escapes.add(new Escape("\\\\", "\\\\\\\\"));

        // Characters that are always escaped. They are wrapped in \Q...\E so that
        // none of them is treated as a regex metacharacter.
        StringBuilder chars = new StringBuilder("([\\Q`*_{}[]#");
        if (options.tables.isConvertedToText() && !options.tables.isRenderedAsCode()) {
            chars.append('|');
        }
        chars.append("\\E])");
        escapes.add(new Escape(chars.toString(), "\\\\$1"));

        // Characters that are escaped only when they lead the text
        // (optionally preceded by a single space).
        StringBuilder leadingSet = new StringBuilder("([\\Q-+");
        if (options.definitionLists) {
            leadingSet.append(':');
        }
        leadingSet.append("\\E])");

        String leadingSpace = "^( ?+)";
        escapes.add(new Escape(leadingSpace + leadingSet, "$1\\\\$2"));

        // The reverser is the same pattern with a literal backslash required
        // between the optional space and the escaped character.
        unescapeLeadingChars = Pattern.compile(leadingSpace + "\\\\" + leadingSet);
    }

    /**
     * Clean the given input text based on the original configuration Options.
     * Newlines are also replaced with a single space.
     *
     * @param input The text to be cleaned. Can be any object. JSoup nodes are handled specially.
     * @return The cleaned text.
     */
    public String clean(Object input) {
        return clean(input, true);
    }

    /**
     * Clean the given input text based on the original configuration Options.
     * The text is treated as code, so it is not escaped, and newlines are preserved.
     *
     * @param input The text to be cleaned. Can be any object. JSoup nodes are handled specially.
     * @return The cleaned text.
     */
    public String cleanCode(Object input) {
        return clean(input, false);
    }

    /**
     * Clean the given input text based on the original configuration Options.
     * Optionally, don't escape special characters.
     *
     * @param oinput     The text to be cleaned. Can be any object. JSoup nodes are handled
     *                   specially. A {@code null} input is treated as empty text.
     * @param normalText If false, don't escape special characters. This is usually only used for
     *                   inline code or code blocks, because they don't need to be escaped.
     * @return The cleaned text.
     */
    private String clean(Object oinput, boolean normalText) {
        if (oinput == null) {
            return "";
        }

        String input;
        if (oinput instanceof TextNode) {
            input = getTextNodeText((TextNode) oinput, normalText);
        } else if (oinput instanceof Element) {
            if (normalText) {
                input = ((Element) oinput).text();
            } else {
                input = getPreformattedText((Element) oinput);
            }
        } else {
            input = oinput.toString();
        }

        if (input.length() == 0) {
            // nothing to clean, so just return an empty string.
            return "";
        }

        if (!normalText) {
            // We have to revert ALL HTML entities for code, because they will end up
            // double-encoded by markdown. We also don't need to worry about escaping anything.
            // Note: &apos; is replaced by hand because unescapeHtml4 does not handle it.
            return StringEscapeUtils.unescapeHtml4(input.replace("&apos;", "'"));
        }

        // For non-code text, newlines are _never_ allowed.
        // Replace one or more sets of whitespace chars followed by a newline with a single space.
        input = LINEBREAK_REMOVER.matcher(input).replaceAll(" ");

        // now escape special characters.
        for (final Escape rep : escapes) {
            input = rep.pattern.matcher(input).replaceAll(rep.replacement);
        }

        StringBuffer output = doReplacements(input, entityReplacementsPattern);
        if (unicodeReplacementsPattern != null) {
            output = doReplacements(output, unicodeReplacementsPattern);
        }
        return output.toString();
    }

    /**
     * Replaces all {@code <br/>} tags with a newline in a copy of the input node, and
     * returns the resulting inner text.
     * This is necessary to ensure that manual linebreaks are supported in preformatted code.
     *
     * @param oinput Preformatted node to process
     * @return inner text of the node.
     */
    private String getPreformattedText(Element oinput) {
        Element el = oinput.clone();
        fixLineBreaks(el);
        return el.text();
    }

    /**
     * Recursively processes the element to replace {@code <br>}'s with {@code \n}.
     *
     * @param el Element whose descendants are rewritten in place.
     */
    private void fixLineBreaks(Element el) {
        for (final Element e : el.children()) {
            if (e.tagName().equals("br")) {
                e.before("\n");
                e.remove();
            } else {
                fixLineBreaks(e);
            }
        }
    }

    /**
     * Handles running the regex-based replacements in the input.
     *
     * @param input String to process
     * @param regex Pattern to use
     * @return cleaned up input string
     */
    private StringBuffer doReplacements(CharSequence input, Pattern regex) {
        StringBuffer output = new StringBuffer();
        Matcher m = regex.matcher(input);
        while (m.find()) {
            String repString;
            // if we have a hard match, do a simple replacement.
            String mapped = replacements.get(m.group().toLowerCase(Locale.ENGLISH));
            if (mapped != null) {
                // quoted so the table value is always inserted literally
                repString = Matcher.quoteReplacement(mapped);
            } else {
                // special case for escaped HTML entities: emit a backslash, the
                // ampersand, and the captured inner entity (group 1).
                repString = "\\\\&$1";
            }
            m.appendReplacement(output, repString);
        }
        m.appendTail(output);
        return output;
    }

    /**
     * Method to clean inline code, and, if necessary, add spaces to make sure that internal,
     * leading, or trailing {@code '`'} characters don't break the inline code.
     * Newlines are also replaced with spaces.
     *
     * This method also adds the leading and trailing backtick delimiter, which is made one
     * backtick longer than the longest run of backticks inside the code.
     *
     * @param input String to clean. Can be any object. JSoup nodes are handled specially.
     * @return The cleaned text.
     */
    public String cleanInlineCode(Object input) {
        String output = clean(input, false).replace('\n', ' ');
        if (output.indexOf('`') == -1) {
            return "`" + output + "`";
        }
        // output is known to be non-empty here, because it contains a backtick.
        String prepend = (output.charAt(0) == '`') ? " " : "";
        String append = (output.charAt(output.length() - 1) == '`') ? " " : "";
        String delim = getDelimiter(output);
        return delim + prepend + output + append + delim;
    }

    /**
     * Removes the escaping on leading characters, for example, when they are going to be
     * rendered inside another node, such as a table.
     *
     * @param input String to process
     * @return Cleaned string.
     */
    public String unescapeLeadingCharacters(String input) {
        // removes any leading escapes...
        return unescapeLeadingChars.matcher(input).replaceAll("$1$2");
    }

    /**
     * Handles escaping special characters in URLs to avoid issues when they are rendered out
     * (ie: spaces, parentheses).
     *
     * @param input URL to process
     * @return Cleaned URL
     */
    public String cleanUrl(String input) {
        StringBuffer output = new StringBuffer();
        Matcher m = URL_CLEANER.matcher(input);
        while (m.find()) {
            char c = m.group().charAt(0);
            // percent-encode the character as a two-digit lowercase hex code
            m.appendReplacement(output, String.format("%%%02x", (int) c));
        }
        m.appendTail(output);
        return output.toString();
    }

    /**
     * Computes the backtick delimiter needed to wrap the given inline code.
     *
     * @param input Code that will be wrapped.
     * @return A run of backticks one longer than the longest run found in {@code input}.
     */
    String getDelimiter(String input) {
        int max = 0;
        int counter = 0;
        for (int i = 0; i < input.length(); i++) {
            if (input.charAt(i) == '`') {
                counter++;
            } else {
                max = Math.max(max, counter);
                counter = 0;
            }
        }
        // check in case the last tick was at the end.
        max = Math.max(max, counter);
        return StringUtils.multiply('`', max + 1);
    }

    /**
     * Extracts the text of a text node, trimming leading and/or trailing whitespace when the
     * node sits at a block boundary or, for normal text, next to a whitespace-only text node.
     *
     * @param tn         Text node to read.
     * @param normalText If true the node's {@code text()} is used, otherwise its
     *                   {@code getWholeText()}.
     * @return The node text, trimmed as described.
     */
    private String getTextNodeText(TextNode tn, boolean normalText) {
        String input = normalText ? tn.text() : tn.getWholeText();
        Node prev = tn.previousSibling();
        Node next = tn.nextSibling();
        boolean parentIsBlock = isBlock(tn.parent());

        if (isBlock(prev) || (prev == null && parentIsBlock)) {
            input = ltrim(input);
        } else if (normalText && prev instanceof TextNode) {
            if (EMPTY_MATCHER.matcher(((TextNode) prev).text()).matches()) {
                input = ltrim(input);
            }
        }

        if (input.length() > 0) {
            if (isBlock(next) || (next == null && parentIsBlock)) {
                input = rtrim(input);
            } else if (normalText && next instanceof TextNode) {
                if (EMPTY_MATCHER.matcher(((TextNode) next).text()).matches()) {
                    input = rtrim(input);
                }
            }
        }
        return input;
    }

    /**
     * Tests whether a node is a block-level element or a {@code <br>}.
     *
     * @param n Node to test; may be {@code null}.
     * @return true if {@code n} is an Element that is a block or a line break.
     */
    private boolean isBlock(Node n) {
        if (!(n instanceof Element)) {
            // also covers null
            return false;
        }
        Element el = (Element) n;
        return el.isBlock() || el.tagName().equals("br");
    }

    /**
     * Tests a character against the set matched by the regex class {@code \s} in its default
     * (non-Unicode) mode: space, tab, line feed, vertical tab, form feed and carriage return.
     *
     * @param c Character to test.
     * @return true if the character is one of those six whitespace characters.
     */
    private static boolean isRegexWhitespace(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == 0x0B || c == '\f' || c == '\r';
    }

    /**
     * Removes leading whitespace.
     *
     * @param s String to trim.
     * @return {@code s} without its leading whitespace characters.
     */
    private String ltrim(String s) {
        int start = 0;
        while (start < s.length() && isRegexWhitespace(s.charAt(start))) {
            start++;
        }
        return s.substring(start);
    }

    /**
     * Removes trailing whitespace.
     *
     * @param s String to trim.
     * @return {@code s} without its trailing whitespace characters.
     */
    private String rtrim(String s) {
        int end = s.length();
        while (end > 0 && isRegexWhitespace(s.charAt(end - 1))) {
            end--;
        }
        return s.substring(0, end);
    }
}