// Java tutorial
/* * Copyright (c) 2006-2013 by Public Library of Science * * http://plos.org * http://ambraproject.org * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.ambraproject.util; import java.io.StringReader; import java.io.StringWriter; import java.net.MalformedURLException; import java.net.URI; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.regex.Pattern; import java.util.List; import java.util.ArrayList; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.commons.lang.StringEscapeUtils; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import com.opensymphony.util.UrlUtils; import org.xml.sax.InputSource; import sun.misc.BASE64Encoder; /** * Provides some useful text manipulation functions. 
*/ public class TextUtils { public static final String HTTP_PREFIX = "http://"; private static final Pattern maliciousContentPattern = Pattern.compile("[<>\"\'%;()&+]"); private static final Pattern lineBreakPattern = Pattern.compile("\\p{Zl}|\r\n|\n|\u0085|\\p{Zp}"); private static final Pattern strongPattern = Pattern.compile("'''"); private static final Pattern emphasizePattern = Pattern.compile("''"); private static final Pattern strongEmphasizePattern = Pattern.compile("'''''"); private static final Pattern superscriptPattern = Pattern.compile("\\^\\^"); private static final Pattern subscriptPattern = Pattern.compile("~~"); private static Logger log = LoggerFactory.getLogger(TextUtils.class); /** * Create a hash of a string * * @param string the string to make the hash * * @return the hash of the string */ public static String createHash(String string) { return createHash(string.getBytes()); } /** * Create a hash of a byte array * * @param bytes * * @return the hash of the byte array */ public static String createHash(byte[] bytes) { try { MessageDigest messageDigest = MessageDigest.getInstance("SHA-1"); messageDigest.update(bytes); return encodeText(messageDigest.digest()); } catch (NoSuchAlgorithmException ex) { throw new RuntimeException(ex); } } /** * Produces a String value suitable for rendering in HTML for the given binary data. */ private static String encodeText(byte[] data) { BASE64Encoder encoder = new BASE64Encoder(); String base64 = encoder.encodeBuffer(data); // Make the returned value a little prettier by replacing slashes with underscores, and removing the trailing // "=". 
base64 = base64.replace('/', '_').trim(); return base64.substring(0, base64.length() - 1); } /** * Convert a List of URIs to a List of Strings * @param list a List of URIs * @return a list of strings */ public static List<String> toStringList(List<URI> list) { List<String> simpleCollection = new ArrayList<String>(); for (URI uri : list) { simpleCollection.add(uri.toString()); } return simpleCollection; } /** * Takes in a String and returns it with all line separators replaced by <br/> tags suitable * for display as HTML. * * @param input HTML * @return String with line separators replaced with <br/> */ public static String makeHtmlLineBreaks(final String input) { if (StringUtils.isBlank(input)) { return input; } return lineBreakPattern.matcher(input).replaceAll("<br/>"); } /** * Takes in a String and returns it with all pairs of <code>'''</code> * replaced by \<strong\>\</strong\> tags suitable for display as HTML. * For example: <code>foo '''bar''' baz</code> is transformed into * <code>foo \<strong\>bar\</strong\> baz</code> * <p/> * The <code>strong</code> tag is used instead of the <code>b</code> tag * because <code>strong</code> is preferred for CSS styling. * <p/> * There is no check for consistency of markup pairs (e.g., <code>foo ''bar''' baz</code> will become * <code>foo ''bar\<strong\> baz</code>) which will, rightfully, infuriate some users. * * @param input HTML * @return String with all pairs of <code>'''</code> replaced by \<strong\>\</strong\> tags */ public static String makeHtmlStrong(final String input) { // If no Pattern in "input" parameter, then do nothing. if (StringUtils.isBlank(input) || (!strongPattern.matcher(input).find())) { return input; } String transformedInput = input; // This will be the String that gets returned. boolean isInsideATag = false; // Whether an open tag was the most recent substitution. // While there is Pattern in "input" parameter, replace each instance of Pattern with // either an open or close tag. 
Alternate the tag substituted to give tag pairs. while (strongPattern.matcher(transformedInput).find()) { if (!isInsideATag) { transformedInput = strongPattern.matcher(transformedInput).replaceFirst("<strong>"); isInsideATag = true; } else { transformedInput = strongPattern.matcher(transformedInput).replaceFirst("</strong>"); isInsideATag = false; } } return transformedInput; } /** * Takes in a String and returns it with all pairs of <code>''</code> replaced by \<em\>\</em\> * tags suitable for display as HTML. * <p/> * For example: <code>foo ''bar'' baz</code> is transformed into <code>foo \<em\>bar\</em\> baz</code> * <p/> * The <code>em</code> tag is used instead of the <code>i</code> tag * because <code>em</code> is preferred for CSS styling. * <p/> * There is no check for consistency of markup pairs (e.g., <code>foo 'bar'' baz</code> will become * <code>foo 'bar\<em\> baz</code>) which will, rightfully, infuriate some users. * * @param input HTML * @return String with all pairs of <code>''</code> replaced by \<em\>\</em\> tags */ public static String makeHtmlEmphasized(final String input) { // If no Pattern in "input" parameter, then do nothing. if (StringUtils.isBlank(input) || (!emphasizePattern.matcher(input).find())) { return input; } String transformedInput = input; // This will be the String that gets returned. boolean isInsideATag = false; // Whether an open tag was the most recent substitution. // While there is Pattern in "input" parameter, replace each instance of Pattern with // either an open or close tag. Alternate the tag substituted to give tag pairs. 
while (emphasizePattern.matcher(transformedInput).find()) { if (!isInsideATag) { transformedInput = emphasizePattern.matcher(transformedInput).replaceFirst("<em>"); isInsideATag = true; } else { transformedInput = emphasizePattern.matcher(transformedInput).replaceFirst("</em>"); isInsideATag = false; } } return transformedInput; } /** * Takes in a String and returns it with all pairs of <code>'''''</code> replaced by * \<strong\>\<em\>\</em\>\</strong\> tags suitable for display as HTML. * <p/> * For example: <code>foo '''''bar''''' baz</code> is * transformed into <code>foo \<strong\>\<em\>bar\</em\>\</strong\> baz</code> * <p/> * The <code>em</code> tag is used instead of the <code>i</code> tag * because <code>em</code> is preferred for CSS styling. * The <code>strong</code> tag is used instead of the <code>b</code> tag * because <code>strong</code> is preferred for CSS styling. * <p/> * There is no check for consistency of markup pairs (e.g., <code>foo 'bar''''' baz</code> will become * <code>foo 'bar\<strong\>\<em\> baz</code>) which will, rightfully, infuriate some users. * * @param input HTML * @return String with all pairs of <code>'''''</code> replaced by \<strong\>\<em\>\</em\>\</strong\> tags */ public static String makeHtmlStrongEmphasized(final String input) { // If no Pattern in "input" parameter, then do nothing. if (StringUtils.isBlank(input) || (!strongEmphasizePattern.matcher(input).find())) { return input; } String transformedInput = input; // This will be the String that gets returned. boolean isInsideATag = false; // Whether an open tag was the most recent substitution. // While there is Pattern in "input" parameter, replace each instance of Pattern with // either an open or close tag. Alternate the tag substituted to give tag pairs. 
while (strongEmphasizePattern.matcher(transformedInput).find()) { if (!isInsideATag) { transformedInput = strongEmphasizePattern.matcher(transformedInput).replaceFirst("<strong><em>"); isInsideATag = true; } else { transformedInput = strongEmphasizePattern.matcher(transformedInput).replaceFirst("</em></strong>"); isInsideATag = false; } } return transformedInput; } /** * Takes in a String and returns it with all pairs of <code>^^</code> replaced by \<sup\>\</sup\> * tags suitable for display as HTML. * <p/> * For example: <code>foo ^^bar^^ baz</code> is transformed into * <code>foo \<sup\>bar\</sup\> baz</code> * <p/> * There is no check for consistency of markup pairs (e.g., <code>foo ^bar^^ baz</code> will become * <code>foo ^bar\<sup\> baz</code>) which will, rightfully, infuriate some users. * * @param input HTML * @return String with all pairs of <code>^^</code> replaced by \<sup\>\</sup\> tags */ public static String makeHtmlSuperscript(final String input) { // If no Pattern in "input" parameter, then do nothing. if (StringUtils.isBlank(input) || (!superscriptPattern.matcher(input).find())) { return input; } String transformedInput = input; // This will be the String that gets returned. boolean isInsideATag = false; // Whether an open tag was the most recent substitution. // While there is Pattern in "input" parameter, replace each instance of Pattern with // either an open or close tag. Alternate the tag substituted to give tag pairs. while (superscriptPattern.matcher(transformedInput).find()) { if (!isInsideATag) { transformedInput = superscriptPattern.matcher(transformedInput).replaceFirst("<sup>"); isInsideATag = true; } else { transformedInput = superscriptPattern.matcher(transformedInput).replaceFirst("</sup>"); isInsideATag = false; } } return transformedInput; } /** * Takes in a String and returns it with all pairs of <code>~~</code> replaced by \<sub\>\</sub\> * tags suitable for display as HTML. 
* <p/> * For example: <code>foo ~~bar~~ baz</code> is transformed into * <code>foo \<sub\>bar\</sub\> baz</code> * <p/> * There is no check for consistency of markup pairs (e.g., <code>foo ~bar~~ baz</code> will become * <code>foo ~bar\<sub\> baz</code>) which will, rightfully, infuriate some users. * * @param input HTML * @return String with all pairs of <code>~~</code> replaced by \<sub\>\</sub\> tags */ public static String makeHtmlSubscript(final String input) { // If no Pattern in "input" parameter, then do nothing. if (StringUtils.isBlank(input) || (!subscriptPattern.matcher(input).find())) { return input; } String transformedInput = input; // This will be the String that gets returned. boolean isInsideATag = false; // Whether an open tag was the most recent substitution. // While there is Pattern in "input" parameter, replace each instance of Pattern with // either an open or close tag. Alternate the tag substituted to give tag pairs. while (subscriptPattern.matcher(transformedInput).find()) { if (!isInsideATag) { transformedInput = subscriptPattern.matcher(transformedInput).replaceFirst("<sub>"); isInsideATag = true; } else { transformedInput = subscriptPattern.matcher(transformedInput).replaceFirst("</sub>"); isInsideATag = false; } } return transformedInput; } /** * Linkify any possible web links excepting email addresses and enclosed with <p> tags * @param text text * @param maxLength The max length (in displayed characters) of the text to be displayed inside the <a>tag</a> * @return hyperlinked text */ public static String hyperlinkEnclosedWithPTags(final String text, int maxLength) { final StringBuilder retStr = new StringBuilder("<p>"); retStr.append(hyperlink(text, maxLength)); retStr.append("</p>"); return (retStr.toString()); } /** * Linkify any possible web links excepting email addresses and enclosed with <p> tags * @param text text * @return hyperlinked text */ public static String hyperlinkEnclosedWithPTags(final String text) { return 
hyperlinkEnclosedWithPTags(text, 0); } /** * Linkify any possible web links excepting email addresses * * @param text text * @param maxLength The max length (in displayed characters) of the text to be displayed * inside the <a>tag</a> * @return hyperlinked text */ public static String hyperlink(final String text, int maxLength) { if (StringUtils.isBlank(text)) { return text; } /* * HACK: [issue - if the text ends with ')' this is included in the hyperlink] * so to avoid this we explicitly guard against it here * NOTE: com.opensymphony.util.TextUtils.linkURL guards against an atomically wrapped url: * "(http://www.domain.com)" but NOT "(see http://www.domain.com)" */ if (text.indexOf('}') >= 0 || text.indexOf('{') >= 0) { return linkURL(text, null, maxLength); } String s = text.replace('(', '{'); s = s.replace(')', '}'); s = linkURL(s, null, maxLength); s = StringUtils.replace(s, "{", "("); s = StringUtils.replace(s, "}", ")"); return s; // END HACK } /** * Linkify any possible web links excepting email addresses * * @param text text * @return hyperlinked text */ public static String hyperlink(final String text) { return hyperlink(text, 0); } /** * Return the escaped html. Useful when you want to make any dangerous scripts safe to render. * <p/> * Also transforms wiki-type markup into HTML tags and replaces line breaks with HTML "break" tags. 
* * @param bodyContent bodyContent * @return escaped html text */ public static String escapeHtml(final String bodyContent) { String transformedBodyContent = makeHtmlLineBreaks(StringEscapeUtils.escapeHtml(bodyContent)); // The order of these three methods is important; we have to transform all instances of // ''''' before trying to match instances of ''' or '' transformedBodyContent = makeHtmlStrongEmphasized(transformedBodyContent); // matches ''''' transformedBodyContent = makeHtmlStrong(transformedBodyContent); // matches ''' transformedBodyContent = makeHtmlEmphasized(transformedBodyContent); // matches '' transformedBodyContent = makeHtmlSuperscript(transformedBodyContent); // matches ^^ transformedBodyContent = makeHtmlSubscript(transformedBodyContent); // matches ~~ return transformedBodyContent; } /** * @param bodyContent bodyContent * @return Return escaped and hyperlinked text */ public static String escapeAndHyperlink(final String bodyContent) { return hyperlinkEnclosedWithPTags(escapeHtml(bodyContent), 0); } /** * Transforms an org.w3c.dom.Document into a String * * @param node Document to transform * @return String representation of node * @throws TransformerException TransformerException */ public static String getAsXMLString(final Node node) throws TransformerException { final Transformer tf = TransformerFactory.newInstance().newTransformer(); final StringWriter stringWriter = new StringWriter(); tf.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); tf.transform(new DOMSource(node), new StreamResult(stringWriter)); return stringWriter.toString(); } /** * @param url A URL * @return whether the url is a valid address */ public static boolean verifyUrl(final String url) { try { URI u = new URI(url); // To see if we can get a valid url or if we get an exception u.toURL(); return true; } catch (Exception e) { return false; } } /** * Make a valid url from the given input url or url fragment * @param url url * @return valid url * @throws 
MalformedURLException MalformedURLException */ public static String makeValidUrl(final String url) throws MalformedURLException { String finalUrl = url; if (!verifyUrl(finalUrl)) { finalUrl = HTTP_PREFIX + finalUrl; if (!verifyUrl(finalUrl)) { throw new MalformedURLException("Invalid url:" + url); } } return finalUrl; } /** * Check if the input text is potentially malicious. For more details read; * http://www.dwheeler.com/secure-programs/Secure-Programs-HOWTO/cross-site-malicious-content.html * @param text text * @return boolean */ public static boolean isPotentiallyMalicious(final String text) { return maliciousContentPattern.matcher(text).find(); } /** * Escape html entity characters and high characters (eg "curvy" Word quotes). * Note this method can also be used to encode XML. * * @param s the String to escape. * @param encodeSpecialChars if true high characters will be encode other wise not. * @return the escaped string */ private static String htmlEncode(String s, boolean encodeSpecialChars) { s = noNull(s, ""); StringBuilder str = new StringBuilder(); for (int j = 0; j < s.length(); j++) { char c = s.charAt(j); // encode standard ASCII characters into HTML entities where needed if (c < '\200') { switch (c) { case '"': str.append("""); break; case '&': str.append("&"); break; case '<': str.append("<"); break; case '>': str.append(">"); break; default: str.append(c); } } // encode 'ugly' characters (ie Word "curvy" quotes etc) else if (encodeSpecialChars && (c < '\377')) { String hexChars = "0123456789ABCDEF"; int a = c % 16; int b = (c - a) / 16; str.append("&#x").append(hexChars.charAt(b)).append(hexChars.charAt(a)).append(';'); } //add other characters back in - to handle charactersets //other than ascii else { str.append(c); } } return str.toString(); } /** * Wrap all urls ('abc://' and 'www.abc') in specified string with href tags. * Any text after the length defined by the maxDisplayLength parameter will be dropped and three periods will be added "..." 
* * @param str The block of text to check. * @param target The target to use for the href (optional). * @param maxDisplayLength The max length (in displayed characters) of the text to be displayed inside the <a>tag</a> * @return String The block of text with all url's placed in href tags. */ //TODO: If openSymphony's implemntation of this method one day mactches this, we can remove this class private static String linkURL(String str, String target, int maxDisplayLength) { StringBuilder sb = new StringBuilder((int) (str.length() * 1.05)); sb.append(str); linkURL(sb, target, maxDisplayLength); return sb.toString(); } /** * Return <code>string</code>, or <code>defaultString</code> if * <code>string</code> is <code>null</code> or <code>""</code>. * Never returns <code>null</code>. * * <p>Examples:</p> * <pre> * // prints "hello" * String s=null; * System.out.println(TextUtils.noNull(s,"hello"); * * // prints "hello" * s=""; * System.out.println(TextUtils.noNull(s,"hello"); * * // prints "world" * s="world"; * System.out.println(TextUtils.noNull(s, "hello"); * </pre> * * @param string the String to check. * @param defaultString The default string to return if <code>string</code> is <code>null</code> or <code>""</code> * @return <code>string</code> if <code>string</code> is non-empty, and <code>defaultString</code> otherwise * @see #stringSet(String) */ private static String noNull(String string, String defaultString) { return (stringSet(string)) ? string : defaultString; } /** * Check whether <code>string</code> has been set to * something other than <code>""</code> or <code>null</code>. 
* @param string the <code>String</code> to check * @return a boolean indicating whether the string was non-empty (and non-null) */ private static boolean stringSet(String string) { return (string != null) && !"".equals(string); } /** * Get the starting index of a URL (either 'abc://' or 'www.') * @param str String builder * @param startIndex index * @return new index */ private static int getStartUrl(StringBuilder str, int startIndex) { int schemeIndex = getSchemeIndex(str, startIndex); final int wwwIndex = str.indexOf("www.", startIndex + 1); if ((schemeIndex == -1) && (wwwIndex == -1)) { return -1; } else if (schemeIndex == -1) { return wwwIndex; } else if (wwwIndex == -1) { return schemeIndex; } return Math.min(schemeIndex, wwwIndex); } private static void linkURL(StringBuilder str, String target, int maxDisplayLength) { String urlToDisplay; int lastEndIndex = -1; //Stores the index position, within the whole string, of the ending char of the last URL found. String targetString = ((target == null) || (target.trim().length() == 0)) ? "" : (" target=\"" + target.trim() + '\"'); while (true) { int linkStartIndex = getStartUrl(str, lastEndIndex); //if no more links found - then end the loop if (linkStartIndex == -1) { break; } else { //Get the whole URL... //We move forward and add each character to the URL string until we encounter //an invalid URL character (we assume that the URL ends there). int linkEndIndex = linkStartIndex; String urlStr = ""; while (true) { // if char at linkEndIndex is '&' then we look at the next 4 chars // to see if they make up "&" altogether. This is the html coded // '&' and will pretty much stuff up an otherwise valid link becos of the ';'. // We therefore have to remove it before proceeding... 
if (str.charAt(linkEndIndex) == '&') { if (((linkEndIndex + 6) <= str.length()) && """.equals(str.substring(linkEndIndex, linkEndIndex + 6))) { break; } else if (((linkEndIndex + 5) <= str.length()) && "&".equals(str.substring(linkEndIndex, linkEndIndex + 5))) { str.replace(linkEndIndex, linkEndIndex + 5, "&"); } } if (UrlUtils.isValidURLChar(str.charAt(linkEndIndex))) { urlStr += str.charAt(linkEndIndex); linkEndIndex++; if (linkEndIndex == str.length()) { //Reached end of str... break; } } else { break; } } //if the characters before the linkStart equal 'href="' then don't link the url - CORE-44 if (linkStartIndex >= 6) { //6 = "href\"".length() String prefix = str.substring(linkStartIndex - 6, linkStartIndex); if ("href=\"".equals(prefix)) { lastEndIndex = linkEndIndex; continue; } } //if the characters after the linkEnd are '</a>' then this url is probably already linked - CORE-44 if (str.length() >= (linkEndIndex + 4)) { //4 = "</a>".length() String suffix = str.substring(linkEndIndex, linkEndIndex + 4); if ("</a>".equals(suffix)) { lastEndIndex = linkEndIndex + 4; continue; } } //Decrement linkEndIndex back by 1 to reflect the real ending index position of the URL... linkEndIndex--; // If the last char of urlStr is a '.' we exclude it. It is most likely a full stop and // we don't want that to be part of an url. 
while (true) { char lastChar = urlStr.charAt(urlStr.length() - 1); if (lastChar == '.') { urlStr = urlStr.substring(0, urlStr.length() - 1); linkEndIndex--; } else { break; } } //if the URL had a '(' before it, and has a ')' at the end, trim the last ')' from the url //ie '(www.opensymphony.com)' => '(<a href="http://www.openymphony.com/">www.opensymphony.com</a>)' char lastChar = urlStr.charAt(urlStr.length() - 1); if (lastChar == ')') { if ((linkStartIndex > 0) && ('(' == (str.charAt(linkStartIndex - 1)))) { urlStr = urlStr.substring(0, urlStr.length() - 1); linkEndIndex--; } } else if (lastChar == '\'') { if ((linkStartIndex > 0) && ('\'' == (str.charAt(linkStartIndex - 1)))) { urlStr = urlStr.substring(0, urlStr.length() - 1); linkEndIndex--; } } //perhaps we ended with '>', '<' or '"' //We need to strip these //ie '"www.opensymphony.com"' => '"<a href="http://www.openymphony.com/">www.opensymphony.com</a>"' //ie '<www.opensymphony.com>' => '<<a href="http://www.openymphony.com/">www.opensymphony.com</a>>' else if (lastChar == ';') { // 6 = """.length() if ((urlStr.length() > 6) && """.equalsIgnoreCase(urlStr.substring(urlStr.length() - 6))) { urlStr = urlStr.substring(0, urlStr.length() - 6); linkEndIndex -= 6; } // 4 = "<".length() || ">".length() else if (urlStr.length() > 4) { final String endingStr = urlStr.substring(urlStr.length() - 4); if ("<".equalsIgnoreCase(endingStr) || ">".equalsIgnoreCase(endingStr)) { urlStr = urlStr.substring(0, urlStr.length() - 4); linkEndIndex -= 4; } } } // we got the URL string, now we validate it and convert it into a hyperlink... if (maxDisplayLength > 0 && urlStr.length() > maxDisplayLength) { urlToDisplay = htmlEncode(urlStr.substring(0, maxDisplayLength), true) + "..."; } else { urlToDisplay = htmlEncode(urlStr, true); } if (urlStr.toLowerCase().startsWith("www.")) { urlStr = "http://" + urlStr; } if (UrlUtils.verifyHierachicalURI(urlStr)) { //Construct the hyperlink for the url... 
String urlLink; if (maxDisplayLength > 0 && urlStr.length() > maxDisplayLength) { //urlLink = "<a href=\"" + urlStr + "\"" + targetString + ">" + urlToDisplay + "</a>"; urlLink = "<a href=\"" + urlStr + "\"" + targetString + " title=\"" + htmlEncode(urlStr, true) + "\">" + urlToDisplay + "</a>"; } else { urlLink = "<a href=\"" + urlStr + "\"" + targetString + ">" + urlToDisplay + "</a>"; } //urlLink = "<a href=\"" + urlStr + '\"' + targetString + '>' + urlToDisplay + "</a>"; //Remove the original urlStr from str and put urlLink there instead... str.replace(linkStartIndex, linkEndIndex + 1, urlLink); //Set lastEndIndex to reflect the position of the end of urlLink //within the whole string... lastEndIndex = (linkStartIndex - 1) + urlLink.length(); } else { //lastEndIndex is different from the one above cos' there's no //<a href...> tags added... lastEndIndex = (linkStartIndex - 1) + urlStr.length(); } } } } /** * Given a string, and the index to start looking at, find the index of the start of the scheme. Eg. * <pre> * getSchemeIndex("notes://abc", 0) -> 0 * getSchemeIndex("abc notes://abc", 0) -> 4 * </pre> * @param str The string to search for * @param startIndex Where to start looking at * @return The location the string was found, ot -1 if the string was not found. 
*/ private static int getSchemeIndex(StringBuilder str, int startIndex) { int schemeIndex = str.indexOf(UrlUtils.SCHEME_URL, startIndex + 1); //if it was not found, or found at the start of the string, then return 'not found' if (schemeIndex <= 0) { return -1; } //walk backwards through the scheme until we find the first non valid character int schemeStart; for (schemeStart = schemeIndex - 1; schemeStart >= 0; schemeStart--) { char currentChar = str.charAt(schemeStart); if (!UrlUtils.isValidSchemeChar(currentChar)) { break; } } //reset the scheme to the starting character schemeStart++; /* we don't want to do this, otherwise an invalid scheme would ruin the linking for later schemes if (UrlUtils.isValidScheme(str.substring(schemeStart, schemeIndex))) return schemeStart; else return -1; */ return schemeStart; } /** * Remove all of the XML and HTML tags from the <code>s</code> parameter. * The RegEx in this method removes everything between two "innermost" brackets * (e.g., <code><...></code>) so * it may accidentally remove sections of text that are not tags, just because both the * "greater than" and "less than" symbols exist and there is no tag bewteen them. * <p/> * For instance, the title: "Yak mass < whale mass, but yak mass > weasel mass" would * be reduced to: "Yak mass weasel mass" which is very much not the desired result. * That is why this method is prefaced with the lable "simple". * <p/> * Note that the above example only fails because there is no tag between the * < and > for this method to remove. * If the title was, instead, "Yak mass < whale mass, <p>but yak mass > weasel mass", * then the <p> tag would be removed and the rest of the title would be left alone. * * TODO: Augment the RegEx to fix the above corner case. This can be accomplished by ensuring * todo: all openning tags have matching closing tags, then handling valid singleton tags (e.g., * todo: <p/>) as special cases. 
* * @param s The String which will have all of its tags removed * @return The <code>s</code> parameter with all tags removed */ public static String simpleStripAllTags(String s) { return s.replaceAll("<[^<>]*?>", ""); } /** * Transform a xml string to html text * @param xmlContent xml * @return html html text */ public static String transformXMLtoHtmlText(String xmlContent) { if (xmlContent != null) { String htmlContent = ""; try { DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder(); // surround the xml content with temporary root element to make sure that it can be parsed. InputSource source = new InputSource( new StringReader("<temprootelement>" + xmlContent + "</temprootelement>")); Document doc = db.parse(source); // remove all the elements from the xml content StringWriter stw = new StringWriter(); Transformer transformer = TransformerFactory.newInstance().newTransformer(); transformer.setOutputProperty(OutputKeys.METHOD, "text"); transformer.transform(new DOMSource(doc), new StreamResult(stw)); htmlContent = stw.toString(); // make sure all the characters are escaped using html entities htmlContent = StringEscapeUtils.escapeHtml(htmlContent); } catch (Exception e) { log.info("Failed to transform " + xmlContent + " to html text", e); } return htmlContent; } else { return ""; } } /** * truncate text * @param text text to truncate * @param truncatedLength truncate length * @return truncated text */ public static String truncateText(String text, int truncatedLength) { if (StringUtils.isBlank(text)) { return text; } if (text.length() > truncatedLength) { final String abrsfx = "..."; final int abrsfxlen = 3; // attempt to truncate on a word boundary int index = truncatedLength - 1; while (!Character.isWhitespace(text.charAt(index)) || index > (truncatedLength - abrsfxlen - 1)) { if (--index == 0) { break; } } if (index == 0) { index = truncatedLength - abrsfxlen - 1; } text = text.substring(0, index) + abrsfx; assert text.length() <= 
truncatedLength; } return text; } /** * truncate text and close open tags * @param text text to truncate * @param truncatedLength truncate length * @return truncated text */ public static String truncateTextCloseOpenTag(String text, final int truncatedLength) { String shortenedText = truncateText(text, truncatedLength); int openIndex = shortenedText.lastIndexOf("<i>"); if (openIndex != -1) { int closeIndex = shortenedText.indexOf("</i>", openIndex); if (closeIndex == -1) { shortenedText = shortenedText + "</i>"; } } return shortenedText; } /** * Create a list of first, second and last authors * * @param authors the list of authors * * @return a combined string of first, second and last authors */ public static String makeAuthorString(String[] authors) { if (authors.length <= 3) { return StringUtils.join(authors, ", "); } else { //use first two and last. return authors[0].trim() + ", " + authors[1].trim() + ", [...], " + authors[authors.length - 1].trim(); } } }