Java tutorial
// BlogBridge -- RSS feed reader, manager, and web based service // Copyright (C) 2002-2006 by R. Pito Salas // // This program is free software; you can redistribute it and/or modify it under // the terms of the GNU General Public License as published by the Free Software Foundation; // either version 2 of the License, or (at your option) any later version. // // This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; // without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. // See the GNU General Public License for more details. // // You should have received a copy of the GNU General Public License along with this program; // if not, write to the Free Software Foundation, Inc., 59 Temple Place, // Suite 330, Boston, MA 02111-1307 USA // // Contact: R. Pito Salas // mailto:pitosalas@users.sourceforge.net // More information: about BlogBridge // http://www.blogbridge.com // http://sourceforge.net/projects/blogbridge // // $Id: StringUtils.java,v 1.42 2007/11/07 17:16:48 spyromus Exp $ // package com.salas.bb.utils; import com.salas.bb.utils.i18n.Strings; import sun.io.Converters; import sun.misc.BASE64Encoder; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.net.URLEncoder; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Collection of string utilities. */ public final class StringUtils extends org.apache.commons.lang.StringUtils { private static final Logger LOG = Logger.getLogger(StringUtils.class.getName()); private static final String[] SIZE_UNIT = { "Bytes", "Kb", "Mb", "Gb" }; private static final DecimalFormat FORMAT = new DecimalFormat(); private static final Pattern PATTERN_KEYWORDS = Pattern.compile("\\s*((\\\"([^\\\"]*)\\\"|([^\\s\\\"]+))\\s*)"); private static final Pattern PATTERN_URL_WITH_PROTOCOL = Pattern.compile("^[a-zA-Z]+:/"); private static final Pattern PATTERN_PUNCTUATION = Pattern .compile("([\\-\\+#\\$%^&\\_*\\s,.\\(\\)\\[\\]<>!\\?\"':;/\\\\])+"); private static final Map<String, Character> ENTITIES = new HashMap<String, Character>(); static { FORMAT.setMaximumFractionDigits(1); FORMAT.setMinimumFractionDigits(1); String[] fromA0 = { "nbsp", "iexcl", "cent", "pound", "curren", "yen", "brvbar", "sect", "uml", "copy", "ordf", "laquo", "not", "shy", "reg", "macr", "deg", "plusmn", "sup2", "sup3", "acute", "micro", "para", "middot", "cedil", "sup1", "ordm", "raquo", "frac14", "frac12", "frac34", "iquest", "Agrave", "Aacute", "Acirc", "Atilde", "Auml", "Aring", "AElig", "Ccedil", "Egrave", "Eacute", "Ecirc", "Euml", "Igrave", "Iacute", "Icirc", "Iuml", "ETH", "Ntilde", "Ograve", "Oacute", "Ocirc", "Otilde", "Ouml", "times", "Oslash", "Ugrave", "Uacute", "Ucirc", "Uuml", "Yacute", "THORN", "szlig", "agrave", "aacute", "acirc", "atilde", "auml", "aring", "aelig", "ccedil", "egrave", "eacute", "ecirc", "euml", "igrave", "iacute", "icirc", "iuml", "eth", "ntilde", "ograve", "oacute", "ocirc", "otilde", "ouml", "divide", "oslash", "ugrave", "uacute", "ucirc", "uuml", "yacute", "thorn", "yuml" }; for (int i = 0; i < fromA0.length; i++) ENTITIES.put(fromA0[i], (char) (0xa0 + i)); ENTITIES.put("trade", (char) 8482); ENTITIES.put("OElig", (char) 338); ENTITIES.put("oelig", (char) 339); ENTITIES.put("Scaron", (char) 352); ENTITIES.put("scaron", (char) 353); ENTITIES.put("Yuml", (char) 376); ENTITIES.put("circ", (char) 710); ENTITIES.put("tilde", (char) 732); ENTITIES.put("ensp", (char) 8194); ENTITIES.put("emsp", (char) 8195); ENTITIES.put("thinsp", (char) 8201); ENTITIES.put("zwnj", (char) 8204); ENTITIES.put("zwj", (char) 8205); ENTITIES.put("lrm", (char) 8206); ENTITIES.put("rrm", (char) 8207); ENTITIES.put("ndash", (char) 8211); ENTITIES.put("mdash", (char) 8212); ENTITIES.put("lsquo", (char) 8216); ENTITIES.put("rsquo", (char) 8217); ENTITIES.put("sbquo", (char) 8218); ENTITIES.put("ldquo", (char) 8220); ENTITIES.put("rdquo", (char) 8221); ENTITIES.put("bdquo", (char) 8222); ENTITIES.put("dagger", (char) 8224); ENTITIES.put("Dagger", (char) 8225); ENTITIES.put("hellip", (char) 8230); ENTITIES.put("permil", (char) 8240); ENTITIES.put("lsaquo", (char) 8249); ENTITIES.put("rsaquo", (char) 8250); ENTITIES.put("euro", (char) 8364); ENTITIES.put("amp", '&'); ENTITIES.put("lt", '<'); ENTITIES.put("gt", '>'); ENTITIES.put("apos", '\''); ENTITIES.put("quot", '"'); } /** * Hidden utility class constructor. */ private StringUtils() { } /** * Converts array of bytes in UTF-8 encoding to appropriate string. If encoding * isn't supported then the array will be converted into string using default * encoding and record will be put in log with severe priority. * * @param string bytes forming string. * * @return resulting string. */ public static String fromUTF8(byte[] string) { if (string == null) return null; String str; try { str = new String(string, "UTF-8"); } catch (UnsupportedEncodingException e) { LOG.severe(Strings.error("utf8.not.supported")); str = new String(string); } return str; } /** * Converts array of byte arrays in UTF-8 encoding to array of strings. The notes are * the same as for <code>fromUTF8(String)</code> method. * * @param strings array of byte arrays to decode. * * @return resulting array of strings. */ public static String[] fromUTF8(byte[][] strings) { if (strings == null) return null; String[] strs = new String[strings.length]; for (int i = 0; i < strings.length; i++) { byte[] string = strings[i]; strs[i] = fromUTF8(string); } return strs; } /** * Converts string into array of bytes in UTF-8 encoding. If UTF-8 encoding isn't supported * then the tring is converted into bytes in default encoding and record is put in log * with severe priority. * * @param string string to convert. * * @return resulting array of bytes. */ public static byte[] toUTF8(String string) { if (string == null) return null; byte[] result; try { result = string.getBytes("UTF-8"); } catch (UnsupportedEncodingException e) { LOG.severe(Strings.error("utf8.not.supported")); result = string.getBytes(); } return result; } /** * Converts multi-line text into the array of strings. * * @param text text. * * @return array of strings. */ public static String[] multilineToArray(String text) { return text == null ? null : split(text, "\n"); } /** * Converts any value to multi-line text. Arrays of strings are converted that each string * appears on its own line. * * @param value arbitrary value. * * @return multi-line text. */ public static String anyToMultiline(Object value) { String result = Constants.EMPTY_STRING; if (value instanceof String[]) { result = arrayToMultiline((String[]) value); } else if (value != null) { result = value.toString(); } return result; } /** * Converts array of strings to multi-line text where each string appears on it's own line. * * @param aStrings array of strings. * * @return multi-line. */ public static String arrayToMultiline(String[] aStrings) { return aStrings == null ? null : join(aStrings, "\n"); } /** * Converts text from source encoding into Unicode. If encoding isn't supported you will get * the original text. * * @param text text. * @param sourceEncoding source encoding. * * @return converted text. */ public static String decodeForced(String text, String sourceEncoding) { if (text == null) return null; if (sourceEncoding == null || sourceEncoding.equals(Converters.getDefaultEncodingName())) return text; try { text = new String(text.getBytes("ISO8859-1"), sourceEncoding); } catch (UnsupportedEncodingException e) { // We don't cate about it. } return text; } /** * Returns the first line of article. * * @param aText text to scan. * * @return first text line. */ public static String getFirstSentense(String aText) { if (aText == null) return null; int size = aText.length(); int start; int length; for (start = 0; start < size && Character.isWhitespace(aText.charAt(start)); start++) ; for (length = 0; start + length < size && (!isSentenseTerminator(aText.charAt(start + length))); length++) ; return length > 0 ? aText.substring(start, start + length).trim() : Constants.EMPTY_STRING; } /** * Returns TRUE if char is a sentense terminator. * * @param ch char to test. * * @return TRUE if char is a sentense terminator. */ public static boolean isSentenseTerminator(char ch) { return ch == '.' || ch == '?' || ch == '!'; } /** * Returns the stringified size. * * @param size size in bytes. * * @return string representation. */ public static String sizeToString(double size) { return sizeToString(size, 0); } /** * Returns the stringified size. * * @param size size in units. * @param unitIndex unit index. * * @return string represenation. */ private static String sizeToString(double size, int unitIndex) { String value; if (size < 512 || unitIndex == SIZE_UNIT.length - 1) { value = FORMAT.format(size) + " " + SIZE_UNIT[unitIndex]; } else { value = sizeToString(size / 1024, unitIndex + 1); } return value; } /** * Encodes string to be put in URL. * * <p>Example:</p> * <pre> * input string: 'a &?b' * output string: 'a+%26%3Fb' * </pre> * * @param str string to encode. * * @return encoded string or NULL if source was NULL. */ public static String encodeForURL(String str) { if (str == null) return null; try { str = URLEncoder.encode(str, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(Strings.error("utf8.not.supported"), e); } return str; } /** * Converts list of keywords separated by whitespace. Each keyword can actually * contain several words if enclosed in double quotes. * * @param keywords keywords string. * * @return list of keywords or <code>NULL</code> if <code>keywords</code> * were <code>NULL</code>. */ public static String[] keywordsToArray(String keywords) { List<String> matches = keywordsToList(keywords); return matches == null ? null : matches.toArray(new String[matches.size()]); } /** * Converts list of keywords separated by whitespace. Each keyword can actually * contain several words if enclosed in double quotes. * * @param keywords keywords string. * * @return list of keywords or <code>NULL</code> if <code>keywords</code> * were <code>NULL</code>. */ public static List<String> keywordsToList(String keywords) { if (keywords == null) return null; Matcher mat = PATTERN_KEYWORDS.matcher(keywords); List<String> matches = new ArrayList<String>(); while (mat.find()) { String keyword = mat.group(3); if (keyword == null) keyword = mat.group(2); if (keyword != null && !"*".equals(keyword) && !"+".equals(keyword) && !matches.contains(keyword)) matches.add(keyword); } return matches; } /** * Places multi-word keyword in quotes only if it isn't in quotes already. * * @param keyword param to quote if necessary. * * @return updated keyword. */ public static String quoteKeywordIfNecessary(String keyword) { keyword = keyword.trim(); if (keyword.indexOf(' ') != -1 && keyword.charAt(0) != '"' && keyword.charAt(keyword.length() - 1) != '"') { keyword = "\"" + keyword + "\""; } return keyword; } /** * Converts keywords from <code>a|b|c d|e</code> or <code>a, b, c d, e</code> * looks to <code>a b "c d" e</code>. * * @param aKeywords keywords to convert. * * @return new-look keywords. */ public static String convertKeywordsToNewFormat(String aKeywords) { String result; if (aKeywords.indexOf('|') != -1) { result = breakAndRejoinKeywords(aKeywords, "|"); } else if (aKeywords.indexOf(',') != -1) { result = breakAndRejoinKeywords(aKeywords, ","); } else result = aKeywords; return result; } /** * Breaks current keywords list appart and rejoins it using curren keywords * rules. * * @param aKeywords list of keywords. * @param currentSeparator the separator to be used for breaking. * * @return newly formed keywords list. */ private static String breakAndRejoinKeywords(String aKeywords, String currentSeparator) { String[] keywordsList = split(aKeywords, currentSeparator); return arrayToQuotedKeywords(keywordsList); } /** * Converts the array of keywords into the space-delimited list with quoted multi-word * items. * * @param aKeywordsList list. * * @return space-delimete and quoted list of keywords. */ public static String arrayToQuotedKeywords(String[] aKeywordsList) { String result = null; if (aKeywordsList != null) { if (aKeywordsList.length > 0) { StringBuffer buf = new StringBuffer(); buf.append(quoteKeywordIfNecessary(aKeywordsList[0])); for (int i = 1; i < aKeywordsList.length; i++) { buf.append(" ").append(quoteKeywordIfNecessary(aKeywordsList[i])); } result = buf.toString(); } else result = ""; } return result; } /** * Digests the buffer with key using MD5 algorithm. * * @param buffer buffer. * @param key key is secret key (password or something else which isn't * going to be passed over network). * * @return digested buffer. * * @throws NoSuchAlgorithmException if there's no MD5 algorithm implemetation. */ public static byte[] digestMD5(String buffer, String key) throws NoSuchAlgorithmException { MessageDigest md5 = MessageDigest.getInstance("MD5"); md5.update(buffer.getBytes()); return md5.digest(key.getBytes()); } /** * Creates basic authentication token given user name and password. * * @param user user name. * @param password password. * * @return token. */ public static String createBasicAuthToken(String user, String password) { String token = user + ":" + password; String base64Token = new BASE64Encoder().encode(token.getBytes()); return "Basic " + base64Token; } /** * Creates pattern from the keywords list. * * @param keywords keywords list. * * @return keywords regex pattern. */ public static String keywordsToPattern(String keywords) { return keywordsToPattern(keywordsToArray(keywords)); } /** * Creates pattern from the keywords list. * * @param aKeywords keywords list. * * @return keywords regex pattern. */ public static String keywordsToPattern(String[] aKeywords) { String pattern; if (aKeywords != null && aKeywords.length > 0) { pattern = join(aKeywords, "|"); pattern = pattern.replaceAll("\\\\", "\\\\\\\\"); pattern = pattern.replaceAll("\\.", "\\\\."); pattern = pattern.replaceAll("\\n+", "|"); pattern = pattern.replaceAll("\\?", "\\\\?"); pattern = pattern.replaceAll("\\(", "\\\\("); pattern = pattern.replaceAll("\\)", "\\\\)"); pattern = pattern.replaceAll("\\[", "\\\\["); pattern = pattern.replaceAll("\\]", "\\\\]"); pattern = pattern.replaceAll("\\++", "\\\\w+"); pattern = pattern.replaceAll("\\s+", "\\\\s+"); pattern = pattern.replaceAll("\\*+", "\\\\w*"); pattern = pattern.replaceAll("\\\\s\\+\\\\w\\*\\\\s\\+", "\\\\s+(\\\\w*\\\\s+)?"); pattern = pattern.replaceAll("\\|\\|+", "\\|"); pattern = pattern.replaceAll("(^\\||\\|$)", ""); String start = "\\W"; String end = start; if (pattern.startsWith("\\s+")) { start = "\\s"; pattern = pattern.substring(3); } if (pattern.endsWith("\\s+")) { end = "\\s"; pattern = pattern.substring(0, pattern.length() - 3); } pattern = "(^|" + start + ")(" + pattern.trim() + ")($|" + end + ")"; } else pattern = null; return pattern; } /** * Performs different cleanups of URL. Removes extra spaces, converts "feed://" into "http://", * takes only first line of draggeed URL which is actual link under FireFox 1.5 (Win). * * @param link link being dragged into application. * * @return final link. */ public static String cleanDraggedURL(String link) { if (link == null) return null; link = link.trim(); // If URL starts with feed:// we change it to http:// if (link.startsWith("feed:")) link = "http:" + link.substring(5); // FireFox 1.5 under Win has two lines: URL and description taken from page // We leave only the first line -- the URL int index = link.indexOf(0x0a); if (index != -1) link = link.substring(0, index).trim(); return link; } /** * Scans for tags definitions in micro-format. The text should contain A-links to * tag categories with "rel" attribute equal to "tag". The last section of URL is * taken as tag. * * @param aText text to parse. * * @return list of tags detected. */ public static String[] collectTags(String aText) { List<String> tagsList = null; if (aText != null) { Pattern pat = Pattern.compile("<a\\s+[^>]*rel\\s*=\\s*['\"]tag['\"][^>]*>", Pattern.CASE_INSENSITIVE); Matcher matcher = pat.matcher(aText); Pattern patTag = null; while (matcher.find()) { if (tagsList == null) { tagsList = new ArrayList<String>(); patTag = Pattern.compile("href\\s*=\\s*['\"]([^'\"/]+/+)+([\\+a-zA-Z0-9]+)['\"]"); } Matcher m2 = patTag.matcher(matcher.group()); if (m2.find()) tagsList.add(m2.group(2).replaceAll("\\+", " ")); } } return tagsList == null ? Constants.EMPTY_STRING_LIST : tagsList.toArray(new String[tagsList.size()]); } /** * Adds protocol part to URL (http://) if none is specified and removes spaces around text. * * @param url source URL. * * @return modified URL. */ public static String fixURL(String url) { if (url != null) { url = url.trim(); if (url.length() == 0) { url = null; } else if (url.startsWith("feed:")) { url = url.substring(5).replaceAll("^/+", ""); url = fixURL(url); } else if (!PATTERN_URL_WITH_PROTOCOL.matcher(url).find()) { url = "http://" + url; } } return url; } /** * Unescapes the string. * * @param str string. * * @return unescaped version. */ public static String quickUnescape(String str) { if (str == null) return null; str = str.replaceAll("&", "&"); str = str.replaceAll("<", "<"); str = str.replaceAll(">", ">"); str = str.replaceAll("'", "'"); str = str.replaceAll(""", "\""); return str; } /** * Complete recoding of all HTML entities into Unicode symbols. * * @param str string. * * @return result. */ public static String unescape(String str) { if (isEmpty(str)) return str; Pattern p = Pattern.compile("&(([^#;\\s]{3,6})|#([0-9]{1,4})|#x([0-9a-fA-F]{1,4}));"); Matcher m = p.matcher(str); StringBuffer sb = new StringBuffer(); while (m.find()) { Character c; String strEntity = m.group(2); String decEntity = m.group(3); String hexEntity = m.group(4); if (strEntity != null) { // String entity c = ENTITIES.get(strEntity); } else { c = decEntity != null ? (char) Integer.parseInt(decEntity) : (char) Integer.parseInt(hexEntity, 16); } m.appendReplacement(sb, c == null ? m.group() : c.toString()); } m.appendTail(sb); return sb.toString(); } /** * Checks whether the given strings is a valid e-mail address. * * @param email address. * * @return <code>TRUE</code> if valid. */ public static boolean isValidEmail(String email) { return !isEmpty(email) && email.trim().matches("^[^@]+@[^\\.]+(\\.[^\\.]+)+$"); } /** * Converts the list of URLs in string form into the array of URL objects. * * @param str URLs string. * * @return array of URLs. */ public static URL[] strToURLs(String str) { URL[] newURLs = null; if (isNotEmpty(str)) { String[] urls = split(str, Constants.URL_SEPARATOR); List<URL> urlsList = new ArrayList<URL>(urls.length); for (String url : urls) { try { urlsList.add(new URL(url)); } catch (MalformedURLException e) { // Wrong url specified -- skipping } } newURLs = urlsList.toArray(new URL[urlsList.size()]); } return newURLs; } /** * Returns space-separated list of first <code>N</code> words. * * @param str string. * @param n number of words. * * @return list or <code>NULL</code> if string is <code>NULL</code> or <code>N</code> is less than <code>1</code>. */ public static String getUpToNWords(String str, int n) { if (str == null || n < 1) return null; String[] split = split(str, " ,.()[]<>!?\"':;/\\", n + 1); if (split.length > 1) split[split.length - 1] = ""; return join(split, " ").trim(); } private static Pattern lastPattern; private static int lastSentences = -1; /** * Returns the excerpt consisting of given number of sentences unless they are shorter or longer than given * limits. In this case minimum or maximum allowed number of characters plus "..." are returned. * * @param str string to process. * @param sentences the number of sentences. * @param min minimum characters. * @param max maximum characters. * * @return the result. */ public static String excerpt(String str, int sentences, int min, int max) { if (str == null) return str; String res; // Create pattern or reuse Pattern pat; if (lastSentences != sentences) { String patS = "^([^\\.!?]+(\\.+|!+|\\?+)+){" + sentences + "}"; pat = Pattern.compile(patS); lastSentences = sentences; lastPattern = pat; } else pat = lastPattern; // Match the string Matcher m = pat.matcher(str); if (m.find()) { res = m.group().trim(); int len = res.length(); if (len < min) res = excerpt(str, min); else if (len > max) res = excerpt(str, max); } else res = excerpt(str, max); return res; } /** * Returns the excerpt with given number of characters plus "..." unless * the string is shorter than limit. * * @param str string to process. * @param len number of characters. * * @return the result. */ public static String excerpt(String str, int len) { return str == null || str.length() <= len ? str : str.substring(0, len) + "..."; } /** * Takes the string, removes punctuation, lowercases it and returns the words in a given range glued * together with spaces. If there's not enough words, the maximum available number of them is returned. * * @param str string. * @param from the first word to return. * @param to the last word to return. * * @return the result. */ public static String getWordsInRange(String str, int from, int to) { if (isEmpty(str)) return str; if (from > to) throw new IllegalArgumentException("From can't be bigger than To."); // Remove all punctuation and collapse spaces plus lowercase str = PATTERN_PUNCTUATION.matcher(str).replaceAll(" ").toLowerCase().trim(); String[] strs = str.split(" "); // Figure out what are our limits from = Math.min(from, strs.length); to = Math.min(to + 1, strs.length); // Glue words back together String[] arr = new String[to - from]; System.arraycopy(strs, from, arr, 0, to - from); return join(arr, " "); } /** * Safely intern's a string. * * @param s string. * * @return intern'ed version. */ public static String intern(String s) { return s == null ? null : s.intern(); } }