// Java tutorial
/** * * Copyright 2012-2013 The MITRE Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. * * ************************************************************************** * NOTICE This software was produced for the U. S. Government under Contract No. * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * * (c) 2012 The MITRE Corporation. All Rights Reserved. * ************************************************************************** */ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| // // _____ ____ __ __ ///\ __`\ /\ _`\ /\ \__ /\ \__ //\ \ \/\ \ _____ __ ___ \ \,\L\_\ __ __ _\ \ ,_\ __ ___ \ \ ,_\ // \ \ \ \ \ /\ '__`\ /'__`\ /' _ `\ \/_\__ \ /'__`\/\ \/'\\ \ \/ /'__`\ /' _ `\\ \ \/ // \ \ \_\ \\ \ \L\ \/\ __/ /\ \/\ \ /\ \L\ \ /\ __/\/> </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_ // \ \_____\\ \ ,__/\ \____\\ \_\ \_\ \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\ // \/_____/ \ \ \/ \/____/ \/_/\/_/ \/_____/ \/____/\//\/_/ \/__/ \/__/\/_/ \/_/\/_/ \/__/ // \ \_\ // \/_/ // // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| // package org.opensextant.util; import static org.apache.commons.lang3.StringUtils.isBlank; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; 
import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.text.Normalizer; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; import org.opensextant.data.Language; import org.supercsv.cellprocessor.Optional; import org.supercsv.cellprocessor.constraint.NotNull; import org.supercsv.cellprocessor.ift.CellProcessor; import org.supercsv.io.CsvListReader; import org.supercsv.prefs.CsvPreference; /** * * @author ubaldino */ public class TextUtils { final static Pattern delws = Pattern.compile("\\s+"); // Match ALL empty lines: // \n followed by other ootional whitespace // Up to 2 empty lines or more. This matches 3 line endings // The first EOL could be on a non-empty line, but then followed by 2 empty // lines. // The intent is to reduce 3 or more EOL to 2. Preserving paragraph breaks. // final static Pattern multi_eol = Pattern.compile("(\n[ \t\r]*){3,}"); final static Pattern multi_eol2 = Pattern.compile("(\n\r?){2,}"); /** * Checks if non-ASCII and non-LATIN characters are present. * * @param data * any textual data * @return true if content is strictly ASCII or Latin1 extended. 
*/ public final static boolean isLatin(String data) { char[] ch = data.toCharArray(); boolean isLatin = true; for (char c : ch) { if (isASCII(c)) { continue; } if (!Character.isLetter(c)) { continue; } Character.UnicodeBlock blk = Character.UnicodeBlock.of(c); if (blk == Character.UnicodeBlock.LATIN_1_SUPPLEMENT || blk == Character.UnicodeBlock.LATIN_EXTENDED_A || blk == Character.UnicodeBlock.LATIN_EXTENDED_B || blk == Character.UnicodeBlock.LATIN_EXTENDED_C || blk == Character.UnicodeBlock.LATIN_EXTENDED_D || blk == Character.UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) { continue; } isLatin = false; break; } // return isLatin; } /** * Helpful hints on parsing Unicode phrases. Reference: * http://www.rgagnon.com/javadetails/java-0456.html */ private static final String ALPHAMAP_PLAIN_ASCII = "AaEeIiOoUu" // grave + "AaEeIiOoUuYy" // acute + "AaEeIiOoUuYy" // circumflex + "AaOoNn" // tilde + "AaEeIiOoUuYy" // umlaut + "Aa" // ring + "Cc" // cedilla + "OoUu" // double acute + "Oo" // Scandanavian o + "AaEe" // A/E wiht micron ; private static final String ALPHAMAP_UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" // grave + "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" // acute + "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" // circumflex + "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" // tilde + "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" // umlaut + "\u00C5\u00E5" // ring + "\u00C7\u00E7" // cedilla + "\u0150\u0151\u0170\u0171" // double acute + "\u00D8\u00F8" // Scandanavian o + "\u0100\u0101\u0112\u0113" // E-bar, A-bar ; /** * remove accents from a string and replace with ASCII equivalent Reference: * http://www.rgagnon.com/javadetails/java-0456.html Caveat: This * implementation is not exhaustive. 
* * @param s * the string * @return converted string */ public final static String replaceDiacritics(final String s) { if (s == null) { return null; } if ("".equals(s)) { return s; } StringBuilder sb = new StringBuilder(); int n = s.length(); for (int i = 0; i < n; i++) { char c = s.charAt(i); int pos = ALPHAMAP_UNICODE.indexOf(c); if (pos > -1) { sb.append(ALPHAMAP_PLAIN_ASCII.charAt(pos)); } else { sb.append(c); } } return sb.toString(); } /** * * @param c * a character * @return true if c is ASCII */ public final static boolean isASCII(char c) { return c > 0 && c <= ASCII_END; } private final static int ASCII_END = 0x7F; /** * @param data * bytes to test * @return boolean if data is ASCII or not */ public static boolean isASCII(byte[] data) { for (byte b : data) { if (b < 0 || b > ASCII_END) { return false; } } return true; } /** * Early exit test -- return false on first non-ASCII character found. * * @param t * buffer of text * @return true only if every char is in ASCII table. */ public static boolean isASCII(String t) { char c; for (int x = 0; x < t.length(); ++x) { c = t.charAt(x); if (c > ASCII_END) { return false; } } return true; } /** * count the number of ASCII bytes * * @param data * bytes to count * @return count of ASCII bytes */ public static int countASCIIChars(byte[] data) { int ascii = 0; for (byte b : data) { if (b > 0 || b <= ASCII_END) { ++ascii; } } return ascii; } /** * Replaces all 3 or more blank lines with a single paragraph break (\n\n) * * @param t * text * @return A string with fewer line breaks; * */ public static String reduce_line_breaks(String t) { Matcher m = multi_eol.matcher(t); if (m != null) { return m.replaceAll("\n\n"); } return t; } /** * Delete whitespace of any sort. * * @param t * text * @return String, without whitespace. */ public static String delete_whitespace(String t) { Matcher m = delws.matcher(t); if (m != null) { return m.replaceAll(""); } return t; } /** * Minimize whitespace. 
* * @param t * text * @return scrubbed string */ public static String squeeze_whitespace(String t) { Matcher m = delws.matcher(t); if (m != null) { return m.replaceAll(" "); } return t; } /** * Replace line endings with SPACE * * @param t * text * @return scrubbed string */ public static String delete_eol(String t) { return t.replace('\n', ' ').replace('\r', ' '); } public final static char NL = '\n'; public final static char CR = '\r'; public final static char SP = ' '; public final static char TAB = '\t'; public final static char DEL = 0x7F; /** * Delete control chars from text data; leaving text and whitespace only. * Delete char (^?) is also removed. Length may differ if ctl chars are * removed. * * @param t * text * @return scrubbed buffer */ public static String delete_controls(String t) { if (t == null) { return null; } StringBuilder tmpCleanBuf = new StringBuilder(); for (char ch : t.toCharArray()) { if ((ch < ' ' && !(ch == TAB || ch == NL)) || (ch == DEL)) { continue; } tmpCleanBuf.append(ch); } return tmpCleanBuf.toString(); } public static boolean hasDigits(String txt) { return (countDigits(txt) > 0); } public static int countDigits(String txt) { return count_digits(txt); } /** * Counts all digits in text. * * @param txt * text to count * @return count of digits */ public static int count_digits(String txt) { if (txt == null) { return 0; } int digits = 0; for (char c : txt.toCharArray()) { if (Character.isDigit(c)) { ++digits; } } return digits; } /** * StringUtils in commons isNumeric("1.234") is NOT numeric. Here "1.234" is * numeric. * * @param v * val to parse * @return true if val is a number */ public final static boolean isNumeric(final String v) { if (v == null) { return false; } for (char ch : v.toCharArray()) { /* * Is the character in .-+Ee ? */ if (ch == '.' || ch == '-' || ch == '+' || ch == 'e' || ch == 'E') { continue; } if (!Character.isDigit(ch)) { return false; } } return true; } /** * Counts all whitespace in text. 
* * @param txt * text * @return whitespace count */ public static int count_ws(String txt) { if (txt == null) { return 0; } int ws = 0; for (char c : txt.toCharArray()) { // isWhitespaceChar(c)? if (Character.isWhitespace(c)) { ++ws; } } return ws; } /** * Count formatting whitespace. This is helpful in determining if text spans * are phrases with multiple TAB or EOL characters. For that matter, any * control character contributes to formatting in some way. DEL, VT, HT, * etc. So all control characters ( c < ' ') are counted. * * @param txt * input string * @return count of format chars */ public static int countFormattingSpace(String txt) { if (txt == null) { return 0; } int ws = 0; for (char c : txt.toCharArray()) { // if (c == '\n' || c == '\r' || c == '\t') { if (c < 0x20) { ++ws; } } return ws; } /** * For measuring the upper-case-ness of short texts. Returns true if ALL * letters in text are UPPERCASE. Allows for non-letters in text. * * @param dat * text or data * @return true if text is Upper */ public static boolean isUpper(String dat) { return checkCase(dat, 2); } public static boolean isLower(String dat) { return checkCase(dat, 1); } /** * detects if string alpha chars are purely lower case. * * @param text * text * @param textcase * 1 lower, 2 upper * @return if case matches given textcase param */ public static boolean checkCase(String text, int textcase) { if (text == null) { return false; } int caseCount = 0; for (char c : text.toCharArray()) { if (!Character.isLetter(c)) { continue; } if (textcase == 1) { if (Character.isUpperCase(c)) { // Checking for lower case; Fail if upper case is found. return false; } else if (Character.isLowerCase(c)) { ++caseCount; } } else if (textcase == 2) { if (Character.isLowerCase(c)) { // Checking for upper case; Fail if lower case is found. return false; } else if (Character.isUpperCase(c)) { ++caseCount; } } } // IF at least one letter found in the case, return true. 
// It is possible that mixed-language text that has no case-sense // is mixed up with ASCII or Romance language text. // test LOWER UPPER // A b ==> no no // A ==> no yes // a ==> yes no return caseCount > 0; } /** * Measure character count, upper, lower, non-Character, whitespace * * @param text * text * @return int array with counts. */ public static int[] measureCase(String text) { if (text == null) { return null; } int u = 0, l = 0, ch = 0, nonCh = 0, ws = 0; int[] counts = new int[5]; for (char c : text.toCharArray()) { if (Character.isLetter(c)) { ++ch; if (Character.isUpperCase(c)) { ++u; } else if (Character.isLowerCase(c)) { ++l; } } else if (Character.isWhitespace(c)) { ++ws; } else { ++nonCh; // Other content? } } counts[0] = ch; counts[1] = u; counts[2] = l; counts[3] = nonCh; counts[4] = ws; return counts; } /** * a threshold for determining if character content in a document is upper case enough that the entire document can * be considered upper case. These are constants you can override, since these thresholds are just heuristics. We * don't expect you would pass in such things as arguments * as they don't change from doc to doc much; they do change from domain to domain. * * IFF you are hitting these thresholds too closely, then you have to adapt these to your own data. These are * meant to be very, very general. They would best apply to documents on the order of 200 to 10,000 bytes. Beyond * that * we don't find many texts that are that size and all lower or all upper where these heuristics are helpful. * E.g., tweets in English -- these thresholds are easily influenced by a difference of one or two characters. */ public static double UPPER_CASE_THRESHOLD = 0.75; /** * Since we live in a world that has made use of first-letter capital for a number of languages, this threshold is * very high. * "IS THIS UPPER CASE?, I WILL USE eBAY TODAY" * "by the same convention, this is largely lower case; I will use eBay today." 
*/ public static double LOWER_CASE_THRESHOLD = 0.95; /** * First measureCase(Text) to acquire counts, then call this routine for a heuristic * that suggests the text is mainly upper case. These routines may not work well on languages that are not * Latin-alphabet. * * @param counts * word stats from measureCase() * @return true if counts represent text that exceeds the "UPPER CASE" threshold */ public static boolean isUpperCaseDocument(final int[] counts) { // Method 1: Content = chars + non-chars (not whitespace) // measure upper case against ALL content. // Method 2: measure upper case against just char content. // // Method 2 seems best. int content = counts[0] /* + counts[3]*/ ; return ((float) counts[1] / content) > UPPER_CASE_THRESHOLD; } /** * This measures the amount of upper case * See Upper Case. Two methods to measure -- lower case count compared to all content (char+non-char) * or compared to just char content. * * @param counts * word stats from measureCase() * @return true if counts represent text that exceeds the "lower case" threshold */ public static boolean isLowerCaseDocument(final int[] counts) { int content = counts[0] /*+ counts[3]*/; return ((float) counts[2] / content) > LOWER_CASE_THRESHOLD; } /** * Find the text window(s) around a match. 
Given the size of a buffer, the * match and desired width return * * <pre> * prepreprepre MATCH postpostpost * ^ ^ ^ ^ * l-width l l+len l+len+width * left1 left2 right1 right2 * </pre> * * @param offset * offset of match * @param width * width of window left and right of match * @param textsize * size of buffer containing match; used for boundary conditions * @param matchlen * length of match * @return window offsets left of match, right of match: [ l1, l2, r1, r2 ] */ public static int[] get_text_window(int offset, int matchlen, int textsize, int width) { /* */ int left_x = offset - width; int left_y = offset - 1; int right_x = offset + matchlen; int right_y = right_x + width; if (left_x < 0) { left_x = 0; } // Fix left side of bounds if (left_y < left_x) { left_y = left_x; } // Fix right side of bounds if (right_y >= textsize) { right_y = textsize; } if (right_x > right_y) { right_x = right_y; } int[] slice = { left_x, left_y, right_x, right_y }; return slice; } /** * Get a single text window around the offset. * * @param offset * offset of match * @param width * width of window left and right of match * @param textsize * size of buffer containing match; used for boundary conditions * @return window offsets of a text span contianing match [ left, right ] */ public static int[] get_text_window(int offset, int textsize, int width) { /* * left .... match .... right */ int half = (width / 2); int left = offset - half; int right = offset + half; if (left < 0) { left = 0; } // Fix right side of bounds if (right >= textsize) { right = textsize; } int[] slice = { left, right }; return slice; } /** * Static method -- use only if you are sure of thread-safety. 
* * @param text * text or data * @return identifier for the text, an MD5 hash * @throws NoSuchAlgorithmException * on err * @throws UnsupportedEncodingException * on err */ public static String text_id(String text) throws NoSuchAlgorithmException, UnsupportedEncodingException { if (text == null) { return null; } MessageDigest md5 = MessageDigest.getInstance("MD5"); /* * For this to be reproducible on all machines, we cannot rely on a * default encoding for getBytes. So use getBytes(enc) to be explicit. * */ md5.update(text.getBytes("UTF-8")); return md5_id(md5.digest()); } /** * * @param md5digest * byte array * @return MD5 hash for the data */ public static String md5_id(byte[] md5digest) { // Thanks to javacream: // create hex string from the 16-byte hash StringBuilder hashbuf = new StringBuilder(md5digest.length * 2); for (byte b : md5digest) { int intVal = b & 0xff; if (intVal < 0x10) { hashbuf.append("0"); } hashbuf.append(Integer.toHexString(intVal)); } return hashbuf.toString().toLowerCase(); } /** * Get a list of values into a nice, scrubbed array of values, no * whitespace. * * a, b, c d e, f => [ "a", "b", "c d e", "f" ] * * @param s * string to split * @param delim * delimiter, no default. * @return list of split strings, which are also whitespace trimmed */ public static List<String> string2list(String s, String delim) { if (s == null) { return null; } List<String> values = new ArrayList<String>(); String[] _vals = s.split(delim); for (String v : _vals) { String val = v.trim(); if (!val.isEmpty()) { values.add(val); } } return values; } /** * Given a string S and a list of characters to replace with a substitute, * * return the new string, S'. * * "-name-with.invalid characters;" // replace "-. 
;" with "_" * "_name_with_invalid_characters_" // * * @param buf * buffer * @param replace * string of characters to replace with the one substitute char * @param substitution * string to insert in place of chars * @return scrubbed text */ public static String fast_replace(String buf, String replace, String substitution) { StringBuilder _new = new StringBuilder(); for (char ch : buf.toCharArray()) { if (replace.indexOf(ch) >= 0) { _new.append(substitution); } else { _new.append(ch); } } return _new.toString(); } /** * Remove instances of any char in the remove string from buf * * @param buf * text * @param remove * string to remove * @return scrubbed text */ public static String removeAny(String buf, String remove) { StringBuilder _new = new StringBuilder(); for (char ch : buf.toCharArray()) { if (remove.indexOf(ch) < 0) { _new.append(ch); } } return _new.toString(); } /** * Replace any of the removal chars with the sub. A many to one replacement. * alt: use regex String.replace(//, '') * * @param buf * text * @param remove * string to replace * @param sub * the replacement string * @return scrubbed text */ public static String replaceAny(String buf, String remove, String sub) { StringBuilder _new = new StringBuilder(); for (char ch : buf.toCharArray()) { if (remove.indexOf(ch) < 0) { _new.append(ch); } else { _new.append(sub); } } return _new.toString(); } /** * compare to trim( string, chars ), but you can trim any chars * * Example: - a b c remove "-" from string above. * * @param buf * text * @param remove * string to remove * @return scrubbed text */ public static String removeAnyLeft(String buf, String remove) { boolean eval = true; // Start from left. int x = 0; for (char ch : buf.toCharArray()) { if (eval && remove.indexOf(ch) >= 0) { ++x; continue; } else { eval = false; // shunt the evaluation of the chars. } } return buf.substring(x); } /** * Normalization: Clean the ends, Remove Line-endings from middle of entity. 
* * <pre> * Example: * TEXT: **The Daily Newsletter of \n\rBarbara, So.** * CLEAN: __The Daily Newsletter of __Barbara, So___ * * Where "__" represents omitted characters. * </pre> * * @param str * text * @return scrubbed text */ public static String normalizeTextEntity(String str) { if (isBlank(str)) { return ""; } char[] chars = str.toCharArray(); int s1 = 0, s2 = chars.length - 1; int end = s2; while (s1 < s2 && !(Character.isLetter(chars[s1]) || Character.isDigit(chars[s1]))) { ++s1; } // No text found if (s1 == s2) { return null; } while (s2 > s1 && !(Character.isLetter(chars[s2]) || Character.isDigit(chars[s2]))) { --s2; } if (s1 == 0 && s2 == end) { // No cleanup to do. return squeeze_whitespace(str); } // NOT possible, I hope... if (s2 <= s1) { return ""; } // Some cleanup was done on ends of String. Now clear up whitespace. // return squeeze_whitespace(str.substring(s1, s2 + 1)); } private final static Pattern tokenizer = Pattern.compile("\\s+"); /** * Return just white-space delmited tokens. * * @param str * text * @return tokens */ public static String[] tokens(String str) { return tokenizer.split(str.trim()); } /** * Return tokens on the right most part of a buffer. If a para break occurs, * \n\n or \r\n\r\n, then return the part on the right of the break. * * @param str * text * @return whitespace delimited tokens */ public static final String[] tokensRight(String str) { if (str.length() == 0) { return null; } String[] toks = multi_eol2.split(str); if (toks.length == 0) { return null; } return tokens(toks[toks.length - 1]); // Rightmost } /** * See tokensRight() * * @param str * text * @return whitespace delimited tokens */ public static final String[] tokensLeft(String str) { if (str.length() == 0) { return null; } String[] toks = multi_eol2.split(str); if (toks.length == 0) { return null; } return tokens(toks[0]); // Leftmost } /** * Intended only as a filter for punctuation within a word. Text of the form * A.T.T. or U.S. becomes ATT and US. 
 A text such as Mr.Pibbs incorrectly
 * becomes MrPibbs but for the purposes of normalizing tokens this should be
 * fine. Use appropriate tokenization prior to using this as a filter.
 *
 * @param word
 *            phrase with periods denoting some abbreviation.
 * @return scrubbed text
 */
public static String normalizeAbbreviation(String word) {
    return word.replace(".", "");
}

/**
 * Supports Phoneticizer utility from OpenSextant v1.x Remove diacritics
 * from a phrase
 *
 * @param word
 *            text
 * @return scrubbed text
 */
public static String removeDiacritics(String word) {
    // first, fully decomposed all chars (NFD separates base char from its marks)
    String tmpWord = Normalizer.normalize(word, Normalizer.Form.NFD);
    StringBuilder newWord = new StringBuilder();
    char[] chars = tmpWord.toCharArray();
    // now, discard any characters from one of the "Mark" categories.
    for (char c : chars) {
        if (Character.getType(c) != Character.NON_SPACING_MARK
                && Character.getType(c) != Character.COMBINING_SPACING_MARK
                && Character.getType(c) != Character.ENCLOSING_MARK) {
            newWord.append(c);
        }
    }
    return newWord.toString();
}

/**
 * Normalize to "Normalization Form Canonical Decomposition" (NFD) REF:
 * http://stackoverflow.com/questions/3610013/file-listfiles-mangles-unicode-
 * names-with-jdk-6-unicode-normalization-issues This supports proper file
 * name retrieval from file system, among other things. In many situations
 * we see unicode file names -- Java can list them, but in using the
 * Java-provided version of the filename the OS/FS may not be able to find
 * the file by the name given in a particular normalized form.
 *
 * @param str
 *            text
 * @return normalized string, encoded with NFD bytes
 */
public static String normalizeUnicode(String str) {
    Normalizer.Form form = Normalizer.Form.NFD;
    // Skip the (relatively expensive) normalize call if already NFD.
    if (!Normalizer.isNormalized(str, form)) {
        return Normalizer.normalize(str, form);
    }
    return str;
}

/**
 * Matches non-text after a word.
 */
final static Pattern CLEAN_WORD_RIGHT = Pattern.compile("[^\\p{L}\\p{Nd}]+$");

/**
 * Matches non-text preceeding text
 */
final static Pattern CLEAN_WORD_LEFT = Pattern.compile("^[^\\p{L}\\p{Nd}]+");

/**
 * Obscure punctuation pattern that also deals with Unicode single and
 * double quotes
 */
final static Pattern CLEAN_WORD_PUNCT = Pattern.compile("[\"'.`\\u00B4\\u2018\\u2019]");

/**
 * Remove any leading and trailing punctuation and some internal
 * punctuation. Internal punctuation which indicates conjunction of two
 * tokens, e.g. a hyphen, should have caused a split into separate tokens at
 * the tokenization stage.
 *
 * Phoneticizer utility from OpenSextant v1.x Remove punctuation from a
 * phrase
 *
 * @param word
 *            text
 * @return scrubbed text
 */
public static String removePunctuation(String word) {
    String tmp = CLEAN_WORD_LEFT.matcher(word).replaceAll(" ");
    tmp = CLEAN_WORD_RIGHT.matcher(tmp).replaceAll(" ");
    // remove some internal punctuation. To be removed: char hex unicode_name
    // " 22 QUOTATION MARK
    // ' 27 APOSTROPHE
    // . 2e FULL STOP
    // ` 60 GRAVE ACCENT
    // b4 ACUTE ACCENT
    // 2018 LEFT SINGLE QUOTATION MARK
    // 2019 RIGHT SINGLE QUOTATION MARK
    return CLEAN_WORD_PUNCT.matcher(tmp).replaceAll("").trim();
}

// Alphabetic list of top-N languages -- ISO-639_1 "ISO2" language codes
//
public final static String arabicLang = "ar";
public final static String bahasaLang = "id";
public final static String chineseLang = "zh";
public final static String chineseTradLang = "zt";
public final static String englishLang = "en";
public final static String farsiLang = "fa";
public final static String frenchLang = "fr";
public final static String germanLang = "de";
public final static String italianLang = "it";
public final static String japaneseLang = "ja";
public final static String koreanLang = "ko";
public final static String portugueseLang = "pt";
public final static String russianLang = "ru";
public final static String spanishLang = "es";
public final static String turkishLang = "tr";
public final static String thaiLang = "th";
public final static String vietnameseLang = "vi";
public final static String romanianLang = "ro";

// Keyed by ISO-639 codes AND lowercase language names; populated by the
// static initializer below and extendable via addLanguage().
private final static Map<String, Language> languageMapISO639 = new HashMap<String, Language>();

/*
 * Initialize some langauge metadata.
 */
static {
    try {
        // initLanguageData(); // Barely useful -- this pulls out lang Locales.
        initLOCLanguageData(); // LOC language data is a list of all known languages w/ISO codes.
        // initICULanguageData(); ICU did not seem to be the right solution.
    } catch (Exception err) {
        err.printStackTrace();
    }
}

/**
 * If caller wants to add language they can.
 *
 * @return map of lang ID to language obj
 */
public static Map<String, Language> getLanguageMap() {
    return languageMapISO639;
}

/**
 * Initialize language codes and metadata. This establishes a map for the
 * most common language codes/names that exist in at least ISO-639-1 and
 * have a non-zero 2-char ID. 
* * <pre> * Based on: * http://stackoverflow.com/questions/674041/is-there-an-elegant-way * -to-convert-iso-639-2-3-letter-language-codes-to-java-lo * * Actual code mappings: en => eng eng => en * * cel => '' // Celtic; Avoid this. * * tr => tur tur => tr * * Names: tr => turkish tur => turkish turkish => tr // ISO2 only * * </pre> */ public static void initLanguageData() { Locale[] locales = Locale.getAvailableLocales(); for (Locale locale : locales) { Language l = new Language(locale.getISO3Language(), locale.getLanguage(), locale.getDisplayLanguage()); addLanguage(l); } } /** * This is Libray of Congress data for language IDs. This is offered as a * tool to help downstream language ID and enrich metadata when tagging data * from particular countries. * * Reference: http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt * * @throws java.io.IOException * if resource file is not found */ public static void initLOCLanguageData() throws java.io.IOException { // // DATA FILE: http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt java.io.InputStream io = TextUtils.class.getResourceAsStream("/ISO-639-2_utf-8.txt"); java.io.Reader featIO = new InputStreamReader(io, "UTF-8"); CsvListReader langReader = new CsvListReader(featIO, new CsvPreference.Builder('"', '|', "\n").build()); CellProcessor[] cells = { new Optional(), new Optional(), new Optional(), new Optional(), new NotNull() }; List<Object> lang = null; /* * ISO3,XX,ISO2,NAME,NAME_FR */ while ((lang = langReader.read(cells)) != null) { // String names = (String) lang.get(3); if (isBlank(names)) { continue; } if ("NAME".equals(names)) { continue; } List<String> namelist = TextUtils.string2list(names, ";"); String iso3 = (String) lang.get(0); if (iso3.startsWith("#")) { continue; } String iso2 = (String) lang.get(2); Language l = new Language(iso3, iso2, namelist.get(0)); addLanguage(l); } langReader.close(); // Popular languages that go by other codes. 
// ISO languages as listed by LOC are listed with Bibliographic vs. // Terminological codes. // FRE vs. FRA are subtle difference for French, but important if you // cannot find French by lang ID. // // Fully override French and Trad Chinese: Language fr = new Language("fra", "fr", "French"); addLanguage(fr, true); Language zhtw = new Language("zh-tw", "zt", "Chinese/Taiwain"); addLanguage(zhtw, true); // Delicately insert more common names and codes as well as locales // here. Language zh = new Language("zho", "zh", "Chinese"); languageMapISO639.put("zho", zh); Language zhcn = new Language("chi", "zh", "Chinese"); languageMapISO639.put("zh-cn", zhcn); Language fas = new Language("per", "fa", "Farsi"); languageMapISO639.put("farsi", fas); // Locales of English -- are still "English" Language en1 = new Language("eng", "en", "English"); languageMapISO639.put("en-gb", en1); languageMapISO639.put("en-us", en1); languageMapISO639.put("en-au", en1); } public static void addLanguage(Language lg) { addLanguage(lg, false); } /** * Extend the basic language dictionary. Note -- First language is listed in * language map by Name, and is not overwritten. Language objects may be * overwritten in map using lang codes. * * For example, fre = French(fre), fra = French(fra), and french = * French(fra) * * the last one, 'french' = could have been the French(fre) or (fra). * * Example, 'ger' and 'deu' are both valid ISO 3-alpha codes for German. * What to do? * * TODO: Create a language object that lists both language * biblio/terminology codes. * * @param lg * language object * @param override * if this value should overwrite an existing one. 
 */
public static void addLanguage(Language lg, boolean override) {
    if (lg == null) {
        return;
    }
    // ISO 3-char code entry: replaced only when override is set.
    if (lg.getCode() != null) {
        if (override || !languageMapISO639.containsKey(lg.getCode())) {
            languageMapISO639.put(lg.getCode(), lg);
        }
    }
    // ISO 2-char code entry: replaced only when override is set.
    if (lg.getISO639_1_Code() != null) {
        if (override || !languageMapISO639.containsKey(lg.getISO639_1_Code())) {
            languageMapISO639.put(lg.getISO639_1_Code(), lg);
        }
    }
    // Name entry: the first language registered under a given name always
    // wins -- override deliberately does not apply here.
    if (lg.getNameCode() != null) {
        if (!languageMapISO639.containsKey(lg.getNameCode())) {
            languageMapISO639.put(lg.getNameCode(), lg);
        }
    }
}

/**
 * Given an ISO2 char code (least common denominator) retrieve Language
 * Name.
 *
 * This is best effort; if the code is not found this returns null (the
 * code is only normalized to lowercase for the lookup itself).
 *
 * @param code
 *            lang ID
 * @return name of language, or null if the code is not recognized
 */
public static String getLanguageName(String code) {
    if (code == null) {
        return null;
    }
    Language L = getLanguage(code);
    return (L != null ? L.getName() : null);
}

/**
 * ISO2 and ISO3 char codes for languages are unique.
 *
 * @param code
 *            iso2 or iso3 code
 * @return the Language object for the code, or null if unknown
 */
public static Language getLanguage(String code) {
    if (code == null) {
        return null;
    }
    String lookup = code.toLowerCase();
    Language l = languageMapISO639.get(lookup);
    if (l != null) {
        return l;
    }
    // Keep looking. Strip a locale suffix, e.g. "en_gb" -> "en".
    if (lookup.contains("_")) {
        lookup = lookup.split("_")[0];
        l = languageMapISO639.get(lookup);
        if (l != null) {
            return l;
        }
    }
    return null;
}

/**
 * ISO2 and ISO3 char codes for languages are unique.
 *
 * @param code
 *            iso2 or iso3 code
 * @return the canonical code for the language, or null if unknown
 */
public static String getLanguageCode(String code) {
    if (code == null) {
        return null;
    }
    Language l = getLanguage(code);
    if (l != null) {
        return l.getCode();
    }
    return null;
}

// True if the ISO2 code is one of the Romance languages tracked here.
private static boolean _isRomanceLanguage(String l) {
    return (l.equals(spanishLang) || l.equals(portugueseLang) || l.equals(italianLang) || l.equals(frenchLang)
            || l.equals(romanianLang));
}

/**
 * European languages = Romance + GER + ENG Extend definition as needed.
* * @param l * language ID * @return true if language is European in nature */ public static boolean isEuroLanguage(String l) { Language lang = getLanguage(l); if (lang == null) { return false; } String id = lang.getISO639_1_Code(); return (_isRomanceLanguage(id) || id.equals(germanLang) || id.equals(englishLang)); } /** * Romance languages = SPA + POR + ITA + FRA + ROM * * Extend definition as needed. * * @param l * lang ID * @return true if language is a Romance language */ public static boolean isRomanceLanguage(String l) { Language lang = getLanguage(l); if (lang == null) { return false; } String id = lang.getISO639_1_Code(); return _isRomanceLanguage(id); } /** * Utility method to check if lang ID is English... * * @param x * a langcode * @return whether langcode is english */ public static boolean isEnglish(String x) { Language lang = getLanguage(x); if (lang == null) { return false; } String id = lang.getISO639_1_Code(); return (id.equals(englishLang)); } /** * Utility method to check if lang ID is Chinese(Traditional or * Simplified)... * * @param x * a langcode * @return whether langcode is chinese */ public static boolean isChinese(String x) { Language lang = getLanguage(x); if (lang == null) { return false; } String id = lang.getISO639_1_Code(); return (id.equals(chineseLang) || id.equals(chineseTradLang)); } /** * Utility method to check if lang ID is Chinese, Korean, or Japanese * * @param x * a langcode * @return whether langcode is a CJK language */ public static boolean isCJK(String x) { Language lang = getLanguage(x); if (lang == null) { return false; } String id = lang.getISO639_1_Code(); if (isBlank(id)) { return false; } return (id.equals(koreanLang) || id.equals(japaneseLang) || id.equals(chineseLang) || id.equals(chineseTradLang)); } /** * Returns a ratio of Chinese/Japanese/Korean characters: CJK chars / ALL * * TODO: needs testing; not sure if this is sustainable if block; or if it * is comprehensive. 
TODO: for performance reasons the internal chain of * comparisons is embedded in the method; Otherwise for each char, an * external method invocation is required. * * @param buf * the character to be tested * @return true if CJK, false otherwise */ public static double measureCJKText(String buf) { if (buf == null) { return -1.0; } int cjkCount = countCJKChars(buf.toCharArray()); return ((double) cjkCount) / buf.length(); } private final static int LATIN1_END = 0xFE; /** * * Counts the CJK characters in buffer, buf chars Inspiration: * http://stackoverflow * .com/questions/1499804/how-can-i-detect-japanese-text-in-a-java-string * Assumption is that the char array is Unicode characters. * * @param chars * char array for the text in question. * @return count of CJK characters */ public static int countCJKChars(char[] chars) { int cjkCount = 0; for (char ch : chars) { // Ignore ASCII outright. // Ignore Latin-1 outright. if (ch < LATIN1_END) { continue; } Character.UnicodeBlock blk = Character.UnicodeBlock.of(ch); if (isCJK(blk)) { // increment counter: ++cjkCount; } } return cjkCount; } /** * A simple test to see if text has any CJK characters at all. It returns * after the first such character. * * @param buf * text * @return if buf has at least one CJK char. */ public static boolean hasCJKText(String buf) { if (buf == null) { return false; } char ch; for (int x = 0; x < buf.length(); ++x) { ch = buf.charAt(x); // Ignore ASCII outright. // Ignore Latin-1 outright. 
if (ch < LATIN1_END) { continue; } Character.UnicodeBlock blk = Character.UnicodeBlock.of(ch); if (isCJK(blk)) { return true; } } return false; } public static boolean isCJK(Character.UnicodeBlock blk) { // Chinese/CJK group: return isChinese(blk) || isJapanese(blk) || isKorean(blk); } public static boolean isChinese(Character.UnicodeBlock blk) { return (blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) || (blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) || (blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B) || (blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C) || (blk == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D) || (blk == Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS) || (blk == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS) || (blk == Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT) || (blk == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION) || (blk == Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS) || (blk == Character.UnicodeBlock.KANGXI_RADICALS) || (blk == Character.UnicodeBlock.YI_SYLLABLES) || (blk == Character.UnicodeBlock.YI_RADICALS) || (blk == Character.UnicodeBlock.BOPOMOFO) || (blk == Character.UnicodeBlock.BOPOMOFO_EXTENDED) || (blk == Character.UnicodeBlock.KANBUN); } /** * Likely to be uniquely Korean if the character block is in Hangul. But * also, it may be Korean if block is part of the CJK ideographs at large. * User must check if text in its entirety is part of CJK & Hangul, * independently. This method only detects if character block is uniquely * Hangul or not. 
* * @param blk * a Java Unicode block * @return true if char block is Hangul */ public static boolean isKorean(Character.UnicodeBlock blk) { return (blk == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO) || (blk == Character.UnicodeBlock.HANGUL_JAMO) || (blk == Character.UnicodeBlock.HANGUL_SYLLABLES) || (blk == Character.UnicodeBlock.HANGUL_JAMO_EXTENDED_A) || (blk == Character.UnicodeBlock.HANGUL_JAMO_EXTENDED_B); } /** * Checks if char block is uniquely Japanese. Check other chars isChinese * * @param blk * a Java Unicode block * @return true if char block is Hiragana or Katakana */ public static boolean isJapanese(Character.UnicodeBlock blk) { return (blk == Character.UnicodeBlock.HIRAGANA) || (blk == Character.UnicodeBlock.KATAKANA) || (blk == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS); } /** * Compress bytes from a Unicode string. Conversion to bytes first to avoid * unicode or platform-dependent IO issues. * * @param buf * UTF-8 encoded text * @return byte array * @throws IOException * on error with compression or text encoding */ public static byte[] compress(String buf) throws IOException { return compress(buf, "UTF-8"); } /** * * @param buf * text * @param charset * character set encoding for text * @return byte array for the compressed result * @throws IOException * on error with compression or text encoding */ public static byte[] compress(String buf, String charset) throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); GZIPOutputStream gz = new GZIPOutputStream(out); gz.write(buf.getBytes(charset)); gz.close(); return out.toByteArray(); } /** * * @param gzData * byte array containing gzipped buffer * @return buffer UTF-8 decoded string * * @throws IOException * on error with decompression or text encoding */ public static String uncompress(byte[] gzData) throws IOException { return uncompress(gzData, "UTF-8"); } private final static int ONEKB = 1024; /** * * @param gzData * byte array containing gzipped buffer * 
@param charset * character set decoding for text * @return buffer of uncompressed, decoded string * @throws IOException * on error with decompression or text encoding */ public static String uncompress(byte[] gzData, String charset) throws IOException { GZIPInputStream gzipInputStream = new GZIPInputStream(new ByteArrayInputStream(gzData)); ByteArrayOutputStream out = new ByteArrayOutputStream(); byte[] buf = new byte[ONEKB]; int len; while ((len = gzipInputStream.read(buf)) > 0) { out.write(buf, 0, len); } gzipInputStream.close(); out.close(); return new String(out.toByteArray(), charset); } /** * Unicode and social media -- We encounter all sorts of hangups when * processing modern unicode text. XML issues, JNI issues, escape utilities, * etc. All sorts of problems arise with emoticons aka emoji, and other * symbols used in online media. So these utilities are offered to help * remove such things prior to data processing. */ // UnicodeBlock.MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS; private static final Pattern SCRUB_SYM = Pattern.compile("\\p{block=Miscellaneous Symbols And Pictographs}+"); private static final Pattern SCRUB_SYM2 = Pattern.compile("\\p{block=Transport and Map Symbols}+"); private static final Pattern SCRUB_EMOTICONS = Pattern.compile("\\p{block=Emoticons}+"); private static final Pattern SCRUB_ALPHASUP = Pattern.compile("\\p{block=Enclosed Alphanumeric Supplement}+"); private static final Pattern SCRUB_TILES1 = Pattern.compile("\\p{block=Mahjong Tiles}+"); private static final Pattern SCRUB_TILES2 = Pattern.compile("\\p{block=Domino Tiles}+"); private static final Pattern SCRUB_SYM_MISC = Pattern.compile("\\p{block=Miscellaneous Symbols}+"); private static final Pattern SCRUB_PLAYCARDS = Pattern.compile("\\p{block=Playing Cards}+"); /** * replace Emoticons with something less nefarious -- UTF-16 characters do * not play well with some I/O routines. 
* * @param t * text * @return scrubbed text */ public static String removeEmoticons(String t) { return SCRUB_EMOTICONS.matcher(t).replaceAll("{icon}"); } /** * Replace symbology * * @param t * text * @return scrubbed text */ public static String removeSymbols(String t) { String _new = SCRUB_SYM.matcher(t).replaceAll("{sym}"); _new = SCRUB_SYM2.matcher(_new).replaceAll("{sym2}"); _new = SCRUB_ALPHASUP.matcher(_new).replaceAll("{asup}"); _new = SCRUB_TILES1.matcher(_new).replaceAll("{tile1}"); _new = SCRUB_TILES2.matcher(_new).replaceAll("{tile2}"); _new = SCRUB_SYM_MISC.matcher(_new).replaceAll("{sym}"); _new = SCRUB_PLAYCARDS.matcher(_new).replaceAll("{card}"); return _new; } /** * Count number of non-alphanumeric chars are present. * * @param t * @return */ public static int countNonText(final String t) { int nonText = 0; for (char c : t.toCharArray()) { if (!Character.isLetter(c) && Character.isDigit(c) && Character.isWhitespace(c)) { ++nonText; } } return nonText; } }