com.pfarrell.utils.misc.TextTools.java Source code

Java tutorial

Introduction

Here is the source code for com.pfarrell.utils.misc.TextTools.java

Source

/*
 * TextTools.java
 *
 * Created on February 14, 2007, 6:11 PM
 *
 * Copyright (c) 2007, Pat Farrell. All rights reserved.
 * Based on code from OneBigCD and perhaps even CyberCash
 *  
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.pfarrell.utils.misc;

import com.google.common.base.Preconditions;
import com.pfarrell.exceptions.PibException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.codec.language.DoubleMetaphone;

/**
 * The  <code>TextTools</code> class implements handy utilities on Strings
 *
 * @author pfarrell
 */
public abstract class TextTools {

    /** encoder for Metaphone replacement to soundex */
    private static DoubleMetaphone meta = new DoubleMetaphone();

    /**
     * get DoubleMetaphone algorithm encoding of argument
     * @param arg input string
     * @return DoubleMetaphone encoding
     */
    public static String getMetaphone(String arg) {
        meta.setMaxCodeLen(5);
        String rval = meta.encode(arg);
        return rval;
    }

    /**
     * Check if the Double Metaphone values of two String values are equal.
     * @param value1 first param
     * @param value2 another param
     * @return true if the encoded Strings are equal; false otherwise.
     */
    public static boolean isMetaphoneEqual(String value1, String value2) {
        return meta.isDoubleMetaphoneEqual(value1, value2);
    }

    /**
     * make a string with CRLF between items
     * @param arg list of strings
     * @return string with CRLF between items
     */
    public static String textFromList(List<String> arg) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < arg.size(); i++) {
            sb.append(arg.get(i));
            if (i < (arg.size() - 1)) {
                sb.append(Constants.CRLF);
            }
        }
        return sb.toString();
    }

    /**
     * make a string with no whitespace at all
     * @param arg  string
     * @return string with no whitespace
     */
    public static String noWhiteSpace(String arg) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < arg.length(); i++) {
            int c = arg.codePointAt(i);
            if (!Character.isWhitespace(c)) {
                sb.appendCodePoint(c);
            }
        }
        return sb.toString();
    }

    /**
     * make a string with no whitespace at all
     * @param arg  string
     * @return string with no whitespace
     */
    public static String justLettersOrDigits(String arg) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < arg.length(); i++) {
            int c = arg.codePointAt(i);
            if (Character.isLetterOrDigit(c)) {
                sb.appendCodePoint(c);
            }
        }
        return sb.toString();
    }

    /**
     * make a string with only letters
     * @param arg  string
     * @return string with only letters, no punct, space, digits, etc.
     */
    public static String justLetters(String arg) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < arg.length(); i++) {
            int c = arg.codePointAt(i);
            if (Character.isLetter(c)) {
                sb.appendCodePoint(c);
            }
        }
        return sb.toString();
    }

    /**
     * strip out any thing other than letters or space in the middle, 
     * @param arg any string
     * @return letters or spaces, after trim()
     */
    public static String justLetterOrSpace(String arg) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < arg.length(); i++) {
            char c = arg.charAt(i);
            if (Character.isLetter(c) || c == ' ') {
                sb.append(c);
            }
        }
        return sb.toString().trim();
    }

    /**
     * make a string with only digits
     * @param arg  string
     * @return string with only digits
     */
    public static String justDigits(String arg) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < arg.length(); i++) {
            int c = arg.codePointAt(i);
            if (Character.isDigit(c)) {
                sb.appendCodePoint(c);
            }
        }
        return sb.toString();
    }

    /**
     * ensure that we have word capitalized as American English handling of a name
     * @param arg input word
     * @return word capitalized. foo => Foo
     */
    public static String capitalizeAsName(String arg) {
        Preconditions.checkNotNull(arg);
        if (arg.length() == 0)
            return "";
        return (arg.length() == 1) ? arg.substring(0, 1).toUpperCase()
                : arg.substring(0, 1).toUpperCase() + arg.substring(1);
    }

    /**
     * ensure that we have word with Capital first letter, rest lowercase
     * @param arg input word
     * @return word capitalized. foo => Foo, FOO -> Foo, fOO -> Foo
     */
    public static String capitalFirstRestLower(String arg) {
        Preconditions.checkNotNull(arg);
        if (arg.length() == 0)
            return "";
        return (arg.length() == 1) ? arg.substring(0, 1).toUpperCase()
                : arg.substring(0, 1).toUpperCase() + arg.substring(1).toLowerCase();
    }

    /**
     * unquote a string surrounded by single or double quote
     * @param arg input string
     * @return unquoted string
     */
    public static String unquote(String arg) {
        String rval = arg;
        if (arg != null && arg.length() > 2) {
            char c = arg.charAt(0);
            char e = arg.charAt(arg.length() - 1);
            if (c == e) {
                if (c == '"' || c == '\'') {
                    rval = arg.substring(1, arg.length() - 1);
                }
            }
        }
        return rval;
    }

    /**
     * unquote a string surrounded by single or double quote possible with trailing punctuation
     * @param arg input string
     * @return unquoted string
     */
    public static String unquoteWithTrailing(String arg) {
        String rval = arg;
        if (arg != null && arg.length() > 2) {
            char c = arg.charAt(0);

            Pattern trailingPattern = Pattern.compile("(\\" + c + ")[,.]*$");
            Matcher m = trailingPattern.matcher(arg);
            if (m.find()) {
                rval = arg.substring(1, m.start()).trim();
            }
        }
        return rval;
    }

    /**
     * validate a credit card number with the Luhn 10 algorithm.
     * @param cardNumber a credit card number, no dashes or spaces
     * @return true if valid
     */
    public static boolean validateLuhn10(String cardNumber) {
        Preconditions.checkNotNull(cardNumber);
        String working = justDigits(cardNumber);
        return calculateLuhnCheckDigit(working) == 0;
    }

    private static int calculateLuhnCheckDigit(String working) {
        int sum = 0;
        int digit;
        int addend = 0;
        boolean doubled = false;
        for (int i = working.length() - 1; i >= 0; i--) {
            digit = Integer.parseInt(working.substring(i, i + 1));
            if (doubled) {
                addend = digit * 2;
                if (addend > 9) {
                    addend -= 9;
                }
            } else {
                addend = digit;
            }
            sum += addend;
            doubled = !doubled;
        }
        return (sum % 10);
    }

    private static final String[] digits = { "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" };

    /**
     * append a correct Luhn10 check digit to the argument and return new longer string
     * @param cardNumber account number, ok to have spaces or dash between groups
     * @return cardnumber with check digit
     */
    public static String addLuhn10Checkdigit(String cardNumber) {
        StringBuilder sb = new StringBuilder();
        Preconditions.checkNotNull(cardNumber);
        String working = justDigits(cardNumber);
        sb.append(working);
        String appended = working + digits[0];
        int c = calculateLuhnCheckDigit(appended);
        if (c == 0) {
            sb.append(digits[0]); // perfect, we are done
        } else {
            c = 10 - c;
            sb.append(digits[c]);
        }
        return sb.toString();
    }

    /**
     * calculate Levenshtein distance between two strings, only works for inputs that
     * are directly suitable for 'char' representation, will not work properly with extended Unicode
     * languages.
     * @see <a href=http://en.wikipedia.org/wiki/Levenshtein_distance>http://en.wikipedia.org/wiki/Levenshtein_distance</a>
     * @see <a href=http://en.literateprograms.org/Levenshtein_distance>http://en.literateprograms.org/Levenshtein_distance</a>
     * @param s1 first argument
     * @param s2 other argument
     * @return Levenshtein distance
     */
    public static int levenshtein(String s1, String s2) {
        Preconditions.checkNotNull(s1);
        Preconditions.checkNotNull(s2);
        int[][] dp = new int[s1.length() + 1][s2.length() + 1];
        for (int i = 0; i < dp.length; i++) {
            for (int j = 0; j < dp[i].length; j++) {
                dp[i][j] = i == 0 ? j : j == 0 ? i : 0;
                if (i > 0 && j > 0) {
                    if (s1.charAt(i - 1) == s2.charAt(j - 1))
                        dp[i][j] = dp[i - 1][j - 1];
                    else
                        dp[i][j] = Math.min(dp[i][j - 1] + 1, Math.min(dp[i - 1][j - 1] + 1, dp[i - 1][j] + 1));
                }
            }
        }
        return dp[s1.length()][s2.length()];
    }

    /**
     * gets Levenshtein distance "probability" between two strings, map value to 0.0 to 1.0 range
     * @param s1 first argument
     * @param s2 other argument
     * @return rough probability of strings being the same.
     */
    public static float levenshteinProb(String s1, String s2) {
        Preconditions.checkNotNull(s1);
        Preconditions.checkNotNull(s2);
        float rval = 0.0f;
        int raw = levenshtein(s1, s2);
        if (raw > s2.length()) {
            rval = 0.0f;
        } else if (raw != s2.length()) {
            rval = 1.0f - (1.0f * raw / s2.length());
        }
        return rval;
    }

    static final Pattern doubleDotPattern = Pattern.compile("digital\\.\\.com");

    /**
     * manually fix the dumb double dot that Tomcat seems to create.
     * @param url input string that may have double dot
     * @return cleaned up string.
     */
    public static String fixEvilDoubleDot(String url) {
        Preconditions.checkNotNull(url);
        String rval = url;
        Matcher m = doubleDotPattern.matcher(url);
        if (m.find()) {
            rval = m.replaceAll("digital\\.com");
        }
        return rval;
    }

    /**
     * quickly convert a byte array into a String for the argument Charset.
     * See http://stackoverflow.com/questions/1684040/java-why-charset-names-are-not-constants
     * @param array byte array
     * @param charset Charset to use (usually UTF-8)
     * @return  converted string
     */
    public static String stringFromByteArray(final byte[] array, final Charset charset) {
        String rval = null;
        try {
            rval = new String(array, charset.name());
        } catch (UnsupportedEncodingException ex) {
            throw new PibException("can not happen, no charset");
        }
        return rval;
    }
}