Strings.java Source code

Introduction

Here is the source code for Strings.java
Source

/*
 * LingPipe v. 3.9
 * Copyright (C) 2003-2010 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

//package com.aliasi.util;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import java.text.DecimalFormat;

/**
 * Static utility methods for processing strings, characters and
 * string buffers.
 *
 * @author  Bob Carpenter
 * @version 4.0.1
 * @since   LingPipe1.0
 * @see     java.lang.Character
 * @see     java.lang.String
 * @see     java.lang.StringBuilder
 */
public class Strings {
    /**
     * De-accents every character of the specified character sequence
     * and returns the result as a new string of the same length.
     * Each character is converted independently by {@link
     * #deAccentLatin1(char)}; see that method for the mapping used.
     *
     * @param cSeq Character sequence to de-accent.
     * @return String holding the de-accented characters, in order.
     */
    public static String deAccentLatin1(CharSequence cSeq) {
        int length = cSeq.length();
        StringBuilder deAccented = new StringBuilder(length);
        for (int pos = 0; pos < length; ++pos) {
            deAccented.append(deAccentLatin1(cSeq.charAt(pos)));
        }
        return deAccented.toString();
    }

    /**
     * Maps an accented letter of the Latin-1 Supplement (code points
     * 00C0-00FF) to an unaccented ASCII letter of the same case.  Any
     * character without an entry in the mapping is returned
     * unchanged; this covers everything below 00C0, everything above
     * 00FF, and the multiplication and division signs (00D7 and
     * 00F7).
     *
     * <p>Some mappings are approximations rather than true accent
     * removal: the AE ligatures map to <code>A</code>/<code>a</code>,
     * eth maps to <code>D</code>/<code>d</code>, thorn maps to
     * <code>P</code>/<code>p</code>, and sharp s (00DF) maps to the
     * single lower case letter <code>s</code>.
     *
     * <p>Latin-1 (ISO-8859-1) is a superset of ASCII, and its
     * unsigned byte values coincide with the Unicode code points of
     * the same characters.
     *
     * <p>The <code>unicode.org</code> site supplies a complete <a
     * href="http://unicode.org/charts/PDF/U0080.pdf">Latin-1
     * Supplement</a> chart, listing the code point of each character.
     *
     * @param c Character to de-accent.
     * @return Equivalent character without accent, or the character
     * itself if it has no mapping.
     */
    public static char deAccentLatin1(char c) {
        // Case labels that share a target letter are grouped and fall
        // through to a single return.
        switch (c) {

        // upper case, 00C0-00CF
        case '\u00C0': case '\u00C1': case '\u00C2': case '\u00C3':
        case '\u00C4': case '\u00C5':
        case '\u00C6': // capital AE ligature
            return 'A';
        case '\u00C7':
            return 'C';
        case '\u00C8': case '\u00C9': case '\u00CA': case '\u00CB':
            return 'E';
        case '\u00CC': case '\u00CD': case '\u00CE': case '\u00CF':
            return 'I';

        // upper case, 00D0-00DF (00D7, the multiplication sign, is skipped)
        case '\u00D0': // capital eth
            return 'D';
        case '\u00D1':
            return 'N';
        case '\u00D2': case '\u00D3': case '\u00D4': case '\u00D5':
        case '\u00D6': case '\u00D8':
            return 'O';
        case '\u00D9': case '\u00DA': case '\u00DB': case '\u00DC':
            return 'U';
        case '\u00DD':
            return 'Y';
        case '\u00DE': // runic letter thorn
            return 'P';
        case '\u00DF': // sharp s; its upper case form is SS
            return 's';

        // lower case, 00E0-00EF
        case '\u00E0': case '\u00E1': case '\u00E2': case '\u00E3':
        case '\u00E4': case '\u00E5':
        case '\u00E6': // ae ligature
            return 'a';
        case '\u00E7':
            return 'c';
        case '\u00E8': case '\u00E9': case '\u00EA': case '\u00EB':
            return 'e';
        case '\u00EC': case '\u00ED': case '\u00EE': case '\u00EF':
            return 'i';

        // lower case, 00F0-00FF (00F7, the division sign, is skipped)
        case '\u00F0': // small eth
            return 'd';
        case '\u00F1':
            return 'n';
        case '\u00F2': case '\u00F3': case '\u00F4': case '\u00F5':
        case '\u00F6': case '\u00F8':
            return 'o';
        case '\u00F9': case '\u00FA': case '\u00FB': case '\u00FC':
            return 'u';
        case '\u00FD': case '\u00FF':
            return 'y';
        case '\u00FE': // runic letter thorn
            return 'p';

        default:
            return c;
        }
    }

}