Parse text of string token to unicode characters. - Java java.lang

Java examples for java.lang:String Unicode

Description

Parse the text of a string token into Unicode characters.

Demo Code

/*
 * Reference ETL Parser for Java
 * Copyright (c) 2000-2009 Constantine A Plotnikov
 *
 * Permission is hereby granted, free of charge, to any person 
 * obtaining a copy of this software and associated documentation 
 * files (the "Software"), to deal in the Software without restriction,
 * including without limitation the rights to use, copy, modify, merge, 
 * publish, distribute, sublicense, and/or sell copies of the Software, 
 * and to permit persons to whom the Software is furnished to do so, 
 * subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
 * SOFTWARE. 
 */
import java.math.BigInteger;

public class Main{
    /**
     * Parse text of string token to unicode characters. The string prefix
     * (any leading run of Unicode identifier characters) is ignored. Note it
     * is assumed that the token has been already parsed by the lexer, so
     * minimal additional validation is performed.
     * <p>
     * The token is delimited either by a single quote character
     * ({@code '} or {@code "}) on each side, or by three of them on each
     * side. Recognized escapes are {@code \UHEX;} (code point),
     * {@code \}{@code uXXXX} (UTF-16 unit), {@code \xXX} (8-bit value),
     * {@code \n}, {@code \r}, {@code \t}, {@code \f} and {@code \b}; a
     * backslash followed by any other character yields that character.
     * 
     * @param stringToken
     *            a string token to parse or null
     * @return parsed string or null if null has been passed as argument
     * @throws IllegalArgumentException
     *             if the token is too short, has no valid quotes, contains
     *             an unpaired surrogate or a malformed escape sequence
     *             (malformed hexadecimal digits are reported as
     *             {@link NumberFormatException}, which is a subclass)
     */
    public static String parseString(String stringToken) {
        if (stringToken == null) {
            return null;
        }
        int n = stringToken.length();
        if (n < 2) {
            throw new IllegalArgumentException(
                    "Unexpected end of the token " + n);
        }
        // Skip the prefix; the scan is bounded so that a token without any
        // quote is reported as malformed instead of running off the end.
        int i = 0;
        while (i < n
                && Character.isUnicodeIdentifierPart(stringToken.charAt(i))) {
            i++;
        }
        if (i == n) {
            throw new IllegalArgumentException(
                    "The string is in invalid format: " + stringToken);
        }
        final char quote = stringToken.charAt(i);
        switch (quote) {
        case '\'':
        case '"':
            break;
        default:
            // report the character that was actually found where the
            // opening quote was expected
            throw new IllegalArgumentException("Invalid quote character "
                    + quote);
        }
        // Three opening plus three closing quotes need at least six
        // characters after the prefix; ">=" lets the empty triple-quoted
        // token through as an empty string.
        final boolean multiline = n >= 6 + i
                && stringToken.charAt(i + 1) == quote
                && stringToken.charAt(i + 2) == quote;
        // ignore last and first characters: from here on n is the index of
        // the first closing quote and i the index of the first content char
        n -= multiline ? 3 : 1;
        i += multiline ? 3 : 1;
        if (i > n
                || stringToken.charAt(n) != quote
                || (multiline && (stringToken.charAt(n + 1) != quote
                        || stringToken.charAt(n + 2) != quote))) {
            throw new IllegalArgumentException(
                    "The string is in invalid format: " + stringToken);
        }
        final StringBuilder rc = new StringBuilder(n - i);
        while (i < n) {
            char ch = stringToken.charAt(i++);
            if (ch != '\\') {
                i = appendLiteral(rc, stringToken, i, n, ch);
                continue;
            }
            if (i >= n) {
                throw new IllegalArgumentException(
                        "Unexpected end of the token " + i);
            }
            ch = stringToken.charAt(i++);
            switch (ch) {
            case 'U': {
                // variable-length hexadecimal code point terminated by ';'
                final int start = i;
                while (i < n && (ch = stringToken.charAt(i++)) != ';') {
                    if (hexValue(ch) < 0) {
                        throw new IllegalArgumentException(
                                "Invalid symbol in escape sequence "
                                        + ch);
                    }
                }
                if (i == start || stringToken.charAt(i - 1) != ';') {
                    throw new IllegalArgumentException(
                            "Unexpected end of the token " + i);
                }
                final int codepoint = Integer.parseInt(
                        stringToken.substring(start, i - 1), 16);
                rc.appendCodePoint(codepoint);
                break;
            }
            case 'u':
                rc.append((char) parseFixedHex(stringToken, i, 4, n));
                i += 4;
                break;
            case 'x':
                rc.append((char) parseFixedHex(stringToken, i, 2, n));
                i += 2;
                break;
            case 'n':
                rc.append('\n');
                break;
            case 'r':
                rc.append('\r');
                break;
            case 't':
                rc.append('\t');
                break;
            case 'f':
                rc.append('\f');
                break;
            case 'b':
                rc.append('\b');
                break;
            default:
                // any other escaped character stands for itself
                i = appendLiteral(rc, stringToken, i, n, ch);
            }
        }
        return rc.toString();
    }
    /**
     * Append a literal character to the output, copying a complete
     * surrogate pair as a unit and rejecting unpaired surrogates.
     * 
     * @param out
     *            the buffer to append to
     * @param token
     *            the token being parsed
     * @param next
     *            index of the character following {@code ch}
     * @param end
     *            index just past the last content character
     * @param ch
     *            the character that has just been read
     * @return index of the next unread character
     */
    private static int appendLiteral(StringBuilder out, String token,
            int next, int end, char ch) {
        if (Character.isHighSurrogate(ch)) {
            if (next < end
                    && Character.isLowSurrogate(token.charAt(next))) {
                out.append(ch).append(token.charAt(next));
                return next + 1;
            }
            throw new IllegalArgumentException(
                    "Unpaired surrogate in the token: " + ((int) ch));
        }
        if (Character.isLowSurrogate(ch)) {
            throw new IllegalArgumentException(
                    "Unpaired surrogate in the token: " + ((int) ch));
        }
        out.append(ch);
        return next;
    }
    /**
     * Parse exactly {@code digits} hexadecimal digits located inside the
     * content part of the token.
     * 
     * @param token
     *            the token being parsed
     * @param start
     *            index of the first digit
     * @param digits
     *            the number of digits to read
     * @param end
     *            index just past the last content character
     * @return the parsed non-negative value
     * @throws NumberFormatException
     *             if fewer than {@code digits} characters are available or
     *             one of them is not a hexadecimal digit
     */
    private static int parseFixedHex(String token, int start, int digits,
            int end) {
        if (start + digits > end) {
            throw new NumberFormatException(
                    "Unexpected end of the token " + end);
        }
        int value = 0;
        for (int k = start; k < start + digits; k++) {
            final int digit = hexValue(token.charAt(k));
            if (digit < 0) {
                throw new NumberFormatException(
                        "Invalid symbol in escape sequence "
                                + token.charAt(k));
            }
            value = (value << 4) | digit;
        }
        return value;
    }
    /**
     * Get value of ASCII hexadecimal digit.
     * 
     * @param ch
     *            a character to check
     * @return the digit value (0-15) or -1 if the character is not one of
     *         0-9, a-f, A-F
     */
    private static int hexValue(char ch) {
        if ('0' <= ch && ch <= '9') {
            return ch - '0';
        }
        if ('a' <= ch && ch <= 'f') {
            return ch - 'a' + 10;
        }
        if ('A' <= ch && ch <= 'F') {
            return ch - 'A' + 10;
        }
        return -1;
    }
    /**
     * Parse text of integer token to integer value. The token is first
     * analyzed with {@link #parseNumber(String)}; the resulting text is then
     * converted with {@link Integer#parseInt(String, int)} using the
     * reported base, with a leading minus sign added when the reported sign
     * is -1.
     * 
     * @param intToken
     *            a integer token to parse
     * @return parsed value
     * @throws NumberFormatException
     *             if the token kind is neither {@code Tokens.INTEGER} nor
     *             {@code Tokens.INTEGER_WITH_SUFFIX}, or if the text cannot
     *             be converted to an {@code int}
     */
    public static int parseInt(String intToken) {
        final NumberInfo n = parseNumber(intToken);
        if (n.kind != Tokens.INTEGER
                && n.kind != Tokens.INTEGER_WITH_SUFFIX) {
            throw new NumberFormatException("wrong token kind: " + n.kind);
        }
        String textToParse = n.text;
        if (n.sign == -1) {
            textToParse = "-" + textToParse;
        }
        return Integer.parseInt(textToParse, n.base);
    }
    /**
     * Parse number. This is a thin wrapper that creates a
     * {@code NumberParser} for the input and returns the result of its
     * {@code parse()} method.
     * 
     * @param input
     *            an input token
     * @return information about number.
     */
    public static NumberInfo parseNumber(String input) {
        return new NumberParser(input).parse();
    }
}

Related Tutorials