Example usage for java.lang.Character.getType

Introduction

On this page you can find example usages of java.lang.Character.getType.

Prototype

public static int getType(int codePoint)

Source Link

Document

Returns a value indicating a character's general category.

Usage

From source file:com.vuze.android.remote.adapter.TorrentListAdapter.java

/**
 * Tells whether the given code point is a cased letter, i.e. a letter whose
 * Unicode general category is "uppercase letter" or "lowercase letter".
 *
 * <p>Design notes carried over from the original author:
 * <ul>
 * <li>{@code Character.isAlphabetic} (KitKat and later) was tried and rejected
 * because it "seems to return symbolic languages".</li>
 * <li>A plain {@code A-Z}/{@code a-z} range check was rejected because it
 * misses accented letters.</li>
 * </ul>
 *
 * @param c the code point to classify
 * @return {@code true} only for upper- or lower-case letters
 */
private static boolean isAlphabetic(int c) {
    if (Character.isLetter(c)) {
        switch (Character.getType(c)) {
        case Character.LOWERCASE_LETTER:
        case Character.UPPERCASE_LETTER:
            return true;
        default:
            // title-case, modifier and "other" letters are not accepted
            return false;
        }
    }
    return false;
}

From source file:com.vuze.android.remote.adapter.TorrentListAdapter.java

/**
 * Tells whether the given code point is opening, closing or "other"
 * punctuation. Dash, connector and quote punctuation categories are not
 * accepted.
 *
 * @param c the code point to classify
 * @return {@code true} if the general category is start, end or other punctuation
 */
private static boolean isStandardPuncuation(int c) {
    switch (Character.getType(c)) {
    case Character.OTHER_PUNCTUATION:
    case Character.END_PUNCTUATION:
    case Character.START_PUNCTUATION:
        return true;
    default:
        return false;
    }
}

From source file:org.opendatakit.services.preferences.fragments.ServerSettingsFragment.java

/**
 * Builds an input filter that refuses user entry containing control
 * characters (carriage returns and line feeds among them).
 *
 * <p>The filter scans {@code source} from {@code start} (inclusive) to
 * {@code end} (exclusive). As soon as a character whose general category is
 * {@link Character#CONTROL} is seen it returns the empty string; if none is
 * found it returns {@code null}.
 * NOTE(review): under Android's InputFilter contract the empty string is
 * understood to drop the whole edit and {@code null} to accept it unchanged --
 * confirm against the platform documentation.
 *
 * @return a new filter instance
 */
private InputFilter getReturnFilter() {
    return new InputFilter() {
        public CharSequence filter(CharSequence source, int start, int end, Spanned dest, int dstart,
                int dend) {
            int position = start;
            while (position < end) {
                char typed = source.charAt(position);
                if (Character.CONTROL == Character.getType(typed)) {
                    return "";
                }
                position++;
            }
            return null;
        }
    };
}

From source file:org.apache.pdfbox.text.TextPosition.java

/**
 * Tells whether this position holds a single diacritic character.
 *
 * <p>Only text of exactly one UTF-16 char is considered. That char counts as
 * a diacritic when its general category is non-spacing mark, modifier symbol
 * or modifier letter.
 *
 * @return True if the current character is a diacritic char.
 */
public boolean isDiacritic() {
    final String unicode = this.getUnicode();
    if (unicode.length() == 1) {
        switch (Character.getType(unicode.charAt(0))) {
        case Character.MODIFIER_LETTER:
        case Character.MODIFIER_SYMBOL:
        case Character.NON_SPACING_MARK:
            return true;
        default:
            return false;
        }
    }
    // empty or multi-char text is never treated as a diacritic
    return false;
}

From source file:tufts.vue.ds.Field.java

/**
 * Tells whether the given code point is a currency sign.
 *
 * @param c the code point to classify
 * @return {@code true} for {@code '$'} or any code point whose general
 *         category is {@link Character#CURRENCY_SYMBOL}
 */
private static boolean isCurrencySymbol(int c) {
    // explicit short-circuit kept from the original; the category test below
    // should already cover the dollar sign
    if (c == '$') {
        return true;
    }
    return Character.CURRENCY_SYMBOL == Character.getType(c);
}

From source file:gate.creole.tokeniser.SimpleTokeniser.java

/**
 * The method that does the actual tokenisation.
 *
 * <p>The document content is walked one Unicode code point at a time. The
 * general category of each code point ({@link Character#getType(int)}) is
 * mapped through {@code typeIds} and fed to the deterministic state machine
 * starting at {@code dInitialState}. When the machine cannot consume the
 * current code point:
 * <ul>
 * <li>if a final state was seen since {@code tokenStart}, an annotation
 * described by that state's token description is added for the span from
 * {@code tokenStart} up to (not including) the current code point;</li>
 * <li>otherwise a {@code DEFAULT_TOKEN} annotation of type {@code UNKNOWN}
 * covering exactly the one code point at {@code tokenStart} is added, and
 * scanning resumes right after it.</li>
 * </ul>
 * A match still pending when the content ends is annotated as well.
 *
 * @throws ExecutionException if no document has been set; an
 *         {@code ExecutionInterruptedException} is thrown when
 *         {@code isInterrupted()} reports true at a progress checkpoint
 */
@Override
public void execute() throws ExecutionException {
    interrupted = false;
    AnnotationSet annotationSet;
    //check the input
    if (document == null) {
        throw new ExecutionException("No document to tokenise!");
    }

    if (annotationSetName == null || annotationSetName.equals(""))
        annotationSet = document.getAnnotations();
    else
        annotationSet = document.getAnnotations(annotationSetName);

    fireStatusChanged("Tokenising " + document.getName() + "...");

    String content = document.getContent().toString();
    int length = content.length();
    int currentChar;
    int charsInCurrentCP = 1;

    DFSMState graphPosition = dInitialState;

    //the index of the first character of the token trying to be recognised
    int tokenStart = 0;

    DFSMState lastMatchingState = null;
    DFSMState nextState;
    String tokenString;
    int charIdx = 0;
    int oldCharIdx = 0;
    FeatureMap newTokenFm;

    while (charIdx < length) {
        currentChar = content.codePointAt(charIdx);
        // number of chars we have to advance after processing this code point.
        // 1 in the vast majority of cases, but 2 where the code point is a
        // supplementary character represented as a surrogate pair.
        charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1;

        // Integer.valueOf instead of the deprecated boxing constructor
        nextState = graphPosition.next(typeIds.get(Integer.valueOf(Character.getType(currentChar))).intValue());

        if (null != nextState) {
            graphPosition = nextState;
            if (graphPosition.isFinal()) {
                lastMatchingState = graphPosition;
            }
            charIdx += charsInCurrentCP;
        } else {//we have a match!
            newTokenFm = Factory.newFeatureMap();

            if (null == lastMatchingState) {
                // no rule matches this character, so create a single-char
                // DEFAULT_TOKEN annotation covering it and start again after it.
                // The width must be that of the code point at tokenStart, not of
                // the code point at charIdx: the two differ when the machine had
                // already advanced past tokenStart, and using the wrong width
                // could cut a surrogate pair in half.
                charIdx = tokenStart + Character.charCount(content.codePointAt(tokenStart));
                tokenString = content.substring(tokenStart, charIdx);
                newTokenFm.put("type", "UNKNOWN");
                newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
                newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

                try {
                    annotationSet.add(Long.valueOf(tokenStart), Long.valueOf(charIdx), "DEFAULT_TOKEN", newTokenFm);
                } catch (InvalidOffsetException ioe) {
                    //This REALLY shouldn't happen!
                    // deliberately best-effort here: report and carry on
                    ioe.printStackTrace(Err.getPrintWriter());
                }
            } else {
                // we've reached the end of a string that the FSM recognised
                tokenString = content.substring(tokenStart, charIdx);
                newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
                newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

                // entry 0 of the token description names the annotation type;
                // the remaining entries are feature name/value pairs
                for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) {
                    newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                            lastMatchingState.getTokenDesc()[i][1]);
                }

                try {
                    annotationSet.add(Long.valueOf(tokenStart), Long.valueOf(charIdx),
                            lastMatchingState.getTokenDesc()[0][0], newTokenFm);
                } catch (InvalidOffsetException ioe) {
                    //This REALLY shouldn't happen!
                    throw new GateRuntimeException(ioe.toString());
                }
            }

            // reset to initial state and start looking again from here
            lastMatchingState = null;
            graphPosition = dInitialState;
            tokenStart = charIdx;
        }

        // report progress (and honour interruption) roughly every 256 chars
        if ((charIdx - oldCharIdx > 256)) {
            // widen to long so 100 * charIdx cannot overflow on huge documents
            fireProgressChanged((int) ((100L * charIdx) / length));
            oldCharIdx = charIdx;
            if (isInterrupted())
                throw new ExecutionInterruptedException();
        }

    } // while(charIdx < length)

    if (null != lastMatchingState) {
        // we dropped off the end having found a match, annotate it
        tokenString = content.substring(tokenStart, charIdx);
        newTokenFm = Factory.newFeatureMap();
        newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
        newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

        for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) {
            newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]);
        }

        try {
            annotationSet.add(Long.valueOf(tokenStart), Long.valueOf(charIdx), lastMatchingState.getTokenDesc()[0][0],
                    newTokenFm);
        } catch (InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            throw new GateRuntimeException(ioe.toString());
        }

    }

    reset();
    fireProcessFinished();
    fireStatusChanged("Tokenisation complete!");
}

From source file:marytts.util.string.StringUtils.java

/**
 * Determine whether the given codepoint is either a letter or
 * a modifier according to the Unicode standard. More precisely,
 * this returns true if codepoint belongs to one of the following categories
 * as defined at http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values:
 * <ul>
 * <li>Lu  Letter, Uppercase</li>
 * <li>Ll  Letter, Lowercase</li>
 * <li>Lt  Letter, Titlecase</li>
 * <li>Lm  Letter, Modifier</li>
 * <li>Lo  Letter, Other</li>
 * <li>Mn  Mark, Nonspacing</li>
 * <li>Mc  Mark, Spacing Combining</li>
 * <li>Me  Mark, Enclosing</li>
 * </ul>
 * Whether a given character is associated with this category can be looked up
 * at http://unicode.org/Public/UNIDATA/UnicodeData.txt
 * @param codePoint the unicode codepoint as determined e.g. by String.codePointAt().
 * @return true if the above condition is met, false otherwise
 */
public static boolean isLetterOrModifier(int codePoint) {
    switch (Character.getType(codePoint)) {
    // the five letter categories
    case Character.UPPERCASE_LETTER:
    case Character.LOWERCASE_LETTER:
    case Character.TITLECASE_LETTER:
    case Character.MODIFIER_LETTER:
    case Character.OTHER_LETTER:
    // the three mark categories
    case Character.NON_SPACING_MARK:
    case Character.COMBINING_SPACING_MARK:
    case Character.ENCLOSING_MARK:
        return true;
    default:
        return false;
    }
}

From source file:org.opensextant.util.TextUtils.java

/**
 * Supports Phoneticizer utility from OpenSextant v1.x Remove diacritics
 * from a phrase/*from   w  ww.  j  av a  2 s.com*/
 * 
 * @param word
 *            text
 * @return scrubbed text
 */
public static String removeDiacritics(String word) {

    // first, fully decomposed all chars
    String tmpWord = Normalizer.normalize(word, Normalizer.Form.NFD);
    StringBuilder newWord = new StringBuilder();
    char[] chars = tmpWord.toCharArray();
    // now, discard any characters from one of the "Mark" categories.
    for (char c : chars) {
        if (Character.getType(c) != Character.NON_SPACING_MARK
                && Character.getType(c) != Character.COMBINING_SPACING_MARK
                && Character.getType(c) != Character.ENCLOSING_MARK) {
            newWord.append(c);
        }
    }
    return newWord.toString();
}

From source file:org.apache.orc.impl.mask.RedactMaskFactory.java

/**
 * Given a UTF code point, find the replacement codepoint
 * @param codepoint a UTF character/*from   w w w .j  a v a  2  s  . c  o  m*/
 * @return the replacement codepoint
 */
int getReplacement(int codepoint) {
    switch (Character.getType(codepoint)) {
    case Character.UPPERCASE_LETTER:
        return UPPPER_REPLACEMENT;
    case Character.LOWERCASE_LETTER:
        return LOWER_REPLACEMENT;
    case Character.TITLECASE_LETTER:
    case Character.MODIFIER_LETTER:
    case Character.OTHER_LETTER:
        return OTHER_LETTER_REPLACEMENT;
    case Character.NON_SPACING_MARK:
    case Character.ENCLOSING_MARK:
    case Character.COMBINING_SPACING_MARK:
        return MARK_REPLACEMENT;
    case Character.DECIMAL_DIGIT_NUMBER:
        return DIGIT_CP_REPLACEMENT;
    case Character.LETTER_NUMBER:
    case Character.OTHER_NUMBER:
        return OTHER_NUMBER_REPLACEMENT;
    case Character.SPACE_SEPARATOR:
    case Character.LINE_SEPARATOR:
    case Character.PARAGRAPH_SEPARATOR:
        return SEPARATOR_REPLACEMENT;
    case Character.MATH_SYMBOL:
    case Character.CURRENCY_SYMBOL:
    case Character.MODIFIER_SYMBOL:
    case Character.OTHER_SYMBOL:
        return SYMBOL_REPLACEMENT;
    case Character.DASH_PUNCTUATION:
    case Character.START_PUNCTUATION:
    case Character.END_PUNCTUATION:
    case Character.CONNECTOR_PUNCTUATION:
    case Character.OTHER_PUNCTUATION:
        return PUNCTUATION_REPLACEMENT;
    default:
        return OTHER_REPLACEMENT;
    }
}

From source file:org.apache.orc.impl.mask.RedactMaskFactory.java

/**
 * Mask the given stringified numeric value excluding the unmask range.
 * Non-digit characters are passed through on the assumption they are
 * markers (eg. one of ",.ef")./*from   w w  w.ja va 2 s. com*/
 * @param value the original value.
 */
String maskNumericString(final String value) {
    StringBuilder result = new StringBuilder();
    final int length = value.codePointCount(0, value.length());
    for (int c = 0; c < length; ++c) {
        int cp = value.codePointAt(c);
        if (isIndexInUnmaskRange(c, length) || Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
            result.appendCodePoint(cp);
        } else {
            result.appendCodePoint(DIGIT_CP_REPLACEMENT);
        }
    }
    return result.toString();
}