List of usage examples for java.lang Character getType
public static int getType(int codePoint)
From source file:com.vuze.android.remote.adapter.TorrentListAdapter.java
private static boolean isAlphabetic(int c) { // Seems to return symbolic languages // if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.KITKAT) { // return Character.isAlphabetic(c); // }//from www . j a va 2 s . com if (!Character.isLetter(c)) { return false; } int type = Character.getType(c); return type == Character.UPPERCASE_LETTER || type == Character.LOWERCASE_LETTER; // Simple, but doesn't include letters with hats on them ;) //return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); }
From source file:com.vuze.android.remote.adapter.TorrentListAdapter.java
/**
 * Tells whether a code point is opening, closing or "other" punctuation.
 *
 * <p>Dash, connector and quote punctuation categories are not accepted.
 *
 * @param c the code point to test
 * @return {@code true} if the general category of {@code c} is start,
 *         end or other punctuation
 */
private static boolean isStandardPuncuation(int c) {
    switch (Character.getType(c)) {
        case Character.START_PUNCTUATION:
        case Character.END_PUNCTUATION:
        case Character.OTHER_PUNCTUATION:
            return true;
        default:
            return false;
    }
}
From source file:org.opendatakit.services.preferences.fragments.ServerSettingsFragment.java
/**
 * Builds an input filter that disallows carriage returns (and any other
 * character of the Unicode "control" category) in user entry.
 *
 * @return a filter whose {@code filter} method returns the empty string
 *         as soon as a control character is found between {@code start}
 *         (inclusive) and {@code end} (exclusive) of the incoming text,
 *         and {@code null} when none is found
 */
private InputFilter getReturnFilter() {
    return new InputFilter() {
        @Override
        public CharSequence filter(CharSequence source, int start, int end, Spanned dest, int dstart,
                int dend) {
            int index = start;
            while (index < end) {
                if (Character.CONTROL == Character.getType(source.charAt(index))) {
                    // One control character is enough to reject the whole entry.
                    return "";
                }
                index++;
            }
            return null;
        }
    };
}
From source file:org.apache.pdfbox.text.TextPosition.java
/** * @return True if the current character is a diacritic char. *///from www. j av a 2 s .c om public boolean isDiacritic() { String text = this.getUnicode(); if (text.length() != 1) { return false; } int type = Character.getType(text.charAt(0)); return type == Character.NON_SPACING_MARK || type == Character.MODIFIER_SYMBOL || type == Character.MODIFIER_LETTER; }
From source file:tufts.vue.ds.Field.java
private static boolean isCurrencySymbol(int c) { // checking '$' should be redundant return c == '$' || Character.getType(c) == Character.CURRENCY_SYMBOL; }
From source file:gate.creole.tokeniser.SimpleTokeniser.java
/** * The method that does the actual tokenisation. *//*from w ww .ja va2s . c o m*/ @Override public void execute() throws ExecutionException { interrupted = false; AnnotationSet annotationSet; //check the input if (document == null) { throw new ExecutionException("No document to tokenise!"); } if (annotationSetName == null || annotationSetName.equals("")) annotationSet = document.getAnnotations(); else annotationSet = document.getAnnotations(annotationSetName); fireStatusChanged("Tokenising " + document.getName() + "..."); String content = document.getContent().toString(); int length = content.length(); int currentChar; int charsInCurrentCP = 1; DFSMState graphPosition = dInitialState; //the index of the first character of the token trying to be recognised int tokenStart = 0; DFSMState lastMatchingState = null; DFSMState nextState; String tokenString; int charIdx = 0; int oldCharIdx = 0; FeatureMap newTokenFm; while (charIdx < length) { currentChar = content.codePointAt(charIdx); // number of chars we have to advance after processing this code point. // 1 in the vast majority of cases, but 2 where the code point is a // supplementary character represented as a surrogate pair. charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1; // Out.println( // currentChar + typesMnemonics[Character.getType(currentChar)+128]); nextState = graphPosition.next(typeIds.get(new Integer(Character.getType(currentChar))).intValue()); if (null != nextState) { graphPosition = nextState; if (graphPosition.isFinal()) { lastMatchingState = graphPosition; } charIdx += charsInCurrentCP; } else {//we have a match! 
newTokenFm = Factory.newFeatureMap(); if (null == lastMatchingState) { // no rule matches this character, so create a single-char // DEFAULT_TOKEN annotation covering it and start again after it charIdx = tokenStart + charsInCurrentCP; tokenString = content.substring(tokenStart, charIdx); newTokenFm.put("type", "UNKNOWN"); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); try { annotationSet.add(new Long(tokenStart), new Long(charIdx), "DEFAULT_TOKEN", newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! ioe.printStackTrace(Err.getPrintWriter()); } // Out.println("Default token: " + tokenStart + // "->" + tokenStart + " :" + tokenString + ";"); } else { // we've reached the end of a string that the FSM recognised tokenString = content.substring(tokenStart, charIdx); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) { newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]); //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" + // lastMatchingState.getTokenDesc()[i][1]); } try { annotationSet.add(new Long(tokenStart), new Long(charIdx), lastMatchingState.getTokenDesc()[0][0], newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! 
throw new GateRuntimeException(ioe.toString()); } // Out.println(lastMatchingState.getTokenDesc()[0][0] + // ": " + tokenStart + "->" + lastMatch + // " :" + tokenString + ";"); //charIdx = lastMatch + 1; } // reset to initial state and start looking again from here lastMatchingState = null; graphPosition = dInitialState; tokenStart = charIdx; } if ((charIdx - oldCharIdx > 256)) { fireProgressChanged((100 * charIdx) / length); oldCharIdx = charIdx; if (isInterrupted()) throw new ExecutionInterruptedException(); } } // while(charIdx < length) if (null != lastMatchingState) { // we dropped off the end having found a match, annotate it tokenString = content.substring(tokenStart, charIdx); newTokenFm = Factory.newFeatureMap(); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) { newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]); } try { annotationSet.add(new Long(tokenStart), new Long(charIdx), lastMatchingState.getTokenDesc()[0][0], newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! throw new GateRuntimeException(ioe.toString()); } } reset(); fireProcessFinished(); fireStatusChanged("Tokenisation complete!"); }
From source file:marytts.util.string.StringUtils.java
/**
 * Checks whether a code point is a letter or a mark according to the
 * Unicode standard. The result is {@code true} exactly when the code point
 * falls in one of these general categories (see
 * http://unicode.org/Public/UNIDATA/UCD.html#General_Category_Values):
 * <ul>
 * <li>Lu Letter, Uppercase</li>
 * <li>Ll Letter, Lowercase</li>
 * <li>Lt Letter, Titlecase</li>
 * <li>Lm Letter, Modifier</li>
 * <li>Lo Letter, Other</li>
 * <li>Mn Mark, Nonspacing</li>
 * <li>Mc Mark, Spacing Combining</li>
 * <li>Me Mark, Enclosing</li>
 * </ul>
 * The category of any individual character can be looked up at
 * http://unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * @param codePoint the unicode codepoint as determined e.g. by String.codePointAt().
 * @return true if the code point is in one of the listed categories, false otherwise
 */
public static boolean isLetterOrModifier(int codePoint) {
    switch (Character.getType(codePoint)) {
        // letters
        case Character.UPPERCASE_LETTER:
        case Character.LOWERCASE_LETTER:
        case Character.TITLECASE_LETTER:
        case Character.MODIFIER_LETTER:
        case Character.OTHER_LETTER:
        // marks
        case Character.NON_SPACING_MARK:
        case Character.COMBINING_SPACING_MARK:
        case Character.ENCLOSING_MARK:
            return true;
        default:
            return false;
    }
}
From source file:org.opensextant.util.TextUtils.java
/** * Supports Phoneticizer utility from OpenSextant v1.x Remove diacritics * from a phrase/*from w ww. j av a 2 s.com*/ * * @param word * text * @return scrubbed text */ public static String removeDiacritics(String word) { // first, fully decomposed all chars String tmpWord = Normalizer.normalize(word, Normalizer.Form.NFD); StringBuilder newWord = new StringBuilder(); char[] chars = tmpWord.toCharArray(); // now, discard any characters from one of the "Mark" categories. for (char c : chars) { if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c) != Character.COMBINING_SPACING_MARK && Character.getType(c) != Character.ENCLOSING_MARK) { newWord.append(c); } } return newWord.toString(); }
From source file:org.apache.orc.impl.mask.RedactMaskFactory.java
/** * Given a UTF code point, find the replacement codepoint * @param codepoint a UTF character/*from w w w .j a v a 2 s . c o m*/ * @return the replacement codepoint */ int getReplacement(int codepoint) { switch (Character.getType(codepoint)) { case Character.UPPERCASE_LETTER: return UPPPER_REPLACEMENT; case Character.LOWERCASE_LETTER: return LOWER_REPLACEMENT; case Character.TITLECASE_LETTER: case Character.MODIFIER_LETTER: case Character.OTHER_LETTER: return OTHER_LETTER_REPLACEMENT; case Character.NON_SPACING_MARK: case Character.ENCLOSING_MARK: case Character.COMBINING_SPACING_MARK: return MARK_REPLACEMENT; case Character.DECIMAL_DIGIT_NUMBER: return DIGIT_CP_REPLACEMENT; case Character.LETTER_NUMBER: case Character.OTHER_NUMBER: return OTHER_NUMBER_REPLACEMENT; case Character.SPACE_SEPARATOR: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: return SEPARATOR_REPLACEMENT; case Character.MATH_SYMBOL: case Character.CURRENCY_SYMBOL: case Character.MODIFIER_SYMBOL: case Character.OTHER_SYMBOL: return SYMBOL_REPLACEMENT; case Character.DASH_PUNCTUATION: case Character.START_PUNCTUATION: case Character.END_PUNCTUATION: case Character.CONNECTOR_PUNCTUATION: case Character.OTHER_PUNCTUATION: return PUNCTUATION_REPLACEMENT; default: return OTHER_REPLACEMENT; } }
From source file:org.apache.orc.impl.mask.RedactMaskFactory.java
/** * Mask the given stringified numeric value excluding the unmask range. * Non-digit characters are passed through on the assumption they are * markers (eg. one of ",.ef")./*from w w w.ja va 2 s. com*/ * @param value the original value. */ String maskNumericString(final String value) { StringBuilder result = new StringBuilder(); final int length = value.codePointCount(0, value.length()); for (int c = 0; c < length; ++c) { int cp = value.codePointAt(c); if (isIndexInUnmaskRange(c, length) || Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) { result.appendCodePoint(cp); } else { result.appendCodePoint(DIGIT_CP_REPLACEMENT); } } return result.toString(); }