List of usage examples for java.lang.Character.isSupplementaryCodePoint
public static boolean isSupplementaryCodePoint(int codePoint)
From source file:Main.java
/**
 * Demonstrates {@link Character#isSupplementaryCodePoint(int)} on two sample code points
 * and prints each result on its own line.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final int[] samples = { 0x0065, 0x0abcd };
    for (final int sample : samples) {
        // Both samples are below U+10000, so neither is a supplementary code point.
        final boolean supplementary = Character.isSupplementaryCodePoint(sample);
        System.out.println(supplementary);
    }
}
From source file:Main.java
/**
 * Strips every supplementary code point (one that needs a surrogate pair) from a string.
 *
 * @param str the text to filter
 * @return {@code str} itself when {@code TextUtils.isEmpty} reports it empty or when it
 *         contains no surrogate pairs; otherwise a new string holding only the code points
 *         that are not supplementary, in their original order
 */
public static final String filterUCS4(String str) {
    if (TextUtils.isEmpty(str)) {
        return str;
    }
    final int charLength = str.length();
    // Equal counts mean every code point occupies exactly one char: nothing to remove.
    if (str.codePointCount(0, charLength) == charLength) {
        return str;
    }
    final StringBuilder kept = new StringBuilder();
    for (int pos = 0; pos < str.length();) {
        final int current = str.codePointAt(pos);
        pos += Character.charCount(current);
        if (!Character.isSupplementaryCodePoint(current)) {
            kept.appendCodePoint(current);
        }
    }
    return kept.toString();
}
From source file:SpinnerTest.java
/**
 * Converts a string into an array of its Unicode code points.
 *
 * @param str the string to decode
 * @return one array element per code point, in order; a surrogate pair yields a single
 *         element, an unpaired surrogate yields its own char value
 */
private static int[] toCodePointArray(String str) {
    final int charLength = str.length();
    final int[] result = new int[str.codePointCount(0, charLength)];
    int charIndex = 0;
    int slot = 0;
    while (charIndex < charLength) {
        final int codePoint = str.codePointAt(charIndex);
        result[slot++] = codePoint;
        // Step over both halves of a surrogate pair in one go.
        charIndex += Character.charCount(codePoint);
    }
    return result;
}
From source file:FormatTest.java
/**
 * Filters the text about to be inserted so that only digits and the minus sign remain,
 * then hands the filtered text to the superclass implementation.
 *
 * <p>The text is scanned forward one code point at a time. The previous backward scan
 * called {@code codePointAt} on the low surrogate first, so a surrogate pair was never
 * recognised as one code point: supplementary digits were dropped even though
 * {@link Character#isDigit(int)} accepts them, and the branch meant to remove the second
 * half of a pair would have deleted the char <em>before</em> the pair instead.
 *
 * @param fb the bypass passed through to the superclass
 * @param offset the insertion offset, passed through unchanged
 * @param string the text to insert; must not be null
 * @param attr the attributes, passed through unchanged
 * @throws BadLocationException if the superclass implementation throws it
 */
public void insertString(FilterBypass fb, int offset, String string, AttributeSet attr)
        throws BadLocationException {
    StringBuilder builder = new StringBuilder(string.length());
    int i = 0;
    while (i < string.length()) {
        int cp = string.codePointAt(i);
        if (Character.isDigit(cp) || cp == '-') {
            builder.appendCodePoint(cp);
        }
        // Advance past the whole code point (two chars for a surrogate pair).
        i += Character.charCount(cp);
    }
    super.insertString(fb, offset, builder.toString(), attr);
}
From source file:FormatTest.java
public void replace(FilterBypass fb, int offset, int length, String string, AttributeSet attr) throws BadLocationException { if (string != null) { StringBuilder builder = new StringBuilder(string); for (int i = builder.length() - 1; i >= 0; i--) { int cp = builder.codePointAt(i); if (!Character.isDigit(cp) && cp != '-') { builder.deleteCharAt(i); if (Character.isSupplementaryCodePoint(cp)) { i--;// w ww. jav a 2 s.co m builder.deleteCharAt(i); } } } string = builder.toString(); } super.replace(fb, offset, length, string, attr); }
From source file:gate.creole.tokeniser.SimpleTokeniser.java
/** * The method that does the actual tokenisation. *///from w ww .j a v a2s .c o m @Override public void execute() throws ExecutionException { interrupted = false; AnnotationSet annotationSet; //check the input if (document == null) { throw new ExecutionException("No document to tokenise!"); } if (annotationSetName == null || annotationSetName.equals("")) annotationSet = document.getAnnotations(); else annotationSet = document.getAnnotations(annotationSetName); fireStatusChanged("Tokenising " + document.getName() + "..."); String content = document.getContent().toString(); int length = content.length(); int currentChar; int charsInCurrentCP = 1; DFSMState graphPosition = dInitialState; //the index of the first character of the token trying to be recognised int tokenStart = 0; DFSMState lastMatchingState = null; DFSMState nextState; String tokenString; int charIdx = 0; int oldCharIdx = 0; FeatureMap newTokenFm; while (charIdx < length) { currentChar = content.codePointAt(charIdx); // number of chars we have to advance after processing this code point. // 1 in the vast majority of cases, but 2 where the code point is a // supplementary character represented as a surrogate pair. charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1; // Out.println( // currentChar + typesMnemonics[Character.getType(currentChar)+128]); nextState = graphPosition.next(typeIds.get(new Integer(Character.getType(currentChar))).intValue()); if (null != nextState) { graphPosition = nextState; if (graphPosition.isFinal()) { lastMatchingState = graphPosition; } charIdx += charsInCurrentCP; } else {//we have a match! 
newTokenFm = Factory.newFeatureMap(); if (null == lastMatchingState) { // no rule matches this character, so create a single-char // DEFAULT_TOKEN annotation covering it and start again after it charIdx = tokenStart + charsInCurrentCP; tokenString = content.substring(tokenStart, charIdx); newTokenFm.put("type", "UNKNOWN"); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); try { annotationSet.add(new Long(tokenStart), new Long(charIdx), "DEFAULT_TOKEN", newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! ioe.printStackTrace(Err.getPrintWriter()); } // Out.println("Default token: " + tokenStart + // "->" + tokenStart + " :" + tokenString + ";"); } else { // we've reached the end of a string that the FSM recognised tokenString = content.substring(tokenStart, charIdx); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) { newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]); //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" + // lastMatchingState.getTokenDesc()[i][1]); } try { annotationSet.add(new Long(tokenStart), new Long(charIdx), lastMatchingState.getTokenDesc()[0][0], newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! 
throw new GateRuntimeException(ioe.toString()); } // Out.println(lastMatchingState.getTokenDesc()[0][0] + // ": " + tokenStart + "->" + lastMatch + // " :" + tokenString + ";"); //charIdx = lastMatch + 1; } // reset to initial state and start looking again from here lastMatchingState = null; graphPosition = dInitialState; tokenStart = charIdx; } if ((charIdx - oldCharIdx > 256)) { fireProgressChanged((100 * charIdx) / length); oldCharIdx = charIdx; if (isInterrupted()) throw new ExecutionInterruptedException(); } } // while(charIdx < length) if (null != lastMatchingState) { // we dropped off the end having found a match, annotate it tokenString = content.substring(tokenStart, charIdx); newTokenFm = Factory.newFeatureMap(); newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString); newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length())); for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) { newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]); } try { annotationSet.add(new Long(tokenStart), new Long(charIdx), lastMatchingState.getTokenDesc()[0][0], newTokenFm); } catch (InvalidOffsetException ioe) { //This REALLY shouldn't happen! throw new GateRuntimeException(ioe.toString()); } } reset(); fireProcessFinished(); fireStatusChanged("Tokenisation complete!"); }
From source file:nl.strohalm.cyclos.utils.StringHelper.java
/** * Replaces supplementary characters with a ? character * @param text/* w w w .j a v a 2 s.c o m*/ * @return */ public static String replaceSupplementaryCharacters(final String text) { if (text == null) { return null; } final int len = text.length(); boolean isSupplementary = false; final StringBuilder result = new StringBuilder(); for (int i = 0; i < len; i++) { final int cp = Character.codePointAt(text, i); isSupplementary = Character.isSupplementaryCodePoint(cp); if (isSupplementary) { result.append("?"); i++; } else { result.append(text.charAt(i)); } } return result.toString(); }
From source file:org.eclipse.rdf4j.rio.turtle.TurtleParser.java
/**
 * Pushes a single code point back onto {@code reader}, so that it is the next thing
 * read again. A value of {@code -1} is ignored.
 *
 * @param codePoint
 *        a single Unicode code point, or {@code -1} for "nothing to push back".
 * @throws IOException
 *         if {@code reader} fails to accept the pushed-back data
 */
protected void unread(int codePoint) throws IOException {
    if (codePoint == -1) {
        return;
    }
    if (!Character.isSupplementaryCodePoint(codePoint)) {
        reader.unread(codePoint);
        return;
    }
    // A supplementary code point has to go back as its two-char surrogate pair.
    reader.unread(Character.toChars(codePoint));
}
From source file:org.eclipse.rdf4j.rio.turtle.TurtleParser.java
/** * Pushes back the supplied string by copying it to the front of the buffer. * After this method returns, successive calls to {@link #readCodePoint()} * will return the code points in the supplied string again, starting at the * first in the String../*from w ww . j a va 2s . co m*/ * * @param string * the string to un-read. * @throws IOException */ protected void unread(String string) throws IOException { for (int i = string.codePointCount(0, string.length()); i >= 1; i--) { final int codePoint = string.codePointBefore(i); if (Character.isSupplementaryCodePoint(codePoint)) { final char[] surrogatePair = Character.toChars(codePoint); reader.unread(surrogatePair); } else { reader.unread(codePoint); } } }
From source file:org.nuclos.common2.StringUtils.java
private static void makeSQLIdentifierFrom(StringBuilder result, String s, int maxLen) { if (maxLen < 1) throw new IllegalArgumentException(); final int len = s.length(); final int max = result.length() + maxLen; for (int i = 0; i < len; ++i) { final boolean accept; int c = s.codePointAt(i); if (Character.isSupplementaryCodePoint(c)) { ++i;//from w ww. j av a 2s . c om } if (c >= 'A' && c <= 'Z') { accept = true; } else if (c >= 'a' && c <= 'z') { accept = true; } else if (c >= '0' && c <= '9') { accept = true; } else { switch (c) { case '_': accept = true; break; case ' ': c = '_'; accept = true; break; // german umlaut support case '\u00e4': c = 'a'; accept = true; break; case '\u00f6': c = 'o'; accept = true; break; case '\u00fc': c = 'u'; accept = true; break; case '\u00df': c = 's'; accept = true; break; case '\u00c4': c = 'A'; accept = true; break; case '\u00d6': c = 'O'; accept = true; break; case '\u00dc': c = 'U'; accept = true; break; default: accept = false; } } if (accept) { result.append((char) c); } } if (result.length() > max) result.setLength(max); }