Example usage for java.lang.String.codePointAt

List of usage examples for java.lang.String.codePointAt


On this page you can find example usages of java.lang.String.codePointAt.


public int codePointAt(int index) 

Source Link


Returns the character (Unicode code point) at the specified index.


From source file:com.gargoylesoftware.htmlunit.HttpWebConnection.java

private Charset getCharset(final String charset, final List<NameValuePair> pairs) {
    for (final NameValuePair pair : pairs) {
        if (pair instanceof KeyDataPair) {
            final KeyDataPair pairWithFile = (KeyDataPair) pair;
            if (pairWithFile.getData() == null && pairWithFile.getFile() != null) {
                final String fileName = pairWithFile.getFile().getName();
                for (int i = 0; i < fileName.length(); i++) {
                    if (fileName.codePointAt(i) > 127) {
                        return Charset.forName(charset);
                    }//from  w  w  w.  ja  v a2  s .  c o m
    return null;

From source file:org.apache.orc.impl.mask.RedactMaskFactory.java

/**
 * Mask the given stringified numeric value excluding the unmask range.
 * Non-digit characters are passed through on the assumption they are
 * markers (e.g. one of ",.ef").
 * @param value the original value.
 */
String maskNumericString(final String value) {
    // NOTE(review): this excerpt is incomplete -- the bodies of both branches below and
    // the closing braces of the loop and the method are not visible.
    StringBuilder result = new StringBuilder();
    // Number of Unicode code points in value, which can be smaller than value.length().
    final int length = value.codePointCount(0, value.length());
    for (int c = 0; c < length; ++c) {
        // NOTE(review): c counts code points, but codePointAt() expects a char (UTF-16)
        // index. The two diverge once value contains a supplementary character --
        // confirm whether value.offsetByCodePoints(0, c) was intended.
        int cp = value.codePointAt(c);
        // First branch: index lies in the unmask range, or the code point is not a decimal digit.
        if (isIndexInUnmaskRange(c, length) || Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
        } else {
    return result.toString();

From source file:com.gargoylesoftware.htmlunit.javascript.host.xml.XMLHttpRequestTest.java

/**
 * @throws Exception if the test fails
 */
public void java_encoding() throws Exception {
    // Chrome and FF return the last apostrophe, see overrideMimeType_charset_all()
    // but Java and other tools (e.g. Notpad++) return only 3 characters, not 4
    // this method is not a test case, but rather to show the behavior of java

    final String string = "'\u9EC4'";
    final ByteArrayInputStream bais = new ByteArrayInputStream(string.getBytes("UTF-8"));
    try (final BufferedReader reader = new BufferedReader(new InputStreamReader(bais, "GBK"))) {
        final String output = reader.readLine();
        assertEquals(39, output.codePointAt(0));
        assertEquals(27035, output.codePointAt(1));
        assertEquals(65533, output.codePointAt(2));
        assertEquals(39, output.codePointAt(3));

From source file:org.sleuthkit.autopsy.casemodule.Case.java

/**
 * Sanitize the case name for PostgreSQL database, Solr cores, and ActiveMQ
 * topics. Makes it plain-vanilla enough that each item should be able to
 * use it.
 *
 * Sanitize the PostgreSQL/Solr core, and ActiveMQ name by excluding:
 * control characters, non-ASCII characters, and various others shown below.
 *
 * Solr:
 * http://stackoverflow.com/questions/29977519/what-makes-an-invalid-core-name
 * may not be / \ :
 *
 * ActiveMQ:
 * http://activemq.2283324.n4.nabble.com/What-are-limitations-restrictions-on-destination-name-td4664141.html
 * may not be ?
 *
 * PostgreSQL:
 * http://www.postgresql.org/docs/9.4/static/sql-syntax-lexical.html
 * 63 chars max, must start with a-z or _; following chars can be letters, _ or
 * digits
 *
 * SQLite: Uses autopsy.db for the database name; follows Windows naming
 * convention
 *
 * @param caseName The name of the case as typed in by the user
 * @return the sanitized case name to use for Database, Solr, and ActiveMQ
 */
static String sanitizeCaseName(String caseName) {

    String result;

    // Remove all non-ASCII characters
    result = caseName.replaceAll("[^\\p{ASCII}]", "_");

    // Remove all control characters
    result = result.replaceAll("[\\p{Cntrl}]", "_");

    // Remove / \ : ? space ' "
    result = result.replaceAll("[ /?:'\"\\\\]", "_");

    // Make it all lowercase
    result = result.toLowerCase();

    // Must start with letter or underscore for PostgreSQL. If not, prepend an underscore.
    if (result.length() > 0 && !(Character.isLetter(result.codePointAt(0)))
            && !(result.codePointAt(0) == '_')) {
        result = "_" + result;

    // Chop to 63-16=47 left (63 max for PostgreSQL, taking 16 for the date _20151225_123456)
    if (result.length() > MAX_SANITIZED_NAME_LENGTH) {
        result = result.substring(0, MAX_SANITIZED_NAME_LENGTH);

    if (result.isEmpty()) {
        result = "case";

    return result;

From source file:com.microsoft.windowsazure.mobileservices.MobileServiceTableBase.java

/**
 * Validates if a given string contains any of the following special characters:
 * "(U+0022), +(U+002B), /(U+002F), ?(U+003F), \(U+005C), `(U+0060)
 * @param s the string to check
 * @return true if the string contains at least one of these characters, false otherwise
 */
protected boolean containsSpecialCharacter(String s) {
    boolean result = false;

    final int length = s.length();

    final int cpQuotationMark = 0x0022;
    final int cpPlusSign = 0x002B;
    final int cpSolidus = 0x002F;
    final int cpQuestionMark = 0x003F;
    final int cpReverseSolidus = 0x005C;
    final int cpGraveAccent = 0x0060;

    for (int offset = 0; offset < length;) {
        final int codepoint = s.codePointAt(offset);

        if (codepoint == cpQuotationMark || codepoint == cpPlusSign || codepoint == cpSolidus
                || codepoint == cpQuestionMark || codepoint == cpReverseSolidus || codepoint == cpGraveAccent) {
            result = true;

        offset += Character.charCount(codepoint);

    return result;

From source file:net.sf.jabref.importer.HTMLConverter.java

public String formatUnicode(String text) {
    if (text == null) {
        return null;
    }//from  ww  w . jav a  2  s.c  o  m
    Set<Character> chars = unicodeSymbols.keySet();
    for (Character character : chars) {
        // System.err.println(new Integer((int) character).toString() + ": " + character.toString() + ": " + unicodeSymbols.get(character));
        text = text.replaceAll(character.toString(), unicodeSymbols.get(character));

    Integer cp;
    for (int i = 0; i <= (text.length() - 1); i++) {
        cp = text.codePointAt(i);
        if (cp >= 129) {
            LOGGER.warn("Unicode character not converted: " + cp);
    return text;

From source file:org.languagetool.rules.spelling.hunspell.HunspellRule.java

// Builds a text version of the sentence's tokens in which tokens that are immunized,
// ignored by the speller, URLs, e-mail addresses or quoted compounds are handled separately.
// NOTE(review): this excerpt is incomplete -- several branch bodies and closing braces
// are not visible, so only the statements shown here are described.
protected String getSentenceTextWithoutUrlsAndImmunizedTokens(AnalyzedSentence sentence) {
    StringBuilder sb = new StringBuilder();
    AnalyzedTokenReadings[] sentenceTokens = getSentenceWithImmunization(sentence).getTokens();
    // Iteration starts at index 1; presumably index 0 is a sentence-start marker -- TODO confirm.
    for (int i = 1; i < sentenceTokens.length; i++) {
        String token = sentenceTokens[i].getToken();
        if (sentenceTokens[i].isImmunized() || sentenceTokens[i].isIgnoredBySpeller() || isUrl(token)
                || isEMail(token) || isQuotedCompound(sentence, i, token)) {
            if (isQuotedCompound(sentence, i, token)) {
                // Quoted compound: a blank followed by the token without its first char.
                sb.append(" ").append(token.substring(1));
            }
            // replace URLs and immunized tokens with whitespace to ignore them for spell checking:
            else if (token.length() < 20) {
            } else {
                // Tokens of 20 or more chars: one blank per char of the token.
                for (int j = 0; j < token.length(); j++) {
                    sb.append(' ');
        } else if (token.length() > 1 && token.codePointCount(0, token.length()) != token.length()) {
            // Some symbols such as emojis are encoded as a surrogate pair and therefore have
            // a string length of 2; walk the token code point by code point.
            for (int charIndex = 0; charIndex < token.length();) {
                int unicodeCodePoint = token.codePointAt(charIndex);
                // 1 for a code point stored in one char, 2 for a surrogate pair.
                int increment = Character.charCount(unicodeCodePoint);
                if (increment == 1) {
                } else {
                    // A surrogate pair is replaced by two blanks.
                    sb.append("  ");
                charIndex += increment;
        } else {
    return sb.toString();

From source file:gate.creole.tokeniser.SimpleTokeniser.java

/**
 * The method that does the actual tokenisation.
 */
public void execute() throws ExecutionException {
    // Runs the document content through the DFSM one Unicode code point at a time and
    // adds an annotation for every recognised (or unrecognised single-code-point) token.
    // NOTE(review): this excerpt is incomplete -- closing braces, an else keyword and a
    // few statements are not visible; the comments describe the visible statements only.
    interrupted = false;
    AnnotationSet annotationSet;
    //check the input
    if (document == null) {
        throw new ExecutionException("No document to tokenise!");

    // Use the default annotation set when no name is configured, the named set otherwise.
    if (annotationSetName == null || annotationSetName.equals(""))
        annotationSet = document.getAnnotations();
        annotationSet = document.getAnnotations(annotationSetName);

    fireStatusChanged("Tokenising " + document.getName() + "...");

    String content = document.getContent().toString();
    // Length in UTF-16 chars; charIdx below is a char index as well.
    int length = content.length();
    int currentChar;
    int charsInCurrentCP = 1;

    DFSMState graphPosition = dInitialState;

    //the index of the first character of the token trying to be recognised
    int tokenStart = 0;

    // Most recent final state reached for the current token, null if there is none.
    DFSMState lastMatchingState = null;
    DFSMState nextState;
    String tokenString;
    int charIdx = 0;
    // charIdx value at the time of the last progress notification.
    int oldCharIdx = 0;
    FeatureMap newTokenFm;

    while (charIdx < length) {
        currentChar = content.codePointAt(charIdx);
        // number of chars we have to advance after processing this code point.
        // 1 in the vast majority of cases, but 2 where the code point is a
        // supplementary character represented as a surrogate pair.
        charsInCurrentCP = Character.isSupplementaryCodePoint(currentChar) ? 2 : 1;

        //      Out.println(
        //      currentChar + typesMnemonics[Character.getType(currentChar)+128]);
        // The transition is selected by the id that typeIds holds for the code point's
        // Character.getType() value.
        nextState = graphPosition.next(typeIds.get(new Integer(Character.getType(currentChar))).intValue());

        if (null != nextState) {
            // A transition exists: follow it and remember the state if it is final.
            graphPosition = nextState;
            if (graphPosition.isFinal()) {
                lastMatchingState = graphPosition;
            charIdx += charsInCurrentCP;
        } else {//we have a match!
            newTokenFm = Factory.newFeatureMap();

            if (null == lastMatchingState) {
                // no rule matches this character, so create a single-char
                // DEFAULT_TOKEN annotation covering it and start again after it
                charIdx = tokenStart + charsInCurrentCP;
                tokenString = content.substring(tokenStart, charIdx);
                newTokenFm.put("type", "UNKNOWN");
                newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
                newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

                try {
                    annotationSet.add(new Long(tokenStart), new Long(charIdx), "DEFAULT_TOKEN", newTokenFm);
                } catch (InvalidOffsetException ioe) {
                    //This REALLY shouldn't happen!
                // Out.println("Default token: " + tokenStart +
                //             "->" + tokenStart + " :" + tokenString + ";");
            } else {
                // we've reached the end of a string that the FSM recognised
                tokenString = content.substring(tokenStart, charIdx);
                newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
                newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

                // NOTE(review): the body of this loop is not visible in this excerpt.
                for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) {
                    //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
                    //                       lastMatchingState.getTokenDesc()[i][1]);

                try {
                    // Entry [0][0] of the token description is used as the annotation type.
                    annotationSet.add(new Long(tokenStart), new Long(charIdx),
                            lastMatchingState.getTokenDesc()[0][0], newTokenFm);
                } catch (InvalidOffsetException ioe) {
                    //This REALLY shouldn't happen!
                    throw new GateRuntimeException(ioe.toString());

                // Out.println(lastMatchingState.getTokenDesc()[0][0] +
                //              ": " + tokenStart + "->" + lastMatch +
                //              " :" + tokenString + ";");
                //charIdx = lastMatch + 1;

            // reset to initial state and start looking again from here
            lastMatchingState = null;
            graphPosition = dInitialState;
            tokenStart = charIdx;

        // Report progress and check for interruption once more than 256 chars have been
        // consumed since the last report.
        if ((charIdx - oldCharIdx > 256)) {
            fireProgressChanged((100 * charIdx) / length);
            oldCharIdx = charIdx;
            if (isInterrupted())
                throw new ExecutionInterruptedException();

    } // while(charIdx < length)

    if (null != lastMatchingState) {
        // we dropped off the end having found a match, annotate it
        tokenString = content.substring(tokenStart, charIdx);
        newTokenFm = Factory.newFeatureMap();
        newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
        newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME, Integer.toString(tokenString.length()));

        // Copy the remaining token description entries (index 1 onwards) as name/value features.
        for (int i = 1; i < lastMatchingState.getTokenDesc().length; i++) {
            newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], lastMatchingState.getTokenDesc()[i][1]);

        try {
            // NOTE(review): the remainder of this call is not visible in this excerpt.
            annotationSet.add(new Long(tokenStart), new Long(charIdx), lastMatchingState.getTokenDesc()[0][0],
        } catch (InvalidOffsetException ioe) {
            //This REALLY shouldn't happen!
            throw new GateRuntimeException(ioe.toString());


    fireStatusChanged("Tokenisation complete!");

From source file:tufts.vue.ds.Field.java

/**
 * Tries to interpret {@code text} as a number: first with Double.parseDouble, then with
 * LocalNumberFormat, and finally -- if {@code tryCurrency} is set and the first code point
 * is a currency symbol -- by parsing the text again without its first char.
 *
 * NOTE(review): this excerpt is incomplete; the catch bodies, some closing braces and the
 * condition guarding the debug output are not visible.
 *
 * @param text the text to parse
 * @param tryCurrency whether to retry without a leading currency symbol
 * @return double value if one found, Double.NaN otherwise
 */
private double getNumericValue(final String text, final boolean tryCurrency) {

    try {
        // Double.parseDouble handles most stuff, including "0x2F" style
        // hex values as well as scientific notation.
        // NOTE(review): the documented grammar of Double.valueOf requires a binary
        // exponent ("p") on hexadecimal input, so a plain "0x2F" may not parse -- confirm.
        return Double.parseDouble(text);
    } catch (Throwable t) {

    Number value = null;

    try {
        // This handles values of the form "1,234,567". It will also extract any
        // number that can be found at the head of a string: e.g. "7foo" will return
        // 7, or "70%" will return 70 (*not* 0.70).  The instance of LocalNumberFormat will
        // generally be a DecimalFormat
        value = LocalNumberFormat.parse(text);
    } catch (Throwable t) {

    // Note that if we use a NumberFormat.getCurrencyInstance() here to handle
    // currency, it will only allow the local currency symbol.

    // NOTE(review): substring(1) drops one char, which assumes the currency symbol is
    // not a surrogate pair -- confirm.
    if (value == null && tryCurrency && text.length() > 1 && isCurrencySymbol(text.codePointAt(0))) {
        // The recursive call passes tryCurrency == false, so at most one symbol is stripped.
        value = getNumericValue(text.substring(1), false); // NOTE RECURSION
        //Log.debug("HANDLED CURRENCY " + Util.tags(text) + " = " + Util.tags(value));

    // could allow for percent parsers that return value/100

        Log.debug(Util.tags(text) + " = " + Util.tags(value));

    return value == null ? Double.NaN : value.doubleValue();

From source file:org.xwoot.wikiContentManager.XWikiSwizzleClient.XwikiSwizzleClient.java

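/**
 * Sets the content of a wiki page, after comparing a digest of the page's current
 * content with a reference digest.
 *
 * @param pageId identifier of the page
 * @param value the content to set
 * @param algo name of the digest algorithm applied to the current page content
 * @param rmd reference digest that the digest of the current content is compared with
 * @return when the digests differ, the current page content (an empty string if it is
 *         missing, empty or consists only of the void character)
 * @throws NoSuchAlgorithmException
 * @throws XWikiSwizzleClientException
 */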
// NOTE(review): this excerpt is incomplete -- closing braces and the body of the branch
// for an existing page are not visible; the comments describe the visible statements only.
synchronized public String setPageContent(String pageId, String value, String algo, byte[] rmd)
        throws NoSuchAlgorithmException, XWikiSwizzleClientException {
    String result = null;
    Page page = null;
    String pageContent = "";

    // If the client is not connected yet, connect on the user's behalf;
    // otherwise connection management is left to the user.
    // The value returned by relogin() is not used in the visible code.
    boolean b = this.relogin();

    page = this.getWikiPage(pageId);

    // pageContent stays "" when the page was not found.
    if (page != null) {
        pageContent = page.getContent();

    // Digest of the current content, compared with the caller-supplied digest below.
    byte[] messageDigest = this.getDigest(pageContent, algo);

    if (MessageDigest.isEqual(messageDigest, rmd)) {
        // Digests match: create the page if it does not exist yet.
        if (page == null) {
            Map p = this.createPage(pageId, value);
            if (p == null) {
                throw new XWikiSwizzleClientException("Problem with setPageContent : can't create the page");
        } else {
    } else {
        // Digests differ: return the current content, with a missing, empty or
        // void-character-only content reported as the empty string.
        if ((pageContent == null)
                || ((pageContent.length() == 1) && (pageContent.codePointAt(0) == VOID_CHARACTER))
                || (pageContent.length() < 1)) {
            result = "";
        } else {
            result = pageContent;


    return result;