Example usage for java.lang Character charCount

List of usage examples for java.lang Character charCount

Introduction

This page collects example usages of java.lang.Character.charCount.

Prototype

public static int charCount(int codePoint) 

Source Link

Document

Determines the number of char values needed to represent the specified character (Unicode code point).

Usage

From source file:org.omegat.tokenizer.BaseTokenizer.java

/**
 * Splits the given string into one {@link Token} per Unicode code point.
 *
 * @param strOrig the text to split
 * @return an array with one token per code point; each token is created from the code point's
 *         text and the {@code char} offset at which that code point starts in {@code strOrig}
 */
protected Token[] tokenizeByCodePoint(String strOrig) {
    // See http://www.ibm.com/developerworks/library/j-unicode/#1-5
    // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?)
    final int charLength = strOrig.length();
    final Token[] result = new Token[strOrig.codePointCount(0, charLength)];
    int slot = 0;
    int offset = 0;
    while (offset < charLength) {
        final int codePoint = strOrig.codePointAt(offset);
        final String text = new String(Character.toChars(codePoint));
        result[slot] = new Token(text, offset);
        slot++;
        // A supplementary code point occupies two chars, so step by its real width.
        offset += Character.charCount(codePoint);
    }
    return result;
}

From source file:com.norconex.collector.http.redirect.impl.GenericRedirectURLProvider.java

/**
 * Returns a usable form of a redirect URL, attempting a re-encoding when the URL contains
 * non-ASCII characters.
 *
 * <p>A URL made only of ASCII characters is returned unchanged. Otherwise a warning is logged
 * and the URL is converted to bytes using {@code nonAsciiCharset} (when it is not blank and
 * not UTF-8) and turned back into a string. If that charset is unsupported, or was not tried,
 * the same is done with UTF-8.
 *
 * @param redirectURL the redirect URL as received
 * @param nonAsciiCharset name of the charset to try first for a non-ASCII URL; may be blank
 * @return the original URL when it is pure ASCII, otherwise the re-encoded URL
 */
private String resolveRedirectURL(final String redirectURL, final String nonAsciiCharset) {

    String url = redirectURL;

    // Is string containing only ASCII as it should?
    boolean isAscii = true;
    final int length = url.length();
    for (int offset = 0; offset < length;) {
        final int codepoint = url.codePointAt(offset);
        if (codepoint > ASCII_MAX_CODEPOINT) {
            isAscii = false;
            break;
        }
        offset += Character.charCount(codepoint);
    }
    if (isAscii) {
        return url;
    }
    // This point is only reached for a URL holding at least one non-ASCII code point, so the
    // warning must say the URI is NOT 7-bit clean.
    LOG.warn("Redirect URI not made of 7-bit clean ASCII. " + "It probably is not encoded properly. "
            + "Will try to fix. Redirect URL: " + redirectURL);

    // try to fix if non ascii charset is non UTF8.
    if (StringUtils.isNotBlank(nonAsciiCharset)) {
        String charset = CharsetUtils.clean(nonAsciiCharset);
        if (!CharEncoding.UTF_8.equals(charset)) {
            try {
                // NOTE(review): new String(byte[]) decodes with the platform default charset,
                // so the result depends on the JVM's configuration - confirm this is intended.
                url = new String(url.getBytes(charset));
                return url;
            } catch (UnsupportedEncodingException e) {
                LOG.warn("Could not fix badly encoded URL with charset \"" + charset + "\". Redirect URL: "
                        + redirectURL, e);
            }
        }
    }

    // If all fails, fall back to UTF8
    try {
        // NOTE(review): decoded with the platform default charset, see above.
        url = new String(url.getBytes(CharEncoding.UTF_8));
        return url;
    } catch (UnsupportedEncodingException e) {
        LOG.warn("Could not fix badly encoded URL with charset " + "\"UTF-8\". Redirect URL: " + redirectURL,
                e);
    }
    return url;
}

From source file:org.omegat.tokenizer.BaseTokenizer.java

/**
 * Splits the given string into one string per Unicode code point.
 *
 * @param strOrig the text to split
 * @return an array with one element per code point, in order of appearance; a supplementary
 *         code point yields a two-{@code char} string
 */
protected String[] tokenizeByCodePointToStrings(String strOrig) {
    // See http://www.ibm.com/developerworks/library/j-unicode/#1-5
    // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?)
    final int charLength = strOrig.length();
    final String[] pieces = new String[strOrig.codePointCount(0, charLength)];
    int slot = 0;
    int offset = 0;
    while (offset < charLength) {
        final int codePoint = strOrig.codePointAt(offset);
        pieces[slot] = new String(Character.toChars(codePoint));
        slot++;
        // Step by the code point's width in chars (1 or 2).
        offset += Character.charCount(codePoint);
    }
    return pieces;
}

From source file:org.kitodo.dataaccess.storage.memory.GraphPath.java

/**
 * Creates a node representing the graph path string.
 *
 * @param string
 *            string to parse
 * @param prefixes
 *            a mapping of prefixes to namespaces which was used to shorten
 *            the string
 */
public GraphPath(String string, Map<String, String> prefixes) {
    // Parses the graph path string from left to right, attaching each parsed element to the
    // current graph position, which starts at this node.
    super(GRAPH_PATH);
    // index is a char offset into string.
    int index = 0;
    Node graphPosition = this;
    int length = string.length();
    while (index < length) {
        // Skip characters up to and including the space character (whitespace and control
        // characters).
        while ((index < length) && (string.codePointAt(index) <= ' ')) {
            index++;
        }
        if ((index < length) && (string.codePointAt(index) == '[')) {
            // '[' starts an object description: hand the rest of the string to parseObject,
            // then advance by the offset it reports plus one more position (presumably the
            // closing ']').
            index++;
            Pair<Integer, Node> parseObjectRecursive = parseObject(string.substring(index), prefixes);
            index += parseObjectRecursive.getKey();
            index++;
            graphPosition.put(RDF.OBJECT, parseObjectRecursive.getValue());
        } else {
            // Anything else is a location step: an optional direction marker followed by a
            // run of non-whitespace characters naming the predicates.
            Node nextLocationStep = new MemoryNode(LOCATION_STEP);
            NodeReference direction = RDF.NIL;
            // -1 stands for "end of string" and falls into the default branch.
            switch (index < length ? string.codePointAt(index) : -1) {
            case '<':
                throw new IllegalArgumentException("Directive '<' not supported.");
            case '>':
                index++;
                switch (index < length ? string.codePointAt(index) : -1) {
                case '>':
                    index++;
                    if ((index < length) && (string.codePointAt(index) == '>')) {
                        // NOTE(review): three '>' in a row are reported as '>|' - possibly
                        // meant to read '>>>'; confirm against the path grammar.
                        index++;
                        throw new IllegalArgumentException("Directive '>|' not supported.");
                    } else {
                        throw new IllegalArgumentException("Directive '>>' not supported.");
                    }
                case '|':
                    throw new IllegalArgumentException("Directive '>|' not supported.");
                default:
                    // A single '>' selects the TO direction.
                    direction = TO;
                    break;
                }
                break;
            case '|':
                // NOTE(review): index is not advanced here, so the '|' is read as part of the
                // predicates text below, and direction stays RDF.NIL - confirm intended.
                if (((index + 1) < length) && (string.codePointAt(index + 1) == '<')) {
                    throw new IllegalArgumentException("Directive '|<' not supported.");
                }
                break;
            default:
                // No direction marker: the TO direction is used.
                direction = TO;
                break;
            }
            // Skip whitespace between the direction marker and the predicates.
            while ((index < length) && (string.codePointAt(index) <= ' ')) {
                index++;
            }
            graphPosition.put(direction, nextLocationStep);
            graphPosition = nextLocationStep;
            int predicatesStart = index;
            int codePoint;
            // Consume the run of non-whitespace code points, stepping by each code point's
            // width in chars.
            while ((index < length) && ((codePoint = string.codePointAt(index)) > ' ')) {
                index += Character.charCount(codePoint);
            }
            String predicates = string.substring(predicatesStart, index);
            // No predicate is recorded when the text equals ANY_PREDICATE_CHAR; otherwise
            // each '|'-separated part is recorded after applyPrefixes has been applied.
            if (!predicates.equals(ANY_PREDICATE_CHAR)) {
                for (String predicate : predicates.split("\\|")) {
                    graphPosition.put(RDF.PREDICATE, applyPrefixes(prefixes, predicate));
                }
            }
        }
    }
}

From source file:android.support.text.emoji.EmojiProcessor.java

/**
 * Checks a given CharSequence for emojis, and adds EmojiSpans if any emojis are found.
 * <p>
 * <ul>
 * <li>If no emojis are found, {@code charSequence} given as the input is returned without
 * any changes. i.e. charSequence is a String, and no emojis are found, the same String is
 * returned.</li>
 * <li>If the given input is not a Spannable (such as String), and at least one emoji is found
 * a new {@link android.text.Spannable} instance is returned. </li>
 * <li>If the given input is a Spannable, the same instance is returned. </li>
 * </ul>
 *
 * @param charSequence CharSequence to add the EmojiSpans, cannot be {@code null}
 * @param start start index in the charSequence to look for emojis, should be greater than or
 *              equal to {@code 0}, also less than {@code charSequence.length()}
 * @param end end index in the charSequence to look for emojis, should be greater than or
 *            equal to {@code start} parameter, also less than {@code charSequence.length()}
 * @param maxEmojiCount maximum number of emojis in the {@code charSequence}, should be greater
 *                      than or equal to {@code 0}
 * @param replaceAll whether to replace all emoji with {@link EmojiSpan}s
 * @return {@code charSequence} itself when it is already a Spannable or when no span had to be
 *         added; otherwise the newly created Spannable that carries the added spans
 */
CharSequence process(@NonNull final CharSequence charSequence, @IntRange(from = 0) int start,
        @IntRange(from = 0) int end, @IntRange(from = 0) int maxEmojiCount, final boolean replaceAll) {
    // A SpannableBuilder is put into batch-edit mode for the duration of the call; the
    // finally block below ends the batch on every exit path.
    final boolean isSpannableBuilder = charSequence instanceof SpannableBuilder;
    if (isSpannableBuilder) {
        ((SpannableBuilder) charSequence).beginBatchEdit();
    }

    try {
        Spannable spannable = null;
        // if it is a spannable already, use the same instance to add/remove EmojiSpans.
        // otherwise wait until the first EmojiSpan found in order to change the result
        // into a Spannable.
        if (isSpannableBuilder || charSequence instanceof Spannable) {
            spannable = (Spannable) charSequence;
        }

        if (spannable != null) {
            final EmojiSpan[] spans = spannable.getSpans(start, end, EmojiSpan.class);
            if (spans != null && spans.length > 0) {
                // remove existing spans, and realign the start, end according to spans
                // if start or end is in the middle of an emoji they should be aligned
                final int length = spans.length;
                for (int index = 0; index < length; index++) {
                    final EmojiSpan span = spans[index];
                    final int spanStart = spannable.getSpanStart(span);
                    final int spanEnd = spannable.getSpanEnd(span);
                    // Remove span only when its spanStart is NOT equal to current end.
                    // During add operation an emoji at index 0 is added with 0-1 as start and
                    // end indices. Therefore if there are emoji spans at [0-1] and [1-2]
                    // and end is 1, the span between 0-1 should be deleted, not 1-2.
                    if (spanStart != end) {
                        spannable.removeSpan(span);
                    }
                    // Widen the range so that it fully covers every span found.
                    start = Math.min(spanStart, start);
                    end = Math.max(spanEnd, end);
                }
            }
        }

        // Nothing to scan: empty range, or start beyond the text. This also guarantees that
        // the codePointAt(charSequence, currentOffset) call below reads a valid index.
        if (start == end || start >= charSequence.length()) {
            return charSequence;
        }

        // calculate max number of emojis that can be added. since getSpans call is a relatively
        // expensive operation, do it only when maxEmojiCount is not unlimited.
        if (maxEmojiCount != EmojiCompat.EMOJI_COUNT_UNLIMITED && spannable != null) {
            maxEmojiCount -= spannable.getSpans(0, spannable.length(), EmojiSpan.class).length;
        }
        // add new ones
        int addedCount = 0;
        final ProcessorSm sm = new ProcessorSm(mMetadataRepo.getRootNode());

        // start marks the beginning of the current candidate, currentOffset the position of
        // the code point that is fed to the state machine next.
        int currentOffset = start;
        int codePoint = Character.codePointAt(charSequence, currentOffset);

        while (currentOffset < end && addedCount < maxEmojiCount) {
            final int action = sm.check(codePoint);

            switch (action) {
            case ACTION_ADVANCE_BOTH:
                // Move the candidate start forward by one code point and continue from there.
                start += Character.charCount(Character.codePointAt(charSequence, start));
                currentOffset = start;
                if (currentOffset < end) {
                    codePoint = Character.codePointAt(charSequence, currentOffset);
                }
                break;
            case ACTION_ADVANCE_END:
                // Keep the candidate start and step over the code point just checked.
                currentOffset += Character.charCount(codePoint);
                if (currentOffset < end) {
                    codePoint = Character.codePointAt(charSequence, currentOffset);
                }
                break;
            case ACTION_FLUSH:
                // The range [start, currentOffset) is flushed: a span is added unless
                // replaceAll is false and hasGlyph reports true for the range. codePoint is
                // left unchanged, so the same code point is checked again next iteration.
                if (replaceAll || !hasGlyph(charSequence, start, currentOffset, sm.getFlushMetadata())) {
                    if (spannable == null) {
                        spannable = new SpannableString(charSequence);
                    }
                    addEmoji(spannable, sm.getFlushMetadata(), start, currentOffset);
                    addedCount++;
                }
                start = currentOffset;
                break;
            }
        }

        // After the last codepoint is consumed the state machine might be in a state where it
        // identified an emoji before. i.e. abc[women-emoji] when the last codepoint is consumed
        // state machine is waiting to see if there is an emoji sequence (i.e. ZWJ).
        // Need to check if it is in such a state.
        if (sm.isInFlushableState() && addedCount < maxEmojiCount) {
            if (replaceAll || !hasGlyph(charSequence, start, currentOffset, sm.getCurrentMetadata())) {
                if (spannable == null) {
                    spannable = new SpannableString(charSequence);
                }
                addEmoji(spannable, sm.getCurrentMetadata(), start, currentOffset);
                addedCount++;
            }
        }
        return spannable == null ? charSequence : spannable;
    } finally {
        if (isSpannableBuilder) {
            ((SpannableBuilder) charSequence).endBatchEdit();
        }
    }
}

From source file:org.apache.pdfbox.pdmodel.font.PDFont.java

/**
 * Encodes the given string for use in a PDF content stream.
 *
 * @param text Any Unicode text.
 * @return Array of PDF content stream bytes.
 * @throws IOException If the text could not be encoded.
 */
public final byte[] encode(String text) throws IOException {
    final ByteArrayOutputStream encoded = new ByteArrayOutputStream();
    final int charLength = text.length();
    int position = 0;
    while (position < charLength) {
        // Read a whole code point so that a surrogate pair is encoded as one character,
        // then step over all of its chars.
        final int codePoint = text.codePointAt(position);
        position += Character.charCount(codePoint);

        // multi-byte encoding with 1 to 4 bytes
        encoded.write(encode(codePoint));
    }
    return encoded.toByteArray();
}

From source file:com.keylesspalace.tusky.activity.ComposeActivity.java

/**
 * Scans forward from the position after {@code fromIndex} for the end of a hashtag.
 *
 * @param string the text being scanned
 * @param fromIndex the index just before the first character to examine
 * @return the index of the first whitespace code point found, {@code -1} if a {@code '#'} is
 *         met before any whitespace, or {@code string.length()} if neither occurs
 */
private static int findEndOfHashtag(String string, int fromIndex) {
    final int end = string.length();
    int position = fromIndex + 1;
    while (position < end) {
        final int current = string.codePointAt(position);
        if (current == '#') {
            // A second '#' inside the candidate means this is not a valid hashtag.
            return -1;
        }
        if (Character.isWhitespace(current)) {
            return position;
        }
        position += Character.charCount(current);
    }
    return end;
}

From source file:com.keylesspalace.tusky.activity.ComposeActivity.java

/**
 * Scans forward from the position after {@code fromIndex} for the end of a mention.
 *
 * @param string the text being scanned
 * @param fromIndex the index just before the first character to examine
 * @return the index of the first whitespace code point found, {@code -1} if a second
 *         {@code '@'} is met before any whitespace, or {@code string.length()} if neither
 *         occurs
 */
private static int findEndOfMention(String string, int fromIndex) {
    final int end = string.length();
    boolean atSeen = false;
    int position = fromIndex + 1;
    while (position < end) {
        final int current = string.codePointAt(position);
        if (Character.isWhitespace(current)) {
            return position;
        }
        if (current == '@') {
            // One '@' after the starting position is tolerated; a second one is not.
            if (atSeen) {
                return -1;
            }
            atSeen = true;
        }
        position += Character.charCount(current);
    }
    return end;
}

From source file:org.kitodo.dataaccess.storage.memory.GraphPath.java

/**
 * Parses an object description from the beginning of a graph path fragment.
 *
 * <p>Parsing stops at the first {@code ']'} met at this nesting level, or at the end of the
 * string.
 *
 * @param string
 *            string to parse
 * @param prefixes
 *            prefix mapping handed to {@code applyPrefixes} for every predicate and literal
 *            that is read
 * @return a pair of the {@code char} offset at which parsing stopped (a terminating
 *         {@code ']'} is not stepped over) and the object parsed
 */
private final Pair<Integer, Node> parseObject(String string, Map<String, String> prefixes) {
    final int end = string.length();
    final Node parsed = new MemoryNode();
    NodeReference activePredicate = null;
    int pos = 0;
    do {
        // Skip characters up to and including the space character.
        while (pos < end && string.codePointAt(pos) <= ' ') {
            pos++;
        }
        if (pos >= end) {
            return Pair.of(pos, parsed);
        }
        final int first = string.codePointAt(pos);
        switch (first) {
        case ']':
            // End of this object; the caller steps over the bracket.
            return Pair.of(pos, parsed);
        case ',':
            // A comma ends the current predicate; the next token names a new one.
            pos++;
            activePredicate = null;
            break;
        case '[': {
            // Nested object: parse the remainder, then step over what it consumed plus one
            // more position.
            pos++;
            final Pair<Integer, Node> nested = parseObject(string.substring(pos), prefixes);
            pos += nested.getKey();
            pos++;
            parsed.put(activePredicate == null ? ANY_PREDICATE : activePredicate, nested.getValue());
            break;
        }
        default: {
            final int tokenStart = pos;
            if (activePredicate == null) {
                // Predicate token: runs up to the next whitespace or control character.
                while (pos < end) {
                    final int cp = string.codePointAt(pos);
                    if (cp <= ' ') {
                        break;
                    }
                    pos += Character.charCount(cp);
                }
                final String name = string.substring(tokenStart, pos);
                if (name.equals(ANY_PREDICATE_CHAR)) {
                    activePredicate = ANY_PREDICATE;
                } else {
                    activePredicate = MemoryStorage.INSTANCE
                            .createNodeReference(applyPrefixes(prefixes, name));
                }
            } else {
                // Literal token: additionally ends at ',' or ']'.
                while (pos < end) {
                    final int cp = string.codePointAt(pos);
                    if (cp <= ' ' || cp == ',' || cp == ']') {
                        break;
                    }
                    pos += Character.charCount(cp);
                }
                final String value = applyPrefixes(prefixes, string.substring(tokenStart, pos));
                parsed.put(activePredicate, MemoryLiteral.createLeaf(value, null));
            }
            break;
        }
        }
    } while (pos < end);
    return Pair.of(end, parsed);
}

From source file:org.archive.modules.fetcher.FetchHTTPRequest.java

/**
 * Returns a copy of the string in which every non-ascii character is replaced by its
 * html numeric character reference in decimal (e.g. &amp;#12345;).
 *
 * <p>
 * The goal is a multipart/formdata submission that any server should be able to handle,
 * based on experiments using a modern browser (chromium 47.0.2526.106 for mac). What
 * chromium posts depends on what it considers the character encoding of the page containing
 * the form, and maybe other factors. Simulating that behavior in heritrix would be too
 * complicated.
 *
 * <p>
 * Instead this does approximately what the browser does when the form page is plain ascii:
 * it html-escapes characters outside of the latin1/cp1252 range. The browser encodes
 * characters in the U+0080-U+00FF range in latin1/cp1252; that is the one way this method
 * differs from chromium, because those characters (U+0080-U+00FF) are html-escaped here as
 * well. That way the http post is plain ascii, and should work regardless of which encoding
 * the server expects.
 *
 * <p>
 * N.b. chromium doesn't indicate the encoding of the request in any way (no charset in the
 * content-type or anything like that). Also of note is that when it considers the form page
 * to be utf-8, it submits in utf-8. That's part of the complicated behavior that is not
 * simulated here.
 *
 * @param str the text to escape
 * @return the text with every code point above U+007F written as {@code &#<decimal>;}
 */
public static String escapeForMultipart(String str) {
    final StringBuilder escaped = new StringBuilder();
    final int charLength = str.length();
    int position = 0;
    while (position < charLength) {
        final int codePoint = str.codePointAt(position);
        position += Character.charCount(codePoint);
        if (codePoint > 0x7f) {
            // Written as a decimal numeric character reference.
            escaped.append("&#").append(codePoint).append(';');
        } else {
            escaped.appendCodePoint(codePoint);
        }
    }
    return escaped.toString();
}