List of usage examples for java.lang.Character.charCount
public static int charCount(int codePoint)
From source file:org.omegat.tokenizer.BaseTokenizer.java
protected Token[] tokenizeByCodePoint(String strOrig) { // See http://www.ibm.com/developerworks/library/j-unicode/#1-5 // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?) Token[] tokens = new Token[strOrig.codePointCount(0, strOrig.length())]; for (int cp, i = 0, j = 0; i < strOrig.length(); i += Character.charCount(cp)) { cp = strOrig.codePointAt(i);/*w ww . ja v a 2 s . com*/ tokens[j++] = new Token(String.valueOf(Character.toChars(cp)), i); } return tokens; }
From source file:com.norconex.collector.http.redirect.impl.GenericRedirectURLProvider.java
private String resolveRedirectURL(final String redirectURL, final String nonAsciiCharset) { String url = redirectURL;/*ww w .j av a 2 s .co m*/ // Is string containing only ASCII as it should? boolean isAscii = true; final int length = url.length(); for (int offset = 0; offset < length;) { final int codepoint = url.codePointAt(offset); if (codepoint > ASCII_MAX_CODEPOINT) { isAscii = false; break; } offset += Character.charCount(codepoint); } if (isAscii) { return url; } else { LOG.warn("Redirect URI made of 7-bit clean ASCII. " + "It probably is not encoded properly. " + "Will try to fix. Redirect URL: " + redirectURL); } // try to fix if non ascii charset is non UTF8. if (StringUtils.isNotBlank(nonAsciiCharset)) { String charset = CharsetUtils.clean(nonAsciiCharset); if (!CharEncoding.UTF_8.equals(charset)) { try { url = new String(url.getBytes(charset)); return url; } catch (UnsupportedEncodingException e) { LOG.warn("Could not fix badly encoded URL with charset \"" + charset + "\". Redirect URL: " + redirectURL, e); } } } // If all fails, fall back to UTF8 try { url = new String(url.getBytes(CharEncoding.UTF_8)); return url; } catch (UnsupportedEncodingException e) { LOG.warn("Could not fix badly encoded URL with charset " + "\"UTF-8\". Redirect URL: " + redirectURL, e); } return url; }
From source file:org.omegat.tokenizer.BaseTokenizer.java
protected String[] tokenizeByCodePointToStrings(String strOrig) { // See http://www.ibm.com/developerworks/library/j-unicode/#1-5 // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?) String[] tokens = new String[strOrig.codePointCount(0, strOrig.length())]; for (int cp, i = 0, j = 0; i < strOrig.length(); i += Character.charCount(cp)) { cp = strOrig.codePointAt(i);/* ww w . j a v a 2 s. c o m*/ tokens[j++] = String.valueOf(Character.toChars(cp)); } return tokens; }
From source file:org.kitodo.dataaccess.storage.memory.GraphPath.java
/**
 * Creates a node representing the graph path string.
 *
 * <p>The string is scanned left to right. A {@code '['} introduces an
 * object description, which is handed to {@code parseObject} and stored
 * under {@code RDF.OBJECT} on the current position. Anything else starts
 * a new location step: an optional direction directive followed by a
 * run of non-blank characters holding one or more predicates separated
 * by {@code '|'}.
 *
 * @param string
 *            string to parse
 * @param prefixes
 *            a mapping of prefixes to namespaces which was used to shorten
 *            the string
 * @throws IllegalArgumentException
 *             if the string contains one of the unsupported directives
 *             rejected below
 */
public GraphPath(String string, Map<String, String> prefixes) {
    super(GRAPH_PATH);
    int index = 0;
    // The node that the next object or location step is attached to.
    Node graphPosition = this;
    int length = string.length();
    while (index < length) {
        // Skip everything at or below the space character.
        while ((index < length) && (string.codePointAt(index) <= ' ')) {
            index++;
        }
        if ((index < length) && (string.codePointAt(index) == '[')) {
            // Step over '[' and parse the remainder as an object.
            index++;
            Pair<Integer, Node> parseObjectRecursive = parseObject(string.substring(index), prefixes);
            // Advance by the amount parseObject reports as consumed, plus
            // one more char (presumably the closing ']' — confirm).
            index += parseObjectRecursive.getKey();
            index++;
            graphPosition.put(RDF.OBJECT, parseObjectRecursive.getValue());
        } else {
            // NOTE(review): this branch also runs when only blanks were
            // left (index == length after skipping), which appends a
            // location step built from an empty predicate string —
            // confirm that is intended.
            Node nextLocationStep = new MemoryNode(LOCATION_STEP);
            NodeReference direction = RDF.NIL;
            // -1 stands for "end of input" and falls into the default case.
            switch (index < length ? string.codePointAt(index) : -1) {
                case '<':
                    throw new IllegalArgumentException("Directive '<' not supported.");
                case '>':
                    index++;
                    switch (index < length ? string.codePointAt(index) : -1) {
                        case '>':
                            index++;
                            // NOTE(review): this condition matches a third
                            // '>' but the message names '>|' — either the
                            // test or the message looks off; confirm.
                            if ((index < length) && (string.codePointAt(index) == '>')) {
                                index++;
                                throw new IllegalArgumentException("Directive '>|' not supported.");
                            } else {
                                throw new IllegalArgumentException("Directive '>>' not supported.");
                            }
                        case '|':
                            throw new IllegalArgumentException("Directive '>|' not supported.");
                        default:
                            // A single '>' selects the TO direction.
                            direction = TO;
                            break;
                    }
                    break;
                case '|':
                    if (((index + 1) < length) && (string.codePointAt(index + 1) == '<')) {
                        throw new IllegalArgumentException("Directive '|<' not supported.");
                    }
                    // The '|' is not consumed here, so it stays part of the
                    // predicates substring read below; direction remains
                    // RDF.NIL.
                    break;
                default:
                    // No directive: TO is used as the direction.
                    direction = TO;
                    break;
            }
            // Skip blanks between the directive and the predicates.
            while ((index < length) && (string.codePointAt(index) <= ' ')) {
                index++;
            }
            // Link the new step in and make it the current position.
            graphPosition.put(direction, nextLocationStep);
            graphPosition = nextLocationStep;
            int predicatesStart = index;
            int codePoint;
            // Consume up to the next blank, stepping by whole code points.
            while ((index < length) && ((codePoint = string.codePointAt(index)) > ' ')) {
                index += Character.charCount(codePoint);
            }
            String predicates = string.substring(predicatesStart, index);
            // ANY_PREDICATE_CHAR adds no predicate constraint to the step.
            if (!predicates.equals(ANY_PREDICATE_CHAR)) {
                for (String predicate : predicates.split("\\|")) {
                    graphPosition.put(RDF.PREDICATE, applyPrefixes(prefixes, predicate));
                }
            }
        }
    }
}
From source file:android.support.text.emoji.EmojiProcessor.java
/**
 * Checks a given CharSequence for emojis, and adds EmojiSpans if any emojis are found.
 * <p>
 * <ul>
 * <li>If no emojis are found, {@code charSequence} given as the input is returned without
 * any changes. For example, if charSequence is a String and no emojis are found, the same
 * String is returned.</li>
 * <li>If the given input is not a Spannable (such as String), and at least one emoji is found
 * a new {@link android.text.Spannable} instance is returned. </li>
 * <li>If the given input is a Spannable, the same instance is returned. </li>
 * </ul>
 *
 * @param charSequence CharSequence to add the EmojiSpans, cannot be {@code null}
 * @param start start index in the charSequence to look for emojis, should be greater than or
 *              equal to {@code 0}, also less than {@code charSequence.length()}
 * @param end end index in the charSequence to look for emojis, should be greater than or
 *            equal to {@code start} parameter, also less than {@code charSequence.length()}
 * @param maxEmojiCount maximum number of emojis in the {@code charSequence}, should be greater
 *                      than or equal to {@code 0}
 * @param replaceAll whether to replace all emoji with {@link EmojiSpan}s
 * @return {@code charSequence} itself when no span had to be added to a non-Spannable input,
 *         otherwise the Spannable carrying the EmojiSpans
 */
CharSequence process(@NonNull final CharSequence charSequence, @IntRange(from = 0) int start,
        @IntRange(from = 0) int end, @IntRange(from = 0) int maxEmojiCount,
        final boolean replaceAll) {
    final boolean isSpannableBuilder = charSequence instanceof SpannableBuilder;
    if (isSpannableBuilder) {
        // Paired with endBatchEdit() in the finally block below.
        ((SpannableBuilder) charSequence).beginBatchEdit();
    }

    try {
        Spannable spannable = null;
        // if it is a spannable already, use the same instance to add/remove EmojiSpans.
        // otherwise wait until the first EmojiSpan found in order to change the result
        // into a Spannable.
        if (isSpannableBuilder || charSequence instanceof Spannable) {
            spannable = (Spannable) charSequence;
        }

        if (spannable != null) {
            final EmojiSpan[] spans = spannable.getSpans(start, end, EmojiSpan.class);
            if (spans != null && spans.length > 0) {
                // remove existing spans, and realign the start, end according to spans
                // if start or end is in the middle of an emoji they should be aligned
                final int length = spans.length;
                for (int index = 0; index < length; index++) {
                    final EmojiSpan span = spans[index];
                    final int spanStart = spannable.getSpanStart(span);
                    final int spanEnd = spannable.getSpanEnd(span);
                    // Remove span only when its spanStart is NOT equal to current end.
                    // During add operation an emoji at index 0 is added with 0-1 as start and
                    // end indices. Therefore if there are emoji spans at [0-1] and [1-2]
                    // and end is 1, the span between 0-1 should be deleted, not 1-2.
                    if (spanStart != end) {
                        spannable.removeSpan(span);
                    }
                    // Widen the range so it covers every span that was found.
                    start = Math.min(spanStart, start);
                    end = Math.max(spanEnd, end);
                }
            }
        }

        // Nothing to scan: empty range, or start beyond the text.
        if (start == end || start >= charSequence.length()) {
            return charSequence;
        }

        // calculate max number of emojis that can be added. since getSpans call is a relatively
        // expensive operation, do it only when maxEmojiCount is not unlimited.
        if (maxEmojiCount != EmojiCompat.EMOJI_COUNT_UNLIMITED && spannable != null) {
            maxEmojiCount -= spannable.getSpans(0, spannable.length(), EmojiSpan.class).length;
        }
        // add new ones
        int addedCount = 0;
        final ProcessorSm sm = new ProcessorSm(mMetadataRepo.getRootNode());

        // [start, currentOffset) is the candidate range examined so far;
        // codePoint is the code point at currentOffset.
        int currentOffset = start;
        int codePoint = Character.codePointAt(charSequence, currentOffset);

        while (currentOffset < end && addedCount < maxEmojiCount) {
            final int action = sm.check(codePoint);

            switch (action) {
                case ACTION_ADVANCE_BOTH:
                    // Move start past the code point at start and restart
                    // the candidate range from there.
                    start += Character.charCount(Character.codePointAt(charSequence, start));
                    currentOffset = start;
                    if (currentOffset < end) {
                        codePoint = Character.codePointAt(charSequence, currentOffset);
                    }
                    break;
                case ACTION_ADVANCE_END:
                    // Extend the candidate range by the current code point.
                    currentOffset += Character.charCount(codePoint);
                    if (currentOffset < end) {
                        codePoint = Character.codePointAt(charSequence, currentOffset);
                    }
                    break;
                case ACTION_FLUSH:
                    // Add a span over [start, currentOffset) unless hasGlyph
                    // reports true for that range (and replaceAll is off).
                    if (replaceAll || !hasGlyph(charSequence, start, currentOffset,
                            sm.getFlushMetadata())) {
                        if (spannable == null) {
                            spannable = new SpannableString(charSequence);
                        }
                        addEmoji(spannable, sm.getFlushMetadata(), start, currentOffset);
                        addedCount++;
                    }
                    // codePoint is left untouched; the same code point is
                    // checked again on the next iteration.
                    start = currentOffset;
                    break;
            }
        }

        // After the last codepoint is consumed the state machine might be in a state where it
        // identified an emoji before. i.e. abc[women-emoji] when the last codepoint is consumed
        // state machine is waiting to see if there is an emoji sequence (i.e. ZWJ).
        // Need to check if it is in such a state.
        if (sm.isInFlushableState() && addedCount < maxEmojiCount) {
            if (replaceAll || !hasGlyph(charSequence, start, currentOffset,
                    sm.getCurrentMetadata())) {
                if (spannable == null) {
                    spannable = new SpannableString(charSequence);
                }
                addEmoji(spannable, sm.getCurrentMetadata(), start, currentOffset);
                addedCount++;
            }
        }
        return spannable == null ? charSequence : spannable;
    } finally {
        if (isSpannableBuilder) {
            ((SpannableBuilder) charSequence).endBatchEdit();
        }
    }
}
From source file:org.apache.pdfbox.pdmodel.font.PDFont.java
/** * Encodes the given string for use in a PDF content stream. * * @param text Any Unicode text./*from ww w . j av a 2s . co m*/ * @return Array of PDF content stream bytes. * @throws IOException If the text could not be encoded. */ public final byte[] encode(String text) throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); for (int offset = 0; offset < text.length();) { int codePoint = text.codePointAt(offset); // multi-byte encoding with 1 to 4 bytes byte[] bytes = encode(codePoint); out.write(bytes); offset += Character.charCount(codePoint); } return out.toByteArray(); }
From source file:com.keylesspalace.tusky.activity.ComposeActivity.java
/**
 * Finds where the hashtag starting at {@code fromIndex} ends.
 *
 * <p>Scanning starts at the char after {@code fromIndex} and proceeds by
 * whole code points.
 *
 * @param string the text containing the hashtag
 * @param fromIndex index of the hashtag's first char; the scan begins
 *        right after it
 * @return the index of the first whitespace code point found, {@code -1}
 *         if a {@code '#'} is met first, or the string length if neither
 *         occurs
 */
private static int findEndOfHashtag(String string, int fromIndex) {
    final int end = string.length();
    int position = fromIndex + 1;
    while (position < end) {
        final int current = string.codePointAt(position);
        if (current == '#') {
            // A second '#' before any whitespace invalidates the hashtag.
            return -1;
        }
        if (Character.isWhitespace(current)) {
            return position;
        }
        position += Character.charCount(current);
    }
    return end;
}
From source file:com.keylesspalace.tusky.activity.ComposeActivity.java
/**
 * Finds where the mention starting at {@code fromIndex} ends.
 *
 * <p>Scanning starts at the char after {@code fromIndex} and proceeds by
 * whole code points. One {@code '@'} is tolerated inside the mention; a
 * second one makes it invalid.
 *
 * @param string the text containing the mention
 * @param fromIndex index of the mention's first char; the scan begins
 *        right after it
 * @return the index of the first whitespace code point found, {@code -1}
 *         if two {@code '@'} are met before any whitespace, or the string
 *         length if neither occurs
 */
private static int findEndOfMention(String string, int fromIndex) {
    final int end = string.length();
    int seenAtSigns = 0;
    int position = fromIndex + 1;
    while (position < end) {
        final int current = string.codePointAt(position);
        if (Character.isWhitespace(current)) {
            return position;
        }
        if (current == '@') {
            seenAtSigns++;
            if (seenAtSigns >= 2) {
                return -1;
            }
        }
        position += Character.charCount(current);
    }
    return end;
}
From source file:org.kitodo.dataaccess.storage.memory.GraphPath.java
/**
 * Parses an object from a graph path string.
 *
 * <p>Parsing stops at the first {@code ']'} met at this nesting level or
 * at the end of the input. A {@code ','} clears the current predicate, a
 * {@code '['} starts a nested object, and any other run of non-blank
 * characters is read as a predicate (when none is current) or as a
 * literal value for the current predicate.
 *
 * @param string
 *            string to parse
 * @param prefixes
 *            mapping passed on to {@code applyPrefixes} for every
 *            predicate and literal that is read
 * @return the number of chars consumed (the position of the terminating
 *         {@code ']'}, or the string length if there is none) and the
 *         object parsed
 */
private final Pair<Integer, Node> parseObject(String string, Map<String, String> prefixes) {
    final int end = string.length();
    final Node parsed = new MemoryNode();
    NodeReference activePredicate = null;
    int position = 0;
    do {
        // Skip everything at or below the space character.
        while ((position < end) && (string.codePointAt(position) <= ' ')) {
            position++;
        }
        if (position >= end) {
            return Pair.of(position, parsed);
        }
        switch (string.codePointAt(position)) {
            case ']':
                // End of this object; the ']' itself is left to the caller.
                return Pair.of(position, parsed);
            case ',':
                position++;
                activePredicate = null;
                break;
            case '[': {
                position++;
                Pair<Integer, Node> nested = parseObject(string.substring(position), prefixes);
                // Skip what the nested call consumed plus one more char.
                position += nested.getKey() + 1;
                parsed.put(activePredicate != null ? activePredicate : ANY_PREDICATE, nested.getValue());
                break;
            }
            default: {
                final int tokenStart = position;
                if (activePredicate == null) {
                    // A predicate runs up to the next blank.
                    while (position < end) {
                        final int current = string.codePointAt(position);
                        if (current <= ' ') {
                            break;
                        }
                        position += Character.charCount(current);
                    }
                    String predicate = string.substring(tokenStart, position);
                    activePredicate = predicate.equals(ANY_PREDICATE_CHAR) ? ANY_PREDICATE
                            : MemoryStorage.INSTANCE.createNodeReference(applyPrefixes(prefixes, predicate));
                } else {
                    // A literal runs up to the next blank, ',' or ']'.
                    while (position < end) {
                        final int current = string.codePointAt(position);
                        if ((current <= ' ') || (current == ',') || (current == ']')) {
                            break;
                        }
                        position += Character.charCount(current);
                    }
                    String value = applyPrefixes(prefixes, string.substring(tokenStart, position));
                    parsed.put(activePredicate, MemoryLiteral.createLeaf(value, null));
                }
                break;
            }
        }
    } while (position < end);
    return Pair.of(end, parsed);
}
From source file:org.archive.modules.fetcher.FetchHTTPRequest.java
/**
 * Returns a copy of the string with non-ascii characters replaced by their
 * html numeric character reference in decimal (e.g. &#12345;).
 *
 * <p>
 * The goal is a multipart/formdata submission that any server should be
 * able to handle, based on experiments using a modern browser (chromium
 * 47.0.2526.106 for mac). What chromium posts depends on what it considers
 * the character encoding of the page containing the form, and maybe other
 * factors; simulating that in heritrix would be too complicated.
 *
 * <p>
 * So this does approximately what the browser does when the form page is
 * plain ascii, which is to html-escape characters outside of the
 * latin1/cp1252 range. The one difference from chromium is that characters
 * in the U+0080-U+00FF range, which chromium encodes in latin1/cp1252, are
 * html-escaped here as well. That way the http post is plain ascii, and
 * should work regardless of which encoding the server expects.
 *
 * <p>
 * N.b. chromium doesn't indicate the encoding of the request in any way (no
 * charset in the content-type or anything like that). Also of note is that
 * when it considers the form page to be utf-8, it submits in utf-8. That's
 * part of the complicated behavior that is not simulated here.
 *
 * @param str the text to escape
 * @return {@code str} with every code point above U+007F replaced by
 *         {@code &#<decimal>;}
 */
public static String escapeForMultipart(String str) {
    final int charLength = str.length();
    StringBuilder escaped = new StringBuilder();
    int position = 0;
    while (position < charLength) {
        final int current = str.codePointAt(position);
        if (current > 0x7f) {
            escaped.append("&#" + current + ";");
        } else {
            escaped.appendCodePoint(current);
        }
        position += Character.charCount(current);
    }
    return escaped.toString();
}