List of usage examples for java.lang.String#codePointCount
public int codePointCount(int beginIndex, int endIndex)
From source file:org.apache.hadoop.hive.common.type.HiveBaseChar.java
public static String enforceMaxLength(String val, int maxLength) { if (val == null) { return null; }//from ww w . jav a 2s .c om String value = val; if (maxLength > 0) { int valLength = val.codePointCount(0, val.length()); if (valLength > maxLength) { // Truncate the excess chars to fit the character length. // Also make sure we take supplementary chars into account. value = val.substring(0, val.offsetByCodePoints(0, maxLength)); } } return value; }
From source file:org.apache.hadoop.hive.common.type.HiveBaseChar.java
public static String getPaddedValue(String val, int maxLength) { if (val == null) { return null; }//from w w w. j av a 2 s . c o m if (maxLength < 0) { return val; } int valLength = val.codePointCount(0, val.length()); if (valLength > maxLength) { return enforceMaxLength(val, maxLength); } if (maxLength > valLength) { // Make sure we pad the right amount of spaces; valLength is in terms of code points, // while StringUtils.rpad() is based on the number of java chars. int padLength = val.length() + (maxLength - valLength); val = StringUtils.rightPad(val, padLength); } return val; }
From source file:org.apache.hadoop.hive.common.type.HiveChar.java
public int getCharacterLength() { String strippedValue = getStrippedValue(); return strippedValue.codePointCount(0, strippedValue.length()); }
From source file:org.apache.orc.impl.mask.RedactMaskFactory.java
/** * Mask the given stringified numeric value excluding the unmask range. * Non-digit characters are passed through on the assumption they are * markers (eg. one of ",.ef").// w ww . j a va 2s . co m * @param value the original value. */ String maskNumericString(final String value) { StringBuilder result = new StringBuilder(); final int length = value.codePointCount(0, value.length()); for (int c = 0; c < length; ++c) { int cp = value.codePointAt(c); if (isIndexInUnmaskRange(c, length) || Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) { result.appendCodePoint(cp); } else { result.appendCodePoint(DIGIT_CP_REPLACEMENT); } } return result.toString(); }
From source file:org.apache.tika.eval.tokens.LuceneTokenCounter.java
void count(String field) throws IOException { long tokenCount = leafReader.getSumTotalTermFreq(field); if (tokenCount > Integer.MAX_VALUE) { throw new IllegalArgumentException("can't handle longs"); }// w ww. jav a 2 s .c o m int tokenCountInt = (int) tokenCount; int uniqueTokenCount = 0; SummaryStatistics summStats = new SummaryStatistics(); double ent = 0.0d; double p = 0.0d; double base = 2.0; Terms terms = leafReader.terms(field); if (terms == null) { //if there were no terms fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, summStats)); return; } TermsEnum termsEnum = terms.iterator(); BytesRef bytesRef = termsEnum.next(); TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN); while (bytesRef != null) { long termFreq = termsEnum.totalTermFreq(); if (termFreq > Integer.MAX_VALUE) { throw new IllegalArgumentException("Sorry can't handle longs yet"); } int tf = (int) termFreq; //TODO: figure out how to avoid Stringifying this //to get codepoint count String t = bytesRef.utf8ToString(); int len = t.codePointCount(0, t.length()); for (int i = 0; i < tf; i++) { summStats.addValue(len); } p = (double) tf / (double) tokenCount; ent += p * FastMath.log(base, p); if (queue.top() == null || queue.size() < topN || tf >= queue.top().getValue()) { queue.insertWithOverflow(new TokenIntPair(t, tf)); } uniqueTokenCount++; bytesRef = termsEnum.next(); } if (tokenCountInt > 0) { ent = (-1.0d / (double) tokenCountInt) * ent; } fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, summStats)); }
From source file:org.apache.tika.eval.tokens.TokenCounter.java
private void _add(String field, Analyzer analyzer, String content) throws IOException { int totalTokens = 0; TokenStream ts = analyzer.tokenStream(field, content); CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); ts.reset();// ww w . j a va2 s . c o m Map<String, MutableInt> tokenMap = map.get(field); if (tokenMap == null) { tokenMap = new HashMap<>(); map.put(field, tokenMap); } while (ts.incrementToken()) { String token = termAtt.toString(); MutableInt cnt = tokenMap.get(token); if (cnt == null) { cnt = new MutableInt(1); tokenMap.put(token, cnt); } else { cnt.increment(); } totalTokens++; } ts.close(); ts.end(); int totalUniqueTokens = tokenMap.size(); double ent = 0.0d; double p = 0.0d; double base = 2.0; TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN); SummaryStatistics summaryStatistics = new SummaryStatistics(); for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) { String token = e.getKey(); int termFreq = e.getValue().intValue(); p = (double) termFreq / (double) totalTokens; ent += p * FastMath.log(base, p); int len = token.codePointCount(0, token.length()); for (int i = 0; i < e.getValue().intValue(); i++) { summaryStatistics.addValue(len); } if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) { queue.insertWithOverflow(new TokenIntPair(token, termFreq)); } } if (totalTokens > 0) { ent = (-1.0d / (double) totalTokens) * ent; } /* Collections.sort(allTokens); List<TokenIntPair> topNList = new ArrayList<>(topN); for (int i = 0; i < topN && i < allTokens.size(); i++) { topNList.add(allTokens.get(i)); }*/ tokenStatistics.put(field, new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics)); }
From source file:org.ballerinalang.model.values.BXMLItem.java
/** * {@inheritDoc}/* ww w . j a va2 s. c o m*/ */ public long size() { if (getNodeType() == XMLNodeType.TEXT) { String textContent = ((OMText) this.omNode).getText(); return textContent.codePointCount(0, textContent.length()); } return this.omNode == null ? 0 : 1; }
From source file:org.eclipse.rdf4j.rio.turtle.TurtleParser.java
/** * Pushes back the supplied string by copying it to the front of the buffer. * After this method returns, successive calls to {@link #readCodePoint()} * will return the code points in the supplied string again, starting at the * first in the String..// w ww.j a va 2s . c o m * * @param string * the string to un-read. * @throws IOException */ protected void unread(String string) throws IOException { for (int i = string.codePointCount(0, string.length()); i >= 1; i--) { final int codePoint = string.codePointBefore(i); if (Character.isSupplementaryCodePoint(codePoint)) { final char[] surrogatePair = Character.toChars(codePoint); reader.unread(surrogatePair); } else { reader.unread(codePoint); } } }
From source file:org.languagetool.rules.spelling.hunspell.HunspellRule.java
protected String getSentenceTextWithoutUrlsAndImmunizedTokens(AnalyzedSentence sentence) { StringBuilder sb = new StringBuilder(); AnalyzedTokenReadings[] sentenceTokens = getSentenceWithImmunization(sentence).getTokens(); for (int i = 1; i < sentenceTokens.length; i++) { String token = sentenceTokens[i].getToken(); if (sentenceTokens[i].isImmunized() || sentenceTokens[i].isIgnoredBySpeller() || isUrl(token) || isEMail(token) || isQuotedCompound(sentence, i, token)) { if (isQuotedCompound(sentence, i, token)) { sb.append(" ").append(token.substring(1)); }//from w w w .j ava 2 s. c o m // replace URLs and immunized tokens with whitespace to ignore them for spell checking: else if (token.length() < 20) { sb.append(WHITESPACE_ARRAY[token.length()]); } else { for (int j = 0; j < token.length(); j++) { sb.append(' '); } } } else if (token.length() > 1 && token.codePointCount(0, token.length()) != token.length()) { // some symbols such as emojis () have a string length that equals 2 for (int charIndex = 0; charIndex < token.length();) { int unicodeCodePoint = token.codePointAt(charIndex); int increment = Character.charCount(unicodeCodePoint); if (increment == 1) { sb.append(token.charAt(charIndex)); } else { sb.append(" "); } charIndex += increment; } } else { sb.append(token); } } return sb.toString(); }
From source file:org.omegat.tokenizer.BaseTokenizer.java
protected Token[] tokenizeByCodePoint(String strOrig) { // See http://www.ibm.com/developerworks/library/j-unicode/#1-5 // Example 1-5 appears to be faster than 1-6 for us (because our strings are short?) Token[] tokens = new Token[strOrig.codePointCount(0, strOrig.length())]; for (int cp, i = 0, j = 0; i < strOrig.length(); i += Character.charCount(cp)) { cp = strOrig.codePointAt(i);/*w w w . j a v a2 s . c o m*/ tokens[j++] = new Token(String.valueOf(Character.toChars(cp)), i); } return tokens; }