Example usage for java.lang String codePointCount

List of usage examples for java.lang String codePointCount

Introduction

On this page you can find example usage for java.lang String codePointCount.

Prototype

public int codePointCount(int beginIndex, int endIndex) 

Source Link

Document

Returns the number of Unicode code points in the specified text range of this String.

Usage

From source file:org.apache.hadoop.hive.common.type.HiveBaseChar.java

/**
 * Truncates a string so it contains at most {@code maxLength} Unicode code
 * points. Supplementary characters (surrogate pairs) are never split.
 *
 * @param val       the input string; may be null
 * @param maxLength maximum number of code points; a non-positive value means
 *                  no limit is enforced
 * @return {@code val} unchanged, a truncated copy, or null if {@code val} is null
 */
public static String enforceMaxLength(String val, int maxLength) {
    if (val == null) {
        return null;
    }
    if (maxLength <= 0) {
        return val;
    }
    int codePoints = val.codePointCount(0, val.length());
    if (codePoints <= maxLength) {
        return val;
    }
    // Convert the code-point limit to a char offset so a surrogate pair at
    // the boundary is kept whole rather than cut in half.
    return val.substring(0, val.offsetByCodePoints(0, maxLength));
}

From source file:org.apache.hadoop.hive.common.type.HiveBaseChar.java

/**
 * Pads or truncates {@code val} so its length, measured in Unicode code
 * points, is exactly {@code maxLength}.
 *
 * @param val       the input string; may be null
 * @param maxLength target length in code points; negative means no change
 * @return the padded/truncated string, or null if {@code val} is null
 */
public static String getPaddedValue(String val, int maxLength) {
    if (val == null) {
        return null;
    }
    if (maxLength < 0) {
        return val;
    }

    final int codePoints = val.codePointCount(0, val.length());
    if (codePoints > maxLength) {
        return enforceMaxLength(val, maxLength);
    }
    if (codePoints == maxLength) {
        return val;
    }
    // StringUtils.rightPad() counts java chars, not code points, so pad by
    // the code-point deficit on top of the current char length.
    final int targetCharLength = val.length() + (maxLength - codePoints);
    return StringUtils.rightPad(val, targetCharLength);
}

From source file:org.apache.hadoop.hive.common.type.HiveChar.java

/**
 * Returns the length of the stripped value counted in Unicode code points
 * rather than java chars, so each supplementary character counts as one.
 */
public int getCharacterLength() {
    final String stripped = getStrippedValue();
    return stripped.codePointCount(0, stripped.length());
}

From source file:org.apache.orc.impl.mask.RedactMaskFactory.java

/**
 * Mask the given stringified numeric value excluding the unmask range.
 * Non-digit characters are passed through on the assumption they are
 * markers (eg. one of ",.ef").
 * @param value the original value.
 * @return the masked string
 */
String maskNumericString(final String value) {
    StringBuilder result = new StringBuilder();
    final int length = value.codePointCount(0, value.length());
    // Track the char index separately from the code-point index: the
    // original passed the code-point counter to codePointAt(), which takes
    // a *char* index, so any supplementary character shifted every
    // subsequent read and silently dropped the tail of the string.
    int charIndex = 0;
    for (int c = 0; c < length; ++c) {
        int cp = value.codePointAt(charIndex);
        charIndex += Character.charCount(cp);
        if (isIndexInUnmaskRange(c, length) || Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
            result.appendCodePoint(cp);
        } else {
            result.appendCodePoint(DIGIT_CP_REPLACEMENT);
        }
    }
    return result.toString();
}

From source file:org.apache.tika.eval.tokens.LuceneTokenCounter.java

/**
 * Gathers per-field token statistics from the Lucene index reader and stores
 * a {@code TokenStatistics} entry in {@code fieldStats}: unique-token count,
 * total-token count, the top-N most frequent terms, an entropy value, and
 * summary statistics over token lengths (in code points).
 *
 * @param field the indexed field to count
 * @throws IOException if the underlying index read fails
 * @throws IllegalArgumentException if any frequency exceeds Integer.MAX_VALUE
 */
void count(String field) throws IOException {
    long tokenCount = leafReader.getSumTotalTermFreq(field);
    if (tokenCount > Integer.MAX_VALUE) {
        throw new IllegalArgumentException("can't handle longs");
    }
    int tokenCountInt = (int) tokenCount;
    int uniqueTokenCount = 0;
    SummaryStatistics summStats = new SummaryStatistics();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    Terms terms = leafReader.terms(field);
    if (terms == null) {
        //if there were no terms
        fieldStats.put(field,
                new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, summStats));
        return;

    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);

    while (bytesRef != null) {

        long termFreq = termsEnum.totalTermFreq();
        if (termFreq > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("Sorry can't handle longs yet");
        }
        int tf = (int) termFreq;
        //TODO: figure out how to avoid Stringifying this
        //to get codepoint count
        String t = bytesRef.utf8ToString();
        // token length in Unicode code points, not java chars
        int len = t.codePointCount(0, t.length());
        // the length is weighted by term frequency: added once per occurrence
        for (int i = 0; i < tf; i++) {
            summStats.addValue(len);
        }
        p = (double) tf / (double) tokenCount;
        ent += p * FastMath.log(base, p);

        // keep only the topN most frequent terms
        if (queue.top() == null || queue.size() < topN || tf >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(t, tf));
        }

        uniqueTokenCount++;
        bytesRef = termsEnum.next();
    }
    // NOTE(review): plain Shannon entropy is -(sum p*log p); the extra
    // division by tokenCountInt here looks like a deliberate normalization
    // (it matches TokenCounter._add in this collection) — confirm intent.
    if (tokenCountInt > 0) {
        ent = (-1.0d / (double) tokenCountInt) * ent;
    }

    fieldStats.put(field,
            new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, summStats));
}

From source file:org.apache.tika.eval.tokens.TokenCounter.java

/**
 * Tokenizes {@code content} with the given analyzer, updates the per-field
 * token-frequency map, and stores a {@code TokenStatistics} entry (unique
 * and total token counts, top-N terms, entropy, length summary statistics)
 * in {@code tokenStatistics}.
 *
 * @param field    field name used for analysis and as the map key
 * @param analyzer analyzer that produces the token stream
 * @param content  text to tokenize
 * @throws IOException if the token stream fails
 */
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;

    TokenStream ts = analyzer.tokenStream(field, content);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Map<String, MutableInt> tokenMap = map.get(field);
    if (tokenMap == null) {
        tokenMap = new HashMap<>();
        map.put(field, tokenMap);
    }
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            MutableInt cnt = tokenMap.get(token);
            if (cnt == null) {
                tokenMap.put(token, new MutableInt(1));
            } else {
                cnt.increment();
            }
            totalTokens++;
        }
        // Lucene's TokenStream contract is reset -> incrementToken* -> end
        // -> close; the original called close() before end(), which is the
        // wrong order. close() is now guaranteed even if tokenizing throws.
        ts.end();
    } finally {
        ts.close();
    }

    int totalUniqueTokens = tokenMap.size();

    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;

    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);

    SummaryStatistics summaryStatistics = new SummaryStatistics();
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();

        p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        // token length in code points, weighted by term frequency
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < termFreq; i++) {
            summaryStatistics.addValue(len);
        }
        // keep only the topN most frequent terms
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }

    }
    // NOTE(review): the extra division by totalTokens deviates from plain
    // Shannon entropy (-sum p*log p) but matches LuceneTokenCounter.count.
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }

    tokenStatistics.put(field,
            new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));

}

From source file:org.ballerinalang.model.values.BXMLItem.java

/**
 * {@inheritDoc}/* ww  w . j a va2 s. c  o m*/
 */
public long size() {
    if (getNodeType() == XMLNodeType.TEXT) {
        String textContent = ((OMText) this.omNode).getText();
        return textContent.codePointCount(0, textContent.length());
    }
    return this.omNode == null ? 0 : 1;
}

From source file:org.eclipse.rdf4j.rio.turtle.TurtleParser.java

/**
 * Pushes back the supplied string by copying it to the front of the buffer.
 * After this method returns, successive calls to {@link #readCodePoint()}
 * will return the code points in the supplied string again, starting at the
 * first in the String.
 *
 * @param string
 *            the string to un-read.
 * @throws IOException
 */
protected void unread(String string) throws IOException {
    // Walk backwards by *char* index: codePointBefore() takes a char index,
    // so the original loop (which started at the code-point count and
    // decremented by 1) skipped the tail of the string and re-read surrogate
    // halves whenever it contained supplementary characters.
    for (int i = string.length(); i > 0; ) {
        final int codePoint = string.codePointBefore(i);
        if (Character.isSupplementaryCodePoint(codePoint)) {
            final char[] surrogatePair = Character.toChars(codePoint);
            reader.unread(surrogatePair);
            i -= 2;
        } else {
            reader.unread(codePoint);
            i -= 1;
        }
    }
}

From source file:org.languagetool.rules.spelling.hunspell.HunspellRule.java

/**
 * Rebuilds the sentence text with tokens that must not be spell-checked
 * (immunized tokens, URLs, e-mail addresses, quoted compounds) replaced by
 * whitespace of the same char length, and multi-char code points (e.g.
 * emojis) blanked out so char offsets stay aligned with the original text.
 *
 * @param sentence the analyzed sentence
 * @return the sanitized sentence text, same char length as the original
 */
protected String getSentenceTextWithoutUrlsAndImmunizedTokens(AnalyzedSentence sentence) {
    StringBuilder sb = new StringBuilder();
    AnalyzedTokenReadings[] sentenceTokens = getSentenceWithImmunization(sentence).getTokens();
    for (int i = 1; i < sentenceTokens.length; i++) {
        String token = sentenceTokens[i].getToken();
        // Evaluate once; the original called isQuotedCompound() a second
        // time inside the branch (assumed side-effect free).
        boolean quotedCompound = isQuotedCompound(sentence, i, token);
        if (sentenceTokens[i].isImmunized() || sentenceTokens[i].isIgnoredBySpeller() || isUrl(token)
                || isEMail(token) || quotedCompound) {
            if (quotedCompound) {
                sb.append(" ").append(token.substring(1));
            }
            // replace URLs and immunized tokens with whitespace to ignore them for spell checking:
            else if (token.length() < 20) {
                sb.append(WHITESPACE_ARRAY[token.length()]);
            } else {
                for (int j = 0; j < token.length(); j++) {
                    sb.append(' ');
                }
            }
        } else if (token.length() > 1 && token.codePointCount(0, token.length()) != token.length()) {
            // some symbols such as emojis have a string length of 2 (a surrogate pair)
            for (int charIndex = 0; charIndex < token.length();) {
                int unicodeCodePoint = token.codePointAt(charIndex);
                int increment = Character.charCount(unicodeCodePoint);
                if (increment == 1) {
                    sb.append(token.charAt(charIndex));
                } else {
                    // blank out the pair with two spaces to preserve offsets
                    sb.append("  ");
                }
                charIndex += increment;
            }
        } else {
            sb.append(token);
        }
    }
    return sb.toString();
}

From source file:org.omegat.tokenizer.BaseTokenizer.java

/**
 * Splits a string into one Token per Unicode code point. Each Token carries
 * the code point's text and its starting char offset in the original string.
 * See http://www.ibm.com/developerworks/library/j-unicode/#1-5 (example 1-5
 * appears to be faster than 1-6 for short strings like ours).
 */
protected Token[] tokenizeByCodePoint(String strOrig) {
    // Size by code points; the index advances by char count so surrogate
    // pairs become a single token.
    final Token[] result = new Token[strOrig.codePointCount(0, strOrig.length())];
    int out = 0;
    int i = 0;
    while (i < strOrig.length()) {
        final int cp = strOrig.codePointAt(i);
        result[out++] = new Token(String.valueOf(Character.toChars(cp)), i);
        i += Character.charCount(cp);
    }
    return result;
}