Example usage for org.apache.commons.lang3 StringUtils isAllUpperCase

Introduction

In this page you can find the example usage for org.apache.commons.lang3 StringUtils isAllUpperCase.

Prototype

public static boolean isAllUpperCase(final CharSequence cs)

Source Link

Document

Checks if the CharSequence contains only uppercase characters.

null will return false. An empty CharSequence (length() == 0) will also return false.

Usage

From source file:org.languagetool.tagging.disambiguation.uk.UkrainianHybridDisambiguator.java

/**
 * Removes non-abbreviation readings from tokens that are written entirely in
 * upper case and carry at least one reading tagged {@code :abbr}.
 *
 * <p>A reading is kept when its POS tag contains {@code :abbr} or when its POS
 * tag is the sentence-end marker ({@link JLanguageTool#SENTENCE_END_TAGNAME}).
 * The original code compared the marker string with the {@code AnalyzedToken}
 * object itself, which can never be equal, so sentence-end readings were
 * removed as well; the comparison is now made against the reading's POS tag.
 *
 * @param input the sentence whose non-whitespace tokens are adjusted in place
 */
private void removeLowerCaseHomonymsForAbbreviations(AnalyzedSentence input) {
    AnalyzedTokenReadings[] tokens = input.getTokensWithoutWhitespace();
    // Index 0 is skipped, as in the original (presumably the sentence-start token).
    for (int i = 1; i < tokens.length; i++) {
        if (StringUtils.isAllUpperCase(tokens[i].getToken())
                && PosTagHelper.hasPosTagPart(tokens[i], ":abbr")) {

            List<AnalyzedToken> analyzedTokens = tokens[i].getReadings();
            // Walk from the end, presumably so that removeReading() cannot shift
            // entries that have not been visited yet.
            for (int j = analyzedTokens.size() - 1; j >= 0; j--) {
                AnalyzedToken analyzedToken = analyzedTokens.get(j);

                if (!PosTagHelper.hasPosTagPart(analyzedToken, ":abbr")
                        && !JLanguageTool.SENTENCE_END_TAGNAME.equals(analyzedToken.getPOSTag())) {
                    tokens[i].removeReading(analyzedToken);
                }
            }
        }
    }
}

From source file:org.languagetool.tagging.uk.UkrainianTagger.java

/**
 * Tags a word, falling back to several alternative analyses when the
 * superclass tagger leaves the word untagged.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>Ask the superclass for readings.</li>
 * <li>If the first reading has no tag, try the fallbacks inside the first
 *     {@code if} block (suffix stripping, en-dash normalisation, prefix
 *     handling, alternative spellings).</li>
 * <li>For all-uppercase words longer than two characters, try the capitalized
 *     form as a proper noun.</li>
 * <li>For hyphenated words whose parts are all capitalized, try the lower-cased
 *     form and keep its adjective readings.</li>
 * </ol>
 *
 * <p>NOTE(review): several string literals and comments in this copy appear to
 * have lost their non-ASCII characters (they are empty or contain {@code ?}).
 * Verify against the upstream source before relying on the literal values.
 *
 * @param word the surface form to tag
 * @return the readings for {@code word}
 */
@Override
protected List<AnalyzedToken> getAnalyzedTokens(String word) {
    List<AnalyzedToken> tokens = super.getAnalyzedTokens(word);

    if (tokens.get(0).hasNoTag()) {
        // Keep the untouched surface form; the new readings are created for it.
        String origWord = word;

        // NOTE(review): both suffix literals are empty in this copy, so as written
        // this condition is always true and the method returns from this branch.
        if (word.endsWith("") || word.endsWith("")) {
            // Drop the last character and retry, accepting only inanimate-noun readings.
            word = origWord.substring(0, word.length() - 1);
            List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(origWord, word,
                    Pattern.compile("noun:inanim.*"), null, null);
            return newTokens.size() > 0 ? newTokens : tokens;
        }

        // Word contains an en dash (U+2013) past the first character: retry with a plain hyphen.
        if (word.indexOf('\u2013') > 0 && ALT_DASHES_IN_WORD.matcher(word).find()) {

            word = origWord.replace('\u2013', '-');

            List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(origWord, word, null, null, null);

            if (newTokens.size() > 0) {
                tokens = newTokens;
            }
        }

        // NOTE(review): the prefix literal is empty in this copy; the constant names and
        // charAt(5) below suggest a five-letter prefix was intended - confirm upstream.
        if (word.length() > 7 && word.startsWith("")) {
            String addPosTag = "";

            // Group 1 is an optional apostrophe or hyphen, group 2 the rest of the word.
            // The result of matches() is not checked; group() below relies on it succeeding.
            Matcher matcher = Pattern.compile("(['-]?)(.*)").matcher(word);
            matcher.matches();

            String prefix = matcher.group(1);
            String adjustedWord = matcher.group(2);

            List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(origWord, adjustedWord,
                    NAPIV_ALLOWED_TAGS_REGEX, null, null);

            if (newTokens.size() > 0) {
                // addPosTag is still "" here, so this guard is always true as written.
                if (!addPosTag.contains(":bad:")) {
                    // NOTE(review): the character classes in both regexes are damaged in this copy.
                    if (word.charAt(5) == '-' && !adjustedWord.matches("[?-?].*")) {
                        addPosTag += ":bad";
                    } else if (word.charAt(5) != '\'' && adjustedWord.matches("[?].*")) {
                        addPosTag += ":bad";
                    }
                }

                // Rebuild every reading for the original surface form: strip the tags matched by
                // NAPIV_REMOVE_TAGS_REGEX, add the extra tag if any, and prepend the captured
                // prefix to the lemma.
                for (int i = 0; i < newTokens.size(); i++) {
                    AnalyzedToken analyzedToken = newTokens.get(i);

                    String lemma = analyzedToken.getLemma();
                    String posTag = analyzedToken.getPOSTag();

                    posTag = NAPIV_REMOVE_TAGS_REGEX.matcher(posTag).replaceAll("");

                    posTag = PosTagHelper.addIfNotContains(posTag, addPosTag);

                    AnalyzedToken newToken = new AnalyzedToken(origWord, posTag, prefix + lemma);
                    newTokens.set(i, newToken);
                }
                tokens = newTokens;
            }
        }
        // Try alternative spellings, tagging the result with ":alt".
        // NOTE(review): the original comment and the literals below lost their characters in this copy.
        else if (word.contains("")) {
            tokens = convertTokens(tokens, word, "", "", ":alt");
        } else if (word.contains("?")) {
            tokens = convertTokens(tokens, word, "?", "", ":alt");
        } else if (word.endsWith("")) {
            tokens = convertTokens(tokens, word, "", "", ":alt");
        }
    }

    // Try an ALL-UPPERCASE word as its capitalized form, accepting only proper-noun readings.
    if (word.length() > 2 && StringUtils.isAllUpperCase(word)) {

        String newWord = StringUtils.capitalize(StringUtils.lowerCase(word));

        List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, newWord,
                Pattern.compile("noun.*?:prop.*"), null, null);
        if (newTokens.size() > 0) {
            if (tokens.get(0).hasNoTag()) {
                //TODO: add special tags if necessary
                tokens = newTokens;
            } else {
                tokens.addAll(newTokens);
            }
        }
    }

    // Hyphenated word whose parts are all capitalized: tag the lower-cased form and keep
    // its adjective readings.
    if (word.indexOf('-') > 1 && !word.endsWith("-")) {
        String[] parts = word.split("-");
        if (isAllCapitalized(parts)) {
            String lowerCasedWord = Stream.of(parts).map(String::toLowerCase).collect(Collectors.joining("-"));
            List<TaggedWord> wdList = wordTagger.tag(lowerCasedWord);
            if (PosTagHelper.hasPosTagPart2(wdList, "adj")) {
                List<AnalyzedToken> analyzedTokens = asAnalyzedTokenListForTaggedWordsInternal(word, wdList);
                analyzedTokens = PosTagHelper.filter(analyzedTokens, Pattern.compile("adj.*"));
                if (tokens.get(0).hasNoTag()) {
                    tokens = analyzedTokens;
                } else {
                    // compound tagging has already been performed and may have added tokens
                    for (AnalyzedToken token : analyzedTokens) {
                        if (!tokens.contains(token)) {
                            tokens.add(token);
                        }
                    }
                }
            }
        }
    }

    return tokens;
}

From source file:org.opensextant.extractors.geo.GazetteerMatcher.java

/**
 * Geotag a document, returning PlaceCandidates for the mentions in the
 * document. Optionally just return the PlaceCandidates with name only and
 * no Place objects attached. Names of continents are passed back as matches,
 * but are marked as continents and flagged as filtered out.
 *
 * <p>Side effects: resets {@code defaultFilterCount} and {@code userFilterCount},
 * sets the {@code field} request parameter, records timings in
 * {@code tagNamesTime}, {@code getNamesTime} and {@code totalTime}, and adds to
 * the running totals {@code filteredTotal} and {@code matchedTotal}.
 *
 * @param buffer
 *            text
 * @param docid
 *            identity of the text
 * @param tagOnly
 *            True if you wish to get the matched phrases only. False if you
 *            want the full list of Place Candidates.
 * @param fld
 *            gazetteer field to use for tagging
 * @return place_candidates List of place candidates, ordered by start offset;
 *         at most one candidate is kept per start offset
 * @throws ExtractionException
 *             on err
 */
public List<PlaceCandidate> tagText(String buffer, String docid, boolean tagOnly, String fld)
        throws ExtractionException {
    // Example of the tagger response this method consumes:
    // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40,
    // "startOffset":38},
    // { "ids":[750308, 2769912, 2770041, 10413973, 10417546],
    // "endOffset":49,
    // "startOffset":41},
    // ...
    // "matchingDocs":{"numFound":75, "start":0, "docs":[ {
    // "place_id":"USGS1992921", "name":"Monterrey", "cc":"PR"}, {
    // "place_id":"USGS1991763", "name":"Monterrey", "cc":"PR"}, ]

    // Reset counts.
    this.defaultFilterCount = 0;
    this.userFilterCount = 0;
    // During post-processing of tags we may have to distinguish between tagging/tokenizing
    // general vs. cjk vs. ar. But not yet though.
    // boolean useGeneralMode = DEFAULT_TAG_FIELD.equals(fld);

    long t0 = System.currentTimeMillis();
    log.debug("TEXT SIZE = {}", buffer.length());
    int[] textMetrics = TextUtils.measureCase(buffer);
    boolean isUpperCase = TextUtils.isUpperCaseDocument(textMetrics);

    params.set("field", fld);
    // beanMap is handed to the tagger call and later used to look up a ScoredPlace by record ID.
    Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
    QueryResponse response = tagTextCallSolrTagger(buffer, docid, beanMap);

    @SuppressWarnings("unchecked")
    List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");

    // t1 is an estimate: start time plus the query time reported by the response.
    this.tagNamesTime = response.getQTime();
    long t1 = t0 + tagNamesTime;
    long t2 = System.currentTimeMillis();
    boolean geocode = !tagOnly;

    /*
     * Retrieve all offsets into a long list. These offsets will report a
     * text span and all the gazetteer record IDs that are associated to
     * that span. The text could either be a name, a code or some other
     * abbreviation.
     *
     * For practical reasons the default behavior is to filter trivial spans
     * given the gazetteer data that is returned for them.
     *
     * WARNING: lots of optimizations occur here due to the potentially
     * large volume of tags and gazetteer data that is involved. And this is
     * relatively early in the pipeline.
     */
    log.debug("DOC={} TAGS SIZE={}", docid, tags.size());

    // Keyed by start offset; a later candidate with the same start replaces an earlier one.
    TreeMap<Integer, PlaceCandidate> candidates = new TreeMap<Integer, PlaceCandidate>();

    // names matched is used only for debugging, currently.
    Set<String> namesMatched = new HashSet<>();

    tagLoop: for (NamedList<?> tag : tags) {

        int x1 = (Integer) tag.get("startOffset");
        int x2 = (Integer) tag.get("endOffset");
        int len = x2 - x1;
        if (len == 1) {
            // Ignoring place names whose length is less than 2 chars
            ++this.defaultFilterCount;
            continue;
        }
        // NOTE(review): the comment below predates the code that follows it - the match text
        // IS now read from the tagger's "matchText" entry rather than cut from buffer.
        // +1 char after last matched
        // Could have enabled the "matchText" option from the tagger to get
        // this, but since we already have the content as a String then
        // we might as well not make the tagger do any more work.

        String matchText = (String) tag.get("matchText");
        // Get char immediately following match, for light NLP rules.
        char postChar = 0;
        if (x2 < buffer.length()) {
            postChar = buffer.charAt(x2);
        }

        // Then filter out trivial matches. E.g., Us is filtered out. vs. US would
        // be allowed. If lowercase abbreviations are allowed, then all matches are passed.
        if (len < 3) {
            if (TextUtils.isASCII(matchText) && !StringUtils.isAllUpperCase(matchText)
                    && !allowLowercaseAbbrev) {
                ++this.defaultFilterCount;
                continue;
            }
        }

        if (TextUtils.countFormattingSpace(matchText) > 1) {
            // Phrases with words broken across more than one line are not
            // valid matches.
            // Phrase with a single TAB is okay
            ++this.defaultFilterCount;
            continue;
        }
        // Eliminate any newlines and extra whitespace in match
        matchText = TextUtils.squeeze_whitespace(matchText);

        /**
         * Filter out trivial tags. Due to normalization, we tend to get
         * lots of false positives that can be eliminated early.
         */
        if (filter.filterOut(matchText)) {
            ++this.defaultFilterCount;
            continue;
        }

        PlaceCandidate pc = new PlaceCandidate();
        pc.start = x1;
        pc.end = x2;
        pc.setText(matchText);

        /*
         * Filter out tags that user determined ahead of time as not-places
         * for their context.
         *
         */
        if (userfilter != null) {
            if (userfilter.filterOut(pc.getTextnorm())) {
                log.debug("User Filter:{}", matchText);
                ++this.userFilterCount;
                continue;
            }
        }

        /*
         * Continent filter is needed, as many mentions of continents confuse
         * real geotagging/geocoding. The candidate is still recorded, but it is
         * marked as a continent and flagged as filtered out.
         */
        if (continents.filterOut(pc.getTextnorm())) {
            pc.isContinent = true;
            pc.setFilteredOut(true);
            candidates.put(pc.start, pc);
            continue;
        }
        /*
         * Found UPPER CASE text in a mixed-cased document.
         * Conservatively, this is likely an acronym or some heading.
         * But possibly still a valid place name.
         * HEURISTIC: acronyms are relatively short.
         * HEURISTIC: region codes can be acronyms and are valid places
         *
         * Using such place candidates you may score short acronym matches lower than
         * fully named ones when inferring boundaries (states, provinces, etc).
         */
        if (!isUpperCase && pc.isUpper() && len < 5) {
            pc.isAcronym = true;
        }

        /*
         * Everything Else.
         */
        pc.setSurroundingTokens(buffer);

        @SuppressWarnings("unchecked")
        List<Integer> placeRecordIds = (List<Integer>) tag.get("ids");

        /*
         * This assertion is helpful in debugging: assert
         * placeRecordIds.size() == new
         * HashSet<Integer>(placeRecordIds).size() : "ids should be unique";
         */
        // assert!placeRecordIds.isEmpty();
        namesMatched.clear();

        //double maxNameBias = 0.0;
        for (Integer solrId : placeRecordIds) {
            // Yes, we must cast here.
            // As long as createTag generates the correct type stored in
            // beanMap we are fine.
            // NOTE(review): pGeo is dereferenced below without a null check; this relies on
            // every tagged ID being present in beanMap.
            ScoredPlace pGeo = (ScoredPlace) beanMap.get(solrId);
            // assert pGeo != null;

            // Optimization: abbreviation filter.
            //
            // Do not add PlaceCandidates for lower case tokens that are
            // marked as Abbreviations, unless flagged to do so.
            //
            // DEFAULT behavior is to avoid lower case text that is tagged
            // as an abbreviation in gazetteer,
            //
            // Common terms: in, or, oh, me, us, we, etc. Are all not
            // typically place names or valid abbreviations in text.
            //
            if (!allowLowercaseAbbrev && pGeo.isAbbreviation() && pc.isLower()) {
                log.debug("Ignore lower case term={}", pc.getText());
                // DWS: TODO what if there is another pGeo for this pc that
                // isn't an abbrev? Therefore shouldn't we continue this
                // loop and not tagLoop?
                // Abandons the whole candidate; this path does not increment a filter counter.
                continue tagLoop;
            }

            /*
             * If text match contains "." and it matches any abbreviation,
             * mark the candidate as an abbrev. TODO: Possibly best confirm
             * this by sentence detection, as well. However, this pertains
             * to text spans that contain "." within the bounds, and not
             * likely an ending. E.g., "U.S." or "U.S" are trivial examples;
             * "US" is more ambiguous, as we need to know if document is
             * upperCase.
             *
             * Any place abbreviation will trigger isAbbreviation = true
             *
             * "IF YOU FIND US HERE"  the term 'US' is ambiguous here, so
             * it is not classified as an abbreviation. Otherwise if you have
             * "My organization YAK happens to coincide with a place named Yak."
             * But we first must determine if 'YAK' is a valid abbreviation for an actual place.
             * HEURISTIC: place abbreviations are relatively short, e.g. one word(len=7 or less)
             */
            if (len < 8 && !pc.isAbbreviation) {
                assessAbbreviation(pc, pGeo, postChar, isUpperCase);
            }

            if (log.isDebugEnabled()) {
                namesMatched.add(pGeo.getName());
            }

            /**
             * Country names are the only names you can reasonably set ahead
             * of time. All other names need to be assessed in context.
             * Negate country names, e.g., "Georgia", by exception.
             */
            if (pGeo.isCountry()) {
                pc.isCountry = true;
            }

            if (geocode) {
                pGeo.defaultHierarchicalPath();
                // Default score for geo will be calculated in PlaceCandidate
                pc.addPlace(pGeo);
            }
        }

        // If geocoding, skip this PlaceCandidate if has no places (e.g. due
        // to filtering)
        if (geocode && !pc.hasPlaces()) {
            log.debug("Place has no places={}", pc.getText());
            continue;
        } else {
            if (log.isDebugEnabled()) {
                log.debug("Text {} matched {}", pc.getText(), namesMatched);
            }
        }

        candidates.put(pc.start, pc);
    } // for tag
    long t3 = System.currentTimeMillis();

    // this.tagNamesTime = (int)(t1 - t0);
    this.getNamesTime = (int) (t2 - t1);
    this.totalTime = (int) (t3 - t0);

    if (log.isDebugEnabled()) {
        summarizeExtraction(candidates.values(), docid);
    }

    // Running totals accumulate across calls; the per-call counters were reset at the top.
    this.filteredTotal += this.defaultFilterCount + this.userFilterCount;
    this.matchedTotal += candidates.size();

    return new ArrayList<PlaceCandidate>(candidates.values());
}

From source file:org.opensingular.form.SFormUtil.java

/**
 * Turns a camel-case / underscore / hyphen separated identifier into a
 * space-separated, human readable label.
 *
 * <p>Word boundaries are inserted between a lower-case letter and a following
 * upper-case letter, and between a run of upper-case letters and a following
 * capitalized word. The text is then split on runs of {@code -} and {@code _}.
 * Each piece is uncapitalized unless it is entirely upper case, the special
 * words {@code id} and {@code url} are capitalized again, the pieces are joined
 * with single spaces, and the first character of the result is capitalized.
 *
 * @param simpleName the identifier to convert
 * @return the user friendly label
 */
public static String generateUserFriendlyName(String simpleName) {
    final Pattern lowerThenUpper = Pattern.compile("(.*?[a-z])([A-Z].*?)");
    final Pattern acronymPrefix = Pattern.compile("([A-Z]+)([A-Z][a-z])");
    final ImmutableSet<String> capitalizedExceptions = ImmutableSet.of("id", "url");

    // Mark the word boundaries with '-' so that a single split handles every separator.
    String marked = lowerThenUpper.matcher(simpleName).replaceAll("$1-$2");
    marked = acronymPrefix.matcher(marked).replaceAll("$1-$2");

    final StringBuilder label = new StringBuilder();
    boolean firstPiece = true;
    for (String piece : marked.split("[-_]+")) {
        String word = StringUtils.isAllUpperCase(piece) ? piece : StringUtils.uncapitalize(piece);
        if (capitalizedExceptions.contains(word)) {
            word = StringUtils.capitalize(word);
        }
        if (!firstPiece) {
            label.append(" ");
        }
        label.append(word);
        firstPiece = false;
    }

    return StringUtils.capitalize(label.toString());
}

From source file:org.structr.console.tabcompletion.AbstractTabCompletionProvider.java

/**
 * Collects completions for every word that starts with the lower-cased token.
 *
 * <p>When the token is written entirely in upper case, each matching word is
 * upper-cased before it is turned into a completion; otherwise the word is
 * used as is. The words themselves are compared without case conversion.
 *
 * @param words  the candidate words
 * @param token  the text typed so far
 * @param suffix not used by this method
 * @return the completions, in the iteration order of {@code words}
 */
protected List<TabCompletionResult> getCaseInsensitiveResultsForCollection(final Collection<String> words,
        final String token, final String suffix) {

    final List<TabCompletionResult> completions = new LinkedList<>();
    final String prefix = token.toLowerCase();
    final boolean tokenIsUpperCase = StringUtils.isAllUpperCase(token);

    for (final String candidate : words) {

        if (!candidate.startsWith(prefix)) {
            continue;
        }

        final String proposal = tokenIsUpperCase ? candidate.toUpperCase() : candidate;
        completions.add(getCompletion(proposal, token));
    }

    return completions;
}

From source file:org.trnltk.morphology.contextless.rootfinder.ProperNounFromApostropheRootFinder.java

/**
 * Builds a single noun root from the partial input with its last character
 * removed.
 *
 * <p>If the remaining text is entirely upper case, the root is an abbreviation;
 * otherwise it is a proper noun. For an abbreviation that does not end in a
 * vowel (such as PTT, THY) the phonetic attributes are calculated on the text
 * with an upper-case 'E' appended.
 *
 * @param partialInput the part of the input examined so far
 * @param input        the complete input; not used by this method
 * @return a one-element list holding the root
 */
@Override
public List<? extends Root> findRootsForPartialInput(TurkishSequence partialInput, TurkishSequence input) {
    final TurkishSequence candidate = partialInput.subsequence(0, partialInput.length() - 1);
    final String candidateText = candidate.getUnderlyingString();

    final boolean abbreviation = StringUtils.isAllUpperCase(candidateText);
    final Lexeme lexeme = new ImmutableLexeme(candidateText, candidateText, PrimaryPos.Noun,
            abbreviation ? SecondaryPos.Abbreviation : SecondaryPos.ProperNoun, null);

    final ImmutableSet<PhoneticAttribute> phoneticAttributes;
    if (abbreviation && !candidate.getLastChar().getLetter().isVowel()) {
        // if last letter is not vowel (such as PTT, THY), then add char 'E' to the end and then calculate the phonetics
        phoneticAttributes = Sets.immutableEnumSet(phoneticsAnalyzer
                .calculatePhoneticAttributes(candidate.append(TURKISH_CHAR_E_UPPERCASE), null));
    } else {
        phoneticAttributes = Sets
                .immutableEnumSet(phoneticsAnalyzer.calculatePhoneticAttributes(candidate, null));
    }

    return Arrays.asList(new ImmutableRoot(candidate, lexeme, phoneticAttributes, null));
}

From source file:org.trnltk.morphology.contextless.rootfinder.ProperNounWithoutApostropheRootFinder.java

/**
 * Builds a single noun root from the partial input.
 *
 * <p>The root is an abbreviation only when the partial input equals the whole
 * input and its text is entirely upper case; in every other case it is a
 * proper noun. For an abbreviation that does not end in a vowel (such as PTT,
 * THY) the phonetic attributes are calculated on the text with an upper-case
 * 'E' appended.
 *
 * @param partialInput the part of the input examined so far
 * @param input        the complete input
 * @return a one-element list holding the root
 */
@Override
public List<? extends Root> findRootsForPartialInput(TurkishSequence partialInput, TurkishSequence input) {
    final String surface = partialInput.getUnderlyingString();

    final boolean abbreviation = partialInput.equals(input) && StringUtils.isAllUpperCase(surface);

    if (!abbreviation) {
        ///XXX : REALLY SMALL SUPPORT!

        // XXX: might be a known proper noun like "Turkce" or "Istanbul". no support for them yet

        // XXX: might be a known proper noun with implicit P3sg. like : Eminonu, Kusadasi.
        // it is important since :
        // 1. Ankara'_y_a but Eminonu'_n_e    : Since this case has apostrophe, it is handled in ProperNounFromApostropheRootFinder
        // 2: P3sg doesn't apply to these words: onun Kusadasi, onun Eminonu
        // 3. Possessions are applied to 'root' : benim Kusadam etc. SKIP this case!

        final Lexeme properNounLexeme = new ImmutableLexeme(surface, surface, PrimaryPos.Noun,
                SecondaryPos.ProperNoun, null);
        final ImmutableSet<PhoneticAttribute> properNounPhonetics = Sets
                .immutableEnumSet(phoneticsAnalyzer.calculatePhoneticAttributes(partialInput, null));
        final ImmutableRoot properNounRoot = new ImmutableRoot(partialInput, properNounLexeme,
                properNounPhonetics, null);
        return Arrays.asList(properNounRoot);
    }

    final Lexeme abbreviationLexeme = new ImmutableLexeme(surface, surface, PrimaryPos.Noun,
            SecondaryPos.Abbreviation, null);

    final ImmutableSet<PhoneticAttribute> abbreviationPhonetics;
    if (partialInput.getLastChar().getLetter().isVowel()) {
        abbreviationPhonetics = Sets
                .immutableEnumSet(phoneticsAnalyzer.calculatePhoneticAttributes(partialInput, null));
    } else {
        // if last letter is not vowel (such as PTT, THY), then add char 'E' to the end and then calculate the phonetics
        abbreviationPhonetics = Sets.immutableEnumSet(phoneticsAnalyzer
                .calculatePhoneticAttributes(partialInput.append(TURKISH_CHAR_E_UPPERCASE), null));
    }
    return Arrays.asList(new ImmutableRoot(partialInput, abbreviationLexeme, abbreviationPhonetics, null));
}

From source file:org.xlrnet.tibaija.processor.ExecutionEnvironment.java

/**
 * Registers a command as a function command in this execution environment. Once registered, the function is
 * available to all programs and other commands. A command instance may belong to at most one execution
 * environment. A command registered through this method cannot be used as an expression and does not return a
 * value, but will probably modify the system state (e.g. display output).
 *
 * @param commandName
 *         Name of the new command under which it can be accessed. Must not be blank and must begin with an
 *         uppercase letter.
 * @param command
 *         An instance of the concrete command.
 * @throws DuplicateCommandException
 *         If a function command with this name is already registered, or if the command instance already
 *         belongs to an environment.
 */
public void registerCommandFunction(@NotNull String commandName, @NotNull Command command)
        throws TIRuntimeException {
    checkNotNull(commandName);
    checkNotNull(command);

    checkArgument(StringUtils.isNotBlank(commandName), "Function command name may not be blank");
    // A non-blank name has at least one character, so charAt(0) is safe here.
    final boolean startsWithUppercase = Character.isUpperCase(commandName.charAt(0));
    checkArgument(startsWithUppercase, "Function command name must begin with a uppercase letter");

    final boolean nameTaken = commandFunctionMap.get(commandName) != null;
    if (nameTaken) {
        throw new DuplicateCommandException("Command function is already registered: " + commandName);
    }

    final boolean alreadyBound = command.getEnvironment() != null;
    if (alreadyBound) {
        throw new DuplicateCommandException(
                "New command instance is already registered in another environment");
    }

    command.setEnvironment(this);
    commandFunctionMap.put(commandName, command);

    LOGGER.debug("Registered new command function '{}'", commandName);
}

From source file:org.xlrnet.tibaija.processor.ExecutionEnvironment.java

/**
 * Register a command as a statement in the execution environment. All programs and other commands can use the new
 * statement once it has been registered. Every command may only be associated with at most one execution
 * environment. If a command is registered as a command statement (i.e. through this function) it will not be
 * available in expressions and should not be called with parentheses. E.g. "DISP 123". Command statements should be
 * used when the system state must be manipulated (e.g. display output).
 *
 * @param commandName/*w  ww.jav  a  2 s  .c  o m*/
 *         Name of the new command under which it can be accessed. Must begin with an uppercase letter.
 * @param command
 *         An instance of the concrete command.
 */
public void registerCommandStatement(@NotNull String commandName, @NotNull Command command)
        throws TIRuntimeException {
    checkNotNull(commandName);
    checkNotNull(command);

    checkArgument(StringUtils.isNotBlank(commandName), "Command statement name may not be blank");
    checkArgument(StringUtils.isAllUpperCase(commandName.substring(0, 1)),
            "Command statement name must begin with a uppercase letter");

    if (commandStatementMap.get(commandName) != null)
        throw new DuplicateCommandException("Command statement is already registered: " + commandName);

    if (command.getEnvironment() != null)
        throw new DuplicateCommandException(
                "New command instance is already registered in another environment");

    command.setEnvironment(this);
    commandStatementMap.put(commandName, command);

    LOGGER.debug("Registered new command statement '{}'", commandName);
}