List of usage examples for org.apache.commons.lang3 StringUtils isAllUpperCase
public static boolean isAllUpperCase(final CharSequence cs)
Checks if the CharSequence contains only uppercase characters.
null will return false.
From source file:org.languagetool.tagging.disambiguation.uk.UkrainianHybridDisambiguator.java
private void removeLowerCaseHomonymsForAbbreviations(AnalyzedSentence input) { AnalyzedTokenReadings[] tokens = input.getTokensWithoutWhitespace(); for (int i = 1; i < tokens.length; i++) { if (StringUtils.isAllUpperCase(tokens[i].getToken()) && PosTagHelper.hasPosTagPart(tokens[i], ":abbr")) { List<AnalyzedToken> analyzedTokens = tokens[i].getReadings(); for (int j = analyzedTokens.size() - 1; j >= 0; j--) { AnalyzedToken analyzedToken = analyzedTokens.get(j); if (!PosTagHelper.hasPosTagPart(analyzedToken, ":abbr") && !JLanguageTool.SENTENCE_END_TAGNAME.equals(analyzedToken)) { tokens[i].removeReading(analyzedToken); }// w w w . ja v a2s .c om } } } }
From source file:org.languagetool.tagging.uk.UkrainianTagger.java
/**
 * Tags a single word, falling back to several spelling-variant heuristics when the
 * superclass tagger yields no tag.
 *
 * <p>NOTE(review): this snippet appears to have lost its non-ASCII characters during
 * extraction -- several string and regex literals are empty ({@code ""}) or contain
 * {@code ?}. The literals below must be restored from the upstream source before this
 * code is used; the description here covers only the visible control flow.
 *
 * <p>Visible flow:
 * <ol>
 *   <li>Call {@code super.getAnalyzedTokens(word)}.</li>
 *   <li>Only if the first resulting token has no tag:
 *     <ul>
 *       <li>for words with certain endings, strip the last character and re-tag, keeping
 *           only {@code noun:inanim.*} readings; this branch returns immediately;</li>
 *       <li>if the word contains an en dash (U+2013) and matches
 *           {@code ALT_DASHES_IN_WORD}, replace the dash with {@code '-'} and re-tag;</li>
 *       <li>for words longer than 7 characters with a specific prefix, split off the
 *           prefix, tag the remainder restricted to {@code NAPIV_ALLOWED_TAGS_REGEX},
 *           strip {@code NAPIV_REMOVE_TAGS_REGEX} from each POS tag, optionally append
 *           {@code :bad}, and prepend the prefix to each lemma;</li>
 *       <li>try character substitutions via {@code convertTokens}, tagging results with
 *           {@code :alt}.</li>
 *     </ul>
 *   </li>
 *   <li>For all-uppercase words longer than 2 characters, re-tag the capitalized
 *       lower-case form restricted to {@code noun.*?:prop.*}; the results replace the
 *       token list when it is untagged, otherwise they are appended.</li>
 *   <li>For hyphenated words (hyphen after index 1, not trailing) whose parts satisfy
 *       {@code isAllCapitalized}, tag the lower-cased form and keep {@code adj.*}
 *       readings, merging them without duplicates.</li>
 * </ol>
 *
 * @param word the word form to tag
 * @return the readings found for {@code word}
 */
@Override protected List<AnalyzedToken> getAnalyzedTokens(String word) { List<AnalyzedToken> tokens = super.getAnalyzedTokens(word); if (tokens.get(0).hasNoTag()) { String origWord = word;//from www. j a v a2 s . c om if (word.endsWith("") || word.endsWith("")) { word = origWord.substring(0, word.length() - 1); List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(origWord, word, Pattern.compile("noun:inanim.*"), null, null); return newTokens.size() > 0 ? newTokens : tokens; } if (word.indexOf('\u2013') > 0 && ALT_DASHES_IN_WORD.matcher(word).find()) { word = origWord.replace('\u2013', '-'); List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(origWord, word, null, null, null); if (newTokens.size() > 0) { tokens = newTokens; } } if (word.length() > 7 && word.startsWith("")) { String addPosTag = ""; Matcher matcher = Pattern.compile("(['-]?)(.*)").matcher(word); matcher.matches(); String prefix = matcher.group(1); String adjustedWord = matcher.group(2); List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(origWord, adjustedWord, NAPIV_ALLOWED_TAGS_REGEX, null, null); if (newTokens.size() > 0) { if (!addPosTag.contains(":bad:")) { if (word.charAt(5) == '-' && !adjustedWord.matches("[?-?].*")) { addPosTag += ":bad"; } else if (word.charAt(5) != '\'' && adjustedWord.matches("[?].*")) { addPosTag += ":bad"; } } for (int i = 0; i < newTokens.size(); i++) { AnalyzedToken analyzedToken = newTokens.get(i); String lemma = analyzedToken.getLemma(); String posTag = analyzedToken.getPOSTag(); posTag = NAPIV_REMOVE_TAGS_REGEX.matcher(posTag).replaceAll(""); posTag = PosTagHelper.addIfNotContains(posTag, addPosTag); AnalyzedToken newToken = new AnalyzedToken(origWord, posTag, prefix + lemma); newTokens.set(i, newToken); } tokens = newTokens; } } // try instead of else if (word.contains("")) { tokens = convertTokens(tokens, word, "", "", ":alt"); } else if (word.contains("?")) { tokens = convertTokens(tokens, word, "?", "", ":alt"); } else if (word.endsWith("")) { 
tokens = convertTokens(tokens, word, "", "", ":alt"); } } // try ??? as and as if (word.length() > 2 && StringUtils.isAllUpperCase(word)) { String newWord = StringUtils.capitalize(StringUtils.lowerCase(word)); List<AnalyzedToken> newTokens = getAdjustedAnalyzedTokens(word, newWord, Pattern.compile("noun.*?:prop.*"), null, null); if (newTokens.size() > 0) { if (tokens.get(0).hasNoTag()) { //TODO: add special tags if necessary tokens = newTokens; } else { tokens.addAll(newTokens); } } } // -? as adj from -? if (word.indexOf('-') > 1 && !word.endsWith("-")) { String[] parts = word.split("-"); if (isAllCapitalized(parts)) { String lowerCasedWord = Stream.of(parts).map(String::toLowerCase).collect(Collectors.joining("-")); List<TaggedWord> wdList = wordTagger.tag(lowerCasedWord); if (PosTagHelper.hasPosTagPart2(wdList, "adj")) { List<AnalyzedToken> analyzedTokens = asAnalyzedTokenListForTaggedWordsInternal(word, wdList); analyzedTokens = PosTagHelper.filter(analyzedTokens, Pattern.compile("adj.*")); if (tokens.get(0).hasNoTag()) { tokens = analyzedTokens; } else { // compound tagging has already been performed and may have added tokens for (AnalyzedToken token : analyzedTokens) { if (!tokens.contains(token)) { tokens.add(token); } } } } } } return tokens; }
From source file:org.opensextant.extractors.geo.GazetteerMatcher.java
/**
 * Geotag a document, returning PlaceCandidates for the mentions in the document.
 * Optionally just return the PlaceCandidates with name only and no Place objects
 * attached. Names of continents are passed back as matches, with geo matches.
 * Continents are filtered out by default.
 *
 * <p>Processing, as visible in the body: the per-call filter counters are reset, the
 * text is sent to the tagger via {@code tagTextCallSolrTagger}, and every returned tag
 * span is then screened in this order -- one-character spans, short (fewer than 3
 * characters) ASCII spans that are not all uppercase (unless lowercase abbreviations
 * are allowed), spans with more than one formatting space, the default
 * {@code filter}, the optional {@code userfilter}, and the continent filter.
 * Continent matches are kept in the result but flagged as filtered out. Surviving
 * spans get their gazetteer records (looked up in {@code beanMap} by id) attached when
 * {@code tagOnly} is false; a span is dropped entirely as soon as one of its records is
 * an abbreviation while the span text is lower case and lowercase abbreviations are
 * not allowed.
 *
 * <p>Candidates are collected in a {@code TreeMap} keyed by start offset, so the
 * returned list is ordered by start offset and a later span with the same start
 * offset replaces an earlier one. Timing and running-total fields of this matcher are
 * updated as a side effect.
 *
 * @param buffer
 *            text
 * @param docid
 *            identity of the text
 * @param tagOnly
 *            True if you wish to get the matched phrases only. False if you
 *            want the full list of Place Candidates.
 * @param fld
 *            gazetteer field to use for tagging
 * @return place_candidates List of place candidates
 * @throws ExtractionException
 *             on err
 */
public List<PlaceCandidate> tagText(String buffer, String docid, boolean tagOnly, String fld) throws ExtractionException { // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40, // "startOffset":38}, // { "ids":[750308, 2769912, 2770041, 10413973, 10417546], // "endOffset":49, // "startOffset":41}, // ... // "matchingDocs":{"numFound":75, "start":0, "docs":[ { // "place_id":"USGS1992921", "name":"Monterrey", "cc":"PR"}, { // "place_id":"USGS1991763", "name":"Monterrey", "cc":"PR"}, ] // Reset counts. this.defaultFilterCount = 0; this.userFilterCount = 0; // during post-processing tags we may have to distinguish between tagging/tokenizing // general vs. cjk vs. ar. But not yet though. 
// boolean useGeneralMode = DEFAULT_TAG_FIELD.equals(fld); long t0 = System.currentTimeMillis(); log.debug("TEXT SIZE = {}", buffer.length()); int[] textMetrics = TextUtils.measureCase(buffer); boolean isUpperCase = TextUtils.isUpperCaseDocument(textMetrics); params.set("field", fld); Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100); QueryResponse response = tagTextCallSolrTagger(buffer, docid, beanMap); @SuppressWarnings("unchecked") List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags"); this.tagNamesTime = response.getQTime(); long t1 = t0 + tagNamesTime; long t2 = System.currentTimeMillis(); boolean geocode = !tagOnly; /* * Retrieve all offsets into a long list. These offsets will report a * text span and all the gazetteer record IDs that are associated to * that span. The text could either be a name, a code or some other * abbreviation. * * For practical reasons the default behavior is to filter trivial spans * given the gazetteer data that is returned for them. * * WARNING: lots of optimizations occur here due to the potentially * large volume of tags and gazetteer data that is involved. And this is * relatively early in the pipline. */ log.debug("DOC={} TAGS SIZE={}", docid, tags.size()); TreeMap<Integer, PlaceCandidate> candidates = new TreeMap<Integer, PlaceCandidate>(); // names matched is used only for debugging, currently. Set<String> namesMatched = new HashSet<>(); tagLoop: for (NamedList<?> tag : tags) { int x1 = (Integer) tag.get("startOffset"); int x2 = (Integer) tag.get("endOffset"); int len = x2 - x1; if (len == 1) { // Ignoring place names whose length is less than 2 chars ++this.defaultFilterCount; continue; } // +1 char after last matched // Could have enabled the "matchText" option from the tagger to get // this, but since we already have the content as a String then // we might as well not make the tagger do any more work. 
String matchText = (String) tag.get("matchText"); // Get char immediately following match, for light NLP rules. char postChar = 0; if (x2 < buffer.length()) { postChar = buffer.charAt(x2); } // Then filter out trivial matches. E.g., Us is filtered out. vs. US would // be allowed. If lowercase abbreviations are allowed, then all matches are passed. if (len < 3) { if (TextUtils.isASCII(matchText) && !StringUtils.isAllUpperCase(matchText) && !allowLowercaseAbbrev) { ++this.defaultFilterCount; continue; } } if (TextUtils.countFormattingSpace(matchText) > 1) { // Phrases with words broken across more than one line are not // valid matches. // Phrase with a single TAB is okay ++this.defaultFilterCount; continue; } // Eliminate any newlines and extra whitespace in match matchText = TextUtils.squeeze_whitespace(matchText); /** * Filter out trivial tags. Due to normalization, we tend to get * lots of false positives that can be eliminated early. */ if (filter.filterOut(matchText)) { ++this.defaultFilterCount; continue; } PlaceCandidate pc = new PlaceCandidate(); pc.start = x1; pc.end = x2; pc.setText(matchText); /* * Filter out tags that user determined ahead of time as not-places * for their context. * */ if (userfilter != null) { if (userfilter.filterOut(pc.getTextnorm())) { log.debug("User Filter:{}", matchText); ++this.userFilterCount; continue; } } /* * Continent filter is needed, as many mentions of contients confuse * real geotagging/geocoding. * */ if (continents.filterOut(pc.getTextnorm())) { pc.isContinent = true; pc.setFilteredOut(true); candidates.put(pc.start, pc); continue; } /* * Found UPPER CASE text in a mixed-cased document. * Conservatively, this is likely an acronym or some heading. * But possibly still a valid place name. * HEURISTIC: acronyms are relatively short. * HEURISTIC: region codes can be acronyms and are valid places * * using such place candidates you may score short acronym matches lower than fully named ones. 
 * when inferring boundaries (states, provinces, etc) */ if (!isUpperCase && pc.isUpper() && len < 5) { pc.isAcronym = true; } /* * Everything Else. */ pc.setSurroundingTokens(buffer); @SuppressWarnings("unchecked") List<Integer> placeRecordIds = (List<Integer>) tag.get("ids"); /* * This assertion is helpful in debugging: assert * placeRecordIds.size() == new * HashSet<Integer>(placeRecordIds).size() : "ids should be unique"; */ // assert!placeRecordIds.isEmpty(); namesMatched.clear(); //double maxNameBias = 0.0; for (Integer solrId : placeRecordIds) { // Yes, we must cast here. // As long as createTag generates the correct type stored in // beanMap we are fine. ScoredPlace pGeo = (ScoredPlace) beanMap.get(solrId); // assert pGeo != null; // Optimization: abbreviation filter. // // Do not add PlaceCandidates for lower case tokens that are // marked as Abbreviations, unless flagged to do so. // // DEFAULT behavior is to avoid lower case text that is tagged // as an abbreviation in gazetteer, // // Common terms: in, or, oh, me, us, we, etc. Are all not // typically place names or valid abbreviations in text. // if (!allowLowercaseAbbrev && pGeo.isAbbreviation() && pc.isLower()) { log.debug("Ignore lower case term={}", pc.getText()); // DWS: TODO what if there is another pGeo for this pc that // isn't an abbrev? Therefore shouldn't we continue this // loop and not tagLoop? continue tagLoop; } /* * If text match contains "." and it matches any abbreviation, * mark the candidate as an abbrev. TODO: Possibly best confirm * this by sentence detection, as well. However, this pertains * to text spans that contain "." within the bounds, and not * likely an ending. E.g., "U.S." or "U.S" are trivial examples; * "US" is more ambiguous, as we need to know if document is * upperCase. * * Any place abbreviation will trigger isAbbreviation = true * * "IF YOU FIND US HERE" the term 'US' is ambiguous here, so * it is not classified as an abbreviation. 
Otherwise if you have * "My organization YAK happens to coincide with a place named Yak. * But we first must determine if 'YAK' is a valid abbreviation for an actual place. * HEURISTIC: place abbreviations are relatively short, e.g. one word(len=7 or less) */ if (len < 8 && !pc.isAbbreviation) { assessAbbreviation(pc, pGeo, postChar, isUpperCase); } if (log.isDebugEnabled()) { namesMatched.add(pGeo.getName()); } /** * Country names are the only names you can reasonably set ahead * of time. All other names need to be assessed in context. * Negate country names, e.g., "Georgia", by exception. */ if (pGeo.isCountry()) { pc.isCountry = true; } if (geocode) { pGeo.defaultHierarchicalPath(); // Default score for geo will be calculated in PlaceCandidate pc.addPlace(pGeo); } } // If geocoding, skip this PlaceCandidate if has no places (e.g. due // to filtering) if (geocode && !pc.hasPlaces()) { log.debug("Place has no places={}", pc.getText()); continue; } else { if (log.isDebugEnabled()) { log.debug("Text {} matched {}", pc.getText(), namesMatched); } } candidates.put(pc.start, pc); } // for tag long t3 = System.currentTimeMillis(); // this.tagNamesTime = (int)(t1 - t0); this.getNamesTime = (int) (t2 - t1); this.totalTime = (int) (t3 - t0); if (log.isDebugEnabled()) { summarizeExtraction(candidates.values(), docid); } this.filteredTotal += this.defaultFilterCount + this.userFilterCount; this.matchedTotal += candidates.size(); return new ArrayList<PlaceCandidate>(candidates.values()); }
From source file:org.opensingular.form.SFormUtil.java
/**
 * Turns a camel-case or underscore-separated identifier into a space-separated,
 * human-readable label.
 *
 * <p>Word boundaries are inserted between a lower-case letter and a following upper-case
 * letter, and between an upper-case run and a following capitalized word. The text is
 * then split on runs of {@code -} and {@code _}. Words that are not entirely upper case
 * are uncapitalized; the special words {@code id} and {@code url} are capitalized again.
 * Finally the whole label is capitalized.
 *
 * @param simpleName the identifier to convert
 * @return the words of {@code simpleName} joined by single spaces, first letter capitalized
 */
public static String generateUserFriendlyName(String simpleName) {
    final Pattern lowerThenUpper = Pattern.compile("(.*?[a-z])([A-Z].*?)");
    final Pattern acronymThenWord = Pattern.compile("([A-Z]+)([A-Z][a-z])");
    final ImmutableSet<String> recapitalizedWords = ImmutableSet.of("id", "url");

    // Mark every detected word boundary with a hyphen, then split on the markers.
    String marked = lowerThenUpper.matcher(simpleName).replaceAll("$1-$2");
    marked = acronymThenWord.matcher(marked).replaceAll("$1-$2");
    final String[] words = marked.split("[-_]+");

    final StringBuilder label = new StringBuilder();
    for (int i = 0; i < words.length; i++) {
        String word = words[i];
        if (!StringUtils.isAllUpperCase(word)) {
            word = StringUtils.uncapitalize(word);
        }
        if (recapitalizedWords.contains(word)) {
            word = StringUtils.capitalize(word);
        }
        if (i > 0) {
            label.append(' ');
        }
        label.append(word);
    }
    return StringUtils.capitalize(label.toString());
}
From source file:org.structr.console.tabcompletion.AbstractTabCompletionProvider.java
/**
 * Collects completions for every candidate word that starts with the lower-cased token.
 *
 * <p>Matching is done against the lower-cased {@code token}, so only candidates that
 * themselves start with that lower-case prefix are returned. When the token was typed
 * entirely in upper case, the completion is offered in upper case as well.
 *
 * <p>Case conversion uses {@link java.util.Locale#ROOT} so that the result does not
 * depend on the JVM default locale (under a Turkish locale, for example, {@code "I"}
 * would otherwise lower-case to a dotless i and never match an ASCII keyword).
 *
 * @param words  candidate words to complete from
 * @param token  the text typed so far
 * @param suffix not used by this method
 * @return the completions for all matching words, in the iteration order of {@code words}
 */
protected List<TabCompletionResult> getCaseInsensitiveResultsForCollection(final Collection<String> words,
        final String token, final String suffix) {

    final List<TabCompletionResult> result = new LinkedList<>();
    final String lowerToken = token.toLowerCase(java.util.Locale.ROOT);
    final boolean upperCase = StringUtils.isAllUpperCase(token);

    for (final String word : words) {
        if (word.startsWith(lowerToken)) {
            final String completion = upperCase ? word.toUpperCase(java.util.Locale.ROOT) : word;
            result.add(getCompletion(completion, token));
        }
    }

    return result;
}
From source file:org.trnltk.morphology.contextless.rootfinder.ProperNounFromApostropheRootFinder.java
@Override public List<? extends Root> findRootsForPartialInput(TurkishSequence partialInput, TurkishSequence input) { final TurkishSequence properNounCandidate = partialInput.subsequence(0, partialInput.length() - 1); final String properNounCandidateUnderlyingString = properNounCandidate.getUnderlyingString(); if (StringUtils.isAllUpperCase(properNounCandidateUnderlyingString)) { final Lexeme lexeme = new ImmutableLexeme(properNounCandidateUnderlyingString, properNounCandidateUnderlyingString, PrimaryPos.Noun, SecondaryPos.Abbreviation, null); if (!properNounCandidate.getLastChar().getLetter().isVowel()) { // if last letter is not vowel (such as PTT, THY), then add char 'E' to the end and then calculate the phonetics final ImmutableSet<PhoneticAttribute> phoneticAttributes = Sets.immutableEnumSet(phoneticsAnalyzer .calculatePhoneticAttributes(properNounCandidate.append(TURKISH_CHAR_E_UPPERCASE), null)); return Arrays.asList(new ImmutableRoot(properNounCandidate, lexeme, phoneticAttributes, null)); } else {// w w w. j a va 2s. c o m final ImmutableSet<PhoneticAttribute> phoneticAttributes = Sets .immutableEnumSet(phoneticsAnalyzer.calculatePhoneticAttributes(properNounCandidate, null)); return Arrays.asList(new ImmutableRoot(properNounCandidate, lexeme, phoneticAttributes, null)); } } else { final Lexeme lexeme = new ImmutableLexeme(properNounCandidateUnderlyingString, properNounCandidateUnderlyingString, PrimaryPos.Noun, SecondaryPos.ProperNoun, null); final ImmutableSet<PhoneticAttribute> phoneticAttributes = Sets .immutableEnumSet(phoneticsAnalyzer.calculatePhoneticAttributes(properNounCandidate, null)); return Arrays.asList(new ImmutableRoot(properNounCandidate, lexeme, phoneticAttributes, null)); } }
From source file:org.trnltk.morphology.contextless.rootfinder.ProperNounWithoutApostropheRootFinder.java
@Override public List<? extends Root> findRootsForPartialInput(TurkishSequence partialInput, TurkishSequence input) { final String partialInputUnderlyingString = partialInput.getUnderlyingString(); if (partialInput.equals(input) && StringUtils.isAllUpperCase(partialInputUnderlyingString)) { final Lexeme abbreviationLexeme = new ImmutableLexeme(partialInputUnderlyingString, partialInputUnderlyingString, PrimaryPos.Noun, SecondaryPos.Abbreviation, null); if (!partialInput.getLastChar().getLetter().isVowel()) { // if last letter is not vowel (such as PTT, THY), then add char 'E' to the end and then calculate the phonetics final ImmutableSet<PhoneticAttribute> phoneticAttributes = Sets.immutableEnumSet(phoneticsAnalyzer .calculatePhoneticAttributes(partialInput.append(TURKISH_CHAR_E_UPPERCASE), null)); return Arrays.asList(new ImmutableRoot(partialInput, abbreviationLexeme, phoneticAttributes, null)); } else {/*w w w .j a v a2 s . c o m*/ final ImmutableSet<PhoneticAttribute> phoneticAttributes = Sets .immutableEnumSet(phoneticsAnalyzer.calculatePhoneticAttributes(partialInput, null)); return Arrays.asList(new ImmutableRoot(partialInput, abbreviationLexeme, phoneticAttributes, null)); } } else { ///XXX : REALLY SMALL SUPPORT! // XXX: might be a known proper noun like "Turkce" or "Istanbul". no support for them yet // XXX: might be a known proper noun with implicit P3sg. like : Eminonu, Kusadasi. // it is important since : // 1. Ankara'_y_a but Eminonu'_n_e : Since this case has apostrophe, it is handled in ProperNounFromApostropheRootFinder // 2: P3sg doesn't apply to these words: onun Kusadasi, onun Eminonu // 3. Possessions are applied to 'root' : benim Kusadam etc. SKIP this case! 
final Lexeme properNounLexeme = new ImmutableLexeme(partialInputUnderlyingString, partialInputUnderlyingString, PrimaryPos.Noun, SecondaryPos.ProperNoun, null); final ImmutableSet<PhoneticAttribute> phoneticAttributes = Sets .immutableEnumSet(phoneticsAnalyzer.calculatePhoneticAttributes(partialInput, null)); final ImmutableRoot properNounRoot = new ImmutableRoot(partialInput, properNounLexeme, phoneticAttributes, null); return Arrays.asList(properNounRoot); } }
From source file:org.xlrnet.tibaija.processor.ExecutionEnvironment.java
/**
 * Register a command as a function command in the execution environment. All programs and other commands can use
 * the new function once it has been registered. Every command may only be associated with at most one execution
 * environment. If a command is registered as a command function (i.e. through this function) it cannot be used as
 * an expression and won't return any value but will probably modify the system state (e.g. display output).
 *
 * @param commandName
 *         Name of the new command under which it can be accessed. Must not be blank and must begin with an
 *         uppercase letter.
 * @param command
 *         An instance of the concrete command. Must not yet belong to an environment.
 * @throws DuplicateCommandException
 *         If a function with this name is already registered, or if the command instance is already
 *         associated with an environment.
 */
public void registerCommandFunction(@NotNull String commandName, @NotNull Command command)
        throws TIRuntimeException {
    checkNotNull(commandName);
    checkNotNull(command);
    checkArgument(StringUtils.isNotBlank(commandName), "Function command name may not be blank");

    final String firstLetter = commandName.substring(0, 1);
    checkArgument(StringUtils.isAllUpperCase(firstLetter),
            "Function command name must begin with a uppercase letter");

    final boolean nameTaken = commandFunctionMap.get(commandName) != null;
    if (nameTaken) {
        throw new DuplicateCommandException("Command function is already registered: " + commandName);
    }

    final boolean boundElsewhere = command.getEnvironment() != null;
    if (boundElsewhere) {
        throw new DuplicateCommandException(
                "New command instance is already registered in another environment");
    }

    // Bind the command to this environment before publishing it in the lookup map.
    command.setEnvironment(this);
    commandFunctionMap.put(commandName, command);

    LOGGER.debug("Registered new command function '{}'", commandName);
}
From source file:org.xlrnet.tibaija.processor.ExecutionEnvironment.java
/** * Register a command as a statement in the execution environment. All programs and other commands can use the new * statement once it has been registered. Every command may only be associated with at most one execution * environment. If a command is registered as a command statement (i.e. through this function) it will not be * available in expressions and should not be called with parentheses. E.g. "DISP 123". Command statements should be * used when the system state must be manipulated (e.g. display output). * * @param commandName/*w ww.jav a 2 s .c o m*/ * Name of the new command under which it can be accessed. Must begin with an uppercase letter. * @param command * An instance of the concrete command. */ public void registerCommandStatement(@NotNull String commandName, @NotNull Command command) throws TIRuntimeException { checkNotNull(commandName); checkNotNull(command); checkArgument(StringUtils.isNotBlank(commandName), "Command statement name may not be blank"); checkArgument(StringUtils.isAllUpperCase(commandName.substring(0, 1)), "Command statement name must begin with a uppercase letter"); if (commandStatementMap.get(commandName) != null) throw new DuplicateCommandException("Command statement is already registered: " + commandName); if (command.getEnvironment() != null) throw new DuplicateCommandException( "New command instance is already registered in another environment"); command.setEnvironment(this); commandStatementMap.put(commandName, command); LOGGER.debug("Registered new command statement '{}'", commandName); }