List of usage examples for java.lang.Character.isLowerCase
public static boolean isLowerCase(int codePoint)
From source file:de.tudarmstadt.ukp.dkpro.spelling.experiments.errormining.SpellingErrorFilter.java
/**
 * Reports whether two characters are compatible in letter case.
 *
 * <p>The pair is rejected only when one character is upper case and the other is
 * lower case. Characters that are neither (digits, punctuation, ...) never cause
 * a mismatch.
 *
 * @param char1 the first character to compare
 * @param char2 the second character to compare
 * @return {@code false} if exactly one is upper case while the other is lower case,
 *         {@code true} otherwise
 */
private boolean haveFirstLettersSameCase(char char1, char char2) {
    boolean upperThenLower = Character.isUpperCase(char1) && Character.isLowerCase(char2);
    boolean lowerThenUpper = Character.isLowerCase(char1) && Character.isUpperCase(char2);
    return !(upperThenLower || lowerThenUpper);
}
From source file:com.sfs.whichdoctor.dao.AddressVerificationDAOImpl.java
/**
 * Checks whether the string contains no lower-case letters.
 *
 * <p>The string is examined code point by code point, so lower-case letters outside
 * the Basic Multilingual Plane (encoded as surrogate pairs) are detected as well.
 * Non-letter characters are ignored, which means an empty string or a string made
 * only of digits and punctuation counts as "all upper".
 *
 * @param s the string to inspect; must not be {@code null}
 * @return {@code true} if no letter in {@code s} is lower case
 * @throws NullPointerException if {@code s} is {@code null}
 */
private static boolean isAllUpper(String s) {
    final int length = s.length();
    int index = 0;
    while (index < length) {
        // Read a full code point; a per-char scan would see surrogate halves,
        // which are never classified as letters.
        int codePoint = s.codePointAt(index);
        if (Character.isLetter(codePoint) && Character.isLowerCase(codePoint)) {
            return false;
        }
        index += Character.charCount(codePoint);
    }
    return true;
}
From source file:org.languagetool.rules.de.CaseRule.java
/**
 * Walks over the non-whitespace tokens of the sentence and collects case-related matches.
 *
 * <p>For every token (after the sentence-start marker and the first word) the loop first
 * considers a possible "should be lowercase" match for base-form tokens that are untagged
 * or have a verb reading, then a possible "should be uppercase" match. Numerous
 * {@code continue} statements skip constellations that would otherwise be false alarms.
 * Two flags are carried across iterations: {@code prevTokenIsDas} (previous token is
 * contained in {@code nounIndicators}) and {@code isPrecededByModalOrAuxiliary}.
 *
 * @param sentence the analyzed sentence to check
 * @return the collected matches, converted via {@code toRuleMatchArray}
 * @throws IOException declared by this method; presumably raised by the tagger lookup
 *         or one of the helper methods (not visible here)
 */
@Override
public RuleMatch[] match(AnalyzedSentence sentence) throws IOException {
    List<RuleMatch> ruleMatches = new ArrayList<>();
    AnalyzedTokenReadings[] tokens = getSentenceWithImmunization(sentence).getTokensWithoutWhitespace();

    boolean prevTokenIsDas = false;
    boolean isPrecededByModalOrAuxiliary = false;
    for (int i = 0; i < tokens.length; i++) {
        // Note: defaulting to the first analysis is only safe if we only query for sentence start
        String posToken = tokens[i].getAnalyzedToken(0).getPOSTag();
        if (JLanguageTool.SENTENCE_START_TAGNAME.equals(posToken)) {
            continue;
        }
        if (i == 1) { // don't care about first word, UppercaseSentenceStartRule does this already
            prevTokenIsDas = nounIndicators.contains(tokens[1].getToken().toLowerCase());
            continue;
        }
        if (i > 0 && isSalutation(tokens[i - 1].getToken())) { // e.g. "Frau Stieg" could be a name, ignore
            continue;
        }
        AnalyzedTokenReadings analyzedToken = tokens[i];
        String token = analyzedToken.getToken();

        // --- lowercase check: untagged tokens or tokens with a verb reading, in base form ---
        boolean isBaseform = analyzedToken.getReadingsLength() >= 1 && analyzedToken.hasLemma(token);
        if ((analyzedToken.getAnalyzedToken(0).getPOSTag() == null || GermanHelper.hasReadingOfType(analyzedToken, GermanToken.POSType.VERB)) && isBaseform) {
            boolean nextTokenIsPersonalOrReflexivePronoun = false;
            if (i < tokens.length - 1) {
                AnalyzedTokenReadings nextToken = tokens[i + 1];
                // avoid false alarm for "Das haben wir getan." etc:
                nextTokenIsPersonalOrReflexivePronoun = nextToken.hasPartialPosTag("PRO:PER") || StringUtils.equalsAny(nextToken.getToken(), "sich", "Sie");
                if (nextToken.hasPosTag("PKT")) {
                    // avoid false alarm for "So sollte das funktionieren." (might also remove true alarms...)
                    continue;
                }
                if (prevTokenIsDas && (DAS_VERB_EXCEPTIONS.contains(nextToken.getToken()) || isFollowedByRelativeOrSubordinateClause(i, tokens)) || (i > 1 && hasPartialTag(tokens[i - 2], "VER:AUX", "VER:MOD"))) {
                    // avoid false alarm for "Er kann ihr das bieten, was sie verdient."
                    // avoid false alarm for "Das wissen die meisten." / "Um das sagen zu können, ..."
                    // avoid false alarm for "Du musst/solltest/könntest das wissen, damit du die Prüfung bestehst / weil wir das gestern besprochen haben."
                    // avoid false alarm for "Wir werden das stoppen."
                    // avoid false alarm for "Wahre Liebe muss das aushalten."
                    continue;
                }
            }
            if (isPrevProbablyRelativePronoun(tokens, i) || (prevTokenIsDas && getTokensWithPartialPosTagCount(tokens, "VER") == 1)) { // ignore sentences containing a single verb, e.g., "Das wissen viele nicht."
                continue;
            }
            potentiallyAddLowercaseMatch(ruleMatches, tokens[i], prevTokenIsDas, token, nextTokenIsPersonalOrReflexivePronoun, sentence);
        }

        // Update the cross-iteration state. Note that the 'continue's above skip this update.
        prevTokenIsDas = nounIndicators.contains(tokens[i].getToken().toLowerCase());
        if (analyzedToken.matchesPosTagRegex("VER:(MOD|AUX):[1-3]:.*")) {
            isPrecededByModalOrAuxiliary = true;
        }

        // --- uppercase check ---
        AnalyzedTokenReadings lowercaseReadings = tagger.lookup(token.toLowerCase());
        if (hasNounReading(analyzedToken)) { // it's the spell checker's task to check that nouns are uppercase
            if (!isPotentialUpperCaseError(i, tokens, lowercaseReadings, isPrecededByModalOrAuxiliary)) {
                continue;
            }
        } else if (analyzedToken.hasPosTagStartingWith("SUB:") && i < tokens.length - 1 && Character.isLowerCase(tokens[i + 1].getToken().charAt(0)) && tokens[i + 1].matchesPosTagRegex("VER:[123]:.+")) {
            // "Viele Minderjährige sind" but not "Das wirklich Wichtige Verfahren ist"
            continue;
        }
        // Neither the token nor its lowercase form could be looked up: nothing to compare against.
        if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && lowercaseReadings == null) {
            continue;
        }
        if (analyzedToken.getAnalyzedToken(0).getPOSTag() == null && lowercaseReadings != null && (lowercaseReadings.getAnalyzedToken(0).getPOSTag() == null || analyzedToken.getToken().endsWith("innen"))) {
            continue; // unknown word, probably a name etc.
        }
        potentiallyAddUppercaseMatch(ruleMatches, tokens, i, analyzedToken, token, lowercaseReadings, sentence);
    }
    return toRuleMatchArray(ruleMatches);
}
From source file:it.cnr.isti.hpc.dexter.disambiguation.TurkishEntityDisambiguator.java
@Override public EntityMatchList disambiguate(DexterLocalParams localParams, SpotMatchList sml) { entityScoreMap = new HashMap<String, EntityScores>(); selectedEntities = new HashSet<String>(); Multiset<String> entityFrequencyMultiset = HashMultiset.create(); EntityMatchList entities = sml.getEntities(); String inputText = localParams.getParams().get("text"); String algorithm = Property.getInstance().get("algorithm"); String ambigious = Property.getInstance().get("algorithm.ambigious"); List<Token> inputTokens = Zemberek.getInstance().disambiguateFindTokens(inputText, false, true); List<Double> documentVector = DescriptionEmbeddingAverage.getAverageVectorList(inputText); Multiset<String> inputTokensMultiset = HashMultiset.create(); for (Token token : inputTokens) { inputTokensMultiset.add(token.getMorphText()); }//from w w w . j a v a2 s. co m Multiset<String> domainMultiset = HashMultiset.create(); Multiset<String> typeMultiset = HashMultiset.create(); HashMap<String, Double> entitySimMap = new HashMap<String, Double>(); // if (printCandidateEntities) { // printEntities(entities); // } HashSet<String> words = new HashSet<String>(); Multiset<String> leskWords = HashMultiset.create(); // first pass for finding number of types and domains for (int i = 0; i < entities.size(); i++) { EntityMatch em = entities.get(i); String id = em.getId(); if (!entityFrequencyMultiset.contains(id)) { entityFrequencyMultiset.add(id); Entity entity = em.getEntity(); words.add(entity.getShingle().getText()); String type = entity.getPage().getType(); if (type != null && type.length() > 0) { typeMultiset.add(type); } String domain = entity.getPage().getDomain(); if (domain != null && domain.length() > 0) { domainMultiset.add(domain); } String desc = entity.getPage().getDescription(); List<Token> tokens = Zemberek.getInstance().disambiguateFindTokens(desc, false, true); for (Token token : tokens) { leskWords.add(token.getMorphText()); } } else { entityFrequencyMultiset.add(id); } } int 
maxDomainCount = 0; for (String domain : Multisets.copyHighestCountFirst(domainMultiset).elementSet()) { maxDomainCount = domainMultiset.count(domain); break; } int maxTypeCount = 0; for (String type : Multisets.copyHighestCountFirst(typeMultiset).elementSet()) { maxTypeCount = typeMultiset.count(type); break; } double maxSuffixScore = 0, maxLeskScore = 0, maxSimpleLeskScore = 0, maxLinkScore = 0, maxHashInfoboxScore = 0, maxwordvecDescriptionLocalScore = 0, maxHashDescriptionScore = 0, maxPopularityScore = 0, maxWordvectorAverage = 0, maxWordvecLinksScore = 0; // second pass compute similarities between entities in a window int currentSpotIndex = -1; SpotMatch currentSpot = null; for (int i = 0; i < entities.size(); i++) { EntityMatch em = entities.get(i); SpotMatch spot = em.getSpot(); if (currentSpot == null || spot != currentSpot) { currentSpotIndex++; currentSpot = spot; } String id = em.getId(); Entity entity = entities.get(i).getEntity(); EntityPage page = entities.get(i).getEntity().getPage(); String domain = page.getDomain(); String type = page.getType(); Shingle shingle = entity.getShingle(); /* windowing algorithms stars */ int left = currentSpotIndex - window; int right = currentSpotIndex + window; if (left < 0) { right -= left; left = 0; } if (right > sml.size()) { left += (sml.size()) - right; right = sml.size(); if (left < 0) { left = 0; } } double linkScore = 0, hashInfoboxScore = 0, wordvecDescriptionLocalScore = 0, hashDescriptionScore = 0, wordvecLinksScore = 0; for (int j = left; j < right; j++) { SpotMatch sm2 = sml.get(j); EntityMatchList entities2 = sm2.getEntities(); for (EntityMatch em2 : entities2) { String id2 = em2.getId(); EntityPage page2 = em2.getEntity().getPage(); int counter = 0; if (!ambigious.equals("true")) { for (EntityMatch entityMatch : entities2) { if (entityMatch.getId().startsWith("w")) { counter++; } } } if ((ambigious.equals("true") || counter == 1) && em.getSpot() != em2.getSpot() && !id.equals(id2)) { // Link 
Similarity calculation starts double linkSim = 0; if (id.startsWith("w") && id2.startsWith("w")) { if (entitySimMap.containsKey("link" + id + id2)) { linkSim = entitySimMap.get("link" + id + id2); } else { HashSet<String> set1 = Sets.newHashSet(page.getLinks().split(" ")); HashSet<String> set2 = Sets.newHashSet(page2.getLinks().split(" ")); linkSim = JaccardCalculator.calculateSimilarity(set1, set2); entitySimMap.put("link" + id + id2, linkSim); } linkScore += linkSim; // Link Similarity calculation ends } // Entity embedding similarity calculation starts double eeSim = 0; if (id.startsWith("w") && id2.startsWith("w")) { if (entitySimMap.containsKey("ee" + id + id2)) { eeSim = entitySimMap.get("ee" + id + id2); } else { eeSim = EntityEmbeddingSimilarity.getInstance().getSimilarity(page, page2); entitySimMap.put("ee" + id + id2, eeSim); } hashInfoboxScore += eeSim; } double w2veclinksSim = 0; if (id.startsWith("w") && id2.startsWith("w")) { if (entitySimMap.containsKey("wl" + id + id2)) { w2veclinksSim = entitySimMap.get("wl" + id + id2); } else { w2veclinksSim = AveragePooling.getInstance().getSimilarity(page.getWord2vec(), page2.getWord2vec()); entitySimMap.put("wl" + id + id2, w2veclinksSim); } wordvecLinksScore += w2veclinksSim; } // Entity embedding similarity calculation ends // Description word2vec similarity calculation // starts double word2vecSim = 0; if (entitySimMap.containsKey("w2v" + id + id2)) { word2vecSim = entitySimMap.get("w2v" + id + id2); } else { word2vecSim = AveragePooling.getInstance().getSimilarity(page2.getDword2vec(), page.getDword2vec()); entitySimMap.put("w2v" + id + id2, word2vecSim); } wordvecDescriptionLocalScore += word2vecSim; // Description word2vec similarity calculation ends // Description autoencoder similarity calculation // starts double autoVecSim = 0; if (entitySimMap.containsKey("a2v" + id + id2)) { autoVecSim = entitySimMap.get("a2v" + id + id2); } else { autoVecSim = 
AveragePooling.getInstance().getSimilarity(page2.getDautoencoder(), page.getDautoencoder()); entitySimMap.put("a2v" + id + id2, autoVecSim); } hashDescriptionScore += autoVecSim; // Description autoencoder similarity calculation // ends } } } if (linkScore > maxLinkScore) { maxLinkScore = linkScore; } if (hashInfoboxScore > maxHashInfoboxScore) { maxHashInfoboxScore = hashInfoboxScore; } if (wordvecDescriptionLocalScore > maxwordvecDescriptionLocalScore) { maxwordvecDescriptionLocalScore = wordvecDescriptionLocalScore; } if (hashDescriptionScore > maxHashDescriptionScore) { maxHashDescriptionScore = hashDescriptionScore; } if (wordvecLinksScore > maxWordvecLinksScore) { maxWordvecLinksScore = wordvecLinksScore; } /* windowing algorithms ends */ double domainScore = 0; if (domainMultiset.size() > 0 && maxDomainCount > 1 && domainMultiset.count(domain) > 1) { domainScore = (double) domainMultiset.count(domain) / maxDomainCount; } double typeScore = 0; if (typeMultiset.size() > 0 && maxTypeCount > 1 && typeMultiset.count(type) > 1) { typeScore = (double) typeMultiset.count(type) / maxTypeCount; } if (typeBlackList.contains(type)) { typeScore /= 10; } double typeContentScore = 0; if (type.length() > 0 && StringUtils.containsIgnoreCase(words.toString(), type)) { typeContentScore = 1; } double typeClassifierScore = TypeClassifier.getInstance().predict(page, page.getTitle(), page.getType(), entity.getShingle().getSentence()); double wordvecDescriptionScore = AveragePooling.getInstance().getSimilarity(documentVector, page.getDword2vec()); if (wordvecDescriptionScore > maxWordvectorAverage) { maxWordvectorAverage = wordvecDescriptionScore; } double suffixScore = 0; if (type != null && type.length() > 0) { Set<String> suffixes = new HashSet<String>(); String t = entity.getTitle().toLowerCase(new Locale("tr", "TR")); for (int x = 0; x < entities.size(); x++) { EntityMatch e2 = entities.get(x); if (e2.getId().equals(entity.getId())) { suffixes.add(e2.getMention()); } } 
suffixes.remove(t); suffixes.remove(entity.getTitle()); // String inputTextLower = inputText.toLowerCase(new // Locale("tr", // "TR")); // while (inputTextLower.contains(t)) { // int start = inputTextLower.indexOf(t); // int end = inputTextLower.indexOf(" ", start + t.length()); // if (end > start) { // String suffix = inputTextLower.substring(start, end); // // .replaceAll("\\W", ""); // if (suffix.contains("'") // || (Zemberek.getInstance().hasMorph(suffix) // && !suffix.equals(t) && suffix.length() > 4)) { // suffixes.add(suffix); // } // inputTextLower = inputTextLower.substring(end); // } else { // break; // } // } if (suffixes.size() >= minSuffix) { for (String suffix : suffixes) { double sim = gd.calculateSimilarity(suffix, type); suffixScore += sim; } } } // String entitySuffix = page.getSuffix(); // String[] inputSuffix = shingle.getSuffix().split(" "); // for (int j = 0; j < inputSuffix.length; j++) { // if (entitySuffix.contains(inputSuffix[j])) { // suffixScore += 0.25f; // } // } if (suffixScore > maxSuffixScore) { maxSuffixScore = suffixScore; } // if (id.equals("w691538")) { // LOGGER.info(""); // } double letterCaseScore = 0; int lc = page.getLetterCase(); if (StringUtils.isAllLowerCase(em.getMention()) && lc == 0 && id.startsWith("t")) { letterCaseScore = 1; } else if (StringUtils.isAllUpperCase(em.getMention()) && lc == 1 && id.startsWith("w")) { letterCaseScore = 1; } else if (Character.isUpperCase(em.getMention().charAt(0)) && lc == 2 && id.startsWith("w")) { letterCaseScore = 1; } else if (StringUtils.isAllLowerCase(em.getMention()) && id.startsWith("t")) { letterCaseScore = 1; } double nameScore = 1 - LevenshteinDistanceCalculator.calculateDistance(page.getTitle(), Zemberek.removeAfterSpostrophe(em.getMention())); double popularityScore = page.getRank(); if (id.startsWith("w")) { popularityScore = Math.log10(popularityScore + 1); if (popularityScore > maxPopularityScore) { maxPopularityScore = popularityScore; } } double leskScore = 0, 
simpleLeskScore = 0; String desc = em.getEntity().getPage().getDescription(); if (desc != null) { List<Token> tokens = Zemberek.getInstance().disambiguateFindTokens(desc, false, true); for (Token token : tokens) { if (inputTokensMultiset.contains(token.getMorphText()) && !TurkishNLP.isStopWord(token.getMorphText())) { simpleLeskScore += inputTokensMultiset.count(token.getMorphText()); } if (leskWords.contains(token.getMorphText()) && !TurkishNLP.isStopWord(token.getMorphText())) { leskScore += leskWords.count(token.getMorphText()); } } leskScore /= Math.log(tokens.size() + 1); simpleLeskScore /= Math.log(tokens.size() + 1); if (leskScore > maxLeskScore) { maxLeskScore = leskScore; } if (simpleLeskScore > maxSimpleLeskScore) { maxSimpleLeskScore = simpleLeskScore; } if (!entityScoreMap.containsKey(id)) { EntityScores scores = new EntityScores(em, id, popularityScore, nameScore, letterCaseScore, suffixScore, wordvecDescriptionScore, typeContentScore, typeScore, domainScore, hashDescriptionScore, wordvecDescriptionLocalScore, hashInfoboxScore, linkScore, wordvecLinksScore, leskScore, simpleLeskScore, typeClassifierScore); entityScoreMap.put(id, scores); } else { EntityScores entityScores = entityScoreMap.get(id); entityScores.setHashInfoboxScore((entityScores.getHashInfoboxScore() + hashInfoboxScore) / 2); entityScores.setHashDescriptionScore( (entityScores.getHashInfoboxScore() + hashDescriptionScore) / 2); entityScores.setLinkScore((entityScores.getLinkScore() + linkScore) / 2); entityScores.setWordvecDescriptionLocalScore( (entityScores.getWordvecDescriptionLocalScore() + wordvecDescriptionLocalScore) / 2); entityScores .setWordvecLinksScore((entityScores.getWordvecLinksScore() + wordvecLinksScore) / 2); entityScores.setLeskScore((entityScores.getLeskScore() + leskScore) / 2); } } } /* normalization and total score calculation starts */ Set<String> set = new HashSet<String>(); for (int i = 0; i < entities.size(); i++) { EntityMatch em = entities.get(i); String id = 
em.getId(); EntityScores entityScores = entityScoreMap.get(id); if (set.contains(id)) { continue; } if (id.startsWith("w")) { if (maxLinkScore > 0 && entityScores.getLinkScore() > 0) { entityScores.setLinkScore(entityScores.getLinkScore() / maxLinkScore); } if (maxHashInfoboxScore > 0 && entityScores.getHashInfoboxScore() > 0) { entityScores.setHashInfoboxScore(entityScores.getHashInfoboxScore() / maxHashInfoboxScore); } if (maxWordvecLinksScore > 0 && entityScores.getWordvecLinksScore() > 0) { entityScores.setWordvecLinksScore(entityScores.getWordvecLinksScore() / maxWordvecLinksScore); } if (maxPopularityScore > 0 && entityScores.getPopularityScore() > 0) { entityScores.setPopularityScore(entityScores.getPopularityScore() / maxPopularityScore); } } if (maxwordvecDescriptionLocalScore > 0 && entityScores.getWordvecDescriptionLocalScore() > 0) { entityScores.setWordvecDescriptionLocalScore( entityScores.getWordvecDescriptionLocalScore() / maxwordvecDescriptionLocalScore); } if (maxHashDescriptionScore > 0 && entityScores.getHashDescriptionScore() > 0) { entityScores .setHashDescriptionScore(entityScores.getHashDescriptionScore() / maxHashDescriptionScore); } if (maxWordvectorAverage > 0 && entityScores.getWordvecDescriptionScore() > 0) { entityScores.setWordvecDescriptionScore( entityScores.getWordvecDescriptionScore() / maxWordvectorAverage); } if (maxLeskScore > 0 && entityScores.getLeskScore() > 0) { entityScores.setLeskScore(entityScores.getLeskScore() / maxLeskScore); } if (maxSimpleLeskScore > 0 && entityScores.getSimpleLeskScore() > 0) { entityScores.setSimpleLeskScore(entityScores.getSimpleLeskScore() / maxSimpleLeskScore); } if (maxSuffixScore > 0 && entityScores.getSuffixScore() > 0) { entityScores.setSuffixScore(entityScores.getSuffixScore() / maxSuffixScore); } set.add(id); } LOGGER.info("\t" + 
"id\tTitle\tURL\tScore\tPopularity\tName\tLesk\tSimpeLesk\tCase\tNoun\tSuffix\tTypeContent\tType\tDomain\twordvecDescription\twordvecDescriptionLocal\thashDescription\thashInfobox\tword2vecLinks\tLink\t\ttypeClassifier\tDescription"); for (int i = 0; i < entities.size(); i++) { EntityMatch em = entities.get(i); String id = em.getId(); EntityScores e = entityScoreMap.get(id); double wikiScore = 0; if (id.startsWith("w") && Character.isUpperCase(em.getMention().charAt(0))) { wikiScore = wikiWeight; } else if (id.startsWith("t") && Character.isLowerCase(em.getMention().charAt(0))) { wikiScore = wikiWeight; } // if(id.equals("w508792")){ // LOGGER.info(""); // } double totalScore = wikiScore + e.getPopularityScore() * popularityWeight + e.getNameScore() * nameWeight + e.getLeskScore() * leskWeight + e.getSimpleLeskScore() * simpleLeskWeight + e.getLetterCaseScore() * letterCaseWeight + e.getSuffixScore() * suffixWeight + e.getTypeContentScore() * typeContentWeight + e.getTypeScore() * typeWeight + e.getDomainScore() * domainWeight + e.getWordvecDescriptionScore() * wordvecDescriptionWeight + e.getWordvecDescriptionLocalScore() * wordvecDescriptionLocalWeight + e.getHashDescriptionScore() * hashDescriptionWeight + e.getHashInfoboxScore() * hashInfoboxWeight + e.getWordvecLinksScore() * word2vecLinksWeight + e.getLinkScore() * linkWeight + e.getTypeClassifierkScore() * typeClassifierkWeight; if (ranklib == true) { totalScore = RankLib.getInstance().score(e); } if (em.getEntity().getPage().getUrlTitle().contains("(")) { totalScore /= 2; } em.setScore(totalScore); e.setScore(totalScore); LOGGER.info("\t" + id + "\t" + em.getEntity().getPage().getTitle() + "\t" + em.getEntity().getPage().getUrlTitle() + "\t" + em.getScore() + "\t" + e.getPopularityScore() * popularityWeight + "\t" + e.getNameScore() * nameWeight + "\t" + e.getLeskScore() * leskWeight + "\t" + e.getSimpleLeskScore() * simpleLeskWeight + "\t" + e.getLetterCaseScore() * letterCaseWeight + "\t" + 
e.getSuffixScore() * suffixWeight + "\t" + e.getTypeContentScore() * typeContentWeight + "\t" + e.getTypeScore() * typeWeight + "\t" + e.getDomainScore() * domainWeight + "\t" + e.getWordvecDescriptionScore() * wordvecDescriptionWeight + "\t" + e.getWordvecDescriptionLocalScore() * wordvecDescriptionLocalWeight + "\t" + e.getHashDescriptionScore() * hashDescriptionWeight + "\t" + e.getHashInfoboxScore() * hashInfoboxWeight + "\t" + e.getWordvecLinksScore() * word2vecLinksWeight + "\t" + e.getLinkScore() * linkWeight + "\t" + e.getTypeClassifierkScore() * typeClassifierkWeight + "\t" + em.getEntity().getPage().getDescription()); } // if (annotateEntities) { // annotateEntities(localParams.getParams().get("originalText"), sml); // } EntityMatchList eml = new EntityMatchList(); for (SpotMatch match : sml) { EntityMatchList list = match.getEntities(); if (!list.isEmpty()) { list.sort(); eml.add(list.get(0)); selectedEntities.add(list.get(0).getId()); } } return eml; }
From source file:models.persistence.lecture.Lecture.java
@JsonIgnore public String getShortName() { StringBuilder sb = new StringBuilder(); for (int i = 0; i < name.length(); i++) { if (Character.isUpperCase(name.charAt(i)) || Character.isDigit(name.charAt(i)) || name.charAt(i) == '/' || name.charAt(i) == ' ' || name.charAt(i) == '+' || name.charAt(i) == '-') { sb.append(name.charAt(i));//from w w w. j av a 2s.com if (Character.isUpperCase(name.charAt(i))) { for (int j = i; j < i + 3 && j < name.length(); j++) { if (Character.isLowerCase(name.charAt(j))) { sb.append(name.charAt(j)); } } } } } return sb.toString().replaceAll(" ", "").replaceAll("AE", "").replaceAll("OE", "") .replaceAll("UE", "").trim(); //return sb.toString().replaceAll("","AE").replaceAll("","OE").replaceAll("","UE").trim(); }
From source file:com.svi.uzabase.logic.ValidationProcess.java
private List<XMLHolder> validateData(List<XMLHolder> xmlBatchHolder) { try {// w ww. j a v a 2s . c om int totalCounter = 0; //Initialize dictionary String dictFileName = "file://./dic/english.jar"; String configFile = "file://./classes/spellCheck.config"; BasicDictionary dictionary = new BasicDictionary(dictFileName); SpellCheckConfiguration configuration = new SpellCheckConfiguration(configFile); BasicSuggester suggester = new BasicSuggester(configuration); suggester.attach(dictionary); // create SpellCheck object based on configuration and specify Suggester SpellCheck spellCheck = new SpellCheck(configuration); spellCheck.setSuggester(suggester); //set for jprogress bar for (XMLHolder h : xmlBatchHolder) { totalCounter += h.size(); } progress = new AtomicInteger(0); total = new AtomicInteger(totalCounter); mf.setJprogressValues(total, progress); //validation process begins here String[] invalidWords = { "corporation", "inc.", "city", "corp.", "st.", "co.", "ltd." }; String[] invalidBoardWords = { "other", "oth" }; String[] validWords = { "loc", "to", "ext", "local" }; String[] invalidCharacters = { ",", "/", "\\", "[", "]", "\"", ":", "^", "{", "}", "%", "+", "#", "(", ")" }; String[] splitter; String tempURL; SimpleDateFormat sdf = new SimpleDateFormat("YYYY/MM/DD"); SimpleDateFormat fiscalYear = new SimpleDateFormat("MM/DD"); sdf.setLenient(false); Set<String> officerList = new HashSet<>(); List<Double> percentOwnership = new ArrayList<>(); UrlValidator urlValidator = new UrlValidator(); Date date = null; for (XMLHolder h : xmlBatchHolder) { for (Field f : h) { mf.loader("Validating fields: ", false); if (!f.getType().equals("none") && !f.getValue().equals("*N/A")) { switch (f.getType()) { case "city": if (f.getValue().isEmpty() || f.getValue().equals("")) { f.add("Address is empty"); } else { if (hasWhiteSpaceTrailing(f.getValue())) { f.add("has trailing white space "); } if (cityList.indexOf(f.getValue()) < 0) { f.add("City not found on list!"); } } break; 
case "province": if (f.getValue().isEmpty() || f.getValue().equals("")) { f.add("Address is empty"); } else { if (hasWhiteSpaceTrailing(f.getValue())) { f.add("has trailing white space "); } if (provinceList.indexOf(f.getValue()) < 0) { f.add("Province not found on list!"); } } break; case "tel": if (!f.getValue().isEmpty() || !f.getValue().equals("")) { // if (f.getValue().matches("[a-z A-Z]+")) { if (f.getValue().matches(".*[a-zA-Z]+.*")) { for (String s : validWords) { if (!f.getValue().contains(s)) { f.add("Invalid telephone number"); } } } if (f.getValue().replace(" ", "").replace("-", "").length() < 7 || f.getValue().replace(" ", "").replace("-", "").length() > 8) { f.add("Invalid telephone number length"); } if (hasWhiteSpaceTrailing(f.getValue())) { f.add("has trailing white space "); } if (StringUtils.countMatches(f.getValue(), "-") > 2) { f.add("Invalid telephone number"); } for (String c : invalidCharacters) { if (f.getValue().contains(c)) { f.add("Contains invalid character [ " + c + " ]"); break; } } } break; case "fax": if (!f.getValue().isEmpty() || !f.getValue().equals("")) { // if (f.getValue().matches("[a-z A-Z]+")) { if (f.getValue().matches(".*[a-zA-Z]+.*")) { for (String s : validWords) { if (!f.getValue().contains(s)) { f.add("Invalid fax number"); } } } if (f.getValue().replace(" ", "").length() < 6) { f.add("Invalid fax number"); } if (StringUtils.countMatches(f.getValue(), "-") > 1) { f.add("Invalid fax number"); } if (hasWhiteSpaceTrailing(f.getValue())) { f.add("has trailing white space "); } for (String c : invalidCharacters) { if (f.getValue().contains(c)) { f.add("Contains invalid character [ " + c + " ]"); break; } } } break; case "person": if (!f.getValue().isEmpty() || !f.getValue().equals("")) { if (!f.getValue().matches("[a-zA-Z\\.,\\- ()]+")) { f.add("Invalid name"); } if (f.getValue().matches("[a-z ]+")) { f.add("All small caps"); } if (f.getValue().matches("\\w+")) { f.add("Only one word"); } if (f.getValue().replace(" ", 
"").length() > 30) { f.add("More than 30 characters."); } if (f.getValue().replace(" ", "").length() < 2) { f.add("Invalid name."); } if (hasWhiteSpaceTrailing(f.getValue())) { f.add("has trailing white space "); } } break; case "email": if (!f.getValue().isEmpty() || !f.getValue().equals("")) { if (!EmailValidator.getInstance(true).isValid(f.getValue())) { f.add("Invalid email"); } } break; case "website": if (!f.getValue().isEmpty() || !f.getValue().equals("")) { if (!f.getValue().contains("http")) { tempURL = "http://" + f.getValue(); } else { tempURL = f.getValue(); } if (!urlValidator.isValid(tempURL)) { f.add("Invalid website"); } if (hasWhiteSpaceTrailing(f.getValue())) { f.add("has trailing white space "); } } break; case "name": officerList.add(f.getValue()); if (!f.getValue().isEmpty() || !f.getValue().equals("")) { if (!f.getValue().matches("[a-zA-Z\\.,\\-() ]+")) { f.add("Invalid name"); } if (f.getValue().replace(" ", "").length() > 30) { f.add("More than 50 characters."); } if (f.getValue().matches("[a-z ]+")) { f.add("All small caps"); } if (f.getValue().matches("\\w+")) { f.add("Only one word"); } if (f.getValue().replace(" ", "").length() < 2) { f.add("Invalid name."); } if (hasWhiteSpaceTrailing(f.getValue())) { f.add("has trailing white space "); } for (String s : invalidWords) { if (f.getValue().contains(s)) { f.add("Contains invalid word: " + s); break; } } } break; case "stockholder": officerList.add(f.getValue()); if (!f.getValue().isEmpty() || !f.getValue().equals("")) { if (!f.getValue().matches("[a-zA-Z\\.,\\-() ]+")) { f.add("Invalid name"); } if (f.getValue().replace(" ", "").length() > 30) { f.add("More than 50 characters."); } if (f.getValue().matches("[a-z ]+")) { f.add("All small caps"); } if (f.getValue().matches("\\w+")) { f.add("Only one word"); } if (f.getValue().replace(" ", "").length() < 2) { f.add("Invalid name."); } if (hasWhiteSpaceTrailing(f.getValue())) { f.add("has trailing white space "); } for (String s : invalidWords) 
{ if (f.getValue().contains(s)) { f.add("Contains invalid word: " + s); break; } } } break; case "board": if (!f.getValue().isEmpty() || !f.getValue().equals("")) { if (!f.getValue().matches("[a-zA-Z\\.,\\-() ]+")) { f.add("Invalid position"); } for (String c : invalidCharacters) { if (f.getValue().contains(c)) { f.add("Contains invalid character [ " + c + " ]"); break; } } for (String c : invalidBoardWords) { if (f.getValue().contains(c)) { f.add("Contains invalid word [ " + c + " ]"); break; } } if (f.getValue().equalsIgnoreCase("N") || f.getValue().equalsIgnoreCase("Y")) { f.add("is letter " + f.getValue() + " only"); } if (Character.isLowerCase(f.getValue().charAt(0))) { f.add("starts with a lower case letter"); } spellCheck.setText(f.getValue(), Constants.DOC_TYPE_TEXT, "en"); spellCheck.check(); if (spellCheck.hasMisspelt()) { f.add("word is misspelled."); } } break; case "corporation": if (companyList.indexOf(f.getValue().toUpperCase()) < 0) { f.add("Company name not found on table."); } break; case "sec": if (StringUtils.countMatches(f.getValue(), "-") > 1) { f.add("Invalid SEC number"); } if (f.getValue().replace(" ", "").length() > 9) { f.add("SEC number more than 9 digits."); } if (hasWhiteSpaceTrailing(f.getValue())) { f.add("SEC has trailing white space."); } for (String c : invalidCharacters) { if (f.getValue().contains(c)) { f.add("Contains invalid character [ " + c + " ]"); break; } } break; case "tin": if (f.getValue().isEmpty() || f.getValue().equals("")) { f.add("TIN is empty"); } if (hasWhiteSpaceTrailing(f.getValue())) { f.add("TIN has trailing white space."); } if (!f.getValue().matches("[0-9]+")) { f.add("invalid TIN number"); } if (f.getValue().replace(" ", "").replace("-", "").length() > 12 || f.getValue().replace(" ", "").replace("-", "").length() < 9) { f.add("TIN number invalid length."); } if (StringUtils.countMatches(f.getValue(), "-") > 1) { f.add("Invalid TIN number"); } for (String c : invalidCharacters) { if 
(f.getValue().contains(c)) { f.add("Contains invalid character [ " + c + " ]"); break; } } break; case "nationality": if (!f.getValue().isEmpty() || !f.getValue().equals("")) { if (nationalityList.indexOf(f.getValue()) < 0) { f.add("nationality is misspelled."); } } break; case "purpose": splitter = f.getValue().split(" "); for (int i = 0; i < splitter.length; i++) { spellCheck.setText(splitter[i], Constants.DOC_TYPE_TEXT, "en"); spellCheck.check(); if (spellCheck.hasMisspelt()) { f.add("word is misspelled. ( " + spellCheck.getMisspelt() + " )"); } } break; case "periodCovered": try { date = sdf.parse(f.getValue()); if (!f.getValue().equals(sdf.format(date))) { f.add("Invalid date format"); } } catch (ParseException ex) { f.add("Invalid date format"); } break; case "fiscalYear": try { date = fiscalYear.parse(f.getValue()); if (!f.getValue().equals(sdf.format(date))) { f.add("Invalid date format"); } } catch (ParseException ex) { f.add("Invalid date format"); } break; case "position": if (f.getValue().contains("\\d+")) { f.add("Invalid position/designation"); } if (f.getValue().replace(" ", "").length() > 10 || f.getValue().replace(" ", "").length() < 3) { f.add("More than 30 characters."); } break; case "shareType": if (f.getValue().toLowerCase().contains("total")) { f.add("Share type contains total."); } if (f.getValue().replace(" ", "").length() > 20) { f.add("Share type More than 20 characters."); } break; case "ownership": percentOwnership.add(Double.parseDouble(f.getValue())); if (Double.parseDouble(f.getValue()) > 100) { f.add("Percent ownership more than 100%"); } break; default: break; } } else if (f.getType().equals("tin") && f.getValue().equals("*N/A")) { f.add("TIN is N/A"); } } } } catch (EncryptedDocumentException | SuggesterException ex) { Logger.getLogger(ValidationProcess.class.getName()).log(Level.SEVERE, null, ex); } return xmlBatchHolder; }
From source file:net.yacy.cora.document.id.MultiProtocolURL.java
/** * <p>// w w w. jav a 2 s . co m * Percent-encode/escape an URL path part according to the allowed characters * specified in RFC3986 (formerly RFC1738 and RFC2396). Uses UTF-8 character * codes for non-ASCII. * </p> * <p> * When isPattern is true, the string is processed as a regular expression, and * therefore meta-characters used by the {@link Pattern} class are not * percent-encoded. * </p> * * @param pathToEscape the path part to escape. * @param isPattern when true, regular meta-characters are not escaped * @return an escaped path regular expression with only allowed ASCII * characters, or null when pathPattern is null. * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">RFC3986 * percent-encoding section</a> * @see <z href="https://tools.ietf.org/html/rfc3986#appendix-A">RFC3986 path * definition</a> */ private static String escapePath(final String pathToEscape, final boolean isPattern) { if (pathToEscape == null) { return pathToEscape; } final StringBuilder ptmp = new StringBuilder(pathToEscape.length() + 10); boolean modified = false; final int len = pathToEscape.length(); int i = 0; while (i < len) { int ch = pathToEscape.charAt(i); if (ch == '%' && (i + 2) < len) { final char digit1 = pathToEscape.charAt(i + 1); final char digit2 = pathToEscape.charAt(i + 2); if (isHexDigit(digit1) && isHexDigit(digit2)) { /* Already percent-encoded character */ ptmp.append((char) ch); /* Normalize hexadecimal digits to upper case */ if (Character.isLowerCase(digit1) || Character.isLowerCase(digit2)) { modified = true; } ptmp.append(Character.toUpperCase(digit1)); ptmp.append(Character.toUpperCase(digit2)); i += 2; } else { /* Not a valid percent-encoded character : we encode it now */ ptmp.append(hex[ch]); modified = true; } } else if (isPattern && PATTERN_METACHARACTERS.get(ch)) { ptmp.append((char) ch); } else if (ch <= 0x7F) { if (UNRESERVED_PATH.get(ch)) { ptmp.append((char) ch); } else { ptmp.append(hex[ch]); modified = true; } } else if (ch <= 
0x07FF) { // non-ASCII <= 0x7FF ptmp.append(hex[0xc0 | (ch >> 6)]); ptmp.append(hex[0x80 | (ch & 0x3F)]); modified = true; } else { // 0x7FF < ch <= 0xFFFF ptmp.append(hex[0xe0 | (ch >> 12)]); ptmp.append(hex[0x80 | ((ch >> 6) & 0x3F)]); ptmp.append(hex[0x80 | (ch & 0x3F)]); modified = true; } i++; } if (modified) { return ptmp.toString(); } return pathToEscape; }
From source file:org.gvnix.service.roo.addon.addon.util.WsdlParserUtils.java
/**
 * Returns the given name with its first character converted to upper case.
 *
 * @param name the text to capitalize; may be null or empty
 * @return the capitalized text, or {@code name} itself when it is null, empty or does not
 *         start with a lower-case character
 */
public static String capitalizeFirstChar(String name) {
    if (name != null && !name.isEmpty()) {
        final char first = name.charAt(0);
        if (Character.isLowerCase(first)) {
            final StringBuilder capitalized = new StringBuilder(name.length());
            capitalized.append(Character.toUpperCase(first));
            capitalized.append(name, 1, name.length());
            return capitalized.toString();
        }
    }
    return name;
}
From source file:org.apache.axis.utils.JavaUtils.java
/** * Map an XML name to a Java identifier per * the mapping rules of JSR 101 (in version 1.0 this is * "Chapter 20: Appendix: Mapping of XML Names" * // w ww. j a v a2 s. c o m * @param name is the xml name * @return the java name per JSR 101 specification */ public static String xmlNameToJava(String name) { // protect ourselves from garbage if (name == null || name.equals("")) return name; char[] nameArray = name.toCharArray(); int nameLen = name.length(); StringBuffer result = new StringBuffer(nameLen); boolean wordStart = false; // The mapping indicates to convert first character. int i = 0; while (i < nameLen && (isPunctuation(nameArray[i]) || !Character.isJavaIdentifierStart(nameArray[i]))) { i++; } if (i < nameLen) { // Decapitalization code used to be here, but we use the // Introspector function now after we filter out all bad chars. result.append(nameArray[i]); //wordStart = !Character.isLetter(nameArray[i]); wordStart = !Character.isLetter(nameArray[i]) && nameArray[i] != "_".charAt(0); } else { // The identifier cannot be mapped strictly according to // JSR 101 if (Character.isJavaIdentifierPart(nameArray[0])) { result.append("_" + nameArray[0]); } else { // The XML identifier does not contain any characters // we can map to Java. Using the length of the string // will make it somewhat unique. result.append("_" + nameArray.length); } } // The mapping indicates to skip over // all characters that are not letters or // digits. The first letter/digit // following a skipped character is // upper-cased. for (++i; i < nameLen; ++i) { char c = nameArray[i]; // if this is a bad char, skip it and remember to capitalize next // good character we encounter if (isPunctuation(c) || !Character.isJavaIdentifierPart(c)) { wordStart = true; continue; } if (wordStart && Character.isLowerCase(c)) { result.append(Character.toUpperCase(c)); } else { result.append(c); } // If c is not a character, but is a legal Java // identifier character, capitalize the next character. 
// For example: "22hi" becomes "22Hi" //wordStart = !Character.isLetter(c); wordStart = !Character.isLetter(c) && c != "_".charAt(0); } // covert back to a String String newName = result.toString(); // Follow JavaBean rules, but we need to check if the first // letter is uppercase first if (Character.isUpperCase(newName.charAt(0))) newName = Introspector.decapitalize(newName); // check for Java keywords if (isJavaKeyword(newName)) newName = makeNonJavaKeyword(newName); return newName; }
From source file:org.languagetool.rules.de.CaseRule.java
private void potentiallyAddLowercaseMatch(List<RuleMatch> ruleMatches, AnalyzedTokenReadings tokenReadings, boolean prevTokenIsDas, String token, boolean nextTokenIsPersonalOrReflexivePronoun, AnalyzedSentence sentence) {//from w w w .jav a 2 s. c o m // e.g. essen -> Essen if (prevTokenIsDas && !nextTokenIsPersonalOrReflexivePronoun && Character.isLowerCase(token.charAt(0)) && !substVerbenExceptions.contains(token) && tokenReadings.hasPosTagStartingWith("VER:INF") && !tokenReadings.isIgnoredBySpeller() && !tokenReadings.isImmunized()) { addRuleMatch(ruleMatches, sentence, LOWERCASE_MESSAGE, tokenReadings, StringTools.uppercaseFirstChar(tokenReadings.getToken())); } }