Java tutorial
/*
        Cognition-DNC (Dynamic Name Concealer)
        Developed by Ismail Kartoglu (https://github.com/iemre)
        Binary to text document converter and database pseudonymiser.

        Copyright (C) 2015 Biomedical Research Centre for Mental Health

        This program is free software: you can redistribute it and/or modify
        it under the terms of the GNU General Public License as published by
        the Free Software Foundation, either version 3 of the License, or
        (at your option) any later version.

        This program is distributed in the hope that it will be useful,
        but WITHOUT ANY WARRANTY; without even the implied warranty of
        MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
        GNU General Public License for more details.

        You should have received a copy of the GNU General Public License
        along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package uk.ac.kcl.utils;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Static string helpers: approximate (edit-distance based) matching, word-window
 * matching, word splitting, HTML emptiness checks and regex extraction.
 */
public class StringTools {

    /**
     * A run of word characters, optionally followed by further whitespace-separated
     * word runs (an optional "(" and/or ")" may precede the whitespace).
     */
    private static final Pattern WORD_SEQUENCE = Pattern.compile("\\w+(\\(?\\)?\\s+\\w+)*");

    /**
     * Words that are kept by {@link #splitIntoWordsWithLengthHigherThan(String, int)}
     * regardless of their length: plain numbers ("12"), number ranges ("12-14"),
     * ordinals ("1st", "2nd", "3rd", "4th", any letter case) and the word "inn".
     */
    private static final Pattern ALWAYS_KEPT_WORD =
            Pattern.compile("(?i)[0-9]+(?:-[0-9]+|st|nd|rd|th)?|inn");

    /**
     * Computes the Levenshtein (edit) distance between two strings by delegating to
     * {@link StringUtils#getLevenshteinDistance(CharSequence, CharSequence)}.
     *
     * @param str1 First string.
     * @param str2 Second string.
     * @return The edit distance between {@code str1} and {@code str2}.
     */
    public static int getLevenshteinDistance(String str1, String str2) {
        return StringUtils.getLevenshteinDistance(str1, str2);
    }

    /**
     * Slides a window of {@code search.length()} characters over the lower-cased,
     * trimmed source, widens every window to word boundaries and collects the
     * widened segments whose edit distance to the lower-cased {@code search} is
     * within the allowed limit.
     *
     * <p>Special cases: a blank or single-character {@code search} yields an empty
     * set; a {@code search} of two or three characters (after trimming) is returned
     * as the only element without looking at the source at all.
     *
     * @param sourceString Source string to search for approximately matching segments.
     *                     A {@code null} or blank source yields no matches.
     * @param search String to search in {@code sourceString}.
     * @param maxDistance Maximum edit distance, expressed as a percentage of the
     *                    length of the trimmed {@code search}.
     * @return A set of lower-cased substrings of {@code sourceString}, each of which
     *         approximately matches {@code search}.
     */
    public static Set<String> getApproximatelyMatchingStringList(String sourceString, String search, int maxDistance) {
        Set<String> matches = new HashSet<>();
        if (StringUtils.isBlank(search)) {
            return matches;
        }

        search = search.trim();
        int searchLength = search.length();
        if (searchLength <= 1) {
            return matches;
        }
        if (searchLength <= 3) {
            matches.add(search);
            return matches;
        }
        if (StringUtils.isBlank(sourceString)) {
            // Nothing to scan; also protects against a null source.
            return matches;
        }

        // Derive the threshold from the trimmed term so surrounding whitespace
        // does not inflate the number of tolerated edits.
        int allowedDistance = getMaxAllowedLevenshteinDistanceFor(search, maxDistance);

        // Locale.ROOT keeps lower-casing independent of the JVM default locale.
        sourceString = sourceString.toLowerCase(Locale.ROOT).trim();
        search = search.toLowerCase(Locale.ROOT);

        int sourceLength = sourceString.length();
        for (int i = 0; i < sourceLength; i++) {
            int endIndex = Math.min(i + searchLength, sourceLength);
            String candidate = getCompletingString(sourceString, i, endIndex);
            if (matches.contains(candidate)) {
                continue;
            }
            if (getLevenshteinDistance(candidate, search) <= allowedDistance) {
                matches.add(candidate);
                // Jump past the window that just matched; together with the loop
                // increment the next window starts at endIndex + 1.
                i = endIndex;
            }
        }
        return matches;
    }

    /**
     * Converts a percentage into an absolute edit distance for the given word.
     *
     * @param word Word the distance limit is computed for.
     * @param levDistance Allowed distance as a percentage of the word length.
     * @return {@code word.length() * levDistance / 100} rounded to the nearest
     *         integer, or 0 if {@code word} is blank.
     */
    private static int getMaxAllowedLevenshteinDistanceFor(String word, int levDistance) {
        if (StringUtils.isBlank(word)) {
            return 0;
        }
        return Math.round((float) word.length() * levDistance / 100);
    }

    /**
     * Widens the range {@code [begin, end)} of {@code string} to word boundaries and
     * returns the first word sequence found inside the widened range.
     *
     * @param string Text the range refers to.
     * @param begin Inclusive start index of the range.
     * @param end Exclusive end index of the range.
     * @return The first match of the word-sequence pattern within the widened range,
     *         or the empty string if there is none.
     */
    private static String getCompletingString(String string, int begin, int end) {
        // Walk left while the character under the cursor is alphanumeric. The walk
        // stops at index 0 or on the first non-alphanumeric character.
        while (begin > 0 && Character.isLetterOrDigit(string.charAt(begin))) {
            begin -= 1;
        }
        // Unless the walk reached the very start, step over the character it stopped on.
        if (begin != 0) {
            begin += 1;
        }

        // Walk right up to and including the last character of the string, so that
        // a word ending exactly at the end of the text is not truncated.
        while (end < string.length() && Character.isLetterOrDigit(string.charAt(end))) {
            end += 1;
        }

        Matcher matcher = WORD_SEQUENCE.matcher(string.substring(begin, end));
        if (matcher.find()) {
            return matcher.group();
        }
        return StringUtils.EMPTY;
    }

    /**
     * Splits {@code text} and {@code search} on single spaces, builds one window of
     * consecutive text words per start position (each as long as the number of
     * search words, shorter near the end of the text), scores every window against
     * the search words and keeps the windows accepted by
     * {@code MatchingWindow.isScoreAboveThreshold(threshold)}.
     *
     * @param text The source text.
     * @param search The text to be searched in {@code text}.
     * @param threshold The threshold value between 0.0 - 1.0.
     * @return The accepted MatchingWindow objects in their natural sort order;
     *         an empty list if {@code text} or {@code search} is blank.
     */
    public static List<MatchingWindow> getMatchingWindowsAboveThreshold(String text, String search, double threshold) {
        if (StringUtils.isBlank(text) || StringUtils.isBlank(search)) {
            return new ArrayList<>();
        }

        String[] addressWords = search.split(" ");
        int bagSize = addressWords.length;
        String[] textWords = text.split(" ");

        List<MatchingWindow> windows = new ArrayList<>();
        // Character offset of textWords[i] inside text, maintained incrementally
        // instead of being re-summed from the first word for every window.
        int begin = 0;
        for (int i = 0; i < textWords.length; i++) {
            MatchingWindow window = takeBag(textWords, i, bagSize, begin);
            window.setScoreAccordingTo(addressWords);
            window.setMatchingText(text.substring(window.getBegin(), window.getEnd()));
            windows.add(window);
            begin += textWords[i].length() + 1;
        }

        Collections.sort(windows);
        return windows.stream()
                .filter(window -> window.isScoreAboveThreshold(threshold))
                .collect(Collectors.toList());
    }

    /**
     * Builds the window of up to {@code bagSize} consecutive words starting at
     * {@code startWordIndex}.
     *
     * @param textWords All words of the text, as produced by splitting on single spaces.
     * @param startWordIndex Index of the first word of the window.
     * @param bagSize Maximum number of words in the window.
     * @param begin Character offset of {@code textWords[startWordIndex]} in the text.
     * @return A window holding the words and the character range {@code [begin, end)}
     *         they occupy in the text.
     */
    private static MatchingWindow takeBag(String[] textWords, int startWordIndex, int bagSize, int begin) {
        MatchingWindow window = new MatchingWindow();
        int lastWordIndex = Math.min(startWordIndex + bagSize, textWords.length);

        // Each word contributes its length plus one separating space.
        int span = 0;
        for (int i = startWordIndex; i < lastWordIndex; i++) {
            span += textWords[i].length() + 1;
            window.addWord(textWords[i]);
        }
        // No separator follows the last word of the window.
        span -= 1;

        window.setBegin(begin);
        window.setEnd(begin + span);
        return window;
    }

    /**
     * Splits the given string on single spaces and returns the set of words that are
     * longer than {@code minLength}. Plain numbers, number ranges such as "12-14",
     * ordinals such as "1st"/"2nd"/"3rd"/"4th" and the word "inn" (any letter case)
     * are kept regardless of their length.
     *
     * @param string String to be split.
     * @param minLength Minimum allowed length of a word.
     * @return A set of words with length larger than {@code minLength}, plus the
     *         always-kept words described above; empty if {@code string} is blank.
     */
    public static Set<String> splitIntoWordsWithLengthHigherThan(String string, int minLength) {
        Set<String> strings = new HashSet<>();
        if (StringUtils.isBlank(string)) {
            return strings;
        }

        for (String word : string.split(" ")) {
            if (word.length() > minLength || ALWAYS_KEPT_WORD.matcher(word).matches()) {
                strings.add(word);
            }
        }
        return strings;
    }

    /**
     * Removes every occurrence of the given ignore words from {@code string}
     * (case-insensitively) and then splits the remainder as
     * {@link #splitIntoWordsWithLengthHigherThan(String, int)} does.
     *
     * <p>Each ignore word is used as a regular expression, so regex metacharacters
     * in it keep their special meaning. {@code null} entries are skipped.
     *
     * @param string String to be split.
     * @param minLength Minimum allowed length of a word.
     * @param ignoreWords Expressions whose matches are deleted before splitting.
     * @return The resulting set of words; empty if {@code string} is blank.
     */
    public static Set<String> splitIntoWordsWithLengthHigherThan(String string, int minLength, String... ignoreWords) {
        if (StringUtils.isBlank(string)) {
            return new HashSet<>();
        }
        if (ignoreWords != null) {
            for (String ignoreWord : ignoreWords) {
                if (ignoreWord == null) {
                    // Concatenating null would build the regex "(?i)null" and
                    // silently delete the literal text "null".
                    continue;
                }
                string = string.replaceAll("(?i)" + ignoreWord, "");
            }
        }
        return splitIntoWordsWithLengthHigherThan(string, minLength);
    }

    /**
     * Tells whether the given HTML has no visible text in its body.
     *
     * @param text HTML markup to inspect.
     * @return {@code true} if {@code text} is blank or its parsed body text is blank;
     *         {@code false} otherwise, including when parsing throws.
     */
    public static boolean noContentInHtml(String text) {
        if (StringUtils.isBlank(text)) {
            return true;
        }
        try {
            Document doc = Jsoup.parse(text);
            String bodyText = doc.body().text();
            return StringUtils.isBlank(bodyText);
        } catch (Exception ex) {
            // Best effort: a document that cannot be inspected is reported as
            // having content.
            return false;
        }
    }

    /**
     * Collects all matches of a regular expression that are long enough.
     *
     * @param text Text to search.
     * @param regex Regular expression to look for.
     * @param minLength Minimum length (inclusive) a match must have to be returned.
     * @return Returns a list of strings with minimum length of {@code minLength}
     * that match the given regular expression, in order of occurrence.
     */
    public static List<String> getRegexMatchesWithMinLength(String text, String regex, int minLength) {
        List<String> result = new ArrayList<>();
        Matcher matcher = Pattern.compile(regex).matcher(text);
        while (matcher.find()) {
            String match = matcher.group();
            if (match.length() >= minLength) {
                result.add(match);
            }
        }
        return result;
    }
}