Java tutorial
/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.experiments.dip.wp1.documents.helpers.boilerplateremoval.impl; import org.apache.commons.io.IOUtils; import java.io.IOException; import java.io.InputStream; import java.net.URISyntaxException; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Set; /** * Some helper functions * * @author Omnia Zayed */ public class Utils { /** * Translates multiple whitespace into single space character. If there is * at least one new line character chunk is replaced by single LF (Unix new * line) character. * * @param text * @return */ public static String normalizeWhitespace(String text) { text = text.replaceAll("(\r\n|\r)", "\n"); //remove multiple white spaces but keep new lines text = text.replaceAll("(?:(?![\n])\\s+)", " "); // or [\\s+&&[^\n])] //replace extra <br> (sometimes the paragraph contains <br><br>, //the first one will be use as new paragraph marker but the second //one must be removed) text = text.replaceAll("<br>", ""); // or [\\s+&&[^\n])] return text; } /** * Returns `True` if string contains only white-space characters or is * empty. Otherwise `False` is returned. * * @param text * @return */ // TODO remove public static boolean is_blank(String text) { return (text.isEmpty() || text.matches("\\s+")); } /** * load the stop-words list of a given language * * @param locale * @return * @throws URISyntaxException * @throws IOException */ public static Set<String> loadStopWords(Locale locale) throws IOException { String streamName = "/stoplists/" + locale.getLanguage() + ".txt"; InputStream stream = Utils.class.getResourceAsStream(streamName); if (stream == null) { throw new IOException("Stream " + streamName + " not found"); } List<String> stopList = IOUtils.readLines(stream); HashSet<String> stopSet = new HashSet<String>(stopList); return stopSet; } private static final String WHITESPACE_CHARS = "" + "\\u0009" // CHARACTER TABULATION // + "\\u000A" // LINE FEED (LF) + "\\u000B" // LINE TABULATION // + "\\u000C" // FORM FEED (FF) // + "\\u000D" // CARRIAGE RETURN (CR) + "\\u0009" // horizontal tab + "\\u0010" // Data Link Escape + "\\u0020" // SPACE + "\\u0085" // NEXT LINE (NEL) + "\\u00A0" // NO-BREAK SPACE + "\\u1680" // OGHAM SPACE MARK + "\\u180E" // MONGOLIAN VOWEL SEPARATOR + "\\u2000" // EN QUAD + "\\u2001" // EM QUAD + "\\u2002" // EN SPACE + "\\u2003" // EM SPACE + "\\u2004" // THREE-PER-EM SPACE + "\\u2005" // FOUR-PER-EM SPACE + "\\u2006" // SIX-PER-EM SPACE + "\\u2007" // FIGURE SPACE + "\\u2008" // PUNCTUATION SPACE + "\\u2009" // THIN SPACE + "\\u200A" // HAIR SPACE + "\\u2028" // LINE SEPARATOR + "\\u2029" // PARAGRAPH SEPARATOR + "\\u202F" // NARROW NO-BREAK SPACE + "\\u205F" // MEDIUM MATHEMATICAL SPACE + "\\u3000"; private static final String WHITESPACE_CHAR_CLASS = "[" + WHITESPACE_CHARS + "]"; /** * Normalizes the given string - unifying whitespaces, quotations, and dashes * * @param text text * @return normalized text */ public static String normalize(String text) { String result = text.replaceAll("\\n+", "\n"); // first replace all control characters except newlines result = result.replaceAll("[\\p{Cntrl}&&[^\\r\\n]]", ""); // all weird whitespaces result = result.replaceAll(WHITESPACE_CHAR_CLASS + "+", " "); // trim the lines result = result.replaceAll("\\n" + WHITESPACE_CHAR_CLASS + "+", "\n"); result = result.replaceAll(WHITESPACE_CHAR_CLASS + "+\\n", "\n"); // dashes String dashChars = "" + "\\u2012" // figure dash + "\\u2013" // en dash + "\\u2014" // em dash + "\\u2015" // horizontal bar + "\\u2053" // swung dash ; result = result.replaceAll("[" + dashChars + "]+", "-"); // elipsis result = result.replaceAll("\\u2026", "..."); // quotation marks result = result.replaceAll("[?]", "\""); result = result.replaceAll("[`]", "'"); return result.trim(); } }