// Java tutorial
/* * Copyright (c) 2010-2011, Martijn Brinkers, Djigzo. * * This file is part of Djigzo email encryption. * * Djigzo is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * version 3, 19 November 2007 as published by the Free Software * Foundation. * * Djigzo is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public * License along with Djigzo. If not, see <http://www.gnu.org/licenses/> * * Additional permission under GNU AGPL version 3 section 7 * * If you modify this Program, or any covered work, by linking or * combining it with aspectjrt.jar, aspectjweaver.jar, tyrex-1.0.3.jar, * freemarker.jar, dom4j.jar, mx4j-jmx.jar, mx4j-tools.jar, * spice-classman-1.0.jar, spice-loggerstore-0.5.jar, spice-salt-0.8.jar, * spice-xmlpolicy-1.0.jar, saaj-api-1.3.jar, saaj-impl-1.3.jar, * wsdl4j-1.6.1.jar (or modified versions of these libraries), * containing parts covered by the terms of Eclipse Public License, * tyrex license, freemarker license, dom4j license, mx4j license, * Spice Software License, Common Development and Distribution License * (CDDL), Common Public License (CPL) the licensors of this Program grant * you additional permission to convey the resulting work. 
*/ package mitm.common.dlp.impl; import java.io.IOException; import java.io.Reader; import java.io.Writer; import java.text.Normalizer; import org.apache.commons.lang.StringUtils; import mitm.common.dlp.TextNormalizer; import mitm.common.dlp.WordSkipper; import mitm.common.util.Check; import mitm.common.util.WordIterator; /** * TextNormalizer implementation that removes all excess whitespace, make all words lowercase, normalizes the * words to unicode normalized form and removes words that are should be skipped according to the WordSkipper. * * @author Martijn Brinkers * */ public class TextNormalizerImpl implements TextNormalizer { /* * Determines which words to skip */ private final WordSkipper wordSkipper; public TextNormalizerImpl(WordSkipper wordSkipper) { this.wordSkipper = wordSkipper; } @Override public void normalize(Reader input, Writer output) throws IOException { Check.notNull(input, "input"); Check.notNull(output, "output"); WordIterator wi = new WordIterator(input); String word; while ((word = wi.nextWord()) != null) { word = StringUtils.trimToNull(word); if (word != null) { /* * Unicode normalize the word to make sure the word only has one form */ word = Normalizer.normalize(word.toLowerCase(), Normalizer.Form.NFC); if (wordSkipper == null || !wordSkipper.isSkip(word)) { output.append(word).append(' '); } } } } }