Java tutorial
/******************************************************************************* * This file is part of the Coporate Semantic Web Project. * * This work has been partially supported by the ``InnoProfile-Corporate Semantic Web" project funded by the German Federal * Ministry of Education and Research (BMBF) and the BMBF Innovation Initiative for the New German Laender - Entrepreneurial Regions. * * http://www.corporate-semantic-web.de/ * * Freie Universitaet Berlin * Copyright (c) 2007-2013 * * Institut fuer Informatik * Working Group Coporate Semantic Web * Koenigin-Luise-Strasse 24-26 * 14195 Berlin * * http://www.mi.fu-berlin.de/en/inf/groups/ag-csw/ * * This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. * You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA or see <http://www.gnu.org/licenses/> ******************************************************************************/ package org.apache.lucene.analysis.de; import org.apache.commons.lang.StringUtils; import de.csw.ontology.OntologyIndex; // This file is encoded in UTF-8 /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * A stemmer for German words. The algorithm is based on the report * "A Fast and Simple Stemming Algorithm for German Words" by Jörg Caumanns * (joerg.caumanns at isst.fhg.de). * * * @version $Id$ */ public class GermanStemmer implements Stemmer { /** * Buffer for the terms while stemming them. */ private StringBuffer sb = new StringBuffer(); /** * Amount of characters that are removed with <tt>substitute()</tt> while * stemming. */ private int substCount = 0; /** * Stems the given term term to an unique <tt>discriminator</tt>. * * @param term * The term that should be stemmed. * @return Discriminator for <tt>term</tt> */ public String stem(String term) { // TODO we should use a global splitter for terms being used in OntologyIndex and here String[] frags = StringUtils.split(term); for (int i = 0; i < frags.length; i++) { frags[i] = stemSingleTerm(frags[i]); } return StringUtils.join(frags, OntologyIndex.PREFIX_SEPARATOR); } /** * Stems the given term consisting of a single term to an unique * <tt>discriminator</tt>. * * @param term * The term that should be stemmed. * @return Discriminator for <tt>term</tt> */ protected String stemSingleTerm(String term) { if (term == null) return null; // Use lowercase for medium stemming. term = term.toLowerCase(); if (!isStemmable(term)) return term; // Reset the StringBuffer. sb.delete(0, sb.length()); sb.insert(0, term); // Stemming starts here... substitute(sb); strip(sb); optimize(sb); resubstitute(sb); removeParticleDenotion(sb); return sb.toString(); } /** * Checks if a term could be stemmed. * * @return true if, and only if, the given term consists in letters. */ private boolean isStemmable(String term) { for (int c = 0; c < term.length(); c++) { if (!Character.isLetter(term.charAt(c))) return false; } return true; } /** * suffix stripping (stemming) on the current term. The stripping is reduced * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and * "nd", * from which all regular suffixes are build of. The simplification causes * some overstemming, and way more irregular stems, but still provides * unique. discriminators in the most of those cases. The algorithm is * context free, except of the length restrictions. */ private void strip(StringBuffer buffer) { boolean doMore = true; while (doMore && buffer.length() > 3) { if ((buffer.length() + substCount > 5) && buffer.substring(buffer.length() - 2, buffer.length()).equals("nd")) { buffer.delete(buffer.length() - 2, buffer.length()); } else if ((buffer.length() + substCount > 4) && buffer.substring(buffer.length() - 2, buffer.length()).equals("em")) { buffer.delete(buffer.length() - 2, buffer.length()); } else if ((buffer.length() + substCount > 4) && buffer.substring(buffer.length() - 2, buffer.length()).equals("er")) { buffer.delete(buffer.length() - 2, buffer.length()); } else if (buffer.charAt(buffer.length() - 1) == 'e') { buffer.deleteCharAt(buffer.length() - 1); } else if (buffer.charAt(buffer.length() - 1) == 's') { buffer.deleteCharAt(buffer.length() - 1); } else if (buffer.charAt(buffer.length() - 1) == 'n') { buffer.deleteCharAt(buffer.length() - 1); } // "t" occurs only as suffix of verbs. else if (buffer.charAt(buffer.length() - 1) == 't') { buffer.deleteCharAt(buffer.length() - 1); } else { doMore = false; } } } /** * Does some optimizations on the term. This optimisations are contextual. */ private void optimize(StringBuffer buffer) { // Additional step for female plurals of professions and inhabitants. if (buffer.length() > 5 && buffer.substring(buffer.length() - 5, buffer.length()).equals("erin*")) { buffer.deleteCharAt(buffer.length() - 1); strip(buffer); } // Additional step for irregular plural nouns like "Matrizen -> Matrix". if (buffer.charAt(buffer.length() - 1) == ('z')) { buffer.setCharAt(buffer.length() - 1, 'x'); } } /** * Removes a particle denotion ("ge") from a term. */ private void removeParticleDenotion(StringBuffer buffer) { if (buffer.length() > 4) { for (int c = 0; c < buffer.length() - 3; c++) { if (buffer.substring(c, c + 4).equals("gege")) { buffer.delete(c, c + 2); return; } } } } /** * Do some substitutions for the term to reduce overstemming: * * - Substitute Umlauts with their corresponding vowel: -> aou, "" is * substituted by "ss" - Substitute a second char of a pair of equal * characters with an asterisk: ?? -> ?* - Substitute some common character * combinations with a token: sch/ch/ei/ie/ig/st -> $//%/&/#/! */ private void substitute(StringBuffer buffer) { substCount = 0; for (int c = 0; c < buffer.length(); c++) { // Replace the second char of a pair of the equal characters with an // asterisk if (c > 0 && buffer.charAt(c) == buffer.charAt(c - 1)) { buffer.setCharAt(c, '*'); } // Substitute Umlauts. else if (buffer.charAt(c) == '') { buffer.setCharAt(c, 'a'); } else if (buffer.charAt(c) == '') { buffer.setCharAt(c, 'o'); } else if (buffer.charAt(c) == '') { buffer.setCharAt(c, 'u'); } // Fix bug so that '' at the end of a word is replaced. else if (buffer.charAt(c) == '') { buffer.setCharAt(c, 's'); buffer.insert(c + 1, 's'); substCount++; } // Take care that at least one character is left left side from the // current one if (c < buffer.length() - 1) { // Masking several common character combinations with an token if ((c < buffer.length() - 2) && buffer.charAt(c) == 's' && buffer.charAt(c + 1) == 'c' && buffer.charAt(c + 2) == 'h') { buffer.setCharAt(c, '$'); buffer.delete(c + 1, c + 3); substCount = +2; } else if (buffer.charAt(c) == 'c' && buffer.charAt(c + 1) == 'h') { buffer.setCharAt(c, ''); buffer.deleteCharAt(c + 1); substCount++; } else if (buffer.charAt(c) == 'e' && buffer.charAt(c + 1) == 'i') { buffer.setCharAt(c, '%'); buffer.deleteCharAt(c + 1); substCount++; } else if (buffer.charAt(c) == 'i' && buffer.charAt(c + 1) == 'e') { buffer.setCharAt(c, '&'); buffer.deleteCharAt(c + 1); substCount++; } else if (buffer.charAt(c) == 'i' && buffer.charAt(c + 1) == 'g') { buffer.setCharAt(c, '#'); buffer.deleteCharAt(c + 1); substCount++; } else if (buffer.charAt(c) == 's' && buffer.charAt(c + 1) == 't') { buffer.setCharAt(c, '!'); buffer.deleteCharAt(c + 1); substCount++; } } } } /** * Undoes the changes made by substitute(). That are character pairs and * character combinations. Umlauts will remain as their corresponding vowel, * as "" remains as "ss". */ private void resubstitute(StringBuffer buffer) { for (int c = 0; c < buffer.length(); c++) { if (buffer.charAt(c) == '*') { char x = buffer.charAt(c - 1); buffer.setCharAt(c, x); } else if (buffer.charAt(c) == '$') { buffer.setCharAt(c, 's'); buffer.insert(c + 1, new char[] { 'c', 'h' }, 0, 2); } else if (buffer.charAt(c) == '') { buffer.setCharAt(c, 'c'); buffer.insert(c + 1, 'h'); } else if (buffer.charAt(c) == '%') { buffer.setCharAt(c, 'e'); buffer.insert(c + 1, 'i'); } else if (buffer.charAt(c) == '&') { buffer.setCharAt(c, 'i'); buffer.insert(c + 1, 'e'); } else if (buffer.charAt(c) == '#') { buffer.setCharAt(c, 'i'); buffer.insert(c + 1, 'g'); } else if (buffer.charAt(c) == '!') { buffer.setCharAt(c, 's'); buffer.insert(c + 1, 't'); } } } }