Java tutorial
package com.avricot.prediction.utils; import org.springframework.stereotype.Service; /** * Copyright 2001-2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * A stemmer for French words. The algorithm is based on the work of * Dr Martin Porter on his snowball project<br> * refer to http://snowball.sourceforge.net/french/stemmer.html<br> * (French stemming algorithm) for details * * @author Patrick Talbot */ @Service public class Steemer { /** * Buffer for the terms while stemming them. */ private StringBuffer sb = new StringBuffer(); /** * A temporary buffer, used to reconstruct R2 */ private StringBuffer tb = new StringBuffer(); /** * Region R0 is equal to the whole buffer */ private String R0; /** * Region RV * "If the word begins with two vowels, RV is the region after the third letter, * otherwise the region after the first vowel not at the beginning of the word, * or the end of the word if these positions cannot be found." */ private String RV; /** * Region R1 * "R1 is the region after the first non-vowel following a vowel * or is the null region at the end of the word if there is no such non-vowel" */ private String R1; /** * Region R2 * "R2 is the region after the first non-vowel in R1 following a vowel * or is the null region at the end of the word if there is no such non-vowel" */ private String R2; /** * Set to true if we need to perform step 2 */ private boolean suite; /** * Set to true if the buffer was modified */ private boolean modified; /** * Stemms the given term to a unique <tt>discriminator</tt>. * * @param term java.langString The term that should be stemmed * @return java.lang.String Discriminator for <tt>term</tt> */ public String stem(String term) { if (!isStemmable(term)) { return term; } // Use lowercase for medium stemming. term = term.toLowerCase(); // Reset the StringBuffer. sb.delete(0, sb.length()); sb.insert(0, term); // reset the booleans modified = false; suite = false; sb = treatVowels(sb); setStrings(); step1(); if (!modified || suite) { if (RV != null) { suite = step2a(); if (!suite) step2b(); } } if (modified || suite) step3(); else step4(); step5(); step6(); return sb.toString(); } /** * Sets the search region Strings<br> * it needs to be done each time the buffer was modified */ private void setStrings() { // set the strings R0 = sb.toString(); RV = retrieveRV(sb); R1 = retrieveR(sb); if (R1 != null) { tb.delete(0, tb.length()); tb.insert(0, R1); R2 = retrieveR(tb); } else R2 = null; } /** * First step of the Porter Algorithmn<br> * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation */ private void step1() { String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" }; deleteFrom(R2, suffix); replaceFrom(R2, new String[] { "logies", "logie" }, "log"); replaceFrom(R2, new String[] { "usions", "utions", "usion", "ution" }, "u"); replaceFrom(R2, new String[] { "ences", "ence" }, "ent"); String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation" }; deleteButSuffixFromElseReplace(R2, search, "ic", true, R0, "iqU"); deleteButSuffixFromElseReplace(R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux"); deleteButSuffixFrom(R2, new String[] { "ements", "ement" }, "ativ", false); deleteButSuffixFrom(R2, new String[] { "ements", "ement" }, "iv", false); deleteButSuffixFrom(R2, new String[] { "ements", "ement" }, "abl", false); deleteButSuffixFrom(R2, new String[] { "ements", "ement" }, "iqU", false); deleteFromIfTestVowelBeforeIn(R1, new String[] { "issements", "issement" }, false, R0); deleteFrom(RV, new String[] { "ements", "ement" }); deleteButSuffixFromElseReplace(R2, new String[] { "its", "it" }, "abil", false, R0, "abl"); deleteButSuffixFromElseReplace(R2, new String[] { "its", "it" }, "ic", false, R0, "iqU"); deleteButSuffixFrom(R2, new String[] { "its", "it" }, "iv", true); String[] autre = { "ifs", "ives", "if", "ive" }; deleteButSuffixFromElseReplace(R2, autre, "icat", false, R0, "iqU"); deleteButSuffixFromElseReplace(R2, autre, "at", true, R2, "iqU"); replaceFrom(R0, new String[] { "eaux" }, "eau"); replaceFrom(R1, new String[] { "aux" }, "al"); deleteButSuffixFromElseReplace(R2, new String[] { "euses", "euse" }, "", true, R1, "eux"); deleteFrom(R2, new String[] { "eux" }); // if one of the next steps is performed, we will need to perform step2a boolean temp = false; temp = replaceFrom(RV, new String[] { "amment" }, "ant"); if (temp == true) suite = true; temp = replaceFrom(RV, new String[] { "emment" }, "ent"); if (temp == true) suite = true; temp = deleteFromIfTestVowelBeforeIn(RV, new String[] { "ments", "ment" }, true, RV); if (temp == true) suite = true; } /** * Second step (A) of the Porter Algorithmn<br> * Will be performed if nothing changed from the first step * or changed were done in the amment, emment, ments or ment suffixes<br> * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation * * @return boolean - true if something changed in the StringBuffer */ private boolean step2a() { String[] search = { "mes", "tes", "iraIent", "irait", "irais", "irai", "iras", "ira", "irent", "iriez", "irez", "irions", "irons", "iront", "issaIent", "issais", "issantes", "issante", "issants", "issant", "issait", "issais", "issions", "issons", "issiez", "issez", "issent", "isses", "isse", "ir", "is", "t", "it", "ies", "ie", "i" }; return deleteFromIfTestVowelBeforeIn(RV, search, false, RV); } /** * Second step (B) of the Porter Algorithmn<br> * Will be performed if step 2 A was performed unsuccessfully<br> * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation */ private void step2b() { String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", "erons", "eront", "erez", "rent", "era", "es", "iez", "e", "s", "er", "ez", "" }; deleteFrom(RV, suffix); String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent", "antes", "aIent", "Aient", "ante", "mes", "tes", "ants", "ant", "ait", "at", "ais", "Ait", "At", "Ais", "t", "as", "ai", "Ai", "a" }; deleteButSuffixFrom(RV, search, "e", true); deleteFrom(R2, new String[] { "ions" }); } /** * Third step of the Porter Algorithmn<br> * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation */ private void step3() { if (sb.length()>0) { char ch = sb.charAt( sb.length()-1 ); if (ch == 'Y') { sb.setCharAt( sb.length()-1, 'i' ); setStrings(); } else if (ch == '') { sb.setCharAt( sb.length()-1, 'c' ); setStrings(); } } } /** * Fourth step of the Porter Algorithmn<br> * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation */ private void step4() { if (sb.length() > 1) { char ch = sb.charAt( sb.length()-1 ); if (ch == 's') { char b = sb.charAt( sb.length()-2 ); if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != '' && b != 's') { sb.delete( sb.length() - 1, sb.length()); setStrings(); } } } boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" ); if (!found) found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" ); replaceFrom( RV, new String[] { "Ire", "ire", "Ier", "ier" }, "i" ); deleteFrom( RV, new String[] { "e" } ); deleteFromIfPrecededIn( RV, new String[] { "" }, R0, "gu" ); } /** * Fifth step of the Porter Algorithmn<br> * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation */ private void step5() { if (R0 != null) { if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill")) { sb.delete(sb.length() - 1, sb.length()); setStrings(); } } } /** * Sixth (and last!) step of the Porter Algorithmn<br> * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation */ private void step6() { if (R0!=null && R0.length()>0) { boolean seenVowel = false; boolean seenConson = false; int pos = -1; for (int i = R0.length()-1; i > -1; i--) { char ch = R0.charAt(i); if (isVowel(ch)) { if (!seenVowel) { if (ch == '' || ch == '') { pos = i; break; } } seenVowel = true; } else { if (seenVowel) break; else seenConson = true; } } if (pos > -1 && seenConson && !seenVowel) sb.setCharAt(pos, 'e'); } } /** * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string * * @param source java.lang.String - the primary source zone for search * @param search java.lang.String[] - the strings to search for suppression * @param from java.lang.String - the secondary source zone for search * @param prefix java.lang.String - the prefix to add to the search string to test * @return boolean - true if modified */ private boolean deleteFromIfPrecededIn(String source, String[] search, String from, String prefix) { boolean found = false; if (source != null) { for (int i = 0; i < search.length; i++) { if (source.endsWith(search[i])) { if (from != null && from.endsWith(prefix + search[i])) { sb.delete(sb.length() - search[i].length(), sb.length()); found = true; setStrings(); break; } } } } return found; } /** * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel * * @param source java.lang.String - the primary source zone for search * @param search java.lang.String[] - the strings to search for suppression * @param vowel boolean - true if we need a vowel before the search string * @param from java.lang.String - the secondary source zone for search (where vowel could be) * @return boolean - true if modified */ private boolean deleteFromIfTestVowelBeforeIn(String source, String[] search, boolean vowel, String from) { boolean found = false; if (source != null && from != null) { for (int i = 0; i < search.length; i++) { if (source.endsWith(search[i])) { if ((search[i].length() + 1) <= from.length()) { boolean test = isVowel(sb.charAt(sb.length() - (search[i].length() + 1))); if (test == vowel) { sb.delete(sb.length() - search[i].length(), sb.length()); modified = true; found = true; setStrings(); break; } } } } } return found; } /** * Delete a suffix searched in zone "source" if preceded by the prefix * * @param source java.lang.String - the primary source zone for search * @param search java.lang.String[] - the strings to search for suppression * @param prefix java.lang.String - the prefix to add to the search string to test * @param without boolean - true if it will be deleted even without prefix found */ private void deleteButSuffixFrom(String source, String[] search, String prefix, boolean without) { if (source != null) { for (int i = 0; i < search.length; i++) { if (source.endsWith(prefix + search[i])) { sb.delete(sb.length() - (prefix.length() + search[i].length()), sb.length()); modified = true; setStrings(); break; } else if (without && source.endsWith(search[i])) { sb.delete(sb.length() - search[i].length(), sb.length()); modified = true; setStrings(); break; } } } } /** * Delete a suffix searched in zone "source" if preceded by prefix<br> * or replace it with the replace string if preceded by the prefix in the zone "from"<br> * or delete the suffix if specified * * @param source java.lang.String - the primary source zone for search * @param search java.lang.String[] - the strings to search for suppression * @param prefix java.lang.String - the prefix to add to the search string to test * @param without boolean - true if it will be deleted even without prefix found */ private void deleteButSuffixFromElseReplace(String source, String[] search, String prefix, boolean without, String from, String replace) { if (source != null) { for (int i = 0; i < search.length; i++) { if (source.endsWith(prefix + search[i])) { sb.delete(sb.length() - (prefix.length() + search[i].length()), sb.length()); modified = true; setStrings(); break; } else if (from != null && from.endsWith(prefix + search[i])) { sb.replace(sb.length() - (prefix.length() + search[i].length()), sb.length(), replace); modified = true; setStrings(); break; } else if (without && source.endsWith(search[i])) { sb.delete(sb.length() - search[i].length(), sb.length()); modified = true; setStrings(); break; } } } } /** * Replace a search string with another within the source zone * * @param source java.lang.String - the source zone for search * @param search java.lang.String[] - the strings to search for replacement * @param replace java.lang.String - the replacement string */ private boolean replaceFrom(String source, String[] search, String replace) { boolean found = false; if (source != null) { for (int i = 0; i < search.length; i++) { if (source.endsWith(search[i])) { sb.replace(sb.length() - search[i].length(), sb.length(), replace); modified = true; found = true; setStrings(); break; } } } return found; } /** * Delete a search string within the source zone * * @param source the source zone for search * @param suffix the strings to search for suppression */ private void deleteFrom(String source, String[] suffix) { if (source != null) { for (int i = 0; i < suffix.length; i++) { if (source.endsWith(suffix[i])) { sb.delete(sb.length() - suffix[i].length(), sb.length()); modified = true; setStrings(); break; } } } } /** * Test if a char is a french vowel, including accentuated ones * * @param ch the char to test * @return boolean - true if the char is a vowel */ private boolean isVowel(char ch) { switch (ch) { case 'a': case 'e': case 'i': case 'o': case 'u': case 'y': case '': case '': case '': case '': case '': case '': case '': case '': case '': case '': case '': case '': return true; default: return false; } } /** * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br> * "R is the region after the first non-vowel following a vowel * or is the null region at the end of the word if there is no such non-vowel"<br> * @param buffer java.lang.StringBuffer - the in buffer * @return java.lang.String - the resulting string */ private String retrieveR(StringBuffer buffer) { int len = buffer.length(); int pos = -1; for (int c = 0; c < len; c++) { if (isVowel(buffer.charAt(c))) { pos = c; break; } } if (pos > -1) { int consonne = -1; for (int c = pos; c < len; c++) { if (!isVowel(buffer.charAt(c))) { consonne = c; break; } } if (consonne > -1 && (consonne + 1) < len) return buffer.substring(consonne + 1, len); else return null; } else return null; } /** * Retrieve the "RV zone" from a buffer an return the corresponding string<br> * "If the word begins with two vowels, RV is the region after the third letter, * otherwise the region after the first vowel not at the beginning of the word, * or the end of the word if these positions cannot be found."<br> * @param buffer java.lang.StringBuffer - the in buffer * @return java.lang.String - the resulting string */ private String retrieveRV(StringBuffer buffer) { int len = buffer.length(); if (buffer.length() > 3) { if (isVowel(buffer.charAt(0)) && isVowel(buffer.charAt(1))) { return buffer.substring(3, len); } else { int pos = 0; for (int c = 1; c < len; c++) { if (isVowel(buffer.charAt(c))) { pos = c; break; } } if (pos + 1 < len) return buffer.substring(pos + 1, len); else return null; } } else return null; } /** * Turns u and i preceded AND followed by a vowel to UpperCase<br> * Turns y preceded OR followed by a vowel to UpperCase<br> * Turns u preceded by q to UpperCase<br> * * @param buffer java.util.StringBuffer - the buffer to treat * @return java.util.StringBuffer - the treated buffer */ private StringBuffer treatVowels(StringBuffer buffer) { for (int c = 0; c < buffer.length(); c++) { char ch = buffer.charAt(c); if (c == 0) // first char { if (buffer.length() > 1) { if (ch == 'y' && isVowel(buffer.charAt(c + 1))) buffer.setCharAt(c, 'Y'); } } else if (c == buffer.length() - 1) // last char { if (ch == 'u' && buffer.charAt(c - 1) == 'q') buffer.setCharAt(c, 'U'); if (ch == 'y' && isVowel(buffer.charAt(c - 1))) buffer.setCharAt(c, 'Y'); } else // other cases { if (ch == 'u') { if (buffer.charAt(c - 1) == 'q') buffer.setCharAt(c, 'U'); else if (isVowel(buffer.charAt(c - 1)) && isVowel(buffer.charAt(c + 1))) buffer.setCharAt(c, 'U'); } if (ch == 'i') { if (isVowel(buffer.charAt(c - 1)) && isVowel(buffer.charAt(c + 1))) buffer.setCharAt(c, 'I'); } if (ch == 'y') { if (isVowel(buffer.charAt(c - 1)) || isVowel(buffer.charAt(c + 1))) buffer.setCharAt(c, 'Y'); } } } return buffer; } /** * Checks a term if it can be processed correctly. * * @return boolean - true if, and only if, the given term consists in letters. */ private boolean isStemmable(String term) { boolean upper = false; int first = -1; for (int c = 0; c < term.length(); c++) { // Discard terms that contain non-letter characters. if (!Character.isLetter(term.charAt(c))) { return false; } // Discard terms that contain multiple uppercase letters. if (Character.isUpperCase(term.charAt(c))) { if (upper) { return false; } // First encountered uppercase letter, set flag and save // position. else { first = c; upper = true; } } } // Discard the term if it contains a single uppercase letter that // is not starting the term. if (first > 0) { return false; } return true; } }