Soundex - the Soundex Algorithm, as described by Knuth
/*
* Copyright (c) Ian F. Darwin, http://www.darwinsys.com/, 1996-2002.
* All rights reserved. Software written by Ian F. Darwin and others.
* $Id: LICENSE,v 1.8 2004/02/09 03:33:38 ian Exp $
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Java, the Duke mascot, and all variants of Sun's Java "steaming coffee
* cup" logo are trademarks of Sun Microsystems. Sun's, and James Gosling's,
* pioneering role in inventing and promulgating (and standardizing) the Java
* language and environment is gratefully acknowledged.
*
* The pioneering role of Dennis Ritchie and Bjarne Stroustrup, of AT&T, for
* inventing predecessor languages C and C++ is also gratefully acknowledged.
*/
/**
* Soundex - the Soundex Algorithm, as described by Knuth
* <p>
* This class implements the soundex algorithm as described by Donald
* Knuth in Volume 3 of <I>The Art of Computer Programming</I>. The
* algorithm is intended to hash words (in particular surnames) into
* a small space using a simple model which approximates the sound of
* the word when spoken by an English speaker. Each word is reduced
* to a four character string, the first character being an upper case
* letter and the remaining three being digits. Double letters are
* collapsed to a single digit.
*
* <h2>EXAMPLES</h2>
* Knuth's examples of various names and the soundex codes they map
* to are:
* <b>Euler, Ellery -> E460
* <b>Gauss, Ghosh -> G200
* <b>Hilbert, Heilbronn -> H416
* <b>Knuth, Kant -> K530
* <b>Lloyd, Ladd -> L300
* <b>Lukasiewicz, Lissajous -> L222
*
* <h2>LIMITATIONS</h2>
* As the soundex algorithm was originally used a <B>long</B> time ago
* in the United States of America, it uses only the English alphabet
* and pronunciation.
* <p>
* As it is mapping a large space (arbitrary length strings) onto a
* small space (single letter plus 3 digits) no inference can be made
* about the similarity of two strings which end up with the same
* soundex code. For example, both "Hilbert" and "Heilbronn" end up
* with a soundex code of "H416".
* <p>
* The soundex() method is static, as it maintains no per-instance
* state; this means you never need to instantiate this class.
*
* @author Perl implementation by Mike Stok (<stok@cybercom.net>) from
* the description given by Knuth. Ian Phillips (<ian@pipex.net>) and
* Rich Pinder (<rpinder@hsc.usc.edu>) supplied ideas and spotted
* mistakes.
* @author Ian Darwin, http://www.darwinsys.com/ (Java Version)
* @version $Id: Soundex.java,v 1.9 2004/02/23 00:30:49 ian Exp $
*/
public class Soundex {
/* Implements the mapping
* from: AEHIOUWYBFPVCGJKQSXZDTLMNR
* to: 00000000111122222222334556
*/
public static final char[] MAP = {
//A B C D E F G H I J K L M
'0','1','2','3','0','1','2','0','0','2','2','4','5',
//N O P W R S T U V W X Y Z
'5','0','1','2','6','2','3','0','1','0','2','0','2'
};
/** Convert the given String to its Soundex code.
* @return null If the given string can't be mapped to Soundex.
*/
public static String soundex(String s) {
// Algorithm works on uppercase (mainframe era).
String t = s.toUpperCase();
StringBuffer res = new StringBuffer();
char c, prev = '?';
// Main loop: find up to 4 chars that map.
for (int i=0; i<t.length() && res.length() < 4 &&
(c = t.charAt(i)) != ','; i++) {
// Check to see if the given character is alphabetic.
// Text is already converted to uppercase. Algorithm
// only handles ASCII letters, do NOT use Character.isLetter()!
// Also, skip double letters.
if (c>='A' && c<='Z' && c != prev) {
prev = c;
// First char is installed unchanged, for sorting.
if (i==0)
res.append(c);
else {
char m = MAP[c-'A'];
if (m != '0')
res.append(m);
}
}
}
if (res.length() == 0)
return null;
for (int i=res.length(); i<4; i++)
res.append('0');
return res.toString();
}
/** main */
public static void main(String[] args) {
String[] names = {
"Darwin, Ian",
"Davidson, Greg",
"Darwent, William",
"Derwin, Daemon"
};
for (int i = 0; i< names.length; i++)
System.out.println(Soundex.soundex(names[i]) + ' ' + names[i]);
}
}
Related examples in the same category