Title Name Parser
// Thanks to Robert Cooper for this!
//package com.totsp.bookworm.util;
import java.util.HashSet;
import java.util.Set;
/**
 * Splits a free-form personal name into title, first, middle and last names
 * and suffix.
 * <p>
 * Both natural order ("Dr. John Quincy Adams Jr.") and "Last, First" order
 * ("Adams, John Quincy") are understood. Recognition of titles, suffixes and
 * compound-name particles is driven by the lower-case lookup tables below.
 * </p>
 *
 * @author kebernet
 */
public class NameParser {
    private static final Set<String> TITLES = new HashSet<String>();
    private static final Set<String> SUFFIXES = new HashSet<String>();
    private static final Set<String> COMPOUND_NAMES = new HashSet<String>();

    /** Index of the title in the array returned by {@link #parseName(String)}. */
    public static final int TITLE = 0;
    /** Index of the first name in the array returned by {@link #parseName(String)}. */
    public static final int FIRST_NAME = 1;
    /** Index of the middle name(s) in the array returned by {@link #parseName(String)}. */
    public static final int MIDDLE_NAME = 2;
    /** Index of the last name in the array returned by {@link #parseName(String)}. */
    public static final int LAST_NAME = 3;
    /** Index of the suffix in the array returned by {@link #parseName(String)}. */
    public static final int SUFFIX = 4;

    static {
        // All entries are lower case; tokens are lower-cased before lookup.
        // The misspelled "capitain" and "seargent" are kept for backward
        // compatibility; the correct spellings are appended at the end.
        for (String title : new String[] { "dr.", "dr", "doctor", "mr.", "mr", "mister", "ms.", "ms", "miss", "mrs.",
                "mrs", "mistress", "hn.", "hn", "honorable", "the", "honorable", "his", "her", "honor", "fr", "fr.",
                "frau", "hr", "herr", "rv.", "rv", "rev.", "rev", "reverend", "reverend", "madam", "lord", "lady",
                "sir", "senior", "bishop", "rabbi", "holiness", "rebbe", "deacon", "eminence", "majesty", "consul",
                "vice", "president", "ambassador", "secretary", "undersecretary", "deputy", "inspector", "ins.",
                "detective", "det", "det.", "constable", "private", "pvt.", "pvt", "petty", "p.o.", "po", "first",
                "class", "p.f.c.", "pfc", "lcp.", "lcp", "corporal", "cpl.", "cpl", "colonel", "col", "col.",
                "capitain", "cpt.", "cpt", "ensign", "ens.", "ens", "lieutenant", "lt.", "lt", "ltc.", "ltc",
                "commander", "cmd.", "cmd", "cmdr", "rear", "radm", "r.adm.", "admiral", "adm.", "adm", "commodore",
                "cmd.", "cmd", "general", "gen", "gen.", "ltgen", "lt.gen.", "maj.gen.", "majgen.", "major", "maj.",
                "mjr", "maj", "seargent", "sgt.", "sgt", "chief", "cf.", "cf", "petty", "officer", "c.p.o.", "cpo",
                "master", "cmcpo", "fltmc", "formc", "mcpo", "mcpocg", "command", "fleet", "force",
                "captain", "sergeant" }) {
            TITLES.add(title);
        }
        for (String suffix : new String[] { "jr.", "jr", "junior", "ii", "iii", "iv", "senior", "sr.", "sr", //family
                "phd", "ph.d", "ph.d.", "m.d.", "md", "d.d.s.", "dds", // doctors
                "k.c.v.o", "kcvo", "o.o.c", "ooc", "o.o.a", "ooa", "g.b.e", "gbe", // knighthoods
                "k.b.e.", "kbe", "c.b.e.", "cbe", "o.b.e.", "obe", "m.b.e", "mbe", // cont
                "esq.", "esq", "esquire", "j.d.", "jd", // lawyers
                "m.f.a.", "mfa", //misc
                "r.n.", "rn", "l.p.n.", "lpn", "l.n.p.", "lnp", //nurses
                "c.p.a.", "cpa", //money men
                "d.d.", "dd", "d.div.", "ddiv", //preachers
                "ret", "ret." }) {
            SUFFIXES.add(suffix);
        }
        for (String comp : new String[] { "de", "la", "st", "st.", "ste", "ste.", "saint", "van", "der", "al", "bin",
                "le", "mac", "di", "del", "vel", "von", "e'", "san", "af", "el" }) {
            COMPOUND_NAMES.add(comp);
        }
    }

    /**
     * Parses a name into title, first, middle and last names and suffix.
     * <p>
     * Notes: "Al" is treated as a name, "al" as a name fragment. That is the
     * only exception for capitalization.
     * </p>
     * <p>
     * A comma marks "Last, First" order unless every word after the comma(s)
     * is a known suffix ("John Smith, Jr., PhD"), in which case the commas are
     * ignored and the name is read in natural order. A comma with nothing
     * after it is ignored as well. Words are separated by any run of
     * whitespace, and leading/trailing whitespace is discarded.
     * </p>
     *
     * @param name name to parse; may be {@code null}
     * @return String[5] indexed by {@link #TITLE}, {@link #FIRST_NAME},
     *         {@link #MIDDLE_NAME}, {@link #LAST_NAME} and {@link #SUFFIX};
     *         a component that was not found is {@code null}
     */
    public String[] parseName(String name) {
        // NOTE Add lookahead for Suffixes to support
        // "Winthrop Wolfcasts, the 31st Duke of Winchester"
        String[] result = new String[5];
        if (name == null) {
            return result;
        }
        String cleaned = name.trim();
        if (cleaned.length() == 0) {
            return result;
        }
        if (cleaned.indexOf(',') == -1) {
            parseNaturalOrder(cleaned, result);
            return result;
        }

        // split() drops trailing empty strings, so "Smith," yields one part
        // and "," yields none; both must be handled without indexing parts[1].
        String[] parts = cleaned.split(",");
        if (parts.length == 0) {
            return result;
        }
        String beforeComma = parts[0].trim();
        StringBuilder afterComma = new StringBuilder();
        for (int i = 1; i < parts.length; i++) {
            appendWord(afterComma, parts[i].trim());
        }
        if (afterComma.length() == 0) {
            // Nothing meaningful follows the comma: treat it as noise.
            if (beforeComma.length() > 0) {
                parseNaturalOrder(beforeComma, result);
            }
            return result;
        }

        String[] tokens = afterComma.toString().split("\\s+");
        boolean isLastCommaFirst = false;
        for (String token : tokens) {
            if (!SUFFIXES.contains(lower(token))) {
                isLastCommaFirst = true;
                break;
            }
        }
        if (!isLastCommaFirst) {
            // Only suffixes follow the comma(s): "John Smith, Jr., PhD".
            String natural = (beforeComma + " " + afterComma).trim();
            parseNaturalOrder(natural, result);
            return result;
        }

        store(result, LAST_NAME, beforeComma);
        if (tokens.length == 1) {
            // easy case: "Last, First"
            result[FIRST_NAME] = tokens[0];
            return result;
        }
        parseGivenNames(tokens, result);
        return result;
    }

    /**
     * Fills title, first name, middle name(s) and suffix from the words that
     * follow the comma in a "Last, First" style name.
     */
    private static void parseGivenNames(String[] tokens, String[] result) {
        int head = 0;
        int tail = tokens.length - 1;

        StringBuilder title = new StringBuilder();
        while (head <= tail && TITLES.contains(lower(tokens[head]))) {
            appendWord(title, tokens[head]);
            head++;
        }
        StringBuilder suffix = new StringBuilder();
        while (tail >= head && SUFFIXES.contains(lower(tokens[tail]))) {
            prependWord(suffix, tokens[tail]);
            tail--;
        }
        store(result, TITLE, title);
        store(result, SUFFIX, suffix);
        if (head > tail) {
            return;
        }

        // First name: leading particles plus the word they attach to. The
        // head < tail guard keeps a trailing particle from running off the end.
        StringBuilder first = new StringBuilder();
        while (head < tail && isParticle(tokens[head])) {
            appendWord(first, tokens[head]);
            head++;
        }
        appendWord(first, tokens[head]);
        head++;
        store(result, FIRST_NAME, first);

        // Everything that is left, including the last word, is the middle name.
        StringBuilder middle = new StringBuilder();
        while (head <= tail) {
            appendWord(middle, tokens[head]);
            head++;
        }
        store(result, MIDDLE_NAME, middle);
    }

    /**
     * Parses a comma-free, trimmed, non-empty name written in natural order
     * (title, first, middle, last, suffix) into {@code result}.
     */
    private static void parseNaturalOrder(String name, String[] result) {
        String[] tokens = name.split("\\s+");
        int head = 0;
        int tail = tokens.length - 1;

        // Titles and suffixes always leave at least one word to act as a name,
        // so a lone "Lord" or "Jr" is reported as a first name.
        StringBuilder title = new StringBuilder();
        while (head < tail && TITLES.contains(lower(tokens[head]))) {
            appendWord(title, tokens[head]);
            head++;
        }
        StringBuilder suffix = new StringBuilder();
        while (tail > head && SUFFIXES.contains(lower(tokens[tail]))) {
            prependWord(suffix, tokens[tail]);
            tail--;
        }
        store(result, TITLE, title);
        store(result, SUFFIX, suffix);

        if (head == tail) { // Only one name left
            store(result, FIRST_NAME, tokens[head]);
            return;
        }

        // Last name: final word plus any particles directly in front of it.
        StringBuilder last = new StringBuilder(tokens[tail]);
        tail--;
        while (tail >= head && isParticle(tokens[tail])) {
            prependWord(last, tokens[tail]);
            tail--;
        }

        // First name: leading particles plus the first ordinary word.
        StringBuilder first = new StringBuilder();
        while (head <= tail) {
            String token = tokens[head];
            head++;
            appendWord(first, token);
            if (!isParticle(token)) {
                break;
            }
        }

        // Whatever remains between first and last is the middle name.
        StringBuilder middle = new StringBuilder();
        while (head <= tail) {
            appendWord(middle, tokens[head]);
            head++;
        }

        store(result, FIRST_NAME, first);
        store(result, LAST_NAME, last);
        store(result, MIDDLE_NAME, middle);
    }

    /** Lower-cases a token independently of the default locale for table lookups. */
    private static String lower(String token) {
        return token.toLowerCase(java.util.Locale.ROOT);
    }

    /** Tells whether the token is a compound-name particle; the exact spelling "Al" is a name. */
    private static boolean isParticle(String token) {
        return !token.equals("Al") && COMPOUND_NAMES.contains(lower(token));
    }

    /** Appends a non-empty word to the buffer, space separated. */
    private static void appendWord(StringBuilder buffer, String word) {
        if (word.length() == 0) {
            return;
        }
        if (buffer.length() > 0) {
            buffer.append(' ');
        }
        buffer.append(word);
    }

    /** Inserts a word at the front of the buffer, space separated. */
    private static void prependWord(StringBuilder buffer, String word) {
        if (buffer.length() > 0) {
            buffer.insert(0, ' ');
        }
        buffer.insert(0, word);
    }

    /** Stores the value at the given result index unless it is empty. */
    private static void store(String[] result, int index, CharSequence value) {
        if (value.length() > 0) {
            result[index] = value.toString();
        }
    }
}
Related examples in the same category
1. | Split with | | |
2. | Split first with | | |
3. | split By Space and save result to a List | | |
4. | Space trim | | |
5. | truncate by length | | |
6. | Remove all blanks | | |
7. | Is a string a Number | | |
8. | Random string | | |
9. | Tokenizer. Why? Because StringTokenizer is not available in J2ME. | | |
10. | String resource | | |
11. | Shows creating text with links from HTML in the Java code, rather than from a string resource. Note that for a | | |
12. | Join a collection of strings by a separator | | |
13. | Tests if a string is blank: null, empty, or only whitespace (" ", \r\n, \t, etc) | | |
14. | Tests if a string is numeric, i.e. contains only digit characters | | |
15. | Writer implementation that outputs to a StringBuilder | | |
16. | Gets the device's phone number as a String. | | |
17. | Inspects a link Configuration through reflection API to generate a human readable String with values replaced with their constants names. | | |
18. | Returns a String representation of the content of a android.view.Display object. | | |
19. | Get String Element Value | | |
20. | Join strings | | |
21. | Find two consecutive newlines in a string. | | |
22. | Retrieve a boolean primitive type from a String. | | |
23. | Trim char from string | | |
24. | Returns true if the string does not fit in standard ASCII | | |
25. | Returns true if the given string is null or empty. | | |
26. | 4 octets in address string | | |
27. | Add space to CSV string | | |
28. | String fast Split | | |
29. | Split a String by a Character, i.e. Split lines by using '\n' | | |
30. | String Capitalizer | | |
31. | Count char in a string | | |
32. | Search char in a string from a starting position | | |
33. | load String From Raw Resource | | |
34. | Join Collection of String | | |
35. | Padding a string, truncate a string | | |
36. | Converts a string to title casing. | | |
37. | reversing String | | |
38. | load Resource To String | | |
39. | convert Duration to String | | |
40. | Convert string from one encoding to another | | |
41. | Object to String and String to Object | | |
42. | IP to String | | |
43. | Convert string to number and convert number to string | | |
44. | line string reader in J2ME | | |
45. | String to Map with token | | |
46. | Generate the client id, which is a fixed string of length 8 concatenated with 12 random bytes | | |
47. | StringBuilder Writer | | |
48. | Return a specific raw resource contents as a String value. | | |
49. | Returns the ISO 8601-format String corresponding to the given duration (measured in milliseconds). | | |
50. | Returns a string representation of the given number of nanoseconds. | | |
51. | Simple Tokenizer | | |
52. | split By Space | | |
53. | Pad Front | | |
54. | Count Occurrences | | |
55. | Padding Left | | |
56. | capitalize Words | | |
57. | Tokenizer Utils | | |
58. | Returns space padding | | |
59. | Normalise Whitespace | | |
60. | Removes unwanted blank characters | | |
61. | Removes unwanted backslashes characters | | |
62. | equals Ignore Case | | |
63. | A method to decode/encode quoted printable encoded data | | |
64. | Split Camel Case | | |
65. | Split and combine by token | | |
66. | Shorten text for display in lists etc. | | |