// Java tutorial
/**
 * Copyright 2013 , UT-Battelle, LLC
 * All rights reserved
 *
 * JavaParserOfAddresses, Version 1.0
 * http://github.com/sorokine/JavaParserOfAddresses
 *
 * This program is freely distributed under UT-Batelle, LLC
 * open source license. Read the file LICENSE.txt for details.
 */
package GIST.IzbirkomExtractor;

import info.sorokine.utils.FischerKrause;

import java.util.ArrayList;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.lang3.text.WordUtils;

import GIST.IzbirkomExtractor.Russian.Ordinal;
import GIST.IzbirkomExtractor.Russian.OrdinalFactory;

/**
 * Produces spelling variations of a street name: the street type is separated
 * from the rest of the name, abbreviated name parts are expanded, the remaining
 * words are permuted, and the street type is re-attached in different positions.
 * For names containing ordinals, additional regex-style variations (anchored
 * with {@code ^} and {@code $}) are produced from {@link Ordinal#getSQLRegex()}.
 *
 * @author Alex Sorokine <sorokine@gmail.com>
 */
public class StreetNameNormalizer {

    /** Abbreviation list used to expand and detect the street type. */
    private final AbbrList streetTypeAbbrList = new StreetTypesAbbrList();

    /** Abbreviation list used to expand abbreviated parts of the street name itself. */
    private final AbbrList streetNamePartList = new StreetNamePartsAbbrList();

    /** Parser that recognizes ordinals among street name parts. */
    private final OrdinalFactory ordinalFactory = new OrdinalFactory();

    /**
     * Main logger
     */
    private static final Logger logger = Logger.getLogger("ALL");

    public StreetNameNormalizer() {
    }

    /**
     * Builds the list of variations for a street name.
     *
     * <p>For every expansion of the name (without street type) and every
     * permutation of its words, the following entries are appended in order:
     * <ol>
     * <li>{@code "<words> <type>"} and {@code "<type> <words>"};</li>
     * <li>if the expansion contains at least one ordinal and the plain words do
     * not contain {@code "("}: {@code "^<regex words> <type>$"} and
     * {@code "^<type> <regex words>$"};</li>
     * <li>additionally, if the first permuted word is an ordinal and there is
     * more than one word: the plain and the regex form with the street type
     * inserted right after the first word.</li>
     * </ol>
     *
     * <p>If no street type is found a warning is logged and an empty street type
     * is used.
     * NOTE(review): in that case the variations carry a stray leading/trailing
     * space (e.g. {@code "name "}); left unchanged because callers may depend on
     * the current output.
     *
     * @param streetName street name to be normalized
     * @return list of street name variations
     * @throws NullPointerException if {@code streetName} is {@code null}
     */
    public ArrayList<String> normalize(String streetName) {

        /* Array for street name variations. */
        ArrayList<String> variations = new ArrayList<>();

        /* expand abbreviated street type, extract street type, and remove it from the street name */
        Matcher m = streetTypeAbbrList.getExpansionsPattern()
                .matcher(streetTypeAbbrList.expandAbbreviations(streetName.toLowerCase()));

        String streetType;
        String streetNoType;
        if (m.find()) {
            streetType = m.group(1);
            streetNoType = m.replaceFirst(" ");
        } else {
            logger.warning("No street type in " + streetName);
            streetType = "";
            streetNoType = streetName;
        }

        /* collapse whitespace; this also fixes the case when the street type was in the middle */
        streetNoType = streetNoType.replaceAll("\\s+", " ").trim();

        /* capitalize the first letter of every word and lower-case the rest */
        streetNoType = WordUtils.capitalizeFully(streetNoType);

        /* iterate over all expansions of abbreviated street name parts */
        for (String streetNameVar : streetNamePartList.createAllExpansions(streetNoType)) {

            String[] streetNameParts = streetNameVar.split("\\s+");

            /* check whether each street name part is an ordinal (null entry = not an ordinal) */
            ArrayList<Ordinal> ordinals = new ArrayList<>(streetNameParts.length);
            int ordinalCount = 0;
            for (String part : streetNameParts) {
                Ordinal ordinal = ordinalFactory.parse(part);
                ordinals.add(ordinal);
                if (ordinal != null) {
                    ordinalCount++;
                }
            }

            /* permute all parts of the street name except for street type */
            for (FischerKrause fk = new FischerKrause(streetNameParts.length); fk.hasNext();) {
                int[] idx = fk.next();

                StringBuilder sb = new StringBuilder();
                StringBuilder regexBuilder = new StringBuilder(); /* for queries with regex */
                for (int i = 0; i < idx.length; i++) {
                    if (i > 0) {
                        /* separator only between words, never at the end of the string */
                        sb.append(' ');
                        regexBuilder.append(' ');
                    }
                    String part = streetNameParts[idx[i]];
                    Ordinal ordinal = ordinals.get(idx[i]);
                    sb.append(part);
                    regexBuilder.append(ordinal != null ? ordinal.getSQLRegex() : part);
                }

                /* permutation of the words without street types */
                String plain = sb.toString();
                variations.add(plain + ' ' + streetType);
                variations.add(streetType + ' ' + plain);

                /*
                 * permutation for regexped form of the street name with ordinals regexps;
                 * make sure that the street name itself does not contain regex-like symbols
                 * (typically resulting from parse errors)
                 */
                if (ordinalCount > 0 && !plain.contains("(")) {
                    variations.add("^" + regexBuilder.toString() + ' ' + streetType + '$');
                    variations.add("^" + streetType + ' ' + regexBuilder.toString() + '$');

                    /*
                     * if the street name starts with an ordinal add permutations with the
                     * street type after the 1st ordinal; the builders are modified in place,
                     * which is safe because they are not used again in this iteration
                     */
                    Ordinal firstOrdinal = ordinals.get(idx[0]);
                    if (firstOrdinal != null && streetNameParts.length > 1) {
                        variations.add(
                                sb.insert(streetNameParts[idx[0]].length() + 1, streetType + ' ').toString());
                        variations.add("^"
                                + regexBuilder.insert(firstOrdinal.getSQLRegex().length() + 1, streetType + ' ')
                                        .toString()
                                + '$');
                    }
                }
            }
        }

        return variations;
    }

    /**
     * This is for testing only, will be removed.
     * Normalizes every command-line argument (or a built-in sample list when no
     * arguments are given) and prints {@code input => variations} for each.
     *
     * @param args street names to normalize; may be empty
     */
    public static void main(String[] args) {

        String[] s = { "5-? ?? ?", " 26 ? ??", "1-? ?? ", "3- ", "2-? ?? ", "1-? ?? ?", "9 ? ?", "1 ? ? ", "3-? ?? ", "? ?? ", "1-? ?? ", "5 ? ? ", "2 ? ?", "? 3 ", "6-? ?? ?", "? 5 ", "2-? ", "4 ? ?", "? ?? ?", "3-? " };

        try {
            String[] ars = args.length > 0 ? args : s;
            for (String string : ars) {
                /* print the input being normalized, not the whole sample array */
                System.out.println(string + " => " + (new StreetNameNormalizer().normalize(string)).toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}