Java tutorial
package com.geocode.service.impl;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import javax.annotation.PostConstruct;

import opennlp.tools.cmdline.PerformanceMonitor;
import opennlp.tools.cmdline.postag.POSModelLoader;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

import org.springframework.stereotype.Service;
import org.springframework.util.StringUtils;

import com.geocode.service.AddressService;
import com.geocode.utils.CounterHelper;

/**
 * Extracts street-address-like phrases from free text.
 *
 * <p>Two sources of candidates are combined: the text found between parentheses, and runs of
 * tokens whose part-of-speech tags (as produced by the OpenNLP maxent POS tagger) look like an
 * address, i.e. a number followed by proper nouns / nouns / adjectives / prepositions. All
 * candidates are finally passed through {@link #filterLocations(List)}.
 *
 * <p>NOTE(review): the tagger is held in a static field and shared by all callers. OpenNLP
 * documents {@code POSTaggerME} as not thread-safe; confirm that this service is not invoked
 * concurrently, or create one tagger per call from the shared model.
 */
@Service("addressService")
public class AddressServiceImpl implements AddressService {

    /*
     * Penn Treebank POS tags, for reference:
     *
     * CC   Coordinating conjunction e.g. and, but, or...
     * CD   Cardinal number
     * DT   Determiner
     * EX   Existential there
     * FW   Foreign word
     * IN   Preposition or subordinating conjunction
     * JJ   Adjective
     * JJR  Adjective, comparative
     * JJS  Adjective, superlative
     * LS   List item marker
     * MD   Modal e.g. can, could, might, may...
     * NN   Noun, singular or mass
     * NNP  Proper noun, singular
     * NNPS Proper noun, plural
     * NNS  Noun, plural
     * PDT  Predeterminer e.g. all, both ... when they precede an article
     * POS  Possessive ending e.g. nouns ending in 's
     * PRP  Personal pronoun e.g. I, me, you, he...
     * PRP$ Possessive pronoun e.g. my, your, mine, yours...
     * RB   Adverb
     *      Most words that end in -ly as well as degree words like quite, too and very
     * RBR  Adverb, comparative
     *      Adverbs with the comparative ending -er, with a strictly comparative meaning.
     * RBS  Adverb, superlative
     * RP   Particle
     * SYM  Symbol
     *      Should be used for mathematical, scientific or technical symbols
     * TO   to
     * UH   Interjection e.g. uh, well, yes, my...
     * VB   Verb, base form
     *      subsumes imperatives, infinitives and subjunctives
     * VBD  Verb, past tense
     *      includes the conditional form of the verb to be
     * VBG  Verb, gerund or present participle
     * VBN  Verb, past participle
     * VBP  Verb, non-3rd person singular present
     * VBZ  Verb, 3rd person singular present
     * WDT  Wh-determiner e.g. which, and that when it is used as a relative pronoun
     * WP   Wh-pronoun e.g. what, who, whom...
     * WP$  Possessive wh-pronoun
     * WRB  Wh-adverb e.g. how, where, why
     */

    /** Classpath directory that holds the OpenNLP model files. */
    private static final String BASE_PATH = "/models/";

    /** File name of the POS model loaded by {@link #init()}. */
    private static final String POS_MODEL_FILE = "en-pos-maxent.bin";

    /** Upper-cased words (months, weekdays, time markers, ...) that must never be part of an address. */
    private static final List<String> EXCLUDED_WORDS = new ArrayList<String>(Arrays.asList(
            "JANUARY", "FEBRUARY", "MARCH", "APRIL", "MAY", "JUNE", "JULY", "AUGUST", "SEPTEMBER",
            "OCTOBER", "NOVEMBER", "DECEMBER",
            "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
            "JAN.", "FEB.", "MAR.", "APR.", "MAY.", "JUN.", "JUL.", "AUG.", "SEP.", "OCT.", "NOV.", "DEC.",
            "SUNDAY", "MONDAY", "TUESDAY", "WEDNESDAY", "THURSDAY", "FRIDAY", "SATURDAY",
            "P.M.", "A.M.", "A.M", "AM", "PM", "KM-H.", "KM-H", "CALL"));

    /** Upper-cased suffixes that mark a token as a time expression such as "8pm" or "6:30pm,". */
    private static final List<String> EXCLUDED_SUFFIXES = new ArrayList<String>(
            Arrays.asList("AM", "PM", "AM,", "PM,"));

    /** Tags that may directly follow the leading number of an address. */
    private static final List<String> BASE_TAGS = new ArrayList<String>(
            Arrays.asList("NNP", "NNPS", "JJ", "NN", "IN"));

    /**
     * Tags that may continue an address that has already been started: a comma, a number, or
     * any of {@link #BASE_TAGS}. Declared as a constant so that repeated calls to
     * {@link #init()} cannot grow it.
     */
    private static final List<String> ADDRESS_TAGS = new ArrayList<String>(
            Arrays.asList(",", "CD", "NNP", "NNPS", "JJ", "NN", "IN"));

    private static final String BRACE_START = "(";
    private static final String BRACE_END = ")";

    private static POSTaggerME tagger = null;
    private static POSModel model = null;

    /**
     * Loads the POS model from the classpath, creates the shared tagger and reads the counter
     * via {@code CounterHelper.readCounter()}.
     *
     * @throws URISyntaxException if the model resource URL cannot be converted to a URI
     * @throws IllegalStateException if the model file is not present on the classpath
     */
    @PostConstruct
    public void init() throws URISyntaxException {
        String modelPath = BASE_PATH + POS_MODEL_FILE;
        java.net.URL modelUrl = this.getClass().getResource(modelPath);
        if (modelUrl == null) {
            // Fail with a message that names the missing resource instead of a bare NPE.
            throw new IllegalStateException("POS model not found on classpath: " + modelPath);
        }
        model = new POSModelLoader().load(new File(modelUrl.toURI()));
        tagger = new POSTaggerME(model);
        CounterHelper.readCounter();
    }

    /**
     * Small manual demo: extracts and prints the addresses found in a sample event listing.
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        String input = "Wednesday, June 4, 8pm, Gladstone Hotel (1214 Queen Street West). "
                + "DOC NOW Festival 2014 Screenings 11 films in four programs will screen for free as party as Ryerson's DOC NOW Fest. "
                + "Check out more about these screenings here. June 4-5, 6:30pm, Bloor Hot Docs Cinema (504 Bloor Street West). "
                + "Spring Fling The Piston is giving the last weeks of spring a rush of fun with The Order of Good Cheer (who are releasing an album), "
                + "Dine Alone Foods, Collective Arts Brewing, and Gooch's World Famous Smoked Meat. "
                + "For PWYC you can show up to win prizes, see bands, dance to DJs, and see surprise guests. "
                + "June 4, 11, 18, 25, 9pm, The Piston (937 Bloor St West). "
                + "Pivot Finale Pivot is closing up shop for their 2013-14 season. "
                + "Angela Hibbs, Aisha Sasha John, Jim Johnstone, and Suzannah Showler will read. "
                + "It's a good cheap date for literary types, but donate whatever you can to these poor authors. "
                + "Wednesday, June 4, 8pm, The Press Club (850 Dundas Street West). "
                + "Cardboard Beach If you live or work downtown and fantasize about running away (but not too far away) to catch some rays, art has your back. "
                + "Luminato will install a fully licensed temporary beach at their hub at David Pecaut Square from June 6-15, where the world's largest disco ball hung last year. "
                + "The catch? Unlike Toronto's admittedly decent existing beaches, this 'oasis' will be entirely made of cardboard. "
                + "Parties will take place at the beach (with food!) so check out Luminato's website for more. "
                + "June 6-15, Luminato Festival Hub at David Pecaut Square (55 John Street) "
                + "IsKw Live in the Stacks All ages event Live in the Stacks is back bringing the music IsKw, a Cree/Dene/Irish alternative r&b/trip hop artist, to the quiet old library. "
                + "Friday, June 6, 8pm, Spadina Road Library (10 Spadina Road). "
                + "SlowPitch Live at The Film Buff East We've featured sci-fi loving turntablist SlowPitch in our breakout band series, "
                + "and here's a chance to see him work those decks live and for free while browsing films, and maybe splurging on an ice cream if it's hot. "
                + "Friday, June 6, 7pm, Film Buff (1380 Queen Street East).";

        AddressServiceImpl addressService = new AddressServiceImpl();
        addressService.init();
        List<String> addresses = addressService.extractAddress(input);
        for (String address : addresses) {
            System.out.println(address);
        }
    }

    /**
     * Collects every parenthesised fragment of {@code input} into {@code locations} and returns
     * the input with all parentheses replaced by spaces.
     *
     * @param input     text to clean; may be {@code null}
     * @param locations receives the text found between each "(" and the next ")"
     * @return the input without parentheses, or {@code null} if {@code input} was {@code null}
     */
    private String cleanInputString(String input, List<String> locations) {
        if (input == null) {
            return null;
        }
        int open = input.indexOf(BRACE_START);
        while (open >= 0) {
            int close = input.indexOf(BRACE_END, open + 1);
            if (close < 0) {
                // No closing brace remains, so no later "(" can be matched either.
                break;
            }
            locations.add(input.substring(open + 1, close));
            open = input.indexOf(BRACE_START, open + 1);
        }
        return input.replace(BRACE_END, " ").replace(BRACE_START, " ");
    }

    /**
     * Extracts address-like phrases from {@code input}.
     *
     * <p>Parenthesised fragments are taken as candidates first; the text (with parentheses
     * removed) is then tagged line by line and every line is scanned for address-shaped token
     * runs. The combined candidates are filtered by {@link #filterLocations(List)}.
     *
     * @param input free text; {@code null} or empty yields an empty list
     * @return the candidates that pass {@link #filterLocations(List)}, never {@code null}
     * @throws IOException if reading the text line by line fails
     * @throws IllegalStateException if {@link #init()} has not been called yet
     */
    @Override
    public List<String> extractAddress(String input) throws IOException {
        List<String> locations = new ArrayList<String>();
        if (input == null || input.isEmpty()) {
            return locations;
        }
        if (tagger == null) {
            throw new IllegalStateException("POS tagger is not initialised; call init() first");
        }

        // Clean BEFORE building the line stream so that the tagger sees the text without
        // parentheses glued to the tokens.
        String cleaned = cleanInputString(input, locations);

        PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
        ObjectStream<String> lineStream = new PlainTextByLineStream(new StringReader(cleaned));
        perfMon.start();
        try {
            String line;
            while ((line = lineStream.read()) != null) {
                String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(line);
                if (tokens.length > 0) {
                    // Every line is analysed, not only the last one read.
                    collectAddresses(tokens, tagger.tag(tokens), locations);
                }
                perfMon.incrementCounter();
            }
        } finally {
            perfMon.stopAndPrintFinalResult();
            lineStream.close();
        }
        return filterLocations(locations);
    }

    /**
     * Scans one tagged line and appends every address-shaped token run to {@code locations}.
     *
     * @param tokens    the whitespace-separated tokens of the line
     * @param tags      the POS tag of each token, parallel to {@code tokens}
     * @param locations receives the candidate addresses
     */
    private void collectAddresses(String[] tokens, String[] tags, List<String> locations) {
        String current = null;
        for (int i = 0; i < tags.length; i++) {
            String tag = tags[i];
            if ("CD".equals(tag) && current == null) {
                // A number only starts an address when the next token can belong to one.
                if (i + 1 < tags.length && BASE_TAGS.contains(tags[i + 1])) {
                    current = addWordToLocation(current, tokens[i]);
                }
            } else if ("NN".equals(tag) && i > 0 && "NNP".equals(tags[i - 1])) {
                // A common noun right after a proper noun denotes completion of one address.
                if (current != null) {
                    locations.add(current);
                }
                current = null;
            } else if (ADDRESS_TAGS.contains(tag) && !checkExcludedWords(tokens[i])) {
                current = addWordToLocation(current, tokens[i]);
            } else if (current != null) {
                // Any other token terminates the address collected so far.
                locations.add(current);
                current = null;
            }
        }
        if (current != null) {
            // Do not lose an address that runs up to the end of the line.
            locations.add(current);
        }
    }

    /**
     * Tells whether a token is one of the excluded words (month, weekday, time marker, ...).
     * The comparison ignores case, surrounding whitespace and commas.
     *
     * @param string the token to check; {@code null} is treated as not excluded
     * @return {@code true} if the normalised token is in the excluded-word list
     */
    public boolean checkExcludedWords(String string) {
        if (string == null) {
            return false;
        }
        // Locale.ROOT keeps the upper-casing independent of the JVM default locale.
        String normalised = string.trim().toUpperCase(Locale.ROOT).replace(",", "");
        return EXCLUDED_WORDS.contains(normalised);
    }

    /**
     * Tells whether a token ends with one of the excluded suffixes ("AM", "PM", optionally
     * followed by a comma), ignoring case and surrounding whitespace.
     *
     * <p>NOTE(review): this is a plain suffix test, so names such as "Birmingham" are rejected
     * as well; confirm whether the check should require a preceding digit.
     *
     * @param string the token to check; {@code null} is treated as not excluded
     * @return {@code true} if the token ends with an excluded suffix
     */
    public boolean checkExcludedEndsWithWords(String string) {
        if (string == null) {
            return false;
        }
        String normalised = string.trim().toUpperCase(Locale.ROOT);
        for (String suffix : EXCLUDED_SUFFIXES) {
            if (normalised.endsWith(suffix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends {@code sent} to the address collected so far.
     *
     * @param nnp  the address collected so far, or {@code null} if none has been started
     * @param sent the token to append
     * @return the extended address; if the token has an excluded suffix the address is returned
     *         unchanged, except that a single-word address is discarded ({@code null})
     */
    private String addWordToLocation(String nnp, String sent) {
        if (!checkExcludedEndsWithWords(sent)) {
            return nnp == null ? sent : nnp + " " + sent;
        }
        if (nnp != null && !nnp.contains(" ")) {
            return null;
        }
        return nnp;
    }

    /**
     * Keeps only the candidates that look like an address: they must contain at least one
     * digit, one whitespace character and one ASCII letter, and at least two spaces.
     *
     * @param locations the candidates; {@code null} entries are skipped and a {@code null} list
     *                  is treated as empty
     * @return a new list with the accepted candidates, in their original order
     */
    public List<String> filterLocations(List<String> locations) {
        List<String> filteredLocations = new ArrayList<String>();
        if (locations == null) {
            return filteredLocations;
        }
        for (String candidate : locations) {
            if (candidate != null
                    && candidate.matches("((?=.*\\d)(?=.*\\s)(?=.*[a-zA-Z]).{2,})")
                    && StringUtils.countOccurrencesOf(candidate, " ") >= 2) {
                filteredLocations.add(candidate);
            }
        }
        return filteredLocations;
    }
}