Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package bzh.terrevirtuelle.navisu.gazetteer.impl.lucene; import bzh.terrevirtuelle.navisu.gazetteer.impl.GazetteerComponentImpl; import bzh.terrevirtuelle.navisu.gazetteer.impl.lucene.domain.Location; import java.io.BufferedReader; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.PriorityQueue; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; //import org.apache.commons.cli.DefaultParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.lang3.StringUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.DoubleField; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.search.Filter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.spatial.SpatialStrategy; import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy; import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree; import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree; import org.apache.lucene.spatial.query.SpatialArgs; import org.apache.lucene.spatial.query.SpatialOperation; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import com.google.gson.Gson; import com.spatial4j.core.context.SpatialContext; import com.spatial4j.core.distance.DistanceUtils; import com.spatial4j.core.shape.Point; import edu.usc.ir.geo.gazetteer.service.Launcher; import java.util.Properties; import javafx.scene.control.Alert; import javafx.scene.media.AudioClip; import javafx.scene.text.Text; import org.apache.commons.cli.BasicParser; //import org.apache.commons.cli.DefaultParser; public class GeoNameResolver implements Closeable { //UPPER BOUND FOR SEARCHING AN AREA IN MILES private static final double REVERSE_DISTANCE_LIMIT = 5; private static final String JSON_OPT = "json"; private static final String REVERSE_OPT = "r"; private static final String REVERSE_LONG_OPT = "enable-reverse"; private static final String SEARCH_REVERSE_OPT = "sr"; private static final String SEARCH_REVERSE_LONG_OPT = "search-reverse"; /** * Below constants define name of field in lucene index */ public static final String FIELD_NAME_ID = "ID"; public static final String FIELD_NAME_NAME = "name"; public static final String FIELD_NAME_LONGITUDE = "longitude"; public static final String FIELD_NAME_LATITUDE = "latitude"; public static final String FIELD_NAME_ALTERNATE_NAMES = "alternatenames"; public static final String FIELD_NAME_FEATURE_CODE = "featureCode"; public static final String FIELD_NAME_COUNTRY_CODE = "countryCode"; public static final String FIELD_NAME_ADMIN1_CODE = "admin1Code"; public static final String FIELD_NAME_ADMIN2_CODE = "admin2Code"; public static final String FIELD_NAME_POPULATION = "population"; /** * Below constants define weight multipliers used for result relevance. */ private static final int WEIGHT_SORT_ORDER = 20; private static final int WEIGHT_SIZE_ALT_NAME = 50; private static final int WEIGHT_NAME_MATCH = 20000; private static final int WEIGHT_NAME_PART_MATCH = 15000; private static final Logger LOG = Logger.getLogger(GeoNameResolver.class.getName()); private static final Double OUT_OF_BOUNDS = 999999.0; private static Analyzer analyzer = new StandardAnalyzer(); private static IndexWriter indexWriter; private static Directory indexDir; private static int hitsPerPage = 8; //sort descending on population SortField populationSort = new SortedNumericSortField(FIELD_NAME_POPULATION, SortField.Type.LONG, true); private IndexReader indexReader; private SpatialContext ctx = SpatialContext.GEO; private SpatialPrefixTree grid = new GeohashPrefixTree(ctx, 11); private SpatialStrategy strategy = new RecursivePrefixTreeStrategy(grid, "location"); private String indexerPath; private String gazetteerPath; private Properties properties; protected String CONFIG_FILE_NAME = System.getProperty("user.home") + "/.navisu/config/config.properties"; private final String INDEXER_PATH = "luceneAllCountriesIndexPath"; private final String GAZETEER_PATH = "allCountriesPath"; public GeoNameResolver() { properties = new Properties(); try { properties.load(new FileInputStream(CONFIG_FILE_NAME)); indexerPath = properties.getProperty(INDEXER_PATH).trim(); gazetteerPath = properties.getProperty(GAZETEER_PATH).trim(); } catch (IOException ex) { Logger.getLogger(GazetteerComponentImpl.class.getName()).log(Level.SEVERE, ex.toString(), ex); } if (indexerPath == null) { Alert alert = new Alert(Alert.AlertType.WARNING); alert.setTitle("Gazetteer"); alert.setHeaderText("Attention"); Text s = new Text(" Le chemin de l'index gographique est incorrect." + "\n Vous devez complter le fichier config.properties" + "\n Menu : TOOLS/Config/App/Options"); s.setWrappingWidth(350); alert.getDialogPane().setContent(s); alert.show(); AudioClip plonkSound = new AudioClip( this.getClass().getResource("sounds/warning.mp3").toExternalForm()); plonkSound.play(); } if (gazetteerPath == null) { Alert alert = new Alert(Alert.AlertType.WARNING); alert.setTitle("Gazetteer"); alert.setHeaderText("Attention"); Text s = new Text(" Le chemin des donnes de l'index gographique \n est incorrect." + "\n Vous devez complter le fichier config.properties" + "\n Menu : TOOLS/Config/App/Options"); s.setWrappingWidth(350); alert.getDialogPane().setContent(s); alert.show(); } } /** * Creates a GeoNameResolver for given path * * @param indexPath the path to lucene index * @throws IOException */ public GeoNameResolver(String indexPath) throws IOException { this.indexReader = createIndexReader(indexPath); } /** * * @param locationNames List of location na,es * @param count Number of results per location * @return resolved Geo Names * @throws IOException */ public HashMap<String, List<Location>> searchGeoName(List<String> locationNames, int count) throws IOException { return resolveEntities(locationNames, count, this.indexReader); } /** * Search corresponding GeoName for each location entity * * @param count Number of results for one locations * @param querystr it's the NER actually * * @return HashMap each name has a list of resolved entities * @throws IOException * @throws RuntimeException */ public HashMap<String, List<Location>> searchGeoName(String indexerPath, List<String> locationNameEntities, int count) throws IOException { if (locationNameEntities.size() == 0 || locationNameEntities.get(0).length() == 0) { return new HashMap<String, List<Location>>(); } IndexReader reader = createIndexReader(indexerPath); HashMap<String, List<Location>> resolvedEntities = resolveEntities(locationNameEntities, count, reader); reader.close(); return resolvedEntities; } /** * Returns a list of location near a certain coordinate. * * @param latitude, @param longitude - Center of search area * @param distanceInMiles - Search Radius in miles * @param indexerPath - Path to Lucene index * @param count - Upper bound to number of results * @return - List of locations sorted by population * @throws IOException */ public List<Location> searchNearby(Double latitude, Double longitude, Double distanceInMiles, String indexerPath, int count) throws IOException { double distanceInDeg = DistanceUtils.dist2Degrees(distanceInMiles, DistanceUtils.EARTH_EQUATORIAL_RADIUS_MI); SpatialArgs spatialArgs = new SpatialArgs(SpatialOperation.IsWithin, ctx.makeCircle(longitude, latitude, distanceInDeg)); String key = latitude + "-" + longitude; Filter filter = strategy.makeFilter(spatialArgs); IndexSearcher searcher = new IndexSearcher(createIndexReader(indexerPath)); Sort sort = new Sort(populationSort); TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), filter, count, sort); ScoreDoc[] scoreDocs = topDocs.scoreDocs; HashMap<String, List<Location>> allCandidates = new HashMap<String, List<Location>>(); getMatchingCandidates(searcher, allCandidates, key, scoreDocs); List<Location> results = allCandidates.get(key); return results; } private IndexReader createIndexReader(String indexerPath) throws IOException { File indexfile = new File(indexerPath); indexDir = FSDirectory.open(indexfile.toPath()); if (!DirectoryReader.indexExists(indexDir)) { LOG.log(Level.SEVERE, "No Lucene Index Dierctory Found, Invoke indexBuild() First !"); System.exit(1); } return DirectoryReader.open(indexDir); } private HashMap<String, List<Location>> resolveEntities(List<String> locationNames, int count, IndexReader reader) throws IOException { if (locationNames.size() >= 200) { hitsPerPage = 5; // avoid heavy computation } IndexSearcher searcher = new IndexSearcher(reader); Query q = null; HashMap<String, List<Location>> allCandidates = new HashMap<String, List<Location>>(); for (String name : locationNames) { if (!allCandidates.containsKey(name)) { try { //query is wrapped in additional quotes (") to avoid query tokenization on space q = new MultiFieldQueryParser(new String[] { FIELD_NAME_NAME, FIELD_NAME_ALTERNATE_NAMES }, analyzer).parse(String.format("\"%s\"", name)); Sort sort = new Sort(populationSort); //Fetch 3 times desired values, these will be sorted on code and only desired number will be kept ScoreDoc[] hits = searcher.search(q, hitsPerPage * 3, sort).scoreDocs; getMatchingCandidates(searcher, allCandidates, name, hits); } catch (org.apache.lucene.queryparser.classic.ParseException e) { e.printStackTrace(); } } } HashMap<String, List<Location>> resolvedEntities = new HashMap<String, List<Location>>(); pickBestCandidates(resolvedEntities, allCandidates, count); return resolvedEntities; } private void getMatchingCandidates(IndexSearcher searcher, HashMap<String, List<Location>> allCandidates, String name, ScoreDoc[] hits) { List<Location> topHits = new ArrayList<Location>(); for (int i = 0; i < hits.length; ++i) { Location tmpLocObj = new Location(); int docId = hits[i].doc; Document d; try { d = searcher.doc(docId); tmpLocObj.setName(d.get(FIELD_NAME_NAME)); tmpLocObj.setLongitude(d.get(FIELD_NAME_LONGITUDE)); tmpLocObj.setLatitude(d.get(FIELD_NAME_LATITUDE)); //If alternate names are empty put name as actual name //This covers missing data and equals weight for later computation if (d.get(FIELD_NAME_ALTERNATE_NAMES).isEmpty()) { tmpLocObj.setAlternateNames(d.get(FIELD_NAME_NAME)); } else { tmpLocObj.setAlternateNames(d.get(FIELD_NAME_ALTERNATE_NAMES)); } tmpLocObj.setCountryCode(d.get(FIELD_NAME_COUNTRY_CODE)); tmpLocObj.setAdmin1Code(d.get(FIELD_NAME_ADMIN1_CODE)); tmpLocObj.setAdmin2Code(d.get(FIELD_NAME_ADMIN2_CODE)); tmpLocObj.setFeatureCode(d.get(FIELD_NAME_FEATURE_CODE)); } catch (IOException e) { e.printStackTrace(); } topHits.add(tmpLocObj); } //Picking hitsPerPage number of locations from feature code sorted list allCandidates.put(name, pickTopSortedByCode(topHits, hitsPerPage)); } /** * Sorts inputLocations as per FeatureCodeComparator and returns at most * topCount locations * * @param inputLocations List of locations to be sorted * @param topCount Number of locations to be kept in curtailed list * @return List of at most topCount locations sorted by * edu.usc.ir.geo.gazetteer.CustomLuceneGeoGazetteerComparator.FeatureCodeComparator */ private List<Location> pickTopSortedByCode(List<Location> inputLocations, int topCount) { if (inputLocations == null || inputLocations.size() == 0) { return new ArrayList<>(); } Collections.sort(inputLocations, new CustomLuceneGeoGazetteerComparator.FeatureCodeComparator()); return inputLocations.subList(0, inputLocations.size() > topCount ? topCount : inputLocations.size() - 1); } /** * Select the best match for each location name extracted from a document, * choosing from among a list of lists of candidate matches. Filter uses the * following features: 1) edit distance between name and the resolved name, * choose smallest one 2) content (haven't implemented) * * @param resolvedEntities final result for the input stream * @param allCandidates each location name may hits several documents, this * is the collection for all hitted documents * @param count Number of results for one locations * @throws IOException * @throws RuntimeException */ private void pickBestCandidates(HashMap<String, List<Location>> resolvedEntities, HashMap<String, List<Location>> allCandidates, int count) { for (String extractedName : allCandidates.keySet()) { List<Location> cur = allCandidates.get(extractedName); if (cur.isEmpty()) { continue;//continue if no results found } int maxWeight = Integer.MIN_VALUE; //In case weight is equal for all return top element int bestIndex = 0; //Priority queue to return top elements PriorityQueue<Location> pq = new PriorityQueue<>(cur.size(), new Comparator<Location>() { @Override public int compare(Location o1, Location o2) { return Integer.compare(o2.getWeight(), o1.getWeight()); } }); for (int i = 0; i < cur.size(); ++i) { int weight = 0; // get cur's ith resolved entry's name String resolvedName = String.format(" %s ", cur.get(i).getName()); if (resolvedName.contains(String.format(" %s ", extractedName))) { // Assign a weight as per configuration if extracted name is found as a exact word in name weight = WEIGHT_NAME_MATCH; } else if (resolvedName.contains(extractedName)) { // Assign a weight as per configuration if extracted name is found partly in name weight = WEIGHT_NAME_PART_MATCH; } // get all alternate names of cur's ith resolved entry's String[] altNames = cur.get(i).getAlternateNames().split(","); float altEditDist = 0; for (String altName : altNames) { if (altName.contains(extractedName)) { altEditDist += StringUtils.getLevenshteinDistance(extractedName, altName); } } //lesser the edit distance more should be the weight weight += getCalibratedWeight(altNames.length, altEditDist); //Give preference to sorted results. 0th result should have more priority weight += (cur.size() - i) * WEIGHT_SORT_ORDER; cur.get(i).setWeight(weight); if (weight > maxWeight) { maxWeight = weight; bestIndex = i; } pq.add(cur.get(i)); } if (bestIndex == -1) { continue; } List<Location> resultList = new ArrayList<>(); for (int i = 0; i < count && !pq.isEmpty(); i++) { resultList.add(pq.poll()); } resolvedEntities.put(extractedName, resultList); } } /** * Returns a weight for average edit distance for set of alternate * name<br/><br/> * altNamesSize * WEIGHT_SIZE_ALT_NAME - (altEditDist/altNamesSize) * ;<br/><br/> * altNamesSize * WEIGHT_SIZE_ALT_NAME ensure more priority for results with * more alternate names.<br/> * altEditDist/altNamesSize is average edit distance. <br/> * Lesser the average, higher the over all expression * * @param altNamesSize - Count of altNames * @param altEditDist - sum of individual edit distances * @return */ public float getCalibratedWeight(int altNamesSize, float altEditDist) { return altNamesSize * WEIGHT_SIZE_ALT_NAME - (altEditDist / altNamesSize); } /** * Build the gazetteer index line by line * * @param gazetteerPath path of the gazetteer file * @param indexerPath path to the created Lucene index directory. * @param reverseGeocodingEnabled * @throws IOException * @throws RuntimeException */ public void buildIndex(String gazetteerPath, String indexerPath, boolean reverseGeocodingEnabled) throws IOException { File indexfile = new File(indexerPath); indexDir = FSDirectory.open(indexfile.toPath()); if (!DirectoryReader.indexExists(indexDir)) { IndexWriterConfig config = new IndexWriterConfig(analyzer); indexWriter = new IndexWriter(indexDir, config); Logger logger = Logger.getLogger(this.getClass().getName()); logger.log(Level.WARNING, "Start Building Index for Gazatteer"); BufferedReader filereader = new BufferedReader( new InputStreamReader(new FileInputStream(gazetteerPath), "UTF-8")); String line; int count = 0; while ((line = filereader.readLine()) != null) { try { count += 1; if (count % 100000 == 0) { logger.log(Level.INFO, "Indexed Row Count: " + count); } addDoc(indexWriter, line, reverseGeocodingEnabled); } catch (RuntimeException re) { logger.log(Level.WARNING, "Skipping... Error on line: {0}", line); re.printStackTrace(); } } logger.log(Level.WARNING, "Building Finished"); filereader.close(); indexWriter.close(); } } /** * Index gazetteer's one line data by built-in Lucene Index functions * * @param indexWriter Lucene indexWriter to be loaded * @param line a line from the gazetteer file * @throws IOException * @throws NumberFormatException */ private void addDoc(IndexWriter indexWriter, final String line, final boolean reverseGeocodingEnabled) { String[] tokens = line.split("\t"); int ID = Integer.parseInt(tokens[0]); String name = tokens[1]; String alternatenames = tokens[3]; Double latitude = -999999.0; try { latitude = Double.parseDouble(tokens[4]); } catch (NumberFormatException e) { latitude = OUT_OF_BOUNDS; } Double longitude = -999999.0; try { longitude = Double.parseDouble(tokens[5]); } catch (NumberFormatException e) { longitude = OUT_OF_BOUNDS; } int population = 0; try { population = Integer.parseInt(tokens[14]); } catch (NumberFormatException e) { population = 0;// Treat as population does not exists } // Additional fields to rank more known locations higher // All available codes can be viewed on www.geonames.org String featureCode = tokens[7];// more granular category String countryCode = tokens[8]; String admin1Code = tokens[10];// eg US State String admin2Code = tokens[11];// eg county Document doc = new Document(); doc.add(new IntField(FIELD_NAME_ID, ID, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_NAME, name, Field.Store.YES)); doc.add(new DoubleField(FIELD_NAME_LONGITUDE, longitude, Field.Store.YES)); doc.add(new DoubleField(FIELD_NAME_LATITUDE, latitude, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_ALTERNATE_NAMES, alternatenames, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_FEATURE_CODE, featureCode, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_COUNTRY_CODE, countryCode, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_ADMIN1_CODE, admin1Code, Field.Store.YES)); doc.add(new TextField(FIELD_NAME_ADMIN2_CODE, admin2Code, Field.Store.YES)); doc.add(new NumericDocValuesField(FIELD_NAME_POPULATION, population));//sort enabled field if (reverseGeocodingEnabled) { Point point = ctx.makePoint(longitude, latitude); for (IndexableField f : strategy.createIndexableFields(point)) { doc.add(f); } } try { indexWriter.addDocument(doc); } catch (IOException e) { e.printStackTrace(); } } @Override public void close() throws IOException { if (indexReader != null) { this.indexReader.close(); } } /** * Writes the result as formatted json to given PrintStream * * @param resolvedEntities map of resolved entities * @param out the print stream for writing output */ public static void writeResultJson(Map<String, List<Location>> resolvedEntities, PrintStream out) { out.println(new Gson().toJson(resolvedEntities)); } /** * Writes the result to given PrintStream * * @deprecated Use writeResultJson instead * @param resolvedEntities map of resolved entities * @param out the print stream for writing output */ @Deprecated public static void writeResult(Map<String, List<Location>> resolvedEntities, PrintStream out) { out.println("["); @SuppressWarnings("unchecked") List<String> keys = (List<String>) (List<?>) Arrays.asList(resolvedEntities.keySet().toArray()); //TODO: use org.json.JSONArray and remove this custom formatting code for (int j = 0; j < keys.size(); j++) { String n = keys.get(j); out.println("{\"" + n + "\" : ["); List<Location> terms = resolvedEntities.get(n); for (int i = 0; i < terms.size(); i++) { Location res = terms.get(i); if (i < terms.size() - 1) { out.println(res + ","); } else { out.println(res); } } if (j < keys.size() - 1) { out.println("]},"); } else { out.println("]}"); } } out.println("]"); } public String getIndexerPath() { return indexerPath; } }