bzh.terrevirtuelle.navisu.gazetteer.impl.lucene.GeoNameResolver.java Source code

Introduction

Here is the source code for bzh.terrevirtuelle.navisu.gazetteer.impl.lucene.GeoNameResolver.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package bzh.terrevirtuelle.navisu.gazetteer.impl.lucene;

import bzh.terrevirtuelle.navisu.gazetteer.impl.GazetteerComponentImpl;
import bzh.terrevirtuelle.navisu.gazetteer.impl.lucene.domain.Location;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
//import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.spatial.SpatialStrategy;
import org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy;
import org.apache.lucene.spatial.prefix.tree.GeohashPrefixTree;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.spatial.query.SpatialArgs;
import org.apache.lucene.spatial.query.SpatialOperation;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.google.gson.Gson;
import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.distance.DistanceUtils;
import com.spatial4j.core.shape.Point;
import edu.usc.ir.geo.gazetteer.service.Launcher;
import java.util.Properties;
import javafx.scene.control.Alert;
import javafx.scene.media.AudioClip;
import javafx.scene.text.Text;
import org.apache.commons.cli.BasicParser;
//import org.apache.commons.cli.DefaultParser;

public class GeoNameResolver implements Closeable {
    // Upper bound, in miles, for searching an area. Not referenced by any
    // method in the visible part of this class.

    private static final double REVERSE_DISTANCE_LIMIT = 5;
    // Option names below look like command-line switches; no code visible in
    // this class reads them — NOTE(review): presumably left over from a CLI
    // entry point, confirm before removing.
    private static final String JSON_OPT = "json";
    private static final String REVERSE_OPT = "r";
    private static final String REVERSE_LONG_OPT = "enable-reverse";
    private static final String SEARCH_REVERSE_OPT = "sr";
    private static final String SEARCH_REVERSE_LONG_OPT = "search-reverse";
    /**
     * Below constants define name of field in lucene index
     */
    public static final String FIELD_NAME_ID = "ID";
    public static final String FIELD_NAME_NAME = "name";
    public static final String FIELD_NAME_LONGITUDE = "longitude";
    public static final String FIELD_NAME_LATITUDE = "latitude";
    public static final String FIELD_NAME_ALTERNATE_NAMES = "alternatenames";
    public static final String FIELD_NAME_FEATURE_CODE = "featureCode";
    public static final String FIELD_NAME_COUNTRY_CODE = "countryCode";
    public static final String FIELD_NAME_ADMIN1_CODE = "admin1Code";
    public static final String FIELD_NAME_ADMIN2_CODE = "admin2Code";
    public static final String FIELD_NAME_POPULATION = "population";
    /**
     * Below constants define weight multipliers used for result relevance.
     * WEIGHT_SORT_ORDER rewards an earlier position in the candidate list,
     * WEIGHT_SIZE_ALT_NAME is multiplied by the number of alternate names,
     * WEIGHT_NAME_MATCH / WEIGHT_NAME_PART_MATCH are granted when the searched
     * name appears in the resolved name as a whole word / as a substring.
     */
    private static final int WEIGHT_SORT_ORDER = 20;
    private static final int WEIGHT_SIZE_ALT_NAME = 50;
    private static final int WEIGHT_NAME_MATCH = 20000;
    private static final int WEIGHT_NAME_PART_MATCH = 15000;

    private static final Logger LOG = Logger.getLogger(GeoNameResolver.class.getName());
    // Stored as latitude/longitude when the gazetteer value cannot be parsed.
    private static final Double OUT_OF_BOUNDS = 999999.0;
    // Analyzer shared by index building and query parsing.
    private static Analyzer analyzer = new StandardAnalyzer();
    // Static, hence shared by every instance: assigned by buildIndex().
    private static IndexWriter indexWriter;
    // Static, hence shared by every instance: reassigned each time an index
    // directory is opened (createIndexReader / buildIndex).
    private static Directory indexDir;
    // Number of candidates kept per searched name. Static and mutable:
    // resolveEntities() lowers it to 5 for lists of 200 names or more.
    private static int hitsPerPage = 8;

    //sort descending on population
    SortField populationSort = new SortedNumericSortField(FIELD_NAME_POPULATION, SortField.Type.LONG, true);

    // Reader opened by the GeoNameResolver(String) constructor and released by
    // close(); stays null when the no-argument constructor is used.
    private IndexReader indexReader;
    // Spatial set-up used for reverse geocoding (searchNearby) and for adding
    // the spatial fields at indexing time.
    private SpatialContext ctx = SpatialContext.GEO;
    // NOTE(review): 11 is presumably the maximum geohash depth — confirm
    // against the Lucene spatial documentation.
    private SpatialPrefixTree grid = new GeohashPrefixTree(ctx, 11);
    // "location" is the name of the index field holding the spatial data.
    private SpatialStrategy strategy = new RecursivePrefixTreeStrategy(grid, "location");

    // Values read from the configuration file by the no-argument constructor.
    private String indexerPath;
    private String gazetteerPath;
    private Properties properties;
    // Location of the configuration file, under the user's home directory.
    protected String CONFIG_FILE_NAME = System.getProperty("user.home") + "/.navisu/config/config.properties";
    // Keys looked up in the configuration file.
    private final String INDEXER_PATH = "luceneAllCountriesIndexPath";
    private final String GAZETEER_PATH = "allCountriesPath";

    /**
     * Creates a resolver configured from the properties file located at
     * {@code CONFIG_FILE_NAME}.
     * <p>
     * The index path and the gazetteer path are read from that file. When the
     * file cannot be read, or when one of the two keys is absent, the
     * corresponding path stays {@code null} and a JavaFX warning dialog is
     * shown. This constructor does not open any Lucene index: the instance
     * reader stays {@code null}.
     */
    public GeoNameResolver() {
        properties = new Properties();
        // try-with-resources so the configuration stream is always released.
        try (FileInputStream configStream = new FileInputStream(CONFIG_FILE_NAME)) {
            properties.load(configStream);
            // A missing key must end up as null (and trigger the warnings
            // below) instead of throwing a NullPointerException on trim().
            indexerPath = trimOrNull(properties.getProperty(INDEXER_PATH));
            gazetteerPath = trimOrNull(properties.getProperty(GAZETEER_PATH));
        } catch (IOException ex) {
            Logger.getLogger(GazetteerComponentImpl.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        }
        // NOTE(review): accented characters seem to be missing from the two
        // messages below ("gographique", "complter", "donnes"); the literals
        // are left untouched here — confirm against the original file encoding.
        if (indexerPath == null) {
            showConfigurationWarning("  Le chemin de l'index gographique est  incorrect."
                    + "\n  Vous devez complter le fichier config.properties"
                    + "\n Menu : TOOLS/Config/App/Options");
            // The sound is optional: skip it when the resource is not packaged
            // rather than failing the construction with a NullPointerException.
            java.net.URL warningSound = this.getClass().getResource("sounds/warning.mp3");
            if (warningSound != null) {
                AudioClip plonkSound = new AudioClip(warningSound.toExternalForm());
                plonkSound.play();
            }
        }
        if (gazetteerPath == null) {
            showConfigurationWarning("  Le chemin des donnes de l'index gographique \n est incorrect."
                    + "\n  Vous devez complter le fichier config.properties"
                    + "\n Menu : TOOLS/Config/App/Options");
        }
    }

    /** Returns the trimmed value, or {@code null} when the value itself is {@code null}. */
    private static String trimOrNull(String value) {
        return value == null ? null : value.trim();
    }

    /** Shows a non-blocking "Gazetteer" warning dialog displaying the given message. */
    private static void showConfigurationWarning(String message) {
        Alert alert = new Alert(Alert.AlertType.WARNING);
        alert.setTitle("Gazetteer");
        alert.setHeaderText("Attention");
        Text s = new Text(message);
        s.setWrappingWidth(350);
        alert.getDialogPane().setContent(s);
        alert.show();
    }

    /**
     * Creates a GeoNameResolver for given path
     *
     * @param indexPath the path to lucene index
     * @throws IOException
     */
    public GeoNameResolver(String indexPath) throws IOException {
        this.indexReader = createIndexReader(indexPath);
    }

    /**
     * Resolves location names against the index reader held by this instance.
     *
     * @param locationNames list of location names to resolve
     * @param count maximum number of results kept per location name
     * @return map from each location name to its resolved locations
     * @throws IOException if the index search fails
     */
    public HashMap<String, List<Location>> searchGeoName(List<String> locationNames, int count) throws IOException {
        final IndexReader instanceReader = this.indexReader;
        return resolveEntities(locationNames, count, instanceReader);
    }

    /**
     * Resolves location names against the Lucene index found at the given
     * path. A reader is opened for the duration of the call and is always
     * closed before returning, including when the search fails.
     *
     * @param indexerPath path to the Lucene index directory
     * @param locationNameEntities location names to resolve (typically the
     * output of a named-entity recogniser)
     * @param count maximum number of results kept per location name
     *
     * @return map from each location name to its resolved locations; an empty
     * map when the list is empty or when its first element is an empty string
     * @throws IOException if the index cannot be opened or searched
     */
    public HashMap<String, List<Location>> searchGeoName(String indexerPath, List<String> locationNameEntities,
            int count) throws IOException {

        if (locationNameEntities.isEmpty() || locationNameEntities.get(0).isEmpty()) {
            return new HashMap<String, List<Location>>();
        }
        // try-with-resources: the original leaked the reader whenever
        // resolveEntities threw.
        try (IndexReader reader = createIndexReader(indexerPath)) {
            return resolveEntities(locationNameEntities, count, reader);
        }
    }

    /**
     * Returns locations lying within a circle around the given coordinate.
     * <p>
     * The index is queried for at most {@code count} documents, most populated
     * first. The hits are then handed to {@code getMatchingCandidates}, which
     * re-orders them by feature code and keeps at most {@code hitsPerPage} of
     * them, so the returned list is not ordered by population.
     * <p>
     * The index reader opened for the search is closed before returning.
     *
     * @param latitude latitude of the centre of the search area
     * @param longitude longitude of the centre of the search area
     * @param distanceInMiles search radius in miles
     * @param indexerPath path to the Lucene index
     * @param count upper bound on the number of documents fetched from the
     * index
     * @return the locations found around the coordinate
     * @throws IOException if the index cannot be opened or searched
     */
    public List<Location> searchNearby(Double latitude, Double longitude, Double distanceInMiles,
            String indexerPath, int count) throws IOException {

        // The spatial API works in degrees: convert the radius from miles.
        double distanceInDeg = DistanceUtils.dist2Degrees(distanceInMiles,
                DistanceUtils.EARTH_EQUATORIAL_RADIUS_MI);
        SpatialArgs spatialArgs = new SpatialArgs(SpatialOperation.IsWithin,
                ctx.makeCircle(longitude, latitude, distanceInDeg));

        String key = latitude + "-" + longitude;
        Filter filter = strategy.makeFilter(spatialArgs);

        // try-with-resources: the original opened a reader on every call and
        // never closed it. All stored fields are copied into Location objects
        // before the reader is released.
        try (IndexReader reader = createIndexReader(indexerPath)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Sort sort = new Sort(populationSort);
            TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), filter, count, sort);

            HashMap<String, List<Location>> allCandidates = new HashMap<String, List<Location>>();
            getMatchingCandidates(searcher, allCandidates, key, topDocs.scoreDocs);
            return allCandidates.get(key);
        }
    }

    /**
     * Opens a reader on the Lucene index stored in the given directory. The
     * opened directory is also recorded in the static {@code indexDir} field.
     *
     * @param indexerPath path to the Lucene index directory
     * @return a reader on that index; the caller is responsible for closing it
     * @throws IOException if the directory holds no index or cannot be opened
     */
    private IndexReader createIndexReader(String indexerPath) throws IOException {
        File indexfile = new File(indexerPath);
        indexDir = FSDirectory.open(indexfile.toPath());

        if (!DirectoryReader.indexExists(indexDir)) {
            // Report the problem to the caller instead of calling
            // System.exit(): terminating the JVM from library code would take
            // down the whole host application.
            indexDir.close();
            String message = "No Lucene index found in '" + indexerPath + "', invoke buildIndex() first";
            LOG.log(Level.SEVERE, message);
            throw new IOException(message);
        }

        return DirectoryReader.open(indexDir);
    }

    /**
     * Searches the index for every distinct location name and keeps the best
     * {@code count} matches for each of them.
     * <p>
     * Each name is searched as an exact phrase in the name and alternate-names
     * fields, most populated places first. Names whose query cannot be parsed
     * are logged and skipped.
     *
     * @param locationNames names to resolve; duplicates are searched once
     * @param count maximum number of results kept per name
     * @param reader reader on the index to search; it is not closed here
     * @return map from each name having at least one hit to its best matches
     * @throws IOException if the index search fails
     */
    private HashMap<String, List<Location>> resolveEntities(List<String> locationNames, int count,
            IndexReader reader) throws IOException {
        // hitsPerPage is static: remember its value so that the reduction
        // applied for large inputs does not leak into every later search.
        final int previousHitsPerPage = hitsPerPage;
        if (locationNames.size() >= 200) {
            hitsPerPage = 5; // avoid heavy computation
        }
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            Sort sort = new Sort(populationSort);
            String[] searchedFields = new String[] { FIELD_NAME_NAME, FIELD_NAME_ALTERNATE_NAMES };

            HashMap<String, List<Location>> allCandidates = new HashMap<String, List<Location>>();

            for (String name : locationNames) {
                if (allCandidates.containsKey(name)) {
                    continue; // already searched
                }
                try {
                    // The query is wrapped in quotes (") to avoid tokenization
                    // on spaces. Query syntax characters inside the name (a
                    // quote, a trailing backslash...) are escaped so that they
                    // are searched literally instead of breaking the parser.
                    String phrase = String.format("\"%s\"", MultiFieldQueryParser.escape(String.valueOf(name)));
                    Query q = new MultiFieldQueryParser(searchedFields, analyzer).parse(phrase);

                    //Fetch 3 times desired values, these will be sorted on code and only desired number will be kept
                    ScoreDoc[] hits = searcher.search(q, hitsPerPage * 3, sort).scoreDocs;

                    getMatchingCandidates(searcher, allCandidates, name, hits);
                } catch (org.apache.lucene.queryparser.classic.ParseException e) {
                    LOG.log(Level.WARNING, "Skipping location name that cannot be parsed as a query: " + name, e);
                }
            }

            HashMap<String, List<Location>> resolvedEntities = new HashMap<String, List<Location>>();
            pickBestCandidates(resolvedEntities, allCandidates, count);
            return resolvedEntities;
        } finally {
            hitsPerPage = previousHitsPerPage;
        }
    }

    /**
     * Converts search hits into {@code Location} objects and stores, under
     * {@code name}, at most {@code hitsPerPage} of them ordered by feature
     * code.
     * <p>
     * A hit whose stored document cannot be read is logged and left out of
     * the result.
     *
     * @param searcher searcher that produced the hits
     * @param allCandidates map receiving the candidates, keyed by name
     * @param name key under which the candidates are stored
     * @param hits hits returned by the search
     */
    private void getMatchingCandidates(IndexSearcher searcher, HashMap<String, List<Location>> allCandidates,
            String name, ScoreDoc[] hits) {
        List<Location> topHits = new ArrayList<Location>(hits.length);

        for (ScoreDoc hit : hits) {
            final Document d;
            try {
                d = searcher.doc(hit.doc);
            } catch (IOException e) {
                // Do not add a blank Location: its null fields would fail
                // later when the candidates are weighted.
                LOG.log(Level.WARNING, "Skipping document " + hit.doc + " that could not be read", e);
                continue;
            }

            Location tmpLocObj = new Location();
            String placeName = d.get(FIELD_NAME_NAME);
            tmpLocObj.setName(placeName);
            tmpLocObj.setLongitude(d.get(FIELD_NAME_LONGITUDE));
            tmpLocObj.setLatitude(d.get(FIELD_NAME_LATITUDE));
            //If alternate names are missing or empty put name as actual name
            //This covers missing data and equals weight for later computation
            String alternateNames = d.get(FIELD_NAME_ALTERNATE_NAMES);
            if (alternateNames == null || alternateNames.isEmpty()) {
                tmpLocObj.setAlternateNames(placeName);
            } else {
                tmpLocObj.setAlternateNames(alternateNames);
            }
            tmpLocObj.setCountryCode(d.get(FIELD_NAME_COUNTRY_CODE));
            tmpLocObj.setAdmin1Code(d.get(FIELD_NAME_ADMIN1_CODE));
            tmpLocObj.setAdmin2Code(d.get(FIELD_NAME_ADMIN2_CODE));
            tmpLocObj.setFeatureCode(d.get(FIELD_NAME_FEATURE_CODE));

            topHits.add(tmpLocObj);
        }
        //Picking hitsPerPage number of locations from feature code sorted list
        allCandidates.put(name, pickTopSortedByCode(topHits, hitsPerPage));
    }

    /**
     * Sorts inputLocations as per FeatureCodeComparator and returns at most
     * topCount locations. The input list is sorted in place and the result is
     * a view on its first elements.
     *
     * @param inputLocations List of locations to be sorted
     * @param topCount Number of locations to be kept in curtailed list
     * @return List of at most topCount locations sorted by
     * CustomLuceneGeoGazetteerComparator.FeatureCodeComparator; a new empty
     * list when inputLocations is null or empty
     */
    private List<Location> pickTopSortedByCode(List<Location> inputLocations, int topCount) {
        if (inputLocations == null || inputLocations.isEmpty()) {
            return new ArrayList<>();
        }

        Collections.sort(inputLocations, new CustomLuceneGeoGazetteerComparator.FeatureCodeComparator());
        // subList's upper bound is exclusive: the original used size() - 1
        // when the list was not longer than topCount, which silently dropped
        // the last location (and returned nothing for a single hit).
        return inputLocations.subList(0, Math.min(topCount, inputLocations.size()));
    }

    /**
     * Selects, for each extracted location name, the best candidates among
     * the documents it matched.
     * <p>
     * Every candidate receives a weight made of:
     * <ol>
     * <li>a name bonus: WEIGHT_NAME_MATCH when the extracted name appears as a
     * whole word in the candidate name, WEIGHT_NAME_PART_MATCH when it only
     * appears as a substring;</li>
     * <li>the value of {@link #getCalibratedWeight(int, float)} for its
     * alternate names, where only the alternate names containing the
     * extracted name contribute to the edit distance;</li>
     * <li>a position bonus favouring candidates placed earlier in the list.</li>
     * </ol>
     * The {@code count} heaviest candidates are kept. Names without any
     * candidate are left out of the result.
     *
     * @param resolvedEntities map receiving the final result
     * @param allCandidates candidate locations for each extracted name
     * @param count maximum number of results kept for one name
     */
    private void pickBestCandidates(HashMap<String, List<Location>> resolvedEntities,
            HashMap<String, List<Location>> allCandidates, int count) {

        // Orders locations by decreasing weight.
        Comparator<Location> heaviestFirst = new Comparator<Location>() {
            @Override
            public int compare(Location o1, Location o2) {
                return Integer.compare(o2.getWeight(), o1.getWeight());
            }
        };

        for (Map.Entry<String, List<Location>> entry : allCandidates.entrySet()) {
            String extractedName = entry.getKey();
            List<Location> cur = entry.getValue();
            if (cur.isEmpty()) {
                continue;//continue if no results found
            }
            String extractedWord = String.format(" %s ", extractedName);
            //Priority queue to return top elements
            PriorityQueue<Location> pq = new PriorityQueue<>(cur.size(), heaviestFirst);

            for (int i = 0; i < cur.size(); ++i) {
                Location candidate = cur.get(i);
                int weight = 0;
                // Pad with spaces so that a whole-word match can be detected
                // at the start and at the end of the name too.
                String resolvedName = String.format(" %s ", candidate.getName());
                if (resolvedName.contains(extractedWord)) {
                    // Extracted name is found as an exact word in name
                    weight = WEIGHT_NAME_MATCH;
                } else if (resolvedName.contains(extractedName)) {
                    // Extracted name is found partly in name
                    weight = WEIGHT_NAME_PART_MATCH;
                }

                // Missing alternate names are treated like an empty string.
                String alternateNames = candidate.getAlternateNames();
                String[] altNames = (alternateNames == null ? "" : alternateNames).split(",");
                float altEditDist = 0;
                for (String altName : altNames) {
                    if (altName.contains(extractedName)) {
                        altEditDist += StringUtils.getLevenshteinDistance(extractedName, altName);
                    }
                }
                // split() returns an empty array for strings made only of
                // commas; the average would then be 0/0 = NaN, and adding NaN
                // to an int weight resets it to 0, losing the name bonus.
                if (altNames.length > 0) {
                    //lesser the edit distance more should be the weight
                    weight += getCalibratedWeight(altNames.length, altEditDist);
                }

                //Give preference to sorted results. 0th result should have more priority
                weight += (cur.size() - i) * WEIGHT_SORT_ORDER;

                candidate.setWeight(weight);
                pq.add(candidate);
            }

            List<Location> resultList = new ArrayList<>();
            for (int i = 0; i < count && !pq.isEmpty(); i++) {
                resultList.add(pq.poll());
            }

            resolvedEntities.put(extractedName, resultList);
        }
    }

    /**
     * Returns a weight derived from the alternate names of a candidate:
     * <br/><br/>
     * altNamesSize * WEIGHT_SIZE_ALT_NAME - (altEditDist / altNamesSize)
     * <br/><br/>
     * altNamesSize * WEIGHT_SIZE_ALT_NAME gives more priority to results with
     * more alternate names.<br/>
     * altEditDist / altNamesSize is the average edit distance: the lower the
     * average, the higher the overall expression.
     *
     * @param altNamesSize count of alternate names
     * @param altEditDist sum of the individual edit distances
     * @return the calibrated weight, or 0 when there is no alternate name
     */
    public float getCalibratedWeight(int altNamesSize, float altEditDist) {
        if (altNamesSize == 0) {
            // No alternate name: nothing to reward, and the average below
            // would be a division by zero (NaN or infinite result).
            return 0f;
        }
        return altNamesSize * WEIGHT_SIZE_ALT_NAME - (altEditDist / altNamesSize);
    }

    /**
     * Builds the gazetteer index line by line. Nothing is done when the
     * target directory already holds a Lucene index.
     * <p>
     * Lines that cannot be indexed because of a runtime error are logged and
     * skipped. When the build is interrupted by an I/O error, the partial
     * index is rolled back, so that a later call starts again from scratch
     * instead of finding an incomplete index.
     *
     * @param gazetteerPath path of the gazetteer file (tab separated, UTF-8)
     * @param indexerPath path to the created Lucene index directory.
     * @param reverseGeocodingEnabled whether spatial fields are added to each
     * document
     * @throws IOException if the gazetteer cannot be read or the index cannot
     * be written
     */
    public void buildIndex(String gazetteerPath, String indexerPath, boolean reverseGeocodingEnabled)
            throws IOException {
        File indexfile = new File(indexerPath);
        indexDir = FSDirectory.open(indexfile.toPath());
        if (DirectoryReader.indexExists(indexDir)) {
            return; // an index is already there
        }

        Logger logger = Logger.getLogger(this.getClass().getName());
        logger.log(Level.WARNING, "Start Building Index for Gazatteer");

        // The gazetteer is opened before the writer: a missing file must not
        // leave a write lock or an empty index behind.
        try (BufferedReader filereader = new BufferedReader(
                new InputStreamReader(new FileInputStream(gazetteerPath), "UTF-8"))) {
            indexWriter = new IndexWriter(indexDir, new IndexWriterConfig(analyzer));
            boolean completed = false;
            try {
                String line;
                int count = 0;
                while ((line = filereader.readLine()) != null) {
                    try {
                        count += 1;
                        if (count % 100000 == 0) {
                            logger.log(Level.INFO, "Indexed Row Count: " + count);
                        }
                        addDoc(indexWriter, line, reverseGeocodingEnabled);
                    } catch (RuntimeException re) {
                        logger.log(Level.WARNING, "Skipping... Error on line: " + line, re);
                    }
                }
                completed = true;
            } finally {
                // Always release the writer and its lock: commit a complete
                // build, discard an interrupted one.
                if (completed) {
                    indexWriter.close();
                } else {
                    indexWriter.rollback();
                }
            }
        }
        logger.log(Level.WARNING, "Building Finished");
    }

    /**
     * Indexes one line of the gazetteer file.
     * <p>
     * Columns read from the tab separated line (0-based): 0 id, 1 name,
     * 3 alternate names, 4 latitude, 5 longitude, 7 feature code, 8 country
     * code, 10 admin1 code, 11 admin2 code, 14 population. A latitude or
     * longitude that cannot be parsed is replaced by OUT_OF_BOUNDS, a
     * population that cannot be parsed by 0.
     *
     * @param indexWriter Lucene indexWriter receiving the document
     * @param line a line from the gazetteer file
     * @param reverseGeocodingEnabled whether the spatial fields needed for
     * reverse geocoding are added to the document
     * @throws IOException if the document cannot be written to the index
     * @throws IllegalArgumentException if the line has fewer than 15 columns
     * @throws NumberFormatException if the id is not an integer
     */
    private void addDoc(IndexWriter indexWriter, final String line, final boolean reverseGeocodingEnabled)
            throws IOException {
        String[] tokens = line.split("\t");
        if (tokens.length < 15) {
            throw new IllegalArgumentException(
                    "Expected at least 15 tab separated columns but found " + tokens.length);
        }

        int id = Integer.parseInt(tokens[0]);
        String name = tokens[1];
        String alternatenames = tokens[3];

        double latitude;
        try {
            latitude = Double.parseDouble(tokens[4]);
        } catch (NumberFormatException e) {
            latitude = OUT_OF_BOUNDS;
        }
        double longitude;
        try {
            longitude = Double.parseDouble(tokens[5]);
        } catch (NumberFormatException e) {
            longitude = OUT_OF_BOUNDS;
        }

        // Parsed as a long: the field is stored and sorted as a long, and a
        // value above Integer.MAX_VALUE must not silently become 0.
        long population;
        try {
            population = Long.parseLong(tokens[14]);
        } catch (NumberFormatException e) {
            population = 0;// Treat as population does not exists
        }

        // Additional fields to rank more known locations higher
        // All available codes can be viewed on www.geonames.org
        String featureCode = tokens[7];// more granular category
        String countryCode = tokens[8];
        String admin1Code = tokens[10];// eg US State
        String admin2Code = tokens[11];// eg county

        Document doc = new Document();
        doc.add(new IntField(FIELD_NAME_ID, id, Field.Store.YES));
        doc.add(new TextField(FIELD_NAME_NAME, name, Field.Store.YES));
        doc.add(new DoubleField(FIELD_NAME_LONGITUDE, longitude, Field.Store.YES));
        doc.add(new DoubleField(FIELD_NAME_LATITUDE, latitude, Field.Store.YES));
        doc.add(new TextField(FIELD_NAME_ALTERNATE_NAMES, alternatenames, Field.Store.YES));
        doc.add(new TextField(FIELD_NAME_FEATURE_CODE, featureCode, Field.Store.YES));
        doc.add(new TextField(FIELD_NAME_COUNTRY_CODE, countryCode, Field.Store.YES));
        doc.add(new TextField(FIELD_NAME_ADMIN1_CODE, admin1Code, Field.Store.YES));
        doc.add(new TextField(FIELD_NAME_ADMIN2_CODE, admin2Code, Field.Store.YES));
        doc.add(new NumericDocValuesField(FIELD_NAME_POPULATION, population));//sort enabled field

        if (reverseGeocodingEnabled) {
            Point point = ctx.makePoint(longitude, latitude);
            for (IndexableField f : strategy.createIndexableFields(point)) {
                doc.add(f);
            }
        }

        // A write failure is reported to the caller instead of being printed
        // and ignored, which silently produced an incomplete index.
        indexWriter.addDocument(doc);
    }

    /**
     * Closes the index reader held by this instance, if there is one.
     *
     * @throws IOException if closing the reader fails
     */
    @Override
    public void close() throws IOException {
        final IndexReader heldReader = this.indexReader;
        if (heldReader == null) {
            return; // no reader was opened by this instance
        }
        heldReader.close();
    }

    /**
     * Serialises the resolved entities to JSON with Gson and prints the
     * result, followed by a line separator, on the given stream.
     *
     * @param resolvedEntities map of resolved entities
     * @param out the print stream receiving the JSON text
     */
    public static void writeResultJson(Map<String, List<Location>> resolvedEntities, PrintStream out) {
        final Gson serializer = new Gson();
        final String json = serializer.toJson(resolvedEntities);
        out.println(json);
    }

    /**
     * Prints the resolved entities on the given stream as a hand-formatted,
     * JSON-like array: one object per name, each holding the string form of
     * its locations.
     *
     * @deprecated Use writeResultJson instead
     * @param resolvedEntities map of resolved entities
     * @param out the print stream for writing output
     */
    @Deprecated
    public static void writeResult(Map<String, List<Location>> resolvedEntities, PrintStream out) {
        out.println("[");
        //TODO: use org.json.JSONArray and remove this custom formatting code
        java.util.Iterator<String> nameCursor = resolvedEntities.keySet().iterator();
        while (nameCursor.hasNext()) {
            String entityName = nameCursor.next();
            out.println("{\"" + entityName + "\" : [");

            java.util.Iterator<Location> locationCursor = resolvedEntities.get(entityName).iterator();
            while (locationCursor.hasNext()) {
                Location location = locationCursor.next();
                // every location but the last one is followed by a comma
                if (locationCursor.hasNext()) {
                    out.println(location + ",");
                } else {
                    out.println(location);
                }
            }

            // same rule for the enclosing objects
            out.println(nameCursor.hasNext() ? "]}," : "]}");
        }
        out.println("]");
    }

    /**
     * Gives the index path read from the configuration file by the
     * no-argument constructor.
     *
     * @return the configured index path, or {@code null} when none was loaded
     */
    public String getIndexerPath() {
        return this.indexerPath;
    }

}