org.opensextant.extractors.geo.PlaceCandidate.java Source code

Introduction

Here is the source code for org.opensextant.extractors.geo.PlaceCandidate.java
Source

/**
 * Copyright 2012-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 *
 */
package org.opensextant.extractors.geo;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.opensextant.data.Geocoding;
import org.opensextant.data.LatLon;
import org.opensextant.data.Place;
import org.opensextant.extraction.TextMatch;
import org.opensextant.util.TextUtils;

/**
 * A PlaceCandidate represents a portion of a document which has been identified
 * as a possible named geographic location. It collects the information found in
 * the document (the evidence) as well as the possible geographic locations the
 * mention could represent (the Places). It also holds the result of the final
 * decision:
 * <ul>
 * <li>bestPlace - of all the places with the same or similar names, which place
 * is it?</li>
 * </ul>
 *
 * @author ubaldino
 * @author dlutz, based on OpenSextant Toolbox
 */
public class PlaceCandidate extends TextMatch {

    /** Lazily computed normalized form of the matched text; filled in by getTextnorm(). */
    private String textnorm = null;

    // -------------- Place/NotPlace state ----------------------
    /** Names of the rules that have been recorded against this candidate. */
    private final Set<String> rules = new HashSet<>();

    // -------------- Disambiguation state ----------------------
    /** Candidate places and their disambiguation scores, keyed by the place key. */
    private final Map<String, ScoredPlace> scoredPlaces = new HashMap<>();
    /** Evidence accumulated from the document about this candidate. */
    private final List<PlaceEvidence> evidence = new ArrayList<>();
    /** The chosen (best) place; null until a choice has been made. */
    private ScoredPlace choice1 = null;
    /** The runner-up place; null when there is none. */
    private ScoredPlace choice2 = null;
    /** Overall confidence in the chosen place; see setConfidence(int). */
    private int confidence = 0;
    /** Hierarchical paths (e.g. 'US.CA') of every place added to this candidate. */
    private final Set<String> hierarchicalPaths = new HashSet<>();
    /** Country codes (e.g. 'US') of every place added to this candidate. */
    private final Set<String> countries = new HashSet<>();

    /**
     * Default weighting increments per feature class, written as "CLASS:weight".
     */
    private static final String[] CLASS_SCALE = { "A:3", "P:2", "L:1", "R:0", "H:1", "V:0", "T:1" };

    /**
     * Weights per feature designation code, written as "CODE:weight".
     */
    private static final String[] DESIGNATION_SCALE = {
            /* Places: cities, villages, ruins, etc.*/
            "PPLC:12", "PPLA:8", "PPLG:7", "PPL:5", "PPLL:2", "PPLQ:2", "PPLX:2",
            /* Administrative regions */
            "ADM1:9", "ADM2:8", "ADM3:7",
            /* Other geographic features */
            "ISL:4", "ISLS:5" };

    /** Parsed form of CLASS_SCALE. */
    private static final Map<String, Integer> classWeight = new HashMap<>();
    /** Parsed form of DESIGNATION_SCALE. */
    private static final Map<String, Integer> designationWeight = new HashMap<>();
    /** Base weight used by scoreFeature when the designation code is not listed. */
    private static final int DEFAULT_DESIGNATION_WT = 2;

    static {
        loadWeights(DESIGNATION_SCALE, designationWeight);
        loadWeights(CLASS_SCALE, classWeight);
    }

    /**
     * Parses each "KEY:weight" entry of a scale and stores it in the target map.
     *
     * @param scale  entries of the form "KEY:weight"
     * @param target map receiving the key to weight pairs
     */
    private static void loadWeights(String[] scale, Map<String, Integer> target) {
        for (int i = 0; i < scale.length; ++i) {
            final String[] keyAndWeight = scale[i].split(":");
            target.put(keyAndWeight[0], Integer.valueOf(keyAndWeight[1]));
        }
    }

    /**
     * Creates an empty candidate: no scored places, no evidence, no rules and no
     * chosen place.
     */
    public PlaceCandidate() {
    }

    /**
     * Records the single, final confidence for this place mention: how sure we
     * are, on a scale of 0 to 100, that the chosen place is the best one. This
     * is separate from the individual score carried by each candidate place.
     *
     * @param c confidence value to store; it is stored as given
     */
    public void setConfidence(int c) {
        this.confidence = c;
    }

    /**
     * Counterpart of setConfidence(int).
     *
     * @return the confidence last stored, or 0 if none was set
     */
    public int getConfidence() {
        return this.confidence;
    }

    /**
     * Lets the caller claim an explicit choice of place. If no explicit choice
     * is made, unchosen places go to disambiguation (see choose()).
     *
     * <ul>
     * <li>A ScoredPlace argument becomes the first choice directly.</li>
     * <li>Any other Place is looked up by key among the scored places; when a
     * match exists, the stored ScoredPlace becomes the first choice.</li>
     * <li>A null argument, or a place that is not known to this candidate,
     * leaves the current choice untouched.</li>
     * </ul>
     *
     * @param geo the place to choose; may be null, in which case nothing happens
     */
    public void choose(Place geo) {
        if (geo == null) {
            // Nothing to choose; previously this failed with a NullPointerException.
            return;
        }
        if (geo instanceof ScoredPlace) {
            choice1 = (ScoredPlace) geo;
            return;
        }
        // Single lookup instead of containsKey() followed by get().
        ScoredPlace known = scoredPlaces.get(geo.getKey());
        if (known != null) {
            choice1 = known;
        }
        // Otherwise: unknown place, deliberately ignored.
    }

    /**
     * Provides the normalized form of the matched text: diacritics removed,
     * then punctuation removed, then lower-cased. The value is computed on the
     * first call and cached for later calls.
     *
     * @return normalized version of text.
     */
    public String getTextnorm() {
        if (textnorm != null) {
            return textnorm;
        }
        final String normalized = TextUtils.removePunctuation(TextUtils.removeDiacritics(getText())).toLowerCase();
        textnorm = normalized;
        return textnorm;
    }

    // ---- surrounding-token state ---------
    //

    /** Tokens found just before the match; null until set. */
    private String[] preTokens = null;
    /** Tokens found just after the match; null until set. */
    private String[] postTokens = null;
    /**
     * Window size handed to TextUtils.get_text_window when collecting the text
     * around a match. A true constant, so it is shared by all instances rather
     * than stored per object.
     */
    private static final int DEFAULT_TOKEN_SIZE = 40;

    /**
     * Captures a rough view of the tokens on either side of the match. This
     * could be optimized by taking the token list from SolrTextTagger, which
     * knows the language specifics.
     *
     * @param sourceBuffer the text in which this match was found
     */
    protected void setSurroundingTokens(String sourceBuffer) {
        final int[] bounds = TextUtils.get_text_window(start, end - start, sourceBuffer.length(),
                DEFAULT_TOKEN_SIZE);

        /*
         * Tokens are whitespace delimited for now.
         * TODO: ensure whole tokens are retrieved.
         */

        // bounds[0]..bounds[1] is the slice used for the pre-match tokens.
        final String before = sourceBuffer.substring(bounds[0], bounds[1]);
        setPrematchTokens(TextUtils.tokensRight(before));

        // bounds[2]..bounds[3] is the slice used for the post-match tokens.
        final String after = sourceBuffer.substring(bounds[2], bounds[3]);
        setPostmatchTokens(TextUtils.tokensLeft(after));
    }

    /**
     * Common evidence flags -- isCountry, isContinent, isPerson,
     * isOrganization, isAbbreviation, and isAcronym. All default to false and
     * are public fields; nothing in this class sets them.
     * NOTE(review): presumably set by upstream rules or taggers -- confirm
     * against callers.
     */
    public boolean isCountry = false;
    public boolean isContinent = false;
    public boolean isPerson = false;
    public boolean isOrganization = false;
    public boolean isAbbreviation = false;
    public boolean isAcronym = false;

    /**
     * Resolves this candidate to its geocoding result: once the candidate has
     * been scored, the best place is the answer for this name in context.
     * Runs choose() first so that a selection is made if none exists yet.
     *
     * @return the chosen place, as reported by getChosen()
     */
    public Geocoding getGeocoding() {
        this.choose();
        final ScoredPlace best = getChosen();
        return best;
    }

    /**
     * Plain getter; does not trigger any ranking.
     *
     * @return the current first choice, or null if nothing has been chosen
     */
    public ScoredPlace getChosen() {
        return this.choice1;
    }

    /**
     * Alias for getChosen().
     *
     * @return whatever getChosen() returns
     */
    public ScoredPlace getFirstChoice() {
        final ScoredPlace first = getChosen();
        return first;
    }

    /**
     * Ranks the scored places and records the top one as first choice and, if
     * there is more than one, the next as second choice together with its
     * score. Does nothing when a first choice already exists.
     *
     * When there are no scored places at all, the choices stay null instead of
     * failing, so getChosen() returns null for an empty candidate.
     *
     * Typical usage:
     *
     * <pre>
     * choose();    // does the sorting work; has a performance cost
     * getChosen(); // plain getter; no performance cost
     * </pre>
     *
     * Ranking relies on the natural ordering of ScoredPlace
     * (Collections.sort); the first element after sorting is taken as best.
     */
    public void choose() {
        if (choice1 != null) {
            // Already chosen, either explicitly or by an earlier call.
            return;
        }
        if (scoredPlaces.isEmpty()) {
            // Nothing to rank; get(0) below would otherwise throw.
            return;
        }

        List<ScoredPlace> ranked = new ArrayList<>(scoredPlaces.values());
        Collections.sort(ranked);

        choice1 = ranked.get(0);
        if (ranked.size() > 1) {
            choice2 = ranked.get(1);
            secondPlaceScore = choice2.getScore();
        }
    }

    /**
     * Tells whether the first choice fails to beat the second choice on score.
     * Only meaningful after choose() has ranked the scored places; without
     * both a first and a second choice the answer is always false.
     *
     * @return true when the first choice's score is not strictly greater than
     *         the second choice's score
     */
    public boolean isAmbiguous() {
        if (choice1 == null || choice2 == null) {
            return false;
        }
        // A strict "greater than" test avoids relying on floating-point equality.
        final boolean firstWins = choice1.getScore() > choice2.getScore();
        return !firstWins;
    }

    /** Score of the runner-up place; stays at -1 until choose() finds one. */
    private double secondPlaceScore = -1;

    /**
     * Reports the runner-up's score. Call only after choose().
     *
     * @return the second choice's score, or -1 if no second choice was recorded
     */
    public double getSecondChoiceScore() {
        return this.secondPlaceScore;
    }

    /**
     * Plain getter for the runner-up place.
     *
     * @return the second choice, or null if none has been recorded
     */
    public ScoredPlace getSecondChoice() {
        return this.choice2;
    }

    /**
     * Exposes all scored places held by this candidate. The collection is the
     * live values view of the internal map, not a copy.
     *
     * @return the scored places
     */
    public Collection<ScoredPlace> getPlaces() {
        final Collection<ScoredPlace> all = this.scoredPlaces.values();
        return all;
    }

    /**
     * Adds a place scored with defaultScore(Place) and records the
     * "DefaultScore" rule on this candidate.
     *
     * @param place the place to add
     */
    public void addPlace(ScoredPlace place) {
        final double base = defaultScore(place);
        addPlace(place, base);
        rules.add("DefaultScore");
    }

    /**
     * Checks whether "DefaultScore" is the one and only rule recorded so far.
     *
     * @return true if exactly one rule is recorded and it is "DefaultScore"
     */
    public boolean hasDefaultRuleOnly() {
        if (rules.size() != 1) {
            return false;
        }
        return rules.contains("DefaultScore");
    }

    /**
     * Adds a place with a specific score. The score is set on the place, the
     * place is stored under its key (replacing any place already stored under
     * that key), and its hierarchical path and country code are recorded for
     * presentInHierarchy / presentInCountry.
     *
     * Null paths and null country codes are not recorded, so lookups with a
     * null argument do not report a false match.
     *
     * @param place the place to add
     * @param score the score to assign to the place
     */
    public void addPlace(ScoredPlace place, Double score) {
        place.setScore(score);
        this.scoredPlaces.put(place.getKey(), place);

        // 'US.CA' or 'US.06', etc.
        String path = place.getHierarchicalPath();
        if (path != null) {
            this.hierarchicalPaths.add(path);
        }
        // 'US'
        String cc = place.getCountryCode();
        if (cc != null) {
            this.countries.add(cc);
        }
    }

    // Relative weights of the three components combined by defaultScore(Place).
    // The three values add up to 1.0.
    /** Weight applied to the name-match score. */
    public static final double NAME_WEIGHT = 0.2;
    /** Weight applied to the feature score. */
    public static final double FEAT_WEIGHT = 0.1;
    /** Weight applied to the place's ID bias. */
    public static final double LOCATION_BIAS_WEIGHT = 0.7;

    /**
     * Scores a place for this candidate using only the place's own properties:
     * name match, feature type and ID bias. Context, document properties and
     * other evidence play no part here.
     *
     * The three components are combined with NAME_WEIGHT, FEAT_WEIGHT and
     * LOCATION_BIAS_WEIGHT and the sum is multiplied by 10. The result is
     * intended to land roughly in 0..10. Scores are relative, so the range is
     * not enforced, but staying in a known range keeps later rule comparisons
     * sane.
     *
     * @param g the place to score
     * @return the default score for the place
     */
    public double defaultScore(Place g) {
        final double nameScore = scoreName(g);
        final double featureScore = scoreFeature(g);
        final double biasScore = g.getId_bias();

        final double weighted = (NAME_WEIGHT * nameScore) + (FEAT_WEIGHT * featureScore)
                + (LOCATION_BIAS_WEIGHT * biasScore);
        return 10 * weighted;
    }

    /**
     * Produces a goodness score for how well the place's name matches the
     * matched text, nominally in the range 0 to 1.0. The bonuses and penalties
     * below can push the value slightly above 1.0 or below 0.
     *
     * The score starts at the length of the normalized text, is reduced by the
     * Levenshtein distance to the place's normalized name, adjusted as listed
     * below, and finally divided by the starting length.
     *
     * Trivial examples of name matching:
     *
     * <pre>
     *  given some patterns, 'geo' match Text
     *
     *   case 1. 'Alberta' matches ALBERTA or alberta just fine.
     *   case 2. 'La' matches LA, however, knowing "LA" is an acronym/abbreviation
     *       adds to the score of any geo that actually is "LA"
     *   case 3. 'Afghanestan' matches Afghanistan, but decrement because it is not perfectly spelled.
     * </pre>
     *
     * If the normalized text is empty (for example, the match was all
     * punctuation) there is nothing to compare and the score is 0, rather than
     * the NaN or infinity a division by zero would produce.
     *
     * @param g the place whose name is compared with this candidate's text
     * @return the name score
     */
    protected double scoreName(Place g) {
        final String norm = getTextnorm();
        final int startingScore = norm.length();
        if (startingScore == 0) {
            // Avoid a floating-point divide by zero below.
            return 0;
        }

        int score = startingScore - StringUtils.getLevenshteinDistance(norm, g.getNamenorm());

        // Upper-case text matching an abbreviation or an upper-case name is a plus.
        if (isUpper() && (g.isAbbreviation() || TextUtils.isUpper(g.getName()))) {
            ++score;
        }

        final boolean textAscii = isASCII();
        final boolean nameAscii = g.isASCIIName();
        if (textAscii != nameAscii) {
            // Mismatch in name diacritics downgrades the name score.
            --score;
        } else if (textAscii) {
            // Both are plain ASCII.
            ++score;
        }
        return (float) score / startingScore;
    }

    /**
     * Expresses a preference for features that are major places or
     * boundaries, as a feature score nominally on a 0 to 1.0 scale.
     *
     * A weight listed for the place's feature code is used directly. If the
     * code is not listed, the score is DEFAULT_DESIGNATION_WT plus the weight
     * of the feature class, when the class is listed. Either way the total is
     * divided by 10.
     *
     * @param g the place to score
     * @return the feature score
     */
    protected double scoreFeature(Place g) {
        final Integer byCode = designationWeight.get(g.getFeatureCode());
        if (byCode != null) {
            return byCode.intValue() / 10f;
        }

        final Integer byClass = classWeight.get(g.getFeatureClass());
        final int total = (byClass == null) ? DEFAULT_DESIGNATION_WT : DEFAULT_DESIGNATION_WT + byClass.intValue();
        return total / 10f;
    }

    /**
     * Adds to the score of a place already held by this candidate. A place
     * that is not held (looked up by key) is silently ignored.
     *
     * @param place the place whose stored score should be incremented
     * @param score the increment
     */
    public void incrementPlaceScore(Place place, Double score) {
        final ScoredPlace existing = scoredPlaces.get(place.getKey());
        if (existing == null) {
            // Unknown place: nothing to increment, and nothing is logged.
            return;
        }
        existing.incrementScore(score);
    }

    /**
     * Sets the score of a place already held by this candidate. If no place is
     * stored under the given place's key, nothing happens. Otherwise the place
     * is re-added with the new score via addPlace(ScoredPlace, Double).
     *
     * @param place the place to re-score
     * @param score the new score
     */
    public void setPlaceScore(ScoredPlace place, Double score) {
        final boolean known = scoredPlaces.containsKey(place.getKey());
        if (known) {
            addPlace(place, score);
        }
        // Unknown place: silently ignored.
    }

    /**
     * Exposes the rule names recorded on this candidate. This is the internal
     * set itself, not a copy.
     *
     * @return the recorded rule names
     */
    public Collection<String> getRules() {
        return this.rules;
    }

    /**
     * Checks whether a rule name has been recorded on this candidate.
     *
     * @param rule the rule name to look for
     * @return true if the rule is recorded
     */
    public boolean hasRule(String rule) {
        final boolean recorded = this.rules.contains(rule);
        return recorded;
    }

    /**
     * Records a rule name on this candidate.
     *
     * @param rule the rule name to record
     */
    public void addRule(String rule) {
        this.rules.add(rule);
    }

    /**
     * Appends a piece of evidence and, when it names a rule, records that rule
     * on this candidate as well.
     *
     * @param evidence the evidence to add
     */
    public void addEvidence(PlaceEvidence evidence) {
        this.evidence.add(evidence);
        final String ruleName = evidence.getRule();
        if (ruleName == null) {
            return;
        }
        this.rules.add(ruleName);
    }

    /**
     * Wraps a place, rule and weight into a PlaceEvidence and adds it through
     * addEvidence(PlaceEvidence).
     *
     * @param rule   name of the rule providing the evidence
     * @param weight weight of the evidence
     * @param ev     the place serving as evidence
     */
    public void addEvidence(String rule, double weight, Place ev) {
        final PlaceEvidence wrapped = new PlaceEvidence(ev, rule, weight);
        addEvidence(wrapped);
    }

    /**
     * Convenience method: builds a PlaceEvidence from individual attributes
     * and appends it to the evidence list. Attributes passed as null are left
     * unset on the evidence. The rule is stored on the evidence only; it is
     * not added to this candidate's rule set.
     *
     * @param rule   name of the rule providing the evidence
     * @param weight weight of the evidence
     * @param cc     country code, or null
     * @param adm1   ADM1 code, or null
     * @param fclass feature class, or null
     * @param fcode  feature code, or null
     * @param geo    coordinate, or null
     */
    public void addEvidence(String rule, double weight, String cc, String adm1, String fclass, String fcode,
            LatLon geo) {
        final PlaceEvidence item = new PlaceEvidence();
        item.setRule(rule);
        item.setWeight(weight);

        // Only the attributes actually supplied are copied onto the evidence.
        if (null != cc) {
            item.setCountryCode(cc);
        }
        if (null != adm1) {
            item.setAdmin1(adm1);
        }
        if (null != fclass) {
            item.setFeatureClass(fclass);
        }
        if (null != fcode) {
            item.setFeatureCode(fcode);
        }
        if (null != geo) {
            item.setLatLon(geo);
        }

        evidence.add(item);
    }

    /**
     * Records country evidence and applies it to the given place straight
     * away: the evidence is marked evaluated and the place's score is
     * incremented by the weight.
     *
     * @param rule   name of the rule providing the evidence
     * @param weight weight of the evidence, also used as the score increment
     * @param cc     country code the evidence points to
     * @param geo    the place whose score is incremented
     */
    public void addCountryEvidence(String rule, double weight, String cc, Place geo) {
        final PlaceEvidence item = new PlaceEvidence();
        item.setRule(rule);
        item.setWeight(weight);
        item.setCountryCode(cc);
        evidence.add(item);

        // Scored immediately, so flag it as already evaluated.
        item.setEvaluated(true);
        incrementPlaceScore(geo, /*1 x */ weight);
    }

    /**
     * Appends evidence carrying an ADM1 code and its country code. No score is
     * changed here.
     *
     * @param rule   name of the rule providing the evidence
     * @param weight weight of the evidence
     * @param adm1   ADM1 code
     * @param cc     country code
     */
    public void addAdmin1Evidence(String rule, double weight, String adm1, String cc) {
        final PlaceEvidence item = new PlaceEvidence();
        item.setRule(rule);
        item.setWeight(weight);
        item.setAdmin1(adm1);
        item.setCountryCode(cc);
        evidence.add(item);
    }

    /**
     * Appends evidence carrying a feature class. No score is changed here.
     *
     * @param rule   name of the rule providing the evidence
     * @param weight weight of the evidence
     * @param fclass feature class
     */
    public void addFeatureClassEvidence(String rule, double weight, String fclass) {
        final PlaceEvidence item = new PlaceEvidence();
        item.setRule(rule);
        item.setWeight(weight);
        item.setFeatureClass(fclass);
        evidence.add(item);
    }

    /**
     * Appends evidence carrying a feature code. No score is changed here.
     *
     * @param rule   name of the rule providing the evidence
     * @param weight weight of the evidence
     * @param fcode  feature code
     */
    public void addFeatureCodeEvidence(String rule, double weight, String fcode) {
        final PlaceEvidence item = new PlaceEvidence();
        item.setRule(rule);
        item.setWeight(weight);
        item.setFeatureCode(fcode);
        evidence.add(item);
    }

    /**
     * Records coordinate evidence and applies it to the given place straight
     * away: the evidence is marked evaluated and the place's score is
     * incremented by weight times proximityScore.
     *
     * @param rule           name of the rule providing the evidence
     * @param weight         weight of the evidence
     * @param coord          the coordinate found
     * @param geo            the place whose score is incremented
     * @param proximityScore factor applied to the weight for the increment
     */
    public void addGeocoordEvidence(String rule, double weight, LatLon coord, Place geo, double proximityScore) {
        final PlaceEvidence item = new PlaceEvidence();
        item.setRule(rule);
        item.setWeight(weight);
        item.setLatLon(coord);
        evidence.add(item);

        // The indirect link between the found coordinate and the candidate
        // place is assessed right here, so the evidence counts as evaluated.
        item.setEvaluated(true);
        final double increment = weight * proximityScore;
        incrementPlaceScore(geo, increment);
    }

    /**
     * Exposes the evidence accumulated for this candidate. This is the
     * internal list itself, not a copy.
     *
     * @return the evidence list
     */
    public List<PlaceEvidence> getEvidence() {
        return evidence;
    }

    /**
     * Tells whether any scored place has been added.
     *
     * @return true if at least one scored place is held
     */
    public boolean hasPlaces() {
        return scoredPlaces.size() > 0;
    }

    /**
     * Gives the short summary of this candidate, without the list of places.
     *
     * @return summarize(false)
     */
    @Override
    public String toString() {
        final String brief = summarize(false);
        return brief;
    }

    /**
     * Builds a textual summary: the matched text, the confidence and place
     * count, the rules and the evidence. Pass true for a full print-out that
     * also lists every scored place, one per line.
     *
     * @param dumpAll whether to append all scored places
     * @return the summary text
     */
    public String summarize(boolean dumpAll) {
        final StringBuilder out = new StringBuilder(getText());
        out.append(String.format("(C=%d, N=%d)", this.getConfidence(), this.scoredPlaces.size()))
                .append("\nRules=").append(rules.toString())
                .append("\nEvidence=").append(evidence.toString());

        if (!dumpAll) {
            return out.toString();
        }

        out.append("\nPlaces=\n");
        for (ScoredPlace candidate : scoredPlaces.values()) {
            out.append("\t").append(candidate.toString()).append("\n");
        }
        return out.toString();
    }

    /**
     * Tokens that precede the match.
     *
     * @return the preTokens; null if they were never set
     */
    public String[] getPrematchTokens() {
        return this.preTokens;
    }

    /**
     * Stores the tokens that precede the match. The array is kept by
     * reference.
     *
     * @param tok
     *            the preTokens to set
     */
    public void setPrematchTokens(String[] tok) {
        preTokens = tok;
    }

    /**
     * Tokens that follow the match.
     *
     * @return the postTokens; null if they were never set
     */
    public String[] getPostmatchTokens() {
        return this.postTokens;
    }

    /**
     * Stores the tokens that follow the match. The array is kept by
     * reference.
     *
     * @param tok
     *            the postTokens to set
     */
    public void setPostmatchTokens(String[] tok) {
        postTokens = tok;
    }

    /**
     * Checks whether this name occurs at a given hierarchical path, e.g.
     * 'a.b' for province b in country a. Paths are collected from the places
     * added to this candidate.
     *
     * @param path the hierarchical path to test
     * @return true if some added place has exactly this path
     */
    public boolean presentInHierarchy(String path) {
        final boolean found = hierarchicalPaths.contains(path);
        return found;
    }

    /**
     * Checks whether this name occurs in a given country. Country codes are
     * collected from the places added to this candidate.
     *
     * @param cc the country code to test
     * @return true if some added place has this country code
     */
    public boolean presentInCountry(String cc) {
        final boolean found = countries.contains(cc);
        return found;
    }

    /**
     * Counts the different countries that contain this name.
     *
     * @return number of distinct country codes among the added places
     */
    public int distinctCountryCount() {
        return countries.size();
    }

    /**
     * Counts the distinct locations for this name. Scored places are keyed by
     * place ID, which is essentially the location, so this is the number of
     * scored places.
     *
     * @return number of scored places held
     */
    public int distinctLocationCount() {
        return scoredPlaces.size();
    }

}