org.opensextant.extractors.geo.GazetteerMatcher.java Source code

Introduction

Here is the source code for org.opensextant.extractors.geo.GazetteerMatcher.java
Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 *               http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 *
 * Continue contributions:
 *    Copyright 2013-2015 The MITRE Corporation.
 */
package org.opensextant.extractors.geo;

///** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//
//_____                                ____                     __                       __
///\  __`\                             /\  _`\                  /\ \__                   /\ \__
//\ \ \/\ \   _____      __     ___    \ \,\L\_\      __   __  _\ \ ,_\     __       ___ \ \ ,_\
//\ \ \ \ \ /\ '__`\  /'__`\ /' _ `\   \/_\__ \    /'__`\/\ \/'\\ \ \/   /'__`\   /' _ `\\ \ \/
//\ \ \_\ \\ \ \L\ \/\  __/ /\ \/\ \    /\ \L\ \ /\  __/\/>  </ \ \ \_ /\ \L\.\_ /\ \/\ \\ \ \_
// \ \_____\\ \ ,__/\ \____\\ \_\ \_\   \ `\____\\ \____\/\_/\_\ \ \__\\ \__/.\_\\ \_\ \_\\ \__\
//  \/_____/ \ \ \/  \/____/ \/_/\/_/    \/_____/ \/____/\//\/_/  \/__/ \/__/\/_/ \/_/\/_/ \/__/
//          \ \_\
//           \/_/
//
// OpenSextant GazetteerMatcher
//*  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
//*/
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.opensextant.ConfigException;
import org.opensextant.data.LatLon;
import org.opensextant.data.Place;
import org.opensextant.extraction.ExtractionException;
import org.opensextant.extraction.MatchFilter;
import org.opensextant.extraction.SolrMatcherSupport;
import org.opensextant.util.SolrProxy;
import org.opensextant.util.SolrUtil;
import org.opensextant.util.TextUtils;
import org.supercsv.io.CsvMapReader;
import org.supercsv.prefs.CsvPreference;

/**
 *
 * Connects to a Solr sever via HTTP and tags place names in document. The
 * <code>SOLR_HOME</code> environment variable must be set to the location of
 * the Solr server.
 * <p >
 * This class is not thread-safe. It could be made to be with little effort.
 *
 * @author David Smiley - dsmiley@mitre.org
 * @author Marc Ubaldino - ubaldino@mitre.org
 */
public class GazetteerMatcher extends SolrMatcherSupport {

    /*
     * Gazetteer specific stuff:
     */
    private TagFilter filter = null;
    private MatchFilter userfilter = null;
    private MatchFilter continents = null;

    /**
     * invocation counts: default filter count and user filter count
     */
    private long defaultFilterCount = 0;
    private long userFilterCount = 0;
    /**
     * lifecycle counts: default filter and matched counts. Ratio of filtered to
     * (filter+match) gives an idea of false positive rates.
     */
    private long filteredTotal = 0;
    private long matchedTotal = 0;
    private boolean allowLowercaseAbbrev = false;
    /*
     * enable trure for data such as tweets, blogs, etc. where case varies or
     * does not exist
     */
    private boolean allowLowerCase = false;
    // All of these Solr-parameters for tagging are not user-tunable.
    private final ModifiableSolrParams params = new ModifiableSolrParams();
    private SolrGazetteer gazetteer = null;

    public GazetteerMatcher() throws ConfigException {
        this(false);
    }

    /**
     * 
     * @param lowercaseAllowed
     *            variant is case insensitive.
     * 
     * @throws ConfigException
     *             on err
     */
    public GazetteerMatcher(boolean lowercaseAllowed) throws ConfigException {
        initialize();
        allowLowerCase = lowercaseAllowed;

        try {
            continents = new MatchFilter("/filters/continent-filter.txt");
        } catch (IOException err) {
            throw new ConfigException("Could not find continent list.", err);
        }
        // Instance variable that will have the transient payload to tag
        // this is not thread safe and is not static:
        // tag_request = new SolrTaggerRequest(params, SolrRequest.METHOD.POST);
        // Pre-loading the Solr FST
        //
        try {
            filter = new TagFilter();
            /*
             * Stop words should be filtered out by tagger/gazetteer filter
             */
            filter.enableStopwordFilter(true);
            filter.enableCaseSensitive(!allowLowerCase);

            tagText("trivial priming of the solr pump", "__initialization___");
        } catch (ExtractionException initErr) {
            throw new ConfigException("Unable to prime the tagger", initErr);
        }
    }

    @Override
    public void initialize() throws ConfigException {

        super.initialize();

        /*
         * Setup matcher params.
         */
        params.set(CommonParams.FL,
                "id,name,cc,adm1,adm2,feat_class,feat_code,geo,place_id,name_bias,id_bias,name_type");
        params.set("tagsLimit", 100000);
        params.set(CommonParams.ROWS, 100000);
        params.set("subTags", false);

        // we've got the input doc as a string instead; matchText=false means
        // the tagger will not report the text, just span offsets.
        params.set("matchText", false);

        /*
         * Possible overlaps: ALL, NO_SUB, LONGEST_DOMINANT_RIGHT See Solr Text
         * Tagger documentation for details.
         */
        params.set("overlaps", "LONGEST_DOMINANT_RIGHT");
        // params.set("overlaps", "NO_SUB");

        gazetteer = new SolrGazetteer(this.solr);
    }

    @Override
    public String getCoreName() {
        return "gazetteer";
    }

    @Override
    public SolrParams getMatcherParameters() {
        return params;
    }

    /**
     * For use within package or by subclass
     * 
     * @return internal gazetteer instance
     */
    public SolrGazetteer getGazetteer() {
        return this.gazetteer;
    }

    /**
     * A flag that will allow us to tag "in" or "in." as a possible
     * abbreviation. By default such things are not abbreviations, e.g., Indiana
     * is typically IN or In. or Ind., for example. Oregon, OR or Ore. etc.
     *
     * but almost never 'in' or 'or' for those cases.
     *
     * @param b
     *            flag true = allow lower case abbreviations to be tagged, e.g.,
     *            as in social media or
     */
    public void setAllowLowerCaseAbbreviations(boolean b) {
        allowLowercaseAbbrev = b;
    }

    /**
     * User-provided filters to filter out matched names immediately. Avoid
     * filtering out things that are indeed places, but require disambiguation
     * or refinement.
     *
     * @param f
     *            a match filter
     */
    public void setMatchFilter(MatchFilter f) {
        userfilter = f;
    }

    /**
     * Default advanced search.
     * 
     * @see #searchAdvanced(String, boolean, int)
     * 
     * @param place
     * @param as_solr
     * @return
     * @throws SolrServerException
     */
    public List<ScoredPlace> searchAdvanced(String place, boolean as_solr) throws SolrServerException {
        return searchAdvanced(place, as_solr, -1);
    }

    /**
     * This is a variation on SolrGazetteer.search(), just this creates ScoredPlace which is
     * immediately usable with scoring and ranking matches. The score for a ScoredPlace is
     * created when added to PlaceCandidate: a default score is created for the place.
     * 
     * <pre>
     *    Usage:
     *    pc = PlaceCandidate();
     *    list = gaz.searchAdvanced("name:Boston", true)  // solr fielded query used as-is.
     *    for ScoredPlace p: list:
     *        pc.addPlace( p )
     * </pre>
     * 
     * @param place
     *            the place string or text; or a Solr query
     * @param as_solr
     *            the as_solr
     * @param maxLen
     *            max length of gazetteer place names.
     * @return places List of scoreable place entries
     * @throws SolrServerException
     *             the solr server exception
     */
    public List<ScoredPlace> searchAdvanced(String place, boolean as_solr, int maxLen) throws SolrServerException {

        if (as_solr) {
            params.set("q", place);
        } else {
            // Bare keyword query needs to be quoted as "word word word"
            params.set("q", "\"" + place + "\"");
        }

        QueryResponse response = solr.getInternalSolrServer().query(params, SolrRequest.METHOD.GET);

        List<ScoredPlace> places = new ArrayList<>();
        for (SolrDocument solrDoc : response.getResults()) {
            /*
             * Length Filter.  Alternative: store name as string in solr, vice full text field 
             */
            if (maxLen > 0) {
                String nm = SolrProxy.getString(solrDoc, "name");
                if (nm.length() > maxLen) {
                    continue;
                }
            }

            places.add(createPlace(solrDoc));
        }

        return places;
    }

    /**
     * Geotag a buffer and return all candidates of gazetteer entries whose name
     * matches phrases in the buffer.
     *
     * @param buffer
     *            text
     * @param docid
     *            ID
     * @return list of place candidates
     * @throws ExtractionException
     *             on err
     */
    public List<PlaceCandidate> tagText(String buffer, String docid) throws ExtractionException {
        return tagText(buffer, docid, false);
    }

    /**
     * Tag names specifically with Chinese tokenizaiton
     * 
     * @param buffer
     *            text
     * @param docid
     *            ID
     * @return list of place candidates
     * @since 2.7.11
     * @throws ExtractionException
     *             on err
     * 
     */
    public List<PlaceCandidate> tagCJKText(String buffer, String docid) throws ExtractionException {
        return tagText(buffer, docid, false, CJK_TAG_FIELD);
    }

    public List<PlaceCandidate> tagCJKText(String buffer, String docid, boolean tagOnly)
            throws ExtractionException {
        return tagText(buffer, docid, tagOnly, CJK_TAG_FIELD);
    }

    /**
     * Tag place names in arabic.
     * 
     * @param buffer
     *            text
     * @param docid
     *            ID
     * @since 2.7.11
     * @return list of place candidates
     * @throws ExtractionException
     *             on err
     */
    public List<PlaceCandidate> tagArabicText(String buffer, String docid) throws ExtractionException {
        return tagText(buffer, docid, false, AR_TAG_FIELD);
    }

    public List<PlaceCandidate> tagArabicText(String buffer, String docid, boolean tagOnly)
            throws ExtractionException {
        return tagText(buffer, docid, tagOnly, AR_TAG_FIELD);
    }

    /** Most languages */
    public static final String DEFAULT_TAG_FIELD = "name_tag";

    /**
     * Use Solr param 'field' = name_tag_cjk to tag in Asian scripts.
     */
    public static final String CJK_TAG_FIELD = "name_tag_cjk";

    /**
     * Use Solr param 'field = name_tag_ar for Arabic.
     */
    public static final String AR_TAG_FIELD = "name_tag_ar";

    public List<PlaceCandidate> tagText(String buffer, String docid, boolean tagOnly) throws ExtractionException {
        return tagText(buffer, docid, tagOnly, DEFAULT_TAG_FIELD);
    }

    /**
     * Geotag a document, returning PlaceCandidates for the mentions in
     * document. Optionally just return the PlaceCandidates with name only and
     * no Place objects attached. Names of contients are passed back as matches,
     * with geo matches. Continents are filtered out by default.
     *
     * @param buffer
     *            text
     * @param docid
     *            identity of the text
     * @param tagOnly
     *            True if you wish to get the matched phrases only. False if you
     *            want the full list of Place Candidates.
     * @param fld
     *            gazetteer field to use for tagging
     * @return place_candidates List of place candidates
     * @throws ExtractionException
     *             on err
     */
    public List<PlaceCandidate> tagText(String buffer, String docid, boolean tagOnly, String fld)
            throws ExtractionException {
        // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40,
        // "startOffset":38},
        // { "ids":[750308, 2769912, 2770041, 10413973, 10417546],
        // "endOffset":49,
        // "startOffset":41},
        // ...
        // "matchingDocs":{"numFound":75, "start":0, "docs":[ {
        // "place_id":"USGS1992921", "name":"Monterrey", "cc":"PR"}, {
        // "place_id":"USGS1991763", "name":"Monterrey", "cc":"PR"}, ]

        // Reset counts.
        this.defaultFilterCount = 0;
        this.userFilterCount = 0;
        // during post-processing tags we may have to distinguish between tagging/tokenizing 
        // general vs. cjk vs. ar. But not yet though.
        // boolean useGeneralMode = DEFAULT_TAG_FIELD.equals(fld);

        long t0 = System.currentTimeMillis();
        log.debug("TEXT SIZE = {}", buffer.length());
        int[] textMetrics = TextUtils.measureCase(buffer);
        boolean isUpperCase = TextUtils.isUpperCaseDocument(textMetrics);

        params.set("field", fld);
        Map<Integer, Object> beanMap = new HashMap<Integer, Object>(100);
        QueryResponse response = tagTextCallSolrTagger(buffer, docid, beanMap);

        @SuppressWarnings("unchecked")
        List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");

        this.tagNamesTime = response.getQTime();
        long t1 = t0 + tagNamesTime;
        long t2 = System.currentTimeMillis();
        boolean geocode = !tagOnly;

        /*
         * Retrieve all offsets into a long list. These offsets will report a
         * text span and all the gazetteer record IDs that are associated to
         * that span. The text could either be a name, a code or some other
         * abbreviation.
         *
         * For practical reasons the default behavior is to filter trivial spans
         * given the gazetteer data that is returned for them.
         *
         * WARNING: lots of optimizations occur here due to the potentially
         * large volume of tags and gazetteer data that is involved. And this is
         * relatively early in the pipline.
         */
        log.debug("DOC={} TAGS SIZE={}", docid, tags.size());

        TreeMap<Integer, PlaceCandidate> candidates = new TreeMap<Integer, PlaceCandidate>();

        // names matched is used only for debugging, currently.
        Set<String> namesMatched = new HashSet<>();

        tagLoop: for (NamedList<?> tag : tags) {

            int x1 = (Integer) tag.get("startOffset");
            int x2 = (Integer) tag.get("endOffset");
            int len = x2 - x1;
            if (len == 1) {
                // Ignoring place names whose length is less than 2 chars
                ++this.defaultFilterCount;
                continue;
            }
            // +1 char after last matched
            // Could have enabled the "matchText" option from the tagger to get
            // this, but since we already have the content as a String then
            // we might as well not make the tagger do any more work.

            String matchText = (String) tag.get("matchText");
            // Get char immediately following match, for light NLP rules.
            char postChar = 0;
            if (x2 < buffer.length()) {
                postChar = buffer.charAt(x2);
            }

            // Then filter out trivial matches. E.g., Us is filtered out. vs. US would
            // be allowed. If lowercase abbreviations are allowed, then all matches are passed.               
            if (len < 3) {
                if (TextUtils.isASCII(matchText) && !StringUtils.isAllUpperCase(matchText)
                        && !allowLowercaseAbbrev) {
                    ++this.defaultFilterCount;
                    continue;
                }
            }

            if (TextUtils.countFormattingSpace(matchText) > 1) {
                // Phrases with words broken across more than one line are not
                // valid matches.
                // Phrase with a single TAB is okay
                ++this.defaultFilterCount;
                continue;
            }
            // Eliminate any newlines and extra whitespace in match
            matchText = TextUtils.squeeze_whitespace(matchText);

            /**
             * Filter out trivial tags. Due to normalization, we tend to get
             * lots of false positives that can be eliminated early.
             */
            if (filter.filterOut(matchText)) {
                ++this.defaultFilterCount;
                continue;
            }

            PlaceCandidate pc = new PlaceCandidate();
            pc.start = x1;
            pc.end = x2;
            pc.setText(matchText);

            /*
             * Filter out tags that user determined ahead of time as not-places
             * for their context.
             *
             */
            if (userfilter != null) {
                if (userfilter.filterOut(pc.getTextnorm())) {
                    log.debug("User Filter:{}", matchText);
                    ++this.userFilterCount;
                    continue;
                }
            }

            /*
             * Continent filter is needed, as many mentions of contients confuse
             * real geotagging/geocoding.
             * 
             */
            if (continents.filterOut(pc.getTextnorm())) {
                pc.isContinent = true;
                pc.setFilteredOut(true);
                candidates.put(pc.start, pc);
                continue;
            }
            /*
             * Found UPPER CASE text in a mixed-cased document.
             * Conservatively, this is likely an acronym or some heading.
             * But possibly still a valid place name.
             * HEURISTIC: acronyms are relatively short. 
             * HEURISTIC: region codes can be acronyms and are valid places
             * 
             * using such place candidates you may score short acronym matches lower than fully named ones.
             * when inferring boundaries (states, provinces, etc)
             */
            if (!isUpperCase && pc.isUpper() && len < 5) {
                pc.isAcronym = true;
            }

            /*
             * Everything Else.
             */
            pc.setSurroundingTokens(buffer);

            @SuppressWarnings("unchecked")
            List<Integer> placeRecordIds = (List<Integer>) tag.get("ids");

            /*
             * This assertion is helpful in debugging: assert
             * placeRecordIds.size() == new
             * HashSet<Integer>(placeRecordIds).size() : "ids should be unique";
             */
            // assert!placeRecordIds.isEmpty();
            namesMatched.clear();

            //double maxNameBias = 0.0;
            for (Integer solrId : placeRecordIds) {
                // Yes, we must cast here.
                // As long as createTag generates the correct type stored in
                // beanMap we are fine.
                ScoredPlace pGeo = (ScoredPlace) beanMap.get(solrId);
                // assert pGeo != null;

                // Optimization: abbreviation filter.
                //
                // Do not add PlaceCandidates for lower case tokens that are
                // marked as Abbreviations, unless flagged to do so.
                //
                // DEFAULT behavior is to avoid lower case text that is tagged
                // as an abbreviation in gazetteer,
                //
                // Common terms: in, or, oh, me, us, we, etc. Are all not
                // typically place names or valid abbreviations in text.
                //
                if (!allowLowercaseAbbrev && pGeo.isAbbreviation() && pc.isLower()) {
                    log.debug("Ignore lower case term={}", pc.getText());
                    // DWS: TODO what if there is another pGeo for this pc that
                    // isn't an abbrev? Therefore shouldn't we continue this
                    // loop and not tagLoop?
                    continue tagLoop;
                }

                /*
                 * If text match contains "." and it matches any abbreviation,
                 * mark the candidate as an abbrev. TODO: Possibly best confirm
                 * this by sentence detection, as well. However, this pertains
                 * to text spans that contain "." within the bounds, and not
                 * likely an ending. E.g., "U.S." or "U.S" are trivial examples;
                 * "US" is more ambiguous, as we need to know if document is
                 * upperCase.
                 * 
                 * Any place abbreviation will trigger isAbbreviation = true
                 * 
                 * "IF YOU FIND US HERE"  the term 'US' is ambiguous here, so 
                 * it is not classified as an abbreviation. Otherwise if you have
                 * "My organization YAK happens to coincide with a place named Yak.
                 * But we first must determine if 'YAK' is a valid abbreviation for an actual place.
                 * HEURISTIC: place abbreviations are relatively short, e.g. one word(len=7 or less)
                 */
                if (len < 8 && !pc.isAbbreviation) {
                    assessAbbreviation(pc, pGeo, postChar, isUpperCase);
                }

                if (log.isDebugEnabled()) {
                    namesMatched.add(pGeo.getName());
                }

                /**
                 * Country names are the only names you can reasonably set ahead
                 * of time. All other names need to be assessed in context.
                 * Negate country names, e.g., "Georgia", by exception.
                 */
                if (pGeo.isCountry()) {
                    pc.isCountry = true;
                }

                if (geocode) {
                    pGeo.defaultHierarchicalPath();
                    // Default score for geo will be calculated in PlaceCandidate
                    pc.addPlace(pGeo);
                }
            }

            // If geocoding, skip this PlaceCandidate if has no places (e.g. due
            // to filtering)
            if (geocode && !pc.hasPlaces()) {
                log.debug("Place has no places={}", pc.getText());
                continue;
            } else {
                if (log.isDebugEnabled()) {
                    log.debug("Text {} matched {}", pc.getText(), namesMatched);
                }
            }

            candidates.put(pc.start, pc);
        } // for tag
        long t3 = System.currentTimeMillis();

        // this.tagNamesTime = (int)(t1 - t0);
        this.getNamesTime = (int) (t2 - t1);
        this.totalTime = (int) (t3 - t0);

        if (log.isDebugEnabled()) {
            summarizeExtraction(candidates.values(), docid);
        }

        this.filteredTotal += this.defaultFilterCount + this.userFilterCount;
        this.matchedTotal += candidates.size();

        return new ArrayList<PlaceCandidate>(candidates.values());
    }

    private void assessAbbreviation(PlaceCandidate pc, ScoredPlace pGeo, char postChar, boolean docIsUPPER) {
        /* - Block re-entry to this logic. If Match is already marked as ABBREV, 
         * then no need to review
         * - We don't consider abbreviations longer than N=7 chars.
         * - If matched geo-location does not represent an abbreviation than this does not apply.
         * 
         * - postchar = 0 (null) means there is no chars after the match because Match is at end of text buffer.
         */
        if (!pGeo.isAbbreviation() || postChar == 0) {
            return;
        }

        if (postChar == '.') {
            // Add the post-punctuation to the match ONLY if a potential GEO matches. 
            pc.isAbbreviation = true;
            pc.end += 1;
            pc.setTextOnly(String.format("%s.", pc.getText()));
        } else if (pc.getText().contains(".")) {
            /* TODO: contains abbreviation. E.g. ,'St. Paul' is not fully 
             * an abbreviation.
             */
            pc.isAbbreviation = true;
        } else if (!docIsUPPER && pc.isUpper()) {
            /* Hack Warning: NOT everything UPPERCASE in a document
             * is an abbrev. 
             */
            // Upper case place matched
            pc.isAbbreviation = true;
            // Matched text is UPPER in a non-upper case document                        
            pc.isAcronym = true;
        }
        // Lower or mixed-case abbreviations without "." are not
        // tagged Mr, Us, etc.

    }

    /**
     * This computes the cumulative filtering rate of user-defined and other
     * non-place name patterns
     *
     * @return filtration ratio
     */
    public double getFiltrationRatio() {
        return (double) this.filteredTotal / (filteredTotal + matchedTotal);
    }

    @Override
    public Object createTag(SolrDocument tag) {
        return createPlace(tag);
    }

    /**
     * Adapt the SolrProxy method for creating a Place object. Here, for
     * disambiguation down stream gazetteer metrics are added.
     *
     * @param gazEntry
     *            a solr record from the gazetteer
     * @return Place (Xponents) object
     */
    public static ScoredPlace createPlace(SolrDocument gazEntry) {

        // Creates for now org.opensextant.placedata.Place
        //Place bean = SolrProxy.createPlace(gazEntry);
        String plid = SolrUtil.getString(gazEntry, "place_id");
        String nm = SolrProxy.getString(gazEntry, "name");
        ScoredPlace bean = new ScoredPlace(plid, nm);
        SolrProxy.populatePlace(gazEntry, bean);

        return bean;
    }

    private void summarizeExtraction(Collection<PlaceCandidate> candidates, String docid) {
        if (candidates == null) {
            log.error("Something is very wrong.");
            return;
        }

        log.debug("DOC=" + docid + " PLACE CANDIDATES SIZE = " + candidates.size());
        Map<String, Integer> countries = new HashMap<String, Integer>();
        Map<String, Integer> places = new HashMap<String, Integer>();
        int nullCount = 0;

        // This loops through findings and reports out just Country names for
        // now.
        for (PlaceCandidate candidate : candidates) {
            boolean dobreak = false;
            String namekey = TextUtils.normalizeTextEntity(candidate.getText()); // .toLowerCase();
            if (namekey == null) {
                // Why is this Null?
                countries.put("null", ++nullCount);
                continue;
            } else {
                namekey = namekey.toLowerCase();
            }

            for (Place p : candidate.getPlaces()) {
                if (p == null) {
                    continue;
                }

                /*
                 * if (p.isAbbreviation()) { log.debug(
                 * "Ignore all abbreviations for now " + candidate.getText());
                 * dobreak = true; break; }
                 */
                if (p.isCountry()) {
                    Integer count = countries.get(namekey);
                    if (count == null) {
                        count = new Integer(1);
                        countries.put(namekey, count);
                    }
                    ++count;
                    countries.put(namekey, count);
                    dobreak = true;
                    break;
                } else {
                    Integer count = places.get(namekey);
                    if (count == null) {
                        count = new Integer(1);
                        places.put(namekey, count);
                    }
                    ++count;
                    places.put(namekey, count);
                }
            }
            if (dobreak) {
                continue;
            }
        }
        log.debug("Countries found:" + countries.toString());
        log.debug("Places found:" + places.toString());
    }

    /*
     * We can filter out trivial place name matches that we know to be close to
     * false positives 100% of the time. E.g,. "way", "back", "north" You might
     * consider two different stop filters, Is "North" different than "north"?
     * This first pass filter should really filter out only text we know to be
     * false positives regardless of case.
     * 
     * Filter out unwanted tags via GazetteerETL data model or in Solr index. If
     * you believe certain items will always be filtered then set name_bias >
     * 0.0
     */
    class TagFilter extends MatchFilter {
        /**
         * This may need to be turned off for processing lower-case or dirty
         * data.
         */
        boolean filter_stopwords = true;
        boolean filter_on_case = true;
        Set<String> stopTerms = null;

        /**
         * NOTE:  This expects the files are all available. This fails if resource files are missing.
         * 
         * @throws ConfigException if any file has a problem. 
         */
        public TagFilter() throws ConfigException {
            super();
            stopTerms = new HashSet<>();
            String[] defaultNonPlaceFilters = { "/filters/non-placenames.csv",
                    "/filters/non-placenames,acronym.csv" };
            for (String f : defaultNonPlaceFilters) {
                stopTerms.addAll(loadExclusions(GazetteerMatcher.class.getResourceAsStream(f)));
            }
        }

        public void enableStopwordFilter(boolean b) {
            filter_stopwords = b;
        }

        public void enableCaseSensitive(boolean b) {
            filter_on_case = b;
        }

        @Override
        public boolean filterOut(String t) {
            if (filter_on_case && StringUtils.isAllLowerCase(t)) {
                return true;
            }

            if (filter_stopwords) {
                if (stopTerms.contains(t.toLowerCase())) {
                    return true;
                }
            }

            return false;
        }
    }

    /**
     * Find places located at a particular location.
     *
     * @param yx
     *            location
     * @return list of places near location
     * @throws SolrServerException
     *             on err
     * @deprecated Use SolrGazetteer directly
     */
    @Deprecated
    public List<Place> placesAt(LatLon yx) throws SolrServerException {
        return gazetteer.placesAt(yx, 50);
    }

    /**
     * Exclusions have two columns in a CSV file. 'exclusion', 'category'
     *
     * "#" in exclusion column implies a comment.
     * Call is responsible for getting I/O stream.
     *  
     * @param filestream
     *            URL/file with exclusion terms
     * @return set of filter terms
     * @throws ConfigException
     *             if filter is not found
     */
    public static Set<String> loadExclusions(InputStream filestream) throws ConfigException {
        /*
         * Load the exclusion names -- these are terms that are gazeteer
         * entries, e.g., gazetteer.name = <exclusion term>, that will be marked
         * as search_only = true.
         */
        try (Reader termsIO = new InputStreamReader(filestream)) {
            CsvMapReader termreader = new CsvMapReader(termsIO, CsvPreference.EXCEL_PREFERENCE);
            String[] columns = termreader.getHeader(true);
            Map<String, String> terms = null;
            HashSet<String> stopTerms = new HashSet<String>();
            while ((terms = termreader.read(columns)) != null) {
                String term = terms.get("exclusion");
                if (StringUtils.isBlank(term) || term.startsWith("#")) {
                    continue;
                }
                stopTerms.add(term.toLowerCase().trim());
            }
            termreader.close();
            return stopTerms;
        } catch (Exception err) {
            throw new ConfigException("Could not load exclusions.", err);
        }
    }
}