org.mitre.opensextant.extraction.PlacenameMatcher.java Source code

Introduction

Here is the source code for org.mitre.opensextant.extraction.PlacenameMatcher.java
Source

/**
 * Copyright 2009-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 *
 */
package org.mitre.opensextant.extraction;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.mitre.opensextant.placedata.Place;
import org.mitre.opensextant.placedata.PlaceCandidate;
import org.mitre.opensextant.util.TextUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tags place names in text by sending the text to the {@code /tag} request
 * handler of the "gazetteer" Solr core and converting the response into
 * {@link PlaceCandidate}s. The Solr home location is read from the
 * <code>solr.solr.home</code> system property when the shared Solr proxy is
 * first created.
 *
 * <p>The Solr proxy and the request parameters are static and shared by all
 * instances. Each instance additionally owns a tagger request and a scratch
 * map that are overwritten on every {@link #tagText(String, String)} call, so
 * a single instance must not be used by more than one thread at a time.
 *
 * @author David Smiley - dsmiley@mitre.org
 * @author Marc Ubaldino - ubaldino@mitre.org
 */
public class PlacenameMatcher {

    /*
     * Generic Solr Matcher stuff:
     */
    /** Solr request handler used for tagging. */
    protected final static String requestHandler = "/tag";
    protected final Logger log = LoggerFactory.getLogger(this.getClass());
    /** Debug-level flag, captured once when the instance is created. */
    protected final boolean debug = log.isDebugEnabled();

    /**
     * In the interest of optimization the Solr parameters and the Solr proxy
     * are static class attributes shared across all instances. Both are set
     * once by {@link #initialize()}.
     */
    protected static SolrParams params = null;
    protected static SolrProxy solr = null;

    /*
     * Gazetteer specific stuff:
     */
    /** Rule name recorded on a candidate when a positive/negative name bias is found. */
    private static final String APRIORI_NAME_RULE = "AprioriNameBias";
    /** Per-instance request; its input is replaced on every tagText() call. */
    private final SolrTaggerRequest tag_request;
    /** Scratch map of Solr record id to Place, rebuilt on every tagText() call. */
    private final Map<Integer, Place> beanMap = new HashMap<>(100); // initial size
    /** Stop filter applied to gazetteer names and to matched text (lower-cased). */
    private final MatchFilter filter;
    private boolean allow_lowercase_abbrev = false;

    /**
     * Creates a matcher: initializes the shared Solr state if needed, loads
     * the tagging filter from <code>/filters/tagging-filters.txt</code> and
     * runs one trivial tagging request to pre-load the Solr FST.
     *
     * @throws IOException if initialization fails or the priming request
     * cannot be tagged
     */
    public PlacenameMatcher() throws IOException {
        PlacenameMatcher.initialize();

        filter = new MatchFilter("/filters/tagging-filters.txt");

        // Instance variable that will have the transient payload to tag
        // this is not thread safe and is not static:
        tag_request = new SolrTaggerRequest(params, SolrRequest.METHOD.POST);

        // Pre-loading the Solr FST
        //
        try {
            tagText("trivial priming of the solr pump", "__initialization___");
        } catch (MatcherException initErr) {
            throw new IOException("Unable to prime the tagger", initErr);
        }
    }

    /**
     * allow_lowercase_abbrev is a flag that will allow us to tag "in" or "in."
     * as a possible abbreviation. By default such things are not
     * abbreviations, e.g., Indiana is typically IN or In. or Ind., and Oregon
     * is OR or Ore., but almost never "in" or "or".
     *
     * @param b true to keep lower case matches whose gazetteer entry is an
     * abbreviation; false (the default) to drop them
     */
    public void setAllowLowerCaseAbbreviations(boolean b) {
        allow_lowercase_abbrev = b;
    }

    /**
     * Close solr resources. Does nothing if the shared Solr proxy was never
     * created. The static reference is left in place after closing.
     * NOTE(review): presumably tagging after shutdown fails -- confirm
     * against SolrProxy.
     */
    public static void shutdown() {
        if (solr != null) {
            solr.close();
        }
    }

    /**
     * Creates the shared Solr proxy and request parameters on first use;
     * later calls return immediately.
     *
     * <p>The method is synchronized, and the static fields are assigned only
     * after everything has been built, so that concurrent constructors can
     * neither create two proxies nor observe a proxy without its parameters.
     *
     * @throws IOException if the Solr proxy cannot be created
     */
    protected static synchronized void initialize() throws IOException {

        if (solr != null) {
            return;
        }

        ModifiableSolrParams _params = new ModifiableSolrParams();
        _params.set(CommonParams.QT, requestHandler);
        //request all fields in the Solr index
        // Do we need specific fields or *?  If faster use specific fields. TODO.
        //_params.set(CommonParams.FL, "*,score");
        // Note -- removed score for now, as we have not evaluated how score could be used in this sense.
        // Score depends on FST creation and other factors.
        //
        // TODO: verify that all the right metadata is being retrieved here
        _params.set(CommonParams.FL,
                "id,name,cc,adm1,adm2,feat_class,feat_code,lat,lon,place_id,name_bias,id_bias,name_type");

        _params.set("tagsLimit", 100000);
        _params.set(CommonParams.ROWS, 100000);
        _params.set("subTags", false);
        _params.set("matchText", false);//we've got the input doc as a string instead

        /* Possible overlaps: ALL, NO_SUB, LONGEST_DOMINANT_RIGHT
         * See Solr Text Tagger documentation for details.
         */
        _params.set("overlaps", "LONGEST_DOMINANT_RIGHT");
        //_params.set("overlaps", "NO_SUB");

        // NOTE: This is set via opensextant.apps.Config or by some other means
        // But it is required to initialize.  "gazetteer" is the core name of interest.
        // Being explicit here about the core name allows integrator to field multiple cores
        // in the same gazetteer.
        //
        String config_solr_home = System.getProperty("solr.solr.home");
        SolrProxy proxy = new SolrProxy(config_solr_home, "gazetteer");

        // Publish only once both pieces exist; solr is the "initialized" marker
        // checked above, so it is assigned last.
        params = _params;
        solr = proxy;
    }

    /**
     * Tag a text buffer, returning PlaceCandidates for the place mentions
     * found in it. The buffer is submitted to Solr through this instance's
     * tagger request; the matching gazetteer records in the response are
     * attached to each candidate.
     *
     * <p>Gazetteer names and matched text spans rejected by the tagging
     * filter are dropped. Unless lower case abbreviations are allowed (see
     * {@link #setAllowLowerCaseAbbreviations(boolean)}), an all-lower-case
     * span is dropped as soon as one of its gazetteer records is an
     * abbreviation.
     *
     * @param buffer text to tag
     * @param docid identifier of the text, used in log and error messages
     *
     * @return place_candidates List of place candidates
     * @throws MatcherException if the Solr request fails
     */
    public List<PlaceCandidate> tagText(String buffer, String docid) throws MatcherException {
        // "tagsCount":10, "tags":[{ "ids":[35], "endOffset":40, "startOffset":38},
        // { "ids":[750308, 2769912, 2770041, 10413973, 10417546], "endOffset":49,
        // "startOffset":41},
        // ...
        // "matchingDocs":{"numFound":75, "start":0, "docs":[ {
        // "place_id":"USGS1992921", "name":"Monterrey", "cc":"PR"}, {
        //"place_id":"USGS1991763", "name":"Monterrey", "cc":"PR"}, ]

        if (debug) {
            log.debug("TEXT SIZE = " + buffer.length());
        }

        // Setup request to tag...
        tag_request.input = buffer;
        QueryResponse response;
        try {
            response = tag_request.process(solr.getInternalSolrServer());
        } catch (Exception err) {
            throw new MatcherException("Failed to tag document=" + docid, err);
        }

        // -- Process Solr Response: index the gazetteer records by Solr id.

        //List<GeoBean> geoBeans = response.getBeans(GeoBean.class); maybe works but probably slow
        SolrDocumentList docList = (SolrDocumentList) response.getResponse().get("matchingDocs");
        beanMap.clear();
        for (SolrDocument solrDoc : docList) {
            String name = SolrProxy.getString(solrDoc, "name");
            // NOTE(review): whether getString can return null is not visible here;
            // a record without a name is skipped rather than failing the whole document.
            // Locale.ROOT keeps the filter key independent of the JVM default locale.
            if (name == null || filter.filterOut(name.toLowerCase(Locale.ROOT))) {
                continue;
            }

            Place bean = toPlace(solrDoc, name);

            // Hashed on "id"
            Integer id = (Integer) solrDoc.getFirstValue("id");
            beanMap.put(id, bean);
        }

        @SuppressWarnings("unchecked")
        List<NamedList<?>> tags = (List<NamedList<?>>) response.getResponse().get("tags");

        if (debug) {
            log.debug("DOC=" + docid + " TAGS SIZE = " + tags.size());
        }

        /*
         * Retrieve all offsets into a long list.  These offsets will report
         * a text span and all the gazetteer record IDs that are associated to that span.
         * The text could either be a name, a code or some other abbreviation.
         *
         * For practical reasons the default behavior is to filter trivial spans given
         * the gazetteer data that is returned for them.
         *
         * WARNING: lots of optimizations occur here due to the potentially large volume of tags
         * and gazetteer data that is involved.  And this is relatively early in the pipeline.
         *
         */
        List<PlaceCandidate> candidates = new ArrayList<>();
        Set<String> seenPlaces = new HashSet<>();

        for (NamedList<?> tag : tags) {
            int x1 = (Integer) tag.get("startOffset");
            int x2 = (Integer) tag.get("endOffset");//+1 char after last matched
            String matchText = buffer.substring(x1, x2);

            /*
             * We can filter out trivial place name matches that we know to be
             * close to false positives 100% of the time. E.g,. "way", "back",
             * "north" You might consider two different stop filters, Is "North"
             * different than "north"? This first pass filter should really
             * filter out only text we know to be false positives regardless of
             * case.
             */
            if (filter.filterOut(matchText.toLowerCase(Locale.ROOT))) {
                continue;
            }

            PlaceCandidate pc = new PlaceCandidate();
            pc.setStart(x1);
            pc.setEnd(x2);

            // Could have enabled the "matchText" option from the tagger to get
            // this, but since we already have the content as a String then
            // we might as well not make the tagger do any more work.
            pc.setPlaceName(matchText);

            @SuppressWarnings("unchecked")
            List<Integer> placeRecordIds = (List<Integer>) tag.get("ids");
            //clear out places seen for the next candidate
            seenPlaces.clear();
            boolean isValid = true;
            boolean isLower = StringUtils.isAllLowerCase(pc.getText());
            double nameBias = 0.0; // largest name bias among this candidate's places

            for (Integer solrId : placeRecordIds) {
                Place place = beanMap.get(solrId);
                if (place == null) {
                    // Expected for records whose gazetteer name was filtered out above.
                    if (debug) {
                        log.debug("Logic error. Did not find place object for Solr ID=" + solrId);
                    }
                    continue;
                }

                // Optimization:  abbreviation filter.
                //
                // Do not add PlaceCandidates for lower case tokens that are marked as Abbreviations
                // Unless flagged to do so.
                // DEFAULT behavior is to avoid lower case text that is tagged as an abbreviation in gazetteer,
                //
                // Common terms:  in, or, oh, me, us, we,
                //   etc.
                // Are all not typically place names or valid abbreviations in text.
                //
                if (!allow_lowercase_abbrev && isLower && place.isAbbreviation()) {
                    isValid = false;
                    if (debug) {
                        log.debug("Ignore lower case term=" + pc.getText());
                    }
                    break;
                }

                // Optimization: Add distinct place objects once.
                //   Set.add() returns false when the place ID was already seen for this candidate.
                //
                if (seenPlaces.add(place.getPlaceID())) {
                    pc.addPlace(place);

                    // get max name bias
                    // NOTE(review): null guard in case the bias is an unset boxed value -- confirm against Place.
                    Double placeBias = place.getName_bias();
                    if (placeBias != null && placeBias > nameBias) {
                        nameBias = placeBias;
                    }
                }
            }

            // Some rule above triggered a flag that indicates this
            // token/place/name is not valid.
            if (!isValid) {
                continue;
            }

            // if the max name bias seen >0; add apriori evidence
            if (nameBias != 0.0) {
                pc.addRuleAndConfidence(APRIORI_NAME_RULE, nameBias);
            }

            candidates.add(pc);
        }

        if (debug) {
            summarizeExtraction(candidates, docid);
        }

        return candidates;
    }

    /**
     * Copies the gazetteer fields of one Solr document into a new Place.
     *
     * @param solrDoc Solr gazetteer record
     * @param name value of the record's "name" field, already read by the caller
     * @return the populated Place
     */
    private static Place toPlace(SolrDocument solrDoc, String name) {
        Place bean = new Place();

        bean.setName_type(SolrProxy.getChar(solrDoc, "name_type"));

        // Gazetteer place name & country:
        //   NOTE: this may be different than "matchtext" or PlaceCandidate.name field.
        //
        bean.setPlaceName(name);
        bean.setCountryCode(SolrProxy.getString(solrDoc, "cc"));

        // Other metadata.
        bean.setAdmin1(SolrProxy.getString(solrDoc, "adm1"));
        bean.setAdmin2(SolrProxy.getString(solrDoc, "adm2"));
        bean.setFeatureClass(SolrProxy.getString(solrDoc, "feat_class"));
        bean.setFeatureCode(SolrProxy.getString(solrDoc, "feat_code"));
        bean.setLatitude(SolrProxy.getDouble(solrDoc, "lat"));
        bean.setLongitude(SolrProxy.getDouble(solrDoc, "lon"));

        bean.setPlaceID(SolrProxy.getString(solrDoc, "place_id"));
        bean.setName_bias(SolrProxy.getDouble(solrDoc, "name_bias"));
        bean.setId_bias(SolrProxy.getDouble(solrDoc, "id_bias"));
        return bean;
    }

    /**
     * Debugging aid: logs the number of candidates and, per normalized
     * lower-cased candidate text, how many candidates were counted as a
     * country. For each candidate only the first decisive place is
     * considered: an abbreviation causes the candidate to be ignored, a
     * country causes it to be counted once.
     *
     * @param candidates candidates produced for the document
     * @param docid identifier of the document, used in the log output
     */
    private void summarizeExtraction(List<PlaceCandidate> candidates, String docid) {
        if (candidates == null) {
            log.error("Something is very wrong.");
            return;
        }
        log.debug("DOC=" + docid + " PLACE CANDIDATES SIZE = " + candidates.size());
        Map<String, Integer> countries = new HashMap<>();

        // This loops through findings and reports out just Country names for now.
        for (PlaceCandidate candidate : candidates) {
            String namekey = TextUtils.normalize_text_entity(candidate.getText());
            namekey = namekey.toLowerCase(Locale.ROOT);

            for (Place p : candidate.getPlaces()) {
                if (p.isAbbreviation()) {
                    log.debug("Ignore all abbreviations for now " + candidate.getText());
                    break;
                }
                if (p.isCountry()) {
                    // First sighting counts as 1; later sightings add 1.
                    Integer count = countries.get(namekey);
                    countries.put(namekey, count == null ? 1 : count + 1);
                    break;
                }
            }
        }
        log.debug("Countries found:" + countries.toString());
    }

    /**
     * Do a basic test: tags one hard-coded sentence and prints the resulting
     * candidates. Solr resources are released even if construction or
     * tagging fails.
     *
     * @param args not used
     * @throws Exception if the matcher cannot be created
     */
    public static void main(String[] args) throws Exception {
        //String solrHome = args[0];

        try {
            PlacenameMatcher sm = new PlacenameMatcher();

            try {
                String docContent = "I want to go to New York City some day.";

                System.out.println(docContent);

                List<PlaceCandidate> matches = sm.tagText(docContent, "main-test");

                for (PlaceCandidate pc : matches) {
                    System.out.println(pc.toString());
                }
            } catch (Exception err) {
                err.printStackTrace();
            }
        } finally {
            PlacenameMatcher.shutdown();
        }
    }
}