info.semanticsoftware.sempub2016.LocationInferer.java Source code

Java tutorial

Introduction

Here is the source code for info.semanticsoftware.sempub2016.LocationInferer.java

Source

/*
 * Semantic Software Lab submission to Semantic Publishing Challenge 2016,
 * http://www.semanticsoftware.info/sempub-challenge-2016
 *
 * Copyright (c) 2016 Semantic Software Lab, http://www.semanticsoftware.info
 *    Rene Witte
 *    Bahar Sateli
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3.0 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library.  If not, see <http://www.gnu.org/licenses/>.
 */

package info.semanticsoftware.sempub2016;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.hp.hpl.jena.query.Query;
import com.hp.hpl.jena.query.QueryExecution;
import com.hp.hpl.jena.query.QueryExecutionFactory;
import com.hp.hpl.jena.query.QueryFactory;
import com.hp.hpl.jena.query.QuerySolution;
import com.hp.hpl.jena.query.ResultSet;

import gate.*;
import gate.creole.*;
import gate.creole.metadata.*;

/**
 * GATE processing resource registered as "AffiliationLocationInferer".
 *
 * <p>For every "Affiliation_univ" annotation of the current document that has no
 * "locatedIn" feature, the resource tries to infer the country and stores it in the
 * features "locatedIn" (country name) and "locationURI" (a URI below
 * {@code http://ceur-ws.org/country/}). Two heuristics are used:</p>
 * <ol>
 * <li>If the "Metadata_body" annotation contains no usable "Location" annotation, the
 * university name is looked up in DBpedia.</li>
 * <li>Otherwise each university is paired with the closest country annotation that
 * starts at or after the university.</li>
 * </ol>
 */
@CreoleResource(name = "AffiliationLocationInferer", comment = "Infers in which country an academic institute is located.")
public class LocationInferer extends AbstractLanguageAnalyser implements ProcessingResource {

    /** Serialization version identifier. */
    private static final long serialVersionUID = 477694444629115020L;

    /** Prefix of the URIs stored in the "locationURI" feature. */
    private static final String COUNTRY_URI_PREFIX = "http://ceur-ws.org/country/";

    /** DBpedia keyword lookup URL; the URL-encoded search string is appended to it. */
    private static final String DBPEDIA_LOOKUP_URL = "http://lookup.dbpedia.org/api/search/KeywordSearch?QueryClass=University&MaxHits=1&QueryString=";

    /** SPARQL endpoint queried for the country of a university resource. */
    private static final String DBPEDIA_SPARQL_ENDPOINT = "http://dbpedia.org/sparql";

    /**
     * Nothing needs to be set up for this resource.
     *
     * @return this resource
     * @throws ResourceInstantiationException never thrown by this implementation
     */
    @Override
    public final gate.Resource init() throws ResourceInstantiationException {
        return this;
    }

    /**
     * Re-initialises the resource by delegating to {@link #init()}.
     *
     * @throws ResourceInstantiationException if {@link #init()} fails
     */
    @Override
    public final void reInit() throws ResourceInstantiationException {
        init();
    }

    /**
     * Runs the location heuristics on the current document if at least one
     * "Affiliation_univ" annotation lacks the "locatedIn" feature.
     *
     * @throws ExecutionException if no document has been set
     */
    @Override
    public final void execute() throws ExecutionException {
        if (document == null) {
            throw new ExecutionException("No document to process!");
        }
        AnnotationSet set = document.getAnnotations();
        List<Annotation> univs = gate.Utils.inDocumentOrder(set.get("Affiliation_univ"));

        boolean missingLocation = false;
        for (Annotation univ : univs) {
            if (!univ.getFeatures().containsKey("locatedIn")) {
                missingLocation = true;
                System.out.println(" Hmmm... " + universityName(univ)
                        + " does not have location information.");
            }
        }
        if (!missingLocation) {
            return;
        }

        System.out.println(" Running heuristics for " + document.getName());

        // The metadata body is only needed from here on, so a document without it
        // no longer fails when every university already has a location.
        AnnotationSet metadataSet = set.get("Metadata_body");
        if (metadataSet.isEmpty()) {
            System.out.println("\t No Metadata_body annotation found, so no country candidates are available.");
            inferFromDbpedia(univs);
            return;
        }

        List<Annotation> countries = countryCandidates(set, metadataSet.iterator().next());
        if (countries.isEmpty()) {
            // Heuristic 1: no countries in the metadata body.
            inferFromDbpedia(univs);
        } else {
            // Heuristic 2: pair universities with the closest following country.
            inferFromNearestCountry(univs, countries);
        }
    }

    /**
     * Collects the "Location" annotations contained in the metadata body and drops
     * those that are not countries.
     *
     * <p>An annotation is dropped when it has no "locType" feature, when that feature
     * is {@code null}, or when it is a non-blank value other than "country" (compared
     * case-insensitively). A blank "locType" is kept, as in the original logic.</p>
     *
     * @param all all annotations of the document
     * @param metadata the "Metadata_body" annotation delimiting the search span
     * @return the remaining annotations in document order; the list is modifiable
     */
    private List<Annotation> countryCandidates(final AnnotationSet all, final Annotation metadata) {
        AnnotationSet locSet = all
                .getContained(metadata.getStartNode().getOffset(), metadata.getEndNode().getOffset())
                .get("Location");
        List<Annotation> locs = gate.Utils.inDocumentOrder(locSet);

        System.out.println("\t Pruning the Location annotations...");
        for (Iterator<Annotation> iter = locs.iterator(); iter.hasNext();) {
            Annotation loc = iter.next();
            FeatureMap feats = loc.getFeatures();
            if (!feats.containsKey("locType")) {
                iter.remove();
                continue;
            }
            Object locType = feats.get("locType");
            if (locType == null) {
                // Explicit check instead of provoking a NullPointerException.
                System.out.println("\t Found NULL as locType. Removing "
                        + gate.Utils.cleanStringFor(document, loc));
                iter.remove();
                continue;
            }
            String type = locType.toString();
            if (type.trim().length() > 0 && !type.equalsIgnoreCase("country")) {
                System.out.println("\t Removing " + gate.Utils.cleanStringFor(document, loc));
                iter.remove();
            }
        }
        return locs;
    }

    /**
     * Heuristic 1: asks DBpedia for the country of every university.
     *
     * @param univs the "Affiliation_univ" annotations in document order
     */
    private void inferFromDbpedia(final List<Annotation> univs) {
        System.out.println("\t Executing Heuristic [1] (DBpedia Lookup) for " + document.getName());
        for (Annotation univ : univs) {
            String uri = dbpediaLookup(universityName(univ));
            if (uri != null) {
                findCountry(uri, univ);
            } else {
                System.out.println("\t Sorry, no country information exists in DBpedia.");
            }
        }
    }

    /**
     * Heuristic 2: assigns to each university the first remaining country annotation
     * that starts at or after the university.
     *
     * <p>Countries starting before the current university are discarded, and an
     * assigned country is consumed, so each country is used at most once.</p>
     *
     * @param univs the "Affiliation_univ" annotations in document order
     * @param countries the country candidates in document order; modified by this method
     */
    private void inferFromNearestCountry(final List<Annotation> univs, final List<Annotation> countries) {
        System.out.println("\t Executing Heuristic [2] (Shortest Distance) for " + document.getName());

        for (Annotation univ : univs) {
            long univOffset = univ.getStartNode().getOffset();
            Iterator<Annotation> itr = countries.iterator();
            while (itr.hasNext()) {
                Annotation country = itr.next();
                if (country.getStartNode().getOffset() < univOffset) {
                    // Dangling country located before this university: skip it for good.
                    itr.remove();
                    continue;
                }
                String locString = gate.Utils.cleanStringFor(document, country);
                univ.getFeatures().put("locatedIn", locString);
                try {
                    univ.getFeatures().put("locationURI", countryUri(locString));
                } catch (URISyntaxException e) {
                    System.err.println("\t Cannot build a location URI for '" + locString + "'");
                    e.printStackTrace();
                }
                itr.remove();
                System.out.println("\t Inferred: (" + gate.Utils.cleanStringFor(document, univ)
                        + ", locatedIn, " + locString + ")");
                break;
            }
        }
    }

    /**
     * Returns the name of a university annotation.
     *
     * @param univ the university annotation
     * @return the "content" feature if present, otherwise the annotated document text
     */
    private String universityName(final Annotation univ) {
        Object content = univ.getFeatures().get("content");
        if (content != null) {
            return content.toString();
        }
        return gate.Utils.cleanStringFor(document, univ);
    }

    /**
     * Builds the country URI: trimmed name, lower-cased, blanks replaced by dashes.
     *
     * @param countryName the country name
     * @return the URI below {@code http://ceur-ws.org/country/}
     * @throws URISyntaxException if the resulting string is not a valid URI
     */
    private static URI countryUri(final String countryName) throws URISyntaxException {
        // A fixed locale keeps the URI independent of the JVM default locale.
        return new URI(COUNTRY_URI_PREFIX
                + countryName.trim().toLowerCase(Locale.ENGLISH).replace(' ', '-'));
    }

    /**
     * Looks up a university by name through the DBpedia keyword search service.
     *
     * @param university the university name to search for
     * @return the URI of the first hit, or {@code null} if nothing was found or the
     *         lookup failed
     */
    private String dbpediaLookup(final String university) {
        HttpURLConnection conn = null;
        BufferedReader in = null;
        try {
            // The name must be URL-encoded: it usually contains blanks and may
            // contain '&' or non-ASCII characters.
            final URL url = new URL(DBPEDIA_LOOKUP_URL + URLEncoder.encode(university, "UTF-8"));
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("GET");
            conn.setRequestProperty("Accept", "application/json");
            conn.setRequestProperty("content-type", "application/x-www-form-urlencoded");
            // No setDoOutput(true): a GET request carries no body.

            final int status = conn.getResponseCode();
            if (status != HttpURLConnection.HTTP_OK) {
                System.err.println("\t DBpedia lookup for '" + university
                        + "' failed: Http error code " + status);
                return null;
            }

            in = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), Charset.forName("UTF-8")));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                sb.append(line);
            }

            JsonObject object = new JsonParser().parse(sb.toString()).getAsJsonObject();
            JsonArray arr = object.getAsJsonArray("results");
            if (arr == null || arr.size() == 0) {
                // Nothing found; reported to the caller as null.
                return null;
            }
            JsonObject firstHit = arr.get(0).getAsJsonObject();
            if (!firstHit.has("uri")) {
                return null;
            }
            return firstHit.getAsJsonPrimitive("uri").getAsString();
        } catch (IOException e) {
            System.err.println("\t DBpedia lookup for '" + university + "' failed");
            e.printStackTrace();
        } catch (RuntimeException e) {
            // Malformed or unexpected JSON; the lookup stays best-effort.
            System.err.println("\t Cannot interpret the DBpedia lookup response for '" + university + "'");
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // The response has been read or abandoned; nothing left to recover.
                }
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
        return null;
    }

    /**
     * Queries the DBpedia SPARQL endpoint for the English label of the country of a
     * university and stores it in the features "locatedIn" and "locationURI".
     *
     * <p>If several solutions are returned, each one overwrites the previous values.
     * Failures are reported on the console and leave the annotation unchanged or
     * partially updated; they are not propagated.</p>
     *
     * @param universityURI the DBpedia resource URI of the university
     * @param univ the annotation whose features are updated
     */
    private void findCountry(final String universityURI, Annotation univ) {
        String sparqlQueryString = " select (str(?label) as ?strLabel) " + "where {" + "<" + universityURI
                + "> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/University>;"
                + "<http://dbpedia.org/ontology/country> ?country."
                + "?country <http://www.w3.org/2000/01/rdf-schema#label> ?label."
                + "filter langMatches( lang(?label), 'en' )" + "}";

        QueryExecution qexec = null;
        try {
            // Created inside the try block so that a query parse error is handled
            // like any other lookup failure.
            Query query = QueryFactory.create(sparqlQueryString);
            qexec = QueryExecutionFactory.sparqlService(DBPEDIA_SPARQL_ENDPOINT, query);
            ResultSet results = qexec.execSelect();
            if (!results.hasNext()) {
                System.out.println("\t DBpedia returned no information for " + universityURI);
            }
            while (results.hasNext()) {
                QuerySolution soln = results.nextSolution();
                String country = soln.get("?strLabel").toString().trim();
                System.out.println("\t Infered: (" + universityURI + ", locatedIn, " + country + ")");
                univ.getFeatures().put("locatedIn", country);
                univ.getFeatures().put("locationURI", countryUri(country));
            }
        } catch (URISyntaxException e) {
            e.printStackTrace();
        } catch (RuntimeException e) {
            e.printStackTrace();
        } finally {
            if (qexec != null) {
                qexec.close();
            }
        }
    }
} // class LocationInferer