org.aksw.simba.bengal.triple2nl.converter.DefaultIRIConverter.java Source code

Java tutorial

Introduction

Here is the source code for org.aksw.simba.bengal.triple2nl.converter.DefaultIRIConverter.java

Source

/*
 * #%L
 * Triple2NL
 * %%
 * Copyright (C) 2015 Agile Knowledge Engineering and Semantic Web (AKSW)
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package org.aksw.simba.bengal.triple2nl.converter;

import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;

import org.aksw.jena_sparql_api.core.QueryExecutionFactory;
import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp;
import org.aksw.jena_sparql_api.model.QueryExecutionFactoryModel;
import org.aksw.simba.bengal.triple2nl.converter.URIDereferencer.DereferencingFailedException;
import org.apache.commons.collections15.map.LRUMap;
import org.apache.commons.lang.StringUtils;
import org.apache.jena.query.ParameterizedSparqlString;
import org.apache.jena.query.QueryExecution;
import org.apache.jena.query.ResultSet;
import org.apache.jena.rdf.model.Literal;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.RDFNode;
import org.apache.jena.rdf.model.Statement;
import org.apache.jena.sparql.engine.http.QueryExceptionHTTP;
import org.apache.jena.vocabulary.RDF;
import org.apache.jena.vocabulary.RDFS;
import org.apache.jena.vocabulary.XSD;
import org.apache.jena.web.HttpSC;
import org.dllearner.kb.sparql.SparqlEndpoint;
import org.dllearner.utilities.OwlApiJenaUtils;
import org.semanticweb.owlapi.model.IRI;
import org.semanticweb.owlapi.model.OWLOntology;
import org.semanticweb.owlapi.util.IRIShortFormProvider;
import org.semanticweb.owlapi.util.SimpleIRIShortFormProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;

/**
 * Converts IRIs into natural language.
 * 
 * @author Lorenz Buehmann
 *
 */
public class DefaultIRIConverter implements IRIConverter {

    private static final Logger logger = LoggerFactory.getLogger(DefaultIRIConverter.class);

    // derives the short form of an IRI; used for built-in entities and as
    // the fallback when no label could be found
    private final IRIShortFormProvider sfp = new SimpleIRIShortFormProvider();
    // in-memory cache IRI -> label, created with a size argument of 200
    private final LRUMap<String, String> uri2LabelCache = new LRUMap<>(200);

    // executes the SPARQL queries that look up labels
    private final QueryExecutionFactory qef;

    // properties that are tried, in this order, when searching for a label
    private List<String> labelProperties = Lists.newArrayList("http://www.w3.org/2000/01/rdf-schema#label",
            "http://www.w3.org/2004/02/skos/core#prefLabel", "http://www.w3.org/2004/02/skos/core#altLabel",
            "http://xmlns.com/foaf/0.1/name");

    // language tag that labels have to match
    private String language = "en";

    // normalization options, applied to the short form of an IRI
    private boolean splitCamelCase = true;
    private boolean replaceUnderScores = true;
    private boolean toLowerCase = false;
    private boolean omitContentInBrackets = true;

    // fetches the triples for an IRI via Linked Data dereferencing
    private final URIDereferencer uriDereferencer;

    /**
     * Creates a converter that looks up labels on the given SPARQL endpoint.
     * 
     * @param endpoint
     *            the SPARQL endpoint; its URL and default graph URIs are used
     *            to create the HTTP query execution factory
     * @param cacheDirectory
     *            the base directory of the dereferencing cache, or
     *            <code>null</code> to use a folder below the system temp
     *            directory
     */
    public DefaultIRIConverter(SparqlEndpoint endpoint, String cacheDirectory) {
        this(new QueryExecutionFactoryHttp(endpoint.getURL().toString(), endpoint.getDefaultGraphURIs()),
                cacheDirectory);
    }

    /**
     * Creates a converter that looks up labels on the given SPARQL endpoint
     * and uses the default cache directory.
     * 
     * @param endpoint
     *            the SPARQL endpoint
     */
    public DefaultIRIConverter(SparqlEndpoint endpoint) {
        this(endpoint, null);
    }

    /**
     * Creates a converter that runs its label queries through the given query
     * execution factory and uses the default cache directory.
     * 
     * @param qef
     *            the query execution factory
     */
    public DefaultIRIConverter(QueryExecutionFactory qef) {
        this(qef, null);
    }

    public DefaultIRIConverter(QueryExecutionFactory qef, String cacheDirectory) {
        this.qef = qef;

        // use tmp as default cache directory
        if (cacheDirectory == null) {
            cacheDirectory = System.getProperty("java.io.tmpdir") + "/triple2nl/cache";
        }

        cacheDirectory += "/dereferenced";
        try {
            Files.createDirectories(Paths.get(cacheDirectory));
        } catch (IOException e) {
            logger.error("Creation of folder + " + cacheDirectory + " failed.", e);
        }
        logger.warn("Using folder " + cacheDirectory + " as cache for IRI converter.");

        uriDereferencer = new URIDereferencer(new File(cacheDirectory));
    }

    /**
     * Creates a converter that looks up labels in the given Jena model and
     * uses the default cache directory.
     * 
     * @param model
     *            the model that is queried for labels
     */
    public DefaultIRIConverter(Model model) {
        this(new QueryExecutionFactoryModel(model));
    }

    /**
     * Creates a converter that looks up labels in the given ontology. The
     * ontology is turned into a Jena model via
     * <code>OwlApiJenaUtils.getModel</code> first.
     * 
     * @param ontology
     *            the ontology that is queried for labels
     */
    public DefaultIRIConverter(OWLOntology ontology) {
        this(OwlApiJenaUtils.getModel(ontology));
    }

    /**
     * Converts the IRI into a label without dereferencing the IRI.
     * 
     * @param iri
     *            the IRI
     * @return the label of the IRI
     */
    @Override
    public String convert(String iri) {
        return convert(iri, false);
    }

    /**
     * Converts the IRI into a label. The sources are tried in the following
     * order and the first one that yields a label wins: the in-memory cache,
     * the built-in vocabulary, the knowledge base, Linked Data (only if
     * <code>dereferenceURI</code> is set), the normalized short form of the
     * IRI and finally the IRI itself.
     * <p>
     * The cache is keyed by the IRI only, thus a label that was cached by a
     * call without dereferencing is also returned by later calls that ask for
     * dereferencing.
     * 
     * @param iri
     *            the IRI, must not be <code>null</code>
     * @param dereferenceURI
     *            whether the IRI is dereferenced if neither the built-in
     *            vocabulary nor the knowledge base provide a label
     * @return the label, never <code>null</code>
     */
    @Override
    public String convert(String iri, boolean dereferenceURI) {

        // handle built-in entities first
        if (iri.equals(RDF.type.getURI())) {
            return "type";
        } else if (iri.equals(RDFS.label.getURI())) {
            return "label";
        }

        // check if already cached
        String label = uri2LabelCache.get(iri);
        if (label != null) {
            return label;
        }

        // 1. check if it's some built-in resource
        try {
            label = getLabelFromBuiltIn(iri);
        } catch (Exception e) {
            logger.error("Getting label for {} from built-in vocabulary failed.", iri, e);
        }

        // 2. try to get the label from the endpoint
        if (label == null) {
            try {
                label = getLabelFromKnowledgebase(iri);
            } catch (Exception e) {
                logger.error("Getting label for {} from knowledge base failed.", iri, e);
            }
        }

        // 3. try to dereference the IRI and search for the label in the
        // returned triples
        if (dereferenceURI && label == null) {
            try {
                label = getLabelFromLinkedData(iri);
            } catch (Exception e) {
                logger.error("Dereferencing of {} failed.", iri, e);
            }
        }

        // 4. use the short form of the IRI
        if (label == null) {
            try {
                label = sfp.getShortForm(IRI.create(URLDecoder.decode(iri, "UTF-8")));

                // do some normalization, e.g. remove underscores
                label = normalize(label);

            } catch (UnsupportedEncodingException | IllegalArgumentException e) {
                // IllegalArgumentException is thrown for malformed
                // %-escapes; without catching it the fallback below would
                // never be reached for such IRIs
                label = null;
                logger.error("Getting short form of {} failed.", iri, e);
            }
        }

        // 5. use the IRI itself
        if (label == null) {
            label = iri;
        }

        // put into cache
        uri2LabelCache.put(iri, label);

        return label;
    }

    /**
     * Set the list of properties that provide a textual representation of an
     * IRI, e.g. rdfs:label, foaf:name, etc. The properties are tried in the
     * given order and the first one with a value is used.
     * 
     * @param labelProperties
     *            the IRIs of the label properties; the list is stored as is,
     *            not copied
     */
    public void setLabelProperties(List<String> labelProperties) {
        this.labelProperties = labelProperties;
    }

    /**
     * Set the language of the returned textual representation. The value is
     * used as language tag when labels are looked up in the knowledge base
     * and in dereferenced data.
     * 
     * @param language
     *            the language tag, e.g. "en"
     */
    public void setLanguage(String language) {
        this.language = language;
    }

    /**
     * Set whether camel case tokens are split into separate words when the
     * short form of an IRI is normalized.
     * 
     * @param splitCamelCase
     *            <code>true</code> to split camel case tokens
     */
    public void setSplitCamelCase(boolean splitCamelCase) {
        this.splitCamelCase = splitCamelCase;
    }

    /**
     * Set whether underscores are replaced by whitespace when the short form
     * of an IRI is normalized.
     * 
     * @param replaceUnderScores
     *            <code>true</code> to replace underscores
     */
    public void setReplaceUnderScores(boolean replaceUnderScores) {
        this.replaceUnderScores = replaceUnderScores;
    }

    /**
     * Set whether text in round brackets, including the brackets, is removed
     * when the short form of an IRI is normalized.
     * 
     * @param omitContentInBrackets
     *            <code>true</code> to remove bracketed text
     */
    public void setOmitContentInBrackets(boolean omitContentInBrackets) {
        this.omitContentInBrackets = omitContentInBrackets;
    }

    /**
     * Set whether the short form of an IRI is converted to lower case when it
     * is normalized.
     * 
     * @param toLowerCase
     *            <code>true</code> to convert to lower case
     */
    public void setToLowerCase(boolean toLowerCase) {
        this.toLowerCase = toLowerCase;
    }

    /**
     * Returns the label for an IRI of the reserved vocabulary, which is the
     * short form of the IRI. For a fixed set of XSD numeric datatypes the
     * suffix " value" is attached.
     * 
     * @param uri
     *            the IRI
     * @return the label, or <code>null</code> if the IRI does not belong to
     *         the reserved vocabulary or cannot be decoded
     */
    private String getLabelFromBuiltIn(String uri) {
        try {
            IRI iri = IRI.create(URLDecoder.decode(uri, "UTF-8"));

            // if IRI is built-in entity
            if (iri.isReservedVocabulary()) {
                // use the short form
                String label = sfp.getShortForm(iri);

                // if it is a XSD numeric data type, we attach "value"
                // NOTE(review): other numeric XSD types, e.g. positiveInteger
                // or the unsigned ones, are not in this list - confirm
                // whether that is intended
                if (uri.equals(XSD.nonNegativeInteger.getURI()) || uri.equals(XSD.integer.getURI())
                        || uri.equals(XSD.negativeInteger.getURI()) || uri.equals(XSD.decimal.getURI())
                        || uri.equals(XSD.xdouble.getURI()) || uri.equals(XSD.xfloat.getURI())
                        || uri.equals(XSD.xint.getURI()) || uri.equals(XSD.xshort.getURI())
                        || uri.equals(XSD.xbyte.getURI()) || uri.equals(XSD.xlong.getURI())) {
                    label += " value";
                }

                return label;
            }
        } catch (UnsupportedEncodingException | IllegalArgumentException e) {
            // IllegalArgumentException is thrown for malformed %-escapes
            logger.error("Getting short form of {} failed.", uri, e);
        }
        return null;
    }

    /**
     * Queries the knowledge base for a label of the IRI. The label properties
     * are tried one after another and the first label found is returned. A
     * failing query is logged and the next label property is tried.
     * 
     * @param iri
     *            the IRI
     * @return the lexical form of the label, or <code>null</code> if no label
     *         was found
     */
    private String getLabelFromKnowledgebase(String iri) {
        // NOTE(review): the language tag is concatenated into the query
        // without escaping - confirm that it never comes from untrusted input
        ParameterizedSparqlString query = new ParameterizedSparqlString(
                "SELECT ?label WHERE {" + "?s ?p1 ?o ." + "optional {" + "      ?s ?p ?label. "
                        + "      FILTER (LANGMATCHES(LANG(?label),'" + language + "' ))" + "   }" + "optional {"
                        + "     ?s ?p ?label" + "   }" + "} " + "ORDER BY DESC(?label) LIMIT 1");
        query.setIri("s", iri);
        // for each label property
        for (String labelProperty : labelProperties) {
            query.setIri("p", labelProperty);
            try (QueryExecution qe = qef.createQueryExecution(query.toString())) {
                ResultSet rs = qe.execSelect();
                if (rs.hasNext()) {
                    // both label patterns are optional, thus a row can come
                    // back with ?label unbound; try the next property then
                    Literal literal = rs.next().getLiteral("label");
                    if (literal != null) {
                        return literal.getLexicalForm();
                    }
                }
            } catch (Exception e) {
                int code = -1;
                // cached exception is wrapped in a RuntimeException
                if (e.getCause() instanceof QueryExceptionHTTP) {
                    code = ((QueryExceptionHTTP) e.getCause()).getResponseCode();
                } else if (e instanceof QueryExceptionHTTP) {
                    code = ((QueryExceptionHTTP) e).getResponseCode();
                }
                // only resolve the status text for a real status code; a
                // failing lookup must not escape from this handler, otherwise
                // the remaining label properties would be skipped
                if (code >= 0 && HttpSC.getCode(code) != null) {
                    logger.warn("Getting label of {} from SPARQL endpoint failed: {} - {}", iri, code,
                            HttpSC.getCode(code).getMessage(), e);
                } else {
                    logger.warn("Getting label of {} from SPARQL endpoint failed.", iri, e);
                }
            }
        }
        return null;
    }

    /**
     * Dereference the IRI and look for a label property value. The label
     * properties are tried one after another; only literal values whose
     * language tag equals the configured language are accepted.
     * 
     * @param iri
     *            the IRI
     * @return the label if exist, otherwise <code>null</code>
     */
    private String getLabelFromLinkedData(String iri) {
        logger.debug("Get label for {} from Linked Data...", iri);

        try {
            // 1. get triples for the IRI by sending a Linked Data request
            Model model = uriDereferencer.dereference(iri);

            // 2. check if we find a label in the triples
            for (String labelProperty : labelProperties) {
                for (Statement st : model
                        .listStatements(model.getResource(iri), model.getProperty(labelProperty), (RDFNode) null)
                        .toList()) {
                    RDFNode object = st.getObject();
                    // a resource as value cannot serve as label; skip it
                    // instead of failing the whole lookup in asLiteral()
                    if (!object.isLiteral()) {
                        continue;
                    }
                    Literal literal = object.asLiteral();

                    // language check
                    String literalLanguage = literal.getLanguage();
                    if (literalLanguage != null && literalLanguage.equals(this.language)) {
                        return literal.getLexicalForm();
                    }
                }
            }
        } catch (DereferencingFailedException e) {
            logger.error(e.getMessage(), e);
        }
        return null;
    }

    /**
     * Applies the enabled normalization options to the string, in this order:
     * replace underscores by whitespace, split camel case tokens, convert to
     * lower case, remove text in round brackets and trim the result.
     * 
     * @param s
     *            the string to normalize
     * @return the normalized string
     */
    private String normalize(String s) {
        if (replaceUnderScores) {
            s = s.replace("_", " ");
        }
        if (splitCamelCase) {
            s = splitCamelCase(s);
        }
        if (toLowerCase) {
            // fixed locale, so that the result does not depend on the default
            // locale of the JVM
            s = s.toLowerCase(java.util.Locale.ROOT);
        }
        if (omitContentInBrackets) {
            s = s.replaceAll("\\(.+?\\)", "").trim();
        }
        return s;
    }

    /**
     * Splits each whitespace separated token of the string at its camel case
     * boundaries and joins the parts by a single whitespace. A part that
     * consists of one non-numeric character only is not kept as a separate
     * word but attached to the preceding part of the same token.
     * 
     * @param s
     *            the string to split
     * @return the string with its camel case tokens split, trimmed
     */
    private static String splitCamelCase(String s) {
        StringBuilder result = new StringBuilder();
        for (String word : s.split(" ")) {
            Deque<String> parts = new ArrayDeque<>();
            int index = 0;
            for (String fragment : StringUtils.splitByCharacterTypeCamelCase(word)) {
                boolean singleNonNumericChar = fragment.length() == 1
                        && !org.apache.commons.lang3.StringUtils.isNumeric(fragment);
                if (index > 0 && singleNonNumericChar) {
                    // single character -> append to previous token
                    parts.add(parts.pollLast() + fragment);
                } else {
                    parts.add(fragment);
                }
                index++;
            }
            result.append(StringUtils.join(parts, ' ')).append(" ");
        }
        return result.toString().trim();
    }

    /**
     * Demo that converts some sample IRIs using the DBpedia endpoint and
     * prints the resulting labels to standard out.
     * 
     * @param args
     *            not used
     */
    public static void main(String[] args) {
        DefaultIRIConverter demo = new DefaultIRIConverter(SparqlEndpoint.getEndpointDBpedia());

        System.out.println(demo.convert("http://dbpedia.org/resource/Nuclear_Reactor_Technology"));
        System.out.println(demo.convert("http://dbpedia.org/resource/Woodroffe_School"));
        // the only sample for which dereferencing is allowed
        System.out.println(demo.convert("http://dbpedia.org/ontology/isBornIn", true));
        System.out.println(demo.convert("http://www.w3.org/2001/XMLSchema#integer"));
    }

}