Java tutorial
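The class below, DocumentGenerator from the AKSW Triple2NL module, verbalises a set of RDF triples as English text using SimpleNLG. It builds a directed graph over the non-type triples and inspects its connected components, groups the triples by subject, orders subjects that never occur in object position first, and then realises one sentence per subject: the rdf:type triples are coordinated into a single clause (the last two types joined with "as well as"), while the remaining triples are attached using a placeholder subject ("it" or "whose").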
/*
 * #%L
 * Triple2NL
 * %%
 * Copyright (C) 2015 Agile Knowledge Engineering and Semantic Web (AKSW)
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package org.aksw.simba.bengal.triple2nl;

import java.io.ByteArrayInputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.aksw.jena_sparql_api.core.QueryExecutionFactory;
import org.aksw.jena_sparql_api.http.QueryExecutionFactoryHttp;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.graph.Triple;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Statement;
import org.apache.jena.rdf.model.StmtIterator;
import org.apache.jena.vocabulary.RDF;
import org.dllearner.kb.sparql.SparqlEndpoint;
import org.jgrapht.alg.ConnectivityInspector;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;

import simplenlg.features.Feature;
import simplenlg.features.InternalFeature;
import simplenlg.framework.CoordinatedPhraseElement;
import simplenlg.framework.DocumentElement;
import simplenlg.framework.NLGElement;
import simplenlg.framework.NLGFactory;
import simplenlg.lexicon.Lexicon;
import simplenlg.phrasespec.SPhraseSpec;
import simplenlg.realiser.english.Realiser;

/**
 * @author Lorenz Buehmann
 */
public class DocumentGenerator {

    private TripleConverter tripleConverter;
    private NLGFactory nlgFactory;
    private Realiser realiser;

    private boolean useAsWellAsCoordination = true;

    public DocumentGenerator(SparqlEndpoint endpoint, String cacheDirectory) {
        this(endpoint, cacheDirectory, Lexicon.getDefaultLexicon());
    }

    public DocumentGenerator(SparqlEndpoint endpoint, String cacheDirectory, Lexicon lexicon) {
        this(new QueryExecutionFactoryHttp(endpoint.getURL().toString(), endpoint.getDefaultGraphURIs()),
                cacheDirectory, lexicon);
    }

    public DocumentGenerator(QueryExecutionFactory qef, String cacheDirectory, Lexicon lexicon) {
        tripleConverter = new TripleConverter(qef, null, null, cacheDirectory, null, lexicon);
        nlgFactory = new NLGFactory(lexicon);
        realiser = new Realiser(lexicon);
    }

    public String generateDocument(Model model) {
        Set<Triple> triples = asTriples(model);
        return generateDocument(triples);
    }

    private Set<Triple> asTriples(Model model) {
        Set<Triple> triples = new HashSet<>((int) model.size());
        StmtIterator iterator = model.listStatements();
        while (iterator.hasNext()) {
            Statement statement = iterator.next();
            triples.add(statement.asTriple());
        }
        return triples;
    }
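    // Core entry point for pre-extracted triples: triples are grouped by subject,
    // the subjects are ordered, and each group is realised as one sentence.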
    public String generateDocument(Set<Triple> documentTriples) {
        DefaultDirectedGraph<Node, DefaultEdge> graph = asGraph(documentTriples);

        // divide the document into paragraphs for each connected component in the graph
        ConnectivityInspector<Node, DefaultEdge> connectivityInspector = new ConnectivityInspector<>(graph);
        List<Set<Node>> connectedNodes = connectivityInspector.connectedSets();
        for (Set<Node> nodes : connectedNodes) {
            System.out.println(nodes);
        }

        // group triples by subject
        Map<Node, Collection<Triple>> subject2Triples = groupBySubject(documentTriples);

        // do some sorting
        subject2Triples = sort(documentTriples, subject2Triples);

        List<DocumentElement> sentences = new ArrayList<>();
        for (Entry<Node, Collection<Triple>> entry : subject2Triples.entrySet()) {
            Node subject = entry.getKey();
            Collection<Triple> triples = entry.getValue();

            // combine with conjunction
            CoordinatedPhraseElement conjunction = nlgFactory.createCoordinatedPhrase();

            // get the type triples first
            Set<Triple> typeTriples = new HashSet<>();
            Set<Triple> otherTriples = new HashSet<>();
            for (Triple triple : triples) {
                if (triple.predicateMatches(RDF.type.asNode())) {
                    typeTriples.add(triple);
                } else {
                    otherTriples.add(triple);
                }
            }

            // convert the type triples
            List<SPhraseSpec> typePhrases = tripleConverter.convertToPhrases(typeTriples);

            // if there is more than one type, we combine them in a single clause
            if (typePhrases.size() > 1) {
                // combine all objects in a coordinated phrase
                CoordinatedPhraseElement combinedObject = nlgFactory.createCoordinatedPhrase();
                // the last 2 phrases are combined via 'as well as'
                if (useAsWellAsCoordination) {
                    SPhraseSpec phrase1 = typePhrases.remove(typePhrases.size() - 1);
                    SPhraseSpec phrase2 = typePhrases.get(typePhrases.size() - 1);
                    // combine all objects in a coordinated phrase
                    CoordinatedPhraseElement combinedLastTwoObjects = nlgFactory
                            .createCoordinatedPhrase(phrase1.getObject(), phrase2.getObject());
                    combinedLastTwoObjects.setConjunction("as well as");
                    combinedLastTwoObjects.setFeature(Feature.RAISE_SPECIFIER, false);
                    combinedLastTwoObjects.setFeature(InternalFeature.SPECIFIER, "a");
                    phrase2.setObject(combinedLastTwoObjects);
                }
                Iterator<SPhraseSpec> iterator = typePhrases.iterator();
                // pick first phrase as representative
                SPhraseSpec representative = iterator.next();
                combinedObject.addCoordinate(representative.getObject());
                while (iterator.hasNext()) {
                    SPhraseSpec phrase = iterator.next();
                    NLGElement object = phrase.getObject();
                    combinedObject.addCoordinate(object);
                }
                combinedObject.setFeature(Feature.RAISE_SPECIFIER, true);
                // set the coordinated phrase as the object
                representative.setObject(combinedObject);
                // return a single phrase
                typePhrases = Lists.newArrayList(representative);
            }
            for (SPhraseSpec phrase : typePhrases) {
                conjunction.addCoordinate(phrase);
            }

            // convert the other triples, but use place holders for the subject
            String placeHolderToken = (typeTriples.isEmpty() || otherTriples.size() == 1) ? "it" : "whose";
            Node placeHolder = NodeFactory.createURI("http://sparql2nl.aksw.org/placeHolder/" + placeHolderToken);
            Collection<Triple> placeHolderTriples = new ArrayList<>(otherTriples.size());
            Iterator<Triple> iterator = otherTriples.iterator();
            // we have to keep one triple with subject if we have no type triples
            if (typeTriples.isEmpty() && iterator.hasNext()) {
                placeHolderTriples.add(iterator.next());
            }
            while (iterator.hasNext()) {
                Triple triple = iterator.next();
                Triple newTriple = Triple.create(placeHolder, triple.getPredicate(), triple.getObject());
                placeHolderTriples.add(newTriple);
            }

            Collection<SPhraseSpec> otherPhrases = tripleConverter.convertToPhrases(placeHolderTriples);
            for (SPhraseSpec phrase : otherPhrases) {
                conjunction.addCoordinate(phrase);
            }

            DocumentElement sentence = nlgFactory.createSentence(conjunction);
            sentences.add(sentence);
        }

        DocumentElement paragraph = nlgFactory.createParagraph(sentences);
        String paragraphText = realiser.realise(paragraph).getRealisation();
        return paragraphText;
    }

    /**
     * @param documentTriples
     *            the set of triples
     * @param subject2Triples
     *            a map that contains for each node the triples in which it
     *            occurs as subject
     */
    private Map<Node, Collection<Triple>> sort(Set<Triple> documentTriples,
            Map<Node, Collection<Triple>> subject2Triples) {
        Map<Node, Collection<Triple>> sortedTriples = new LinkedHashMap<>();

        // we can order by occurrence, i.e. which subjects do not occur in object position;
        // for each subject we check how often they occur in subject/object position
        Multimap<Node, Node> outgoing = HashMultimap.create();
        Multimap<Node, Node> incoming = HashMultimap.create();
        for (Node subject : subject2Triples.keySet()) {
            for (Triple triple : documentTriples) {
                if (triple.subjectMatches(subject)) {
                    outgoing.put(subject, triple.getObject());
                } else if (triple.objectMatches(subject)) {
                    incoming.put(subject, triple.getSubject());
                }
            }
        }

        // prefer subjects that do not occur in object position first
        for (Iterator<Entry<Node, Collection<Triple>>> iterator = subject2Triples.entrySet().iterator();
                iterator.hasNext(); ) {
            Entry<Node, Collection<Triple>> entry = iterator.next();
            Node subject = entry.getKey();
            if (!incoming.containsKey(subject)) {
                sortedTriples.put(subject, new HashSet<>(entry.getValue()));
                iterator.remove();
            }
        }
        // add the rest
        sortedTriples.putAll(subject2Triples);

        // TODO order by triple count
        // TODO order by prominence

        return sortedTriples;
    }

    private Map<Node, Collection<Triple>> groupBySubject(Set<Triple> triples) {
        Multimap<Node, Triple> subject2Triples = HashMultimap.create();
        for (Triple triple : triples) {
            subject2Triples.put(triple.getSubject(), triple);
        }
        return subject2Triples.asMap();
    }

    private DefaultDirectedGraph<Node, DefaultEdge> asGraph(Set<Triple> triples) {
        DefaultDirectedGraph<Node, DefaultEdge> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
        for (Triple triple : triples) {
            // we have to omit type triples to get connected subgraphs later on
            if (!triple.predicateMatches(RDF.type.asNode())) {
                graph.addVertex(triple.getSubject());
                graph.addVertex(triple.getObject());
                graph.addEdge(triple.getSubject(), triple.getObject());
            }
        }
        return graph;
    }

    public static void main(String[] args) throws Exception {
        String triples = "@prefix : <http://dbpedia.org/resource/> ."
                + "@prefix dbo: <http://dbpedia.org/ontology/> ."
                + "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> ."
                + ":Albert_Einstein a dbo:Physicist, dbo:Philosopher;"
                + "dbo:birthPlace :Ulm;"
                + "dbo:birthDate \"1879-03-14\"^^xsd:date ;"
                + "dbo:studiedIn :Frankfurt ."
                + ":Ulm a dbo:City;"
                + "dbo:country :Germany;"
                + "dbo:federalState :Baden_Württemberg ."
                + ":Leipzig a dbo:City;"
                + "dbo:country :Germany;"
                + "dbo:federalState :Saxony .";

        Model model = ModelFactory.createDefaultModel();
        model.read(new ByteArrayInputStream(triples.getBytes()), null, "TURTLE");

        DocumentGenerator gen = new DocumentGenerator(SparqlEndpoint.getEndpointDBpedia(), "cache");
        String document = gen.generateDocument(model);
        System.out.println(document);
    }
}
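The main method reads its example data from an inline Turtle string, but the public generateDocument(Set<Triple>) overload can also be fed triples that are built directly in code. The sketch below illustrates that second entry point; the class name DocumentGeneratorExample is just for illustration, it reuses the DBpedia endpoint and the "cache" directory from the main method above, it assumes the same dependencies are on the classpath, and the DBpedia URIs serve purely as example data.

package org.aksw.simba.bengal.triple2nl;

import java.util.HashSet;
import java.util.Set;

import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.graph.Triple;
import org.apache.jena.vocabulary.RDF;
import org.dllearner.kb.sparql.SparqlEndpoint;

public class DocumentGeneratorExample {

    public static void main(String[] args) throws Exception {
        // build two triples programmatically instead of parsing Turtle
        Node einstein = NodeFactory.createURI("http://dbpedia.org/resource/Albert_Einstein");
        Node scientist = NodeFactory.createURI("http://dbpedia.org/ontology/Scientist");
        Node birthPlace = NodeFactory.createURI("http://dbpedia.org/ontology/birthPlace");
        Node ulm = NodeFactory.createURI("http://dbpedia.org/resource/Ulm");

        Set<Triple> triples = new HashSet<>();
        triples.add(Triple.create(einstein, RDF.type.asNode(), scientist));
        triples.add(Triple.create(einstein, birthPlace, ulm));

        // same constructor as in the main method above; the endpoint backs the
        // underlying TripleConverter, so running this requires network access
        DocumentGenerator gen = new DocumentGenerator(SparqlEndpoint.getEndpointDBpedia(), "cache");
        System.out.println(gen.generateDocument(triples));
    }
}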