org.nuxeo.stanbol.temis.engine.TemisLuxidEnhancementEngine.java Source code

Java tutorial

Introduction

Here is the source code for org.nuxeo.stanbol.temis.engine.TemisLuxidEnhancementEngine.java

Source

/* Copyright 2011 Nuxeo and contributors.
 *
 * This file is licensed to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.nuxeo.stanbol.temis.engine;

import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Dictionary;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import javax.xml.bind.JAXBException;
import javax.xml.namespace.QName;
import javax.xml.ws.Holder;

import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.ConfigurationPolicy;
import org.apache.felix.scr.annotations.Properties;
import org.apache.felix.scr.annotations.Property;
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
import org.nuxeo.stanbol.temis.impl.AnnotationPlan;
import org.nuxeo.stanbol.temis.impl.ArrayOfAnnotationPlan;
import org.nuxeo.stanbol.temis.impl.Fault;
import org.nuxeo.stanbol.temis.impl.Output;
import org.nuxeo.stanbol.temis.impl.OutputPart;
import org.nuxeo.stanbol.temis.impl.TemisWebService;
import org.nuxeo.stanbol.temis.impl.TemisWebServicePortType;
import org.osgi.framework.Constants;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;

/**
 * Enhancement engine implementation that delegates the analysis work to a
 * Temis Luxid Annotation Factory service.
 * 
 * The connection properties can be looked up from environment variables (upper
 * case OSGi property names with '_' instead of '.') if the properties are left
 * undefined in the OSGi configuration.
 */
@Component(configurationFactory = true, immediate = true, metatype = true, policy = ConfigurationPolicy.REQUIRE, specVersion = "1.1", inherit = true, label = "%stanbol.TemisEnhancementEngine.name", description = "%stanbol.TemisEnhancementEngine.description")
@Service
@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "temis"),
        @Property(name = Constants.SERVICE_RANKING, intValue = 0) })
public class TemisLuxidEnhancementEngine extends AbstractEnhancementEngine<ConfigurationException, RuntimeException>
        implements EnhancementEngine, ServiceProperties {

    /** CAS consumer used when none is configured. */
    public static final String DEFAULT_CAS_CONSUMER = "SimpleXML";

    public static final Log log = LogFactory.getLog(TemisLuxidEnhancementEngine.class);

    /** Qualified name of the web service looked up in the WSDL. */
    public static final QName SERVICE_NAME = new QName("http://luxid.temis.com/ws", "TemisWebService");

    @Property
    public static final String SERVICE_WSDL_URL_PROPERTY = "stanbol.temis.luxid.service.wsdl.url";

    @Property
    public static final String SERVICE_ACCOUNT_ID_PROPERTY = "stanbol.temis.luxid.service.account.id";

    @Property
    public static final String SERVICE_ACCOUNT_PASSWORD_PROPERTY = "stanbol.temis.luxid.service.account.password";

    @Property
    public static final String SERVICE_ANNOTATION_PLAN_PROPERTY = "stanbol.temis.luxid.service.annotation.plan";

    @Property
    public static final String SERVICE_CAS_CONSUMER_PROPERTY = "stanbol.temis.luxid.service.cas.consumer";

    /** Predicate used to attach entity transliterations to text annotations. */
    public static final UriRef TRANSLITERATION = new UriRef(NamespaceEnum.fise + "transliteration");

    /**
     * The default value for the Execution of this Engine. Currently set to
     * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
     */
    public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;

    protected static final String TEXT_PLAIN_MIMETYPE = "text/plain";

    /** Namespace prepended to Luxid entity paths to build entity URIs. */
    public static final String LUXID_NS = "http://www.temis.com/luxid#";

    /**
     * Mapping from Luxid path prefixes to Stanbol types, built once instead of
     * on every lookup.
     */
    // TODO: un-hard-code mapping: use a configuration file or an OSGi
    // property
    private static final Map<String, UriRef> LUXID_TO_STANBOL_TYPES;
    static {
        Map<String, UriRef> typeMap = new HashMap<String, UriRef>();
        typeMap.put("/Entity/Person", OntologicalClasses.DBPEDIA_PERSON);
        typeMap.put("/Entity/Media", OntologicalClasses.DBPEDIA_ORGANISATION);
        typeMap.put("/Entity/Organisation", OntologicalClasses.DBPEDIA_ORGANISATION);
        typeMap.put("/Entity/Organization", OntologicalClasses.DBPEDIA_ORGANISATION);
        typeMap.put("/Entity/Company", OntologicalClasses.DBPEDIA_ORGANISATION);
        typeMap.put("/Entity/Location", OntologicalClasses.DBPEDIA_PLACE);
        typeMap.put("/Category", OntologicalClasses.SKOS_CONCEPT);
        LUXID_TO_STANBOL_TYPES = Collections.unmodifiableMap(typeMap);
    }

    protected String annotationPlan;

    protected String accountId;

    protected String accountPassword;

    protected String casConsumer;

    protected TemisWebServicePortType wsPort;

    /**
     * Load a required property value from OSGi context with fall-back on
     * environment variable.
     * 
     * @throws ConfigurationException if no configuration is found in either
     *             contexts.
     */
    protected String getFromPropertiesOrEnv(Dictionary<String, String> properties, String propertyName)
            throws ConfigurationException {
        return getFromPropertiesOrEnv(properties, propertyName, null);
    }

    /**
     * Load a property value from the OSGi context, falling back first on the
     * matching environment variable and then on the provided default.
     * 
     * The environment variable name is the upper-cased property name with '.'
     * replaced by '_'.
     * 
     * @param properties the OSGi component properties
     * @param propertyName the name of the property to look up
     * @param defaultValue value returned when neither source is set, or
     *            {@code null} to make the property required
     * @return the non-blank configured value, or the default
     * @throws ConfigurationException if no value is found and no default is
     *             given
     */
    protected String getFromPropertiesOrEnv(Dictionary<String, String> properties, String propertyName,
            String defaultValue) throws ConfigurationException {
        // Locale.ROOT: the default-locale upper-casing would turn 'i' into a
        // dotted capital I under e.g. a Turkish locale and miss the variable
        String envVariableName = propertyName.replace('.', '_').toUpperCase(Locale.ROOT);
        String propertyValue = System.getenv(envVariableName);
        String configuredValue = properties.get(propertyName);
        if (configuredValue != null && !configuredValue.trim().isEmpty()) {
            // an explicit OSGi configuration wins over the environment
            propertyValue = configuredValue;
        }
        if (propertyValue == null || propertyValue.trim().isEmpty()) {
            if (defaultValue == null) {
                throw new ConfigurationException(propertyName,
                        String.format("%s is a required property", propertyName));
            }
            return defaultValue;
        }
        return propertyValue;
    }

    /**
     * Read the connection settings, build the web service port and check
     * eagerly that the credentials work and that the configured annotation
     * plan is available to the authenticated account.
     * 
     * @throws ConfigurationException if a required property is missing, the
     *             WSDL URL is malformed, the service reports a fault or the
     *             annotation plan is not available
     */
    @Override
    protected void activate(ComponentContext ce) throws ConfigurationException {
        super.activate(ce);
        @SuppressWarnings("unchecked")
        Dictionary<String, String> properties = ce.getProperties();
        String urlString = getFromPropertiesOrEnv(properties, SERVICE_WSDL_URL_PROPERTY);
        accountId = getFromPropertiesOrEnv(properties, SERVICE_ACCOUNT_ID_PROPERTY);
        accountPassword = getFromPropertiesOrEnv(properties, SERVICE_ACCOUNT_PASSWORD_PROPERTY);
        annotationPlan = getFromPropertiesOrEnv(properties, SERVICE_ANNOTATION_PLAN_PROPERTY);
        casConsumer = getFromPropertiesOrEnv(properties, SERVICE_CAS_CONSUMER_PROPERTY, DEFAULT_CAS_CONSUMER);

        // check the connection to fail early in case of bad configuration
        // parameters
        String sessionId = null;
        try {
            TemisWebService tws = new TemisWebService(new URL(urlString), SERVICE_NAME);
            wsPort = tws.getWebAnnotationPort();
            sessionId = connect();
            // check that the requested annotationPlan is available to the
            // authenticated user
            Holder<ArrayOfAnnotationPlan> plans = new Holder<ArrayOfAnnotationPlan>();
            Holder<Fault> fault = new Holder<Fault>();
            wsPort.getPlans(sessionId, plans, fault);
            handleFault(fault);
            boolean foundPlan = false;
            List<String> availablePlanNames = new ArrayList<String>();
            // a missing holder value is treated as "no plan available"
            // instead of failing with a NullPointerException
            if (plans.value != null) {
                for (AnnotationPlan availablePlan : plans.value.getReturn()) {
                    if (availablePlan.getName().equals(annotationPlan)) {
                        foundPlan = true;
                        break;
                    }
                    availablePlanNames.add(availablePlan.getName());
                }
            }
            if (!foundPlan) {
                throw new TemisEnhancementEngineException(String.format(
                        "The requested annotationPlan '%s' does not belong to"
                                + " the list of available plans: '%s'",
                        annotationPlan, StringUtils.join(availablePlanNames, ", ")));
            }
        } catch (TemisEnhancementEngineException e) {
            log.error(e, e);
            throw new ConfigurationException(SERVICE_WSDL_URL_PROPERTY, e.getMessage(), e);
        } catch (MalformedURLException e) {
            throw new ConfigurationException(SERVICE_WSDL_URL_PROPERTY, e.getMessage(), e);
        } finally {
            if (sessionId != null) {
                wsPort.closeSession(sessionId);
            }
        }
    }

    /**
     * Release the web service port and forget the configured annotation plan.
     */
    @Override
    protected void deactivate(ComponentContext ce) {
        super.deactivate(ce);
        wsPort = null;
        annotationPlan = null;
    }

    /**
     * Authenticate against the web service with the configured account.
     * 
     * @return the session token filled in by the service
     * @throws TemisEnhancementEngineException if the service reports a fault
     */
    public String connect() throws TemisEnhancementEngineException {
        Holder<String> token = new Holder<String>();
        Holder<Fault> fault = new Holder<Fault>();
        wsPort.authenticate(accountId, accountPassword, token, fault);
        handleFault(fault);
        return token.value;
    }

    /**
     * Turn a fault reported by the web service into an exception.
     * 
     * @param fault holder filled by a web service call; a fault without a
     *            non-empty message is ignored
     * @throws TemisEnhancementEngineException if the holder carries a fault
     *             with a non-empty message
     */
    protected void handleFault(Holder<Fault> fault) throws TemisEnhancementEngineException {
        if (fault.value != null && fault.value.getMessage() != null && !fault.value.getMessage().isEmpty()) {
            throw new TemisEnhancementEngineException(fault.value);
        }
    }

    /**
     * Send the content of the item to the annotation service and register the
     * annotations found in its XML "DOCUMENT" output part in the metadata
     * graph of the content item.
     * 
     * A new session is opened for each call and closed when done.
     * 
     * @throws EngineException if the content cannot be read, the service
     *             reports a fault or returns no output, or the output cannot
     *             be parsed
     */
    @Override
    public void computeEnhancements(ContentItem ci) throws EngineException {
        String token = null;
        LiteralFactory literalFactory = LiteralFactory.getInstance();
        MGraph g = ci.getMetadata();
        try {
            token = connect();
            Holder<Fault> fault = new Holder<Fault>();
            // TODO: read charset from the request instead of hardcoding UTF-8
            // requirement
            // TODO: extract ~3 sentences context for each annotation is
            // possible
            String luxidInput;
            InputStream content = ci.getStream();
            try {
                luxidInput = IOUtils.toString(content, "UTF-8");
            } finally {
                // the stream was opened for this call only: do not leak it
                IOUtils.closeQuietly(content);
            }
            Holder<Output> output = new Holder<Output>();
            wsPort.annotateString(token, annotationPlan, luxidInput, casConsumer, output, fault);
            handleFault(fault);
            if (output.value == null) {
                // neither a fault nor an output: report it explicitly rather
                // than failing with a NullPointerException below
                throw new TemisEnhancementEngineException(String.format(
                        "The annotation service returned no output for annotationPlan '%s'", annotationPlan));
            }
            for (OutputPart part : output.value.getParts()) {
                if ("DOCUMENT".equals(part.getName()) && "text/xml".equals(part.getMime())) {
                    String luxidOutput = part.getText();
                    handleLuxidOutput(ci, literalFactory, g, luxidInput, luxidOutput);
                }
            }
        } catch (IOException e) {
            throw new EngineException(e);
        } catch (JAXBException e) {
            throw new EngineException(e);
        } finally {
            if (token != null) {
                wsPort.closeSession(token);
            }
        }
    }

    /**
     * Parse the XML output of the annotation service and register one topic
     * annotation per topic entity, and one entity annotation plus one text
     * annotation per occurrence for each merged entity.
     * 
     * @param ci the content item being enhanced
     * @param literalFactory factory used to build the typed literals
     * @param g the graph receiving the new triples
     * @param text the text that was sent to the service, used to extract the
     *            context of each occurrence
     * @param luxidOutput the XML document returned by the service
     * @throws JAXBException if the XML document cannot be read
     */
    protected void handleLuxidOutput(ContentItem ci, LiteralFactory literalFactory, MGraph g, String text,
            String luxidOutput) throws JAXBException {
        Doc result = Doc.readFrom(luxidOutput);
        for (Entity entity : result.getTopicEntities()) {
            if ("other".equalsIgnoreCase(entity.getName())) {
                // skip place holder topic
                continue;
            }
            UriRef topicAnnotation = EnhancementEngineHelper.createTopicEnhancement(ci, this);
            addCommonEntityAttributes(literalFactory, g, entity, topicAnnotation);
        }
        for (Entity entity : result.getMergedEntities()) {
            UriRef entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            Set<UriRef> stanbolTypes = addCommonEntityAttributes(literalFactory, g, entity, entityAnnotation);
            // register entity occurrences
            for (Occurrence occurrence : entity.getOccurrences()) {
                UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                for (UriRef entityType : stanbolTypes) {
                    g.add(new TripleImpl(textAnnotation, DC_TYPE, entityType));
                }
                String context = findContext(text, occurrence.getBegin(), occurrence.getEnd());
                String selectedText = occurrence.getText();
                // The offsets are relative to the context string, not to the
                // whole text.
                // NOTE(review): indexOf returns the first match, which may be
                // an earlier mention inside the context, or -1 when the
                // occurrence text is not found verbatim -- confirm what the
                // consumers of these offsets expect.
                int start = context.indexOf(selectedText);
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
                        literalFactory.createTypedLiteral(selectedText)));
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
                        literalFactory.createTypedLiteral(context)));
                g.add(new TripleImpl(textAnnotation, ENHANCER_START,
                        literalFactory.createTypedLiteral(start)));
                g.add(new TripleImpl(textAnnotation, ENHANCER_END,
                        literalFactory.createTypedLiteral(start + selectedText.length())));
                for (String transliteration : entity.transliterations) {
                    g.add(new TripleImpl(textAnnotation, TRANSLITERATION,
                            literalFactory.createTypedLiteral(transliteration)));
                }
                // Link entity annotations to its occurrences
                g.add(new TripleImpl(entityAnnotation, DC_RELATION, textAnnotation));
            }
        }
    }

    /**
     * Attach the reference, label and types of an entity to an annotation.
     * 
     * @param literalFactory factory used to build the typed literals
     * @param g the graph receiving the new triples
     * @param entity the entity described by the annotation
     * @param entityAnnotation the annotation node to describe
     * @return the Stanbol types derived from the entity path
     */
    protected Set<UriRef> addCommonEntityAttributes(LiteralFactory literalFactory, MGraph g, Entity entity,
            UriRef entityAnnotation) {
        String entityPath = entity.getPath();
        UriRef entityUri = new UriRef(LUXID_NS + entityPath);
        String entityLabel = entity.getName();

        // add the link to the referred entity
        g.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entityUri));
        g.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL,
                literalFactory.createTypedLiteral(entityLabel)));
        Set<UriRef> stanbolTypes = getStanbolTypes(entityPath);
        for (UriRef entityType : stanbolTypes) {
            g.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, entityType));
        }
        return stanbolTypes;
    }

    /**
     * Build a snippet made of the selected span surrounded by at most 30
     * space-separated tokens on each side, the three parts being joined by a
     * single space.
     * 
     * Offsets outside of the text are clamped so that inconsistent positions
     * yield a (possibly empty) selection instead of an exception.
     * 
     * @param text the full text
     * @param begin start offset of the selection in the text
     * @param end end offset (exclusive) of the selection in the text
     * @return the selection with its surrounding context
     */
    protected String findContext(String text, int begin, int end) {
        int length = text.length();
        int from = Math.min(Math.max(begin, 0), length);
        int to = Math.min(Math.max(end, from), length);
        String prefix = shorten(text.substring(0, from), 30, true);
        String suffix = shorten(text.substring(to), 30, false);
        String selected = text.substring(from, to);
        return String.format("%s %s %s", prefix, selected, suffix);
    }

    /**
     * Truncate a text to a maximum number of space-separated tokens.
     * 
     * The result is always a verbatim slice of the content: the beginning of
     * the content, or its end when {@code reverse} is set. Empty tokens
     * produced by leading or repeated spaces count as tokens.
     * 
     * @param content the text to truncate, may be {@code null}
     * @param maxWords the maximum number of tokens to keep
     * @param reverse keep the last tokens instead of the first ones
     * @return the content itself when it is short enough, the truncated slice
     *         otherwise, or the empty string for a {@code null} content
     */
    protected String shorten(String content, int maxWords, boolean reverse) {
        if (content == null) {
            return "";
        }
        List<String> tokens = Arrays.asList(content.split(" "));
        if (tokens.size() <= maxWords) {
            return content;
        }
        if (maxWords <= 0) {
            return "";
        }
        if (!reverse) {
            // split keeps leading empty tokens, hence joining the first
            // tokens restores the leading spaces as they were
            return StringUtils.join(tokens.subList(0, maxWords), " ");
        }
        String tail = StringUtils.join(tokens.subList(tokens.size() - maxWords, tokens.size()), " ");
        // split drops trailing empty tokens, i.e. the trailing spaces:
        // re-append them exactly as they were
        int end = content.length();
        while (end > 0 && content.charAt(end - 1) == ' ') {
            end--;
        }
        return tail + content.substring(end);
    }

    /**
     * Map a Luxid entity path to the matching Stanbol types.
     * 
     * The path is walked up by repeatedly dropping its last '/'-separated
     * segment and each resulting ancestor path is looked up in the type
     * mapping; the full path itself is not looked up.
     * 
     * @param entityPath the '/'-separated path of the entity
     * @return the matching types, most specific ancestor first; possibly empty
     */
    protected Set<UriRef> getStanbolTypes(String entityPath) {
        Set<UriRef> types = new LinkedHashSet<UriRef>();
        int lastSeparator = entityPath.lastIndexOf('/');
        while (lastSeparator != -1) {
            entityPath = entityPath.substring(0, lastSeparator);
            UriRef type = LUXID_TO_STANBOL_TYPES.get(entityPath);
            if (type != null) {
                types.add(type);
            }
            lastSeparator = entityPath.lastIndexOf('/');
        }
        return types;
    }

    /**
     * Tell whether the content item can be processed: only plain text content
     * is accepted, whatever the parameters following the mime type.
     * 
     * @return {@code ENHANCE_SYNCHRONOUS} for text/plain content,
     *         {@code CANNOT_ENHANCE} otherwise or when the mime type is
     *         unknown
     */
    @Override
    public int canEnhance(ContentItem ci) {
        // TODO: check what format are supported by Luxid instead of
        // constraining to text/plain
        String declaredMimeType = ci.getMimeType();
        if (declaredMimeType == null) {
            return CANNOT_ENHANCE;
        }
        // drop the parameters such as "; charset=UTF-8"
        String mimeType = declaredMimeType.split(";", 2)[0].trim();
        if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) {
            return ENHANCE_SYNCHRONOUS;
        }
        return CANNOT_ENHANCE;
    }

    /**
     * Expose the execution order of this engine.
     * 
     * @return an unmodifiable single-entry map holding {@link #defaultOrder}
     *         under the {@code ENHANCEMENT_ENGINE_ORDERING} key
     */
    @Override
    public Map<String, Object> getServiceProperties() {
        return Collections
                .unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
    }

}