Java tutorial: UIMAInjector.java
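This tutorial walks through UIMAInjector, the UIMA-based metadata injector from the MinBZK Search Enricher indexing generator. The class feeds extracted document text through a UIMA analysis engine and copies the resulting entity, geo-location, sentiment, and classification annotations into the processor output's metadata. The full source follows, and a short usage sketch closes the tutorial.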
/*
 * Copyright (c) 2010 Ministry of the Interior and Kingdom Relations,
 * the Netherlands. All rights reserved.
 *
 * This file is part of the MinBZK Search Enricher indexing generator.
 *
 * Search Enricher is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Search Enricher is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Search Enricher. If not, see <http://www.gnu.org/licenses/>.
 */
package nl.minbzk.dwr.zoeken.enricher.processor;

import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.io.GeohashUtils;
import com.spatial4j.core.shape.Point;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import nl.minbzk.dwr.zoeken.enricher.ProcessorResult.ProcessorContent;
import nl.minbzk.dwr.zoeken.enricher.processor.uima.UIMAUnit;
import nl.minbzk.dwr.zoeken.enricher.util.TextNormalizer;

import org.apache.tika.mime.MediaType;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.impl.AnalysisEngineManagementImpl;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;

import static java.lang.String.format;

/**
 * UIMA-based injector.
 *
 * @author Jasper van Veghel <jasper@seajas.com>
 */
public class UIMAInjector {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(UIMAInjector.class);

    /**
     * Constants.
     */
    private static final String GEO_LOCATION_UNKNOWN = "0";
    private static final String GEO_LOCATION_SEPARATOR = "#";
    private static final String GEO_LOCATION_HASH_SUFFIX = "_hash";

    private static final String CONTEXT_PARAMETER_CAS = "tika.processor.uima.cas";

    private static final Integer UIMA_MINIMUM_LENGTH = 6;
    private static final Integer UIMA_MAXIMUM_LENGTH = 12000;

    private static final String SENTIMENT_FIELD = "sentiment";
    private static final String CLASSIFICATIONS_FIELD = "classifications";

    /**
     * The entity-detection scan types.
     */
    private final List<String> entityDetectionScanTypes;

    /**
     * The entity-detection field prefix.
     */
    private final String entityDetectionFieldPrefix;

    /**
     * The geo-spatial field prefix.
     */
    private final String geoSpatialFieldPrefix;

    /**
     * The UIMA unit.
     */
    private final UIMAUnit uimaUnit;

    /**
     * The CAS.
     */
    private final CAS cas;
    /**
     * Default constructor.
     *
     * @param context
     * @param uimaUnit
     * @param entityDetectionScanTypes
     * @param entityDetectionFieldPrefix
     * @param geoSpatialFieldPrefix
     */
    public UIMAInjector(final ProcessorContext context, final UIMAUnit uimaUnit, final List<String> entityDetectionScanTypes,
            final String entityDetectionFieldPrefix, final String geoSpatialFieldPrefix) {
        this.uimaUnit = uimaUnit;
        this.cas = getCasFromContext(context, uimaUnit);

        if (cas == null)
            throw new IllegalArgumentException("Unable to obtain a CAS to process the given document content through");

        this.entityDetectionScanTypes = entityDetectionScanTypes;
        this.entityDetectionFieldPrefix = entityDetectionFieldPrefix;
        this.geoSpatialFieldPrefix = geoSpatialFieldPrefix;
    }

    /**
     * Process the given content through the UIMA pipeline.
     *
     * @param documentOutput
     * @param documentId
     * @param detectedLanguage
     * @param mediaType
     * @param processorOutput
     * @throws AnalysisEngineProcessException
     */
    public void inject(final String documentOutput, final String documentId, final String detectedLanguage, final MediaType mediaType,
            final ProcessorContent processorOutput) throws AnalysisEngineProcessException {
        if (logger.isTraceEnabled())
            logger.trace(format("Feeding document %s (%s) as text: %s", documentId, mediaType, documentOutput));

        try {
            Feature alternativeFeature = getFeature("alternative");
            Feature locationFeature = getFeature("location");

            cas.setDocumentLanguage(uimaUnit.getLanguage());
            cas.setDocumentText(cleanOutput(documentOutput, mediaType));

            // Now add in document metadata and set the document ID

            Type metadataType = cas.getTypeSystem().getType("enricher.uima.DocumentMetadata");

            FeatureStructure metadataFS = cas.createFS(metadataType);

            metadataFS.setStringValue(metadataType.getFeatureByBaseName("id"), documentId);

            cas.addFsToIndexes(metadataFS);

            // And continue on with processing

            if (logger.isTraceEnabled())
                logger.trace("Start processing the relevant CAS");

            uimaUnit.getAnalysisEngine().process(cas);

            if (logger.isTraceEnabled())
                logger.trace("Finished processing the relevant CAS - now cycling through results");

            long sessionProcessingTime = uimaUnit.getSessionProcessingTime();

            if (sessionProcessingTime > 0) {
                if (logger.isInfoEnabled())
                    logger.info(format("Reporting the session processing time as %d", sessionProcessingTime));

                ((AnalysisEngineManagementImpl) uimaUnit.getInternalManagementInterface()).reportServiceCallTime(sessionProcessingTime);
                ((AnalysisEngineManagementImpl) uimaUnit.getInternalManagementInterface()).incrementCASesProcessed();
            }

            // Filter down to the relevant data sets

            Map<String, Map<String, Integer>> annotationsWithCounts = new HashMap<String, Map<String, Integer>>();
            Map<String, List<String>> annotationsWithDuplicates = new HashMap<String, List<String>>();
            Map<String, List<String>> annotationsWithLocations = new HashMap<String, List<String>>();

            if (logger.isTraceEnabled())
                logger.trace("Filtering types");

            filterTypes(documentId, alternativeFeature, locationFeature, annotationsWithCounts, annotationsWithDuplicates, annotationsWithLocations);

            if (logger.isTraceEnabled())
                logger.trace("Ordering types");

            orderTypes(detectedLanguage, processorOutput, annotationsWithCounts);

            if (logger.isTraceEnabled())
                logger.trace("Deduplicating types");

            deduplicateTypes(processorOutput, annotationsWithDuplicates);

            // And the relevant geo-locations (also including duplicates)

            if (shouldPerformGeoSpatialAnalysis()) {
                if (logger.isTraceEnabled())
                    logger.trace("Geo-filtering types");

                filterGeo(processorOutput, annotationsWithLocations);
            }

            // And extract the sentiment value, if present
            String sentimentValue = metadataFS.getStringValue(metadataType.getFeatureByBaseName("sentiment"));

            if (StringUtils.hasText(sentimentValue)) {
                if (logger.isInfoEnabled())
                    logger.info(format("Sentiment value determined to be %s", sentimentValue));

                List<String> sentimentValues = new ArrayList<String>(1);

                sentimentValues.add(sentimentValue);

                processorOutput.getMetadata().put(SENTIMENT_FIELD, sentimentValues);
            } else {
                if (logger.isTraceEnabled())
                    logger.trace("No sentiment value was extracted or could be provided");
            }

            // And extract the classifications value, if present

            String classificationsValue = metadataFS.getStringValue(metadataType.getFeatureByBaseName("classifications"));

            if (StringUtils.hasText(classificationsValue)) {
                if (logger.isInfoEnabled())
                    logger.info(format("Classifications value determined to be %s", classificationsValue));

                List<String> classificationValues = new ArrayList<String>();

                classificationValues.addAll(Arrays.asList(StringUtils.tokenizeToStringArray(classificationsValue, ",", true, false)));

                processorOutput.getMetadata().put(CLASSIFICATIONS_FIELD, classificationValues);
            } else {
                if (logger.isTraceEnabled())
                    logger.trace("No classifications value was extracted or could be provided");
            }

            if (logger.isTraceEnabled())
                logger.trace("Finished injecting UIMA-generated metadata");
        } catch (Exception e) {
            // Checked and unchecked failures are handled identically, so a single catch suffices
            logger.error("UIMA processing failed", e);
        } finally {
            cas.reset();
        }
    }

    private void filterGeo(final ProcessorContent processorOutput, final Map<String, List<String>> annotationsWithLocations) {
        for (Entry<String, List<String>> annotationWithLocations : annotationsWithLocations.entrySet()) {
            List<String> geoValues = annotationWithLocations.getValue();
            List<String> geoValuesRemove = new ArrayList<String>();

            // XXX: Add an additional field containing the geo-names

            for (int i = 0; i < geoValues.size(); i++) {
                if (!StringUtils.hasText(geoValues.get(i)) || geoValues.get(i).equals(GEO_LOCATION_UNKNOWN)) {
                    logger.info("[GEO] Dropping geo-location '" + geoValues.get(i) + "' - location could not be resolved");

                    geoValuesRemove.add(geoValues.get(i));
                }
            }

            geoValues.removeAll(geoValuesRemove);

            // Now split it up, and add it as a field containing just the coordinate as well as one containing the name + separator + geohash

            List<String> geoValuesCoordinates = new ArrayList<String>(geoValues.size());

            for (String geoValue : geoValues) {
                if (geoValue.lastIndexOf(GEO_LOCATION_SEPARATOR) == -1) {
                    logger.error("Invalid name / geohash combination '" + geoValue + "' given - should have been removed");
                } else {
                    Point geoPoint = GeohashUtils.decode(geoValue.substring(geoValue.lastIndexOf(GEO_LOCATION_SEPARATOR) + 1), SpatialContext.GEO);

                    // Add as lat,lon
                    geoValuesCoordinates.add(geoPoint.getY() + "," + geoPoint.getX());
                }
            }

            // And add it to the final result

            logger.info("[GEO] " + annotationWithLocations.getKey() + " : " + geoValuesCoordinates + " (" + annotationWithLocations.getKey() + GEO_LOCATION_HASH_SUFFIX + " : " + geoValues + ")");

            processorOutput.getMetadata().put(annotationWithLocations.getKey(), geoValuesCoordinates);
            processorOutput.getMetadata().put(annotationWithLocations.getKey() + GEO_LOCATION_HASH_SUFFIX, geoValues);
        }
    }
    private void deduplicateTypes(final ProcessorContent processorOutput, final Map<String, List<String>> annotationsWithDuplicates) {
        // Including duplicates

        for (Entry<String, List<String>> annotationWithDuplicates : annotationsWithDuplicates.entrySet()) {
            logger.info("[DUPLICATE] " + annotationWithDuplicates.getKey() + " : " + annotationWithDuplicates.getValue());

            processorOutput.getMetadata().put(annotationWithDuplicates.getKey(), annotationWithDuplicates.getValue());
        }
    }

    private void orderTypes(final String detectedLanguage, final ProcessorContent processorOutput, final Map<String, Map<String, Integer>> annotationsWithCounts) {
        // Ordered by their occurrence

        for (Entry<String, Map<String, Integer>> fieldWithEntities : annotationsWithCounts.entrySet()) {
            List<String> entities = new ArrayList<String>(fieldWithEntities.getValue().size());

            SortedSet<Entry<String, Integer>> sortedEntities = new TreeSet<Entry<String, Integer>>(new Comparator<Entry<String, Integer>>() {
                @Override
                public int compare(final Entry<String, Integer> e1, final Entry<String, Integer> e2) {
                    int result = e2.getValue().compareTo(e1.getValue());

                    // Never return 0, so that entries with equal counts are kept as distinct set members instead of being collapsed
                    return result != 0 ? result : 1;
                }
            });

            sortedEntities.addAll(fieldWithEntities.getValue().entrySet());

            for (Entry<String, Integer> sortedEntity : sortedEntities) {
                if (logger.isDebugEnabled())
                    logger.debug("Adding sorted entity '" + sortedEntity.getKey() + "' to entity list for field '" + fieldWithEntities.getKey() + "-" + detectedLanguage + "'");

                entities.add(sortedEntity.getKey());
            }

            logger.info("[ORDERED] " + fieldWithEntities.getKey() + "-" + detectedLanguage + " : " + entities);

            processorOutput.getMetadata().put(fieldWithEntities.getKey() + "-" + detectedLanguage, entities);
        }
    }
    private void filterTypes(final String documentId, final Feature alternativeFeature, final Feature locationFeature,
            final Map<String, Map<String, Integer>> annotationsWithCounts, final Map<String, List<String>> annotationsWithDuplicates,
            final Map<String, List<String>> annotationsWithLocations) {
        for (AnnotationFS annotationFeature : cas.getAnnotationIndex()) {
            String annotationTypeName = annotationFeature.getType().getName();

            if (logger.isTraceEnabled())
                logger.trace("Considering annotation type " + annotationTypeName);

            for (String scanType : entityDetectionScanTypes) {
                if (annotationTypeName.endsWith(scanType)) {
                    String fieldName = (entityDetectionFieldPrefix != null ? entityDetectionFieldPrefix : "") + scanType.toLowerCase();

                    if (logger.isDebugEnabled())
                        logger.debug("Evaluating annotation field " + fieldName);

                    String entityText = annotationFeature.getStringValue(alternativeFeature);
                    String locationFieldName = shouldPerformGeoSpatialAnalysis() ? geoSpatialFieldPrefix + scanType.toLowerCase() : null;
                    String locationValue = annotationFeature.getStringValue(locationFeature);

                    if (entityText == null) {
                        String fmt = "No alternative feature-text specified for type %s in document ID %s, retrying with [%d,%d] from: %s";

                        logger.debug(format(fmt, annotationTypeName, documentId, annotationFeature.getBegin(), annotationFeature.getEnd(), annotationFeature.getView().getDocumentText()));

                        entityText = annotationFeature.getCoveredText().replace("\n", " ").replaceAll("\\s+", " ");
                    }

                    if (annotationsWithCounts.containsKey(fieldName)) {
                        Integer count = annotationsWithCounts.get(fieldName).get(entityText);

                        annotationsWithCounts.get(fieldName).put(entityText, count != null ? count + 1 : 1);
                        annotationsWithDuplicates.get(fieldName).add(TextNormalizer.normalizeDiacritics(entityText));

                        if (locationFieldName != null && StringUtils.hasText(locationValue)) {
                            List<String> locationList = annotationsWithLocations.get(locationFieldName);

                            // Guard against earlier annotations for this field having carried no location value
                            if (locationList == null) {
                                locationList = new ArrayList<String>();

                                annotationsWithLocations.put(locationFieldName, locationList);
                            }

                            locationList.add(locationValue);
                        }
                    } else {
                        Map<String, Integer> fieldCountMap = new TreeMap<String, Integer>();
                        List<String> duplicateInclusiveFieldList = new ArrayList<String>();

                        fieldCountMap.put(entityText, 1);
                        duplicateInclusiveFieldList.add(TextNormalizer.normalizeDiacritics(entityText));

                        annotationsWithCounts.put(fieldName, fieldCountMap);
                        annotationsWithDuplicates.put(fieldName, duplicateInclusiveFieldList);

                        // Only when enabled

                        if (locationFieldName != null && StringUtils.hasText(locationValue)) {
                            List<String> locationInclusiveFieldList = new ArrayList<String>();

                            locationInclusiveFieldList.add(locationValue);

                            annotationsWithLocations.put(locationFieldName, locationInclusiveFieldList);
                        }
                    }
                }
            }
        }
    }

    /**
     * Clean up the given output. Subtitle files often contain superfluous character sequences such as "...", and their explicit newlines can be ignored, as they don't typically mark sentence boundaries.
     *
     * @param documentOutput
     * @param mediaType
     * @return String
     */
    private String cleanOutput(final String documentOutput, final MediaType mediaType) {
        String result = documentOutput;

        if (mediaType.getType().startsWith("text") && mediaType.getSubtype().startsWith("srt")) {
            logger.info("Prettying up the given UIMA input content - subtitles may contain content spaced out over multiple lines");

            result = result.replace("...", "").replaceAll("\\s+", " ");
        }

        return result;
    }

    /**
     * Retrieve a type-system feature by name.
     *
     * @param name
     * @return Feature
     */
    private Feature getFeature(final String name) {
        return cas.getTypeSystem().getType("enricher.uima.Annotation").getFeatureByBaseName(name);
    }

    /**
     * Adjust the content for eligibility.
     *
     * @param output
     * @return String
     */
    public static String adjustForEligibility(final String output) {
        if (output.length() < UIMA_MINIMUM_LENGTH) {
            logger.warn("Not enough input for UIMA processing, requires at least " + UIMA_MINIMUM_LENGTH + " characters");

            return null;
        } else if (output.length() > UIMA_MAXIMUM_LENGTH) {
            logger.warn("Too much input for UIMA processing, can efficiently process at most " + UIMA_MAXIMUM_LENGTH + " characters - truncating");

            return output.substring(0, UIMA_MAXIMUM_LENGTH);
        }

        return output;
    }

    /**
     * Attempt to secure a CAS from the UIMA unit, and then tie it to this context.
     *
     * @param context
     * @param uimaUnit
     * @return CAS
     */
    @SuppressWarnings("unchecked")
    public static CAS getCasFromContext(final ProcessorContext context, final UIMAUnit uimaUnit) {
        Map<String, CASUnit> casUnits = (Map<String, CASUnit>) context.getParameter(CONTEXT_PARAMETER_CAS);

        if (casUnits.containsKey(uimaUnit.getLanguage())) {
            return casUnits.get(uimaUnit.getLanguage()).getCas();
        } else {
            CAS cas = uimaUnit.getCas();

            casUnits.put(uimaUnit.getLanguage(), new CASUnit(uimaUnit, cas));

            return cas;
        }
    }

    /**
     * Determine whether geo-spatial analysis should be performed.
     *
     * @return boolean
     */
    private boolean shouldPerformGeoSpatialAnalysis() {
        return geoSpatialFieldPrefix != null;
    }

    /**
     * Initialize the given processor context.
     *
     * @param context
     */
    public static void initialize(final ProcessorContext context) {
        context.setParameter(CONTEXT_PARAMETER_CAS, Collections.synchronizedMap(new HashMap<String, CASUnit>()));
    }
    /**
     * Release all context resources back to their respective CasPools.
     *
     * @param context
     */
    @SuppressWarnings("unchecked")
    public static void release(final ProcessorContext context) {
        Map<String, CASUnit> casses = (Map<String, CASUnit>) context.getParameter(CONTEXT_PARAMETER_CAS);

        if (logger.isDebugEnabled())
            logger.debug(format("Found %d CAS structures that can be released back to their pool", casses.size()));

        Iterator<Entry<String, CASUnit>> iterator = casses.entrySet().iterator();

        while (iterator.hasNext()) {
            Entry<String, CASUnit> entry = iterator.next();

            if (logger.isDebugEnabled())
                logger.debug(format("About to release CAS %s", entry.getValue().getCas().toString()));

            entry.getValue().getUimaUnit().releaseCas(entry.getValue().getCas());

            iterator.remove();
        }

        if (logger.isDebugEnabled())
            logger.debug("Finished releasing all CAS structures back to their pool");
    }

    /**
     * A simple CAS unit within the processor context.
     *
     * @author Jasper van Veghel <jasper@seajas.com>
     */
    protected static class CASUnit {
        /**
         * The UIMA unit.
         */
        private final UIMAUnit uimaUnit;

        /**
         * The CAS.
         */
        private final CAS cas;

        /**
         * Default constructor.
         *
         * @param uimaUnit
         * @param cas
         */
        public CASUnit(final UIMAUnit uimaUnit, final CAS cas) {
            this.uimaUnit = uimaUnit;
            this.cas = cas;
        }

        /**
         * Retrieve the uimaUnit.
         *
         * @return UIMAUnit
         */
        public UIMAUnit getUimaUnit() {
            return uimaUnit;
        }

        /**
         * Retrieve the cas.
         *
         * @return CAS
         */
        public CAS getCas() {
            return cas;
        }
    }
}
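To close, here is a minimal usage sketch of the lifecycle the class expects: initialize the context once, trim the input with adjustForEligibility, run inject, and hand every borrowed CAS back with release. The wiring around it is assumed rather than taken from the Search Enricher itself: the UIMAInjectorExample class, the enrichDocument method, the scan types "Person" and "Organization", the field prefixes, the document ID, and the language code are all hypothetical example values, and ProcessorContext is assumed to live in the same package as UIMAInjector (it is referenced there without an import).

import java.util.Arrays;

import nl.minbzk.dwr.zoeken.enricher.ProcessorResult.ProcessorContent;
// Assumed location - UIMAInjector uses ProcessorContext without an import,
// so it presumably resides in the same package:
import nl.minbzk.dwr.zoeken.enricher.processor.ProcessorContext;
import nl.minbzk.dwr.zoeken.enricher.processor.UIMAInjector;
import nl.minbzk.dwr.zoeken.enricher.processor.uima.UIMAUnit;

import org.apache.tika.mime.MediaType;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;

/**
 * A hypothetical caller, sketching the UIMAInjector lifecycle.
 */
public class UIMAInjectorExample {
    public static void enrichDocument(final ProcessorContext context, final UIMAUnit uimaUnit, final ProcessorContent processorOutput,
            final String rawText) throws AnalysisEngineProcessException {
        // Give the context its synchronized language-to-CAS map before any injector is constructed
        UIMAInjector.initialize(context);

        try {
            // Reject input under the minimum length and truncate input over the maximum
            String eligibleText = UIMAInjector.adjustForEligibility(rawText);

            if (eligibleText == null)
                return; // Too short for UIMA processing

            // Example scan types and field prefixes - the real values come from the enricher configuration
            UIMAInjector injector = new UIMAInjector(context, uimaUnit, Arrays.asList("Person", "Organization"), "entity_", "geo_");

            // Example document attributes; inject() fills processorOutput's metadata with
            // ordered entities, duplicate-inclusive lists, geo-coordinates, sentiment, and classifications
            injector.inject(eligibleText, "document-1", "nl", MediaType.parse("text/plain"), processorOutput);
        } finally {
            // Return every borrowed CAS to its pool
            UIMAInjector.release(context);
        }
    }
}

Note that inject catches and logs its own processing failures and always resets the CAS in its finally block, so a failed document does not poison the pooled CAS for the next one.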