Java tutorial: UIMAInjector.java
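This tutorial walks through UIMAInjector, the UIMA-based metadata injector from the MinBZK Search Enricher indexing generator. The class feeds extracted document text through a UIMA analysis engine and copies the resulting entity, geo-location, sentiment, and classification annotations into the processor output's metadata. The full source follows, and a short usage sketch closes the tutorial.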
/*
 * Copyright (c) 2010 Ministry of the Interior and Kingdom Relations,
 * the Netherlands. All rights reserved.
 *
 * This file is part of the MinBZK Search Enricher indexing generator.
 *
 * Search Enricher is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Search Enricher is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Search Enricher. If not, see <http://www.gnu.org/licenses/>.
 */
package nl.minbzk.dwr.zoeken.enricher.processor;

import com.spatial4j.core.context.SpatialContext;
import com.spatial4j.core.io.GeohashUtils;
import com.spatial4j.core.shape.Point;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import nl.minbzk.dwr.zoeken.enricher.ProcessorResult.ProcessorContent;
import nl.minbzk.dwr.zoeken.enricher.processor.uima.UIMAUnit;
import nl.minbzk.dwr.zoeken.enricher.util.TextNormalizer;

import org.apache.tika.mime.MediaType;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.impl.AnalysisEngineManagementImpl;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.StringUtils;

import static java.lang.String.format;

/**
 * UIMA-based injector.
 *
 * @author Jasper van Veghel <jasper@seajas.com>
 */
public class UIMAInjector {
    /**
     * The logger.
     */
    private static final Logger logger = LoggerFactory.getLogger(UIMAInjector.class);

    /**
     * Constants.
     */
    private static final String GEO_LOCATION_UNKNOWN = "0";
    private static final String GEO_LOCATION_SEPARATOR = "#";
    private static final String GEO_LOCATION_HASH_SUFFIX = "_hash";

    private static final String CONTEXT_PARAMETER_CAS = "tika.processor.uima.cas";

    private static final Integer UIMA_MINIMUM_LENGTH = 6;
    private static final Integer UIMA_MAXIMUM_LENGTH = 12000;

    private static final String SENTIMENT_FIELD = "sentiment";
    private static final String CLASSIFICATIONS_FIELD = "classifications";

    /**
     * The entity-detection scan types.
     */
    private final List<String> entityDetectionScanTypes;

    /**
     * The entity-detection field prefix.
     */
    private final String entityDetectionFieldPrefix;

    /**
     * The geo-spatial field prefix.
     */
    private final String geoSpatialFieldPrefix;

    /**
     * The UIMA unit.
     */
    private final UIMAUnit uimaUnit;

    /**
     * The CAS.
     */
    private final CAS cas;
    /**
     * Default constructor.
     *
     * @param context
     * @param uimaUnit
     * @param entityDetectionScanTypes
     * @param entityDetectionFieldPrefix
     * @param geoSpatialFieldPrefix
     */
    public UIMAInjector(final ProcessorContext context, final UIMAUnit uimaUnit, final List<String> entityDetectionScanTypes,
            final String entityDetectionFieldPrefix, final String geoSpatialFieldPrefix) {
        this.uimaUnit = uimaUnit;
        this.cas = getCasFromContext(context, uimaUnit);

        if (cas == null)
            throw new IllegalArgumentException("Unable to obtain a CAS to process the given document content through");

        this.entityDetectionScanTypes = entityDetectionScanTypes;
        this.entityDetectionFieldPrefix = entityDetectionFieldPrefix;
        this.geoSpatialFieldPrefix = geoSpatialFieldPrefix;
    }

    /**
     * Process the given content through the UIMA pipeline.
     *
     * @param documentOutput
     * @param documentId
     * @param detectedLanguage
     * @param mediaType
     * @param processorOutput
     * @throws AnalysisEngineProcessException
     */
    public void inject(final String documentOutput, final String documentId, final String detectedLanguage, final MediaType mediaType,
            final ProcessorContent processorOutput) throws AnalysisEngineProcessException {
        if (logger.isTraceEnabled())
            logger.trace(format("Feeding document %s (%s) as text: %s", documentId, mediaType, documentOutput));

        try {
            Feature alternativeFeature = getFeature("alternative");
            Feature locationFeature = getFeature("location");

            cas.setDocumentLanguage(uimaUnit.getLanguage());
            cas.setDocumentText(cleanOutput(documentOutput, mediaType));

            // Now add in document metadata and set the document ID

            Type metadataType = cas.getTypeSystem().getType("enricher.uima.DocumentMetadata");

            FeatureStructure metadataFS = cas.createFS(metadataType);

            metadataFS.setStringValue(metadataType.getFeatureByBaseName("id"), documentId);

            cas.addFsToIndexes(metadataFS);

            // And continue on with processing

            if (logger.isTraceEnabled())
                logger.trace("Start processing the relevant CAS");

            uimaUnit.getAnalysisEngine().process(cas);

            if (logger.isTraceEnabled())
                logger.trace("Finished processing the relevant CAS - now cycling through results");

            long sessionProcessingTime = uimaUnit.getSessionProcessingTime();

            if (sessionProcessingTime > 0) {
                if (logger.isInfoEnabled())
                    logger.info(format("Reporting the session processing time as %d", sessionProcessingTime));

                ((AnalysisEngineManagementImpl) uimaUnit.getInternalManagementInterface()).reportServiceCallTime(sessionProcessingTime);
                ((AnalysisEngineManagementImpl) uimaUnit.getInternalManagementInterface()).incrementCASesProcessed();
            }

            // Filter down to the relevant data sets

            Map<String, Map<String, Integer>> annotationsWithCounts = new HashMap<String, Map<String, Integer>>();
            Map<String, List<String>> annotationsWithDuplicates = new HashMap<String, List<String>>();
            Map<String, List<String>> annotationsWithLocations = new HashMap<String, List<String>>();

            if (logger.isTraceEnabled())
                logger.trace("Filtering types");

            filterTypes(documentId, alternativeFeature, locationFeature, annotationsWithCounts, annotationsWithDuplicates, annotationsWithLocations);

            if (logger.isTraceEnabled())
                logger.trace("Ordering types");

            orderTypes(detectedLanguage, processorOutput, annotationsWithCounts);

            if (logger.isTraceEnabled())
                logger.trace("Deduplicating types");

            deduplicateTypes(processorOutput, annotationsWithDuplicates);

            // And the relevant geo-locations (also including duplicates)

            if (shouldPerformGeoSpatialAnalysis()) {
                if (logger.isTraceEnabled())
                    logger.trace("Geo-filtering types");

                filterGeo(processorOutput, annotationsWithLocations);
            }

            // And extract the sentiment value, if present
            String sentimentValue = metadataFS.getStringValue(metadataType.getFeatureByBaseName("sentiment"));

            if (StringUtils.hasText(sentimentValue)) {
                if (logger.isInfoEnabled())
                    logger.info(format("Sentiment value determined to be %s", sentimentValue));

                List<String> sentimentValues = new ArrayList<String>(1);

                sentimentValues.add(sentimentValue);

                processorOutput.getMetadata().put(SENTIMENT_FIELD, sentimentValues);
            } else {
                if (logger.isTraceEnabled())
                    logger.trace("No sentiment value was extracted or could be provided");
            }

            // And extract the classifications value, if present

            String classificationsValue = metadataFS.getStringValue(metadataType.getFeatureByBaseName("classifications"));

            if (StringUtils.hasText(classificationsValue)) {
                if (logger.isInfoEnabled())
                    logger.info(format("Classifications value determined to be %s", classificationsValue));

                List<String> classificationValues = new ArrayList<String>();

                classificationValues.addAll(Arrays.asList(StringUtils.tokenizeToStringArray(classificationsValue, ",", true, false)));

                processorOutput.getMetadata().put(CLASSIFICATIONS_FIELD, classificationValues);
            } else {
                if (logger.isTraceEnabled())
                    logger.trace("No classifications value was extracted or could be provided");
            }

            if (logger.isTraceEnabled())
                logger.trace("Finished injecting UIMA-generated metadata");
        } catch (Exception e) {
            // Checked and unchecked failures are handled identically, so a single catch suffices
            logger.error("UIMA processing failed", e);
        } finally {
            cas.reset();
        }
    }

    private void filterGeo(final ProcessorContent processorOutput, final Map<String, List<String>> annotationsWithLocations) {
        for (Entry<String, List<String>> annotationWithLocations : annotationsWithLocations.entrySet()) {
            List<String> geoValues = annotationWithLocations.getValue();
            List<String> geoValuesRemove = new ArrayList<String>();

            // XXX: Add an additional field containing the geo-names

            for (int i = 0; i < geoValues.size(); i++) {
                if (!StringUtils.hasText(geoValues.get(i)) || geoValues.get(i).equals(GEO_LOCATION_UNKNOWN)) {
                    logger.info("[GEO] Dropping geo-location '" + geoValues.get(i) + "' - location could not be resolved");

                    geoValuesRemove.add(geoValues.get(i));
                }
            }

            geoValues.removeAll(geoValuesRemove);

            // Now split it up, and add it as a field containing just the coordinate as well as one containing the name + separator + geohash

            List<String> geoValuesCoordinates = new ArrayList<String>(geoValues.size());

            for (String geoValue : geoValues) {
                if (geoValue.lastIndexOf(GEO_LOCATION_SEPARATOR) == -1) {
                    logger.error("Invalid name / geohash combination '" + geoValue + "' given - should have been removed");
                } else {
                    Point geoPoint = GeohashUtils.decode(geoValue.substring(geoValue.lastIndexOf(GEO_LOCATION_SEPARATOR) + 1), SpatialContext.GEO);

                    // Add as lat,lon
                    geoValuesCoordinates.add(geoPoint.getY() + "," + geoPoint.getX());
                }
            }

            // And add it to the final result

            logger.info("[GEO] " + annotationWithLocations.getKey() + " : " + geoValuesCoordinates + " (" + annotationWithLocations.getKey() + GEO_LOCATION_HASH_SUFFIX + " : " + geoValues + ")");

            processorOutput.getMetadata().put(annotationWithLocations.getKey(), geoValuesCoordinates);
            processorOutput.getMetadata().put(annotationWithLocations.getKey() + GEO_LOCATION_HASH_SUFFIX, geoValues);
        }
    }
    private void deduplicateTypes(final ProcessorContent processorOutput, final Map<String, List<String>> annotationsWithDuplicates) {
        // Including duplicates

        for (Entry<String, List<String>> annotationWithDuplicates : annotationsWithDuplicates.entrySet()) {
            logger.info("[DUPLICATE] " + annotationWithDuplicates.getKey() + " : " + annotationWithDuplicates.getValue());

            processorOutput.getMetadata().put(annotationWithDuplicates.getKey(), annotationWithDuplicates.getValue());
        }
    }

    private void orderTypes(final String detectedLanguage, final ProcessorContent processorOutput, final Map<String, Map<String, Integer>> annotationsWithCounts) {
        // Ordered by their occurrence

        for (Entry<String, Map<String, Integer>> fieldWithEntities : annotationsWithCounts.entrySet()) {
            List<String> entities = new ArrayList<String>(fieldWithEntities.getValue().size());

            SortedSet<Entry<String, Integer>> sortedEntities = new TreeSet<Entry<String, Integer>>(new Comparator<Entry<String, Integer>>() {
                @Override
                public int compare(final Entry<String, Integer> e1, final Entry<String, Integer> e2) {
                    int result = e2.getValue().compareTo(e1.getValue());

                    // Never return 0, so that entries with equal counts are kept as distinct set members instead of being collapsed
                    return result != 0 ? result : 1;
                }
            });

            sortedEntities.addAll(fieldWithEntities.getValue().entrySet());

            for (Entry<String, Integer> sortedEntity : sortedEntities) {
                if (logger.isDebugEnabled())
                    logger.debug("Adding sorted entity '" + sortedEntity.getKey() + "' to entity list for field '" + fieldWithEntities.getKey() + "-" + detectedLanguage + "'");

                entities.add(sortedEntity.getKey());
            }

            logger.info("[ORDERED] " + fieldWithEntities.getKey() + "-" + detectedLanguage + " : " + entities);

            processorOutput.getMetadata().put(fieldWithEntities.getKey() + "-" + detectedLanguage, entities);
        }
    }
    private void filterTypes(final String documentId, final Feature alternativeFeature, final Feature locationFeature,
            final Map<String, Map<String, Integer>> annotationsWithCounts, final Map<String, List<String>> annotationsWithDuplicates,
            final Map<String, List<String>> annotationsWithLocations) {
        for (AnnotationFS annotationFeature : cas.getAnnotationIndex()) {
            String annotationTypeName = annotationFeature.getType().getName();

            if (logger.isTraceEnabled())
                logger.trace("Considering annotation type " + annotationTypeName);

            for (String scanType : entityDetectionScanTypes) {
                if (annotationTypeName.endsWith(scanType)) {
                    String fieldName = (entityDetectionFieldPrefix != null ? entityDetectionFieldPrefix : "") + scanType.toLowerCase();

                    if (logger.isDebugEnabled())
                        logger.debug("Evaluating annotation field " + fieldName);

                    String entityText = annotationFeature.getStringValue(alternativeFeature);
                    String locationFieldName = shouldPerformGeoSpatialAnalysis() ? geoSpatialFieldPrefix + scanType.toLowerCase() : null;
                    String locationValue = annotationFeature.getStringValue(locationFeature);

                    if (entityText == null) {
                        String fmt = "No alternative feature-text specified for type %s in document ID %s, retrying with [%d,%d] from: %s";

                        logger.debug(format(fmt, annotationTypeName, documentId, annotationFeature.getBegin(), annotationFeature.getEnd(), annotationFeature.getView().getDocumentText()));

                        entityText = annotationFeature.getCoveredText().replace("\n", " ").replaceAll("\\s+", " ");
                    }

                    if (annotationsWithCounts.containsKey(fieldName)) {
                        Integer count = annotationsWithCounts.get(fieldName).get(entityText);

                        annotationsWithCounts.get(fieldName).put(entityText, count != null ? count + 1 : 1);
                        annotationsWithDuplicates.get(fieldName).add(TextNormalizer.normalizeDiacritics(entityText));

                        if (locationFieldName != null && StringUtils.hasText(locationValue)) {
                            List<String> locationList = annotationsWithLocations.get(locationFieldName);

                            // Guard against earlier annotations for this field having carried no location value
                            if (locationList == null) {
                                locationList = new ArrayList<String>();

                                annotationsWithLocations.put(locationFieldName, locationList);
                            }

                            locationList.add(locationValue);
                        }
                    } else {
                        Map<String, Integer> fieldCountMap = new TreeMap<String, Integer>();
                        List<String> duplicateInclusiveFieldList = new ArrayList<String>();

                        fieldCountMap.put(entityText, 1);
                        duplicateInclusiveFieldList.add(TextNormalizer.normalizeDiacritics(entityText));

                        annotationsWithCounts.put(fieldName, fieldCountMap);
                        annotationsWithDuplicates.put(fieldName, duplicateInclusiveFieldList);

                        // Only when enabled

                        if (locationFieldName != null && StringUtils.hasText(locationValue)) {
                            List<String> locationInclusiveFieldList = new ArrayList<String>();

                            locationInclusiveFieldList.add(locationValue);

                            annotationsWithLocations.put(locationFieldName, locationInclusiveFieldList);
                        }
                    }
                }
            }
        }
    }

    /**
     * Clean up the given output. Subtitle files often contain superfluous character sequences such as "...", and their explicit newlines can be ignored, as they don't typically mark sentence boundaries.
     *
     * @param documentOutput
     * @param mediaType
     * @return String
     */
    private String cleanOutput(final String documentOutput, final MediaType mediaType) {
        String result = documentOutput;

        if (mediaType.getType().startsWith("text") && mediaType.getSubtype().startsWith("srt")) {
            logger.info("Prettying up the given UIMA input content - subtitles may contain content spaced out over multiple lines");

            result = result.replace("...", "").replaceAll("\\s+", " ");
        }

        return result;
    }

    /**
     * Retrieve a type-system feature by name.
     *
     * @param name
     * @return Feature
     */
    private Feature getFeature(final String name) {
        return cas.getTypeSystem().getType("enricher.uima.Annotation").getFeatureByBaseName(name);
    }

    /**
     * Adjust the content for eligibility.
     *
     * @param output
     * @return String
     */
    public static String adjustForEligibility(final String output) {
        if (output.length() < UIMA_MINIMUM_LENGTH) {
            logger.warn("Not enough input for UIMA processing, requires at least " + UIMA_MINIMUM_LENGTH + " characters");

            return null;
        } else if (output.length() > UIMA_MAXIMUM_LENGTH) {
            logger.warn("Too much input for UIMA processing, can efficiently process at most " + UIMA_MAXIMUM_LENGTH + " characters - truncating");

            return output.substring(0, UIMA_MAXIMUM_LENGTH);
        }

        return output;
    }

    /**
     * Attempt to secure a CAS from the UIMA unit, and then tie it to this context.
     *
     * @param context
     * @param uimaUnit
     * @return CAS
     */
    @SuppressWarnings("unchecked")
    public static CAS getCasFromContext(final ProcessorContext context, final UIMAUnit uimaUnit) {
        Map<String, CASUnit> casUnits = (Map<String, CASUnit>) context.getParameter(CONTEXT_PARAMETER_CAS);

        if (casUnits.containsKey(uimaUnit.getLanguage())) {
            return casUnits.get(uimaUnit.getLanguage()).getCas();
        } else {
            CAS cas = uimaUnit.getCas();

            casUnits.put(uimaUnit.getLanguage(), new CASUnit(uimaUnit, cas));

            return cas;
        }
    }

    /**
     * Determine whether geo-spatial analysis should be performed.
     *
     * @return boolean
     */
    private boolean shouldPerformGeoSpatialAnalysis() {
        return geoSpatialFieldPrefix != null;
    }

    /**
     * Initialize the given processor context.
     *
     * @param context
     */
    public static void initialize(final ProcessorContext context) {
        context.setParameter(CONTEXT_PARAMETER_CAS, Collections.synchronizedMap(new HashMap<String, CASUnit>()));
    }
    /**
     * Release all context resources back to their respective CasPools.
     *
     * @param context
     */
    @SuppressWarnings("unchecked")
    public static void release(final ProcessorContext context) {
        Map<String, CASUnit> casses = (Map<String, CASUnit>) context.getParameter(CONTEXT_PARAMETER_CAS);

        if (logger.isDebugEnabled())
            logger.debug(format("Found %d CAS structures that can be released back to their pool", casses.size()));

        Iterator<Entry<String, CASUnit>> iterator = casses.entrySet().iterator();

        while (iterator.hasNext()) {
            Entry<String, CASUnit> entry = iterator.next();

            if (logger.isDebugEnabled())
                logger.debug(format("About to release CAS %s", entry.getValue().getCas().toString()));

            entry.getValue().getUimaUnit().releaseCas(entry.getValue().getCas());

            iterator.remove();
        }

        if (logger.isDebugEnabled())
            logger.debug("Finished releasing all CAS structures back to their pool");
    }

    /**
     * A simple CAS unit within the processor context.
     *
     * @author Jasper van Veghel <jasper@seajas.com>
     */
    protected static class CASUnit {
        /**
         * The UIMA unit.
         */
        private final UIMAUnit uimaUnit;

        /**
         * The CAS.
         */
        private final CAS cas;

        /**
         * Default constructor.
         *
         * @param uimaUnit
         * @param cas
         */
        public CASUnit(final UIMAUnit uimaUnit, final CAS cas) {
            this.uimaUnit = uimaUnit;
            this.cas = cas;
        }

        /**
         * Retrieve the uimaUnit.
         *
         * @return UIMAUnit
         */
        public UIMAUnit getUimaUnit() {
            return uimaUnit;
        }

        /**
         * Retrieve the cas.
         *
         * @return CAS
         */
        public CAS getCas() {
            return cas;
        }
    }
}
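To close, here is a minimal usage sketch of the lifecycle the class expects: initialize the context once, trim the input with adjustForEligibility, run inject, and hand every borrowed CAS back with release. The wiring around it is assumed rather than taken from the Search Enricher itself: the UIMAInjectorExample class, the enrichDocument method, the scan types "Person" and "Organization", the field prefixes, the document ID, and the language code are all hypothetical example values, and ProcessorContext is assumed to live in the same package as UIMAInjector (it is referenced there without an import).

import java.util.Arrays;

import nl.minbzk.dwr.zoeken.enricher.ProcessorResult.ProcessorContent;
// Assumed location - UIMAInjector uses ProcessorContext without an import,
// so it presumably resides in the same package:
import nl.minbzk.dwr.zoeken.enricher.processor.ProcessorContext;
import nl.minbzk.dwr.zoeken.enricher.processor.UIMAInjector;
import nl.minbzk.dwr.zoeken.enricher.processor.uima.UIMAUnit;

import org.apache.tika.mime.MediaType;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;

/**
 * A hypothetical caller, sketching the UIMAInjector lifecycle.
 */
public class UIMAInjectorExample {
    public static void enrichDocument(final ProcessorContext context, final UIMAUnit uimaUnit, final ProcessorContent processorOutput,
            final String rawText) throws AnalysisEngineProcessException {
        // Give the context its synchronized language-to-CAS map before any injector is constructed
        UIMAInjector.initialize(context);

        try {
            // Reject input under the minimum length and truncate input over the maximum
            String eligibleText = UIMAInjector.adjustForEligibility(rawText);

            if (eligibleText == null)
                return; // Too short for UIMA processing

            // Example scan types and field prefixes - the real values come from the enricher configuration
            UIMAInjector injector = new UIMAInjector(context, uimaUnit, Arrays.asList("Person", "Organization"), "entity_", "geo_");

            // Example document attributes; inject() fills processorOutput's metadata with
            // ordered entities, duplicate-inclusive lists, geo-coordinates, sentiment, and classifications
            injector.inject(eligibleText, "document-1", "nl", MediaType.parse("text/plain"), processorOutput);
        } finally {
            // Return every borrowed CAS to its pool
            UIMAInjector.release(context);
        }
    }
}

Note that inject catches and logs its own processing failures and always resets the CAS in its finally block, so a failed document does not poison the pooled CAS for the next one.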