de.julielab.jcore.ae.jnet.uima.ConsistencyPreservation.java Source code

Introduction

Here is the source code for de.julielab.jcore.ae.jnet.uima.ConsistencyPreservation.java
Source

/** 
 * ConsistencyPreservation.java
 * 
 * Copyright (c) 2008, JULIE Lab. 
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the Common Public License v1.0 
 *
 * Author: tomanek
 * 
 * Current version: 2.3.5
 * Since version:   2.2
 *
 * Helper functions for consistency preservation 
 **/

package de.julielab.jcore.ae.jnet.uima;

import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.lang3.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

import de.julielab.jcore.types.Abbreviation;
import de.julielab.jcore.types.EntityMention;
import de.julielab.jcore.types.Token;
import de.julielab.jcore.utility.JCoReAnnotationTools;

/**
 * Helper that propagates entity mention annotations within a document so that
 * equal strings, and abbreviations and their full forms, end up labeled
 * consistently.
 * <p>
 * Which propagation strategies are active is fixed at construction time via a
 * comma-separated list of the {@code MODE_*} constants. {@link #acroMatch} acts
 * on {@link #MODE_FULL2ACRO} and {@link #MODE_ACRO2FULL}, {@link #stringMatch}
 * acts on {@link #MODE_STRING}; each method is a no-op when none of its modes
 * is active.
 */
public class ConsistencyPreservation {

    /** Prefix of the component id written to every annotation created here. */
    private static final String COMPONENT_ID = "JNET ConsistencyPreservation";

    private static final Logger LOGGER = LoggerFactory.getLogger(ConsistencyPreservation.class);

    /** Mode: an abbreviation's entity label is copied to its unlabeled full form. */
    public static final String MODE_ACRO2FULL = "acro2full";
    /** Mode: a full form's entity label is copied to its unlabeled abbreviation. */
    public static final String MODE_FULL2ACRO = "full2acro";
    /** Mode: an entity label is copied to all other occurrences of the same string. */
    public static final String MODE_STRING = "string";

    /** Overlap-index key used for entity mentions whose specific type is blank. */
    private static final String NO_SPECIFIC_TYPE = "<null>";

    /**
     * Treats two entity mentions as "equal" if one covers the other or if they
     * overlap in any way; otherwise orders them by begin offset. Used with a
     * {@link TreeSet} of existing mentions to look up quickly whether a given
     * span is already (partially) covered.
     * <p>
     * Note: "overlaps" is not a transitive relation, so this is not a proper
     * total order. It serves the lookup purpose as long as the stored mentions
     * do not overlap each other.
     */
    private static final Comparator<EntityMention> OVERLAP_COMPARATOR = new Comparator<EntityMention>() {

        @Override
        public int compare(final EntityMention o1, final EntityMention o2) {
            final int b1 = o1.getBegin();
            final int e1 = o1.getEnd();
            final int b2 = o2.getBegin();
            final int e2 = o2.getEnd();

            final boolean firstCoversSecond = (b1 <= b2) && (e1 >= e2);
            final boolean secondCoversFirst = (b1 >= b2) && (e1 <= e2);
            final boolean firstRunsOverEndOfSecond = (b1 < e2) && (e1 > e2);
            final boolean firstRunsIntoBeginOfSecond = (b1 < b2) && (e1 > b2);

            if (firstCoversSecond || secondCoversFirst || firstRunsOverEndOfSecond
                    || firstRunsIntoBeginOfSecond) {
                return 0;
            }
            return Integer.compare(b1, b2);
        }
    };

    /** The validated modes this instance applies; assigned once in the constructor. */
    private final TreeSet<String> activeModes;

    /**
     * Builds the set of modes used during consistency preservation from a
     * comma-separated list of mode names. Each name is trimmed and must be one
     * of {@link #MODE_ACRO2FULL}, {@link #MODE_FULL2ACRO} or
     * {@link #MODE_STRING}.
     * 
     * @param modesString
     *            comma-separated list of modes to be used
     * @throws ResourceInitializationException
     *             if {@code modesString} is {@code null} or contains a name
     *             that is not a known mode; the cause carries the details
     */
    public ConsistencyPreservation(final String modesString) throws ResourceInitializationException {
        if (modesString == null) {
            LOGGER.error("ConsistencyPreservation() - no modes given!");
            throw new ResourceInitializationException(
                    new IllegalArgumentException("The consistency preservation mode string must not be null."));
        }

        // Collect into a local set first so the final field is assigned exactly
        // once, and only after every mode has been validated.
        final TreeSet<String> modes = new TreeSet<String>();
        for (final String rawMode : modesString.split(",")) {
            final String mode = rawMode.trim();
            final boolean known = mode.equals(MODE_ACRO2FULL) || mode.equals(MODE_FULL2ACRO)
                    || mode.equals(MODE_STRING);
            if (!known) {
                LOGGER.error("ConsistencyPreservation() - unknown mode found: \"{}\"", mode);
                throw new ResourceInitializationException(new IllegalArgumentException(
                        "Unknown consistency preservation mode \"" + mode + "\" in \"" + modesString + "\"."));
            }
            modes.add(mode);
        }
        activeModes = modes;

        LOGGER.info("ConsistencyPreservation() - modes used in consistency engine: {}", activeModes);
    }

    /**
     * Makes abbreviations and the full forms they were introduced with agree on
     * their entity labels. For each {@link Abbreviation} in the CAS:
     * <ul>
     * <li>{@link #MODE_FULL2ACRO}: if the abbreviation carries no entity
     * mention of the given classes but its full form (exact span match) does,
     * the first such mention is copied onto the abbreviation;</li>
     * <li>{@link #MODE_ACRO2FULL}: if the abbreviation carries an entity
     * mention but its full form has none, the abbreviation's first mention is
     * copied onto the full form.</li>
     * </ul>
     * As only the full form where the abbreviation was introduced is
     * considered, this method should be run AFTER e.g.
     * {@link #stringMatch(JCas, TreeSet, double)}, which makes sure that all
     * equal strings get the same entity annotation. Abbreviations without a
     * text reference are skipped.
     * 
     * @param aJCas
     *            the CAS to read abbreviations from and add annotations to
     * @param entityMentionClassnames
     *            the entity mention class names to be considered
     * @throws AnalysisEngineProcessException
     *             if one of the entity mention classes cannot be instantiated
     *             or a new annotation cannot be created; the cause is attached
     */
    public void acroMatch(final JCas aJCas, final TreeSet<String> entityMentionClassnames)
            throws AnalysisEngineProcessException {

        final boolean full2acro = activeModes.contains(MODE_FULL2ACRO);
        final boolean acro2full = activeModes.contains(MODE_ACRO2FULL);
        if (!full2acro && !acro2full)
            return;

        // TODO needs to be checked for performance

        // One prototype instance per entity class; only its runtime class is
        // used below. A HashSet is required here because the UIMA annotation
        // types do not implement Comparable (a TreeSet would throw a
        // ClassCastException).
        final Set<EntityMention> entityMentionTypes = new HashSet<EntityMention>();
        for (final String className : entityMentionClassnames) {
            try {
                entityMentionTypes
                        .add((EntityMention) JCoReAnnotationTools.getAnnotationByClassName(aJCas, className));
            } catch (final ReflectiveOperationException | SecurityException | IllegalArgumentException
                    | ClassCastException e) {
                // Fail loudly instead of silently continuing with an
                // incomplete set of entity types.
                LOGGER.error("doAbbreviationBased() - could not instantiate entity mention class: " + className,
                        e);
                throw new AnalysisEngineProcessException(e);
            }
        }

        final JFSIndexRepository indexes = aJCas.getJFSIndexRepository();
        final Iterator<Annotation> abbrevIter = indexes.getAnnotationIndex(Abbreviation.type).iterator();

        while (abbrevIter.hasNext()) {
            final Abbreviation abbrev = (Abbreviation) abbrevIter.next();
            final Annotation fullFormAnnotation = abbrev.getTextReference();
            LOGGER.debug("doAbbreviationBased() - checking abbreviation: {}", abbrev.getCoveredText());

            if (fullFormAnnotation == null) {
                // Without a full form there is nothing to compare with or copy to.
                LOGGER.debug("doAbbreviationBased() - abbreviation has no full form reference, skipping");
                continue;
            }

            // entity mentions of interest on the abbreviation itself
            final List<EntityMention> mentionList = new ArrayList<>();
            for (final EntityMention mention : entityMentionTypes)
                mentionList.addAll(UIMAUtils.getAnnotations(aJCas, abbrev, mention.getClass()));

            if (mentionList.isEmpty()) {
                if (!full2acro)
                    continue;

                LOGGER.debug("doAbbreviationBased() -  no entity mentions of interest found on this abbreviation");

                // Important: only an exact match on the full form counts here.
                final List<EntityMention> fullFormMentionList = new ArrayList<>();
                for (final EntityMention mention : entityMentionTypes)
                    fullFormMentionList
                            .addAll(UIMAUtils.getExactAnnotations(aJCas, fullFormAnnotation, mention.getClass()));

                if (!fullFormMentionList.isEmpty()) {
                    LOGGER.debug("doAbbreviationBased() -  but found entity mention on full form");
                    LOGGER.debug("doAbbreviationBased() -  adding annotation to unlabeled entity mention");
                    addCopiedMention(aJCas, fullFormMentionList.get(0), abbrev.getBegin(), abbrev.getEnd(),
                            abbrev.getCoveredText());
                }
            } else if (acro2full) {
                LOGGER.debug("doAbbreviationBased() -  abbreviation has entity mentions of interest");

                final List<EntityMention> fullFormMentionList = new ArrayList<>();
                for (final EntityMention mention : entityMentionTypes)
                    fullFormMentionList
                            .addAll(UIMAUtils.getAnnotations(aJCas, fullFormAnnotation, mention.getClass()));

                if (fullFormMentionList.isEmpty()) {
                    LOGGER.debug(
                            "doAbbreviationBased() -  but reference full form has no entity mentions of interest");
                    LOGGER.debug("doAbbreviationBased() -  adding annotation to unlabeled entity mention");
                    // NOTE(review): the textual representation is the
                    // abbreviation's text although the new mention spans the
                    // full form. Kept as in the original implementation -
                    // confirm whether the full form's text is intended.
                    addCopiedMention(aJCas, mentionList.get(0), fullFormAnnotation.getBegin(),
                            fullFormAnnotation.getEnd(), abbrev.getCoveredText());
                }
            }
        }
    }

    /**
     * Creates a new entity mention of the same class as {@code refEntityMention}
     * on the given span, copies specific type, resource entry list and
     * confidence from the reference, and adds the new mention to the indexes.
     * 
     * @param aJCas
     *            the CAS to create the annotation in
     * @param refEntityMention
     *            the mention whose class and feature values are copied
     * @param begin
     *            begin offset of the new mention
     * @param end
     *            end offset of the new mention
     * @param textualRepresentation
     *            value for the new mention's textual representation
     * @throws AnalysisEngineProcessException
     *             if the annotation cannot be created; the cause is attached
     */
    private static void addCopiedMention(final JCas aJCas, final EntityMention refEntityMention, final int begin,
            final int end, final String textualRepresentation) throws AnalysisEngineProcessException {
        try {
            final EntityMention newEntityMention = (EntityMention) JCoReAnnotationTools
                    .getAnnotationByClassName(aJCas, refEntityMention.getClass().getName());
            newEntityMention.setBegin(begin);
            newEntityMention.setEnd(end);
            newEntityMention.setSpecificType(refEntityMention.getSpecificType());
            newEntityMention.setResourceEntryList(refEntityMention.getResourceEntryList());
            newEntityMention.setConfidence(refEntityMention.getConfidence());
            newEntityMention.setTextualRepresentation(textualRepresentation);
            newEntityMention.setComponentId(COMPONENT_ID + " Abbrev");
            newEntityMention.addToIndexes();
        } catch (final Exception e) {
            LOGGER.error("doAbbreviationBased() - could not get create new entity mention annotation: "
                    + refEntityMention.getClass().getName(), e);
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Consistency preservation based on (exact) string matching, for mode
     * {@link #MODE_STRING}. For every entity mention class, each distinct
     * covered text of an existing mention is searched in the document text.
     * Occurrences that do not overlap an existing mention with the same
     * specific type get a new mention of that class, with its span extended to
     * the boundaries of the tokens it overlaps.
     * <p>
     * If {@code confidenceThresholdForConsistencyPreservation} is greater than
     * 0, the new mentions for a string are only added if the originally
     * recognized mentions make up at least a third of all occurrences and
     * their mean confidence exceeds the threshold (mentions without a
     * confidence value count as 0). Otherwise all new mentions are added.
     * <p>
     * TODO: more intelligent (voting) mechanism needed to avoid false positives
     * <br>
     * TODO: needs to be checked for performance
     * 
     * @param aJCas
     *            the CAS to read entity mentions from and add annotations to
     * @param entityMentionClassnames
     *            the entity mention class names to be considered
     * @param confidenceThresholdForConsistencyPreservation
     *            mean-confidence threshold; values &lt;= 0 disable the check
     * @throws AnalysisEngineProcessException
     *             if processing one of the entity classes fails; the cause is
     *             attached
     */
    public void stringMatch(final JCas aJCas, final TreeSet<String> entityMentionClassnames,
            double confidenceThresholdForConsistencyPreservation) throws AnalysisEngineProcessException {

        if (!activeModes.contains(MODE_STRING))
            return;

        final String text = aJCas.getDocumentText();
        final TypeSystem ts = aJCas.getTypeSystem();

        for (final String entityMentionClassname : entityMentionClassnames) {
            // Existing mentions grouped by specific type. The index is built
            // per entity class so that one class does not interfere with
            // another.
            final Map<String, TreeSet<EntityMention>> overlapIndex = new HashMap<>();
            try {
                // Scratch annotation (never added to the indexes) that carries
                // the offsets of the span to look up in the overlap index.
                final EntityMention mentionForOffsetComparison = (EntityMention) JCoReAnnotationTools
                        .getAnnotationByClassName(aJCas, entityMentionClassname);

                LOGGER.debug("doStringBased() - checking consistency for type: {}", entityMentionClassname);
                final Multimap<String, EntityMention> entityMap = HashMultimap.create();

                final Type entityType = ts.getType(entityMentionClassname);
                if (null == entityType)
                    throw new IllegalArgumentException(
                            "Entity type \"" + entityMentionClassname + "\" was not found in the type system.");

                // group all existing mentions by covered text and register
                // them in the overlap index
                LOGGER.debug("doStringBased() - building entity map");
                final Iterator<Annotation> entityIter = aJCas.getAnnotationIndex(entityType).iterator();
                while (entityIter.hasNext()) {
                    final EntityMention entity = (EntityMention) entityIter.next();
                    entityMap.put(entity.getCoveredText(), entity);

                    final String typeKey = specificTypeKey(entity);
                    TreeSet<EntityMention> sameTypeMentions = overlapIndex.get(typeKey);
                    if (null == sameTypeMentions) {
                        sameTypeMentions = new TreeSet<>(OVERLAP_COMPARATOR);
                        overlapIndex.put(typeKey, sameTypeMentions);
                    }
                    sameTypeMentions.add(entity);
                }

                // now search for strings not detected as this kind of entity
                LOGGER.debug("doStringBased() - searching for missed entities...");
                for (final String entityString : entityMap.keySet()) {
                    // An empty string "occurs" at every position and would
                    // never let the search below advance.
                    if (entityString == null || entityString.isEmpty())
                        continue;

                    final EntityMention entity = entityMap.get(entityString).iterator().next();
                    final TreeSet<EntityMention> overlapSet = overlapIndex.get(specificTypeKey(entity));

                    LOGGER.debug("doStringBased() - checking entity string: {}", entityString);

                    final int entityLength = entityString.length();
                    final List<EntityMention> stringMatchedEntities = new ArrayList<>();
                    for (int pos = text.indexOf(entityString); pos > -1; pos = text.indexOf(entityString,
                            pos + entityLength)) {
                        LOGGER.debug("doStringBased() - found string at pos: {}", pos);

                        // Is there already a mention with the same specific
                        // type overlapping the full span of this occurrence?
                        mentionForOffsetComparison.setBegin(pos);
                        mentionForOffsetComparison.setEnd(pos + entityLength);
                        if (overlapSet.contains(mentionForOffsetComparison)) {
                            LOGGER.debug("doStringBased() - there is already an entity!");
                            continue;
                        }

                        LOGGER.debug("doStringBased() - adding annotation to unlabeled entity mention");
                        final EntityMention refEntity = (EntityMention) JCoReAnnotationTools
                                .getAnnotationByClassName(aJCas, entityMentionClassname);
                        // We do not annotate just the found string but extend
                        // it to the offsets of the overlapped tokens. Note
                        // that nothing limits how far the span is extended.
                        final List<Token> overlappingTokens = JCoReAnnotationTools
                                .getNearestOverlappingAnnotations(aJCas,
                                        new Annotation(entity.getCAS().getJCas(), pos, pos + entityLength),
                                        Token.class);
                        final boolean hasTokens = overlappingTokens.size() > 0;
                        final int begin = hasTokens ? overlappingTokens.get(0).getBegin() : pos;
                        final int end = hasTokens ? overlappingTokens.get(overlappingTokens.size() - 1).getEnd()
                                : pos + entityLength;

                        refEntity.setBegin(begin);
                        refEntity.setEnd(end);
                        refEntity.setSpecificType(entity.getSpecificType());
                        refEntity.setResourceEntryList(entity.getResourceEntryList());
                        refEntity.setConfidence(entity.getConfidence());
                        refEntity.setTextualRepresentation(entity.getTextualRepresentation());
                        refEntity.setComponentId(COMPONENT_ID + " String (" + entity.getCoveredText() + ", "
                                + begin + "-" + end + ")");
                        stringMatchedEntities.add(refEntity);
                    }

                    if (stringMatchedEntities.isEmpty())
                        continue;

                    boolean accept = true;
                    if (confidenceThresholdForConsistencyPreservation > 0) {
                        // Only add the new mentions if the originally found
                        // mentions with the same string give enough evidence.
                        final double meanConfidence = meanConfidence(entityMap.get(entityString));
                        final int recognized = entityMap.get(entityString).size();
                        final int allMatches = stringMatchedEntities.size() + recognized;
                        accept = (recognized >= allMatches / 3d)
                                && (meanConfidence > confidenceThresholdForConsistencyPreservation);
                    }

                    if (accept) {
                        for (final EntityMention refEntity : stringMatchedEntities) {
                            // Register the new mention in the overlap index so
                            // that later strings (e.g. substrings of this one)
                            // do not produce duplicates. If the token-extended
                            // span already overlaps a registered mention, the
                            // new mention would itself be a duplicate and is
                            // not indexed.
                            if (overlapSet.add(refEntity))
                                refEntity.addToIndexes();
                        }
                    }
                }

            } catch (final Exception e) {
                LOGGER.error("doStringBased() - exception occured: " + e.getMessage(), e);
                throw new AnalysisEngineProcessException(e);
            }
        }
    }

    /**
     * Returns the overlap-index key for a mention: its specific type, or a
     * placeholder if the specific type is blank.
     */
    private static String specificTypeKey(final EntityMention mention) {
        final String specificType = mention.getSpecificType();
        return StringUtils.isBlank(specificType) ? NO_SPECIFIC_TYPE : specificType;
    }

    /**
     * Returns the mean of the mentions' confidence values. Mentions without a
     * confidence value contribute 0 but still count towards the mean.
     * 
     * @throws NumberFormatException
     *             if a confidence value is not a parsable double
     */
    private static double meanConfidence(final Iterable<EntityMention> mentions) {
        double sum = 0;
        int count = 0;
        for (final EntityMention mention : mentions) {
            count++;
            final String confidence = mention.getConfidence();
            if (null != confidence)
                sum += Double.parseDouble(confidence);
        }
        return count == 0 ? 0 : sum / count;
    }

    /** Returns the active modes of this instance in a readable form. */
    @Override
    public String toString() {
        return "activeModes: " + activeModes;
    }
}