org.apache.ctakes.lvg.ae.LvgAnnotator.java Source code

Introduction

Here is the source code for org.apache.ctakes.lvg.ae.LvgAnnotator.java, a cTAKES UIMA annotator that uses the UMLS LVG (Lexical Variant Generation) package to set the canonical form of each WordToken and, optionally, its lemmas along with their parts of speech.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.lvg.ae;

import gov.nih.nlm.nls.lvg.Api.LvgCmdApi;
import gov.nih.nlm.nls.lvg.Api.LvgLexItemApi;
import gov.nih.nlm.nls.lvg.Lib.Category;
import gov.nih.nlm.nls.lvg.Lib.LexItem;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;

import org.apache.commons.io.FileUtils;
import org.apache.ctakes.core.util.ListFactory;
import org.apache.ctakes.lvg.resource.LvgCmdApiResource;
import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
import org.apache.ctakes.typesystem.type.syntax.Lemma;
import org.apache.ctakes.typesystem.type.syntax.WordToken;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.jcas.cas.FSList;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;

/**
 * UIMA annotator that uses the UMLS LVG package to find the canonical form of
 * WordTokens. The package is also used to find one or more lemmas for a given
 * WordToken, along with their associated parts of speech.
 * 
 * @author Mayo Clinic
 * 
 *         TODO: Determine what effect using the cache has on words that may be
 *         misspelled. Automatically normalizing a word from the cache may be
 *         wrong when the misspelling happens to be a valid word in the
 *         lexicon.
 */
public class LvgAnnotator extends JCasAnnotator_ImplBase {
    public static final String[] defaultExclusionWords = { "And", "and", "By", "by", "For", "for", "In", "in", "Of",
            "of", "On", "on", "The", "the", "To", "to", "With", "with" };
    public static final String[] defaultTreebankMap = { "adj|JJ", "adv|RB", "aux|AUX", "compl|CS", "conj|CC",
            "det|DET", "modal|MD", "noun|NN", "prep|IN", "pron|PRP", "verb|VB" };

    /**
     * Value is "PostLemmas". This parameter determines whether the feature
     * lemmaEntries will be populated for word annotations.
     */
    public static final String PARAM_POST_LEMMAS = "PostLemmas";
    @ConfigurationParameter(name = PARAM_POST_LEMMAS, mandatory = false, defaultValue = "false", description = "Whether to extract the lexical variants and write to cas (creates large files)")
    private boolean postLemmas;

    /**
     * Value is "UseLemmaCache". This parameter determines whether a cache will
     * be used to improve performance of setting lemma entries.
     */
    public static final String PARAM_USE_LEMMA_CACHE = "UseLemmaCache";
    @ConfigurationParameter(name = PARAM_USE_LEMMA_CACHE, mandatory = false, defaultValue = "false", description = "Whether to use a cache for lemmas")
    private boolean useLemmaCache;

    /**
     * Value is "LemmaCacheFileLocation". This parameter determines where the
     * lemma cache is located.
     */
    public static final String PARAM_LEMMA_CACHE_FILE_LOCATION = "LemmaCacheFileLocation";
    @ConfigurationParameter(name = PARAM_LEMMA_CACHE_FILE_LOCATION, mandatory = false, defaultValue = "/org/apache/ctakes/lvg/2005_lemma.voc", description = "Path to lemma cache file -- if useLemmaCache and postLemmas are true")
    private String lemmaCacheFileLocation = null;

    /**
     * Value is "LemmaCacheFrequencyCutoff". This parameter sets a threshold for
     * the frequency of a lemma to be loaded into the cache.
     */
    public static final String PARAM_LEMMA_CACHE_FREQUENCY_CUTOFF = "LemmaCacheFrequencyCutoff";
    @ConfigurationParameter(name = PARAM_LEMMA_CACHE_FREQUENCY_CUTOFF, mandatory = false, description = "Threshold for the frequency of a lemma to be loaded into the cache", defaultValue = "20")
    private int cmdCacheFreqCutoff;

    public static final String PARAM_USE_SEGMENTS = "UseSegments";
    @ConfigurationParameter(name = PARAM_USE_SEGMENTS, mandatory = false, defaultValue = "false", description = "Whether to use segments found in upstream cTAKES components")
    private boolean useSegments;

    public static final String PARAM_SKIP_SEGMENTS = "SegmentsToSkip";
    @ConfigurationParameter(name = PARAM_SKIP_SEGMENTS, mandatory = false, defaultValue = {}, description = "Segment IDs to skip during processing")
    private String[] skipSegmentIDs;
    private Set<String> skipSegmentsSet;

    public static final String PARAM_XT_MAP = "XeroxTreebankMap";
    @ConfigurationParameter(name = PARAM_XT_MAP, mandatory = false, description = "Mapping from Xerox parts of speech to Treebank equivalents")
    private String[] xtMaps = defaultTreebankMap;
    private Map<String, String> xeroxTreebankMap;

    public static final String PARAM_USE_CMD_CACHE = "UseCmdCache";
    @ConfigurationParameter(name = PARAM_USE_CMD_CACHE, mandatory = false, defaultValue = "false", description = "Use cache to track canonical forms")
    private boolean useCmdCache;

    public static final String PARAM_CMD_CACHE_FILE = "CmdCacheFileLocation";
    @ConfigurationParameter(name = PARAM_CMD_CACHE_FILE, mandatory = false, defaultValue = "/org/apache/ctakes/lvg/2005_norm.voc", description = "File with stored cache of canonical forms")
    private String cmdCacheFileLocation;

    public static final String PARAM_LEMMA_FREQ_CUTOFF = "CmdCacheFrequencyCutoff";
    @ConfigurationParameter(name = PARAM_LEMMA_FREQ_CUTOFF, mandatory = false, description = "Minimum frequency required for loading from cache", defaultValue = "20")
    private int lemmaCacheFreqCutoff;

    public static final String PARAM_EXCLUSION_WORDS = "ExclusionSet";
    @ConfigurationParameter(name = PARAM_EXCLUSION_WORDS, mandatory = false, description = "Words to exclude when doing LVG normalization")
    private String[] wordsToExclude = defaultExclusionWords;
    private Set<String> exclusionSet;

    // LOG4J logger based on class name
    private Logger logger = Logger.getLogger(getClass().getName());

    public static final String PARAM_LVGCMDAPI_RESRC_KEY = "LvgCmdApi";
    @ExternalResource(key = PARAM_LVGCMDAPI_RESRC_KEY, mandatory = true)
    private LvgCmdApiResource lvgResource;

    private LvgCmdApi lvgCmd;

    private LvgLexItemApi lvgLexItem;

    // key = word, value = canonical word
    private Map<String, String> normCacheMap;

    // key = word, value = Set of Lemma objects
    private Map<String, Set<LemmaLocalClass>> lemmaCacheMap;

    /**
     * Performs initialization logic. This implementation reads the
     * configuration parameter values, opens the LVG resources, and optionally
     * preloads the canonical-form and lemma caches.
     * 
     * @see org.apache.uima.fit.component.JCasAnnotator_ImplBase#initialize(UimaContext)
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        configInit();

        try {
            lvgCmd = lvgResource.getLvg();

            if (useCmdCache) {
                logger.info("Loading Cmd cache=" + cmdCacheFileLocation);
                loadCmdCacheFile(cmdCacheFileLocation);
                logger.info("Loaded " + normCacheMap.size() + " entries");
            }

            if (postLemmas) {
                lvgLexItem = lvgResource.getLvgLex();
                if (useLemmaCache) {
                    logger.info("Loading Lemma cache=" + lemmaCacheFileLocation);
                    loadLemmaCacheFile(lemmaCacheFileLocation);
                    logger.info("Loaded " + lemmaCacheMap.size() + " entries");
                }
            }

        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Sets configuration parameters with values from the descriptor.
     */
    private void configInit() {
        skipSegmentsSet = new HashSet<>();
        for (String segmentID : skipSegmentIDs) {
            skipSegmentsSet.add(segmentID);
        }

        // Load Xerox Treebank tagset map
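        // each entry has the form "xeroxTag|treebankTag", e.g. "noun|NN"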
        xeroxTreebankMap = new HashMap<>();
        for (String xtMap : xtMaps) {
            StringTokenizer tokenizer = new StringTokenizer(xtMap, "|");
            if (tokenizer.countTokens() == 2) {
                String xTag = tokenizer.nextToken();
                String tTag = tokenizer.nextToken();
                xeroxTreebankMap.put(xTag, tTag);
            }
        }

        exclusionSet = new HashSet<>();
        for (String word : wordsToExclude) {
            exclusionSet.add(word);
        }
    }

    /**
     * Invokes this annotator's analysis logic.
     */
    @Override
    public void process(JCas jcas) throws AnalysisEngineProcessException {

        logger.info("process(JCas)");

        String text = jcas.getDocumentText();

        try {
            if (useSegments) {
                JFSIndexRepository indexes = jcas.getJFSIndexRepository();
                Iterator<?> segmentItr = indexes.getAnnotationIndex(Segment.type).iterator();
                while (segmentItr.hasNext()) {
                    Segment segmentAnnotation = (Segment) segmentItr.next();
                    String segmentID = segmentAnnotation.getId();

                    if (!skipSegmentsSet.contains(segmentID)) {
                        int start = segmentAnnotation.getBegin();
                        int end = segmentAnnotation.getEnd();
                        annotateRange(jcas, text, start, end);
                    }
                }
            } else {
                // annotate over full doc text
                annotateRange(jcas, text, 0, text.length());
            }
        } catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }

    }

    /**
     * A utility method that annotates a given range.
     */
    protected void annotateRange(JCas jcas, String text, int rangeBegin, int rangeEnd)
            throws AnalysisEngineProcessException {
        JFSIndexRepository indexes = jcas.getJFSIndexRepository();
        Iterator<?> wordItr = indexes.getAnnotationIndex(WordToken.type).iterator();
        while (wordItr.hasNext()) {
            WordToken wordAnnotation = (WordToken) wordItr.next();
            if (wordAnnotation.getBegin() >= rangeBegin && wordAnnotation.getEnd() <= rangeEnd) {
                String word = text.substring(wordAnnotation.getBegin(), wordAnnotation.getEnd());

                // if the original word was misspelled, use the spell correction
                String suggestion = wordAnnotation.getSuggestion();

                if ((suggestion != null) && (suggestion.length() > 0)) {
                    word = suggestion;
                }

                // skip past words that are part of the exclusion set
                if (exclusionSet.contains(word))
                    continue;

                setCanonicalForm(wordAnnotation, word);
                if (postLemmas)
                    setLemma(wordAnnotation, word, jcas);
            }
        }
    }

    private void setCanonicalForm(WordToken wordAnnotation, String word) throws AnalysisEngineProcessException {
        // apply LVG processing to get canonical form
        String canonicalForm = null;
        if (useCmdCache) {
            canonicalForm = normCacheMap.get(word);
            if (canonicalForm == null) {
                logger.debug("[" + word + "] was not found in LVG norm cache.");
            }
        }

        // only apply LVG processing if not found in cache first
        if (canonicalForm == null) {
            try {
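                // LVG Norm output is pipe-delimited; field 1 holds the
                // normalized form, or the literal text "No Output" when
                // LVG produces none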
                String out = lvgCmd.MutateToString(word);

                String[] output = out.split("\\|");

                if ((output.length >= 2) && (!output[1].equals("No Output"))) {
                    canonicalForm = output[1];
                }
            } catch (Exception e) {
                throw new AnalysisEngineProcessException(e);
            }

        }

        if (canonicalForm != null) {
            wordAnnotation.setCanonicalForm(canonicalForm);
        }
    }

    private void setLemma(WordToken wordAnnotation, String word, JCas jcas) throws AnalysisEngineProcessException {
        // apply LVG processing to get lemmas
        // key = lemma string, value = Set of POS tags
        Map<String, Set<String>> lemmaMap = null;

        if (useLemmaCache) {
            Set<LemmaLocalClass> lemmaSet = lemmaCacheMap.get(word);
            if (lemmaSet == null) {
                logger.debug("[" + word + "] was not found in LVG lemma cache.");
            } else {
                lemmaMap = new HashMap<>();
                for (LemmaLocalClass l : lemmaSet) {
                    lemmaMap.put(l.word, l.posSet);
                }
            }
        }

        if (lemmaMap == null) {
            lemmaMap = new HashMap<>();
            try {
                Vector<?> lexItems = lvgLexItem.MutateLexItem(word);
                for (Object item : lexItems) {
                    LexItem li = (LexItem) item;

                    Category c = li.GetTargetCategory();
                    String lemmaStr = li.GetTargetTerm();
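                    // the category value is a bit vector that may encode
                    // several parts of speech at once; expand it into its
                    // individual category bits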
                    long[] bitValues = Category.ToValuesArray(c.GetValue());
                    for (long bitValue : bitValues) {
                        // note that POS is Xerox tagset
                        String lemmaPos = Category.ToName(bitValue);
                        // convert Xerox tagset to PennTreebank tagset
                        String treebankTag = xeroxTreebankMap.get(lemmaPos);
                        if (treebankTag != null) {
                            lemmaMap.computeIfAbsent(lemmaStr, k -> new HashSet<>()).add(treebankTag);
                        }
                    }
                }
            } catch (Exception e) {
                throw new AnalysisEngineProcessException(e);
            }
        }

        // add lemma information to the CAS
        Collection<Lemma> lemmas = new ArrayList<>(lemmaMap.keySet().size());

        for (Map.Entry<String, Set<String>> entry : lemmaMap.entrySet()) {
            String form = entry.getKey();
            for (String pos : entry.getValue()) { // one Lemma per part of speech
                Lemma lemma = new Lemma(jcas);
                lemma.setKey(form);
                lemma.setPosTag(pos);
                lemmas.add(lemma);
            }
        }
        Lemma[] lemmaArray = lemmas.toArray(new Lemma[lemmas.size()]);
        FSList fsList = ListFactory.buildList(jcas, lemmaArray);
        wordAnnotation.setLemmaEntries(fsList);
    }

    /**
     * Helper method that loads a Norm cache file.
     * 
     * @param cpLocation classpath location of the Norm cache file
     */
    private void loadCmdCacheFile(String cpLocation) throws FileNotFoundException, IOException {
        try (InputStream inStream = getClass().getResourceAsStream(cpLocation);
                BufferedReader br = new BufferedReader(new InputStreamReader(inStream));) {
            // initialize map
            normCacheMap = new HashMap<>();
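            // each cache line holds 7 pipe-delimited fields; only the first
            // three (frequency, original word, normalized word) are used here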

            String line = br.readLine();
            while (line != null) {
                StringTokenizer st = new StringTokenizer(line, "|");
                if (st.countTokens() == 7) {
                    int freq = Integer.parseInt(st.nextToken());
                    if (freq > cmdCacheFreqCutoff) {
                        String origWord = st.nextToken();
                        String normWord = st.nextToken();
                        if (!normCacheMap.containsKey(origWord)) {
                            // if there are duplicates, then only have the first
                            // occurrence in the map
                            normCacheMap.put(origWord, normWord);
                        }
                    } else {
                        logger.debug("Discarding norm cache line due to frequency cutoff: " + line);
                    }
                } else {
                    logger.warn("Invalid LVG norm cache " + "line: " + line);
                }
                line = br.readLine();
            }
        }
    }

    /**
     * Helper method that loads a Lemma cache file.
     * 
     * @param cpLocation classpath location of the Lemma cache file
     */
    private void loadLemmaCacheFile(String cpLocation) throws FileNotFoundException, IOException {
        try (InputStream inStream = getClass().getResourceAsStream(cpLocation);
                BufferedReader br = new BufferedReader(new InputStreamReader(inStream));) {
            // initialize map
            lemmaCacheMap = new HashMap<>();
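            // each cache line holds 4 pipe-delimited fields:
            // frequency|original word|lemma|<categories>, where <categories>
            // lists LVG category names inside angle brackets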

            String line = br.readLine();
            while (line != null) {
                StringTokenizer st = new StringTokenizer(line, "|");
                // JZ: changed from 7 to 4 when a new dictionary was adopted
                if (st.countTokens() == 4) {
                    int freq = Integer.parseInt(st.nextToken());
                    if (freq > lemmaCacheFreqCutoff) {
                        String origWord = st.nextToken();
                        String lemmaWord = st.nextToken();
                        String combinedCategories = st.nextToken();

                        // strip < and > chars
                        combinedCategories = combinedCategories.substring(1, combinedCategories.length() - 1);

                        // construct Lemma object
                        LemmaLocalClass l = new LemmaLocalClass();
                        l.word = lemmaWord;
                        l.posSet = new HashSet<>();
                        long bitVector = Category.ToValue(combinedCategories);
                        long[] bitValues = Category.ToValuesArray(bitVector);
                        for (long bitValue : bitValues) {
                            String pos = Category.ToName(bitValue);
                            // convert Xerox tag into Treebank
                            String treebankTag = xeroxTreebankMap.get(pos);
                            if (treebankTag != null) {
                                l.posSet.add(treebankTag);
                            }
                        }

                        // add Lemma to cache map
                        lemmaCacheMap.computeIfAbsent(origWord, k -> new HashSet<>()).add(l);
                    } else {
                        logger.debug("Discarding lemma cache line due to frequency cutoff: " + line);
                    }
                } else {
                    logger.warn("Invalid LVG lemma cache " + "line: " + line);
                }
                line = br.readLine();
            }
        }
    }

    @SuppressWarnings("resource")
    public static AnalysisEngineDescription createAnnotatorDescription()
            throws ResourceInitializationException, MalformedURLException {
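        // the LVG properties file ships inside the jar, but the LvgCmdApi
        // resource needs a real file on disk, so it is copied out to /tmp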
        InputStream lvgStream = LvgAnnotator.class.getClassLoader()
                .getResourceAsStream("org/apache/ctakes/lvg/data/config/lvg.properties");
        File lvgFile = new File("/tmp/lvg.properties");
        try {
            FileUtils.copyInputStreamToFile(lvgStream, lvgFile);
        } catch (IOException e) {
            throw new RuntimeException(
                    "Error copying InputStream org/apache/ctakes/lvg/data/config/lvg.properties to temporary file /tmp/lvg.properties.",
                    e);
        }
        return AnalysisEngineFactory.createEngineDescription(LvgAnnotator.class, LvgAnnotator.PARAM_USE_CMD_CACHE,
                false, LvgAnnotator.PARAM_USE_LEMMA_CACHE, false, LvgAnnotator.PARAM_USE_SEGMENTS, false,
                LvgAnnotator.PARAM_LEMMA_CACHE_FREQUENCY_CUTOFF, 20, LvgAnnotator.PARAM_LEMMA_FREQ_CUTOFF, 20,
                LvgAnnotator.PARAM_POST_LEMMAS, false, LvgAnnotator.PARAM_LVGCMDAPI_RESRC_KEY,
                ExternalResourceFactory.createExternalResourceDescription(LvgCmdApiResourceImpl.class,
                        lvgFile.toURI().toURL()));
    }

    /**
     * Basic class to group a lemma word with its various parts of speech.
     * 
     * @author Mayo Clinic
     */
    class LemmaLocalClass {
        public String word;

        public Set<String> posSet;
    }

}
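
Example Usage

The sketch below shows one way to run LvgAnnotator in a uimaFIT pipeline using the createAnnotatorDescription() factory above. It is a minimal sketch rather than part of the original source: buildUpstreamDescription() is a hypothetical placeholder for the upstream cTAKES components (segmenter, sentence detector, tokenizer) that produce the WordToken annotations LvgAnnotator consumes, and the example assumes the cTAKES type system and the LVG resources are on the classpath.

import org.apache.ctakes.lvg.ae.LvgAnnotator;
import org.apache.ctakes.typesystem.type.syntax.WordToken;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

public class LvgAnnotatorExample {

    public static void main(String[] args) throws Exception {
        // hypothetical placeholder: substitute whatever upstream components
        // create the Segment and WordToken annotations in your pipeline
        AnalysisEngineDescription upstream = buildUpstreamDescription();
        AnalysisEngineDescription lvg = LvgAnnotator.createAnnotatorDescription();

        JCas jcas = JCasFactory.createJCas();
        jcas.setDocumentText("The patients were leaving.");

        SimplePipeline.runPipeline(jcas, upstream, lvg);

        // after processing, each WordToken carries its canonical form,
        // e.g. "patients" -> "patient"
        for (WordToken token : JCasUtil.select(jcas, WordToken.class)) {
            System.out.println(token.getCoveredText() + " -> " + token.getCanonicalForm());
        }
    }

    private static AnalysisEngineDescription buildUpstreamDescription() {
        // intentionally not implemented: supply your own tokenization chain
        throw new UnsupportedOperationException("supply the upstream tokenizer");
    }
}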