banner.tagging.dictionary.DictionaryTagger.java Source code

Introduction

Here is the source code for banner.tagging.dictionary.DictionaryTagger.java
Source

/* 
 Copyright (c) 2007 Arizona State University, Dept. of Computer Science and Dept. of Biomedical Informatics.
 This file is part of the BANNER Named Entity Recognition System, http://banner.sourceforge.net
 This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org.  For further information, see the file 'LICENSE.txt' included with this distribution.
 */

package banner.tagging.dictionary;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.commons.configuration.HierarchicalConfiguration;

import dragon.nlp.tool.PorterStemmer;

import banner.tagging.Tagger;
import banner.tokenization.Tokenizer;
import banner.types.Mention;
import banner.types.EntityType;
import banner.types.Sentence;
import banner.types.Token;
import banner.types.Mention.MentionType;
import banner.util.Trie;

/**
 * This class represents a very simple dictionary-based tagger. All text subsequences which match an entry will be
 * tagged, without regard to the context.
 * <p>
 * Dictionary entries are tokenized with the configured {@link Tokenizer}, each token is normalized by
 * {@link #transform(String)}, and the resulting token sequence is stored in a {@link Trie}. Tagging walks the trie over
 * the (equally transformed) tokens of a sentence.
 * 
 * @author Bob
 */
public class DictionaryTagger implements Tagger {
    // TODO Add ability to do fuzzy / prefix / suffix searches
    // TODO Add ability to associate text with both a type and an identifier

    /**
     * Matches a single token consisting of letters followed by digits. Compiled once because
     * {@link #add(String, Collection)} runs for every dictionary entry.
     */
    private static final java.util.regex.Pattern LETTERS_THEN_DIGITS = java.util.regex.Pattern
            .compile("[A-Za-z]+[0-9]+");

    private Tokenizer tokenizer;
    private boolean filterContainedMentions;
    /** Token sequences mapped to the entity types they should be tagged with. */
    protected Trie<String, Set<EntityType>> entities;
    /** Token sequences that must never be added to {@link #entities}; see {@link #suppress(String)}. */
    protected Trie<String, Boolean> notInclude;
    private boolean normalizeMixedCase;
    private boolean normalizeDigits;
    private boolean generate2PartVariations;
    private boolean dropEndParentheticals;
    private boolean stemTokens;
    private PorterStemmer stemmer;

    /**
     * Creates a new {@link DictionaryTagger} with an empty dictionary and an empty suppression list. A tokenizer must
     * be supplied via {@link #configure(HierarchicalConfiguration, Tokenizer)} or {@link #setTokenizer(Tokenizer)}
     * before text entries can be added.
     */
    public DictionaryTagger() {
        entities = new Trie<String, Set<EntityType>>();
        notInclude = new Trie<String, Boolean>();
    }

    // TODO Determine how to combine this with loading
    /**
     * Reads the boolean options of this tagger from the configuration section named after the runtime class and
     * stores the tokenizer to use. Every option defaults to <code>false</code>.
     * 
     * @param config
     *            The configuration containing a section keyed by this class' name
     * @param tokenizer
     *            The {@link Tokenizer} used to split dictionary entries
     */
    public void configure(HierarchicalConfiguration config, Tokenizer tokenizer) {
        HierarchicalConfiguration localConfig = config.configurationAt(this.getClass().getName());
        filterContainedMentions = localConfig.getBoolean("filterContainedMentions", false);
        normalizeMixedCase = localConfig.getBoolean("normalizeMixedCase", false);
        normalizeDigits = localConfig.getBoolean("normalizeDigits", false);
        generate2PartVariations = localConfig.getBoolean("generate2PartVariations", false);
        dropEndParentheticals = localConfig.getBoolean("dropEndParentheticals", false);
        stemTokens = localConfig.getBoolean("stemTokens", false);
        if (stemTokens) {
            stemmer = new PorterStemmer();
        }
        this.tokenizer = tokenizer;
    }

    /**
     * Loads dictionary entries from the file named by the <code>dictionaryFile</code> setting, one entry per
     * non-blank line. If a <code>delimiter</code> is configured, each line is split on it and the field at index
     * <code>column</code> is used as the entry. All entries are added with the type named by
     * <code>dictionaryType</code>.
     * <p>
     * Note that the delimiter is passed to {@link String#split(String)} and is therefore interpreted as a regular
     * expression, and that the file is read using the platform default character set.
     * 
     * @param config
     *            The configuration containing a section keyed by this class' name
     * @throws IOException
     *             If the dictionary file cannot be opened or read
     * @throws IllegalArgumentException
     *             If the filename or type is missing, or a delimiter is given without a column
     */
    public void load(HierarchicalConfiguration config) throws IOException {
        HierarchicalConfiguration localConfig = config.configurationAt(this.getClass().getName());
        String dictionaryFilename = localConfig.getString("dictionaryFile");
        if (dictionaryFilename == null) {
            throw new IllegalArgumentException("Must specify dictionary filename");
        }
        String dictionaryTypeName = localConfig.getString("dictionaryType");
        if (dictionaryTypeName == null) {
            throw new IllegalArgumentException("Must specify dictionary type");
        }
        String delimiter = localConfig.getString("delimiter");
        int column = localConfig.getInt("column", -1);
        if (delimiter != null && column == -1) {
            throw new IllegalArgumentException("Must specify column if delimiter specified");
        }
        EntityType dictionaryType = EntityType.getType(dictionaryTypeName);

        // Load data; the reader is closed in a finally block so that a failure while reading or adding an entry
        // does not leak the file handle
        BufferedReader reader = new BufferedReader(new FileReader(dictionaryFilename));
        try {
            String line = reader.readLine();
            while (line != null) {
                line = line.trim();
                if (line.length() > 0) {
                    if (delimiter == null) {
                        add(line, dictionaryType);
                    } else {
                        // TODO Performance - don't use split
                        String[] split = line.split(delimiter);
                        add(split[column], dictionaryType);
                    }
                }
                line = reader.readLine();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Tokenizes the input with the configured tokenizer and replaces every token, in place in the list returned by
     * the tokenizer, with its {@link #transform(String) transformed} form.
     * 
     * @param input
     *            The text to tokenize; must not be <code>null</code>
     * @return The transformed tokens
     * @throws IllegalArgumentException
     *             If <code>input</code> is <code>null</code>
     */
    protected List<String> process(String input) {
        if (input == null) {
            throw new IllegalArgumentException();
        }
        List<String> tokens = tokenizer.getTokens(input);
        for (int i = 0; i < tokens.size(); i++) {
            tokens.set(i, transform(tokens.get(i)));
        }
        return tokens;
    }

    /**
     * Normalizes a single token according to the enabled options, applied in this order: stemming, mixed-case
     * lowering, digit normalization. With no option enabled the token is returned unchanged.
     * 
     * @param str
     *            The token text
     * @return The normalized token text
     */
    protected String transform(String str) {
        // This has been optimized for very fast operation
        String result = str;

        if (stemTokens) {
            // Carry the stem forward as the result so that it is kept even when no other normalization is enabled
            result = stemmer.stem(result);
        }
        if (normalizeMixedCase || normalizeDigits) {
            char[] chars = result.toCharArray();
            if (normalizeMixedCase) {
                // Only tokens containing BOTH cases are lowered; all-upper tokens are left untouched
                boolean hasUpper = false;
                boolean hasLower = false;
                for (int i = 0; i < chars.length && (!hasUpper || !hasLower); i++) {
                    hasUpper |= Character.isUpperCase(chars[i]);
                    hasLower |= Character.isLowerCase(chars[i]);
                }
                if (hasUpper && hasLower) {
                    for (int i = 0; i < chars.length; i++) {
                        chars[i] = Character.toLowerCase(chars[i]);
                    }
                }
            }
            // Each digit character is replaced individually, so the number of digits is preserved
            if (normalizeDigits) {
                for (int i = 0; i < chars.length; i++) {
                    if (Character.isDigit(chars[i])) {
                        chars[i] = '0';
                    }
                }
            }
            result = new String(chars);
        }
        return result;
    }

    /**
     * Adds a single entry to the dictionary. The text is processed by the tokenizer and the resulting tokens are stored.
     * 
     * @param text
     *            The text to find
     * @param type
     *            The {@link EntityType} to tag the text with
     */
    public void add(String text, EntityType type) {
        add(text, Collections.singleton(type));
    }

    /**
     * Adds a single entry to the dictionary for several types at once. If <code>generate2PartVariations</code> is
     * enabled, spelling variations are also added for entries that are a single letters-then-digits token, exactly
     * two tokens, or three tokens whose middle token is "-" or "/".
     * 
     * @param text
     *            The text to find
     * @param types
     *            The {@link EntityType}s to tag the text with
     * @throws IllegalArgumentException
     *             If <code>text</code> is <code>null</code> or yields no tokens
     */
    public void add(String text, Collection<EntityType> types) {
        // TODO Make configurable
        // if (text.length() == 1)
        // return;
        // TODO Add ability to not add items over N (eg 10) tokens long
        List<String> tokens = process(text);
        add(tokens, types);
        if (generate2PartVariations) {
            if (tokens.size() == 1 && LETTERS_THEN_DIGITS.matcher(tokens.get(0)).matches()) {
                String token = tokens.get(0);
                // The pattern guarantees at least one trailing digit, so this scan stops inside the token
                int split = 0;
                while (Character.isLetter(token.charAt(split))) {
                    split++;
                }
                add2Part(token.substring(0, split), token.substring(split, token.length()), types);
            }
            if (tokens.size() == 2) {
                add2Part(tokens.get(0), tokens.get(1), types);
            }
            if (tokens.size() == 3 && (tokens.get(1).equals("-") || tokens.get(1).equals("/"))) {
                add2Part(tokens.get(0), tokens.get(2), types);
            }
        }
        // TODO These lines add GENE recall but drop precision
        // if (tokens.size() > 1 && tokens.get(tokens.size() -
        // 1).equals("homolog"))
        // add(tokens.subList(0, tokens.size() - 1), types);
    }

    /**
     * Adds the four spelling variations of a two-part name: the parts joined into one token, the parts as two
     * tokens, and the parts separated by a "-" or a "/" token.
     */
    private void add2Part(String part1, String part2, Collection<EntityType> types) {
        // Joined form is a single token; it must not be followed by a second copy of part2
        List<String> joined = new ArrayList<String>(1);
        joined.add(part1 + part2);
        add(joined, types);

        List<String> tokens = new ArrayList<String>(3);
        tokens.add(part1);
        tokens.add(part2);
        add(tokens, types);
        tokens.add(1, "-");
        add(tokens, types);
        tokens.set(1, "/");
        add(tokens, types);
    }

    /**
     * Adds an already tokenized (and transformed) entry to the dictionary. The sequence is rejected if it has been
     * {@link #suppress(String) suppressed}; this check is made before any trailing parenthetical is dropped. If
     * <code>dropEndParentheticals</code> is enabled and the last token is ")", everything from the matching-looking
     * "(" (the nearest one to the left, not at index 0) onwards is removed; if no such "(" exists the entry is
     * rejected.
     * 
     * @param tokens
     *            The token sequence to store; must contain at least one token
     * @param types
     *            The {@link EntityType}s to associate with the sequence
     * @return <code>true</code> if the set of types stored for the sequence changed, <code>false</code> if the entry
     *         was rejected or all types were already present
     * @throws IllegalArgumentException
     *             If <code>tokens</code> is empty
     */
    public boolean add(List<String> tokens, Collection<EntityType> types) {
        if (tokens.size() == 0) {
            throw new IllegalArgumentException("Number of tokens must be greater than zero");
        }
        // Verify that the sequence to be added is not listed as not included
        Boolean value = notInclude.getValue(tokens);
        if (value != null) {
            return false;
        }
        // If configured, drop parenthetical phrases at the end of the sequence
        if (dropEndParentheticals && tokens.get(tokens.size() - 1).equals(")")) {
            int openParen = tokens.size() - 1;
            while (openParen > 0 && !tokens.get(openParen).equals("(")) {
                openParen--;
            }
            // Index 0 is never accepted: dropping from there would leave an empty sequence
            if (openParen <= 0) {
                return false;
            }
            tokens = tokens.subList(0, openParen);
        }
        Set<EntityType> currentTypes = entities.getValue(tokens);
        if (currentTypes == null) {
            currentTypes = new HashSet<EntityType>(1);
            entities.add(tokens, currentTypes);
        }
        return currentTypes.addAll(types);
    }

    /**
     * Tags every token subsequence of the sentence that matches a dictionary entry. When
     * <code>filterContainedMentions</code> is enabled, overlapping or touching mentions of the same type are merged
     * into a single mention spanning all of them; otherwise every match is added as found.
     * <p>
     * NOTE(review): the end index passed to {@link Mention} is treated as exclusive, which is consistent with the
     * adjacency test used for merging - confirm against {@link Mention}.
     * 
     * @param sentence
     *            The sentence to tag; mentions are added to it directly
     */
    public void tag(Sentence sentence) {
        List<Token> tokens = sentence.getTokens();
        // Lookup mentions
        List<Mention> mentions = new LinkedList<Mention>();
        for (int startIndex = 0; startIndex < tokens.size(); startIndex++) {
            Trie<String, Set<EntityType>> t = entities;
            for (int currentIndex = startIndex; currentIndex < tokens.size() && t != null; currentIndex++) {
                t = t.getChild(transform(tokens.get(currentIndex).getText()));
                if (t == null) {
                    break;
                }
                // Check the node right after consuming the token, so that a match ending on the last token of
                // the sentence is reported as well
                Set<EntityType> entityTypes = t.getValue();
                if (entityTypes != null) {
                    for (EntityType entityType : entityTypes) {
                        mentions.add(new Mention(sentence, startIndex, currentIndex + 1, entityType,
                                MentionType.Found));
                    }
                }
            }
        }

        // Add mentions found
        if (filterContainedMentions) {
            while (!mentions.isEmpty()) {
                Mention mention1 = mentions.remove(0);
                int start = mention1.getStart();
                int end = mention1.getEnd();
                boolean changed = true;
                while (changed) {
                    changed = false;
                    // A fresh iterator per pass: growing the span may make previously rejected mentions adjacent
                    Iterator<Mention> mentionIterator = mentions.iterator();
                    while (mentionIterator.hasNext()) {
                        Mention mention2 = mentionIterator.next();
                        boolean adjacent = (end >= mention2.getStart()) && (start <= mention2.getEnd());
                        if (adjacent && mention1.getEntityType().equals(mention2.getEntityType())) {
                            mentionIterator.remove();
                            start = Math.min(start, mention2.getStart());
                            end = Math.max(end, mention2.getEnd());
                            changed = true;
                        }
                    }
                }
                sentence.addMention(new Mention(sentence, start, end, mention1.getEntityType(), MentionType.Found));
            }
        } else {
            for (Mention mention : mentions) {
                sentence.addMention(mention);
            }
        }
    }

    /**
     * Marks the given text as not to be included: later attempts to add the same processed token sequence are
     * rejected. Entries that were added before this call are not removed.
     * 
     * @param text
     *            The text to suppress
     */
    public void suppress(String text) {
        notInclude.add(process(text), Boolean.TRUE);
    }

    /**
     * @return The number of entries in this dictionary
     */
    public int size() {
        // TODO PERFORMANCE This is a very intensive operation due to having to
        // search the entire tree!
        return entities.size();
    }

    /**
     * @return The tokenizer used to split dictionary entries
     */
    public Tokenizer getTokenizer() {
        return tokenizer;
    }

    /**
     * @param tokenizer
     *            The tokenizer used to split dictionary entries
     */
    public void setTokenizer(Tokenizer tokenizer) {
        this.tokenizer = tokenizer;
    }

    /**
     * @return Whether overlapping or touching mentions of the same type are merged when tagging
     */
    public boolean isFilterContainedMentions() {
        return filterContainedMentions;
    }

    /**
     * @param filterContainedMentions
     *            Whether overlapping or touching mentions of the same type are merged when tagging
     */
    public void setFilterContainedMentions(boolean filterContainedMentions) {
        this.filterContainedMentions = filterContainedMentions;
    }

    /**
     * @return Whether tokens containing both upper and lower case letters are lowercased
     */
    public boolean isNormalizeMixedCase() {
        return normalizeMixedCase;
    }

    /**
     * @param normalizeMixedCase
     *            Whether tokens containing both upper and lower case letters are lowercased
     */
    public void setNormalizeMixedCase(boolean normalizeMixedCase) {
        this.normalizeMixedCase = normalizeMixedCase;
    }

    /**
     * @return Whether every digit in a token is replaced by '0'
     */
    public boolean isNormalizeDigits() {
        return normalizeDigits;
    }

    /**
     * @param normalizeDigits
     *            Whether every digit in a token is replaced by '0'
     */
    public void setNormalizeDigits(boolean normalizeDigits) {
        this.normalizeDigits = normalizeDigits;
    }

    /**
     * @return Whether spelling variations are generated for two-part entries
     */
    public boolean isGenerate2PartVariations() {
        return generate2PartVariations;
    }

    /**
     * @param generate2PartVariations
     *            Whether spelling variations are generated for two-part entries
     */
    public void setGenerate2PartVariations(boolean generate2PartVariations) {
        this.generate2PartVariations = generate2PartVariations;
    }

    /**
     * @return Whether a parenthetical phrase at the end of an entry is dropped
     */
    public boolean isDropEndParentheticals() {
        return dropEndParentheticals;
    }

    /**
     * @param dropEndParentheticals
     *            Whether a parenthetical phrase at the end of an entry is dropped
     */
    public void setDropEndParentheticals(boolean dropEndParentheticals) {
        this.dropEndParentheticals = dropEndParentheticals;
    }

    /**
     * @return Whether tokens are stemmed before lookup
     */
    public boolean isStemTokens() {
        return stemTokens;
    }

    /**
     * Enables or disables stemming. Enabling it creates the stemmer if none exists yet, so that stemming can be
     * switched on without going through {@link #configure(HierarchicalConfiguration, Tokenizer)}.
     * 
     * @param stemTokens
     *            Whether tokens are stemmed before lookup
     */
    public void setStemTokens(boolean stemTokens) {
        this.stemTokens = stemTokens;
        if (stemTokens && stemmer == null) {
            stemmer = new PorterStemmer();
        }
    }

}