de.tudarmstadt.ukp.dkpro.keyphrases.bookindexing.candidate.HeuristicNamedEntityAnnotator.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.keyphrases.bookindexing.candidate.HeuristicNamedEntityAnnotator.java
Source

/*******************************************************************************
 * Copyright 2013
    
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Public License v3.0
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/gpl-3.0.txt
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.keyphrases.bookindexing.candidate;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;

import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * This component adds {@link NamedEntity} annotations to every list of
 * consecutive terms (= phrase) that satisfy a simple heuristic.
 * <p>
 * - A term has to consist solely of ASCII letters and digits.<br>
 * - Every term of the phrase has to start with an upper case letter.<br>
 * - When the phrase consists of a single term, it is not allowed to be at the
 * beginning of a sentence.<br>
 * - Terms that are written entirely in capital letters are not added to a phrase.
 * <p>
 * Obviously, not a valid approach for German :)
 *
 * @author zesch, parzonka
 *
 */
public class HeuristicNamedEntityAnnotator extends JCasAnnotator_ImplBase {
    /** Matches any single character that is not an ASCII letter or digit. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^A-Za-z0-9]");

    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {

        super.initialize(aContext);
    }

    /**
     * Annotates the document of the given CAS, or logs a warning when the CAS
     * carries no (or an empty) document text.
     *
     * @param jcas
     *            the CAS to process
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method and by {@code processDocument}
     */
    @Override
    public void process(JCas jcas) throws AnalysisEngineProcessException {

        String document = jcas.getDocumentText();
        // The document text may be null when it was never set; treat that like
        // an empty document instead of failing with a NullPointerException.
        if (document != null && document.length() > 0) {
            processDocument(jcas);
        } else {
            getContext().getLogger().log(Level.WARNING, "Document is empty");
        }
    }

    /**
     * Walks over the tokens of every sentence, collects runs of consecutive
     * candidate terms and turns each run into a {@link NamedEntity} annotation.
     *
     * @param aJCas
     *            the CAS whose sentences and tokens are inspected
     * @throws AnalysisEngineProcessException
     *             kept from the original signature; not thrown by this body itself
     */
    private void processDocument(JCas aJCas) throws AnalysisEngineProcessException {

        for (Sentence sentence : JCasUtil.select(aJCas, Sentence.class)) {

            // Number of tokens of the current sentence that have been fully handled.
            int sentenceOffset = 0;
            List<Token> namedEntityList = new ArrayList<Token>();

            for (Token token : JCasUtil.selectCovered(Token.class, sentence)) {
                String tokenString = token.getCoveredText();

                if (isCandidateTerm(tokenString)) {
                    namedEntityList.add(token);
                } else {
                    // A non-candidate token terminates the phrase collected so far.
                    flushPhrase(aJCas, namedEntityList, sentenceOffset);
                }
                sentenceOffset++;
            }

            // A phrase that reaches the last token of the sentence has no
            // terminating token, so it must be emitted here; otherwise it
            // would be silently dropped.
            flushPhrase(aJCas, namedEntityList, sentenceOffset);
        }
    }

    /**
     * Emits the collected phrase as a {@link NamedEntity} and empties the list.
     * Nothing is emitted when the list is empty or when the phrase is a single
     * token standing at the very beginning of the sentence.
     *
     * @param aJCas
     *            the CAS the annotation is added to
     * @param phraseTokens
     *            the consecutive candidate tokens collected so far; cleared on return
     * @param tokensSeen
     *            number of sentence tokens that precede the position at which
     *            the phrase ended
     */
    private void flushPhrase(JCas aJCas, List<Token> phraseTokens, int tokensSeen) {
        if (phraseTokens.isEmpty()) {
            return;
        }

        // A one-token phrase that ended after exactly one seen token is the
        // sentence-initial token; its capitalization carries no information.
        boolean singleSentenceInitialToken = phraseTokens.size() == 1 && tokensSeen == 1;
        if (!singleSentenceInitialToken) {
            NamedEntity ne = new NamedEntity(aJCas);
            ne.setBegin(phraseTokens.get(0).getBegin());
            ne.setEnd(phraseTokens.get(phraseTokens.size() - 1).getEnd());

            List<String> namedEntityStrings = new ArrayList<String>();
            for (Token t : phraseTokens) {
                namedEntityStrings.add(t.getCoveredText());
            }
            // The value is the token texts joined by single spaces.
            ne.setValue(StringUtils.join(namedEntityStrings, " "));
            ne.addToIndexes();
        }
        phraseTokens.clear();
    }

    /**
     * Checks whether a token may be part of a phrase: it starts with an upper
     * case letter, contains only ASCII letters and digits, and is not written
     * entirely in upper case.
     *
     * @param token
     *            the covered text of a token
     * @return true when the token satisfies all three conditions
     */
    private boolean isCandidateTerm(String token) {
        return startsWithUpperCase(token) && isAlphaNumeric(token) && !isAllCaps(token);
    }

    /**
     * Checks if the first character of the token is an upper case letter.
     *
     * @param token
     *            the covered text of a token
     * @return true when the token is non-empty and starts with an upper case
     *         character, else false
     */
    private boolean startsWithUpperCase(String token) {
        return !token.isEmpty() && Character.isUpperCase(token.charAt(0));
    }

    /**
     * Checks if the token is written in all capitals.
     *
     * @param token
     *            the covered text of a token
     * @return true when the token is non-empty and every character is upper
     *         case, else false
     */
    private boolean isAllCaps(String token) {
        if (token.isEmpty()) {
            return false;
        }
        for (int i = 0; i < token.length(); i++) {
            if (!Character.isUpperCase(token.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Checks if the token contains only ASCII letters and digits.
     *
     * @param token
     *            the covered text of a token
     * @return true when no character outside {@code A-Z}, {@code a-z} and
     *         {@code 0-9} occurs in the token (an empty token also yields true)
     */
    private boolean isAlphaNumeric(String token) {
        return !NON_ALPHANUMERIC.matcher(token).find();
    }

}