eu.project.ttc.resources.CharacterFootprintTermFilter.java Source code

Java tutorial

Introduction

Here is the source code for eu.project.ttc.resources.CharacterFootprintTermFilter.java

Source

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package eu.project.ttc.resources;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.uima.resource.DataResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.SharedResourceObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.project.ttc.types.WordAnnotation;
import fr.univnantes.lina.uima.tkregex.LabelledAnnotation;
import fr.univnantes.lina.uima.tkregex.RegexOccurrence;

/**
 * Occurrence filter that rejects term occurrences containing too many
 * characters outside of an allowed character set (the "character footprint").
 *
 * <p>The allowed characters are read from a UIMA data resource by
 * {@link #load(DataResource)}: every non-whitespace character of the resource
 * is allowed. As long as no resource has been loaded successfully, the filter
 * accepts every occurrence.</p>
 */
public class CharacterFootprintTermFilter implements SharedResourceObject, OccurrenceFilter {
    private static final Logger LOGGER = LoggerFactory.getLogger(CharacterFootprintTermFilter.class);

    /** Percentage of bad characters at or above which an occurrence is rejected. */
    private static final int BAD_CHAR_RATE_THRESHOLD = 41;

    /**
     * Multi-word occurrences whose average word length is at most this value
     * are rejected.
     */
    private static final int SHORT_WORD_AVERAGE_LENGTH = 3;

    /** Allowed characters, or {@code null} when the filter must accept everything. */
    private char[] allowedChars;

    /**
     * Decides whether an occurrence looks like a valid term with respect to
     * the allowed character set.
     *
     * <p>An occurrence is rejected when more than one of its words contains a
     * bad character, when it has several words averaging at most
     * {@value #SHORT_WORD_AVERAGE_LENGTH} characters each, or when its
     * percentage of bad characters reaches {@value #BAD_CHAR_RATE_THRESHOLD}.</p>
     *
     * @param occurrence the occurrence to check; its labelled annotations are
     *        cast to {@link WordAnnotation}
     * @return {@code true} if the occurrence is accepted, {@code false} otherwise
     */
    @Override
    public boolean accept(RegexOccurrence occurrence) {
        if (allowedChars == null) {
            return true;
        }
        int totalChars = 0;
        int totalWords = 0;
        int nbBadWords = 0;
        int badChars = 0;
        for (LabelledAnnotation a : occurrence.getLabelledAnnotations()) {
            WordAnnotation w = (WordAnnotation) a.getAnnotation();
            String coveredText = w.getCoveredText();
            totalChars += coveredText.length();
            totalWords += 1;
            int wordBadChars = countBadChars(coveredText);
            if (wordBadChars > 0) {
                nbBadWords += 1;
                badChars += wordBadChars;
            }
        }
        if (nbBadWords > 1) {
            return false;
        }
        if (totalChars <= totalWords * SHORT_WORD_AVERAGE_LENGTH && totalWords > 1) {
            return false;
        }
        if (totalChars == 0) {
            // No character at all: the bad char rate is undefined. Treat it as
            // zero instead of failing on a division by zero.
            return true;
        }
        int badCharRate = 100 * badChars / totalChars;
        return badCharRate < BAD_CHAR_RATE_THRESHOLD;
    }

    /**
     * Counts the characters of a word that are not in the allowed set.
     *
     * @param coveredText the text covered by the word annotation
     * @return the number of bad characters, 0 if the word only has allowed ones
     */
    private int countBadChars(String coveredText) {
        int count = 0;
        for (int i = 0; i < coveredText.length(); i++) {
            if (!isAllowed(coveredText.charAt(i))) {
                count++;
            }
        }
        return count;
    }

    /**
     * Tells whether a character belongs to the allowed set.
     *
     * @param c the character to look up
     * @return {@code true} as soon as a matching allowed character is found
     */
    private boolean isAllowed(char c) {
        for (char a : this.allowedChars) {
            if (a == c) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the allowed character set from the given resource, decoded as
     * UTF-8. Whitespace characters are ignored.
     *
     * <p>Loading is best-effort: on any failure the problem is logged, the
     * allowed set is reset to {@code null} and the filter accepts every
     * occurrence.</p>
     *
     * @param aData the resource holding the allowed characters
     * @throws ResourceInitializationException declared by the interface; not
     *         thrown by this implementation
     */
    @Override
    public void load(DataResource aData) throws ResourceInitializationException {
        LOGGER.debug("Loading resource character footprint resource at: {}", aData.getUri());
        InputStream inputStream = null;
        try {
            inputStream = aData.getInputStream();
            BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, Charset.forName("UTF-8")));
            StringBuilder chars = new StringBuilder();
            int c;
            while ((c = br.read()) != -1) {
                if (!Character.isWhitespace((char) c)) {
                    chars.append((char) c);
                }
            }
            // Only publish the new set once the whole resource has been read.
            this.allowedChars = chars.toString().toCharArray();
        } catch (IOException e) {
            LOGGER.error("Could not load resource character footprint resource due to an exception.", e);
            LOGGER.warn("Continuing with the TrueFilter (always accept terms)");
            this.allowedChars = null;
        } catch (Exception e) {
            LOGGER.warn("PB loading " + aData.getUri() + ". Continuing with the TrueFilter (always accept terms)", e);
            this.allowedChars = null;
        } finally {
            // The reader only wraps this stream, so closing the stream is enough.
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    LOGGER.error("Could not close input stream.", e);
                }
            }
        }
    }
}