// Java tutorial
/******************************************************************************* * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique) * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * *******************************************************************************/ package eu.project.ttc.resources; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.LinkedList; import java.util.List; import org.apache.commons.lang3.mutable.MutableInt; import org.apache.uima.resource.DataResource; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.SharedResourceObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import eu.project.ttc.types.WordAnnotation; import fr.univnantes.lina.uima.tkregex.LabelledAnnotation; import fr.univnantes.lina.uima.tkregex.RegexOccurrence; public class CharacterFootprintTermFilter implements SharedResourceObject, OccurrenceFilter { private static final Logger LOGGER = LoggerFactory.getLogger(CharacterFootprintTermFilter.class); private static final int BAD_CHAR_RATE_THRESHOLD = 41; private char[] allowedChars; @Override public boolean 
accept(RegexOccurrence occurrence) { if (allowedChars == null) return true; int totalChars = 0; int totalWords = 0; int nbBadWords = 0; MutableInt badChars = new MutableInt(0); for (LabelledAnnotation a : occurrence.getLabelledAnnotations()) { WordAnnotation w = (WordAnnotation) a.getAnnotation(); totalChars += w.getCoveredText().length(); totalWords += 1; if (isBadWord(w, badChars)) nbBadWords += 1; } if (nbBadWords > 1) return false; if (totalChars <= totalWords * 3 && totalWords > 1) return false; int badCharRate = 100 * badChars.intValue() / totalChars; if (badCharRate >= BAD_CHAR_RATE_THRESHOLD) return false; return true; } /** * * @param anno the word anno * @param badChars the bad char counter. Being incremented * @return true if the word has one bad char, false otherwise */ private boolean isBadWord(WordAnnotation anno, MutableInt badChars) { final String coveredText = anno.getCoveredText(); boolean foundOneBadChar = false; for (int i = 0; i < coveredText.length(); i++) { boolean found = false; char c = coveredText.charAt(i); for (char a : this.allowedChars) { if (a == c) found = true; } if (!found) { badChars.increment(); foundOneBadChar = true; } } return foundOneBadChar; } @Override public void load(DataResource aData) throws ResourceInitializationException { LOGGER.debug("Loading resource character footprint resource at: " + aData.getUri()); InputStream inputStream = null; try { inputStream = aData.getInputStream(); List<Character> chars = new LinkedList<Character>(); BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, Charset.forName("UTF-8"))); int c; while ((c = br.read()) != -1) { if (!Character.isWhitespace((char) c)) chars.add((char) c); } this.allowedChars = new char[chars.size()]; for (int i = 0; i < chars.size(); i++) this.allowedChars[i] = chars.get(i); br.close(); } catch (IOException e) { LOGGER.error("Could not load resource character footprint resource due to an exception."); LOGGER.warn("Continuing with the 
TrueFilter (always accept terms)"); this.allowedChars = null; } catch (Exception e) { LOGGER.warn("PB loading " + aData.getUri() + ". Continuing with the TrueFilter (always accept terms)"); this.allowedChars = null; return; } finally { if (inputStream != null) try { inputStream.close(); } catch (IOException e) { LOGGER.error("Could not close input stream."); } } } }