eu.annocultor.converters.europeana.EuropeanaLabelExtractor.java Source code

Introduction

Here is the source code for eu.annocultor.converters.europeana.EuropeanaLabelExtractor.java
Source

/*
 * Copyright 2005-2009 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.annocultor.converters.europeana;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

import eu.annocultor.tagger.preprocessors.LabelFilter;

/**
 * Preprocesses and filters values before they are looked up in a vocabulary (followed by disambiguation).
 * Improves recall.
 *
 * <p>Each incoming label is handled by the first rule that recognises it:
 * <ol>
 * <li>a four-level Spanish place hierarchy such as <code>Espana-Aragon-Teruel-Teruel</code>
 *     (spelled with n-tilde), see {@link #extractSpain(List, String)};</li>
 * <li>a country code followed by a spelled-out name such as <code>GB UNITED KINGDOM</code>,
 *     see {@link #extractDfgCoverage(List, String)};</li>
 * <li>otherwise the label is split on the separator pattern of the current mode and every
 *     fragment is trimmed and filtered by length and by character content.</li>
 * </ol>
 *
 * <p>In the default mode labels are split on commas and fragments must be longer than 2 characters.
 * In full-text mode labels are split on spaces and common punctuation, fragments must be longer
 * than 4 characters and must start with an upper-case character.
 *
 * @author Borys Omelayenko
 *
 */
public class EuropeanaLabelExtractor extends LabelFilter {

    /** Separator for the default mode: a plain comma. */
    private static final Pattern FIELD_SEPARATOR = Pattern.compile(",");

    /** Separators for full-text mode: space, dot, comma, semicolon, colon, brackets, parentheses, quote. */
    private static final Pattern FULL_TEXT_SEPARATOR = Pattern.compile("[ \\.,;:\\]\\[\\(\\)\"]");

    /** A fragment must be strictly longer than this in the default mode. */
    private static final int FIELD_MIN_LENGTH = 2;

    /** A fragment must be strictly longer than this in full-text mode. */
    private static final int FULL_TEXT_MIN_LENGTH = 4;

    /** A fragment must be strictly shorter than this in both modes. */
    private static final int MAX_LENGTH = 30;

    /** Maximum number of word separators (spaces or hyphens) tolerated inside one fragment. */
    private static final int MAX_WORD_SEPARATORS = 3;

    /** Number of hyphen-separated levels expected in a Spanish place hierarchy. */
    private static final int SPAIN_LEVELS = 4;

    private final boolean fullTextMode;
    private final Pattern pattern;
    private final int minLength;

    /**
     * Creates an extractor.
     *
     * @param fullTextMode
     *            <code>true</code> to tokenise free text (split on spaces and punctuation, keep only
     *            capitalised fragments longer than 4 characters); <code>false</code> to split on
     *            commas and keep fragments longer than 2 characters
     */
    public EuropeanaLabelExtractor(boolean fullTextMode) {
        this.fullTextMode = fullTextMode;
        this.pattern = fullTextMode ? FULL_TEXT_SEPARATOR : FIELD_SEPARATOR;
        this.minLength = fullTextMode ? FULL_TEXT_MIN_LENGTH : FIELD_MIN_LENGTH;
    }

    /**
     * Extracts lookup candidates from the given labels.
     *
     * <p><code>null</code> entries are skipped. A label recognised by one of the structured rules is
     * not split any further, but the remaining labels are still processed (previously the first
     * structured label silently ended processing of everything that followed it).
     *
     * @param labels
     *            raw labels, must not be <code>null</code> itself
     * @return a new list with the extracted candidates, in the order they were found
     */
    @Override
    public List<String> extract(List<String> labels) throws Exception {
        List<String> extracted = new ArrayList<String>();
        for (String label : labels) {
            if (label == null) {
                continue;
            }
            if (extractSpain(extracted, label) || extractDfgCoverage(extracted, label)) {
                // fully handled by a structured rule, move on to the next label
                continue;
            }
            split(extracted, pattern.split(label));
        }
        return extracted;
    }

    /**
     * Recognises a Spanish place hierarchy such as <code>Espana-Aragon-Teruel-Teruel</code>
     * (spelled with n-tilde): the label must start with the country name followed by a hyphen,
     * contain no colon, and consist of exactly four hyphen-separated parts.
     *
     * @param extracted
     *            receives all four parts, unfiltered, when the label matches; untouched otherwise
     * @param label
     *            label to inspect, not <code>null</code>
     * @return <code>true</code> if the label matched and its parts were added
     */
    boolean extractSpain(List<String> extracted, String label) {
        if (!label.startsWith("Espa\u00F1a-") || label.contains(":")) {
            return false;
        }
        String[] places = label.split("\\-");
        if (places.length != SPAIN_LEVELS) {
            return false;
        }
        for (String place : places) {
            extracted.add(place);
        }
        return true;
    }

    /**
     * Recognises a coverage value such as <code>GB UNITED KINGDOM</code>: an abbreviation, a space,
     * and a spelled-out name.
     *
     * <p>Nothing is added unless both parts are present, so a rejected label never leaves a
     * partial result behind (which the generic split could then add a second time).
     *
     * @param extracted
     *            receives the abbreviation and the trimmed spelled-out name when the label matches;
     *            untouched otherwise
     * @param label
     *            label to inspect, not <code>null</code>
     * @return <code>true</code> if the label matched and both parts were added
     */
    boolean extractDfgCoverage(List<String> extracted, String label) {
        if (!precheckisDfgCoverage(label)) {
            return false;
        }
        // NOTE(review): capitalize() only changes the first character, which the precheck already
        // requires to be upper case, so this test practically always passes. An all-upper-case
        // test may have been intended - confirm before tightening.
        if (!StringUtils.capitalize(label).equals(label)) {
            return false;
        }
        String countryAbbreviated = StringUtils.substringBefore(label, " ");
        String countrySpelled = StringUtils.substringAfter(label, " ").trim();
        if (StringUtils.isEmpty(countrySpelled)) {
            return false;
        }
        extracted.add(countryAbbreviated);
        extracted.add(countrySpelled);
        return true;
    }

    /**
     * Cheap precondition for {@link #extractDfgCoverage(List, String)}.
     *
     * @param label
     *            label to inspect, not <code>null</code>
     * @return <code>true</code> if the label is longer than 3 characters and its first two
     *         characters are upper case
     */
    boolean precheckisDfgCoverage(String label) {
        return label.length() > 3 && Character.isUpperCase(label.charAt(0))
                && Character.isUpperCase(label.charAt(1));
    }

    /**
     * Trims every fragment and adds those that pass the length and the character filters.
     *
     * @param extracted
     *            receives the accepted fragments
     * @param lbls
     *            fragments produced by splitting one label
     */
    void split(List<String> extracted, String[] lbls) {
        for (String fragment : lbls) {
            String trimmed = fragment.trim();
            // the length filter runs first, so the character filter never sees an empty string
            if (lengthFilter(trimmed) && specialCharacterFilter(trimmed)) {
                extracted.add(trimmed);
            }
        }
    }

    /**
     * Accepts a fragment that consists only of letters, spaces and hyphens and that contains at
     * most {@link #MAX_WORD_SEPARATORS} spaces/hyphens in total. In full-text mode the fragment
     * must additionally start with an upper-case character.
     *
     * <p>The fragment is inspected as is. It used to be lower-cased first, which does not change
     * the outcome for ordinary letters but turns U+0130 (capital I with dot above) into a letter
     * plus a combining mark outside Turkish locales, so such names were wrongly rejected.
     *
     * @param label
     *            non-empty, trimmed fragment
     * @return <code>true</code> if the fragment looks like a name
     */
    private boolean specialCharacterFilter(String label) {

        // in full-text labels should be capitalised
        if (fullTextMode && !Character.isUpperCase(label.charAt(0))) {
            return false;
        }

        // counted across the whole fragment; resetting it per character would disable the limit
        int separators = 0;
        for (int i = 0; i < label.length(); i++) {
            char c = label.charAt(i);
            if (c == ' ' || c == '-') {
                separators++;
                if (separators > MAX_WORD_SEPARATORS) {
                    // multi-word names sound suspicious
                    return false;
                }
            } else if (!Character.isLetter(c)) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param label
     *            trimmed fragment
     * @return <code>true</code> if the length is strictly between the mode's minimum and
     *         {@link #MAX_LENGTH}
     */
    private boolean lengthFilter(String label) {
        return label.length() > minLength && label.length() < MAX_LENGTH;
    }
}