eu.annocultor.converters.europeana.RecordCompletenessRanking.java Source code

Introduction

Here is the source code for eu.annocultor.converters.europeana.RecordCompletenessRanking.java
Source

/*
 * Copyright 2005-2009 the original author or authors.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.annocultor.converters.europeana;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.solr.common.SolrInputDocument;

/**
 * Ranking records by completeness.
 * 
 * It gives rank from 0 to 10 for a record, that consists of two parts: 
 * up to 5 points for tags with values (potentially) coming from controlled vocabularies, and
 * up to 5 points for free-text fields.
 * 
 * These two parts reflect on metadata precision and its attractedness to human users.
 * 
 * Records with one of the title, description, or thumbnail missing get rank 0.
 * 
 * Then we count unique different words that are longer than 3 characters. 
 * 
 * First, we count words in several metadata tags: place, time, creator, etc. 
 * Each unique word counts as 1 point.
 * Maximal 5 points for tags.
 *
 * Second, we count words in text fields: title and description.
 * Each 5 unique words, that were not seen before in this record, count as 1 point.
 * Maximal 5 points for text fields.
 * 
 * If the record gets less than 2 points for fields, or less than 1 point for texts,
 * then it gets rank 0.
 *  
 * @author Borys Omelayenko
 */

public class RecordCompletenessRanking {

    private static final int MIN_WORD_LENGTH = 3;
    private static final int WORDS_IN_TAGS_PER_POINT = 1;
    private static final int WORDS_IN_TEXTS_PER_POINT = 5;

    /**
     * Checking completeness at ingestion to store in solr index.
     */
    public static int rankRecordCompleteness(SolrInputDocument document) {

        List<String> tags = new ArrayList<String>();
        tags.add(objectAsString(document.getFieldValue("dc_coverage")));
        tags.add(objectAsString(document.getFieldValue("dc_contributor")));
        //        tags.add(objectAsString(document.getFieldValue("dc_description")));
        tags.add(objectAsString(document.getFieldValue("dc_creator")));
        tags.add(objectAsString(document.getFieldValue("dc_date")));
        tags.add(objectAsString(document.getFieldValue("dc_format")));
        tags.add(objectAsString(document.getFieldValue("dc_identifier")));
        tags.add(objectAsString(document.getFieldValue("dc_language")));
        tags.add(objectAsString(document.getFieldValue("dc_publisher")));
        tags.add(objectAsString(document.getFieldValue("dc_relation")));
        tags.add(objectAsString(document.getFieldValue("dc_rights")));
        tags.add(objectAsString(document.getFieldValue("dc_source")));
        tags.add(objectAsString(document.getFieldValue("dc_subject")));
        //        tags.add(objectAsString(document.getFieldValue("dc_title")));
        //        tags.add(objectAsString(document.getFieldValue("dc_type")));

        tags.add(objectAsString(document.getFieldValue("dcterms_alternative")));
        tags.add(objectAsString(document.getFieldValue("dcterms_created")));
        tags.add(objectAsString(document.getFieldValue("dcterms_conformsTo")));
        tags.add(objectAsString(document.getFieldValue("dcterms_extent")));
        tags.add(objectAsString(document.getFieldValue("dcterms_hasFormat")));
        tags.add(objectAsString(document.getFieldValue("dcterms_hasPart")));
        tags.add(objectAsString(document.getFieldValue("dcterms_hasVersion")));
        tags.add(objectAsString(document.getFieldValue("dcterms_isFormatOf")));
        tags.add(objectAsString(document.getFieldValue("dcterms_isPartOf")));
        tags.add(objectAsString(document.getFieldValue("dcterms_isReferencedBy")));
        tags.add(objectAsString(document.getFieldValue("dcterms_isReplacedBy")));
        tags.add(objectAsString(document.getFieldValue("dcterms_isRequiredBy")));
        tags.add(objectAsString(document.getFieldValue("dcterms_issued")));
        tags.add(objectAsString(document.getFieldValue("dcterms_isVersionOf")));
        tags.add(objectAsString(document.getFieldValue("dcterms_medium")));
        tags.add(objectAsString(document.getFieldValue("dcterms_provenance")));
        tags.add(objectAsString(document.getFieldValue("dcterms_references")));
        tags.add(objectAsString(document.getFieldValue("dcterms_replaces")));
        tags.add(objectAsString(document.getFieldValue("dcterms_requires")));
        tags.add(objectAsString(document.getFieldValue("dcterms_spatial")));
        tags.add(objectAsString(document.getFieldValue("dcterms_tableOfContents")));
        tags.add(objectAsString(document.getFieldValue("dcterms_temporal")));

        String thumbnailUrl = objectAsString(document.getFieldValue("europeana_object"));
        String title = objectAsString(document.getFieldValue("dc_title"));
        String description = objectAsString(StringUtils.join(document.getFieldValues("dc_description"), "."));

        return rankRecordCompleteness(thumbnailUrl, title, description, tags);
    }

    /**
     * Here actual computation happens.
     */
    public static int rankRecordCompleteness(String thumbnailUrl, String title, String description,
            List<String> tags) {

        if (StringUtils.isEmpty(thumbnailUrl) || StringUtils.isEmpty(title) && StringUtils.isEmpty(description)) {
            return 0;
        }

        Set<String> words = new HashSet<String>();
        List<String> text = toList(title, description);

        int pointsForTags = computePoints(words, tags, WORDS_IN_TAGS_PER_POINT);
        int pointsForText = computePoints(words, text, WORDS_IN_TEXTS_PER_POINT);

        return pointsForText + pointsForTags;
    }

    /**
     * Utility function: count new words and compute points for them.
     */
    static int computePoints(Set<String> words, Collection<String> fields, int wordsPerPoint) {

        int wordsBefore = words.size();
        words.addAll(extractWordsFromFields(fields));
        return capPoints((words.size() - wordsBefore) / wordsPerPoint);
    }

    /**
     * Utility function: extract words from field values.
     */
    static Collection<String> extractWordsFromFields(Collection<String> fields) {

        List<String> words = new ArrayList<String>();
        for (String field : fields) {
            if (!StringUtils.isEmpty(field)) {
                words.addAll(extractWordsFromField(field));
            }
        }
        return words;
    }

    /**
     * Utility function: extract words from a single field value.
     */
    static List<String> extractWordsFromField(String field) {
        List<String> words = new ArrayList<String>();
        for (String word : field.split("\\W")) {
            if (!StringUtils.isEmpty(word) && word.length() >= MIN_WORD_LENGTH) {
                words.add(word.toLowerCase());
            }
        }
        return words;
    }

    /**
     * Utility function: cap points.
     */
    private static int capPoints(int points) {
        if (points > 5) {
            points = 5;
        }
        return points;
    }

    public static List<String> toList(String... strings) {
        List<String> text = new ArrayList<String>();
        for (String string : strings) {
            text.add(string);
        }
        return text;
    }

    private static String objectAsString(Object object) {
        if (object == null) {
            return null;
        }
        return object.toString();
    }

}