nl.knaw.huygens.timbuctoo.tools.importer.neww.PublisherNormalizer.java Source code

Java tutorial

Introduction

Here is the source code for nl.knaw.huygens.timbuctoo.tools.importer.neww.PublisherNormalizer.java

Source

package nl.knaw.huygens.timbuctoo.tools.importer.neww;

/*
 * #%L
 * Timbuctoo tools
 * =======
 * Copyright (C) 2012 - 2015 Huygens ING
 * =======
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 3 of the 
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public 
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-3.0.html>.
 * #L%
 */

import java.io.File;
import java.io.PrintWriter;
import java.util.Map;
import java.util.regex.Pattern;

import nl.knaw.huygens.timbuctoo.tools.importer.CSVImporter;

import org.apache.commons.lang.StringUtils;

import com.google.common.collect.Maps;

/**
 * Normalizes names of publishers.
 *
 * <p>Normalization happens in two steps: {@link #preprocess(String)} cleans up
 * the raw name with a fixed sequence of textual rewrites, after which
 * {@link #normalize(String)} looks the cleaned name up in a concordance that
 * was read from a CSV file (first item: key, second item: replacement).
 * A concordance value of {@code IGNORE} means the name must be discarded.
 */
public class PublisherNormalizer extends CSVImporter {

    /** Concordance value that marks a publisher name as "to be discarded". */
    private static final String IGNORE = "IGNORE";

    /** Preprocessed names longer than this are rejected. */
    private static final int MAX_LENGTH = 50;

    /**
     * Matches a complete text that contains at least one digit. Kept in
     * "whole input" form and used with {@code matches()} so that it behaves
     * exactly like {@code String.matches} with the same expression.
     */
    private static final Pattern CONTAINS_DIGIT = Pattern.compile(".*?\\d.*?");

    /**
     * Rewrites applied, in this order, directly after capitalizing the text.
     * The order is significant: later rules operate on the output of earlier ones.
     */
    private static final Rule[] WORD_RULES = {
        // put spaces around slashes
        new Rule("/", " / "),
        // unify the connecting words "and", "en", "et" and the spelling of "co"
        new Rule("\\band\\b", "&"),
        new Rule("\\bco\\b", "Co"),
        new Rule("\\ben\\b", "&"),
        new Rule("\\bet\\b", "&"),
        // a single capital followed by whitespace gets a period: "A Smith" -> "A. Smith"
        new Rule("\\b([A-Z])\\s+", "$1. "),
        // separate an initial from a directly following word: "A.Smith" -> "A. Smith"
        new Rule("\\b([A-Z]\\.)(\\w\\w)", "$1 $2"),
        // join two initials separated by whitespace: "A. B." -> "A.B."
        new Rule("\\b([A-Z]\\.)\\s+([A-Z]\\.)", "$1$2"),
        // collapse runs of whitespace (including non-breaking spaces) to one space
        new Rule("[\\s\\u00A0]+", " ")
    };

    /**
     * Rewrites applied, in this order, after whitespace has been collapsed and
     * trimmed; they unify what follows an ampersand.
     */
    private static final Rule[] AMPERSAND_RULES = {
        new Rule("& [Cc]omp\\.", "& Co."),
        new Rule("& [Cc]ompany", "& Co."),
        new Rule("& [Cc]o\\.?$", "& Co."),
        new Rule("& son", "& Son"),
        new Rule("& de", "& De"),
        new Rule("& van", "& Van"),
        new Rule("& Zn\\.?", "& Zoon"),
        new Rule("& zoon", "& Zoon")
    };

    /** Concordance from preprocessed publisher name to its replacement. */
    private final Map<String, String> map = Maps.newHashMap();

    /**
     * Creates a normalizer and, if a file is given, loads the concordance from it.
     *
     * @param file the CSV file with the concordance; when {@code null} no
     *        concordance is loaded and {@link #normalize(String)} only
     *        applies {@link #preprocess(String)}
     * @throws Exception if the superclass constructor or the file handling fails
     */
    public PublisherNormalizer(File file) throws Exception {
        super(new PrintWriter(System.err));
        if (file != null) {
            // NOTE(review): the arguments 2 and false are interpreted by CSVImporter;
            // presumably 2 is the expected number of items per line - confirm there.
            handleFile(file, 2, false);
        }
    }

    /**
     * Stores one concordance entry, using {@code items[0]} as key and
     * {@code items[1]} as value.
     *
     * @param items the items of one line; assumed to contain at least two
     *        elements (presumably guaranteed by the importer - TODO confirm)
     * @throws RuntimeException if the key was already present in the concordance
     */
    @Override
    protected void handleLine(String[] items) {
        String key = items[0];
        if (map.containsKey(key)) {
            throw new RuntimeException("Duplicate entry for key " + key);
        }
        map.put(key, items[1]);
    }

    /**
     * Reports the number of concordance entries on standard output.
     */
    @Override
    protected void handleEndOfFile() {
        System.out.printf("Publisher concordance size : %d%n", map.size());
    }

    /**
     * Cleans up a raw publisher name.
     *
     * <p>The empty string is returned for unusable input: {@code null}, text
     * containing {@code "("}, {@code "["} or {@code "etc."}, text starting
     * with {@code "in "}, and text that after rewriting is longer than
     * {@value #MAX_LENGTH} characters or contains a digit.
     *
     * @param text the raw publisher name, may be {@code null}
     * @return the cleaned-up name, or the empty string if the input is rejected
     */
    public String preprocess(String text) {
        if (text == null || isSuspect(text)) {
            return "";
        }

        String result = StringUtils.capitalize(text);
        for (Rule rule : WORD_RULES) {
            result = rule.apply(result);
        }
        result = result.trim();
        for (Rule rule : AMPERSAND_RULES) {
            result = rule.apply(result);
        }

        // Safeguard: reject the text if the rewriting somehow produced "etc.".
        if (result.contains("etc.")) {
            return "";
        }
        if (result.length() > MAX_LENGTH || CONTAINS_DIGIT.matcher(result).matches()) {
            return "";
        }
        return result;
    }

    /**
     * Normalizes a raw publisher name: preprocesses it and then replaces it by
     * its concordance value, if there is one.
     *
     * @param text the raw publisher name, may be {@code null}
     * @return the concordance value for the preprocessed name; the empty string
     *         if that value is {@code IGNORE}; the preprocessed name itself if
     *         the concordance has no entry for it
     */
    public String normalize(String text) {
        String cleaned = preprocess(text);
        String mapped = map.get(cleaned);
        if (mapped == null) {
            return cleaned;
        }
        return IGNORE.equals(mapped) ? "" : mapped;
    }

    /** Tells whether the raw text must be rejected before any rewriting. */
    private static boolean isSuspect(String text) {
        return text.contains("(")
            || text.contains("[")
            || text.contains("etc.")
            || text.startsWith("in ");
    }

    /** A regular expression, compiled once, together with its replacement. */
    private static final class Rule {

        private final Pattern pattern;
        private final String replacement;

        Rule(String regex, String replacement) {
            this.pattern = Pattern.compile(regex);
            this.replacement = replacement;
        }

        /** Replaces all matches in the text, like {@code String.replaceAll}. */
        String apply(String text) {
            return pattern.matcher(text).replaceAll(replacement);
        }
    }

}