nl.knaw.huygens.timbuctoo.tools.util.EncodingFixer.java Source code

Java tutorial

Introduction

Here is the source code for nl.knaw.huygens.timbuctoo.tools.util.EncodingFixer.java

Source

package nl.knaw.huygens.timbuctoo.tools.util;

/*
 * #%L
 * Timbuctoo tools
 * =======
 * Copyright (C) 2012 - 2015 Huygens ING
 * =======
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 3 of the 
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public 
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-3.0.html>.
 * #L%
 */

import org.apache.commons.lang.StringUtils;

/**
 * Fixes latin-1 data that was wrongly converted to UTF-8.
 * Currently handles common lowercase accented characters.
 * See: http://www.weblogism.com/item/270/why-does-e-become-a
 *
 * <p>Some texts may contain an 'Ã' (U+00C3) followed by a regular space that
 * was originally a nonbreaking space. You would need to check this.
 *
 * <p>NOTE(review): in this copy of the file every literal in {@code MAP} is the
 * empty string, presumably because the non-ASCII characters were stripped
 * somewhere along the way. With an all-empty table the conversion presumably
 * leaves its input unchanged; restore the pairs from the upstream source and
 * confirm.
 */
public class EncodingFixer {

    /**
     * Replacement table; each row is {@code { wronglyDecodedSequence, intendedCharacter }}.
     * Rows may be added or removed freely: the lookup arrays below are sized
     * from this table rather than from a separately maintained count.
     */
    private static final String[][] MAP = { //
            { "", "" }, // presumably 'Ã' + nonbreaking space (literal lost, TODO restore)
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, // presumably 'Ã' + optional (soft) hyphen (literal lost, TODO restore)
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" }, //
            { "", "" } //
    };

    /** Search strings: column 0 of {@code MAP}, in table order. */
    private static final String[] SOURCE = new String[MAP.length];

    /** Replacement strings: column 1 of {@code MAP}, in table order. */
    private static final String[] TARGET = new String[MAP.length];

    static {
        // Split the pair table into the two parallel arrays that
        // StringUtils.replaceEach expects.
        for (int i = 0; i < MAP.length; i++) {
            SOURCE[i] = MAP[i][0];
            TARGET[i] = MAP[i][1];
        }
    }

    /**
     * Applies the replacement table once to the given text.
     *
     * @param text the text to fix; passed straight to
     *        {@code StringUtils.replaceEach}, which returns {@code null} for a
     *        {@code null} input
     * @return the text with every table entry replaced
     */
    public String convert1(String text) {
        return StringUtils.replaceEach(text, SOURCE, TARGET);
    }

    /**
     * Applies the replacement table once and verifies that the result is
     * stable, i.e. that applying the table a second time changes nothing.
     *
     * @param text the text to fix; may be {@code null}
     * @return the result of a single conversion pass, or {@code null} if
     *         {@code text} is {@code null}
     * @throws RuntimeException if a second pass would change the text again
     */
    public String convert2(String text) {
        // convert1 tolerates null; without this guard the equals() call below
        // would throw a NullPointerException instead.
        if (text == null) {
            return null;
        }
        String conv1 = convert1(text);
        if (!conv1.equals(text)) {
            // Something was replaced: a second pass must be a no-op, otherwise
            // the output of the table feeds back into its own input.
            String conv2 = convert1(conv1);
            if (!conv2.equals(conv1)) {
                throw new RuntimeException("Instable conversion");
            }
        }
        return conv1;
    }

}