pl.edu.icm.cermine.evaluation.tools.EvaluationUtils.java Source code

Java tutorial

Introduction

Here is the source code for pl.edu.icm.cermine.evaluation.tools.EvaluationUtils.java

Source

/**
 * This file is part of CERMINE project.
 * Copyright (c) 2011-2013 ICM-UW
 *
 * CERMINE is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CERMINE is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CERMINE. If not, see <http://www.gnu.org/licenses/>.
 */

package pl.edu.icm.cermine.evaluation.tools;

import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import javax.xml.transform.TransformerException;
import org.apache.commons.lang.StringUtils;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.jdom.JDOMException;
import org.jdom.output.DOMOutputter;
import org.w3c.dom.Document;
import pl.edu.icm.cermine.tools.TextUtils;
import pl.edu.icm.cermine.tools.distance.CosineDistance;
import pl.edu.icm.cermine.tools.distance.SmithWatermanDistance;

/**
 * Static helpers for comparing expected and extracted text: a token-based
 * similarity score, small string normalisations, a family of lenient string
 * comparators and two XML conversion utilities.
 *
 * <p>The {@link Comparator} instances exposed here are match testers rather than
 * orderings: only the distinction between {@code 0} (the two strings are considered
 * a match) and non-zero (they are not) is meaningful. Several of them are neither
 * symmetric nor transitive, so they should not be used for sorting or as the
 * comparator of a sorted collection.
 *
 * <p>None of the methods or comparators accept {@code null} arguments.
 */
public class EvaluationUtils {

    /**
     * Scores the similarity of two texts from a Smith-Waterman comparison of their tokens.
     *
     * <p>Both texts are trimmed and tokenized with {@code TextUtils.tokenize}. The value
     * returned by {@code SmithWatermanDistance.compare} is doubled and divided by the
     * combined number of tokens of both texts.
     *
     * <p>If both token lists are empty the divisor is zero, so the result is not a finite
     * number; threshold checks such as {@code >= .9} then evaluate to {@code false} for NaN.
     *
     * @param expectedText the reference text
     * @param extractedText the text to compare against the reference
     * @return twice the comparison value divided by the total token count
     */
    public static double compareStringsSW(String expectedText, String extractedText) {
        List<String> expectedTokens = TextUtils.tokenize(expectedText.trim());
        List<String> extractedTokens = TextUtils.tokenize(extractedText.trim());
        SmithWatermanDistance distanceFunc = new SmithWatermanDistance(.0, .0);
        double distance = distanceFunc.compare(expectedTokens, extractedTokens);
        int totalTokens = expectedTokens.size() + extractedTokens.size();
        return 2 * distance / (double) totalTokens;
    }

    /**
     * Strips leading zeros from every whitespace-separated part of multi-part strings.
     *
     * <p>Each input string is split on single whitespace characters. When that yields more
     * than one part, leading zeros are removed from each part (a part made only of zeros
     * keeps its last zero) and the parts are joined back with single spaces. Strings that
     * yield a single part are copied to the result unchanged.
     *
     * @param strings the strings to normalise; the list itself is not modified
     * @return a new list with one entry per input string, in the same order
     */
    public static List<String> removeLeadingZerosFromDate(List<String> strings) {
        List<String> ret = new ArrayList<String>(strings.size());
        for (String string : strings) {
            String[] parts = string.split("\\s");
            if (parts.length > 1) {
                List<String> newDate = new ArrayList<String>(parts.length);
                for (String part : parts) {
                    // The negative lookahead keeps the final zero of an all-zero part.
                    newDate.add(part.replaceFirst("^0+(?!$)", ""));
                }
                ret.add(StringUtils.join(newDate, " "));
            } else {
                ret.add(string);
            }
        }
        return ret;
    }

    /**
     * Tells whether {@code sub} is a (not necessarily contiguous) subsequence of {@code str},
     * i.e. whether all characters of {@code sub} occur in {@code str} in the same order.
     *
     * <p>Implemented as a single left-to-right scan, so it runs in time linear in the length
     * of {@code str} and uses no recursion.
     *
     * @param str the string to search in
     * @param sub the candidate subsequence
     * @return {@code true} if {@code sub} is empty or is a subsequence of {@code str}
     */
    public static boolean isSubsequence(String str, String sub) {
        if (sub.isEmpty()) {
            return true;
        }
        int needed = sub.length();
        int matched = 0;
        for (int i = 0; i < str.length() && matched < needed; i++) {
            if (str.charAt(i) == sub.charAt(matched)) {
                matched++;
            }
        }
        return matched == needed;
    }

    /** Trims the text and collapses every run of spaces into a single space. */
    private static String collapseSpaces(String text) {
        return text.trim().replaceAll(" +", " ");
    }

    /** Lower-cases the text and drops every character that is not in {@code a-z}. */
    private static String lettersOnly(String text) {
        return text.toLowerCase().replaceAll("[^a-z]", "");
    }

    /**
     * Lower-cases the text, keeps only {@code a-z}, {@code 0-9} and {@code @}, and then
     * removes a leading "email"-like label matched by the pattern {@code ^e.?mail:? *}.
     */
    private static String normalizeEmail(String text) {
        return text.toLowerCase().replaceAll("[^a-z0-9@]", "").replaceFirst("^e.?mail:? *", "");
    }

    /**
     * Case-insensitive comparison of the two strings after trimming them and collapsing
     * runs of spaces into a single space.
     */
    public static Comparator<String> defaultComparator = new Comparator<String>() {

        @Override
        public int compare(String t1, String t2) {
            return collapseSpaces(t1).compareToIgnoreCase(collapseSpaces(t2));
        }
    };

    /**
     * Returns {@link #cosineComparator(double)} with a threshold of {@code 0.7}.
     *
     * @return a cosine-based comparator using the default threshold
     */
    public static Comparator<String> cosineComparator() {
        return cosineComparator(0.7);
    }

    /**
     * Builds a comparator that treats two strings as a match when the value returned by
     * {@code CosineDistance.compare} for their token lists is strictly greater than
     * {@code threshold}; otherwise it falls back to a case-insensitive string comparison.
     *
     * @param threshold value that the cosine comparison must exceed for a match
     * @return the comparator
     */
    public static Comparator<String> cosineComparator(final double threshold) {
        return new Comparator<String>() {

            @Override
            public int compare(String t1, String t2) {
                double score = new CosineDistance().compare(TextUtils.tokenize(t1), TextUtils.tokenize(t2));
                if (score > threshold) {
                    return 0;
                }
                return t1.compareToIgnoreCase(t2);
            }
        };
    }

    /**
     * Treats two strings as a match when {@link #compareStringsSW(String, String)} is at
     * least {@code 0.9}, or when their ASCII letters (case-sensitive, everything else
     * removed) are identical and non-empty. Otherwise falls back to a case-insensitive
     * string comparison.
     */
    public static Comparator<String> swComparator = new Comparator<String>() {

        @Override
        public int compare(String t1, String t2) {
            String t1Norm = t1.replaceAll("[^a-zA-Z]", "");
            String t2Norm = t2.replaceAll("[^a-zA-Z]", "");
            boolean sameLetters = !t1Norm.isEmpty() && t1Norm.equals(t2Norm);
            if (compareStringsSW(t1, t2) >= .9 || sameLetters) {
                return 0;
            }
            return t1.compareToIgnoreCase(t2);
        }
    };

    /**
     * Treats two strings as a match when they contain the same sequence of letters
     * {@code a-z} after lower-casing; otherwise falls back to a case-insensitive comparison
     * of the trimmed strings.
     */
    public static Comparator<String> authorComparator = new Comparator<String>() {

        @Override
        public int compare(String t1, String t2) {
            if (lettersOnly(t1).equals(lettersOnly(t2))) {
                return 0;
            }
            return t1.trim().compareToIgnoreCase(t2.trim());
        }
    };

    /**
     * Treats two strings as a match when their normalised e-mail forms are equal: lower-cased,
     * reduced to {@code a-z}, {@code 0-9} and {@code @}, with a leading "email"-like label
     * removed. Otherwise falls back to a case-insensitive comparison of the trimmed strings.
     */
    public static Comparator<String> emailComparator = new Comparator<String>() {

        @Override
        public int compare(String t1, String t2) {
            String t1Norm = normalizeEmail(t1);
            // Normalise the second argument; deriving both forms from t1 would make
            // every pair of inputs compare as equal.
            String t2Norm = normalizeEmail(t2);

            if (t1Norm.equals(t2Norm)) {
                return 0;
            }
            return t1.trim().compareToIgnoreCase(t2.trim());
        }
    };

    /**
     * Treats two strings as a match when the letters of {@code t2} (lower-cased, {@code a-z}
     * only) form a subsequence of the letters of {@code t1}; otherwise falls back to a
     * case-insensitive comparison of the trimmed strings.
     *
     * <p>The check is one-directional, and a {@code t2} without any letters matches every
     * {@code t1}.
     */
    public static Comparator<String> journalComparator = new Comparator<String>() {

        @Override
        public int compare(String t1, String t2) {
            if (EvaluationUtils.isSubsequence(lettersOnly(t1), lettersOnly(t2))) {
                return 0;
            }
            return t1.trim().compareToIgnoreCase(t2.trim());
        }
    };

    /**
     * Splits both strings on {@code "---"} and treats them as a match when
     * {@code DateComparator.yearsMatch} returns {@code true} for the two lists. A
     * {@code null} or {@code false} result falls back to a case-insensitive comparison of
     * the trimmed strings.
     */
    public static Comparator<String> yearComparator = new Comparator<String>() {

        @Override
        public int compare(String t1, String t2) {
            List<String> expected = Arrays.asList(t1.split("---"));
            List<String> extracted = Arrays.asList(t2.split("---"));
            Boolean match = DateComparator.yearsMatch(expected, extracted);
            if (match != null && match) {
                return 0;
            }
            return t1.trim().compareToIgnoreCase(t2.trim());
        }
    };

    /**
     * Builds a line-by-line comparator for multi-line strings.
     *
     * <p>Both strings are split on {@code "\n"}. They match (result {@code 0}) only if they
     * have the same number of lines and every pair of corresponding lines is either
     * identical, or loses the same number of characters when trimmed and has trimmed
     * contents that {@code comp} reports as equal. Any mismatch yields {@code -1},
     * regardless of argument order.
     *
     * @param comp comparator applied to the trimmed content of corresponding lines
     * @return the line-by-line comparator
     */
    public static Comparator<String> headerComparator(final Comparator<String> comp) {
        return new Comparator<String>() {

            @Override
            public int compare(String t1, String t2) {
                List<String> t1Lines = Lists.newArrayList(t1.split("\n"));
                List<String> t2Lines = Lists.newArrayList(t2.split("\n"));
                if (t1Lines.size() != t2Lines.size()) {
                    return -1;
                }
                for (int i = 0; i < t1Lines.size(); i++) {
                    String line1 = t1Lines.get(i);
                    String line2 = t2Lines.get(i);
                    if (line1.equals(line2)) {
                        continue;
                    }
                    String trimmed1 = line1.trim();
                    String trimmed2 = line2.trim();
                    // The amount of surrounding whitespace removed by trim() must agree.
                    int padding1 = line1.length() - trimmed1.length();
                    int padding2 = line2.length() - trimmed2.length();
                    if (padding1 != padding2) {
                        return -1;
                    }
                    if (comp.compare(trimmed1, trimmed2) != 0) {
                        return -1;
                    }
                }
                return 0;
            }
        };
    }

    /**
     * Converts a JDOM element into a W3C DOM document that has the element as its root.
     *
     * <p>The element is installed as the root of a newly created JDOM document before the
     * conversion, so it is attached to that document afterwards.
     * NOTE(review): JDOM presumably rejects an element that already has a parent here -
     * confirm, and detach the element first if callers may pass attached elements.
     *
     * @param elem the JDOM element to convert
     * @return the W3C DOM document produced by {@code DOMOutputter}
     * @throws JDOMException if the conversion fails
     */
    public static org.w3c.dom.Document elementToW3CDocument(org.jdom.Element elem) throws JDOMException {
        org.jdom.Document metaDoc = new org.jdom.Document();
        metaDoc.setRootElement(elem);
        org.jdom.output.DOMOutputter domOutputter = new DOMOutputter();
        return domOutputter.output(metaDoc);
    }

    /**
     * Serializes a W3C DOM document to an indented XML string (line width 65, indent 2).
     *
     * @param document the document to serialize
     * @return the serialized XML
     * @throws IOException if the serializer fails to write
     * @throws TransformerException declared for callers; not thrown by the code in this method
     */
    public static String outputDoc(Document document) throws IOException, TransformerException {
        OutputFormat format = new OutputFormat(document);
        format.setLineWidth(65);
        format.setIndenting(true);
        format.setIndent(2);
        // StringWriter holds no system resources, so it does not need to be closed.
        Writer out = new StringWriter();
        XMLSerializer serializer = new XMLSerializer(out, format);
        serializer.serialize(document);
        return out.toString();
    }

}