com.schnobosoft.semeval.cortical.Util.java Source code

Java tutorial

Introduction

Here is the source code for com.schnobosoft.semeval.cortical.Util.java

Source

/**
 * This file is part of SemEvalCortical.
 * <p>
 * SemEvalCortical is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * <p>
 * SemEvalCortical is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * <p>
 * You should have received a copy of the GNU General Public License
 * along with SemEvalCortical.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.schnobosoft.semeval.cortical;

import io.cortical.rest.model.Metric;
import io.cortical.services.RetinaApis;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Utilities class for I/O and data conversion.
 *
 * @author Carsten Schnober
 */
public class Util {
    /* the file prefixes for SemEval input and gold standard files */
    public static final String COMMON_PREFIX = "STS.";
    public static final String INPUT_FILE_PREFIX = COMMON_PREFIX + "input.";
    public static final String GS_FILE_PREFIX = COMMON_PREFIX + "gs.";
    /* address of the Retina API server (a host name, despite the constant's name) */
    public static final String RETINA_IP = "api.cortical.io";
    /* upper and lower bound of the output scale produced by scale() */
    public static final double MAX_OUT = 5;
    public static final double MIN_OUT = 0;
    private static final Log LOG = LogFactory.getLog(Util.class);

    /**
     * Get the output file object for an input file. The output file is located in the same
     * (canonical) directory as the input file. Its name is the input file name with the leading
     * {@link #INPUT_FILE_PREFIX} replaced by the {@link #COMMON_PREFIX}, the lower-cased retina name
     * and the {@code measure} name, e.g. {@code STS.input.foo.txt} becomes
     * {@code STS.en_associative.COSINE_SIM.foo.txt}.
     * <p>
     * Only the file name is rewritten; directory names that happen to contain the input prefix
     * are left untouched.
     * </p>
     *
     * @param inputFile  the input file object, its name beginning with {@link #INPUT_FILE_PREFIX}.
     * @param measure    the {@link Measure}
     * @param retinaName the {@link Retina}
     * @return a {@link File} object for the output file
     * @throws IOException              if the canonical path of the input file cannot be resolved
     * @throws IllegalArgumentException if the input file name does not start with
     *                                  {@link #INPUT_FILE_PREFIX}
     */
    public static File getOutputFile(File inputFile, Measure measure, Retina retinaName) throws IOException {
        String inputName = inputFile.getName();
        if (!inputName.startsWith(INPUT_FILE_PREFIX)) {
            throw new IllegalArgumentException(inputFile + " does not match expected pattern.");
        }

        // Locale.ROOT keeps the retina name stable regardless of the default locale
        // (e.g. a Turkish locale would otherwise lower-case 'I' to a dotless 'i').
        String outputPrefix = COMMON_PREFIX + retinaName.name().toLowerCase(Locale.ROOT) + "."
                + measure.name() + ".";
        String outputName = outputPrefix + inputName.substring(INPUT_FILE_PREFIX.length());

        return new File(inputFile.getCanonicalFile().getParentFile(), outputName);
    }

    /**
     * Read a file that contains one score per line, as a SemEval gold {@code .gs} file. Empty lines
     * (including lines that contain only whitespace) are allowed and are read as missing values.
     *
     * @param scoresFile the scores file to read, its name beginning with {@link #GS_FILE_PREFIX}
     * @return a list of optional double values of the same length as the input file. For an empty
     * line, an {@link Optional#empty()} object is added to the output list.
     * @throws IOException              if the file cannot be read
     * @throws IllegalArgumentException if the file name does not start with {@link #GS_FILE_PREFIX}
     * @throws NumberFormatException    if a non-empty line cannot be parsed as a double
     */
    public static List<Optional> readScoresFile(File scoresFile) throws IOException {
        if (!scoresFile.getName().startsWith(GS_FILE_PREFIX)) {
            throw new IllegalArgumentException(scoresFile + " does not match expected pattern.");
        }
        LOG.info("Reading scores file " + scoresFile);

        // Files.lines() keeps the file open until the stream is closed, so close it explicitly.
        try (Stream<String> lines = Files.lines(scoresFile.toPath())) {
            return lines.map(Util::parseScore).collect(Collectors.toList());
        }
    }

    /**
     * Parse a single line of a scores file.
     *
     * @param line a line from a scores file
     * @return the parsed score, or an empty {@link Optional} if the line is empty or whitespace-only
     */
    private static Optional<Double> parseScore(String line) {
        String trimmed = line.trim();
        if (trimmed.isEmpty()) {
            return Optional.empty();
        }
        return Optional.of(Double.valueOf(trimmed));
    }

    /**
     * Convert a list of Doubles to an array of primitive double types.
     *
     * @param doubles a list of Doubles; must not contain {@code null} elements
     * @return an array of doubles of the length of the input list
     */
    public static double[] listToArray(List<Double> doubles) {
        // Streaming avoids repeated get(i) calls, which are linear-time on non-random-access lists.
        return doubles.stream().mapToDouble(Double::doubleValue).toArray();
    }

    /**
     * Create a {@link RetinaApis} object for a retina.
     *
     * @param apiKey     the API key handed to {@link RetinaApis}
     * @param retinaName the {@link Retina}; its lower-cased name is handed to {@link RetinaApis}
     * @param ip         the server address handed to {@link RetinaApis}, e.g. {@link #RETINA_IP}
     * @return a new {@link RetinaApis} object
     */
    public static RetinaApis getApi(String apiKey, Retina retinaName, String ip) {
        // Locale.ROOT: the retina name must not depend on the default locale.
        return new RetinaApis(retinaName.name().toLowerCase(Locale.ROOT), ip, apiKey);
    }

    /**
     * Scale a collection of double values to a new scale as defined by min and max.
     * <p>
     * The input range is fixed for {@link Measure#COSINE_SIM} (0 to 1) and
     * {@link Measure#JACCARD_DIST} (-1 to 0); for {@link Measure#EUCLIDIAN_DIST} the upper bound
     * is 0 and the lower bound is the smallest input value; for all other measures both bounds
     * are taken from the input values. If the resulting input range has zero width (e.g. all
     * input values are identical for a data-dependent measure), the scaled values are
     * {@code NaN}.
     * </p>
     *
     * @param values  the values to scale; an empty collection yields an empty list
     * @param measure the {@link Measure} to use for scaling (some have predefined min/max boundaries)
     * @return a list of doubles in the range between {@link #MIN_OUT} and {@link #MAX_OUT}
     */
    public static List<Double> scale(Collection<Double> values, Measure measure) {
        if (values.isEmpty()) {
            // Nothing to scale; also guards the min/max lookups below, which have no result
            // for an empty collection.
            return new ArrayList<>();
        }

        double maxIn;
        double minIn;

        switch (measure) {
        case COSINE_SIM:
            maxIn = 1.0;
            minIn = 0.0;
            break;
        case JACCARD_DIST:
            maxIn = 0.0;
            minIn = -1.0;
            break;
        case EUCLIDIAN_DIST:
            maxIn = 0;
            minIn = values.stream().min(Double::compare).get();
            break;
        default:
            maxIn = values.stream().max(Double::compare).get();
            minIn = values.stream().min(Double::compare).get();
        }
        return values.stream().map(value -> scaleValue(MIN_OUT, MAX_OUT, maxIn, minIn, value))
                .collect(Collectors.toList());
    }

    /**
     * Map a single value linearly from the input range [{@code minIn}, {@code maxIn}] to the
     * output range [{@code min}, {@code max}]. Values outside the input range are mapped outside
     * the output range accordingly. If {@code maxIn} equals {@code minIn}, the result is
     * {@code NaN} or infinite because of the division by zero.
     *
     * @param min   the lower bound of the output range
     * @param max   the upper bound of the output range
     * @param maxIn the upper bound of the input range
     * @param minIn the lower bound of the input range
     * @param value the value to scale; must not be {@code null}
     * @return the scaled value
     */
    public static double scaleValue(double min, double max, double maxIn, double minIn, Double value) {
        return (((value - minIn) * (max - min)) / (maxIn - minIn)) + min;
    }

    /**
     * Get the value for a specific measure type from a {@link Metric}.
     * Distance measures ({@link Measure#EUCLIDIAN_DIST} and {@link Measure#JACCARD_DIST}) are
     * negated so that larger values mean more similarity in all cases.
     *
     * @param metric  a {@link Metric} object
     * @param measure a {@link Measure} definition
     * @return the value for the measure
     * @throws IllegalArgumentException if the measure is not handled
     */
    private static Double getSimilarity(Metric metric, Measure measure) {
        switch (measure) {
        case WEIGHTED:
            return metric.getWeightedScoring();
        case COSINE_SIM:
            return metric.getCosineSimilarity();
        case EUCLIDIAN_DIST:
            return -metric.getEuclideanDistance(); // negative for distance -> similarity
        case JACCARD_DIST:
            return -metric.getJaccardDistance(); // negative for distance -> similarity
        case OVERLAP:
            return (double) metric.getOverlappingAll();
        default:
            throw new IllegalArgumentException("Invalid measure: " + measure);
        }
    }

    /**
     * Get the scores for a specific measure from a list of {@link Metric}s.
     *
     * @param metrics a list of {@link Metric} objects
     * @param measure the {@link Measure} type
     * @return a list of scores, one for each input {@link Metric}
     */
    public static List<Double> getScores(Metric[] metrics, Measure measure) {
        return Stream.of(metrics).map(metric -> getSimilarity(metric, measure)).collect(Collectors.toList());
    }

    /**
     * All known measure (metric) types returned by the API.
     * <p>
     * This class is named {@link Measure} in order to distinguish it from the {@link Metric} class.
     * </p>
     */
    public enum Measure {
        COSINE_SIM, EUCLIDIAN_DIST, JACCARD_DIST, OVERLAP, WEIGHTED
    }

    /**
     * The retinas that can be selected. The lower-cased constant name is used as the retina name
     * for {@link #getApi(String, Retina, String)} and in output file names.
     */
    public enum Retina {
        EN_ASSOCIATIVE, EN_SYNONYMOUS
    }
}