com.schnobosoft.semeval.cortical.SemEvalTextSimilarity.java Source code

Introduction

Here is the source code for com.schnobosoft.semeval.cortical.SemEvalTextSimilarity.java
Source

/**
 * This file is part of SemEvalCortical.
 * <p>
 * SemEvalCortical is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * <p>
 * SemEvalCortical is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * <p>
 * You should have received a copy of the GNU General Public License
 * along with SemEvalCortical.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.schnobosoft.semeval.cortical;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.schnobosoft.semeval.cortical.Util.Measure;
import com.schnobosoft.semeval.cortical.Util.Retina;
import io.cortical.rest.model.Metric;
import io.cortical.rest.model.Text;
import io.cortical.services.Compare;
import io.cortical.services.RetinaApis;
import io.cortical.services.api.client.ApiException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.*;
import java.nio.file.Files;
import java.util.List;
import java.util.Locale;
import java.util.stream.Collectors;

import static com.schnobosoft.semeval.cortical.Util.INPUT_FILE_PREFIX;
import static com.schnobosoft.semeval.cortical.Util.Retina.EN_ASSOCIATIVE;
import static com.schnobosoft.semeval.cortical.Util.Retina.EN_SYNONYMOUS;
import static com.schnobosoft.semeval.cortical.Util.getOutputFile;
import static io.cortical.services.Compare.CompareModels;

/**
 * Read text pairs from a SemEval input file, compare them using the Cortical.io Compare API, and
 * write the comparison metrics to separate files. The output values are scaled to the range [0,5]
 * as defined by {@code Util#MIN_OUT} and {@code Util#MAX_OUT}.
 * <p>
 * The first parameter is a SemEval input file, beginning with the prefix specified by
 * {@link Util#INPUT_FILE_PREFIX}. For each similarity measure, one output file is written; its
 * location and name are determined by {@link Util#getOutputFile}.
 * <p>
 * Call arguments: {@code <input file> <api key> [<syn|ass>]}
 * <p>
 * If the third argument starts with {@code syn} (case-insensitive), the Retina is changed to
 * {@link Retina#EN_SYNONYMOUS}. With any other value (including {@code ass}), or without a third
 * argument, the default Retina {@link Retina#EN_ASSOCIATIVE} is used.
 *
 * @author Carsten Schnober
 * @see <a href="http://documentation.cortical.io/index.html">Cortical.io API documentation</a>
 * @see <a href="http://ixa2.si.ehu.es/stswiki/index.php/Main_Page">
 * Semantic Textual Similarity Wiki</a>
 */
public class SemEvalTextSimilarity {
    private static final Log LOG = LogFactory.getLog(SemEvalTextSimilarity.class);

    /** Retina used when the command line does not request the synonymous one. */
    private static final Retina DEFAULT_RETINA_NAME = EN_ASSOCIATIVE;

    /** Prefix (compared case-insensitively) of the optional third argument selecting the synonymous Retina. */
    private static final String SYNONYMOUS_ARG_PREFIX = "syn";

    /**
     * Command line entry point: reads the text pairs, compares them in bulk and stores the scores.
     *
     * @param args {@code <input file> <api key> [<syn|ass>]}
     * @throws IOException              if the input file cannot be read or an output file cannot be written
     * @throws ApiException             if the Compare API call fails
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException, ApiException {
        /* read command line arguments (input file and API key) */
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Call: " + SemEvalTextSimilarity.class.getCanonicalName() + " <input file> <api key> [<syn>]");
        }
        File inputFile = new File(args[0]);
        assert inputFile.getName().startsWith(INPUT_FILE_PREFIX)
                : "Input file name does not start with '" + INPUT_FILE_PREFIX + "': " + inputFile;
        String apiKey = args[1];
        Retina retinaName = selectRetina(args);

        // Locale.ROOT keeps the logged name stable regardless of the default locale (e.g. Turkish 'I').
        LOG.info("Using Retina " + retinaName.name().toLowerCase(Locale.ROOT) + " at " + Util.RETINA_IP + ".");

        CompareModels[] input = readInput(inputFile);
        RetinaApis api = Util.getApi(apiKey, retinaName, Util.RETINA_IP);
        Metric[] scores = compare(input, api);
        assert input.length == scores.length
                : "Expected " + input.length + " scores but received " + scores.length;

        saveScores(scores, inputFile, retinaName);
    }

    /**
     * Choose the Retina from the optional third command line argument.
     *
     * @param args the command line arguments
     * @return {@link Retina#EN_SYNONYMOUS} if a third argument exists and starts with {@code syn}
     * (case-insensitive), the default Retina otherwise
     */
    private static Retina selectRetina(String[] args) {
        boolean synonymousRequested = args.length > 2
                && args[2].toLowerCase(Locale.ROOT).startsWith(SYNONYMOUS_ARG_PREFIX);
        return synonymousRequested ? EN_SYNONYMOUS : DEFAULT_RETINA_NAME;
    }

    /**
     * Get the similarity metrics for all text pairs with a single bulk request.
     *
     * @param input an array of {@link CompareModels}, one per text pair
     * @param api   the {@link RetinaApis} object to use
     * @return the array of {@link Metric}s returned by the bulk comparison
     * @throws JsonProcessingException if thrown by the Compare API client
     * @throws ApiException            if thrown by the Compare API client
     */
    private static Metric[] compare(CompareModels[] input, RetinaApis api)
            throws JsonProcessingException, ApiException {
        Compare compareApiInstance = api.compareApi();
        return compareApiInstance.compareBulk(input);
    }

    /**
     * Save the values for the metrics using all measures defined in {@link Measure}, one output
     * file per measure and one score per line. All values are scaled via {@link Util#scale}.
     *
     * @param metrics    an array of {@link Metric}s
     * @param inputFile  the input file, used for specifying the output files
     * @param retinaName the Retina that was used, passed on to {@link Util#getOutputFile}
     * @throws IOException if an output file cannot be opened or written
     */
    private static void saveScores(Metric[] metrics, File inputFile, Retina retinaName) throws IOException {
        for (Measure measure : Measure.values()) {
            File outputFile = getOutputFile(inputFile, measure, retinaName);

            // Compute the scores before opening the file so that a failure here does not leave
            // a truncated output file (or an open file handle) behind.
            List<Double> scores = Util.scale(Util.getScores(metrics, measure), measure);

            LOG.info("Writing output for '" + inputFile + "'.");
            // try-with-resources closes (and flushes) the writer even if a write fails.
            try (Writer writer = Files.newBufferedWriter(outputFile.toPath())) {
                for (Double score : scores) {
                    writer.write(String.valueOf(score) + "\n");
                }
            }
        }
    }

    /**
     * Read an input file of tab-separated texts, ignoring empty lines.
     *
     * @param inputFile the input {@link File}
     * @return an array of {@link CompareModels}, each holding the first two tab-separated
     * {@link Text}s of one non-empty line
     * @throws IOException              if the file cannot be read
     * @throws IllegalArgumentException if a non-empty line holds fewer than two tab-separated texts
     */
    private static CompareModels[] readInput(File inputFile) throws IOException {
        LOG.info("Reading input file " + inputFile);
        assert inputFile.getName().startsWith(INPUT_FILE_PREFIX)
                : "Input file name does not start with '" + INPUT_FILE_PREFIX + "': " + inputFile;

        // readAllLines opens and closes the file itself, so no stream handle is left dangling.
        List<CompareModels> pairs = Files.readAllLines(inputFile.toPath()).stream()
                .filter(line -> !line.isEmpty())
                .map(SemEvalTextSimilarity::parsePair)
                .collect(Collectors.toList());
        return pairs.toArray(new CompareModels[0]);
    }

    /**
     * Split one input line at tab characters and wrap the first two fields as a text pair.
     *
     * @param line a non-empty input line
     * @return the {@link CompareModels} built from the first two fields
     * @throws IllegalArgumentException if the line holds fewer than two tab-separated fields
     */
    private static CompareModels parsePair(String line) {
        String[] fields = line.split("\t");
        if (fields.length < 2) {
            throw new IllegalArgumentException(
                    "Expected two tab-separated texts but found " + fields.length + " in line: '" + line + "'");
        }
        return new CompareModels(new Text(fields[0]), new Text(fields[1]));
    }

}