de.tudarmstadt.ukp.dkpro.tc.examples.io.STSReader.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.tc.examples.io.STSReader.java

Source

/**
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/.
 */
package de.tudarmstadt.ukp.dkpro.tc.examples.io;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.tc.api.exception.TextClassificationException;
import de.tudarmstadt.ukp.dkpro.tc.api.io.TCReaderSingleLabel;
import de.tudarmstadt.ukp.dkpro.tc.api.type.TextClassificationOutcome;
import de.tudarmstadt.ukp.dkpro.tc.core.io.PairReader_ImplBase;

/**
 * Reads the Semantic Text Similarity (STS) SemEval format.
 * <p>
 * The text pairs come from {@link #PARAM_INPUT_FILE} (one tab-separated pair per line) and the
 * gold values from {@link #PARAM_GOLD_FILE} (one number per line, same line order). Both files
 * are loaded completely during {@link #initialize(UimaContext)} and are decoded as UTF-8. The
 * reader offers one pair per {@link #getNext(JCas)} call until all loaded pairs are consumed.
 */
public class STSReader extends PairReader_ImplBase implements TCReaderSingleLabel {

    /**
     * File that lists the text pairs to be used. One pair per line, tab separated.
     */
    public static final String PARAM_INPUT_FILE = "InputFile";
    @ConfigurationParameter(name = PARAM_INPUT_FILE, mandatory = true)
    protected File inputFile;

    /**
     * The gold standard values for each pair. Same line offsets as in the text pair file.
     */
    public static final String PARAM_GOLD_FILE = "GoldFile";
    @ConfigurationParameter(name = PARAM_GOLD_FILE, mandatory = true)
    protected File goldFile;

    /**
     * Encoding used for both input files. Stated explicitly so that the result does not depend
     * on the platform default charset of the machine running the reader.
     */
    private static final String FILE_ENCODING = "UTF-8";

    /** First texts of all pairs, indexed by line of the input file. */
    private List<String> texts1;
    /** Second texts of all pairs, indexed by line of the input file. */
    private List<String> texts2;
    /** Gold values, indexed by line of the gold file. */
    private List<Double> golds;

    /** Zero-based index of the pair that the next {@link #getNext(JCas)} call will emit. */
    private int fileOffset;

    /**
     * Loads all text pairs and gold values into memory and resets the read position.
     *
     * @param context
     *            the UIMA context, passed on to the superclass
     * @throws ResourceInitializationException
     *             if a file cannot be read, a pair line does not consist of exactly two
     *             tab-separated parts, a gold line is not a parsable number, or the two files
     *             yield a different number of entries
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        fileOffset = 0;
        texts1 = new ArrayList<String>();
        texts2 = new ArrayList<String>();
        golds = new ArrayList<Double>();

        try {
            readTextPairs();
            readGoldValues();
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        }

        // every pair needs exactly one gold value, otherwise the line offsets cannot be aligned
        if (texts1.size() != golds.size()) {
            throw new ResourceInitializationException(new IllegalStateException(
                    "Size of text list does not match size of gold list."));
        }
    }

    /** Fills {@code texts1} and {@code texts2} from the tab-separated input file. */
    private void readTextPairs() throws IOException, ResourceInitializationException {
        for (String line : FileUtils.readLines(inputFile, FILE_ENCODING)) {
            String[] parts = line.split("\t");

            if (parts.length != 2) {
                throw new ResourceInitializationException(
                        new IllegalArgumentException("Wrong file format: " + line));
            }

            texts1.add(parts[0]);
            texts2.add(parts[1]);
        }
    }

    /** Fills {@code golds} with one parsed number per line of the gold file. */
    private void readGoldValues() throws IOException, ResourceInitializationException {
        int lineNumber = 0;
        for (String line : FileUtils.readLines(goldFile, FILE_ENCODING)) {
            lineNumber++;
            try {
                golds.add(Double.parseDouble(line));
            } catch (NumberFormatException e) {
                // keep the original exception as cause, but say where the bad value was found
                throw new ResourceInitializationException(new IllegalArgumentException(
                        "Invalid gold value in " + goldFile + ", line " + lineNumber + ": "
                                + line, e));
            }
        }
    }

    /**
     * @return {@code true} while there are loaded pairs that have not been emitted yet
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return fileOffset < texts1.size();
    }

    /**
     * Emits the pair at the current read position, attaches its gold value as
     * {@link TextClassificationOutcome} and then advances to the next pair.
     *
     * @param jcas
     *            the CAS to fill
     */
    @Override
    public void getNext(JCas jcas) throws IOException, CollectionException {
        // NOTE(review): presumably the base class fills the CAS from the getXxx1/getXxx2
        // accessors of this class - confirm in PairReader_ImplBase
        super.getNext(jcas);

        TextClassificationOutcome outcome = new TextClassificationOutcome(jcas);
        outcome.setOutcome(getTextClassificationOutcome(jcas));
        outcome.addToIndexes();

        // as we are creating more than one CAS out of a single file, we need to have different
        // document titles and URIs for each CAS
        // otherwise, serialized CASes will be overwritten
        DocumentMetaData dmd = DocumentMetaData.get(jcas);
        dmd.setDocumentTitle(dmd.getDocumentTitle() + "-" + fileOffset);
        dmd.setDocumentUri(dmd.getDocumentUri() + "-" + fileOffset);

        // advance only after everything above has read the data of the current pair
        fileOffset++;
    }

    /**
     * @return a single progress entry: pairs emitted so far out of all loaded pairs
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] { new ProgressImpl(fileOffset, texts1.size(), Progress.ENTITIES) };
    }

    /**
     * @param jcas
     *            not used by this implementation
     * @return the gold value of the current pair, rendered via {@link Double#toString()}
     */
    @Override
    public String getTextClassificationOutcome(JCas jcas) {
        return golds.get(fileOffset).toString();
    }

    /**
     * @return the parent path of the input file, as reported by {@link File#getParent()}
     */
    @Override
    public String getCollectionId1() throws TextClassificationException {
        return inputFile.getParent();
    }

    /**
     * @return the parent path of the input file, as reported by {@link File#getParent()}
     */
    @Override
    public String getCollectionId2() throws TextClassificationException {
        return inputFile.getParent();
    }

    /**
     * @return the input file name followed by {@code -} and the current pair index
     */
    @Override
    public String getDocumentId1() throws TextClassificationException {
        return currentPairName();
    }

    /**
     * @return the input file name followed by {@code -} and the current pair index
     */
    @Override
    public String getDocumentId2() throws TextClassificationException {
        return currentPairName();
    }

    /**
     * @return the input file name followed by {@code -} and the current pair index
     */
    @Override
    public String getTitle1() throws TextClassificationException {
        return currentPairName();
    }

    /**
     * @return the input file name followed by {@code -} and the current pair index
     */
    @Override
    public String getTitle2() throws TextClassificationException {
        return currentPairName();
    }

    /**
     * @return always {@code "en"}
     */
    @Override
    public String getLanguage1() throws TextClassificationException {
        return "en";
    }

    /**
     * @return always {@code "en"}
     */
    @Override
    public String getLanguage2() throws TextClassificationException {
        return "en";
    }

    /**
     * @return the first text of the current pair
     */
    @Override
    public String getText1() throws TextClassificationException {
        return texts1.get(fileOffset);
    }

    /**
     * @return the second text of the current pair
     */
    @Override
    public String getText2() throws TextClassificationException {
        return texts2.get(fileOffset);
    }

    /** Builds the name shared by the document ids and titles of the current pair. */
    private String currentPairName() {
        return inputFile.getName() + "-" + fileOffset;
    }
}