de.tudarmstadt.ukp.dkpro.tc.testing.TestPairReader.java Source code

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.tc.testing.TestPairReader.java
Source

/*******************************************************************************
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.tc.testing;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.tc.api.exception.TextClassificationException;
import de.tudarmstadt.ukp.dkpro.tc.api.io.TCReaderMultiLabel;
import de.tudarmstadt.ukp.dkpro.tc.api.type.TextClassificationOutcome;
import de.tudarmstadt.ukp.dkpro.tc.core.io.PairReader_ImplBase;

public class TestPairReader extends PairReader_ImplBase implements TCReaderMultiLabel {

    public static final String PARAM_INPUT_FILE = "InputFile";
    @ConfigurationParameter(name = PARAM_INPUT_FILE, mandatory = true)
    protected File inputFile;

    private List<String> texts1;
    private List<String> texts2;

    private int fileOffset;

    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        fileOffset = 0;
        texts1 = new ArrayList<String>();
        texts2 = new ArrayList<String>();

        try {
            for (String line : FileUtils.readLines(inputFile)) {
                String parts[] = line.split("\t");

                if (parts.length != 2) {
                    throw new ResourceInitializationException(new Throwable("Wrong file format: " + line));
                }

                texts1.add(parts[0]);
                texts2.add(parts[1]);
            }
        } catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return fileOffset < texts1.size();
    }

    @Override
    public void getNext(JCas jcas) throws IOException, CollectionException {
        super.getNext(jcas);

        for (String outcomeValue : getTextClassificationOutcomes(jcas)) {
            TextClassificationOutcome outcome = new TextClassificationOutcome(jcas);
            outcome.setOutcome(outcomeValue);
            outcome.addToIndexes();
        }

        // as we are creating more than one CAS out of a single file, we need to have different
        // document titles and URIs for each CAS
        // otherwise, serialized CASes will be overwritten
        DocumentMetaData dmd = DocumentMetaData.get(jcas);
        dmd.setDocumentTitle(dmd.getDocumentTitle() + "-" + fileOffset);
        dmd.setDocumentUri(dmd.getDocumentUri() + "-" + fileOffset);
        fileOffset++;

    }

    @Override
    public Progress[] getProgress() {
        return new Progress[] { new ProgressImpl(fileOffset, texts1.size(), Progress.ENTITIES) };
    }

    @Override
    public Set<String> getTextClassificationOutcomes(JCas jcas) {
        Set<String> outcomes = new HashSet<String>();
        outcomes.add("test");
        return outcomes;
    }

    @Override
    public String getCollectionId1() throws TextClassificationException {
        return inputFile.getParent();
    }

    @Override
    public String getCollectionId2() throws TextClassificationException {
        return inputFile.getParent();
    }

    @Override
    public String getDocumentId1() throws TextClassificationException {
        return inputFile.getName() + "-" + fileOffset;
    }

    @Override
    public String getDocumentId2() throws TextClassificationException {
        return inputFile.getName() + "-" + fileOffset;
    }

    @Override
    public String getTitle1() throws TextClassificationException {
        return inputFile.getName() + "-" + fileOffset;
    }

    @Override
    public String getTitle2() throws TextClassificationException {
        return inputFile.getName() + "-" + fileOffset;
    }

    @Override
    public String getLanguage1() throws TextClassificationException {
        return "en";
    }

    @Override
    public String getLanguage2() throws TextClassificationException {
        return "en";
    }

    @Override
    public String getText1() throws TextClassificationException {
        return texts1.get(fileOffset);
    }

    @Override
    public String getText2() throws TextClassificationException {
        return texts2.get(fileOffset);
    }
}