Java tutorial: testing DKPro Core's TokenizedTextWriter with JUnit and uimaFIT
/*
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.text;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;
import de.tudarmstadt.ukp.dkpro.core.testing.TestRunner;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.junit.Rule;
import org.junit.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class TokenizedTextWriterTest
{
    @Rule
    public DkproTestContext context = new DkproTestContext();

    // Default configuration: all documents are written into a single target file.
    @Test
    public void testDefault() throws UIMAException, IOException
    {
        String text = "This is the 1st sentence .\nHere is another sentence .";
        File targetFile = new File(context.getTestOutputFolder(), "TokenizedTextWriterTest.out");
        File tokenized = new File("src/test/resources/tokenizedTexts/textTokenized.txt");

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);

        TestRunner.runTest("id", writer, "en", text);
        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    // With PARAM_SINGULAR_TARGET = false, one output file per document is written
    // into the target directory, named after the document id.
    @Test
    public void testMultipleFiles() throws UIMAException, IOException
    {
        String text = "This is the 1st sentence .\nHere is another sentence .";
        File targetDir = context.getTestOutputFolder();
        File targetFile = new File(targetDir, "id.txt");
        File tokenized = new File("src/test/resources/tokenizedTexts/textTokenized.txt");

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetDir,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, false,
                TokenizedTextWriter.PARAM_OVERWRITE, true);

        TestRunner.runTest("id", writer, "en", text);
        assertTrue(targetDir.isDirectory());
        assertTrue(targetFile.exists());
        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    // Explicitly select the Token type as the feature path to write.
    @Test
    public void testTokens() throws UIMAException, IOException
    {
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterTokensTest.out");
        String text = "This is the 1st sentence .\nHere is another sentence .";
        String typeName = Token.class.getTypeName();
        File tokenized = new File("src/test/resources/tokenizedTexts/textTokenized.txt");

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_FEATURE_PATH, typeName,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true);

        TestRunner.runTest("id", writer, "en", text);
        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    // Build a CAS by hand and write lemma values instead of token texts
    // via the feature path Token/lemma/value.
    @Test
    public void testLemmas() throws IOException, UIMAException
    {
        File targetFile = new File(context.getTestOutputFolder(), "lemmas.out");
        targetFile.deleteOnExit();
        String expected = "lemma1 lemma2";
        int expectedLines = 1;
        String featurePath = Token.class.getName() + "/lemma/value";

        JCas jCas = JCasFactory.createJCas();
        jCas.setDocumentText("token1 token2");

        DocumentMetaData metaData = DocumentMetaData.create(jCas);
        metaData.setDocumentId("lemmasTest");
        metaData.addToIndexes(jCas);

        Token token1 = new Token(jCas, 0, 6);
        Token token2 = new Token(jCas, 7, 13);

        Lemma lemma1 = new Lemma(jCas, 0, 6);
        lemma1.setValue("lemma1");
        Lemma lemma2 = new Lemma(jCas, 7, 13);
        lemma2.setValue("lemma2");

        token1.setLemma(lemma1);
        token2.setLemma(lemma2);

        token1.addToIndexes(jCas);
        token2.addToIndexes(jCas);
        lemma1.addToIndexes(jCas);
        lemma2.addToIndexes(jCas);

        Sentence sentence = new Sentence(jCas, 0, 13);
        sentence.addToIndexes(jCas);

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_FEATURE_PATH, featurePath,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);
        SimplePipeline.runPipeline(jCas, writer);

        List<String> output = Files.readAllLines(targetFile.toPath());
        assertEquals(expectedLines, output.size());
        assertEquals(expected, output.get(0));
    }

    // Tokens listed in the stopword file are filtered from the output
    // (expected output: textTokenizedNoStopwords.txt).
    @Test
    public void testStopwords() throws UIMAException, IOException
    {
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterNoStopwords.out");
        targetFile.deleteOnExit();
        File tokenized = new File("src/test/resources/tokenizedTexts/textTokenizedNoStopwords.txt");
        String text = "This is the 1st sentence .\nHere is another sentence .";
        String stopwordsFile = "src/test/resources/stopwords_en.txt";

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_STOPWORDS_FILE, stopwordsFile,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);

        TestRunner.runTest("id", writer, "en", text);
        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    // Tokens matching the number regex are handled specially
    // (expected output: textTokenizedNoNumbers.txt).
    @Test
    public void testNumbers() throws UIMAException, IOException
    {
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterNoStopwords.out");
        targetFile.deleteOnExit();
        File tokenized = new File("src/test/resources/tokenizedTexts/textTokenizedNoNumbers.txt");
        String text = "This is 1 sentence .\nHere is 2 sentences , or even 2.5 .";
        String numbersRegex = "^[0-9]+(\\.[0-9]*)?$";

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_NUMBER_REGEX, numbersRegex,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);

        TestRunner.runTest("id", writer, "en", text);
        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }

    // With PARAM_COVERING_TYPE set to null, the output is not split into
    // one line per sentence (expected output: textNoSentences.txt).
    @Test
    public void testNoSentences() throws IOException, UIMAException
    {
        File targetFile = new File(context.getTestOutputFolder(),
                "TokenizedTextWriterNoSentences.out");
        File tokenized = new File("src/test/resources/tokenizedTexts/textNoSentences.txt");
        String text = "This is the 1st sentence . Here is another sentence .";

        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, targetFile,
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true,
                TokenizedTextWriter.PARAM_COVERING_TYPE, null);

        TestRunner.runTest("id", writer, "en", text);
        assertTrue(FileUtils.contentEquals(tokenized, targetFile));
    }
}
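The tests above drive TokenizedTextWriter through DKPro Core's TestRunner and hand-built CASes. Outside of a test, the same writer usually sits at the end of an ordinary uimaFIT pipeline: a collection reader loads the documents, a segmenter adds Sentence and Token annotations, and the writer emits the tokenized text. The sketch below shows one way this could look, assuming DKPro Core's TextReader and BreakIteratorSegmenter are on the classpath; the class name TokenizedTextWriterExample and the paths "input", "*.txt", and "target/tokenized.txt" are illustrative placeholders, not values taken from the test class.

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;

import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.pipeline.SimplePipeline;

import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
import de.tudarmstadt.ukp.dkpro.core.io.text.TokenizedTextWriter;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;

public class TokenizedTextWriterExample
{
    public static void main(String[] args) throws Exception
    {
        // Read plain-text documents (illustrative source location and pattern).
        CollectionReaderDescription reader = createReaderDescription(TextReader.class,
                TextReader.PARAM_SOURCE_LOCATION, "input",
                TextReader.PARAM_PATTERNS, "*.txt",
                TextReader.PARAM_LANGUAGE, "en");

        // Add Sentence and Token annotations that the writer relies on.
        AnalysisEngineDescription segmenter =
                createEngineDescription(BreakIteratorSegmenter.class);

        // Write whitespace-separated tokens into a single output file,
        // mirroring the PARAM_SINGULAR_TARGET setup used in the tests above.
        AnalysisEngineDescription writer = createEngineDescription(TokenizedTextWriter.class,
                TokenizedTextWriter.PARAM_TARGET_LOCATION, "target/tokenized.txt",
                TokenizedTextWriter.PARAM_SINGULAR_TARGET, true,
                TokenizedTextWriter.PARAM_OVERWRITE, true);

        SimplePipeline.runPipeline(reader, segmenter, writer);
    }
}

As in the tests, the feature path, stopword list, and number regex can be added to the writer description via the corresponding PARAM_* constants when the plain token text is not what should be written.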