Java tutorial
/**
 * TokenizerTest.java
 *
 * Copyright (c) 2015, JULIE Lab.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License (LGPL) v3.0
 *
 * Author: muehlhausen
 *
 * Current version: 2.0
 * Since version: 1.6
 *
 * Creation date: 14.10.2008
 **/
package de.julielab.jtbd;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import de.julielab.jcore.ae.jtbd.main.TokenAnnotator;
import de.julielab.jtbd.Tokenizer;
import de.julielab.jtbd.Unit;
import de.julielab.jcore.types.Token;

/**
 * Test for the class {@link Tokenizer}
 *
 * @author tomanek
 */
public class TokenizerTest {

    private static final Logger LOGGER = LoggerFactory.getLogger(TokenizerTest.class);

    private static final String FILENAME_MODEL = "src/test/resources/de/julielab/jcore/ae/jtbd/model/jtbd-2.0-biomed.gz";
    private static final String FILENAME_TRAIN_DATA_ORG = "src/test/resources/testdata/train/train.sent";
    private static final String FILENAME_TRAIN_DATA_TOK = "src/test/resources/testdata/train/train.tok";
    private static final String FILENAME_TRAIN_MODEL_OUTPUT = "/tmp/TestModelOutput.mod";
    private static final String FILENAME_ABSTRACT = "src/test/resources/testdata/test/abstract.txt";

    private List<String> readLinesFromFile(final String filename) throws IOException {
        return FileUtils.readLines(new File(filename), "utf-8");
    }

    /**
     * Test predict
     *
     * @throws Exception
     */
    @Test
    public void testPredict() throws Exception {
        final Tokenizer tokenizer = new Tokenizer();
        tokenizer.readModel(new File(FILENAME_MODEL));

        final List<String> orgSentences = readLinesFromFile(FILENAME_ABSTRACT);
        final ArrayList<String> tokSentences = new ArrayList<String>();
        for (int i = 0; i < orgSentences.size(); ++i)
            tokSentences.add("");

        // why not use predict(String) like below?
        final InstanceList iList = tokenizer.makePredictionData(orgSentences, tokSentences);

        for (final Instance instance : iList) {
            final ArrayList<Unit> unitList = tokenizer.predict(instance);
            assertNotNull(unitList);
            // TODO This is a rather weak test; it was broken for several years due to changed
            // paths without failing. The paths are fixed now, but the test still only checks
            // that prediction runs at all, not that it produces correct results.
            for (final Unit unit : unitList)
                LOGGER.trace("unit=" + unit);
        }
    }

    /**
     * Test predict, probably a better version
     *
     * @author hellrich
     *
     * @throws Exception
     */
    @Test
    public void testPredictNewVersion() throws Exception {
        final Tokenizer tokenizer = new Tokenizer();
        tokenizer.readModel(new File(FILENAME_MODEL));

        final int[] expectedStarts = new int[] { 0, 9, 14, 21 };
        final int[] expectedEnds = new int[] { 8, 13, 20, 27 };

        final ArrayList<Unit> unitList = tokenizer.predict("Sentence with proper ending.");
        assertNotNull(unitList);
        assertEquals(4, unitList.size());
        for (int i = 0; i < unitList.size(); ++i) {
            final Unit unit = unitList.get(i);
            assertEquals(expectedStarts[i], unit.begin);
            assertEquals(expectedEnds[i], unit.end);
            LOGGER.trace("unit=" + unit);
        }
    }

    /**
     * Test reading a serialized model object
     *
     * @throws Exception
     */
    @Test
    public void testReadModel() throws Exception {
        final Tokenizer tokenizer = new Tokenizer();
        tokenizer.readModel(new File(FILENAME_MODEL));
        assertNotNull(tokenizer.model);
    }

    /**
     * Test training and outputting a model object using training data in a file
     *
     * @throws Exception
     */
    @Test
    public void testTrain() throws Exception {
        final Tokenizer tokenizer = new Tokenizer();

        final List<String> trainDataORG = readLinesFromFile(FILENAME_TRAIN_DATA_ORG);
        final List<String> trainDataTOK = readLinesFromFile(FILENAME_TRAIN_DATA_TOK);

        final InstanceList trainData = tokenizer.makeTrainingData(trainDataORG, trainDataTOK);
        final Pipe trainPipe = trainData.getPipe();
        tokenizer.train(trainData, trainPipe);
        tokenizer.writeModel(FILENAME_TRAIN_MODEL_OUTPUT);

        assertTrue(new File(FILENAME_TRAIN_MODEL_OUTPUT + ".gz").isFile());
    }

    @Test
    public void testClassPathModel() throws Exception {
        JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-morpho-syntax-types");
        jCas.setDocumentText("Please tokenize this sentence.");
        AnalysisEngine engine = AnalysisEngineFactory.createEngine(TokenAnnotator.class,
                TokenAnnotator.PARAM_MODEL, "de/julielab/jcore/ae/jtbd/model/jtbd-2.0-biomed.gz",
                TokenAnnotator.USE_DOC_TEXT_PARAM, true);
        engine.process(jCas.getCas());
        Collection<Token> tokens = JCasUtil.select(jCas, Token.class);
        assertEquals(5, tokens.size());
    }
}
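The test class above already exercises the Tokenizer calls used in practice: readModel(File), predict(String) or predict(Instance), makeTrainingData, train, and writeModel. For readers who want to try the tokenizer outside of JUnit, here is a minimal sketch that reuses only the calls shown in the tests. The class name TokenizerDemo, the modelPath value, and the example sentence are placeholders chosen for illustration, not part of the jTBD code base.

import java.io.File;
import java.util.ArrayList;

import de.julielab.jtbd.Tokenizer;
import de.julielab.jtbd.Unit;

public class TokenizerDemo {

    public static void main(String[] args) throws Exception {
        // Path to a serialized jTBD model; adjust to wherever the model file lives locally.
        final String modelPath = "src/test/resources/de/julielab/jcore/ae/jtbd/model/jtbd-2.0-biomed.gz";

        final Tokenizer tokenizer = new Tokenizer();
        tokenizer.readModel(new File(modelPath));

        final String sentence = "Sentence with proper ending.";
        // predict(String) returns one Unit per token, carrying character offsets into the input.
        final ArrayList<Unit> units = tokenizer.predict(sentence);
        for (final Unit unit : units) {
            System.out.println(sentence.substring(unit.begin, unit.end)
                    + " [" + unit.begin + ", " + unit.end + "]");
        }
    }
}

Training works analogously to testTrain: read parallel .sent/.tok lines, build an InstanceList with makeTrainingData, call train with the list's Pipe, and persist the result via writeModel, which appends a .gz suffix to the given file name as the final assertion in testTrain shows.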