org.apache.joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeleyTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeleyTest.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.joshua.decoder.ff.lm.berkeley_lm;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.zip.GZIPOutputStream;

import edu.berkeley.nlp.lm.io.MakeLmBinaryFromArpa;
import org.apache.commons.io.IOUtils;
import org.apache.joshua.decoder.Decoder;
import org.apache.joshua.decoder.JoshuaConfiguration;
import org.apache.joshua.decoder.Translation;
import org.apache.joshua.decoder.segment_file.Sentence;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import static org.testng.Assert.assertEquals;

/**
 * Replacement for test/lm/berkeley/test.sh regression test
 */

public class LMGrammarBerkeleyTest {

    private static final String INPUT = "the chat-rooms";
    private static final String EXPECTED_OUTPUT = "tm_glue_0=2.000 lm_0=-7.153\n";
    private static final String EXPECTED_OUTPUT_WITH_OOV = "tm_glue_0=2.000 lm_0=-7.153 lm_0_oov=0.000\n";
    private static final String[] OPTIONS = "-v 0 -output-format %f".split(" ");

    private static final String lmFile = "src/test/resources/berkeley_lm/lm";
    private static final String compressedLmFile = "target/lm.gz";
    private static final String lmFileBin = "target/lm.berkeleylm";
    private static final String compressedLmFileBin = "target/lm.berkeleylm.gz";

    private JoshuaConfiguration joshuaConfig;
    private Decoder decoder;

    @BeforeClass
    public static void before() throws Exception {
        // generate lm.gz
        FileInputStream lmFileStream = new FileInputStream(new File(lmFile));
        compress(lmFileStream, compressedLmFile);

        // generate lm.berkeleylm
        MakeLmBinaryFromArpa.main(new String[] { lmFile, lmFileBin });

        // generate lm.berkeleylm.gz
        FileInputStream lmFileBinStream = new FileInputStream(new File(lmFileBin));
        compress(lmFileBinStream, compressedLmFileBin);
    }

    private static void compress(FileInputStream lmFileStream, String target) throws IOException {
        try {
            Files.createFile(Paths.get(target));
            GZIPOutputStream gzipOutputStream = new GZIPOutputStream(new FileOutputStream(target));
            IOUtils.copy(lmFileStream, gzipOutputStream);
            gzipOutputStream.finish();
        } catch (FileAlreadyExistsException fae) {
            // the file already exists, no need to recreate it
        }
    }

    @DataProvider(name = "languageModelFiles")
    public Object[][] lmFiles() {
        return new Object[][] { { lmFile }, { compressedLmFile }, { lmFileBin }, { compressedLmFileBin } };
    }

    @AfterMethod
    public void tearDown() throws Exception {
        decoder.cleanUp();
    }

    @Test(dataProvider = "languageModelFiles")
    public void verifyLM(String lmFile) {
        joshuaConfig = new JoshuaConfiguration();
        joshuaConfig.processCommandLineOptions(OPTIONS);
        joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
        decoder = new Decoder(joshuaConfig, null);
        final String translation = decode(INPUT).toString();
        assertEquals(translation, EXPECTED_OUTPUT);
    }

    private Translation decode(String input) {
        final Sentence sentence = new Sentence(input, 0, joshuaConfig);
        return decoder.decode(sentence);
    }

    @Test
    public void givenLmWithOovFeature_whenDecoder_thenCorrectFeaturesReturned() {
        joshuaConfig = new JoshuaConfiguration();
        joshuaConfig.processCommandLineOptions(OPTIONS);
        joshuaConfig.features.add(
                "LanguageModel -lm_type berkeleylm -oov_feature -lm_order 2 -lm_file src/test/resources/berkeley_lm/lm");
        decoder = new Decoder(joshuaConfig, null);
        final String translation = decode(INPUT).toString();
        assertEquals(Decoder.weights.getDenseFeatures().size(), 3);
        assertEquals(translation, EXPECTED_OUTPUT_WITH_OOV);
    }

}