Java tutorial
/* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.tc.svmhmm.writer; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.File; import java.io.FileInputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; import java.util.SortedMap; import org.apache.commons.io.IOUtils; import org.apache.log4j.BasicConfigurator; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.junit.BeforeClass; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import de.tudarmstadt.ukp.dkpro.tc.api.features.Feature; import de.tudarmstadt.ukp.dkpro.tc.api.features.FeatureStore; import de.tudarmstadt.ukp.dkpro.tc.api.features.Instance; import de.tudarmstadt.ukp.dkpro.tc.fstore.simple.SparseFeatureStore; import de.tudarmstadt.ukp.dkpro.tc.svmhmm.util.OriginalTextHolderFeatureExtractor; import de.tudarmstadt.ukp.dkpro.tc.svmhmm.util.SVMHMMUtils; public class SVMHMMDataWriterTest { private static final int TESTING_INSTANCES = 50000; private static final int TESTING_FEATURES_PER_INSTANCE = 50; @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder(); private Random random = new Random(System.currentTimeMillis()); private FeatureStore featureStore; @BeforeClass public static void setUpBeforeClass() { BasicConfigurator.configure(); Logger.getRootLogger().setLevel(Level.DEBUG); } @Test public void testWrite() throws Exception { Set<String> randomFeatureNames = new HashSet<>(); int maxFeatureVectorSize = 100000; for (int i = 0; i < maxFeatureVectorSize; i++) { randomFeatureNames.add(String.valueOf(i)); } List<String> allFeatureNames = new ArrayList<>(randomFeatureNames); featureStore = new SparseFeatureStore(); // add 100.000 instances for (int i = 0; i < TESTING_INSTANCES; i++) { Instance instance = new Instance(); instance.setOutcomes("outcome"); // add 10 random features int offset = random.nextInt(maxFeatureVectorSize - TESTING_FEATURES_PER_INSTANCE); List<String> featureNames = allFeatureNames.subList(offset, offset + TESTING_FEATURES_PER_INSTANCE); for (String featureName : featureNames) { instance.addFeature(new Feature(featureName, 1)); } instance.addFeature(new Feature(OriginalTextHolderFeatureExtractor.ORIGINAL_TEXT, "token")); featureStore.addInstance(instance); } SVMHMMDataWriter svmhmmDataWriter = new SVMHMMDataWriter(); System.out.println(featureStore.getNumberOfInstances()); svmhmmDataWriter.write(temporaryFolder.getRoot(), featureStore, false, null, false); List<String> lines = IOUtils .readLines(new FileInputStream(new File(temporaryFolder.getRoot(), "feature-vectors.txt"))); System.out.println(lines.subList(0, 5)); } @Test public void testWriteMultiLineComment() throws Exception { featureStore = new SparseFeatureStore(); featureStore.addInstance(new Instance(Arrays .asList(new Feature(OriginalTextHolderFeatureExtractor.ORIGINAL_TEXT, "multi line \n text")))); SVMHMMDataWriter svmhmmDataWriter = new SVMHMMDataWriter(); System.out.println(featureStore.getNumberOfInstances()); svmhmmDataWriter.write(temporaryFolder.getRoot(), featureStore, false, null, false); List<String> lines = IOUtils .readLines(new FileInputStream(new File(temporaryFolder.getRoot(), "feature-vectors.txt"))); System.out.println(lines); // each instance must be on one line! assertEquals(1, lines.size()); } @Test public void testMetDataFeatures() throws Exception { String longText = "rO0ABXNyABNqYXZhLnV0aWwuQXJyYXlMaXN0eIHSHZnHYZ0DAAFJAARzaXpleHAAAAAedwQAAAAedAABT3EAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJxAH4AAnEAfgACcQB%2BAAJ4"; featureStore = new SparseFeatureStore(); Feature f1 = new Feature(OriginalTextHolderFeatureExtractor.ORIGINAL_TEXT, "multi line \n text"); Feature f2 = new Feature(SVMHMMDataWriter.META_DATA_FEATURE_PREFIX + "someFeature", longText); Instance instance = new Instance(Arrays.asList(f1, f2), "outcome"); featureStore.addInstance(instance); SVMHMMDataWriter svmhmmDataWriter = new SVMHMMDataWriter(); System.out.println(featureStore.getNumberOfInstances()); svmhmmDataWriter.write(temporaryFolder.getRoot(), featureStore, false, null, false); File featureVectorsFile = new File(temporaryFolder.getRoot(), "feature-vectors.txt"); List<String> lines = IOUtils.readLines(new FileInputStream(featureVectorsFile)); System.out.println(lines); assertEquals("outcome", SVMHMMUtils.extractOutcomeLabelsFromFeatureVectorFiles(featureVectorsFile).iterator().next()); assertEquals(Integer.valueOf(0), SVMHMMUtils.extractOriginalSequenceIDs(featureVectorsFile).iterator().next()); SortedMap<String, String> metaDataFeatures = SVMHMMUtils.extractMetaDataFeatures(featureVectorsFile).get(0); assertTrue(metaDataFeatures.containsKey(SVMHMMDataWriter.META_DATA_FEATURE_PREFIX + "someFeature")); assertEquals(longText, metaDataFeatures.get(SVMHMMDataWriter.META_DATA_FEATURE_PREFIX + "someFeature")); } @Test public void testDoubleFeatures() throws Exception { featureStore = new SparseFeatureStore(); featureStore.addInstance(new Instance(Arrays.asList(new Feature("doubleFeature", 0.123456789)))); SVMHMMDataWriter svmhmmDataWriter = new SVMHMMDataWriter(); System.out.println(featureStore.getNumberOfInstances()); svmhmmDataWriter.write(temporaryFolder.getRoot(), featureStore, false, null, false); List<String> lines = IOUtils .readLines(new FileInputStream(new File(temporaryFolder.getRoot(), "feature-vectors.txt"))); System.out.println(lines); // each instance must be on one line! assertEquals(1, lines.size()); assertTrue(lines.get(0).contains(" 1:0.12345679 ")); } }