Java tutorial: serializing a TermSuite term index to and from JSON (the JsonTermIndexIOSpec unit test)
/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *******************************************************************************/
package eu.project.ttc.test.unit.io;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.offset;
import static org.assertj.core.api.Assertions.tuple;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.junit.Before;
import org.junit.Test;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;

import eu.project.ttc.api.JsonOptions;
import eu.project.ttc.engines.desc.Lang;
import eu.project.ttc.models.CompoundType;
import eu.project.ttc.models.ContextVector;
import eu.project.ttc.models.Document;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermBuilder;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.VariationType;
import eu.project.ttc.models.Word;
import eu.project.ttc.models.WordBuilder;
import eu.project.ttc.models.index.JsonTermIndexIO;
import eu.project.ttc.models.index.MemoryTermIndex;
import eu.project.ttc.models.occstore.MemoryOccurrenceStore;
import eu.project.ttc.test.unit.TestUtil;

public class JsonTermIndexIOSpec {

    public static final String jsonFile1 = "org/project/ttc/test/json/termIndex1.json";

    private TermIndex termIndex;
    private Term term1;
    private Term term2;
    private Word word1;
    private Word word2;
    private Word word3;
    private Document doc1;
    private Document doc2;
    private Document doc3;
    private String json1;

    @Before
    public void initTermIndex() {
        termIndex = new MemoryTermIndex("Titi va voir Toto", Lang.FR, new MemoryOccurrenceStore());
        termIndex.setCorpusId("ccid");
        termIndex.setWordAnnotationsNum(222);
        termIndex.setSpottedTermsNum(111);

        doc1 = termIndex.getDocument("source1");
        doc2 = termIndex.getDocument("source2");
        doc3 = termIndex.getDocument("source3");

        word1 = new Word("word1", "stem1");
        word2 = new Word("word2", "stem2");
        word3 = WordBuilder.start()
                .setLemma("word3").setStem("stem3")
                .addComponent(0, 2, "wop")
                .addComponent(2, 5, "rd3")
                .setCompoundType(CompoundType.NATIVE)
                .create();

        term1 = TermBuilder.start(termIndex)
                .setRank(1)
                .addWord(word1, "L1").addWord(word2, "L2")
                .addOccurrence(10, 12, doc2,
"coveredText 3").addOccurrence(20, 30, doc3, "coveredText 4") .setSpottingRule("spotRule1").setSpecificity(1.1).createAndAddToIndex(); term2 = TermBuilder.start(termIndex).setRank(2).addWord(word1, "L1").addWord(word2, "L2") .addWord(word3, "L3").setSpottingRule("spotRule1").addOccurrence(0, 2, doc2, "coveredText 1") .addOccurrence(10, 12, doc1, "coveredText 2").addOccurrence(14, 20, doc2, "coveredText 2") .setSpecificity(2.2).createAndAddToIndex(); term1.addTermVariation(term2, VariationType.SYNTACTICAL, "variationRule1"); term1.addTermVariation(term2, VariationType.GRAPHICAL, 0.956d); // generate context vectors ContextVector v = new ContextVector(term1); v.addEntry(term2, 21, 2.0); term1.setContextVector(v); } @Before public void initJsonTermIndex() { json1 = TestUtil.readFile(jsonFile1); } @Test public void testSaveLoadReturnWithNoVariant() throws IOException { term1.removeTermVariation(term1.getVariations(VariationType.SYNTACTICAL).iterator().next()); StringWriter writer = new StringWriter(); JsonTermIndexIO.save(writer, termIndex, new JsonOptions().withContexts(true).withOccurrences(true)); String string = writer.toString(); JsonTermIndexIO.load(new StringReader(string), new JsonOptions().withOccurrences(true)); } @Test public void testSaveLoadReturn() throws IOException { StringWriter writer = new StringWriter(); JsonTermIndexIO.save(writer, termIndex, new JsonOptions().withContexts(true).withOccurrences(true)); String string = writer.toString(); TermIndex termIndex2 = JsonTermIndexIO.load(new StringReader(string), new JsonOptions().withOccurrences(true)); assertEquals(111, termIndex2.getSpottedTermsNum()); assertEquals(222, termIndex2.getWordAnnotationsNum()); assertThat(termIndex2.getTerms()).hasSameElementsAs(termIndex.getTerms()); assertThat(termIndex2.getWords()).hasSameElementsAs(termIndex.getWords()); for (Term t : termIndex.getTerms()) { Term t2 = termIndex2.getTermByGroupingKey(t.getGroupingKey()); assertThat(t2.getOccurrences()).hasSameElementsAs(t.getOccurrences()); assertThat(t2.getVariations()).hasSameElementsAs(t.getVariations()); assertThat(t2.getBases()).hasSameElementsAs(t.getBases()); assertThat(t2.getForms()).hasSameElementsAs(t.getForms()); assertThat(t2.getFrequency()).isEqualTo(t.getFrequency()); assertThat(t2.getSpecificity()).isEqualTo(t.getSpecificity()); assertThat(t2.getFrequencyNorm()).isEqualTo(t.getFrequencyNorm()); assertThat(t2.getGeneralFrequencyNorm()).isEqualTo(t.getGeneralFrequencyNorm()); assertThat(t2.getSpottingRule()).isEqualTo(t.getSpottingRule()); assertThat(t2.getPattern()).isEqualTo(t.getPattern()); assertThat(t2.getWords()).isEqualTo(t.getWords()); assertThat(t2.getRank()).isEqualTo(t.getRank()); if (t2.getId() == term1.getId()) { assertTrue(t.isContextVectorComputed()); assertTrue(t2.isContextVectorComputed()); assertThat(t2.getContextVector()).isEqualTo(t.getContextVector()); } else if (t2.getId() == term2.getId()) { assertFalse(t.isContextVectorComputed()); assertFalse(t2.isContextVectorComputed()); } else { fail("should never happen"); } } for (Word w : termIndex.getWords()) { Word w2 = termIndex2.getWord(w.getLemma()); assertThat(w2.getStem()).isEqualTo(w.getStem()); assertThat(w2.isCompound()).isEqualTo(w.isCompound()); assertThat(w2.getCompoundType()).isEqualTo(w.getCompoundType()); assertThat(w2.getComponents()).hasSameElementsAs(w.getComponents()); } } @Test public void testExportTermIndexToJsonWithoutOccurrences() throws IOException { StringWriter writer = new StringWriter(); JsonTermIndexIO.save(writer, termIndex, new 
                JsonOptions().withContexts(true).withOccurrences(false));
        ObjectMapper mapper = new ObjectMapper();
        Map<String, Object> map = mapper.readValue(writer.toString(), new TypeReference<HashMap<String, Object>>() {
        });
        @SuppressWarnings("unchecked")
        Map<String, Object> t1 = (Map<String, Object>) ((List<?>) map.get("terms")).iterator().next();
        assertThat(t1.keySet()).contains("id", "key").doesNotContain("occurrences");
    }

    @Test
    public void testLoadJsonTermIndex() throws IOException {
        TermIndex termIndex = JsonTermIndexIO.load(new StringReader(json1), new JsonOptions().withOccurrences(true));

        assertEquals("Toto va la plage", termIndex.getName());
        assertEquals("Toto va la montagne", termIndex.getCorpusId());
        assertEquals(Lang.EN, termIndex.getLang());
        assertEquals(123, termIndex.getWordAnnotationsNum());
        assertEquals(456, termIndex.getSpottedTermsNum());

        // test term rank
        assertThat(termIndex.getTerms()).hasSize(3).extracting("rank").containsOnly(1, 2, 3);

        // test terms
        assertThat(termIndex.getTerms()).hasSize(3).extracting("groupingKey").containsOnly("na: word1 word2",
                "n: word1", "a: word2");

        // test term properties
        Term t1 = termIndex.getTermByGroupingKey("na: word1 word2");
        Term t2 = termIndex.getTermByGroupingKey("n: word1");
        Term t3 = termIndex.getTermByGroupingKey("a: word2");
        assertThat(t1.getId()).isEqualTo(1);
        assertThat(t1.getSpecificity()).isCloseTo(0.321d, offset(0.000001d));
        assertThat(t1.getFrequencyNorm()).isCloseTo(0.123d, offset(0.000001d));
        assertThat(t1.getGeneralFrequencyNorm()).isCloseTo(0.025d, offset(0.000001d));
        assertThat(t1.getFrequency()).isEqualTo(6);
        assertThat(t1.getVariations(VariationType.GRAPHICAL)).extracting("variant").containsOnly(t2);
        assertThat(t1.getVariations(VariationType.SYNTACTICAL)).hasSize(0);
        assertThat(t1.getBases()).hasSize(2).extracting("base").containsOnly(t2, t3);

        // test words
        assertThat(termIndex.getWords()).hasSize(2).extracting("lemma", "stem")
                .containsOnly(tuple("word1", "stem1"), tuple("word2", "stem2"));

        // test word composition
        Iterator<Word> iterator = termIndex.getWords().iterator();
        Word w1 = iterator.next();
        assertFalse(w1.isCompound());
        assertThat(w1.getComponents()).hasSize(0);
        Word w2 = iterator.next();
        assertTrue(w2.isCompound());
        assertThat(w2.getComponents()).extracting("lemma", "begin", "end").containsOnly(tuple("wor", 0, 3),
                tuple("d3", 3, 5));

        assertThat(t1.getContextVector().getEntries()).hasSize(2).extracting("coTerm.id", "nbCooccs", "assocRate")
                .contains(tuple(2, 18, 1.2000000476837158d), tuple(3, 12, 6.5d));
    }

    @Test
    public void testExportTermIndexToJsonWithOccurrencesAndContext() throws IOException {
        StringWriter writer = new StringWriter();
        JsonTermIndexIO.save(writer, termIndex, new JsonOptions().withContexts(true).withOccurrences(true));
        ObjectMapper mapper = new ObjectMapper();
        // System.out.println(writer.toString());
        Map<String, Object> map = mapper.readValue(writer.toString(), new TypeReference<HashMap<String, Object>>() {
        });
        assertThat(map.keySet()).hasSize(5).containsOnly("metadata", "words", "terms", "variations", "input_sources");

        // test metadata
        @SuppressWarnings("unchecked")
        Map<String, String> metadata = (LinkedHashMap<String, String>) map.get("metadata");
        assertThat(metadata).containsOnlyKeys("name", "corpus-id", "wordsNum", "spottedTermsNum", "lang",
                "occurrence_storage");

        // test input sources
        @SuppressWarnings("unchecked")
        Map<String, String> inputSources = (LinkedHashMap<String, String>) map.get("input_sources");
        assertThat(inputSources).containsOnlyKeys("1", "2", "3");
        assertThat(inputSources.values()).containsOnly("source1", "source2",
"source3"); // test words List<?> wordList = (List<?>) map.get("words"); assertThat(wordList).hasSize(3).extracting("lemma").containsOnly("word1", "word2", "word3"); LinkedHashMap<?, ?> w3 = null; for (Object wl : wordList) { if (((LinkedHashMap<?, ?>) wl).get("lemma").equals("word3")) w3 = (LinkedHashMap<?, ?>) wl; } assertEquals("word3", w3.get("lemma")); assertEquals("stem3", w3.get("stem")); assertEquals("NATIVE", w3.get("compound_type")); List<?> components = (List<?>) w3.get("components"); assertThat(components).hasSize(2).extracting("lemma", "begin", "end").contains(tuple("wop", 0, 2), tuple("rd3", 2, 5)); // test terms BiMap<String, String> sources = HashBiMap.create(inputSources); List<?> termList = (List<?>) map.get("terms"); assertThat(termList).hasSize(2).extracting("id").containsOnly(term1.getId(), term2.getId()); LinkedHashMap<?, ?> t1 = (LinkedHashMap<?, ?>) termList.get(0); assertThat(t1.get("rank")).isEqualTo(1); assertThat(t1.get("spec")).isEqualTo(1.1); assertThat((List<?>) t1.get("words")).extracting("lemma", "syn").containsOnly(tuple("word1", "L1"), tuple("word2", "L2")); assertThat((List<?>) t1.get("occurrences")).hasSize(2).extracting("begin", "end", "file", "text") .containsOnly(tuple(10, 12, Integer.parseInt(sources.inverse().get("source2")), "coveredText 3"), tuple(20, 30, Integer.parseInt(sources.inverse().get("source3")), "coveredText 4")); final Map<?, ?> t1Ctxt = (Map<?, ?>) t1.get("context"); assertEquals(21, t1Ctxt.get("total_cooccs")); assertThat((List<?>) t1Ctxt.get("cooccs")).hasSize(1).extracting("co_term", "cnt", "assoc_rate") .contains(tuple("l1l2l3: word1 word2 word3", 21, 2.0d)); LinkedHashMap<?, ?> t2 = (LinkedHashMap<?, ?>) termList.get(1); assertThat((List<?>) t2.get("occurrences")).hasSize(3).extracting("begin", "end", "file", "text") .containsOnly(tuple(0, 2, Integer.parseInt(sources.inverse().get("source2")), "coveredText 1"), tuple(10, 12, Integer.parseInt(sources.inverse().get("source1")), "coveredText 2"), tuple(14, 20, Integer.parseInt(sources.inverse().get("source2")), "coveredText 2")); assertThat((List<?>) t2.get("words")).extracting("lemma", "syn").containsOnly(tuple("word1", "L1"), tuple("word2", "L2"), tuple("word3", "L3")); // test syntactic variants List<?> variantList = (List<?>) map.get("variations"); assertThat(variantList).hasSize(2).extracting("base", "variant", "info", "type").contains( tuple(term1.getGroupingKey(), term2.getGroupingKey(), "variationRule1", "syn"), tuple(term1.getGroupingKey(), term2.getGroupingKey(), "0.956", "graph")); } }