Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.enhancer.nlp.model; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.EnumSet; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Map.Entry; import java.util.Set; import org.apache.clerezza.commons.rdf.IRI; import org.apache.commons.collections.CollectionUtils; import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory; import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation; import org.apache.stanbol.enhancer.nlp.model.annotation.Value; import org.apache.stanbol.enhancer.nlp.utils.NIFHelper; import org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper; import org.apache.stanbol.enhancer.servicesapi.Blob; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory; import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; import org.apache.stanbol.enhancer.servicesapi.impl.StringSource; import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * The Class added as ContentPart to the contentItem * @author westei * */ public class AnalysedTextTest { private static Logger log = LoggerFactory.getLogger(AnalysedTextTest.class); public static final String text = "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley. With " + "disambiguation it would even be able to detect the Comedian " + "Bob Marley trafeling to Paris in Texas."; public static final Annotation<Number> testAnnotation = new Annotation<Number>("test", Number.class); /* ----- * Test data creates within the BeforeClass * ----- */ /** * AnalysedText instance filled in {@link #setup()} with test dats */ private static AnalysedText analysedTextWithData; private static LinkedHashMap<Sentence, String> expectedSentences = new LinkedHashMap<Sentence, String>(); private static LinkedHashMap<Chunk, String> expectedChunks = new LinkedHashMap<Chunk, String>(); private static LinkedHashMap<Token, String> expectedTokens = new LinkedHashMap<Token, String>(); /* ----- * Test data creates before every single test * ----- */ /** * Empty AnalysedText instance created before each test */ private static AnalysedText at; private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance(); private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance(); private static ContentItem ci; @BeforeClass public static final void setup() throws IOException { analysedTextWithData = createAnalysedText(); int sentence = text.indexOf('.') + 1; Sentence sent1 = analysedTextWithData.addSentence(0, sentence); expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley."); Sentence sent2 = analysedTextWithData.addSentence(sentence + 1, text.length()); expectedSentences.put(sent2, "With disambiguation it would even be able " + "to detect the Comedian Bob Marley trafeling to Paris in Texas."); Token the = sent1.addToken(0, 3); expectedTokens.put(the, "The"); Token stanbol = sent1.addToken(4, 11); expectedTokens.put(stanbol, "Stanbol"); //use index to create Tokens int enhancerStart = sent1.getSpan().indexOf("enhancer"); Token enhancer = sent1.addToken(enhancerStart, enhancerStart + "enhancer".length()); expectedTokens.put(enhancer, "enhancer"); //create a chunk Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd()); expectedChunks.put(stanbolEnhancer, "Stanbol enhancer"); int parisStart = sent1.getSpan().indexOf("Paris"); Token paris = sent1.addToken(parisStart, parisStart + 5); expectedTokens.put(paris, "Paris"); int bobMarleyStart = sent1.getSpan().indexOf("Bob Marley"); Chunk bobMarley = sent1.addChunk(bobMarleyStart, bobMarleyStart + 10); expectedChunks.put(bobMarley, "Bob Marley"); Token bob = bobMarley.addToken(0, 3); expectedTokens.put(bob, "Bob"); Token marley = bobMarley.addToken(4, 10); expectedTokens.put(marley, "Marley"); Token with = sent2.addToken(0, 4); expectedTokens.put(with, "With"); Token disambiguation = sent2.addToken(5, 5 + "disambiguation".length()); expectedTokens.put(disambiguation, "disambiguation"); int comedianBobMarleyIndex = sent2.getSpan().indexOf("Comedian"); Chunk comedianBobMarley = sent2.addChunk(comedianBobMarleyIndex, comedianBobMarleyIndex + "Comedian Bob Marley".length()); expectedChunks.put(comedianBobMarley, "Comedian Bob Marley"); Token comedian = comedianBobMarley.addToken(0, "Comedian".length()); expectedTokens.put(comedian, "Comedian"); Token bobSent2 = comedianBobMarley.addToken(9, 9 + "Bob".length()); expectedTokens.put(bobSent2, "Bob"); Token marleySent2 = comedianBobMarley.addToken(13, 13 + "Marley".length()); expectedTokens.put(marleySent2, "Marley"); int parisIndex = sent2.getSpan().indexOf("Paris"); Chunk parisInTexas = sent2.addChunk(parisIndex, parisIndex + "Paris in Texas".length()); expectedChunks.put(parisInTexas, "Paris in Texas"); Token parisSent2 = parisInTexas.addToken(0, "Paris".length()); expectedTokens.put(parisSent2, "Paris"); int inIndex = parisInTexas.getSpan().indexOf("in"); Token in = parisInTexas.addToken(inIndex, inIndex + 2); expectedTokens.put(in, "in"); Token texasSent2 = parisInTexas.addToken(parisInTexas.getSpan().indexOf("Texas"), parisInTexas.getSpan().indexOf("Texas") + "Texas".length()); expectedTokens.put(texasSent2, "Texas"); } @Before public void initAnalysedText() throws Exception { at = createAnalysedText(); } /** * @throws IOException */ private static AnalysedText createAnalysedText() throws IOException { ci = ciFactory.createContentItem(new StringSource(text)); Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain")); return atFactory.createAnalysedText(ci, textBlob.getValue()); } @Test public void testSpanFilter() { Iterator<Sentence> sentences = analysedTextWithData.getSentences(); Iterator<Chunk> chunks = analysedTextWithData.getChunks(); Iterator<Token> tokens = analysedTextWithData.getTokens(); for (Entry<Sentence, String> sentEntry : expectedSentences.entrySet()) { Sentence sent = sentences.next(); Assert.assertEquals(sentEntry.getKey(), sent); Assert.assertEquals(sentEntry.getValue(), sent.getSpan()); } for (Entry<Chunk, String> chunkEntry : expectedChunks.entrySet()) { Chunk chunk = chunks.next(); Assert.assertEquals(chunkEntry.getKey(), chunk); Assert.assertEquals(chunkEntry.getValue(), chunk.getSpan()); } for (Entry<Token, String> tokenEntry : expectedTokens.entrySet()) { Token token = tokens.next(); Assert.assertEquals(tokenEntry.getKey(), token); Assert.assertEquals(tokenEntry.getValue(), token.getSpan()); } } @Test public void testAnalysedText() { Assert.assertEquals(text, at.getText()); Assert.assertEquals(text, at.getSpan()); Assert.assertEquals(0, at.getStart()); Assert.assertEquals(text.length(), at.getEnd()); } /** * Spans created relative to an other MUST NOT exceed the span of the * other one */ @Test(expected = IllegalArgumentException.class) public void testExceedsRelativeSpan() { Sentence sent = at.addSentence(0, 10); sent.addChunk(5, 15); //Invalid } @Test(expected = IllegalArgumentException.class) public void testNegativeStart() { at.addSentence(-1, 10); } @Test(expected = IllegalArgumentException.class) public void testRelativeNegativeStart() { Sentence sent = at.addSentence(0, 10); sent.addToken(-1, 5); } @Test public void testAnalysedTextaddSpanMethods() { Collection<Span> spans = new HashSet<Span>(); //add some span of different types spans.add(at.addToken(4, 11)); spans.add(at.addChunk(4, 19)); spans.add(at.addSentence(0, 91)); Set<Span> atSpans = AnalysedTextUtils.asSet(at.getEnclosed(EnumSet.allOf(SpanTypeEnum.class))); Assert.assertTrue(spans.containsAll(atSpans)); Assert.assertTrue(atSpans.containsAll(spans)); } /** * Test relative additions (with relative indexes) as well as iterators * over this hierarchy */ @Test public void testSpanHierarchy() { int[] startPos = new int[] { 0, 1, 2 }; int[] endPos = new int[] { 1, 2, 3 }; int maxVal = endPos[endPos.length - 1]; int tokenLength = 5; int chunkLength = tokenLength * maxVal; int sentenceLength = tokenLength * maxVal * maxVal; List<Sentence> sentences = new ArrayList<Sentence>(startPos.length); List<Chunk> chunks = new ArrayList<Chunk>(startPos.length * 2); List<Token> tokens = new ArrayList<Token>(startPos.length * 3); int start; int end; //1. test relative add and absolute start/end log.info("--- adding Spans ---"); for (int s = 0; s < startPos.length; s++) { start = startPos[s] * sentenceLength; end = endPos[s] * sentenceLength; Sentence sent = at.addSentence(start, end); log.info("add {}", sent); Assert.assertEquals(start, sent.getStart()); Assert.assertEquals(end, sent.getEnd()); sentences.add(sent); } //1.b iterate over the sentences while adding Chunks and Tokens to // test that returned Iterators MUST NOT throw // ConcurrentModificationExceptions when adding Spans to the AnalysedText Iterator<Sentence> sentenceIt = at.getSentences(); while (sentenceIt.hasNext()) { Sentence sent = sentenceIt.next(); for (int c = 0; c < startPos.length; c++) { start = startPos[c] * chunkLength; end = endPos[c] * chunkLength; Chunk chunk = sent.addChunk(start, end); log.info(" add {}", chunk); start = sent.getStart() + start; end = sent.getStart() + end; Assert.assertEquals(start, chunk.getStart()); Assert.assertEquals(end, chunk.getEnd()); chunks.add(chunk); for (int t = 0; t < startPos.length; t++) { start = startPos[t] * tokenLength; end = endPos[t] * tokenLength; Token token = chunk.addToken(start, end); log.info(" add {}", token); start = chunk.getStart() + start; end = chunk.getStart() + end; Assert.assertEquals(start, token.getStart()); Assert.assertEquals(end, token.getEnd()); tokens.add(token); } } } //2. test iterations of enclosed int chunksInSentence = startPos.length; int tokensInChunk = chunksInSentence; int tokensInSentence = chunksInSentence * tokensInChunk; Iterator<Sentence> sentIt = at.getSentences(); int s = 0; int c = 0; int t = 0; log.info("--- iterating over Spans ---"); log.info("{}", at); for (; sentIt.hasNext(); s++) { Assert.assertTrue(sentences.size() + " Sentences Expected (found: " + (s + 1) + ")", s < sentences.size()); Sentence sent = sentIt.next(); log.info(" {}", sent); Assert.assertEquals(sentences.get(s), sent); Iterator<Chunk> chunkIt = sent.getChunks(); int foundChunks = 0; for (; chunkIt.hasNext(); c++) { Assert.assertTrue(chunks.size() + " Chunks Expected (found: " + (c + 1) + ")", c < chunks.size()); Chunk chunk = chunkIt.next(); log.info(" {}", chunk); Assert.assertEquals(chunks.get(c), chunk); Iterator<Token> tokenIt = chunk.getTokens(); int foundTokens = 0; for (; tokenIt.hasNext(); t++) { Assert.assertTrue(tokens.size() + " Tokens Expected (found: " + (t + 1) + ")", t < tokens.size()); Token token = tokenIt.next(); log.info(" {}", token); Assert.assertEquals(tokens.get(t), token); foundTokens++; } Assert.assertEquals(tokensInChunk + " Tokens expected in Chunk", tokensInChunk, foundTokens); foundChunks++; } Assert.assertEquals(chunksInSentence + " Chunks expected in Sentence", chunksInSentence, foundChunks); //also iterate over tokens within a sentence log.info(" {}", sent); Iterator<Token> tokenIt = sent.getTokens(); int foundTokens = 0; for (; tokenIt.hasNext(); foundTokens++) { Token token = tokenIt.next(); log.info(" {}", token); Assert.assertEquals(tokens.get(s * tokensInSentence + foundTokens), token); } Assert.assertEquals(tokensInSentence + " Tokens expected in Sentence", tokensInSentence, foundTokens); } Assert.assertEquals(sentences.size() + " Sentences Expected (found: " + s + ")", sentences.size(), s); Assert.assertEquals(chunks.size() + " Chunks Expected (found: " + c + ")", chunks.size(), c); Assert.assertEquals(tokens.size() + " Sentences Expected (found: " + t + ")", tokens.size(), t); //also iterate over Chunks in AnalysedText Iterator<Chunk> chunkIt = at.getChunks(); int foundChunks = 0; log.info("{}", at); for (; chunkIt.hasNext(); foundChunks++) { Chunk chunk = chunkIt.next(); log.info(" {}", chunk); Assert.assertEquals(chunks.get(foundChunks), chunk); } Assert.assertEquals(chunks.size() + " Chunks expected in AnalysedText", chunks.size(), foundChunks); //also iterate over Tokens in AnalysedText Iterator<Token> tokenIt = at.getTokens(); int foundTokens = 0; log.info("{}", at); for (; tokenIt.hasNext(); foundTokens++) { Token token = tokenIt.next(); log.info(" {}", token); Assert.assertEquals(tokens.get(foundTokens), token); } Assert.assertEquals(tokens.size() + " Tokens expected in AnalysedText", tokens.size(), foundTokens); //Finally iterate over multiple token types Iterator<Span> sentencesAndChunks = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk)); s = 0; c = 0; log.info("{} >> Iterate over Sentences and Chunks", at); while (sentencesAndChunks.hasNext()) { Span span = sentencesAndChunks.next(); log.info(" {}", span); if (span.getType() == SpanTypeEnum.Chunk) { Assert.assertEquals(chunks.get(c), span); c++; } else if (span.getType() == SpanTypeEnum.Sentence) { Assert.assertEquals(sentences.get(s), span); s++; } else { Assert.fail("Unexpected SpanType '" + span.getType() + " (Span: " + span.getClass() + ")"); } } Assert.assertEquals(sentences.size() + " Sentences expected in AnalysedText", sentences.size(), s); Assert.assertEquals((sentences.size() * chunksInSentence) + " Chunks expected in AnalysedText", (sentences.size() * chunksInSentence), c); } /** * Tests the {@link Section#getEnclosed(Set, int, int)} method introduced * with <code>0.12.1</code> */ @Test public void testSubSectionIteration() { log.info("testSubSectionIteration ..."); List<Span> expectedSpans = new ArrayList<Span>(); List<Sentence> sentences = new ArrayList<Sentence>(); Set<SpanTypeEnum> enabledTypes = EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Token); Iterator<Span> allIt = analysedTextWithData.getEnclosed(enabledTypes); while (allIt.hasNext()) { Span s = allIt.next(); expectedSpans.add(s); if (s.getType() == SpanTypeEnum.Sentence) { sentences.add((Sentence) s); } } //first test an section that exceeds the end of the text int[] testSpan = new int[] { 4, 90 }; Assert.assertEquals(5, assertSectionIterator(analysedTextWithData, expectedSpans, testSpan, enabledTypes)); //second test a section relative to an sentence Sentence lastSent = sentences.get(sentences.size() - 1); int[] offsetSpan = new int[] { 5, 25 }; Assert.assertEquals(1, assertSectionIterator(lastSent, expectedSpans, offsetSpan, enabledTypes)); } /** * @param span * @param testSpan */ private int assertSectionIterator(Section section, List<Span> span, int[] testSpan, Set<SpanTypeEnum> types) { log.info("> assert span {} over {}", Arrays.toString(testSpan), section); Iterator<Span> sectionIt = section.getEnclosed(types, testSpan[0], testSpan[1]); int startIdx = section.getStart() + testSpan[0]; int endIdx = section.getStart() + testSpan[1]; int count = 0; for (Span s : span) { if (s.getStart() < startIdx) { log.info(" - asserted {} before section", s); } else if (s.getEnd() < endIdx) { Assert.assertTrue(sectionIt.hasNext()); Assert.assertEquals(s, sectionIt.next()); count++; log.info(" - asserted section token {}", s); } else { log.info(" - asserted correct section end", s); Assert.assertFalse(sectionIt.hasNext()); break; } } return count; } @Test public void testAnnotation() { List<Value<Number>> values = new ArrayList<Value<Number>>(); values.add(new Value<Number>(26, 0.6)); values.add(new Value<Number>(27l)); values.add(new Value<Number>(28.0f)); values.add(new Value<Number>(25.0, 0.8)); at.addAnnotations(testAnnotation, values); Value<Number> value = at.getAnnotation(testAnnotation); Assert.assertNotNull(value); Assert.assertEquals(Double.valueOf(25.0), value.value()); Assert.assertEquals(0.8d, value.probability(), 0.0d); Number prev = Float.valueOf(24f); for (Value<Number> v : at.getAnnotations(testAnnotation)) { Assert.assertNotNull(v); Assert.assertTrue(v.value().doubleValue() > prev.doubleValue()); prev = v.value(); } //check that the order of Annotations without probability is kept at.addAnnotation(testAnnotation, new Value<Number>(29)); prev = Integer.valueOf(24); for (Value<Number> v : at.getAnnotations(testAnnotation)) { Assert.assertNotNull(v); Assert.assertTrue(v.value().intValue() > prev.intValue()); prev = v.value(); } } }