// Java tutorial
/////////////////////////////////////////////////////////////////////////////// //Copyright (C) 2012 Assaf Urieli // //This file is part of Talismane. // //Talismane is free software: you can redistribute it and/or modify //it under the terms of the GNU Affero General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. // //Talismane is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU Affero General Public License for more details. // //You should have received a copy of the GNU Affero General Public License //along with Talismane. If not, see <http://www.gnu.org/licenses/>. ////////////////////////////////////////////////////////////////////////////// package com.joliciel.talismane.tokeniser.patterns; import static org.junit.Assert.assertEquals; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import junit.framework.Assert; import mockit.NonStrict; import mockit.NonStrictExpectations; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.junit.Test; import com.joliciel.talismane.TalismaneServiceLocator; import com.joliciel.talismane.filters.Sentence; import com.joliciel.talismane.tokeniser.Token; import com.joliciel.talismane.tokeniser.TokenSequence; import com.joliciel.talismane.tokeniser.Tokeniser; import com.joliciel.talismane.tokeniser.TokeniserService; public class TokenPatternImplTest { private static final Log LOG = LogFactory.getLog(TokenPatternImplTest.class); @Test public void testParsePattern() { String regexp = "(?![cdCD]\\z|qu\\z|jusqu\\z).+'.+"; TokenPatternImpl tokenPattern = new TokenPatternImpl(regexp, Tokeniser.SEPARATORS); List<Pattern> patterns = tokenPattern.getParsedPattern(); Assert.assertEquals(3, patterns.size()); int i = 0; 
for (Pattern pattern : patterns) { if (i == 0) { Assert.assertEquals("(?![cdCD]\\z|[qQ]u\\z|[jJ]usqu\\z).+", pattern.pattern()); } else if (i == 1) { Assert.assertEquals("'", pattern.pattern()); } else if (i == 2) { Assert.assertEquals(".+", pattern.pattern()); } i++; } regexp = "tre (de|d)"; tokenPattern = new TokenPatternImpl(regexp, Tokeniser.SEPARATORS); patterns = tokenPattern.getParsedPattern(); Assert.assertEquals(3, patterns.size()); i = 0; for (Pattern pattern : patterns) { if (i == 0) { Assert.assertEquals("[E]tre", pattern.pattern()); } else if (i == 1) { Assert.assertEquals(" ", pattern.pattern()); } else if (i == 2) { Assert.assertEquals("(de|d)", pattern.pattern()); } i++; } regexp = ".+\\.\\p"; tokenPattern = new TokenPatternImpl(regexp, Tokeniser.SEPARATORS); patterns = tokenPattern.getParsedPattern(); Assert.assertEquals(3, patterns.size()); i = 0; for (Pattern pattern : patterns) { if (i == 0) { Assert.assertEquals(".+", pattern.pattern()); } else if (i == 1) { Assert.assertEquals("\\.", pattern.pattern()); } else if (i == 2) { Assert.assertEquals(Tokeniser.SEPARATORS.pattern(), pattern.pattern()); } i++; } regexp = ".+qu'"; tokenPattern = new TokenPatternImpl(regexp, Tokeniser.SEPARATORS); patterns = tokenPattern.getParsedPattern(); Assert.assertEquals(2, patterns.size()); i = 0; for (Pattern pattern : patterns) { if (i == 0) { Assert.assertEquals(".+qu", pattern.pattern()); } else if (i == 1) { Assert.assertEquals("'", pattern.pattern()); } i++; } regexp = "\\D+\\.a[ \\)]c[abc]"; tokenPattern = new TokenPatternImpl(regexp, Tokeniser.SEPARATORS); patterns = tokenPattern.getParsedPattern(); LOG.debug(patterns); Assert.assertEquals(5, patterns.size()); i = 0; for (Pattern pattern : patterns) { if (i == 0) { Assert.assertEquals("\\D+", pattern.pattern()); } else if (i == 1) { Assert.assertEquals("\\.", pattern.pattern()); } else if (i == 2) { Assert.assertEquals("a", pattern.pattern()); } else if (i == 3) { Assert.assertEquals("[ \\)]", 
pattern.pattern()); } else if (i == 4) { Assert.assertEquals("c[abc]", pattern.pattern()); } i++; } } @Test public void testGetParsedPattern() { final String separators = "[\\s\\p{Punct}]"; Pattern separatorPattern = Pattern.compile(separators); List<String> testPatterns = new ArrayList<String>(); testPatterns.add(".+'.+"); testPatterns.add(".+-t-elle"); testPatterns.add(".+\\..+"); testPatterns.add("\\d+ \\d+"); testPatterns.add("lors (de|du|des|d)"); testPatterns.add("([^cdjlmnst]|..+)'.+"); testPatterns.add(".+-t{-}elle"); int i = 0; for (String testPattern : testPatterns) { LOG.debug("Test Pattern = " + testPatterns.get(i)); TokenPatternImpl tokeniserPattern = new TokenPatternImpl(testPattern, separatorPattern); List<Pattern> parsedPattern = tokeniserPattern.getParsedPattern(); LOG.debug("Parsed Pattern = " + parsedPattern); if (i == 0) { assertEquals(".+", parsedPattern.get(0).pattern()); assertEquals("'", parsedPattern.get(1).pattern()); assertEquals(".+", parsedPattern.get(2).pattern()); } else if (i == 1) { assertEquals(".+", parsedPattern.get(0).pattern()); assertEquals("-", parsedPattern.get(1).pattern()); assertEquals("t", parsedPattern.get(2).pattern()); assertEquals("-", parsedPattern.get(3).pattern()); assertEquals("elle", parsedPattern.get(4).pattern()); assertEquals(4, tokeniserPattern.getIndexesToTest().size()); assertEquals(1, tokeniserPattern.getIndexesToTest().get(0).intValue()); assertEquals(2, tokeniserPattern.getIndexesToTest().get(1).intValue()); assertEquals(3, tokeniserPattern.getIndexesToTest().get(2).intValue()); assertEquals(4, tokeniserPattern.getIndexesToTest().get(3).intValue()); } else if (i == 2) { assertEquals(".+", parsedPattern.get(0).pattern()); assertEquals("\\.", parsedPattern.get(1).pattern()); assertEquals(".+", parsedPattern.get(2).pattern()); } else if (i == 3) { assertEquals("\\d+", parsedPattern.get(0).pattern()); assertEquals(" ", parsedPattern.get(1).pattern()); assertEquals("\\d+", parsedPattern.get(2).pattern()); } 
else if (i == 4) { assertEquals("[lL]ors", parsedPattern.get(0).pattern()); assertEquals(" ", parsedPattern.get(1).pattern()); assertEquals("(de|du|des|d)", parsedPattern.get(2).pattern()); } else if (i == 5) { assertEquals("([^cdjlmnst]|..+)", parsedPattern.get(0).pattern()); assertEquals("'", parsedPattern.get(1).pattern()); assertEquals(".+", parsedPattern.get(2).pattern()); } else if (i == 6) { assertEquals(".+", parsedPattern.get(0).pattern()); assertEquals("-", parsedPattern.get(1).pattern()); assertEquals("t", parsedPattern.get(2).pattern()); assertEquals("-", parsedPattern.get(3).pattern()); assertEquals("elle", parsedPattern.get(4).pattern()); assertEquals(3, tokeniserPattern.getIndexesToTest().size()); assertEquals(1, tokeniserPattern.getIndexesToTest().get(0).intValue()); assertEquals(2, tokeniserPattern.getIndexesToTest().get(1).intValue()); assertEquals(4, tokeniserPattern.getIndexesToTest().get(2).intValue()); } i++; } } @Test public void testMatch(@NonStrict final TokenSequence tokenSequence) { final String separators = "[\\s\\p{Punct}]"; final List<TokenPatternMatchImpl> matches3 = new ArrayList<TokenPatternMatchImpl>(); final List<TokenPatternMatchImpl> matches4 = new ArrayList<TokenPatternMatchImpl>(); final List<TokenPatternMatchImpl> matches5 = new ArrayList<TokenPatternMatchImpl>(); final List<TokenPatternMatchImpl> matches6 = new ArrayList<TokenPatternMatchImpl>(); final List<TokenPatternMatchImpl> matches7 = new ArrayList<TokenPatternMatchImpl>(); TokeniserPatternServiceInternal tokeniserPatternService = new TokeniserPatternServiceImpl(); new NonStrictExpectations() { Iterator<Token> i; List<Token> listWithWhiteSpaces; Token token0, token1, token2, token3, token4, token5, token6, token7; { tokenSequence.listWithWhiteSpace(); returns(listWithWhiteSpaces); listWithWhiteSpaces.get(0); returns(token0); token0.getText(); returns("Moi"); token0.isSeparator(); returns(false); token0.getIndex(); returns(0); listWithWhiteSpaces.get(1); 
returns(token1); token1.getText(); returns(","); token1.isSeparator(); returns(true); token1.getIndex(); returns(1); listWithWhiteSpaces.get(2); returns(token2); token2.getText(); returns(" "); token2.isSeparator(); returns(true); token2.getIndex(); returns(2); listWithWhiteSpaces.get(3); returns(token3); token3.getText(); returns("j"); token3.isSeparator(); returns(false); token3.getIndex(); returns(3); token3.getMatches(); returns(matches3); listWithWhiteSpaces.get(4); returns(token4); token4.getText(); returns("'"); token4.isSeparator(); returns(true); token4.getIndex(); returns(4); token4.getMatches(); returns(matches4); listWithWhiteSpaces.get(5); returns(token5); token5.getText(); returns("aim"); token5.isSeparator(); returns(false); token5.getIndex(); returns(5); token5.getMatches(); returns(matches5); listWithWhiteSpaces.get(6); returns(token6); token6.getText(); returns("'"); token6.isSeparator(); returns(true); token6.getIndex(); returns(6); token6.getMatches(); returns(matches6); listWithWhiteSpaces.get(7); returns(token7); token7.getText(); returns("rais"); token7.isSeparator(); returns(false); token7.getIndex(); returns(7); token7.getMatches(); returns(matches7); listWithWhiteSpaces.size(); returns(8); listWithWhiteSpaces.iterator(); returns(i); i.hasNext(); returns(true, true, true, true, true, true, true, true, false); i.next(); returns(token0, token1, token2, token3, token4, token5, token6, token7); } }; Pattern separatorPattern = Pattern.compile(separators); TokenPatternImpl tokeniserPatternImpl = new TokenPatternImpl(".+'.+", separatorPattern); tokeniserPatternImpl.setTokeniserPatternServiceInternal(tokeniserPatternService); List<TokenPatternMatchSequence> patternMatches = tokeniserPatternImpl.match(tokenSequence); assertEquals(2, patternMatches.size()); List<Token> patternMatch = patternMatches.get(0).getTokenSequence(); assertEquals(3, patternMatch.size()); for (int i = 0; i < 3; i++) { Token token = patternMatch.get(i); if (i == 0) { 
assertEquals(3, token.getIndex()); assertEquals("j", token.getText()); assertEquals(1, token.getMatches().size()); assertEquals(0, token.getMatches().get(0).getIndex()); } if (i == 1) { assertEquals(4, token.getIndex()); assertEquals("'", token.getText()); assertEquals(1, token.getMatches().size()); assertEquals(1, token.getMatches().get(0).getIndex()); } if (i == 2) { assertEquals(5, token.getIndex()); assertEquals("aim", token.getText()); assertEquals(2, token.getMatches().size()); assertEquals(2, token.getMatches().get(0).getIndex()); } } patternMatch = patternMatches.get(1).getTokenSequence(); assertEquals(3, patternMatch.size()); for (int i = 0; i < 3; i++) { Token token = patternMatch.get(i); if (i == 0) { assertEquals(5, token.getIndex()); assertEquals("aim", token.getText()); assertEquals(2, token.getMatches().size()); assertEquals(0, token.getMatches().get(1).getIndex()); } if (i == 1) { assertEquals(6, token.getIndex()); assertEquals("'", token.getText()); assertEquals(1, token.getMatches().size()); assertEquals(1, token.getMatches().get(0).getIndex()); } if (i == 2) { assertEquals(7, token.getIndex()); assertEquals("rais", token.getText()); assertEquals(1, token.getMatches().size()); assertEquals(2, token.getMatches().get(0).getIndex()); } } } @Test public void testMatch2(@NonStrict final Sentence sentence) { TalismaneServiceLocator locator = TalismaneServiceLocator.getInstance(); TokeniserService tokeniserService = locator.getTokeniserServiceLocator().getTokeniserService(); TokeniserPatternServiceInternal tokeniserPatternService = new TokeniserPatternServiceImpl(); new NonStrictExpectations() { { sentence.getText(); returns("Qu'ensuite il aille..."); } }; TokenSequence tokenSequence = tokeniserService.getTokenSequence(sentence, Tokeniser.SEPARATORS); TokenPatternImpl tokenPattern = new TokenPatternImpl( "{(?![cdjlmnstCDJLMNST]\\z|qu\\z|jusqu\\z|puisqu\\z|lorsqu\\z|aujourd\\z|prud\\z|quelqu\\z|quoiqu\\z).+'}.+", Tokeniser.SEPARATORS); 
tokenPattern.setTokeniserPatternServiceInternal(tokeniserPatternService); List<TokenPatternMatchSequence> patternMatches = tokenPattern.match(tokenSequence); assertEquals(0, patternMatches.size()); } @Test public void testMatch3(@NonStrict final Sentence sentence) { TalismaneServiceLocator locator = TalismaneServiceLocator.getInstance(); TokeniserService tokeniserService = locator.getTokeniserServiceLocator().getTokeniserService(); TokeniserPatternServiceInternal tokeniserPatternService = new TokeniserPatternServiceImpl(); new NonStrictExpectations() { { sentence.getText(); returns("Z'ensuite il aille..."); } }; TokenSequence tokenSequence = tokeniserService.getTokenSequence(sentence, Tokeniser.SEPARATORS); TokenPatternImpl tokenPattern = new TokenPatternImpl( "{(?![cdjlmnstCDJLMNST]\\z|qu\\z|jusqu\\z|puisqu\\z|lorsqu\\z|aujourd\\z|prud\\z|quelqu\\z|quoiqu\\z).+'}.+", Tokeniser.SEPARATORS); tokenPattern.setTokeniserPatternServiceInternal(tokeniserPatternService); List<TokenPatternMatchSequence> patternMatches = tokenPattern.match(tokenSequence); assertEquals(1, patternMatches.size()); TokenPatternMatchSequence matchSequence = patternMatches.get(0); assertEquals(3, matchSequence.getTokenSequence().size()); assertEquals("Z", matchSequence.getTokenSequence().get(0).getOriginalText()); } @Test public void testMatch4(@NonStrict final Sentence sentence) { TalismaneServiceLocator locator = TalismaneServiceLocator.getInstance(); TokeniserService tokeniserService = locator.getTokeniserServiceLocator().getTokeniserService(); TokeniserPatternServiceInternal tokeniserPatternService = new TokeniserPatternServiceImpl(); new NonStrictExpectations() { { sentence.getText(); returns("Aix-les-Bains"); } }; TokenSequence tokenSequence = tokeniserService.getTokenSequence(sentence, Tokeniser.SEPARATORS); TokenPatternImpl tokenPattern = new TokenPatternImpl(".+-{(ce|je|la|le|les|leur|lui|moi|nous|toi|tu)[^-]}", Tokeniser.SEPARATORS); 
tokenPattern.setTokeniserPatternServiceInternal(tokeniserPatternService); List<TokenPatternMatchSequence> patternMatches = tokenPattern.match(tokenSequence); assertEquals(0, patternMatches.size()); } }