///////////////////////////////////////////////////////////////////////////////
//Copyright (C) 2012 Assaf Urieli
//
//This file is part of Talismane.
//
//Talismane is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Talismane is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Talismane. If not, see <http://www.gnu.org/licenses/>.
//////////////////////////////////////////////////////////////////////////////
package com.joliciel.talismane.parser;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.joliciel.talismane.TalismaneException;
import com.joliciel.talismane.TalismaneServiceLocator;
import com.joliciel.talismane.TalismaneSession;
import com.joliciel.talismane.lexicon.LexicalEntry;
import com.joliciel.talismane.lexicon.LexicalEntryReader;
import com.joliciel.talismane.machineLearning.Decision;
import com.joliciel.talismane.posTagger.PosTag;
import com.joliciel.talismane.posTagger.PosTagSequence;
import com.joliciel.talismane.posTagger.PosTagSet;
import com.joliciel.talismane.posTagger.PosTaggedToken;
import com.joliciel.talismane.posTagger.PosTaggerService;
import com.joliciel.talismane.posTagger.UnknownPosTagException;
import com.joliciel.talismane.posTagger.filters.PosTagSequenceFilter;
import com.joliciel.talismane.tokeniser.PretokenisedSequence;
import com.joliciel.talismane.tokeniser.Token;
import com.joliciel.talismane.tokeniser.TokenSequence;
import com.joliciel.talismane.tokeniser.TokeniserService;
import com.joliciel.talismane.tokeniser.filters.TokenFilter;
import com.joliciel.talismane.tokeniser.filters.TokenFilterService;
import com.joliciel.talismane.tokeniser.filters.TokenSequenceFilter;
import com.joliciel.talismane.utils.CoNLLFormatter;
import com.joliciel.talismane.utils.LogUtils;
import com.joliciel.talismane.utils.PerformanceMonitor;

public class ParserRegexBasedCorpusReaderImpl implements ParserRegexBasedCorpusReader {
	private static final Log LOG = LogFactory.getLog(ParserRegexBasedCorpusReaderImpl.class);
	private static final PerformanceMonitor MONITOR = PerformanceMonitor.getMonitor(ParserRegexBasedCorpusReaderImpl.class);

	private String regex = ParserRegexBasedCorpusReader.DEFAULT_REGEX;
	private static final String INDEX_PLACEHOLDER = "%INDEX%";
	private static final String TOKEN_PLACEHOLDER = "%TOKEN%";
	private static final String GOVERNOR_PLACEHOLDER = "%GOVERNOR%";
	private static final String LABEL_PLACEHOLDER = "%LABEL%";
	private static final String POSTAG_PLACEHOLDER = "%POSTAG%";
	private static final String FILENAME_PLACEHOLDER = "%FILENAME%";
	private static final String ROW_PLACEHOLDER = "%ROW%";
	private static final String COLUMN_PLACEHOLDER = "%COLUMN%";
	private static final String POSTAG_COMMENT_PLACEHOLDER = "%POSTAG_COMMENT%";
	private static final String DEP_COMMENT_PLACEHOLDER = "%DEP_COMMENT%";

	private Pattern pattern;
	private ParseConfiguration configuration = null;
	private Scanner scanner;
	private File corpusLocation;
	private Charset charset;

	private ParserService parserService;
	private PosTaggerService posTaggerService;
	private TokeniserService tokeniserService;
	private TokenFilterService tokenFilterService;

	private int maxSentenceCount = 0;
	private int sentenceCount = 0;
	private int lineNumber = 0;
	private int crossValidationSize = -1;
	private int includeIndex = -1;
	private int excludeIndex = -1;
	private int totalSentenceCount = 0;
	private String excludeFileName = null;

	private List<File> files;
	private int currentFileIndex = 0;
	private boolean needsToReturnBlankLine = false;

	private List<TokenFilter> tokenFilters = new ArrayList<TokenFilter>();
	private List<TokenSequenceFilter> tokenSequenceFilters = new ArrayList<TokenSequenceFilter>();
	private List<PosTagSequenceFilter> posTagSequenceFilters = new ArrayList<PosTagSequenceFilter>();
	private TokenSequenceFilter tokenFilterWrapper = null;

	private Map<String, Integer> placeholderIndexMap = new HashMap<String, Integer>();
	private LexicalEntryReader lexicalEntryReader;
	private TalismaneServiceLocator locator = TalismaneServiceLocator.getInstance();
	private boolean predictTransitions = true;

	public ParserRegexBasedCorpusReaderImpl(File corpusLocation, Charset charset) {
		this.corpusLocation = corpusLocation;
		this.charset = charset;
	}

	public ParserRegexBasedCorpusReaderImpl(Reader reader) {
		this.scanner = new Scanner(reader);
	}

	@Override
	public boolean hasNextConfiguration() {
		MONITOR.startTask("hasNextConfiguration");
		try {
			if (maxSentenceCount > 0 && sentenceCount >= maxSentenceCount) {
				// we've reached the end, do nothing
			} else {
				while (configuration == null) {
					List<ParseDataLine> dataLines = new ArrayList<ParseDataLine>();
					List<LexicalEntry> lexicalEntries = new ArrayList<LexicalEntry>();
					boolean hasLine = false;
					if (!this.hasNextLine())
						break;

					int sentenceStartLineNumber = lineNumber;
					while (configuration == null) {
						// break out when there's no next line & nothing in the buffer to process
						if (!this.hasNextLine() && !hasLine)
							break;

						String line = "";
						if (this.hasNextLine())
							line = this.nextLine().replace("\r", "");
						lineNumber++;

						if (line.trim().length() == 0) {
							if (!hasLine)
								continue;

							// end of sentence
							totalSentenceCount++;
							if (LOG.isTraceEnabled())
								LOG.trace("totalSentenceCount: " + totalSentenceCount);

							// check cross-validation
							if (crossValidationSize > 0) {
								boolean includeMe = true;
								if (includeIndex >= 0) {
									if (totalSentenceCount % crossValidationSize != includeIndex) {
										includeMe = false;
									}
								} else if (excludeIndex >= 0) {
									if (totalSentenceCount % crossValidationSize == excludeIndex) {
										includeMe = false;
									}
								}
								if (!includeMe) {
									dataLines = new ArrayList<ParseDataLine>();
									lexicalEntries = new ArrayList<LexicalEntry>();
									hasLine = false;
									continue;
								}
							}

							// construct the configuration
							if (dataLines.size() > 0) {
								boolean badConfig = false;
								for (ParseDataLine dataLine : dataLines) {
									badConfig = !this.checkDataLine(dataLine);
									if (badConfig) {
										dataLines = new ArrayList<ParseDataLine>();
										hasLine = false;
										break;
									}
								}
								if (!badConfig) {
									PretokenisedSequence tokenSequence = this.getTokeniserService().getEmptyPretokenisedSequence();
									int maxIndex = 0;
									for (ParseDataLine dataLine : dataLines) {
										Token token = tokenSequence.addToken(dataLine.getWord());
										dataLine.setToken(token);
										if (dataLine.getIndex() > maxIndex)
											maxIndex = dataLine.getIndex();
										token.setFileName(dataLine.getOriginalFileName());
										token.setLineNumber(dataLine.getOriginalLineNumber());
										token.setColumnNumber(dataLine.getOriginalColumnNumber());
									}
									LOG.debug("Sentence " + (sentenceCount + 1) + ": " + tokenSequence.getText());
									tokenSequence.cleanSlate();

									// first apply the token filters - which might replace the text of an individual token
									// with something else
									if (tokenFilterWrapper == null) {
										tokenFilterWrapper = this.getTokenFilterService().getTokenSequenceFilter(this.tokenFilters);
									}
									tokenFilterWrapper.apply(tokenSequence);

									for (TokenSequenceFilter tokenFilter : this.tokenSequenceFilters) {
										tokenFilter.apply(tokenSequence);
									}

									if (tokenSequence.getTokensAdded().size() > 0) {
										// create an empty data line for each empty token that was added by the filters
										List<ParseDataLine> newDataLines = new ArrayList<ParseDataLine>();
										int i = 0;
										ParseDataLine lastDataLine = null;
										for (Token token : tokenSequence) {
											if (tokenSequence.getTokensAdded().contains(token)) {
												ParseDataLine emptyDataLine = new ParseDataLine();
												emptyDataLine.setToken(token);
												emptyDataLine.setWord("");
												emptyDataLine.setIndex(++maxIndex);
												if (lastDataLine != null)
													emptyDataLine.setLineNumber(lastDataLine.getLineNumber());
												else
													emptyDataLine.setLineNumber(sentenceStartLineNumber);
												newDataLines.add(emptyDataLine);
											} else {
												lastDataLine = dataLines.get(i++);
												newDataLines.add(lastDataLine);
											}
										}
										dataLines = newDataLines;
									}

									boolean hasSkip = false;
									for (int i = 0; i < dataLines.size(); i++) {
										this.updateDataLine(dataLines, i);
										ParseDataLine dataLine = dataLines.get(i);
										if (dataLine.getWord().equals("") && dataLine.getPosTagCode().equals(""))
											dataLine.setSkip(true);
										if (dataLine.isSkip())
											hasSkip = true;
									}

									if (hasSkip) {
										List<ParseDataLine> newDataLines = new ArrayList<ParseDataLine>();
										for (ParseDataLine dataLine : dataLines) {
											if (dataLine.isSkip()) {
												tokenSequence.removeEmptyToken(dataLine.getToken());
											} else {
												newDataLines.add(dataLine);
											}
										}
										dataLines = newDataLines;
									}

									if (LOG.isTraceEnabled()) {
										LOG.trace("Data lines after update:");
										for (ParseDataLine dataLine : dataLines) {
											LOG.trace(dataLine.toString());
										}
									}

									PosTagSequence posTagSequence = this.getPosTaggerService().getPosTagSequence(tokenSequence, tokenSequence.size());
									Map<Integer, PosTaggedToken> idTokenMap = new HashMap<Integer, PosTaggedToken>();
									int i = 0;
									int lexicalEntryIndex = 0;
									PosTagSet posTagSet = TalismaneSession.getPosTagSet();
									for (ParseDataLine dataLine : dataLines) {
										Token token = tokenSequence.get(i);
										PosTag posTag = null;
										try {
											posTag = posTagSet.getPosTag(dataLine.getPosTagCode());
										} catch (UnknownPosTagException upte) {
											throw new TalismaneException("Unknown posTag on line " + dataLine.getLineNumber() + ": " + dataLine.getPosTagCode());
										}
										Decision<PosTag> posTagDecision = posTagSet.createDefaultDecision(posTag);
										PosTaggedToken posTaggedToken = this.getPosTaggerService().getPosTaggedToken(token, posTagDecision);
										if (LOG.isTraceEnabled()) {
											LOG.trace(posTaggedToken.toString());
										}
										posTaggedToken.setComment(dataLine.getPosTagComment());

										// set the lexical entry if we have one
										if (this.lexicalEntryReader != null) {
											List<LexicalEntry> lexicalEntrySet = new ArrayList<LexicalEntry>(1);
											if (!tokenSequence.getTokensAdded().contains(token)) {
												lexicalEntrySet.add(lexicalEntries.get(lexicalEntryIndex++));
											}
											posTaggedToken.setLexicalEntries(lexicalEntrySet);
										}
										posTagSequence.addPosTaggedToken(posTaggedToken);
										idTokenMap.put(dataLine.getIndex(), posTaggedToken);
										i++;
									}

									for (PosTagSequenceFilter posTagSequenceFilter : this.posTagSequenceFilters) {
										posTagSequenceFilter.apply(posTagSequence);
									}

									PosTaggedToken rootToken = posTagSequence.prependRoot();
									idTokenMap.put(0, rootToken);

									Set<DependencyArc> dependencies = new TreeSet<DependencyArc>();
									for (ParseDataLine dataLine : dataLines) {
										PosTaggedToken head = idTokenMap.get(dataLine.getGovernorIndex());
										PosTaggedToken dependent = idTokenMap.get(dataLine.getIndex());
										DependencyArc arc = this.getParserService().getDependencyArc(head, dependent, dataLine.getDependencyLabel());
										if (LOG.isTraceEnabled())
											LOG.trace(arc);
										dependencies.add(arc);
										arc.setComment(dataLine.getDependencyComment());
									}

									configuration = this.getParserService().getInitialConfiguration(posTagSequence);
									if (this.predictTransitions) {
										TransitionSystem transitionSystem = TalismaneSession.getTransitionSystem();
										transitionSystem.predictTransitions(configuration, dependencies);
									} else {
										for (DependencyArc arc : dependencies) {
											configuration.addDependency(arc.getHead(), arc.getDependent(), arc.getLabel(), null);
										}
									}

									sentenceCount++;
								} // is the configuration a valid one
							} // have we data lines?
						} else {
							// add a token to the current sentence
							hasLine = true;
							Matcher matcher = this.getPattern().matcher(line);
							if (!matcher.matches())
								throw new TalismaneException("Didn't match pattern \"" + regex + "\" on line " + lineNumber + ": " + line);

							if (matcher.groupCount() != placeholderIndexMap.size()) {
								throw new TalismaneException("Expected " + placeholderIndexMap.size() + " matches (but found " + matcher.groupCount() + ") on line " + lineNumber);
							}

							int index = Integer.parseInt(matcher.group(placeholderIndexMap.get(INDEX_PLACEHOLDER)));
							String rawWord = matcher.group(placeholderIndexMap.get(TOKEN_PLACEHOLDER));
							String word = this.readWord(rawWord);
							String posTagCode = matcher.group(placeholderIndexMap.get(POSTAG_PLACEHOLDER));
							String depLabel = matcher.group(placeholderIndexMap.get(LABEL_PLACEHOLDER));
							if (depLabel.equals("_"))
								depLabel = "";
							int governorIndex = Integer.parseInt(matcher.group(placeholderIndexMap.get(GOVERNOR_PLACEHOLDER)));

							ParseDataLine dataLine = new ParseDataLine();
							dataLine.setLineNumber(this.lineNumber);
							dataLine.setIndex(index);
							dataLine.setWord(word);
							dataLine.setPosTagCode(posTagCode);
							dataLine.setDependencyLabel(depLabel);
							dataLine.setGovernorIndex(governorIndex);
							if (placeholderIndexMap.containsKey(FILENAME_PLACEHOLDER))
								dataLine.setOriginalFileName(matcher.group(placeholderIndexMap.get(FILENAME_PLACEHOLDER)));
							if (placeholderIndexMap.containsKey(ROW_PLACEHOLDER))
								dataLine.setOriginalLineNumber(Integer.parseInt(matcher.group(placeholderIndexMap.get(ROW_PLACEHOLDER))));
							if (placeholderIndexMap.containsKey(COLUMN_PLACEHOLDER))
								dataLine.setOriginalColumnNumber(Integer.parseInt(matcher.group(placeholderIndexMap.get(COLUMN_PLACEHOLDER))));
							if (placeholderIndexMap.containsKey(POSTAG_COMMENT_PLACEHOLDER))
								dataLine.setPosTagComment(matcher.group(placeholderIndexMap.get(POSTAG_COMMENT_PLACEHOLDER)));
							if (placeholderIndexMap.containsKey(DEP_COMMENT_PLACEHOLDER))
								dataLine.setDependencyComment(matcher.group(placeholderIndexMap.get(DEP_COMMENT_PLACEHOLDER)));

							dataLines.add(dataLine);

							if (this.lexicalEntryReader != null) {
								LexicalEntry lexicalEntry = this.lexicalEntryReader.readEntry(line);
								lexicalEntries.add(lexicalEntry);
							}
						}
					} // is configuration still null?
				} // have we reached the max sentence count?
			}
			return configuration != null;
		} finally {
			MONITOR.endTask("hasNextConfiguration");
		}
	}

	/**
	 * Returns true if the data line is valid, false otherwise.
	 * @param dataLine
	 * @return
	 */
	protected boolean checkDataLine(ParseDataLine dataLine) {
		return true;
	}

	/**
	 * Updates the data line prior to processing.
	 * At this point, empty lines may have been added to correspond to empty tokens that were added by filters.
	 * @param dataLines
	 */
	protected void updateDataLine(List<ParseDataLine> dataLines, int index) {
		// nothing to do in the base class
	}

	@Override
	public ParseConfiguration nextConfiguration() {
		ParseConfiguration nextConfiguration = null;
		if (this.hasNextConfiguration()) {
			nextConfiguration = configuration;
			configuration = null;
		}
		return nextConfiguration;
	}

	public PosTaggerService getPosTaggerService() {
		if (posTaggerService == null)
			posTaggerService = locator.getPosTaggerServiceLocator().getPosTaggerService();
		return posTaggerService;
	}

	public void setPosTaggerService(PosTaggerService posTaggerService) {
		this.posTaggerService = posTaggerService;
	}

	public TokeniserService getTokeniserService() {
		if (tokeniserService == null)
			tokeniserService = locator.getTokeniserServiceLocator().getTokeniserService();
		return tokeniserService;
	}

	public void setTokeniserService(TokeniserService tokeniserService) {
		this.tokeniserService = tokeniserService;
	}

	public ParserService getParserService() {
		if (parserService == null)
			parserService = locator.getParserServiceLocator().getParserService();
		return parserService;
	}

	public void setParserService(ParserService parserService) {
		this.parserService = parserService;
	}

	protected static class ParseDataLine {
		public ParseDataLine() {
		}

		private int lineNumber = 0;
		private int index;
		private String word = "";
		private String posTagCode = "";
		private int governorIndex;
		private String dependencyLabel = "";
		private Token token;
		private String originalFileName = "";
		private int originalLineNumber = 0;
		private int originalColumnNumber = 0;
		private boolean skip = false;
		private String posTagComment = "";
		private String dependencyComment = "";

		public int getLineNumber() { return lineNumber; }
		public void setLineNumber(int lineNumber) { this.lineNumber = lineNumber; }
		public int getIndex() { return index; }
		public void setIndex(int index) { this.index = index; }
		public String getWord() { return word; }
		public void setWord(String word) { this.word = word; }
		public String getPosTagCode() { return posTagCode; }
		public void setPosTagCode(String posTagCode) { this.posTagCode = posTagCode; }
		public int getGovernorIndex() { return governorIndex; }
		public void setGovernorIndex(int governorIndex) { this.governorIndex = governorIndex; }
		public String getDependencyLabel() { return dependencyLabel; }
		public void setDependencyLabel(String dependencyLabel) { this.dependencyLabel = dependencyLabel; }
		public Token getToken() { return token; }
		public void setToken(Token token) { this.token = token; }

		/**
		 * Should this data line be skipped or not? This should only be set for data lines corresponding to empty tokens.
		 * The empty token will be removed.
		 */
		public boolean isSkip() { return skip; }
		public void setSkip(boolean skip) { this.skip = skip; }

		public String getOriginalFileName() { return originalFileName; }
		public void setOriginalFileName(String originalFileName) { this.originalFileName = originalFileName; }
		public int getOriginalLineNumber() { return originalLineNumber; }
		public void setOriginalLineNumber(int originalLineNumber) { this.originalLineNumber = originalLineNumber; }
		public int getOriginalColumnNumber() { return originalColumnNumber; }
		public void setOriginalColumnNumber(int originalColumnNumber) { this.originalColumnNumber = originalColumnNumber; }
		public String getPosTagComment() { return posTagComment; }
		public void setPosTagComment(String posTagComment) { this.posTagComment = posTagComment; }
		public String getDependencyComment() { return dependencyComment; }
		public void setDependencyComment(String dependencyComment) { this.dependencyComment = dependencyComment; }

		public String toString() {
			String string = lineNumber + ": " + index + "," + word + "," + posTagCode + "," + governorIndex + "," + dependencyLabel;
			return string;
		}
	}

	/**
	 * If 0, all sentences will be read - otherwise will only read a certain number of sentences.
	 * @return
	 */
	public int getMaxSentenceCount() {
		return maxSentenceCount;
	}

	public void setMaxSentenceCount(int maxSentenceCount) {
		this.maxSentenceCount = maxSentenceCount;
	}

	public void addTokenFilter(TokenFilter tokenFilter) {
		this.tokenFilters.add(tokenFilter);
	}

	public void addTokenSequenceFilter(TokenSequenceFilter tokenFilter) {
		this.tokenSequenceFilters.add(tokenFilter);
	}

	@Override
	public Map<String, String> getCharacteristics() {
		Map<String, String> attributes = new LinkedHashMap<String, String>();

		attributes.put("maxSentenceCount", "" + this.maxSentenceCount);
		attributes.put("crossValidationSize", "" + this.crossValidationSize);
		attributes.put("includeIndex", "" + this.includeIndex);
		attributes.put("excludeIndex", "" + this.excludeIndex);
		attributes.put("transitionSystem", TalismaneSession.getTransitionSystem().getClass().getSimpleName());

		int i = 0;
		for (TokenSequenceFilter tokenFilter : this.tokenSequenceFilters) {
			attributes.put("filter" + i, "" + tokenFilter.getClass().getSimpleName());
			i++;
		}
		return attributes;
	}

	@Override
	public String getRegex() {
		return regex;
	}
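	/**
	 * Builds the compiled Pattern from the placeholder-based regex. The five
	 * placeholders %INDEX%, %TOKEN%, %POSTAG%, %LABEL% and %GOVERNOR% are
	 * mandatory; the others are optional. Each placeholder found in the regex is
	 * replaced by a capturing group, and placeholderIndexMap records the
	 * capturing-group number of each placeholder, in order of appearance. Since
	 * the reader expects exactly one group per placeholder, any extra columns
	 * must be matched without capturing groups. As a purely illustrative example
	 * (the actual DEFAULT_REGEX is defined on the ParserRegexBasedCorpusReader
	 * interface and is not shown in this listing), a tab-separated CoNLL-style
	 * corpus might be described by a regex such as:
	 *
	 *   %INDEX%\t%TOKEN%\t.*\t.*\t%POSTAG%\t.*\t%GOVERNOR%\t%LABEL%\t.*\t.*
	 */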
	public Pattern getPattern() {
		if (this.pattern == null) {
			int indexPos = regex.indexOf(INDEX_PLACEHOLDER);
			if (indexPos < 0)
				throw new TalismaneException("The regex must contain the string \"" + INDEX_PLACEHOLDER + "\": " + regex);

			int tokenPos = regex.indexOf(TOKEN_PLACEHOLDER);
			if (tokenPos < 0)
				throw new TalismaneException("The regex must contain the string \"" + TOKEN_PLACEHOLDER + "\"");

			int posTagPos = regex.indexOf(POSTAG_PLACEHOLDER);
			if (posTagPos < 0)
				throw new TalismaneException("The regex must contain the string \"" + POSTAG_PLACEHOLDER + "\"");

			int labelPos = regex.indexOf(LABEL_PLACEHOLDER);
			if (labelPos < 0)
				throw new TalismaneException("The regex must contain the string \"" + LABEL_PLACEHOLDER + "\"");

			int governorPos = regex.indexOf(GOVERNOR_PLACEHOLDER);
			if (governorPos < 0)
				throw new TalismaneException("The regex must contain the string \"" + GOVERNOR_PLACEHOLDER + "\"");

			int filenamePos = regex.indexOf(FILENAME_PLACEHOLDER);
			int rowNumberPos = regex.indexOf(ROW_PLACEHOLDER);
			int columnNumberPos = regex.indexOf(COLUMN_PLACEHOLDER);
			int posTagCommentPos = regex.indexOf(POSTAG_COMMENT_PLACEHOLDER);
			int depCommentPos = regex.indexOf(DEP_COMMENT_PLACEHOLDER);

			Map<Integer, String> placeholderMap = new TreeMap<Integer, String>();
			placeholderMap.put(indexPos, INDEX_PLACEHOLDER);
			placeholderMap.put(tokenPos, TOKEN_PLACEHOLDER);
			placeholderMap.put(posTagPos, POSTAG_PLACEHOLDER);
			placeholderMap.put(labelPos, LABEL_PLACEHOLDER);
			placeholderMap.put(governorPos, GOVERNOR_PLACEHOLDER);
			if (filenamePos >= 0)
				placeholderMap.put(filenamePos, FILENAME_PLACEHOLDER);
			if (rowNumberPos >= 0)
				placeholderMap.put(rowNumberPos, ROW_PLACEHOLDER);
			if (columnNumberPos >= 0)
				placeholderMap.put(columnNumberPos, COLUMN_PLACEHOLDER);
			if (posTagCommentPos >= 0)
				placeholderMap.put(posTagCommentPos, POSTAG_COMMENT_PLACEHOLDER);
			if (depCommentPos >= 0)
				placeholderMap.put(depCommentPos, DEP_COMMENT_PLACEHOLDER);

			int i = 1;
			for (String placeholderName : placeholderMap.values()) {
				placeholderIndexMap.put(placeholderName, i++);
			}

			String regexWithGroups = regex.replace(INDEX_PLACEHOLDER, "(.+)");
			regexWithGroups = regexWithGroups.replace(TOKEN_PLACEHOLDER, "(.*)");
			regexWithGroups = regexWithGroups.replace(POSTAG_PLACEHOLDER, "(.+)");
			regexWithGroups = regexWithGroups.replace(LABEL_PLACEHOLDER, "(.*)");
			regexWithGroups = regexWithGroups.replace(GOVERNOR_PLACEHOLDER, "(.+)");
			regexWithGroups = regexWithGroups.replace(FILENAME_PLACEHOLDER, "(.+)");
			regexWithGroups = regexWithGroups.replace(ROW_PLACEHOLDER, "(.+)");
			regexWithGroups = regexWithGroups.replace(COLUMN_PLACEHOLDER, "(.+)");
			regexWithGroups = regexWithGroups.replace(POSTAG_COMMENT_PLACEHOLDER, "(.*)");
			regexWithGroups = regexWithGroups.replace(DEP_COMMENT_PLACEHOLDER, "(.*)");

			this.pattern = Pattern.compile(regexWithGroups);
		}
		return pattern;
	}

	private boolean hasNextLine() {
		try {
			if (needsToReturnBlankLine)
				return true;

			if (this.scanner == null && currentFileIndex == 0) {
				if (corpusLocation == null) {
					return false;
				} else if (corpusLocation.isDirectory()) {
					File[] theFiles = corpusLocation.listFiles();
					Arrays.sort(theFiles);
					files = new ArrayList<File>();
					for (File file : theFiles) {
						if (!file.getName().equals(excludeFileName)) {
							files.add(file);
						}
					}
					Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(files.get(0)), charset));
					this.scanner = new Scanner(reader);
				} else {
					Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(corpusLocation), charset));
					this.scanner = new Scanner(reader);
				}
				currentFileIndex++;
			}

			while (this.scanner != null) {
				if (this.scanner.hasNextLine())
					return true;
				needsToReturnBlankLine = true;
				this.scanner = null;
				if (files != null) {
					if (currentFileIndex < files.size()) {
						Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(files.get(currentFileIndex)), charset));
						this.scanner = new Scanner(reader);
						currentFileIndex++;
					}
				}
			}
			return false;
		} catch (IOException e) {
			LogUtils.logError(LOG, e);
			throw new RuntimeException(e);
		}
	}

	private String nextLine() {
		if (needsToReturnBlankLine) {
			needsToReturnBlankLine = false;
			return "";
		}
		return this.scanner.nextLine();
	}

	@Override
	public void rewind() {
		if (this.corpusLocation == null) {
			throw new TalismaneException(this.getClass().getName() + " does not support rewind if not constructed from File");
		}
		this.scanner = null;
		this.currentFileIndex = 0;
		configuration = null;
		sentenceCount = 0;
		lineNumber = 0;
		totalSentenceCount = 0;
	}

	public void setRegex(String regex) {
		this.regex = regex;
	}

	public LexicalEntryReader getLexicalEntryReader() {
		return lexicalEntryReader;
	}

	public void setLexicalEntryReader(LexicalEntryReader lexicalEntryReader) {
		this.lexicalEntryReader = lexicalEntryReader;
	}

	@Override
	public void addPosTagSequenceFilter(PosTagSequenceFilter posTagSequenceFilter) {
		this.posTagSequenceFilters.add(posTagSequenceFilter);
	}

	public TokenFilterService getTokenFilterService() {
		if (this.tokenFilterService == null) {
			this.tokenFilterService = locator.getTokenFilterServiceLocator().getTokenFilterService();
		}
		return tokenFilterService;
	}

	public void setTokenFilterService(TokenFilterService tokenFilterService) {
		this.tokenFilterService = tokenFilterService;
	}

	@Override
	public boolean hasNextPosTagSequence() {
		return this.hasNextConfiguration();
	}

	@Override
	public PosTagSequence nextPosTagSequence() {
		PosTagSequence sequence = this.nextConfiguration().getPosTagSequence();
		PosTagSequence clone = sequence.clonePosTagSequence();
		clone.removeRoot();
		return clone;
	}

	@Override
	public boolean hasNextTokenSequence() {
		return this.hasNextConfiguration();
	}

	@Override
	public TokenSequence nextTokenSequence() {
		TokenSequence tokenSequence = this.nextPosTagSequence().getTokenSequence();
		return tokenSequence;
	}

	@Override
	public List<TokenSequenceFilter> getTokenSequenceFilters() {
		return this.tokenSequenceFilters;
	}

	@Override
	public List<TokenFilter> getTokenFilters() {
		return this.tokenFilters;
	}

	protected String readWord(String rawWord) {
		return CoNLLFormatter.fromCoNLL(rawWord);
	}

	public boolean isPredictTransitions() {
		return predictTransitions;
	}

	public void setPredictTransitions(boolean predictTransitions) {
		this.predictTransitions = predictTransitions;
	}

	@Override
	public int getCrossValidationSize() {
		return crossValidationSize;
	}

	@Override
	public void setCrossValidationSize(int crossValidationSize) {
		this.crossValidationSize = crossValidationSize;
	}

	@Override
	public int getIncludeIndex() {
		return includeIndex;
	}

	@Override
	public void setIncludeIndex(int includeIndex) {
		this.includeIndex = includeIndex;
	}

	@Override
	public int getExcludeIndex() {
		return excludeIndex;
	}

	@Override
	public void setExcludeIndex(int excludeIndex) {
		this.excludeIndex = excludeIndex;
	}

	public String getExcludeFileName() {
		return excludeFileName;
	}

	public void setExcludeFileName(String excludeFileName) {
		this.excludeFileName = excludeFileName;
	}
}
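/*
 * Usage sketch (illustrative only, not part of the Talismane source): assuming a
 * TalismaneSession has already been configured with a PosTagSet and a
 * TransitionSystem (the reader relies on TalismaneSession.getPosTagSet() and
 * TalismaneSession.getTransitionSystem()), and assuming "corpus.conll" is a
 * hypothetical file in the format described by the reader's regex, the
 * gold-standard parse configurations could be read as follows:
 *
 *   File corpusFile = new File("corpus.conll");
 *   ParserRegexBasedCorpusReaderImpl reader =
 *       new ParserRegexBasedCorpusReaderImpl(corpusFile, Charset.forName("UTF-8"));
 *   reader.setPredictTransitions(true);
 *   while (reader.hasNextConfiguration()) {
 *       ParseConfiguration configuration = reader.nextConfiguration();
 *       // use the configuration, e.g. for training or evaluation
 *   }
 *
 * The same reader can also serve as a token or pos-tag corpus reader via
 * nextTokenSequence() and nextPosTagSequence().
 */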