Java tutorial
/////////////////////////////////////////////////////////////////////////////// //Copyright (C) 2014 Joliciel Informatique // //This file is part of Talismane. // //Talismane is free software: you can redistribute it and/or modify //it under the terms of the GNU Affero General Public License as published by //the Free Software Foundation, either version 3 of the License, or //(at your option) any later version. // //Talismane is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU Affero General Public License for more details. // //You should have received a copy of the GNU Affero General Public License //along with Talismane. If not, see <http://www.gnu.org/licenses/>. ////////////////////////////////////////////////////////////////////////////// package com.joliciel.talismane.extensions.standoff; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.TreeMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.joliciel.talismane.TalismaneException; import com.joliciel.talismane.TalismaneSession; import com.joliciel.talismane.lexicon.LexicalEntryReader; import com.joliciel.talismane.machineLearning.Decision; import com.joliciel.talismane.parser.DependencyArc; import com.joliciel.talismane.parser.ParseConfiguration; import com.joliciel.talismane.parser.ParserAnnotatedCorpusReader; import com.joliciel.talismane.parser.ParserService; import com.joliciel.talismane.posTagger.PosTag; import com.joliciel.talismane.posTagger.PosTagSequence; import com.joliciel.talismane.posTagger.PosTagSet; import com.joliciel.talismane.posTagger.PosTaggedToken; import com.joliciel.talismane.posTagger.PosTaggerService; import com.joliciel.talismane.posTagger.UnknownPosTagException; import com.joliciel.talismane.posTagger.filters.PosTagSequenceFilter; import com.joliciel.talismane.tokeniser.PretokenisedSequence; import com.joliciel.talismane.tokeniser.Token; import com.joliciel.talismane.tokeniser.TokeniserService; import com.joliciel.talismane.tokeniser.filters.TokenFilter; import com.joliciel.talismane.tokeniser.filters.TokenFilterService; import com.joliciel.talismane.tokeniser.filters.TokenSequenceFilter; public class StandoffReader implements ParserAnnotatedCorpusReader { private static final Log LOG = LogFactory.getLog(StandoffReader.class); private int maxSentenceCount = 0; private int startSentence = 0; private int sentenceCount = 0; private int lineNumber = 1; private int includeIndex = -1; private int excludeIndex = -1; private int crossValidationSize = 0; private ParserService parserService; private PosTaggerService posTaggerService; private TokeniserService tokeniserService; private TokenFilterService tokenFilterService; ParseConfiguration configuration = null; private int sentenceIndex = 0; private List<TokenFilter> tokenFilters = new ArrayList<TokenFilter>(); private List<TokenSequenceFilter> tokenSequenceFilters = new ArrayList<TokenSequenceFilter>(); private List<PosTagSequenceFilter> posTagSequenceFilters = new ArrayList<PosTagSequenceFilter>(); private TokenSequenceFilter tokenFilterWrapper = null; private Map<String, StandoffToken> tokenMap = new HashMap<String, StandoffReader.StandoffToken>(); private Map<String, StandoffRelation> relationMap = new HashMap<String, StandoffReader.StandoffRelation>(); private Map<String, StandoffRelation> idRelationMap = new HashMap<String, StandoffReader.StandoffRelation>(); private Map<String, String> notes = new HashMap<String, String>(); private List<List<StandoffToken>> sentences = new ArrayList<List<StandoffReader.StandoffToken>>(); private TalismaneSession talismaneSession; public StandoffReader(TalismaneSession talismaneSession, Scanner scanner) { this.talismaneSession = talismaneSession; PosTagSet posTagSet = talismaneSession.getPosTagSet(); Map<Integer, StandoffToken> sortedTokens = new TreeMap<Integer, StandoffReader.StandoffToken>(); while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (line.startsWith("T")) { String[] parts = line.split("[\\t]"); String id = parts[0]; String[] posTagParts = parts[1].split(" "); String posTagCode = posTagParts[0].replace('_', '+'); int startPos = Integer.parseInt(posTagParts[1]); String text = parts[2]; PosTag posTag = null; if (posTagCode.equalsIgnoreCase(PosTag.ROOT_POS_TAG_CODE)) { posTag = PosTag.ROOT_POS_TAG; } else { try { posTag = posTagSet.getPosTag(posTagCode); } catch (UnknownPosTagException upte) { throw new TalismaneException("Unknown posTag on line " + lineNumber + ": " + posTagCode); } } StandoffToken token = new StandoffToken(); token.posTag = posTag; token.text = text; token.id = id; sortedTokens.put(startPos, token); tokenMap.put(id, token); } else if (line.startsWith("R")) { String[] parts = line.split("[\\t :]"); String id = parts[0]; String label = parts[1]; String headId = parts[3]; String dependentId = parts[5]; StandoffRelation relation = new StandoffRelation(); relation.fromToken = headId; relation.toToken = dependentId; relation.label = label; idRelationMap.put(id, relation); relationMap.put(dependentId, relation); } else if (line.startsWith("#")) { String[] parts = line.split("\t"); String itemId = parts[1].substring("AnnotatorNotes ".length()); String note = parts[2]; notes.put(itemId, note); } } for (String itemId : notes.keySet()) { String comment = notes.get(itemId); if (itemId.startsWith("R")) { StandoffRelation relation = idRelationMap.get(itemId); relation.comment = comment; } else { StandoffToken token = tokenMap.get(itemId); token.comment = comment; } } List<StandoffToken> currentSentence = null; for (StandoffToken token : sortedTokens.values()) { if (token.text.equals("ROOT")) { if (currentSentence != null) sentences.add(currentSentence); currentSentence = new ArrayList<StandoffReader.StandoffToken>(); } currentSentence.add(token); } if (currentSentence != null) sentences.add(currentSentence); } @Override public boolean hasNextConfiguration() { if (maxSentenceCount > 0 && sentenceCount >= maxSentenceCount) { // we've reached the end, do nothing } else { if (configuration == null && sentenceIndex < sentences.size()) { PretokenisedSequence tokenSequence = this.getTokeniserService().getEmptyPretokenisedSequence(); PosTagSequence posTagSequence = this.getPosTaggerService().getPosTagSequence(tokenSequence, tokenSequence.size()); Map<String, PosTaggedToken> idTokenMap = new HashMap<String, PosTaggedToken>(); PosTagSet posTagSet = talismaneSession.getPosTagSet(); List<StandoffToken> tokens = sentences.get(sentenceIndex++); for (StandoffToken standoffToken : tokens) { Token token = tokenSequence.addToken(standoffToken.text); Decision<PosTag> posTagDecision = posTagSet.createDefaultDecision(standoffToken.posTag); PosTaggedToken posTaggedToken = this.getPosTaggerService().getPosTaggedToken(token, posTagDecision); if (LOG.isTraceEnabled()) { LOG.trace(posTaggedToken.toString()); } posTaggedToken.setComment(standoffToken.comment); posTagSequence.addPosTaggedToken(posTaggedToken); idTokenMap.put(standoffToken.id, posTaggedToken); LOG.debug("Found token " + standoffToken.id + ", " + posTaggedToken); } tokenSequence.setWithRoot(true); tokenSequence.cleanSlate(); // first apply the token filters - which might replace the text of an individual token // with something else if (tokenFilterWrapper == null) { tokenFilterWrapper = this.getTokenFilterService().getTokenSequenceFilter(this.tokenFilters); } tokenFilterWrapper.apply(tokenSequence); for (TokenSequenceFilter tokenFilter : this.tokenSequenceFilters) { tokenFilter.apply(tokenSequence); } if (tokenSequence.getTokensAdded().size() > 0) { throw new TalismaneException("Added tokens not currently supported by StandoffReader"); } tokenSequence.finalise(); configuration = this.getParserService().getInitialConfiguration(posTagSequence); for (StandoffToken standoffToken : tokens) { StandoffRelation relation = relationMap.get(standoffToken.id); if (relation != null) { PosTaggedToken head = idTokenMap.get(relation.fromToken); PosTaggedToken dependent = idTokenMap.get(relation.toToken); if (head == null) { throw new TalismaneException("No token found for head id: " + relation.fromToken); } if (dependent == null) { throw new TalismaneException("No token found for dependent id: " + relation.toToken); } DependencyArc arc = configuration.addDependency(head, dependent, relation.label, null); arc.setComment(relation.comment); } } } } return (configuration != null); } @Override public ParseConfiguration nextConfiguration() { ParseConfiguration nextConfiguration = null; if (this.hasNextConfiguration()) { nextConfiguration = configuration; configuration = null; } return nextConfiguration; } public void addTokenFilter(TokenFilter tokenFilter) { this.tokenFilters.add(tokenFilter); } @Override public void addTokenSequenceFilter(TokenSequenceFilter tokenFilter) { this.tokenSequenceFilters.add(tokenFilter); } @Override public void addPosTagSequenceFilter(PosTagSequenceFilter posTagSequenceFilter) { this.posTagSequenceFilters.add(posTagSequenceFilter); } @Override public Map<String, String> getCharacteristics() { Map<String, String> attributes = new LinkedHashMap<String, String>(); return attributes; } @Override public LexicalEntryReader getLexicalEntryReader() { throw new RuntimeException("Not supported"); } @Override public void setLexicalEntryReader(LexicalEntryReader lexicalEntryReader) { throw new RuntimeException("Not supported"); } @Override public int getMaxSentenceCount() { return maxSentenceCount; } @Override public void setMaxSentenceCount(int maxSentenceCount) { this.maxSentenceCount = maxSentenceCount; } public TokenFilterService getTokenFilterService() { return tokenFilterService; } public void setTokenFilterService(TokenFilterService tokenFilterService) { this.tokenFilterService = tokenFilterService; } public PosTaggerService getPosTaggerService() { return posTaggerService; } public void setPosTaggerService(PosTaggerService posTaggerService) { this.posTaggerService = posTaggerService; } public TokeniserService getTokeniserService() { return tokeniserService; } public void setTokeniserService(TokeniserService tokeniserService) { this.tokeniserService = tokeniserService; } public ParserService getParserService() { return parserService; } public void setParserService(ParserService parserService) { this.parserService = parserService; } private static final class StandoffToken { public PosTag posTag; public String text; public String id; public String comment = ""; } private static final class StandoffRelation { public String label; public String fromToken; public String toToken; public String comment = ""; } public int getIncludeIndex() { return includeIndex; } public void setIncludeIndex(int includeIndex) { this.includeIndex = includeIndex; } public int getExcludeIndex() { return excludeIndex; } public void setExcludeIndex(int excludeIndex) { this.excludeIndex = excludeIndex; } public int getCrossValidationSize() { return crossValidationSize; } public void setCrossValidationSize(int crossValidationSize) { this.crossValidationSize = crossValidationSize; } @Override public void rewind() { throw new TalismaneException("rewind operation not supported by " + this.getClass().getName()); } public int getStartSentence() { return startSentence; } public void setStartSentence(int startSentence) { this.startSentence = startSentence; } }