// Java tutorial
/******************************************************************************* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package reader; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import org.apache.commons.io.FileUtils; import org.apache.uima.UIMAException; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; import org.json.simple.parser.JSONParser; import org.json.simple.parser.ParseException; import org.jsoup.Jsoup; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import 
de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolSegmenter; import de.tudarmstadt.ukp.dkpro.tc.api.io.TCReaderSingleLabel; import de.tudarmstadt.ukp.dkpro.tc.api.type.TextClassificationOutcome; /** * @author Judith Eckle-Kohler * @author Roland Kluge * * assumes as input the file 20140120_dump__after_overlapping_annotations.json * where the metadata header has been removed manually * * this annotated corpus contains all 88 documents which have been annotated * 8 documents in the pilot phase * 80 documents in the main study * * * reads in the annotations of one annotator and sets the annotated label as classification outcome * used in document classification with cross validation (in order to explore linguistic features) * document = argument unit * */ public class ArgumentUnitTCReader extends JCasCollectionReader_ImplBase implements TCReaderSingleLabel { public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") private String encoding; public static final String PARAM_INPUT_FILE = "inputFile"; @ConfigurationParameter(name = PARAM_INPUT_FILE, mandatory = true, description = "JSON input file") private File inputFile; public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true, description = "two-letter language code") private String language; public static final String PARAM_ANNOTATOR = "annotator"; @ConfigurationParameter(name = PARAM_ANNOTATOR, mandatory = true, description = "The annotator whose annotations shall be included") private String annotator; private List<String> labels; private List<String> texts; private int offset; @Override public void 
initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // read input file with texts (= argument units) and labels labels = new ArrayList<String>(); texts = new ArrayList<String>(); Iterator<Map<String, Object>> documentsIterator; try { String inputString = FileUtils.readFileToString(this.inputFile); JSONParser jsonParser = new JSONParser(); @SuppressWarnings("unchecked") ArrayList<Map<String, Object>> jsonTexts = new ArrayList<Map<String, Object>>( (List<Map<String, Object>>) jsonParser.parse(inputString)); documentsIterator = jsonTexts.iterator(); while (documentsIterator.hasNext()) { Map<String, Object> jsonData = documentsIterator.next(); @SuppressWarnings("unchecked") List<Map<String, Object>> userAnnotations = (List<Map<String, Object>>) jsonData .get(JsonCorpusUtil.USER_ANNOTATIONS); for (Map<String, Object> userAnnotation : userAnnotations) { String annotator = (String) userAnnotation.get(JsonCorpusUtil.ANNOTATOR); if (annotator.equals(this.annotator)) { String htmlText = (String) jsonData.get(JsonCorpusUtil.TEXT); org.jsoup.nodes.Document cleanedText = Jsoup.parse(htmlText); String rawDocumentText = cleanedText.text(); Map<Integer, Token> idxToTokenMapping = this.createIndexToTokenMapping(rawDocumentText); @SuppressWarnings("unchecked") List<String> argUnits = (List<String>) userAnnotation .get(JsonCorpusUtil.ARGUMENTATION_UNITS); for (String argUnit : argUnits) { //System.out.println("au: " +argUnit); String cleanedArgUnit = argUnit.replaceAll("\\s+", ""); Matcher matcher = JsonCorpusUtil.getRecognitionPattern().matcher(cleanedArgUnit); if (!matcher.matches()) { this.getLogger() .warn(String.format( "argument unit %s does not match the expected pattern %s", cleanedArgUnit, JsonCorpusUtil.getRecognitionPattern().pattern())); } else { // ************************************************** // coordinates of an argument unit: String label = matcher.group(1); String stringIndices = matcher.group(3).replaceAll("^,", 
""); List<Integer> indices = CollectionUtils.parseIntList(stringIndices, ","); int firstIndex = Collections.min(indices); Token firstToken = idxToTokenMapping.get(firstIndex); int lastIndex = Collections.max(indices); Token lastToken = idxToTokenMapping.get(lastIndex); //String text = getArgunitText(firstIndex, lastIndex); // ***************************************************** String generalizedLabel = getGeneralizedLabel(label); // Read argument unit as dummy Paragraph annotation to get the text JCas dummyJCas = JCasFactory.createJCas(); dummyJCas.setDocumentText(rawDocumentText); Paragraph para = new Paragraph(dummyJCas, firstToken.getBegin(), lastToken.getEnd()); //System.out.println("argument unit text: " +para.getCoveredText()); texts.add(para.getCoveredText()); labels.add(generalizedLabel); //System.out.println("annotator: " +annotator); System.out.println("label: " + label + " general label: " + generalizedLabel); } // matching was ok } // for argUnit : argUnits } // if annotator.equals(this.annotator) } // for user annotation } // while hasNext } catch (final IOException e) { throw new ResourceInitializationException(e); } catch (final ParseException e) { throw new ResourceInitializationException(e); } catch (UIMAException e) { throw new ResourceInitializationException(e); } offset = 0; System.out.println("number of AUs: " + texts.size()); } private String getGeneralizedLabel(String label) { String result = null; if (label.startsWith("claim")) { result = "claim"; } else if (label.startsWith("support")) { result = "premise"; } else if (label.startsWith("attack")) { result = "premise"; } return result; } @Override public boolean hasNext() throws IOException, CollectionException { return offset < texts.size(); } @Override public Progress[] getProgress() { return new Progress[] { new ProgressImpl(offset, texts.size(), "argunits") }; } @Override public String getTextClassificationOutcome(JCas jcas) throws CollectionException { return labels.get(offset); } 
@Override public void getNext(JCas aJCas) throws IOException, CollectionException { // setting the document text aJCas.setDocumentText(texts.get(offset)); aJCas.setDocumentLanguage(language); // as we are creating more than one CAS out of a single file, we need to have different // document titles and URIs for each CAS // otherwise, serialized CASes will be overwritten DocumentMetaData dmd = DocumentMetaData.create(aJCas); dmd.setDocumentTitle("Argunit" + offset); dmd.setDocumentUri("Argunit" + offset); dmd.setDocumentId(String.valueOf(offset)); // setting the outcome / label for this document = argument unit TextClassificationOutcome outcome = new TextClassificationOutcome(aJCas); outcome.setOutcome(getTextClassificationOutcome(aJCas)); outcome.addToIndexes(); offset++; } protected final Map<Integer, Token> createIndexToTokenMapping(final String rawDocumentText) throws UIMAException, AnalysisEngineProcessException, ResourceInitializationException { final JCas dummyJCas = JCasFactory.createJCas(); dummyJCas.setDocumentText(rawDocumentText); dummyJCas.setDocumentLanguage(this.language); SimplePipeline.runPipeline(dummyJCas, createEngineDescription(LanguageToolSegmenter.class)); final Map<Integer, Token> idxToTokenMapping = ArgumentUnitUtils.mapIndexToAnnotation(dummyJCas, Token.class, JsonCorpusUtil.FIRST_TOKEN_IDX); return idxToTokenMapping; } }