Java tutorial
/******************************************************************************* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package explore; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import org.apache.commons.io.FileUtils; import org.apache.uima.UIMAException; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.component.JCasCollectionReader_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Progress; import org.apache.uima.util.ProgressImpl; import org.json.simple.parser.JSONParser; import org.json.simple.parser.ParseException; import org.jsoup.Jsoup; import reader.ArgumentUnitUtils; import 
reader.CollectionUtils;
import reader.JsonCorpusUtil;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolSegmenter;

/**
 * Collection reader for a JSON corpus of texts annotated with argument units.
 *
 * <p>
 * Every entry of the JSON input becomes one CAS. The document text is the plain text that Jsoup
 * extracts from the entry's HTML text. For each argument unit created by the configured
 * annotator, a {@link Paragraph} and a {@link NamedEntity} (whose value is the argument unit
 * label) are added over the span from the first to the last token of the unit.
 * </p>
 *
 * @author Roland Kluge
 * @author Judith Eckle-Kohler
 */
public class ArgminCorpusReader extends JCasCollectionReader_ImplBase {

    public static final String PARAM_INPUT_FILE = "inputFile";
    @ConfigurationParameter(name = PARAM_INPUT_FILE, mandatory = true, description = "JSON input file")
    private File inputFile;

    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true, description = "two-letter language code")
    private String language;

    public static final String PARAM_ANNOTATOR = "annotator";
    @ConfigurationParameter(name = PARAM_ANNOTATOR, mandatory = true, description = "The annotator whose annotations shall be included")
    private String annotator;

    /** All JSON entries of the input file, one per document. */
    private Collection<Map<String, Object>> jsonTexts;

    /** Number of documents handed out by {@link #getNext(JCas)} so far. */
    private int nextDocumentIdx;

    private Iterator<Map<String, Object>> documentsIterator;

    /** Running number of argument units read so far (across all documents). */
    private int counter;

    /** Running number of argument units whose label does not contain "claim". */
    private int premises;

    /** Running number of argument units whose label contains "claim". */
    private int claims;

    /*
     * assumes as input the file 20140120_dump__after_overlapping_annotations.json
     * where the metadata header has been removed manually
     *
     * this annotated corpus contains all 88 documents which have been annotated
     * 8 documents in the pilot phase
     * 80 documents in the main study
     *
     * The reader itself does not filter out a metadata section; every entry of the
     * parsed JSON list is treated as a document.
     */
    /**
     * Parses the JSON input file and resets the document iterator and all counters.
     *
     * @param context
     *            the UIMA context
     * @throws ResourceInitializationException
     *             if the input file cannot be read or is not valid JSON
     */
    @Override
    public void initialize(final UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        try {
            // Read explicitly as UTF-8 so the result does not depend on the platform default
            // charset. NOTE(review): assumes the JSON dump is UTF-8 encoded - confirm.
            String inputString = FileUtils.readFileToString(this.inputFile, "UTF-8");
            JSONParser jsonParser = new JSONParser();

            @SuppressWarnings("unchecked")
            List<Map<String, Object>> parsedTexts = new ArrayList<Map<String, Object>>(
                    (List<Map<String, Object>>) jsonParser.parse(inputString));
            this.jsonTexts = parsedTexts;

            System.out.println("number of json texts: " + this.jsonTexts.size());

            this.nextDocumentIdx = 0;
            this.documentsIterator = this.jsonTexts.iterator();
            this.counter = 0;
            this.premises = 0;
            this.claims = 0;
        } catch (final IOException e) {
            throw new ResourceInitializationException(e);
        } catch (final ParseException e) {
            throw new ResourceInitializationException(e);
        }
    }

    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return this.documentsIterator.hasNext();
    }

    private int getNumDocuments() {
        return this.jsonTexts.size();
    }

    /**
     * Reports progress as the number of documents handed out so far relative to the number of
     * JSON entries.
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] {
                new ProgressImpl(this.nextDocumentIdx, this.getNumDocuments(), Progress.ENTITIES) };
    }

    /**
     * Fills the given CAS with the next document and the argument units of the configured
     * annotator.
     *
     * @param aJcas
     *            the CAS to fill
     * @throws CollectionException
     *             if tokenizing the document text fails
     */
    @Override
    public void getNext(JCas aJcas) throws CollectionException {
        try {
            Map<String, Object> jsonData = this.documentsIterator.next();
            // Count each document exactly once, independent of how many annotations of the
            // selected annotator it contains, so that getProgress() stays accurate.
            ++this.nextDocumentIdx;

            String htmlText = (String) jsonData.get(JsonCorpusUtil.TEXT);
            org.jsoup.nodes.Document cleanedText = Jsoup.parse(htmlText);
            String rawDocumentText = cleanedText.text();

            String file = (String) jsonData.get(JsonCorpusUtil.FILE);
            String documentId = file.replace(".json", "");
            String url = (String) jsonData.get(JsonCorpusUtil.URL);

            // original HTML version not required for TC experiment
            aJcas.setDocumentText(rawDocumentText);
            aJcas.setDocumentLanguage(this.language);

            DocumentMetaData metaData = DocumentMetaData.create(aJcas);
            metaData.setDocumentBaseUri("");
            metaData.setDocumentUri("/" + documentId);
            metaData.setDocumentTitle(url);
            metaData.setDocumentId(documentId);

            Map<Integer, Token> idxToTokenMapping = this
                    .createIndexToTokenMapping(rawDocumentText);

            @SuppressWarnings("unchecked")
            List<Map<String, Object>> userAnnotations = (List<Map<String, Object>>) jsonData
                    .get(JsonCorpusUtil.USER_ANNOTATIONS);
            if (userAnnotations == null) {
                return; // document without any annotations
            }

            for (Map<String, Object> userAnnotation : userAnnotations) {
                String annotationAuthor = (String) userAnnotation.get(JsonCorpusUtil.ANNOTATOR);
                // this.annotator is a mandatory parameter, so compare from that side to stay
                // safe when the JSON entry lacks an annotator.
                if (!this.annotator.equals(annotationAuthor)) {
                    continue;
                }

                @SuppressWarnings("unchecked")
                List<String> argUnits = (List<String>) userAnnotation
                        .get(JsonCorpusUtil.ARGUMENTATION_UNITS);
                if (argUnits == null) {
                    continue;
                }

                for (String argUnit : argUnits) {
                    this.addArgumentUnit(aJcas, argUnit, idxToTokenMapping, annotationAuthor);
                }
            }
        } catch (final UIMAException e) {
            // CASException and ResourceInitializationException are UIMAExceptions as well.
            throw new CollectionException(e);
        }
    }

    /**
     * Parses one serialized argument unit and adds the corresponding annotations to the CAS.
     * Units that do not match the expected pattern or that refer to unknown token indices are
     * logged and skipped.
     */
    private void addArgumentUnit(JCas aJcas, String argUnit,
            Map<Integer, Token> idxToTokenMapping, String annotationAuthor) {
        String cleanedArgUnit = argUnit.replaceAll("\\s+", "");
        Matcher matcher = JsonCorpusUtil.getRecognitionPattern().matcher(cleanedArgUnit);
        if (!matcher.matches()) {
            this.getLogger().warn(String.format(
                    "argument unit %s does not match the expected pattern %s", cleanedArgUnit,
                    JsonCorpusUtil.getRecognitionPattern().pattern()));
            return;
        }

        // coordinates of an argument unit:
        String label = matcher.group(1);
        String stringIndices = matcher.group(3).replaceAll("^,", "");
        List<Integer> indices = CollectionUtils.parseIntList(stringIndices, ",");
        if (indices.isEmpty()) {
            this.getLogger().warn(String.format(
                    "argument unit %s does not contain any token indices", cleanedArgUnit));
            return;
        }

        int firstIndex = Collections.min(indices);
        int lastIndex = Collections.max(indices);
        Token firstToken = idxToTokenMapping.get(firstIndex);
        Token lastToken = idxToTokenMapping.get(lastIndex);
        if (firstToken == null || lastToken == null) {
            this.getLogger().warn(String.format(
                    "argument unit %s refers to token indices %d..%d that cannot be mapped to tokens",
                    cleanedArgUnit, firstIndex, lastIndex));
            return;
        }

        // Read argument unit as Paragraph annotation
        Paragraph para = new Paragraph(aJcas, firstToken.getBegin(), lastToken.getEnd());
        para.addToIndexes();

        // print some counts:
        System.out.println("annotator: " + annotationAuthor);
        counter++;
        System.out.println("AU " + counter + " -- argument unit text: " + para.getCoveredText());
        System.out.println("label: " + label);
        if (label.contains("claim")) {
            claims++;
        } else {
            premises++;
        }
        System.out.println("premises " + premises + "\t claims " + claims);

        // The label is additionally stored as the value of a NamedEntity over the same span.
        NamedEntity outcome = new NamedEntity(aJcas, firstToken.getBegin(), lastToken.getEnd());
        outcome.setValue(label);
        outcome.addToIndexes();
    }

    /**
     * Tokenizes the given text with the {@link LanguageToolSegmenter} in a temporary CAS and
     * maps token indices (starting at {@link JsonCorpusUtil#FIRST_TOKEN_IDX}) to the tokens.
     * The returned tokens belong to the temporary CAS; only their offsets are meaningful for
     * the caller.
     *
     * @param rawDocumentText
     *            the plain document text
     * @return mapping from token index to token
     * @throws UIMAException
     *             if the temporary CAS cannot be created or the segmenter fails
     */
    protected final Map<Integer, Token> createIndexToTokenMapping(final String rawDocumentText)
            throws UIMAException, AnalysisEngineProcessException, ResourceInitializationException {
        final JCas dummyJCas = JCasFactory.createJCas();
        dummyJCas.setDocumentText(rawDocumentText);
        dummyJCas.setDocumentLanguage(this.language);

        SimplePipeline.runPipeline(dummyJCas, createEngineDescription(LanguageToolSegmenter.class));

        return ArgumentUnitUtils.mapIndexToAnnotation(dummyJCas, Token.class,
                JsonCorpusUtil.FIRST_TOKEN_IDX);
    }
}