explore.ArgminCorpusReader.java Source code

Introduction

Here is the source code for explore.ArgminCorpusReader.java
Source

/*******************************************************************************
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package explore;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;

import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.jsoup.Jsoup;

import reader.ArgumentUnitUtils;
import reader.CollectionUtils;
import reader.JsonCorpusUtil;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolSegmenter;

/**
 * Collection reader that turns the documents of a JSON corpus dump into CASes.
 *
 * <p>
 * For every document the HTML text is reduced to plain text with Jsoup, the
 * document text, language and {@link DocumentMetaData} are set, and every
 * argument unit annotated by the configured annotator is added twice: as a
 * {@link Paragraph} spanning the unit and as a {@link NamedEntity} over the same
 * span whose value is the unit's label.
 * </p>
 *
 * <p>
 * The reader assumes as input the file
 * {@code 20140120_dump__after_overlapping_annotations.json} where the metadata
 * header has been removed manually. This annotated corpus contains all 88
 * documents which have been annotated: 8 documents in the pilot phase and 80
 * documents in the main study. The file is read as UTF-8.
 * </p>
 *
 * @author Roland Kluge
 * @author Judith Eckle-Kohler
 */
public class ArgminCorpusReader extends JCasCollectionReader_ImplBase {

    public static final String PARAM_INPUT_FILE = "inputFile";
    @ConfigurationParameter(name = PARAM_INPUT_FILE, mandatory = true, description = "JSON input file")
    private File inputFile;

    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true, description = "two-letter language code")
    private String language;

    public static final String PARAM_ANNOTATOR = "annotator";
    @ConfigurationParameter(name = PARAM_ANNOTATOR, mandatory = true, description = "The annotator whose annotations shall be included")
    private String annotator;

    /** All documents parsed from the input file, one map per document. */
    private Collection<Map<String, Object>> jsonTexts;
    /** Number of documents already handed out by {@link #getNext(JCas)}; drives progress reporting. */
    private int nextDocumentIdx;
    private Iterator<Map<String, Object>> documentsIterator;
    /** Running number of argument units added so far (across all documents). */
    private int counter;
    /** Running number of argument units whose label does not contain "claim". */
    private int premises;
    /** Running number of argument units whose label contains "claim". */
    private int claims;

    /**
     * Parses the configured JSON input file and resets all counters.
     *
     * @param context
     *            the UIMA context handed to the reader
     * @throws ResourceInitializationException
     *             if the file cannot be read, is not valid JSON, or its top-level
     *             element is not a JSON array
     */
    @Override
    public void initialize(final UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        try {
            // Read with an explicit charset: the single-argument overload silently
            // falls back to the platform default, which makes results machine-dependent.
            final String inputString = FileUtils.readFileToString(this.inputFile, "UTF-8");
            final Object parsedRoot = new JSONParser().parse(inputString);

            if (!(parsedRoot instanceof List)) {
                throw new ResourceInitializationException(new IllegalArgumentException(
                        "Expected a JSON array of documents at the top level of " + this.inputFile));
            }

            @SuppressWarnings("unchecked")
            final List<Map<String, Object>> parsedTexts = (List<Map<String, Object>>) parsedRoot;
            this.jsonTexts = new ArrayList<Map<String, Object>>(parsedTexts);
            System.out.println("number of json texts: " + this.jsonTexts.size());

            this.nextDocumentIdx = 0;
            this.documentsIterator = this.jsonTexts.iterator();
            this.counter = 0;
            this.premises = 0;
            this.claims = 0;
        } catch (final IOException e) {
            throw new ResourceInitializationException(e);
        } catch (final ParseException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * @return {@code true} while there are documents that have not been read yet
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return this.documentsIterator.hasNext();
    }

    /**
     * @return the total number of documents parsed from the input file
     */
    private int getNumDocuments() {
        return this.jsonTexts.size();
    }

    /**
     * @return a single progress entry: documents read so far out of all documents
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] { new ProgressImpl(this.nextDocumentIdx, this.getNumDocuments(), Progress.ENTITIES) };
    }

    /**
     * Fills the given CAS with the next document and the argument units of the
     * configured annotator.
     *
     * @param aJcas
     *            the CAS to populate
     * @throws CollectionException
     *             if building the index-to-token mapping or the CAS fails
     */
    @Override
    public void getNext(JCas aJcas) throws CollectionException {
        final Map<String, Object> jsonData = this.documentsIterator.next();
        // Advance progress exactly once per consumed document, regardless of how
        // many (if any) annotation entries of the configured annotator it contains.
        ++this.nextDocumentIdx;

        try {
            final String htmlText = (String) jsonData.get(JsonCorpusUtil.TEXT);
            final String rawDocumentText = Jsoup.parse(htmlText).text();

            final String file = (String) jsonData.get(JsonCorpusUtil.FILE);
            final String documentId = file.replace(".json", "");
            final String url = (String) jsonData.get(JsonCorpusUtil.URL);

            // The original HTML version is not required for the TC experiment, so
            // only the cleaned text is stored.
            aJcas.setDocumentText(rawDocumentText);
            aJcas.setDocumentLanguage(this.language);

            final DocumentMetaData metaData = DocumentMetaData.create(aJcas);
            metaData.setDocumentBaseUri("");
            metaData.setDocumentUri("/" + documentId);
            metaData.setDocumentTitle(url);
            metaData.setDocumentId(documentId);

            final Map<Integer, Token> idxToTokenMapping = this.createIndexToTokenMapping(rawDocumentText);

            @SuppressWarnings("unchecked")
            final List<Map<String, Object>> userAnnotations = (List<Map<String, Object>>) jsonData
                    .get(JsonCorpusUtil.USER_ANNOTATIONS);
            if (userAnnotations == null) {
                // Document without any annotation section: nothing to add.
                return;
            }

            for (final Map<String, Object> userAnnotation : userAnnotations) {
                final String annotationAuthor = (String) userAnnotation.get(JsonCorpusUtil.ANNOTATOR);
                // Compare from the (mandatory, hence non-null) parameter side so that
                // an entry without annotator is skipped instead of raising an NPE.
                if (this.annotator.equals(annotationAuthor)) {
                    this.addArgumentUnits(aJcas, userAnnotation, idxToTokenMapping, annotationAuthor);
                }
            }
        } catch (final UIMAException e) {
            // Also covers CASException and ResourceInitializationException (both subclasses).
            throw new CollectionException(e);
        }
    }

    /**
     * Adds all argument units of one annotation entry to the CAS.
     */
    private void addArgumentUnits(final JCas aJcas, final Map<String, Object> userAnnotation,
            final Map<Integer, Token> idxToTokenMapping, final String annotationAuthor) {
        @SuppressWarnings("unchecked")
        final List<String> argUnits = (List<String>) userAnnotation.get(JsonCorpusUtil.ARGUMENTATION_UNITS);
        if (argUnits == null) {
            return;
        }
        for (final String argUnit : argUnits) {
            this.addArgumentUnit(aJcas, argUnit, idxToTokenMapping, annotationAuthor);
        }
    }

    /**
     * Parses one serialized argument unit and, if it is well-formed and its
     * boundary tokens are known, adds the corresponding annotations to the CAS.
     */
    private void addArgumentUnit(final JCas aJcas, final String argUnit,
            final Map<Integer, Token> idxToTokenMapping, final String annotationAuthor) {
        final String cleanedArgUnit = argUnit.replaceAll("\\s+", "");
        final Matcher matcher = JsonCorpusUtil.getRecognitionPattern().matcher(cleanedArgUnit);
        if (!matcher.matches()) {
            this.getLogger().warn(String.format("argument unit %s does not match the expected pattern %s",
                    cleanedArgUnit, JsonCorpusUtil.getRecognitionPattern().pattern()));
            return;
        }

        // Coordinates of the argument unit: group 1 is used as label, group 3 as a
        // comma-separated index list (a leading comma is stripped).
        final String label = matcher.group(1);
        final String stringIndices = matcher.group(3).replaceAll("^,", "");
        final List<Integer> indices = CollectionUtils.parseIntList(stringIndices, ",");
        if (indices.isEmpty()) {
            this.getLogger().warn(String.format("argument unit %s has no token indices, skipping it", cleanedArgUnit));
            return;
        }

        final int firstIndex = Collections.min(indices);
        final int lastIndex = Collections.max(indices);
        final Token firstToken = idxToTokenMapping.get(firstIndex);
        final Token lastToken = idxToTokenMapping.get(lastIndex);
        if (firstToken == null || lastToken == null) {
            // The unit refers to an index the mapping does not contain; skip it
            // rather than aborting the whole collection with a NullPointerException.
            this.getLogger().warn(String.format("argument unit %s refers to unknown token index %d or %d, skipping it",
                    cleanedArgUnit, firstIndex, lastIndex));
            return;
        }

        final int begin = firstToken.getBegin();
        final int end = lastToken.getEnd();

        // Read argument unit as Paragraph annotation
        final Paragraph para = new Paragraph(aJcas, begin, end);
        para.addToIndexes();

        // print some counts:
        System.out.println("annotator: " + annotationAuthor);
        this.counter++;
        System.out.println("AU " + this.counter + " -- argument unit text: " + para.getCoveredText());
        System.out.println("label: " + label);
        if (label.contains("claim")) {
            this.claims++;
        } else {
            this.premises++;
        }
        System.out.println("premises " + this.premises + "\t claims " + this.claims);

        // The label is carried by a NamedEntity over the same span.
        final NamedEntity outcome = new NamedEntity(aJcas, begin, end);
        outcome.setValue(label);
        outcome.addToIndexes();
    }

    /**
     * Runs the {@link LanguageToolSegmenter} on a temporary CAS holding the given
     * text and returns the index-to-token mapping computed by
     * {@link ArgumentUnitUtils#mapIndexToAnnotation} starting at
     * {@link JsonCorpusUtil#FIRST_TOKEN_IDX}.
     *
     * <p>
     * NOTE(review): the returned tokens belong to the temporary CAS; callers here
     * only use their begin/end offsets, which are valid for any CAS holding the
     * same text.
     * </p>
     *
     * @param rawDocumentText
     *            the plain document text to segment
     * @return mapping from index to token
     * @throws UIMAException
     *             if the temporary CAS cannot be created or the pipeline fails
     */
    protected final Map<Integer, Token> createIndexToTokenMapping(final String rawDocumentText)
            throws UIMAException, AnalysisEngineProcessException, ResourceInitializationException {
        final JCas dummyJCas = JCasFactory.createJCas();
        dummyJCas.setDocumentText(rawDocumentText);
        dummyJCas.setDocumentLanguage(this.language);
        SimplePipeline.runPipeline(dummyJCas, createEngineDescription(LanguageToolSegmenter.class));

        return ArgumentUnitUtils.mapIndexToAnnotation(dummyJCas, Token.class, JsonCorpusUtil.FIRST_TOKEN_IDX);
    }

}