reader.ArgumentUnitTCReader.java Source code

Java tutorial

Introduction

Here is the source code for reader.ArgumentUnitTCReader.java

Source

/*******************************************************************************
 * Copyright 2015
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package reader;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;

import org.apache.commons.io.FileUtils;
import org.apache.uima.UIMAException;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.jsoup.Jsoup;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.languagetool.LanguageToolSegmenter;
import de.tudarmstadt.ukp.dkpro.tc.api.io.TCReaderSingleLabel;
import de.tudarmstadt.ukp.dkpro.tc.api.type.TextClassificationOutcome;

/**
 * Collection reader that turns every argument unit annotated by one annotator into its own
 * document (CAS) and sets the (generalized) annotated label as the classification outcome.
 * Used in document classification with cross validation (in order to explore linguistic
 * features); document = argument unit.
 *
 * <p>
 * Assumes as input the file 20140120_dump__after_overlapping_annotations.json where the metadata
 * header has been removed manually. This annotated corpus contains all 88 documents which have
 * been annotated: 8 documents in the pilot phase and 80 documents in the main study.
 *
 * <p>
 * The whole input file is read and converted during {@link #initialize(UimaContext)}; afterwards
 * {@link #getNext(JCas)} only hands out the pre-computed texts and labels one by one.
 *
 * @author Judith Eckle-Kohler
 * @author Roland Kluge
 */
public class ArgumentUnitTCReader extends JCasCollectionReader_ImplBase implements TCReaderSingleLabel

{

    /** Character encoding used to read the JSON input file. */
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    /** JSON file containing the annotated documents. */
    public static final String PARAM_INPUT_FILE = "inputFile";
    @ConfigurationParameter(name = PARAM_INPUT_FILE, mandatory = true, description = "JSON input file")
    private File inputFile;

    /** Language set on every produced CAS and used for tokenization. */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true, description = "two-letter language code")
    private String language;

    /** Only annotations made by this annotator are read. */
    public static final String PARAM_ANNOTATOR = "annotator";
    @ConfigurationParameter(name = PARAM_ANNOTATOR, mandatory = true, description = "The annotator whose annotations shall be included")
    private String annotator;

    // labels.get(i) is the outcome belonging to texts.get(i)
    private List<String> labels;
    private List<String> texts;

    // index of the next argument unit handed out by getNext()
    private int offset;

    /**
     * Reads the JSON input file and collects the text and the generalized label of every argument
     * unit annotated by the configured annotator.
     *
     * @param context
     *            the UIMA context
     * @throws ResourceInitializationException
     *             if the input file cannot be read or parsed, or if tokenizing a document fails
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        labels = new ArrayList<String>();
        texts = new ArrayList<String>();
        offset = 0;

        try {
            // honour the configured encoding instead of the platform default charset
            String inputString = FileUtils.readFileToString(this.inputFile, this.encoding);

            @SuppressWarnings("unchecked")
            List<Map<String, Object>> jsonTexts = (List<Map<String, Object>>) new JSONParser()
                    .parse(inputString);

            for (Map<String, Object> jsonData : jsonTexts) {
                collectArgumentUnits(jsonData);
            }
        } catch (final IOException e) {
            throw new ResourceInitializationException(e);
        } catch (final ParseException e) {
            throw new ResourceInitializationException(e);
        } catch (final UIMAException e) {
            throw new ResourceInitializationException(e);
        }

        getLogger().info("number of AUs: " + texts.size());
    }

    /**
     * Collects all argument units of one JSON document that were annotated by the configured
     * annotator.
     *
     * @param jsonData
     *            one document entry of the parsed JSON file
     * @throws UIMAException
     *             if tokenizing the document text fails
     */
    private void collectArgumentUnits(Map<String, Object> jsonData) throws UIMAException {
        @SuppressWarnings("unchecked")
        List<Map<String, Object>> userAnnotations = (List<Map<String, Object>>) jsonData
                .get(JsonCorpusUtil.USER_ANNOTATIONS);

        for (Map<String, Object> userAnnotation : userAnnotations) {
            // the configured annotator is mandatory (never null), so compare from that side to
            // tolerate annotations without an annotator entry
            if (!this.annotator.equals(userAnnotation.get(JsonCorpusUtil.ANNOTATOR))) {
                continue;
            }

            // strip the HTML markup; the token indices are resolved against the plain text
            String htmlText = (String) jsonData.get(JsonCorpusUtil.TEXT);
            String rawDocumentText = Jsoup.parse(htmlText).text();
            Map<Integer, Token> idxToTokenMapping = this.createIndexToTokenMapping(rawDocumentText);

            @SuppressWarnings("unchecked")
            List<String> argUnits = (List<String>) userAnnotation.get(JsonCorpusUtil.ARGUMENTATION_UNITS);

            for (String argUnit : argUnits) {
                addArgumentUnit(argUnit, rawDocumentText, idxToTokenMapping);
            }
        }
    }

    /**
     * Parses one serialized argument unit and, if it is well-formed, appends its covered text and
     * generalized label to {@link #texts} and {@link #labels}. Units that cannot be interpreted
     * are skipped with a warning.
     *
     * @param argUnit
     *            the serialized argument unit as found in the JSON file
     * @param rawDocumentText
     *            the plain text of the document the unit belongs to
     * @param idxToTokenMapping
     *            mapping from token index to token of {@code rawDocumentText}
     */
    private void addArgumentUnit(String argUnit, String rawDocumentText,
            Map<Integer, Token> idxToTokenMapping) {
        String cleanedArgUnit = argUnit.replaceAll("\\s+", "");
        Matcher matcher = JsonCorpusUtil.getRecognitionPattern().matcher(cleanedArgUnit);
        if (!matcher.matches()) {
            this.getLogger()
                    .warn(String.format(
                            "argument unit %s does not match the expected pattern %s",
                            cleanedArgUnit, JsonCorpusUtil.getRecognitionPattern().pattern()));
            return;
        }

        // group 1 is read as the label, group 3 as the comma-separated token indices
        String label = matcher.group(1);
        String generalizedLabel = getGeneralizedLabel(label);
        if (generalizedLabel == null) {
            // a null outcome must never end up in the label list
            this.getLogger().warn(String.format(
                    "argument unit %s has the unknown label %s and is skipped", cleanedArgUnit, label));
            return;
        }

        String stringIndices = matcher.group(3).replaceAll("^,", "");
        List<Integer> indices = CollectionUtils.parseIntList(stringIndices, ",");
        int firstIndex = Collections.min(indices);
        int lastIndex = Collections.max(indices);

        Token firstToken = idxToTokenMapping.get(firstIndex);
        Token lastToken = idxToTokenMapping.get(lastIndex);
        if (firstToken == null || lastToken == null) {
            this.getLogger().warn(String.format(
                    "argument unit %s refers to token indices %d-%d which do not exist in the document and is skipped",
                    cleanedArgUnit, firstIndex, lastIndex));
            return;
        }

        // the unit spans from the first to the last referenced token; taking the substring of the
        // document text yields the same string as the covered text of an annotation over that span
        texts.add(rawDocumentText.substring(firstToken.getBegin(), lastToken.getEnd()));
        labels.add(generalizedLabel);

        this.getLogger().debug("label: " + label + " general label: " + generalizedLabel);
    }

    /**
     * Maps a fine-grained annotation label to the coarse label used as classification outcome.
     *
     * @param label
     *            the annotated label
     * @return {@code "claim"} for labels starting with "claim", {@code "premise"} for labels
     *         starting with "support" or "attack", and {@code null} for any other label
     */
    private String getGeneralizedLabel(String label) {
        if (label.startsWith("claim")) {
            return "claim";
        }
        if (label.startsWith("support") || label.startsWith("attack")) {
            return "premise";
        }
        return null;
    }

    /**
     * @return {@code true} while there are argument units that have not been handed out yet
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return offset < texts.size();
    }

    /**
     * @return the number of argument units handed out so far relative to the total number
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[] { new ProgressImpl(offset, texts.size(), "argunits") };
    }

    /**
     * Returns the label of the argument unit at the current offset. The given CAS is not
     * inspected; within {@link #getNext(JCas)} this is called before the offset is advanced.
     *
     * @param jcas
     *            unused
     * @return the generalized label of the current argument unit
     */
    @Override
    public String getTextClassificationOutcome(JCas jcas) throws CollectionException {
        return labels.get(offset);
    }

    /**
     * Fills the given CAS with the next argument unit: document text, language, document metadata
     * and the classification outcome.
     *
     * @param aJCas
     *            the CAS to fill
     */
    @Override
    public void getNext(JCas aJCas) throws IOException, CollectionException {
        // setting the document text
        aJCas.setDocumentText(texts.get(offset));
        aJCas.setDocumentLanguage(language);

        // as we are creating more than one CAS out of a single file, we need to have different
        // document titles and URIs for each CAS
        // otherwise, serialized CASes will be overwritten
        DocumentMetaData dmd = DocumentMetaData.create(aJCas);
        dmd.setDocumentTitle("Argunit" + offset);
        dmd.setDocumentUri("Argunit" + offset);
        dmd.setDocumentId(String.valueOf(offset));

        // setting the outcome / label for this document = argument unit
        TextClassificationOutcome outcome = new TextClassificationOutcome(aJCas);
        outcome.setOutcome(getTextClassificationOutcome(aJCas));
        outcome.addToIndexes();

        offset++;
    }

    /**
     * Segments the given text with the {@link LanguageToolSegmenter} in a temporary CAS and
     * returns the index-to-token mapping computed by
     * {@code ArgumentUnitUtils.mapIndexToAnnotation}, passing
     * {@code JsonCorpusUtil.FIRST_TOKEN_IDX} (presumably the index assigned to the first token;
     * verify against {@code ArgumentUnitUtils}).
     *
     * @param rawDocumentText
     *            the plain document text to tokenize
     * @return mapping from token index to the corresponding token
     * @throws UIMAException
     *             if creating the CAS or running the segmenter fails
     */
    protected final Map<Integer, Token> createIndexToTokenMapping(final String rawDocumentText)
            throws UIMAException, AnalysisEngineProcessException, ResourceInitializationException {
        final JCas dummyJCas = JCasFactory.createJCas();
        dummyJCas.setDocumentText(rawDocumentText);
        dummyJCas.setDocumentLanguage(this.language);
        SimplePipeline.runPipeline(dummyJCas, createEngineDescription(LanguageToolSegmenter.class));

        return ArgumentUnitUtils.mapIndexToAnnotation(dummyJCas, Token.class, JsonCorpusUtil.FIRST_TOKEN_IDX);
    }

}