Java tutorial
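This tutorial walks through AnnotationStream2Csv, a small command-line utility from the BYU NLP codebase. It reads a json annotation stream and flattens it into a csv (one row per annotation, instance, or annotator) for further analysis. The full source follows, lightly commented.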
/**
 * Copyright 2015 Brigham Young University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.byu.nlp.data.app;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicates;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multiset;
import com.google.common.collect.Sets;
import com.google.common.io.Files;

import edu.byu.nlp.data.FlatInstance;
import edu.byu.nlp.data.docs.CountCutoffFeatureSelectorFactory;
import edu.byu.nlp.data.docs.DocPipes;
import edu.byu.nlp.data.docs.FeatureSelectorFactories;
import edu.byu.nlp.data.docs.JSONDocumentDatasetBuilder;
import edu.byu.nlp.data.docs.TopNPerDocumentFeatureSelectorFactory;
import edu.byu.nlp.data.types.Dataset;
import edu.byu.nlp.data.types.DatasetInstance;
import edu.byu.nlp.data.types.SparseFeatureVector;
import edu.byu.nlp.dataset.Datasets;
import edu.byu.nlp.io.Paths;
import edu.byu.nlp.util.Enumeration;
import edu.byu.nlp.util.Iterables2;
import edu.byu.nlp.util.jargparser.ArgumentParser;
import edu.byu.nlp.util.jargparser.annotations.Option;

/**
 * Read in a json annotation stream and output a csv file capturing the
 * properties of the annotations (for further analysis).
 * Rows depend on the setting of the --row parameter.
 * Columns include annotator id, start time, end time, annotation value,
 * instance source, etc.
 *
 * @author plf1
 */
public class AnnotationStream2Csv {

  private static Logger logger = LoggerFactory.getLogger(AnnotationStream2Csv.class);

  private static enum ROW { ANNOTATION, INSTANCE, ANNOTATOR };

  @Option(help = "What each csv row represents: one row per ANNOTATION, per INSTANCE, or per ANNOTATOR.")
  private static ROW row = ROW.ANNOTATION;

  @Option(help = "A json annotation stream containing annotations to be fitted.")
  private static String jsonStream = "/aml/data/plf1/cfgroups/cfgroups1000.json";

  @Option(help = "The file where csv should be written")
  private static String out = null;

  public static void main(String[] args) throws IOException {
    // parse CLI arguments
    new ArgumentParser(AnnotationStream2Csv.class).parseArgs(args);
    Preconditions.checkNotNull(jsonStream, "You must provide a valid --json-stream!");

    Dataset data = readData(jsonStream);

    // optionally aggregate by instance
    String header = "annotator,start,end,annotation,label,source,num_correct_annotations,num_annotations,cum_num_annotations,num_annotators,cum_num_annotators\n";

    // iterate over instances and (optionally) annotations
    final StringBuilder bld = new StringBuilder();
    switch (row) {
    case ANNOTATION:
      // sort all annotations by end time
      Map<FlatInstance<SparseFeatureVector, Integer>, DatasetInstance> ann2InstMap = Maps.newIdentityHashMap();
      List<FlatInstance<SparseFeatureVector, Integer>> annotationList = Lists.newArrayList();
      for (DatasetInstance inst : data) {
        for (FlatInstance<SparseFeatureVector, Integer> ann : inst.getAnnotations().getRawAnnotations()) {
          ann2InstMap.put(ann, inst); // record the instance of each annotation
          annotationList.add(ann);
        }
      }
      Collections.sort(annotationList, new Comparator<FlatInstance<SparseFeatureVector, Integer>>() {
        @Override
        public int compare(FlatInstance<SparseFeatureVector, Integer> o1,
            FlatInstance<SparseFeatureVector, Integer> o2) {
          // no null checking since we want to fail if annotation time is not set.
          return Long.compare(o1.getEndTimestamp(), o2.getEndTimestamp());
        }
      });

      Set<Integer> annotators = Sets.newHashSet();
      for (Enumeration<FlatInstance<SparseFeatureVector, Integer>> item : Iterables2.enumerate(annotationList)) {
        FlatInstance<SparseFeatureVector, Integer> ann = item.getElement();
        DatasetInstance inst = ann2InstMap.get(ann);
        annotators.add(ann.getAnnotator());

        bld.append(ann.getAnnotator() + ",");
        bld.append(ann.getStartTimestamp() + ",");
        bld.append(ann.getEndTimestamp() + ",");
        bld.append(ann.getAnnotation() + ",");
        bld.append(inst.getLabel() + ",");
        bld.append(data.getInfo().getIndexers().getInstanceIdIndexer().get(inst.getInfo().getSource()) + ",");
        // num correct: compare with equals() since these are boxed Integers, not ints
        bld.append((!inst.hasLabel() ? "NA" : ann.getAnnotation().equals(inst.getLabel()) ? 1 : 0) + ",");
        bld.append(1 + ","); // num annotations
        bld.append((item.getIndex() + 1) + ","); // cumulative num annotations
        bld.append(1 + ","); // num annotators
        bld.append(annotators.size() + ""); // cumulative num annotators
        bld.append("\n");
      }
      break;

    case INSTANCE:
      int cumNumAnnotations = 0;
      for (DatasetInstance inst : data) {
        cumNumAnnotations += inst.getInfo().getNumAnnotations();

        // sum over all the annotators who put the correct answer (if available)
        int numCorrectAnnotations = 0;
        if (inst.hasLabel()) {
          Integer correctLabel = inst.getLabel();
          for (int j = 0; j < data.getInfo().getNumAnnotators(); j++) {
            numCorrectAnnotations += inst.getAnnotations().getLabelAnnotations().getRow(j)[correctLabel];
          }
        }

        bld.append("NA,"); // annotator
        bld.append("NA,"); // start
        bld.append("NA,"); // end
        bld.append("NA,"); // annotation
        bld.append(inst.getLabel() + ",");
        bld.append(inst.getInfo().getSource() + ",");
        bld.append(numCorrectAnnotations + ",");
        bld.append(inst.getInfo().getNumAnnotations() + ",");
        bld.append(cumNumAnnotations + ",");
        bld.append(inst.getInfo().getNumAnnotators() + ",");
        bld.append("NA"); // cumulative num annotators
        bld.append("\n");
      }
      break;

    case ANNOTATOR:
      Multiset<Integer> perAnnotatorAnnotationCounts = HashMultiset.create();
      Multiset<Integer> perAnnotatorCorrectAnnotationCounts = HashMultiset.create();
      for (DatasetInstance inst : data) {
        for (FlatInstance<SparseFeatureVector, Integer> ann : inst.getAnnotations().getRawAnnotations()) {
          int annotatorIndex = ann.getAnnotator();
          perAnnotatorAnnotationCounts.add(annotatorIndex);
          // guard against missing labels and compare with equals() (boxed Integers)
          if (inst.hasLabel() && inst.getLabel().equals(ann.getAnnotation())) {
            perAnnotatorCorrectAnnotationCounts.add(annotatorIndex);
          }
        }
      }
      // The count multisets are keyed by annotator index (what getAnnotator() returns),
      // so look counts up by index rather than by the raw annotator id string, which
      // would always count 0. This assumes the indexer iterates its ids in index order.
      for (Enumeration<String> item : Iterables2.enumerate(data.getInfo().getAnnotatorIdIndexer())) {
        bld.append(item.getElement() + ","); // annotator
        bld.append("NA,"); // start
        bld.append("NA,"); // end
        bld.append("NA,"); // annotation
        bld.append("NA,"); // label
        bld.append("NA,"); // source
        bld.append(perAnnotatorCorrectAnnotationCounts.count(item.getIndex()) + ",");
        bld.append(perAnnotatorAnnotationCounts.count(item.getIndex()) + ",");
        bld.append("NA,"); // cumulative num annotations
        bld.append("1,"); // num annotators
        bld.append("NA"); // cumulative num annotators
        bld.append("\n");
      }
      break;

    default:
      Preconditions.checkArgument(false, "unknown row type: " + row);
      break;
    }

    // write to the given file, or to the console if no --out was given
    if (out == null) {
      // print (not println), since the header already ends with a newline
      System.out.print(header);
      System.out.print(bld.toString());
    } else {
      File outfile = new File(out);
      Files.write(header, outfile, Charsets.UTF_8);
      Files.append(bld, outfile, Charsets.UTF_8);
    }
  }

  private static Dataset readData(String jsonStream) throws IOException {
    // these parameters are not important since we ignore the data itself
    // and concentrate only on annotations in this script
    int featureCountCutoff = -1;
    int topNFeaturesPerDocument = -1;
    Integer featureNormalizer = null;
    Function<String, String> docTransform = null;
    Function<String, String> tokenTransform = null;

    // data reader pipeline per dataset:
    // build a dataset, doing all the tokenizing, stopword removal, and feature normalization
    String folder = Paths.directory(jsonStream);
    String file = Paths.baseName(jsonStream);
    Dataset data = new JSONDocumentDatasetBuilder(folder, file, docTransform,
        DocPipes.opennlpSentenceSplitter(), DocPipes.McCallumAndNigamTokenizer(), tokenTransform,
        FeatureSelectorFactories.conjoin(
            new CountCutoffFeatureSelectorFactory<String>(featureCountCutoff),
            (topNFeaturesPerDocument < 0) ? null
                : new TopNPerDocumentFeatureSelectorFactory(topNFeaturesPerDocument)),
        featureNormalizer).dataset();

    // Postprocessing: remove all documents with duplicate sources or empty features
    data = Datasets.filteredDataset(data,
        Predicates.and(Datasets.filterDuplicateSources(), Datasets.filterNonEmpty()));

    logger.info("Number of labeled instances = " + data.getInfo().getNumDocumentsWithObservedLabels());
    logger.info("Number of unlabeled instances = " + data.getInfo().getNumDocumentsWithoutObservedLabels());
    logger.info("Number of tokens = " + data.getInfo().getNumTokens());
    logger.info("Number of features = " + data.getInfo().getNumFeatures());
    logger.info("Number of classes = " + data.getInfo().getNumClasses());
    // cast to double to avoid integer division when computing the average
    logger.info("Average Document Size = "
        + ((double) data.getInfo().getNumTokens() / data.getInfo().getNumDocuments()));

    return data;
  }
}
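To round out the tutorial, here is a minimal usage sketch. The --json-stream flag name is confirmed by the Preconditions message above and --row by the class javadoc; the --out flag name, the exact flag syntax, and all file paths are assumptions for illustration.

// Hypothetical command-line invocation (the exact flag syntax jargparser
// accepts may differ slightly):
//
//   java -cp <classpath> edu.byu.nlp.data.app.AnnotationStream2Csv \
//       --json-stream=/path/to/annotations.json --row=INSTANCE --out=instances.csv
//
// The same run, driven programmatically by delegating to main():
package edu.byu.nlp.data.app.demo;

import java.io.IOException;

import edu.byu.nlp.data.app.AnnotationStream2Csv;

public class AnnotationStream2CsvDemo {
  public static void main(String[] args) throws IOException {
    // one csv row per INSTANCE, written to instances.csv (paths are placeholders)
    AnnotationStream2Csv.main(new String[] {
        "--json-stream=/path/to/annotations.json",
        "--row=INSTANCE",
        "--out=instances.csv"
    });
  }
}

Running with --out omitted prints the header and rows to the console instead of a file, which is handy for a quick sanity check before writing the csv.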