de.citec.csra.elancsv.parser.SimpleParser.java Source code

Java tutorial

Introduction

Here is the source code for de.citec.csra.elancsv.parser.SimpleParser.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package de.citec.csra.elancsv.parser;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

/**
 *
 * @author Patrick Holthaus
 * (<a href=mailto:patrick.holthaus@uni-bielefeld.de>patrick.holthaus@uni-bielefeld.de</a>)
 */
public class SimpleParser {

    enum ElanFormat {

        TIER(0), START(1), STOP(2), DURATION(3), VALUE(4), FILE(5);

        private ElanFormat(int field) {
            this.field = field;
        }

        final int field;

    }

    private static void helpExit(Options opts, String header) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("elancsv-parser [OPTION...]", header, opts, null);
        System.exit(0);
    }

    public static void main(String[] args) throws IOException, ParseException {

        Options opts = new Options();
        opts.addOption("file", true, "Tab-separated ELAN export file to load.");
        opts.addOption("tier", true,
                "Tier to analyze. Optional: Append ::num to interpret annotations numerically.");
        opts.addOption("format", true,
                "How to read information from the file name. %V -> participant, %A -> annoatator, %C -> condition, e.g. \"%V - %A\"");
        opts.addOption("help", false, "Print this help and exit");

        CommandLineParser parser = new BasicParser();
        CommandLine cmd = parser.parse(opts, args);
        if (cmd.hasOption("help")) {
            helpExit(opts, "where OPTION includes:");
        }

        String infile = cmd.getOptionValue("file");
        if (infile == null) {
            helpExit(opts, "Error: no file given.");
        }

        String format = cmd.getOptionValue("format");
        if (format == null) {
            helpExit(opts, "Error: no format given.");
        }

        String tier = cmd.getOptionValue("tier");
        if (tier == null) {
            helpExit(opts, "Error: no tier given.");
        }

        //      TODO count values in annotations (e.g. search all robot occurrences)
        String[] tn = tier.split("::");
        boolean numeric = false;
        if (tn.length == 2 && tn[1].equals("num")) {
            numeric = true;
            tier = tn[0];
        }

        format = "^" + format + "$";
        format = format.replaceFirst("%V", "(?<V>.*?)");
        format = format.replaceFirst("%A", "(?<A>.*?)");
        format = format.replaceFirst("%C", "(?<C>.*?)");
        Pattern pa = Pattern.compile(format);

        Map<String, Participant> participants = new HashMap<>();
        BufferedReader br = new BufferedReader(new FileReader(infile));
        String line;
        int lineno = 0;
        while ((line = br.readLine()) != null) {
            String[] parts = line.split("\t");
            lineno++;
            if (parts.length < 5) {
                System.err.println("WARNING: line '" + lineno + "' too short '" + line + "'");
                continue;
            }
            Annotation a = new Annotation(Long.valueOf(parts[ElanFormat.START.field]),
                    Long.valueOf(parts[ElanFormat.STOP.field]), Long.valueOf(parts[ElanFormat.DURATION.field]),
                    parts[ElanFormat.VALUE.field]);
            String tname = parts[ElanFormat.TIER.field];
            String file = parts[ElanFormat.FILE.field].replaceAll(".eaf", "");

            Matcher m = pa.matcher(file);
            String vp = file;
            String condition = "?";
            String annotator = "?";
            String participantID = vp;

            if (m.find()) {
                vp = m.group("V");
                if (format.indexOf("<A>") > 0) {
                    annotator = m.group("A");
                }

                if (format.indexOf("<C>") > 0) {
                    condition = m.group("C");
                }
            }
            participantID = vp + ";" + annotator;

            if (!participants.containsKey(participantID)) {
                participants.put(participantID, new Participant(vp, condition, annotator));
            }
            Participant p = participants.get(participantID);

            if (!p.tiers.containsKey(tname)) {
                p.tiers.put(tname, new Tier(tname));
            }

            p.tiers.get(tname).annotations.add(a);

        }

        Map<String, Map<String, Number>> values = new HashMap<>();
        Set<String> rownames = new HashSet<>();

        String allCountKey = "c: all values";
        String allDurationKey = "d: all values";
        String allMeanKey = "m: all values";

        for (Map.Entry<String, Participant> e : participants.entrySet()) {
            //         System.out.println(e);
            Tier t = e.getValue().tiers.get(tier);
            String participantID = e.getKey();

            if (!values.containsKey(participantID)) {
                values.put(participantID, new HashMap<String, Number>());
            }
            Map<String, Number> row = values.get(participantID); //participant id

            if (t != null) {

                row.put(allCountKey, 0l);
                row.put(allDurationKey, 0l);
                row.put(allMeanKey, 0l);

                for (Annotation a : t.annotations) {

                    long countAll = (long) row.get(allCountKey) + 1;
                    long durationAll = (long) row.get(allDurationKey) + a.duration;
                    long meanAll = durationAll / countAll;

                    row.put(allCountKey, countAll);
                    row.put(allDurationKey, durationAll);
                    row.put(allMeanKey, meanAll);

                    if (!numeric) {
                        String countKey = "c: " + a.value;
                        String durationKey = "d: " + a.value;
                        String meanKey = "m: " + a.value;

                        if (!row.containsKey(countKey)) {
                            row.put(countKey, 0l);
                        }
                        if (!row.containsKey(durationKey)) {
                            row.put(durationKey, 0l);
                        }
                        if (!row.containsKey(meanKey)) {
                            row.put(meanKey, 0d);
                        }

                        long count = (long) row.get(countKey) + 1;
                        long duration = (long) row.get(durationKey) + a.duration;
                        double mean = duration * 1.0 / count;

                        row.put(countKey, count);
                        row.put(durationKey, duration);
                        row.put(meanKey, mean);

                        rownames.add(countKey);
                        rownames.add(durationKey);
                        rownames.add(meanKey);
                    } else {
                        String countKey = "c: " + t.name;
                        String sumKey = "s: " + t.name;
                        String meanKey = "m: " + t.name;

                        if (!row.containsKey(countKey)) {
                            row.put(countKey, 0l);
                        }
                        if (!row.containsKey(sumKey)) {
                            row.put(sumKey, 0d);
                        }
                        if (!row.containsKey(meanKey)) {
                            row.put(meanKey, 0d);
                        }

                        double d = 0;
                        try {
                            d = Double.valueOf(a.value);
                        } catch (NumberFormatException ex) {

                        }

                        long count = (long) row.get(countKey) + 1;
                        double sum = (double) row.get(sumKey) + d;
                        double mean = sum / count;

                        row.put(countKey, count);
                        row.put(sumKey, sum);
                        row.put(meanKey, mean);

                        rownames.add(countKey);
                        rownames.add(sumKey);
                        rownames.add(meanKey);
                    }

                }
            }

        }

        ArrayList<String> list = new ArrayList(rownames);
        Collections.sort(list);
        StringBuilder header = new StringBuilder("ID;Annotator;");
        header.append(allCountKey);
        header.append(";");
        header.append(allDurationKey);
        header.append(";");
        header.append(allMeanKey);
        header.append(";");
        for (String l : list) {
            header.append(l);
            header.append(";");
        }
        System.out.println(header);

        for (Map.Entry<String, Map<String, Number>> e : values.entrySet()) {
            StringBuilder row = new StringBuilder(e.getKey());
            row.append(";");
            if (e.getValue().containsKey(allCountKey)) {
                row.append(e.getValue().get(allCountKey));
            } else {
                row.append("0");
            }
            row.append(";");
            if (e.getValue().containsKey(allDurationKey)) {
                row.append(e.getValue().get(allDurationKey));
            } else {
                row.append("0");
            }
            row.append(";");
            if (e.getValue().containsKey(allMeanKey)) {
                row.append(e.getValue().get(allMeanKey));
            } else {
                row.append("0");
            }
            row.append(";");
            for (String l : list) {
                if (e.getValue().containsKey(l)) {
                    row.append(e.getValue().get(l));
                } else {
                    row.append("0");
                }
                row.append(";");
            }
            System.out.println(row);
        }
    }
}