jjj.asap.sas.parser.job.ImportParserData.java Source code

Introduction

Here is the source code for jjj.asap.sas.parser.job.ImportParserData.java
Source

/*
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Copyright (C) 2012 James Jesensky
 */

package jjj.asap.sas.parser.job;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import jjj.asap.sas.util.Contest;
import jjj.asap.sas.util.FileIterator;
import jjj.asap.sas.util.IOUtils;
import jjj.asap.sas.util.Job;
import jjj.asap.sas.util.StringUtils;
import jjj.asap.sas.weka.Dataset;
import jjj.asap.sas.weka.DatasetBuilder;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;

/**
 * Reads the raw text parser output and creates ARFF files from it.
 *
 * <p>For each parent directory ("t" and "u") the job reads
 * {@code parser/<parent>/parser.data}, groups the POS-tag, parse-tree and
 * dependency lines by essay id, and then writes, for every essay set 1-10,
 * the datasets {@code <set>-extra-stats.arff}, {@code <set>-pos-tags.arff},
 * {@code <set>-parse-tree.arff} and {@code <set>-depends0.arff} through
 * {@code <set>-depends6.arff} under {@code work/datasets/<parent>/}.
 */
public class ImportParserData extends Job {

    /** A line announcing a new essay: digits followed by {@code .txt.sp}. */
    private static final Pattern ID_PATTERN = Pattern.compile("[0-9]+\\.txt\\.sp");
    /** A line of slash-separated tokens, handled as a POS-tagged sentence. */
    private static final Pattern TAGS_PATTERN = Pattern.compile(".+/.+");
    /** A line starting with {@code (ROOT}, handled as a parse tree. */
    private static final Pattern TREE_PATTERN = Pattern.compile("^\\(ROOT.*");
    /** A line shaped like {@code NAME(A, B)}, handled as a dependency. */
    private static final Pattern DEPENDS_PATTERN = Pattern.compile("[^\\(]+\\(.+,.+\\)");
    /** Expected shape of a converted dependency: three slash-separated parts. */
    private static final Pattern TOKENS_PATTERN = Pattern.compile("^.+/.+/.+$");

    /** Matches the "-99, " that follows the first word of a dependency. */
    private static final Pattern FIRST_WORD_INDEX = Pattern.compile("-[0-9]+'*, ");
    /** Matches the "-99)" that ends a dependency line. */
    private static final Pattern SECOND_WORD_INDEX = Pattern.compile("-[0-9]+'*\\)$");

    /** Essay sets are numbered 1 through this value. */
    private static final int ESSAY_SET_COUNT = 10;
    /** Number of depends datasets written per essay set (depends0..depends6). */
    private static final int DEPENDS_VARIANTS = 7;
    /** Stored for a parsed essay that has no dependency lines at all. */
    private static final String NO_DEPENDS_PLACEHOLDER = "NONE/NONE/NONE";
    /** Text written for an essay that has no parser data of a given kind. */
    private static final String EMPTY_TEXT_PLACEHOLDER = "x";

    /** File name suffixes of every dataset this job writes for one essay set. */
    private static final String[] OUTPUT_SUFFIXES = { "-extra-stats.arff", "-pos-tags.arff", "-parse-tree.arff",
            "-depends0.arff", "-depends1.arff", "-depends2.arff", "-depends3.arff", "-depends4.arff",
            "-depends5.arff", "-depends6.arff" };

    /**
     * Starts the job.
     *
     * @param args ignored; the job always processes both the "t" and the "u"
     *             parent directories
     */
    public static void main(String[] args) {
        Job job = new ImportParserData();
        job.start();
    }

    /** Tags read from {@code data/pos.tags}; these are left out of the parse-tree text. */
    private Set<String> standardPosTags;

    /**
     * Loads the standard POS tags and then processes the "t" and "u" directories.
     */
    @Override
    protected void run() throws Exception {

        this.standardPosTags = new HashSet<String>();
        Iterator<String> it = new FileIterator("data/pos.tags");
        while (it.hasNext()) {
            standardPosTags.add(it.next());
        }

        process("t");
        process("u");

    }

    /**
     * Loads the parser data and processes it one essay at a time.
     *
     * <p>Nothing is read if every output dataset of every essay set already
     * exists. Lines shorter than three characters and lines that do not match
     * exactly one of the four line patterns are skipped with a warning.
     *
     * @param parent the parent directory of datasets and parser output
     */
    private void process(final String parent) {

        // check if output exists
        boolean anyMissing = false;
        for (int k = 1; k <= ESSAY_SET_COUNT && !anyMissing; k++) {
            anyMissing = isAnyOutputMissing(parent, k);
        }
        if (!anyMissing) {
            Job.log("NOTE", "work/datasets/" + parent + "/ has all required datasets - nothing to do");
            return;
        }

        // these maps will hold the data, keyed by essay id
        final Map<Double, List<String>> tags = new HashMap<Double, List<String>>();
        final Map<Double, List<String>> parseTrees = new HashMap<Double, List<String>>();
        final Map<Double, List<String>> depends = new HashMap<Double, List<String>>();

        // accumulators for the essay currently being read
        List<String> tag = new ArrayList<String>();
        List<String> parseTree = new ArrayList<String>();
        List<String> depend = new ArrayList<String>();

        // -1 means "no id line seen yet"
        double id = -1;

        Iterator<String> it = new FileIterator("parser/" + parent + "/parser.data");
        while (it.hasNext()) {

            String line = it.next();

            // skip blank or really short lines (only non-empty ones are reported)
            if (line.length() < 3) {
                if (line.length() > 0) {
                    Job.log("WARNING", "Short line: " + line);
                }
                continue;
            }

            // classify
            boolean isId = ID_PATTERN.matcher(line).matches();
            boolean isTag = TAGS_PATTERN.matcher(line).matches();
            boolean isTree = TREE_PATTERN.matcher(line).matches();
            boolean isDepend = DEPENDS_PATTERN.matcher(line).matches();

            int matchCount = getMatchCount(isId, isTag, isTree, isDepend);

            if (matchCount != 1) {
                Job.log("WARNING",
                        "Line does not match exactly 1 pattern. Expected 1 but matched " + matchCount + " matches ("
                                + isId + "," + isTag + "," + isTree + "," + isDepend + ") Line was: " + line);
                continue;
            }

            if (isId) {

                // a new essay starts: store what was collected for the previous one
                if (id != -1) {
                    storeEssay(id, tag, parseTree, depend, tags, parseTrees, depends);
                    tag = new ArrayList<String>();
                    parseTree = new ArrayList<String>();
                    depend = new ArrayList<String>();
                }

                id = Double.parseDouble(line.substring(0, line.indexOf('.')));

            } else if (isTag) {

                String posTags = extractPosTags(line);
                if (posTags != null) {
                    tag.add(posTags);
                }

            } else if (isTree) {

                String parseTags = extractParseTags(line);
                if (parseTags != null) {
                    parseTree.add(parseTags);
                }

            } else if (isDepend) {

                String token = toDependsToken(line);
                if (token != null) {
                    depend.add(token);
                }

            } else {
                throw new RuntimeException("should not ever reach here");
            }

        } // end while it has next

        // Store the last essay exactly like every other essay, so that it also
        // gets the placeholder when it has no dependencies. If no id line was
        // ever seen there is no essay to store.
        if (id != -1) {
            storeEssay(id, tag, parseTree, depend, tags, parseTrees, depends);
        } else {
            Job.log("WARNING", "No essay ids found in parser/" + parent + "/parser.data");
        }

        // now process for each essay set
        for (int k = 1; k <= ESSAY_SET_COUNT; k++) {
            process(parent, k, tags, parseTrees, depends);
        }
    }

    /**
     * Builds and saves all output datasets of one essay set.
     *
     * <p>The existing {@code <set>-spell-checked.arff} dataset supplies the row
     * ids, the optional color, the essay text and the score, and is also the
     * structural template for every output except the extra statistics.
     * Nothing is done if all output files of the essay set already exist.
     *
     * @param parent     the parent directory of datasets and parser output
     * @param essaySet   the essay set number
     * @param tags       POS-tag strings per essay id, one entry per tagged line
     * @param parseTrees parse-tree tag strings per essay id
     * @param depends    dependency tokens per essay id
     */
    private void process(final String parent, int essaySet, Map<Double, List<String>> tags,
            Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) {

        // check if output exists
        if (!isAnyOutputMissing(parent, essaySet)) {
            Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet
                    + "-*.arff has all required datasets - nothing to do");
            return;
        }

        final String outputDir = "work/datasets/" + parent + "/";

        // Load an existing dataset to use as a template.
        Instances dataset = Dataset.load(outputDir + essaySet + "-spell-checked.arff");
        final boolean multiChoice = Contest.isMultiChoice(essaySet);
        final int scoreIndex = dataset.numAttributes() - 1;

        // create the output datasets here. except for the extra statistics,
        // the format is the same as 'dataset'.

        Instances tagsData = new Instances(dataset, 0);
        tagsData.setRelationName(essaySet + "-pos-tags.arff");
        Instances treeData = new Instances(dataset, 0);
        treeData.setRelationName(essaySet + "-parse-tree.arff");

        Instances[] dependsData = new Instances[DEPENDS_VARIANTS];
        for (int j = 0; j < DEPENDS_VARIANTS; j++) {
            dependsData[j] = new Instances(dataset, 0);
            dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff");
        }

        // extra stats
        DatasetBuilder builder = new DatasetBuilder();
        builder.addVariable("id");
        if (multiChoice) {
            builder.addNominalVariable("color", Contest.COLORS);
        }
        builder.addVariable("x_sent");
        builder.addVariable("x_para");
        builder.addVariable("x_length");
        builder.addVariable("x_words");
        builder.addVariable("x_unique_words");
        builder.addNominalVariable("score", Contest.getRubrics(essaySet));

        Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff");

        // now add rows for each instance

        for (int i = 0; i < dataset.numInstances(); i++) {

            // common variables
            Instance ob = dataset.instance(i);
            double id = ob.value(0);
            String y = ob.isMissing(scoreIndex) ? null : ob.stringValue(scoreIndex);
            String color = multiChoice ? ob.stringValue(dataset.attribute("color")) : null;
            String str = ob.stringValue(dataset.attribute("text"));
            int width = ob.numAttributes();

            List<String> tagsList = tags.get(id);

            //
            // Extra stats
            //

            // one stored tag string per tagged line, used as the sentence count
            int nSent = tagsList == null ? 0 : tagsList.size();
            // every '^' in the text is counted towards x_para
            int nPara = 0;
            for (int a = 0; a < str.length(); a++) {
                if (str.charAt(a) == '^') {
                    nPara++;
                }
            }
            String[] words = str.toLowerCase().split(" ");
            Set<String> uniqueWords = new HashSet<String>();
            for (String w : words) {
                uniqueWords.add(w);
            }

            extraStats.add(new DenseInstance(extraStats.numAttributes()));
            Instance extra = extraStats.lastInstance();
            extra.setValue(0, id);
            if (multiChoice) {
                extra.setValue(1, color);
            }

            extra.setValue(extraStats.attribute("x_sent"), nSent);
            extra.setValue(extraStats.attribute("x_para"), nPara);
            extra.setValue(extraStats.attribute("x_length"), str.length());
            extra.setValue(extraStats.attribute("x_words"), words.length);
            extra.setValue(extraStats.attribute("x_unique_words"), uniqueWords.size());

            if (y == null) {
                extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue());
            } else {
                extra.setValue(extraStats.numAttributes() - 1, y);
            }

            //
            // POS tags
            //

            addTextRow(tagsData, width, multiChoice, id, color, joinOrPlaceholder(tagsList, "tags", id), y);

            //
            // Parse Tree
            //

            addTextRow(treeData, width, multiChoice, id, color,
                    joinOrPlaceholder(parseTrees.get(id), "parse tree", id), y);

            //
            // Depends data
            //

            String[] dependsTexts = buildDependsTexts(depends.get(id), id);
            for (int j = 0; j < DEPENDS_VARIANTS; j++) {
                addTextRow(dependsData[j], width, multiChoice, id, color, dependsTexts[j], y);
            }

        } // dataset

        // Now save the new datasets

        Dataset.save(outputDir + tagsData.relationName(), tagsData);
        Dataset.save(outputDir + treeData.relationName(), treeData);
        for (int j = 0; j < DEPENDS_VARIANTS; j++) {
            Dataset.save(outputDir + dependsData[j].relationName(), dependsData[j]);
        }
        Dataset.save(outputDir + extraStats.relationName(), extraStats);

    } // method

    /**
     * Tells whether at least one output dataset of an essay set is missing.
     *
     * @param parent   the parent directory of the datasets
     * @param essaySet the essay set number
     * @return {@code true} if any of the expected files does not exist
     */
    private static boolean isAnyOutputMissing(String parent, int essaySet) {
        for (String suffix : OUTPUT_SUFFIXES) {
            if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + suffix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Records everything collected for one essay in the per-id maps.
     *
     * <p>Warns when the number of tag lines and parse trees differ. An essay
     * without any dependency gets a single placeholder entry, which is added
     * to the given {@code depend} list.
     */
    private static void storeEssay(double id, List<String> tag, List<String> parseTree, List<String> depend,
            Map<Double, List<String>> tags, Map<Double, List<String>> parseTrees,
            Map<Double, List<String>> depends) {

        if (tag.size() != parseTree.size()) {
            Job.log("WARNING", "tags and parse tree counts differ for " + id + " tags=" + tag.size()
                    + " trees=" + parseTree.size());
        }

        if (depend.isEmpty()) {
            Job.log("WARNING", "No depends. found for " + id);
            depend.add(NO_DEPENDS_PLACEHOLDER);
        }

        tags.put(id, tag);
        parseTrees.put(id, parseTree);
        depends.put(id, depend);
    }

    /**
     * Reduces a line of space-separated {@code word/tag} tokens to its tags.
     *
     * @param line a line that matched the tags pattern
     * @return the tags, each followed by a space, or {@code null} (after a
     *         warning) if the line yields no tag
     */
    private static String extractPosTags(String line) {

        String[] allTagged = line.split(" ");
        if (allTagged.length == 0) {
            Job.log("WARNING", "Expecting POS tags but found nothing: " + line);
            return null;
        }

        StringBuilder justTheTags = new StringBuilder();
        for (String tagged : allTagged) {

            // the tag is whatever follows the first slash; it must not be empty
            int slash = tagged.indexOf('/');
            if (slash == -1 || tagged.length() < slash + 2) {
                Job.log("WARNING", "Expecting word slash tag but found: " + tagged);
                continue;
            }

            justTheTags.append(tagged, slash + 1, tagged.length()).append(' ');
        }

        if (justTheTags.length() < 1) {
            Job.log("WARNING", "This line looked like tags but had none: " + line);
            return null;
        }
        return justTheTags.toString();
    }

    /**
     * Reduces a parse-tree line to the labels of its opening nodes, leaving
     * out every label that is one of the standard POS tags.
     *
     * @param line a line that matched the tree pattern
     * @return the labels, each followed by a space, or {@code null} (after a
     *         warning) if the line yields no label
     */
    private String extractParseTags(String line) {

        String[] nodes = line.split(" ");
        if (nodes.length == 0) {
            Job.log("WARNING", "This line looked like a parse tree but had no nodes: " + line);
            return null;
        }

        StringBuilder justTheParseTags = new StringBuilder();
        for (String node : nodes) {

            // too short, or opened and closed within a single token
            if (node.length() < 2 || (node.startsWith("(") && node.endsWith(")"))) {
                Job.log("WARNING", "Cannot handle this strange node: " + node);
                continue;
            }

            if (node.startsWith("(")) {
                String parseTag = node.substring(1);
                if (!this.standardPosTags.contains(parseTag)) {
                    justTheParseTags.append(parseTag).append(' ');
                }
            }

        } // end node in nodes

        if (justTheParseTags.length() < 1) {
            Job.log("WARNING", "This line looked like a parse tree but had no nodes: " + line);
            return null;
        }
        return justTheParseTags.toString();
    }

    /**
     * Converts a dependency line from {@code NAME(WORD1-99, WORD2-99)} to
     * {@code NAME/WORD1/WORD2}.
     *
     * <p>A result that does not look like three slash-separated parts is
     * reported but still returned.
     *
     * @param line a line that matched the depends pattern
     * @return the converted token, or {@code null} (after a warning) if the
     *         line has an invalid format
     */
    private static String toDependsToken(String line) {

        // first some checking
        if (line.indexOf('(') == -1 || line.indexOf(')') == -1 || line.indexOf(',') == -1) {
            Job.log("WARNING", "invalid depends format (1): " + line);
            return null;
        }

        if (line.indexOf('(') != line.lastIndexOf('(')) {
            Job.log("WARNING", "invalid depends format (2): " + line);
            return null;
        }

        // Check (3), which rejected lines with more than one comma, was
        // disabled in the original code; such lines are accepted.

        if (line.indexOf(')') != line.lastIndexOf(')')) {
            Job.log("WARNING", "invalid depends format (4): " + line);
            return null;
        }

        // break up
        String withoutFirstIndex = FIRST_WORD_INDEX.matcher(line).replaceAll("/");
        String withoutIndexes = SECOND_WORD_INDEX.matcher(withoutFirstIndex).replaceAll("");
        String token = withoutIndexes.replace('(', '/');

        if (!TOKENS_PATTERN.matcher(token).matches()) {
            Job.log("WARNING", "tokenized depends looks wrong: " + token);
            // dump every intermediate step; only built when actually needed
            System.out.println("*" + line + "**" + withoutFirstIndex + "***" + withoutIndexes + "****" + token);
        }

        return token;
    }

    /**
     * Concatenates the parts collected for one essay.
     *
     * @param parts the collected strings, may be {@code null}
     * @param what  name of the data kind, used in the warning
     * @param id    the essay id, used in the warning
     * @return the concatenation, or the placeholder text (after a warning) if
     *         there are no parts
     */
    private static String joinOrPlaceholder(List<String> parts, String what, double id) {
        if (parts == null || parts.isEmpty()) {
            Job.log("WARNING", "no " + what + " for " + id);
            return EMPTY_TEXT_PLACEHOLDER;
        }
        StringBuilder joined = new StringBuilder();
        for (String part : parts) {
            joined.append(part);
        }
        return joined.toString();
    }

    /**
     * Builds the text of all depends variants for one essay in a single pass.
     *
     * <p>Each item is split into three terms (presumably NAME, WORD1 and WORD2
     * - this relies on {@code StringUtils.safeSplit}). The variants are:
     * 0 = whole item, 1 = term1/term2, 2 = term0/term2, 3 = term0/term1,
     * 4 = term0, 5 = term1, 6 = term2. Every entry is followed by a space.
     *
     * @param items the dependency tokens of the essay, may be {@code null}
     * @param id    the essay id, used in the warning
     * @return one text per variant; the placeholder text for every variant
     *         (after a single warning) if there are no items
     */
    private static String[] buildDependsTexts(List<String> items, double id) {

        String[] texts = new String[DEPENDS_VARIANTS];

        if (items == null || items.isEmpty()) {
            Job.log("WARNING", "no depends for " + id);
            for (int j = 0; j < DEPENDS_VARIANTS; j++) {
                texts[j] = EMPTY_TEXT_PLACEHOLDER;
            }
            return texts;
        }

        StringBuilder[] builders = new StringBuilder[DEPENDS_VARIANTS];
        for (int j = 0; j < DEPENDS_VARIANTS; j++) {
            builders[j] = new StringBuilder();
        }

        for (String item : items) {
            String[] term = StringUtils.safeSplit(item, "/", 3);
            builders[0].append(item).append(' ');
            builders[1].append(term[1]).append('/').append(term[2]).append(' ');
            builders[2].append(term[0]).append('/').append(term[2]).append(' ');
            builders[3].append(term[0]).append('/').append(term[1]).append(' ');
            builders[4].append(term[0]).append(' ');
            builders[5].append(term[1]).append(' ');
            builders[6].append(term[2]).append(' ');
        }

        for (int j = 0; j < DEPENDS_VARIANTS; j++) {
            texts[j] = builders[j].toString();
        }
        return texts;
    }

    /**
     * Appends one row to a dataset that has the template layout: id, then the
     * color (multiple choice sets only), then the text, then the score.
     *
     * @param target        the dataset to append to
     * @param numAttributes number of attributes of the new row
     * @param multiChoice   whether the layout contains the color column
     * @param id            the essay id
     * @param color         the color value, only used if {@code multiChoice}
     * @param text          the text value, trimmed before it is stored
     * @param y             the score, or {@code null} to store a missing value
     */
    private static void addTextRow(Instances target, int numAttributes, boolean multiChoice, double id,
            String color, String text, String y) {

        target.add(new DenseInstance(numAttributes));
        Instance row = target.lastInstance();

        int column = 0;
        row.setValue(column++, id);
        if (multiChoice) {
            row.setValue(column++, color);
        }
        row.setValue(column++, text.trim());
        if (y == null) {
            row.setValue(column, Utils.missingValue());
        } else {
            row.setValue(column, y);
        }
    }

    /**
     * Counts how many of the four flags are set.
     */
    private int getMatchCount(boolean a, boolean b, boolean c, boolean d) {
        int count = 0;
        if (a) {
            count++;
        }
        if (b) {
            count++;
        }
        if (c) {
            count++;
        }
        if (d) {
            count++;
        }
        return count;
    }

}