edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter.java Source code

Introduction

Here is the source code for edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter.java
Source

package edu.stanford.nlp.sequences;

import edu.stanford.nlp.util.logging.Redwood;

import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import java.util.function.Function;
import edu.stanford.nlp.util.StringUtils;

/**
 * DocumentReader for column format.
 *
 * <p>Input is read as a sequence of documents separated by one or more
 * whitespace-only lines. Within a document every non-blank line describes one
 * token, with its columns separated by tabs (or, when a line contains no tab,
 * by arbitrary whitespace). Which column feeds which {@link CoreLabel} key is
 * given by a map string such as {@code "word=0,tag=1,answer=2"}.
 *
 * <p>{@link #init(SeqClassifierFlags)} or {@link #init(String)} must be called
 * before {@link #getIterator(Reader)}; until then no iterator factory exists.
 *
 * @author Jenny Finkel
 */
public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> {

    /** A logger for this class */
    private static final Redwood.RedwoodChannels log = Redwood.channels(ColumnDocumentReaderAndWriter.class);

    private static final long serialVersionUID = 3806263423697973704L;

    /** Compile-time switch: when true, {@link #printAnswers} appends the answer probability as a fourth column. */
    private static final boolean includeProbabilities = false;

    /**
     * Annotation keys in column order, built from the map string
     * (something like "word=0,tag=1,answer=2"). Set by {@link #init(String)}.
     */
    @SuppressWarnings("rawtypes")
    private Class[] map; // = null;

    /** Factory producing per-document iterators; set by {@link #init(String)}. */
    private IteratorFromReaderFactory<List<CoreLabel>> factory;

    /**
     * Initializes this reader from the column map stored in the given flags.
     *
     * @param flags classifier flags; only {@code flags.map} is read here
     */
    @Override
    public void init(SeqClassifierFlags flags) {
        init(flags.map);
    }

    /**
     * Initializes this reader from a column map string.
     *
     * @param map column specification, e.g. {@code "word=0,tag=1,answer=2"}
     */
    public void init(String map) {
        this.map = CoreLabel.parseStringKeys(StringUtils.mapStringToArray(map));
        // Documents are delimited by a newline followed by one or more whitespace-only lines.
        factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
    }

    /**
     * Returns an iterator over the documents read from {@code r}.
     *
     * @param r source of column-formatted text
     * @return iterator yielding one token list per document
     */
    @Override
    public Iterator<List<CoreLabel>> getIterator(Reader r) {
        return factory.getIterator(r);
    }

    /**
     * Converts the text of one document into a list of {@link CoreLabel}s,
     * one per non-blank line.
     */
    private class ColumnDocParser implements Serializable, Function<String, List<CoreLabel>> {

        private static final long serialVersionUID = -6266332661459630572L;
        private final Pattern whitePattern = Pattern.compile("\\s+"); // should this really only do a tab?

        /**
         * Running count of lines seen by this parser across all documents; used in error messages.
         * NOTE(review): separator lines consumed by the document delimiter presumably never reach
         * this parser, so the count may lag behind the real file line number -- confirm.
         */
        private int lineCount; // = 0;

        /**
         * Parses one document.
         *
         * @param doc text of a single document, lines separated by {@code '\n'}
         * @return the tokens of the document, in order; blank lines are skipped
         * @throws RuntimeException rethrown unchanged (after logging the offending line)
         *         if a {@link CoreLabel} cannot be built from a line
         */
        @Override
        public List<CoreLabel> apply(String doc) {
            List<CoreLabel> words = new ArrayList<>();
            String[] lines = doc.split("\n");

            for (String rawLine : lines) {
                ++lineCount;
                // Input with CRLF line endings leaves a '\r' at the end of every line after
                // splitting on '\n'. On the tab-split path that '\r' would become part of the
                // last column (e.g. the answer label "O\r"), whereas the whitespace fallback
                // silently discards it. Strip it up front so both paths agree.
                String line = rawLine.endsWith("\r")
                        ? rawLine.substring(0, rawLine.length() - 1)
                        : rawLine;
                if (line.trim().isEmpty()) {
                    continue;
                }
                // Optimistic splitting on tabs first. If that doesn't work, use any whitespace (slower, because of regexps).
                String[] info = line.split("\t");
                if (info.length == 1) {
                    info = whitePattern.split(line);
                }
                CoreLabel wi;
                try {
                    wi = new CoreLabel(map, info);
                    // Since the map normally only specified answer, we copy it to GoldAnswer unless they've put something else there!
                    if (!wi.containsKey(CoreAnnotations.GoldAnswerAnnotation.class)
                            && wi.containsKey(CoreAnnotations.AnswerAnnotation.class)) {
                        wi.set(CoreAnnotations.GoldAnswerAnnotation.class,
                                wi.get(CoreAnnotations.AnswerAnnotation.class));
                    }
                } catch (RuntimeException e) {
                    // Report where the failure happened, then let the caller deal with it.
                    log.info("Error on line " + lineCount + ": " + line);
                    throw e;
                }
                words.add(wi);
            }
            return words;
        }

    } // end class ColumnDocParser

    /**
     * Writes one line per token: word, gold answer and answer separated by tabs
     * (plus the answer probability when {@code includeProbabilities} is on),
     * followed by an empty line terminating the document.
     *
     * @param doc tokens to print
     * @param out destination writer
     */
    @Override
    public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
        for (CoreLabel wi : doc) {
            String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
            String goldAnswer = wi.get(CoreAnnotations.GoldAnswerAnnotation.class);
            if (includeProbabilities) {
                double answerProb = wi.get(CoreAnnotations.AnswerProbAnnotation.class);
                out.println(wi.word() + '\t' + goldAnswer + '\t' + answer + '\t' + answerProb);
            } else {
                out.println(wi.word() + '\t' + goldAnswer + '\t' + answer);
            }
        }
        out.println();
    }

}