com.clust4j.data.BufferedMatrixReader.java Source code

Introduction

Here is the source code for com.clust4j.data.BufferedMatrixReader.java
Source

/*******************************************************************************
 *    Copyright 2015, 2016 Taylor G Smith
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 *******************************************************************************/
package com.clust4j.data;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Locale;
import java.util.concurrent.RecursiveTask;
import java.util.concurrent.RejectedExecutionException;

import org.apache.commons.math3.exception.DimensionMismatchException;
import org.apache.commons.math3.util.FastMath;

import com.clust4j.Clust4j;
import com.clust4j.GlobalState;
import com.clust4j.algo.ParallelChunkingTask;
import com.clust4j.except.MatrixParseException;
import com.clust4j.log.Log;
import com.clust4j.log.Log.Tag.Algo;
import com.clust4j.log.LogTimer;
import com.clust4j.log.Loggable;
import com.clust4j.utils.ArrayFormatter;
import com.clust4j.utils.DeepCloneable;
import com.clust4j.utils.MatUtils;
import com.clust4j.utils.VecUtils;

/**
 * A class for reading a {@link DataSet} from files. If the separator
 * is not provided, the class will estimate the separator. This is based
 * on H2O's CsvParser, but is lighter weight, as clust4j mandates 100%
 * numeric matrices. 
 * 
 * <p>
 * The following byte delimiters are supported for auto-estimation:
 * <ul>
 * <li><tt>0x1</tt> - the default Hive delimiter
 * <li><tt>','</tt>
 * <li><tt>'\t'</tt>
 * <li><tt>';'</tt>
 * <li><tt>'|'</tt>
 * <li><tt>' '</tt>
 * </ul>
 * 
 * <p>
 * The parser will also strip out comments in the head of the file, if found. 
 * Additionally, the following tokens will be coerced to {@link Double#NaN}:
 * 
 * <ul>
 * <li><tt>""</tt>
 * <li><tt>" "</tt>
 * <li><tt>nan</tt> (case insensitive)
 * <li><tt>na</tt> (case insensitive)
 * <li><tt>?</tt>
 * </ul>
 * 
 * <p>
 * The following tokens will be coerced to {@link Double#POSITIVE_INFINITY}:
 * 
 * <ul>
 * <li><tt>inf</tt> (case insensitive)
 * <li><tt>infinity</tt> (case insensitive)
 * </ul>
 * 
 * <p>
 * The following tokens will be coerced to {@link Double#NEGATIVE_INFINITY}:
 * 
 * <ul>
 * <li><tt>-inf</tt> (case insensitive)
 * <li><tt>-infinity</tt> (case insensitive)
 * </ul>
 * 
 * @see <a href="https://github.com/h2oai/h2o-3/blob/master/h2o-core/src/main/java/water/parser/CsvParser.java">h2o</a>
 * @author Taylor G Smith
 */
public class BufferedMatrixReader implements Loggable {
    /* Flipped to true by warn(); seeded from the setup's own flag in the constructor */
    private boolean hasWarnings = false;

    /* Bytes that carry special meaning while scanning the raw input */
    private static final byte HIVE = 0x1; // listed in the class docs as the default Hive delimiter
    private static final byte COMMA = ',';
    private static final byte TAB = '\t';
    private static final byte CARRIAGE = 13; // '\r'
    private static final byte LINE_FEED = 10; // '\n'
    private static final byte SPACE = ' ';
    private static final byte DQUOTE = '"';
    private static final byte SQUOTE = '\'';
    //private static final byte DECIMAL = '.';
    private static final byte GUESS_SEP = -1; // sentinel: no separator known, so it must be estimated

    /* More statics */
    // NOTE(review): not referenced anywhere in the visible portion of this file
    static final long LARGEST_DIGIT_NUM = Long.MAX_VALUE / 10;

    /*
     * Candidate separators for auto-detection. Order matters: both the
     * single-line and the two-line guessers walk this array front to back
     * and accept the first candidate that fits.
     */
    static final byte[] known_separators = new byte[] { HIVE /* Hive - '^A' */, COMMA, ';', '|' /* MySql, Sqlite */,
            TAB, SPACE /* Or multiple spaces... */, };

    /* Separators that are regex metacharacters and must be escaped before String#split */
    static final byte[] escapable_separators = new byte[] { '|' };

    /* A line whose first byte is one of these is skipped by getLines */
    static final byte[] known_comments = new byte[] { '#', '%', '@' };

    /* Tokens (compared after lower-casing) that are coerced to NaN */
    static final String[] nan_strings = new String[] { "", "nan", "na", "?", // do we want this?
    };

    /* Tokens (compared after lower-casing) that are coerced to +Infinity */
    static final String[] pos_inf_strings = new String[] { "inf", "infinity", };

    /* Tokens (compared after lower-casing) that are coerced to -Infinity */
    static final String[] neg_inf_strings = new String[] { "-inf", "-infinity", };

    /**
     * Tells whether a separator byte appears in {@code escapable_separators},
     * i.e. whether it has to be backslash-escaped before being used as a
     * regular expression.
     * @param b the separator byte to look up
     * @return true if the byte is listed as escapable
     */
    static boolean isEscapable(byte b) {
        boolean found = false;
        for (int i = 0; i < escapable_separators.length && !found; i++)
            found = escapable_separators[i] == b;
        return found;
    }

    /**
     * Tells whether a byte terminates a line.
     * @param chr the byte to test
     * @return true for a line feed or a carriage return
     */
    static boolean isEOL(byte chr) {
        switch (chr) {
        case LINE_FEED:
        case CARRIAGE:
            return true;
        default:
            return false;
        }
    }

    /**
     * Tells whether a byte is one of the {@code known_comments} markers.
     * @param chr the byte to test
     * @return true if the byte is a comment marker
     */
    static boolean isComment(final byte chr) {
        int i = known_comments.length;
        while (--i >= 0) {
            if (known_comments[i] == chr)
                return true;
        }
        return false;
    }

    /**
     * Tells whether a token is one of the {@code nan_strings}. The comparison
     * is exact, so the caller is expected to have lower-cased the token.
     * @param lower the (lower-cased) token
     * @return true if the token should be read as NaN
     */
    static boolean isNaN(final String lower) {
        return Arrays.asList(nan_strings).contains(lower);
    }

    /**
     * Tells whether a token is one of the {@code neg_inf_strings}. The
     * comparison is exact, so the caller is expected to have lower-cased
     * the token.
     * @param lower the (lower-cased) token
     * @return true if the token should be read as negative infinity
     */
    static boolean isNegInf(final String lower) {
        for (int i = 0; i < neg_inf_strings.length; i++) {
            if (neg_inf_strings[i].equals(lower))
                return true;
        }
        return false;
    }

    /**
     * Tells whether a token is one of the {@code pos_inf_strings}. The
     * comparison is exact, so the caller is expected to have lower-cased
     * the token.
     * @param lower the (lower-cased) token
     * @return true if the token should be read as positive infinity
     */
    static boolean isPosInf(final String lower) {
        return Arrays.asList(pos_inf_strings).indexOf(lower) >= 0;
    }

    /**
     * Loads the complete contents of a file into memory.
     * @param file the file to read
     * @return every byte of the file
     * @throws IOException if the file cannot be read
     */
    static byte[] fileToBytes(final File file) throws IOException {
        final byte[] contents = Files.readAllBytes(file.toPath());
        return contents;
    }

    /* The parse configuration (separator, column count, header info, raw bytes) used by read() */
    final MatrixReaderSetup setup;

    /**
     * Create an instance from a file. The whole file is read into memory,
     * the separator is estimated from the first lines, and only double
     * quotes are treated as quote characters.
     * @param file the file to parse
     * @throws MatrixParseException if the setup cannot make sense of the data
     * (no data, no identifiable separator, or a non-numeric row)
     * @throws IOException if the file cannot be read
     */
    public BufferedMatrixReader(final File file) throws MatrixParseException, IOException {
        this(fileToBytes(file));
    }

    /**
     * Create an instance from a file, estimating the separator.
     * @param file the file to parse
     * @param single_quotes whether a single quote is recognized as a quote
     * character in addition to the double quote
     * @throws MatrixParseException if the setup cannot make sense of the data
     * @throws IOException if the file cannot be read
     */
    public BufferedMatrixReader(final File file, boolean single_quotes) throws MatrixParseException, IOException {
        this(fileToBytes(file), single_quotes);
    }

    /**
     * Create an instance from a file with an explicit separator. Only
     * double quotes are treated as quote characters.
     * @param file the file to parse
     * @param sep the byte that separates fields
     * @throws MatrixParseException if the setup cannot make sense of the data
     * @throws IOException if the file cannot be read
     */
    public BufferedMatrixReader(final File file, byte sep) throws MatrixParseException, IOException {
        this(fileToBytes(file), sep);
    }

    /**
     * Create an instance from a file with an explicit separator and quote
     * handling.
     * @param file the file to parse
     * @param single_quotes whether a single quote is recognized as a quote
     * character in addition to the double quote
     * @param sep the byte that separates fields
     * @throws MatrixParseException if the setup cannot make sense of the data
     * @throws IOException if the file cannot be read
     */
    public BufferedMatrixReader(final File file, boolean single_quotes, byte sep)
            throws MatrixParseException, IOException {
        this(fileToBytes(file), single_quotes, sep);
    }

    /**
     * Create an instance from an array of bytes. The separator is estimated
     * from the first lines, and only double quotes are treated as quote
     * characters.
     * @param bits the raw content to parse
     * @throws MatrixParseException if the setup cannot make sense of the data
     */
    public BufferedMatrixReader(byte[] bits) throws MatrixParseException {
        this(new MatrixReaderSetup(bits));
    }

    /**
     * Create an instance from an array of bytes, estimating the separator.
     * @param bits the raw content to parse
     * @param single_quotes whether a single quote is recognized as a quote
     * character in addition to the double quote
     * @throws MatrixParseException if the setup cannot make sense of the data
     */
    public BufferedMatrixReader(byte[] bits, boolean single_quotes) throws MatrixParseException {
        this(new MatrixReaderSetup(bits, single_quotes));
    }

    /**
     * Create an instance from an array of bytes with an explicit separator
     * and quote handling.
     * @param bits the raw content to parse
     * @param single_quotes whether a single quote is recognized as a quote
     * character in addition to the double quote
     * @param sep the byte that separates fields
     * @throws MatrixParseException if the setup cannot make sense of the data
     */
    public BufferedMatrixReader(byte[] bits, boolean single_quotes, byte sep) throws MatrixParseException {
        this(new MatrixReaderSetup(bits, single_quotes, sep));
    }

    /**
     * Create an instance from an array of bytes and a separator. Only
     * double quotes are treated as quote characters.
     * @param bits the raw content to parse
     * @param sep the byte that separates fields
     * @throws MatrixParseException if the setup cannot make sense of the data
     */
    public BufferedMatrixReader(byte[] bits, byte sep) throws MatrixParseException {
        this(new MatrixReaderSetup(bits, sep));
    }

    /**
     * Create an instance from an existing setup
     * @param setup
     * @throws MatrixParseException
     */
    protected BufferedMatrixReader(MatrixReaderSetup setup) throws MatrixParseException {
        this.setup = setup;
        this.hasWarnings = setup.hasWarnings();
    }

    /**
     * A class that guesses the setup of the input file, including
     * separators, etc.
     * @author Taylor G Smith
     */
    protected static class MatrixReaderSetup extends Clust4j implements Loggable, DeepCloneable {
        private static final long serialVersionUID = 5863624610174664028L;
        private static final int GUESS_LINES = 4; // how many leading lines are sampled to guess the layout

        /* Instance vars */
        boolean single_quotes; // whether single quotes quote a field or double quotes do
        final int num_cols; // column count derived from the sampled lines
        int header_offset = 0; // which row to start on due to headers
        String[] headers = null; // first sampled row when it looks like a header, else null
        String[][] data; // First few rows of parsed data
        final byte separator; // supplied or estimated field separator
        final byte[] stream; // the raw input; the same array the caller passed in (not copied)
        private boolean hasWarnings; // set by warn()
        final LogTimer timer; // started during construction, reported by sayBye

        /**
         * Copy constructor
         * @param instance
         */
        private MatrixReaderSetup(MatrixReaderSetup instance) {
            this.single_quotes = instance.single_quotes;
            this.num_cols = instance.num_cols;
            this.header_offset = instance.header_offset;
            this.headers = VecUtils.copy(instance.headers); // if null, sets to null
            this.data = MatUtils.copy(instance.data);
            this.separator = instance.separator;
            this.stream = Arrays.copyOf(instance.stream, instance.stream.length);
            this.hasWarnings = instance.hasWarnings;
            this.timer = instance.timer;
        }

        /**
         * Builds a setup that estimates the separator and does not treat
         * single quotes as quote characters.
         * @param bits the raw content to inspect
         * @throws MatrixParseException if the layout cannot be determined
         */
        MatrixReaderSetup(byte[] bits) throws MatrixParseException {
            this(bits, false, GUESS_SEP);
        }

        /**
         * Builds a setup that estimates the separator.
         * @param bits the raw content to inspect
         * @param single_quotes whether a single quote also acts as a quote character
         * @throws MatrixParseException if the layout cannot be determined
         */
        MatrixReaderSetup(byte[] bits, boolean single_quotes) throws MatrixParseException {
            this(bits, single_quotes, GUESS_SEP);
        }

        /**
         * Builds a setup with an explicit separator; single quotes are not
         * treated as quote characters.
         * @param bits the raw content to inspect
         * @param sep the field separator
         * @throws MatrixParseException if the layout cannot be determined
         */
        MatrixReaderSetup(byte[] bits, byte sep) throws MatrixParseException {
            this(bits, false, sep);
        }

        /**
         * Inspects the first {@code GUESS_LINES} usable lines of the input
         * and derives the parse configuration from them: the separator
         * (unless one was supplied), the number of columns, and whether the
         * first line is a header. Every sampled non-header row is also
         * checked for being fully numeric so that bad input fails early.
         *
         * <p>
         * All failures are raised through {@link #error(RuntimeException)},
         * which logs the message and then throws.
         *
         * @param bits the raw content to inspect; stored as-is (not copied)
         * @param single_quotes whether a single quote also acts as a quote character
         * @param sep the field separator, or {@code GUESS_SEP} to estimate it
         * @throws MatrixParseException if no lines are found, no separator can
         * be determined, a lone line is entirely non-numeric, or a sampled
         * row contains a non-numeric token
         */
        MatrixReaderSetup(byte[] bits, boolean single_quotes, byte sep) throws MatrixParseException {
            this.single_quotes = single_quotes;
            if (single_quotes)
                info("using single quotes (\"'\")");
            else
                info("using double quotes ('\"')");

            this.timer = new LogTimer();

            /* Given the bytes, we look at first few lines and guess the setup... */
            String[] lines = getFirstLines(bits);

            // If data is empty, fail
            if (lines.length == 0)
                error(new MatrixParseException("data is empty!"));

            // Guess separator, columns and header
            data = new String[lines.length][];

            // Corner case first:
            if (1 == lines.length) {
                warn("only one line found in data");
                String line = lines[0];

                if (GUESS_SEP == sep) {
                    /*
                     *  Guess the separator. Harder to do with only one line
                     */

                    String splitter;
                    boolean foundSep = false;

                    // Candidates are tried in array order; the first one that splits the line wins
                    for (byte ks : known_separators) {

                        /*
                         * Some chars require escaping or they'll
                         * falsely flag their presence.
                         * (String#split takes a regular expression.)
                         */
                        splitter = isEscapable(ks) ? new String(new byte[] { (byte) '\\', ks })
                                : new String(new byte[] { ks });

                        if (line.split(splitter).length > 1) {
                            foundSep = true;
                            sep = ks;
                            break;
                        }

                        /*
                         *  There's a corner case here... imagine the row is:
                         *  
                         *  "a,b,c"|"d,e,f"
                         *  
                         *  ... since this is ordinally dependent, it will select
                         *  the comma as the separator, though in cases where this
                         *  would happen, we'd likely fail the case on the basis that
                         *  it's text. However, this is a very real possibility:
                         *  
                         *  "10,123"|"12,198"
                         *  
                         *  ... in which case the | should be the delimiter and 
                         *  we selected the wrong one. The moral of the story (two):
                         *  
                         *  - Don't try to read a single-row CSV
                         *  - Don't include thousands separators in your data
                         */
                    }

                    if (!foundSep) { // probably one item
                        // If there's one item, we're just going to fail out
                        error(new MatrixParseException("could not find separator in row: " + line));
                    }
                }

                /*
                 * One way or another at this point, we have a separator picked out
                 */
                data[0] = getTokens(line, sep, single_quotes);
                this.num_cols = data[0].length;

                // What about the header? Always check...
                // A lone line with no parseable number is rejected instead of being taken as a header
                if (allStrings(data[0]) && !data[0][0].isEmpty()) {
                    error(new MatrixParseException(
                            "singular " + "row is entirely character; maybe " + "an orphaned header?"));
                }

            } else { // 2+ lines

                // First guess the separator
                if (GUESS_SEP == sep) {
                    // Only the first two lines take part in the estimate
                    sep = guessSeparator(lines[0], lines[1], single_quotes, this);

                    // extremely difficult-to-replicate corner case... let's keep it simple
                    /*
                    if(GUESS_SEP == sep && lines.length > 2) {
                       sep = guessSeparator(lines[1], lines[2], single_quotes);
                       if(GUESS_SEP == sep)
                          sep = guessSeparator(lines[0], lines[2], single_quotes);
                    }
                        
                    if(GUESS_SEP == sep) {
                       warn("could not determine uniform separator; using space (' ')");
                       sep = SPACE; // bail and go for space...
                    } else {
                       info("separator estimated as '"+new String(new byte[]{sep})+"'");
                    }
                    */

                    if (GUESS_SEP == sep) {
                        error(new MatrixParseException("cannot determine uniform separator"));
                    } else {
                        info("separator estimated as '" + new String(new byte[] { sep }) + "'");
                    }
                } else {
                    info("separator provided as '" + new String(new byte[] { sep }) + "'");
                }

                // Tokenize first few
                for (int i = 0; i < lines.length; ++i)
                    data[i] = getTokens(lines[i], sep, single_quotes);

                // Guess the number of columns
                this.num_cols = guessNumCols(data);

                // Check for header: a first row without any parseable number
                // (and with a non-empty first cell) is taken as column names
                if (allStrings(data[0]) && !data[0][0].isEmpty()) {
                    header_offset = 1;
                    this.headers = data[0];
                }
            }

            /*
             *  Now we need to go through each row and ensure it's 
             *  completely numeric... this only looks through the first
             *  few, but gives us confidence, and saves us time later
             *  if it's bad up front.
             */
            for (int i = header_offset; i < data.length; i++) {
                try {
                    tokenize(data[i]);
                } catch (NumberFormatException e) {
                    error(new MatrixParseException(
                            "non-numeric row found: " + ArrayFormatter.arrayToString(data[i])));
                }
            }

            // Num cols?
            info(num_cols + " feature" + (num_cols == 1 ? "" : "s") + " identified in dataset");

            // The remaining final fields are assigned last, once sep has settled
            this.stream = bits;
            this.separator = sep;
            sayBye(timer);
        }

        /**
         * Reports whether none of the tokens in a row can be parsed by
         * {@link Double#parseDouble(String)}. An empty row yields true.
         * @param row the tokens to test
         * @return false as soon as one token parses as a double, else true
         */
        static boolean allStrings(String[] row) {
            int i = 0;
            while (i < row.length) {
                boolean numeric;
                try {
                    Double.parseDouble(row[i]);
                    numeric = true;
                } catch (NumberFormatException nonNumeric) {
                    numeric = false; // expected for textual tokens
                }

                if (numeric)
                    return false;
                i++;
            }

            return true;
        }

        /**
         * Pulls at most {@code GUESS_LINES} usable lines from the head of
         * the input. Adapted from H2O's getFirstLines method.
         * @param bits the raw content
         * @return the leading lines, as produced by {@code getLines}
         */
        static String[] getFirstLines(byte[] bits) {
            final String[] head = getLines(bits, GUESS_LINES);
            return head;
        }

        /**
         * Counts how often each candidate separator occurs in a line.
         * Occurrences inside a quoted region are not counted, except for
         * the Hive delimiter, which is counted everywhere.
         * @param l1 the line to scan
         * @param single the byte treated as a quote in addition to the double quote
         * @return counts aligned index-for-index with {@code known_separators}
         */
        static int[] getSeparatorCounts(String l1, final byte single) {
            final int[] counts = new int[known_separators.length];
            boolean quoted = false;

            for (final byte current : l1.getBytes()) {
                // every quote byte flips the in-quote state
                if (current == DQUOTE || current == single)
                    quoted = !quoted;

                if (quoted && current != HIVE)
                    continue;

                for (int k = 0; k < counts.length; k++) {
                    if (current == known_separators[k])
                        counts[k]++;
                }
            }

            return counts;
        }

        /**
         * Estimates the column count from the sampled rows. If the first
         * row is at least as wide as every other row its width is used;
         * otherwise the most frequent row width wins (the smallest width
         * on a tie).
         * @param data the tokenized sample rows; must hold at least one row
         * @return the estimated number of columns
         */
        static int guessNumCols(String[][] data) {
            int widest = 0;
            for (int r = 0; r < data.length; r++)
                widest = Math.max(widest, data[r].length);

            if (data[0].length == widest)
                return widest;

            // histogram of row widths
            final int[] frequency = new int[widest + 1];
            for (int r = 0; r < data.length; r++)
                frequency[data[r].length] += 1;

            int mode = 0;
            for (int len = 1; len <= widest; len++) {
                if (frequency[len] > frequency[mode])
                    mode = len;
            }

            return mode;
        }

        /**
         * Estimates the separator from two lines. Candidates are tried in
         * the order of {@code known_separators}; the first one that occurs
         * the same (non-zero) number of times in both lines and splits
         * each of them into exactly {@code count + 1} tokens is returned.
         * @param l1 the first line
         * @param l2 the second line
         * @param single_quotes whether a single quote also acts as a quote character
         * @param logger receives a trace message for every candidate tried
         * @return the separator byte, or {@code GUESS_SEP} if no candidate fits
         */
        static byte guessSeparator(String l1, String l2, boolean single_quotes, Loggable logger) {
            final byte single = single_quotes ? SQUOTE : -1;
            final int[] s1 = getSeparatorCounts(l1, single);
            final int[] s2 = getSeparatorCounts(l2, single);

            for (int i = 0; i < s1.length; ++i) {
                // skip candidates that are absent or occur unevenly
                if (s1[i] == 0 || s1[i] != s2[i])
                    continue;

                try {
                    logger.trace("trying to separate using '" + (char) known_separators[i] + "'");
                    String[] t1 = getTokens(l1, known_separators[i], single);
                    String[] t2 = getTokens(l2, known_separators[i], single);

                    // uniform only if both lines split into count + 1 tokens
                    if (t1.length == s1[i] + 1 && t2.length == s2[i] + 1)
                        return known_separators[i];
                } catch (Exception ignored) {
                    // deliberate best effort: a candidate that fails to
                    // tokenize is simply not the separator; try the next
                }
            }

            // No separators appeared, or none of them split both lines uniformly
            return GUESS_SEP;
        }

        /** Logs an error message under the parser tag. */
        @Override
        public void error(String msg) {
            final Algo tag = getLoggerTag();
            Log.err(tag, msg);
        }

        /** Logs the exception's message as an error, then throws the exception. */
        @Override
        public void error(RuntimeException thrown) {
            final String message = thrown.getMessage();
            error(message);
            throw thrown;
        }

        /** Records that a warning occurred, then logs it under the parser tag. */
        @Override
        public void warn(String msg) {
            this.hasWarnings = true;
            final Algo tag = getLoggerTag();
            Log.warn(tag, msg);
        }

        /** Logs an informational message under the parser tag. */
        @Override
        public void info(String msg) {
            final Algo tag = getLoggerTag();
            Log.info(tag, msg);
        }

        /** Logs a trace message under the parser tag. */
        @Override
        public void trace(String msg) {
            final Algo tag = getLoggerTag();
            Log.trace(tag, msg);
        }

        /** Logs a debug message under the parser tag. */
        @Override
        public void debug(String msg) {
            final Algo tag = getLoggerTag();
            Log.debug(tag, msg);
        }

        /** Logs how long the setup took, as reported by the timer. */
        @Override
        public void sayBye(LogTimer timer) {
            final String elapsed = timer.toString();
            info("matrix parse setup completed in " + elapsed);
        }

        /** @return the tag shared with the enclosing reader */
        @Override
        public Algo getLoggerTag() {
            final Algo tag = parserLoggerTag();
            return tag;
        }

        /** @return true once {@link #warn(String)} has been called */
        @Override
        public boolean hasWarnings() {
            return this.hasWarnings;
        }

        /** @return a new setup built with the copy constructor */
        @Override
        public MatrixReaderSetup copy() {
            final MatrixReaderSetup duplicate = new MatrixReaderSetup(this);
            return duplicate;
        }
    } // end setup class

    /**
     * Splits the input into lines, reading at most
     * {@code GlobalState.MAX_ARRAY_SIZE} of them.
     * @param bits the raw content
     * @return the usable lines
     */
    static String[] getLines(byte[] bits) {
        final int limit = GlobalState.MAX_ARRAY_SIZE;
        return getLines(bits, limit);
    }

    /**
     * Splits the input into at most {@code num} trimmed, non-empty lines.
     * A line whose first byte is a comment marker is dropped, wherever in
     * the input it occurs, as are lines that are empty after trimming.
     * @param bits the raw content
     * @param num the maximum number of lines to return
     * @return the collected lines, in input order
     */
    static String[] getLines(byte[] bits, int num) {
        final ArrayList<String> collected = new ArrayList<>();
        int cursor = 0;

        while (cursor < bits.length && collected.size() < num) {
            final int begin = cursor;

            // run up to the end-of-line byte (or the end of the buffer)
            while (cursor < bits.length && !isEOL(bits[cursor]))
                cursor++;

            final int end = cursor;
            cursor++; // step over the end-of-line byte

            // a line feed directly after the terminator (e.g. the second
            // half of a Windows CR/LF pair) is consumed as well
            if (cursor < bits.length && bits[cursor] == LINE_FEED)
                cursor++;

            // comment lines and empty lines contribute nothing
            if (isComment(bits[begin]) || end <= begin)
                continue;

            final String text = new String(bits, begin, end - begin).trim();
            if (!text.isEmpty())
                collected.add(text);
        }

        return collected.toArray(new String[collected.size()]);
    }

    /**
     * Splits a line into tokens, translating the quote flag into the quote
     * byte expected by the byte-based overload ({@code -1} when single
     * quotes are not in use).
     * @param from the line to split
     * @param sep the field separator
     * @param single_quotes whether a single quote also acts as a quote character
     * @return the tokens of the line
     */
    static String[] getTokens(String from, byte sep, boolean single_quotes) {
        byte single = -1;
        if (single_quotes)
            single = SQUOTE;

        return getTokens(from, sep, single);
    }

    /**
     * Splits a single line into its tokens, working on the line's bytes.
     *
     * <ul>
     * <li>Spaces in front of a token are skipped; spaces inside or after a
     * token are kept.
     * <li>A token may start with a double quote or with the {@code single}
     * byte; inside such a token the separator is not a delimiter, and a
     * doubled quote yields one literal quote character.
     * <li>If the separator is not a comma, commas are dropped from tokens
     * (treated as thousands separators).
     * <li>If the line ends with a separator other than a space, an empty
     * token is appended for the empty last column.
     * </ul>
     *
     * @param from the line to split
     * @param sep the field separator
     * @param single the byte accepted as a quote in addition to the double
     * quote; callers in this file pass {@code -1} when single quotes are off
     * @return the tokens, or an empty array if a token is followed by
     * something other than the separator or a line end
     */
    static String[] getTokens(String from, byte sep, final byte single) {
        final ArrayList<String> tokens = new ArrayList<>();
        byte[] bits = from.getBytes();

        int offset = 0;
        int quotes = 0; // the quote byte currently open, or 0 when outside quotes

        while (offset < bits.length) {
            while (offset < bits.length && bits[offset] == SPACE) // skip leading ws
                ++offset;

            if (offset == bits.length)
                break; // reached end of string

            StringBuilder t = new StringBuilder();
            byte c = bits[offset];

            // An opening quote is consumed and remembered, not added to the token
            // NOTE(review): with single == -1 a raw 0xFF byte also satisfies this test
            if (DQUOTE == c || single == c) {
                quotes = c;
                ++offset;
            }

            while (offset < bits.length) {
                c = bits[offset];

                if (quotes == c) {
                    ++offset;

                    // two quote bytes in a row stand for one literal quote
                    if (offset < bits.length && bits[offset] == c) {
                        t.append((char) c);
                        ++offset;
                        continue;
                    }

                    quotes = 0; // closing quote
                } else if (0 == quotes && sep == c || isEOL(c)) {
                    // unquoted separator, or a line end regardless of quoting
                    break; // break inner only
                } else if (sep != COMMA && c == COMMA) {
                    /*
                     * This is a corner case where the separator is NOT
                     * a comma, but the data may contain thousands separators
                     * and this prevents non-numeric exceptions later.
                     */
                    ++offset;
                    continue;
                } else {
                    t.append((char) c);
                    ++offset;
                }
            }

            // running off the end of the line is handled like a line feed
            c = (offset == bits.length) ? LINE_FEED : bits[offset];
            tokens.add(t.toString());

            if (isEOL(c) || offset == bits.length)
                break;
            if (c != sep)
                return new String[0]; // error!
            ++offset; // step over the separator
        }

        // Catch case where last char is a separator, indicating empty last col
        if (bits.length > 0 && bits[bits.length - 1] == sep && bits[bits.length - 1] != SPACE) {
            tokens.add("");
        }

        return tokens.toArray(new String[tokens.size()]);
    }

    /**
     * A fork/join task that parses the lines of a file in parallel. The
     * lines are grouped into chunks; the task recursively halves its range
     * of chunk indices {@code [lo, hi)} until a single chunk is left, which
     * is then parsed into the shared result matrix. The result holds one
     * row per data line, i.e. the header line (if any) is excluded, exactly
     * as in the serial parse.
     * @author Taylor G Smith
     */
    static class ParallelChunkParser extends RecursiveTask<double[][]> {
        private static final long serialVersionUID = 8556857221656513389L;
        private ArrayList<InstanceChunk> chunks;
        private double[][] result;
        final MatrixReaderSetup setup;
        /* n: number of columns; [lo, hi): the range of chunk indices this task owns */
        final int n, hi, lo;

        /**
         * A chunk of instances to parse
         * @author Taylor G Smith
         */
        final static class InstanceChunk {
            /* The raw lines of this chunk */
            final String[] rows;
            /* Index of the chunk's first line within the complete line array */
            final int startIdx;

            InstanceChunk(String[] rows, int startIdx) {
                this.rows = rows;
                this.startIdx = startIdx;
            }
        }

        /**
         * Creates a sub-task that shares the chunks and the result matrix
         * of an existing task but owns a narrower range of chunks.
         * @param instance the task to share state with
         * @param lo the first chunk index (inclusive)
         * @param hi the last chunk index (exclusive)
         */
        public ParallelChunkParser(ParallelChunkParser instance, int lo, int hi) {
            this.chunks = instance.chunks;
            this.result = instance.result;
            this.setup = instance.setup;
            this.n = instance.n;
            this.lo = lo;
            this.hi = hi;
        }

        /**
         * Creates the root task covering every chunk.
         * @param rows all lines of the file, including the header line if any
         * @param setup the parse configuration
         */
        private ParallelChunkParser(String[] rows, MatrixReaderSetup setup) {
            this.setup = setup;
            this.n = setup.num_cols;

            // The header line yields no output row
            this.result = new double[Math.max(0, rows.length - setup.header_offset)][n];
            this.chunks = map(rows);

            // lo/hi index chunks, not lines: the header is already skipped
            // inside chunk 0 by getChunk, so the range must start at chunk 0
            this.lo = 0;
            this.hi = this.chunks.size();
        }

        /**
         * Parses every line of one chunk into the result matrix.
         * @param chunk the index of the chunk to parse
         * @throws NumberFormatException if a line holds a non-numeric token;
         * the message is the formatted token array
         * @throws DimensionMismatchException if a line does not have exactly
         * {@code setup.num_cols} tokens
         */
        void doChunk(int chunk) {
            final InstanceChunk c = chunks.get(chunk);

            // startIdx counts lines including the header; result rows do not
            int idx = c.startIdx - setup.header_offset;
            double[] next;
            for (String instance : c.rows) {
                String[] a = getTokens(instance, setup.separator, setup.single_quotes);

                try {
                    next = tokenize(a);

                    // Ensure not jagged
                    if (next.length != setup.num_cols)
                        throw new DimensionMismatchException(next.length, setup.num_cols);

                    result[idx++] = next;
                } catch (NumberFormatException e) {
                    throw new NumberFormatException(ArrayFormatter.arrayToString(a));
                } catch (DimensionMismatchException d) {
                    throw d; // propagate it
                } catch (Exception e) {
                    throw new RuntimeException("unexpected exception in parallel processing", e);
                }
            }
        }

        /**
         * Parses the chunks in {@code [lo, hi)}, splitting the range in two
         * while it holds more than one chunk.
         * @return the shared result matrix
         */
        @Override
        protected double[][] compute() {
            if (hi <= lo) // empty range (e.g. no chunks at all): nothing to parse
                return result;

            if (hi - lo == 1) {
                doChunk(lo);
                return result;
            }

            int mid = this.lo + (this.hi - this.lo) / 2;
            ParallelChunkParser left = new ParallelChunkParser(this, lo, mid);
            ParallelChunkParser right = new ParallelChunkParser(this, mid, hi);

            left.fork();
            right.compute();
            left.join();

            return result;
        }

        /**
         * Cuts one chunk out of the line array. Chunk 0 starts after the
         * header. Note that with a header chunk 0 still spans
         * {@code chunkSize} lines and therefore overlaps the first line of
         * chunk 1; that line is parsed twice into the same result row.
         * @param X all lines
         * @param chunkSize the number of lines per chunk
         * @param chunkNum the index of the chunk to build
         * @param header_offset the number of header lines to skip in chunk 0
         * @return the chunk, tagged with the index of its first line
         */
        protected static InstanceChunk getChunk(String[] X, int chunkSize, int chunkNum, int header_offset) {
            String[] chunk;

            int idx = 0;
            int startingPt = chunkNum * chunkSize + (chunkNum == 0 ? header_offset : 0);
            int endingPt = FastMath.min(X.length, startingPt + chunkSize);

            chunk = new String[endingPt - startingPt];
            for (int j = startingPt; j < endingPt; j++) {
                chunk[idx++] = X[j];
            }

            return new InstanceChunk(chunk, startingPt);
        }

        /**
         * Partitions the lines into chunks, sized by the project's chunking
         * strategy.
         * @param rows all lines
         * @return the chunks, in line order
         */
        private ArrayList<InstanceChunk> map(String[] rows) {
            final ArrayList<InstanceChunk> out = new ArrayList<>();
            final int chunkSize = ParallelChunkingTask.ChunkingStrategy.getChunkSize(rows.length);
            final int numChunks = ParallelChunkingTask.ChunkingStrategy.getNumChunks(chunkSize, rows.length);

            for (int i = 0; i < numChunks; i++)
                out.add(getChunk(rows, chunkSize, i, this.setup.header_offset));

            return out;
        }

        /**
         * Parses all lines on the shared fork/join pool.
         * @param rows all lines of the file, including the header line if any
         * @param setup the parse configuration
         * @return the parsed matrix, one row per data line
         */
        public static double[][] doAll(String[] rows, MatrixReaderSetup setup) {
            return GlobalState.ParallelismConf.FJ_THREADPOOL.invoke(new ParallelChunkParser(rows, setup));
        }
    }

    /**
     * Read in the data with the serial (single-threaded) parser.
     * @return the parsed data set
     * @throws MatrixParseException if a row is non-numeric or has an
     * unexpected number of columns
     */
    public DataSet read() throws MatrixParseException {
        final boolean parallel = false;
        return read(parallel);
    }

    /**
     * Read in the data.
     * @param parallel - whether to parallelize the operation. If the
     * parallel job cannot be scheduled, the serial parser is used instead.
     * @return the parsed data set, carrying the headers found by the setup
     * @throws MatrixParseException if a row is non-numeric or has an
     * unexpected number of columns
     */
    public DataSet read(boolean parallel) throws MatrixParseException {
        final LogTimer timer = new LogTimer();

        /*
         * Get lines...
         */
        final String[] lines = getLines(setup.stream);

        // Potential for truncation here...
        if (lines.length == GlobalState.MAX_ARRAY_SIZE) {
            warn("only " + lines.length + " rows read from data, " + "as this is the max clust4j allows");
        } else {
            // pluralize on the number actually reported, not on the raw line count
            final int records = lines.length - setup.header_offset;
            info(records + " record" + (records == 1 ? "" : "s") + " (" + setup.stream.length + " byte"
                    + (setup.stream.length == 1 ? "" : "s") + ") read from file");
        }

        /*
         * Do double parsing...
         */
        double[][] res = null;
        if (!parallel) {
            // Let any exceptions propagate
            res = parseSerial(lines);
        } else {
            try {
                res = ParallelChunkParser.doAll(lines, setup);
            } catch (NumberFormatException n) {
                error(new MatrixParseException("caught NumberFormatException: " + n.getLocalizedMessage()));
            } catch (DimensionMismatchException d) {
                error(new MatrixParseException("caught row of unexpected dimensions: " + d.getMessage()));
            } catch (RejectedExecutionException r) {
                // A failure of the fallback propagates as-is; it must not be
                // replaced by a generic exception
                warn("unable to schedule parallel job; falling back to serial parse");
                res = parseSerial(lines);
            } catch (Exception e) {
                error("encountered Exception in thread: " + e.getMessage());
                throw e;
            }
        }

        // Defensive: every path above either assigns res or throws
        if (null == res)
            throw new RuntimeException("unable to parse data");

        sayBye(timer);
        return new DataSet(res, setup.headers);
    }

    /**
     * Parses the lines one after another on the calling thread, skipping
     * the header line if the setup found one.
     * @param lines all usable lines of the input, header included
     * @return one row of doubles per data line
     * @throws MatrixParseException if a line holds a non-numeric token or
     * does not have exactly {@code setup.num_cols} tokens; the message is
     * logged as an error before the exception is thrown
     */
    private double[][] parseSerial(String[] lines) {
        final int offset = setup.header_offset;
        final int expected = setup.num_cols;

        // Only the outer array is allocated: every row is replaced by the
        // array that tokenize returns
        final double[][] res = new double[lines.length - offset][];

        for (int idx = offset; idx < lines.length; idx++) {
            final String line = lines[idx];
            final double[] next;

            try {
                next = tokenize(line);
            } catch (NumberFormatException e) {
                final String msg = "non-numeric row found: " + line;
                error(msg);
                throw new MatrixParseException(msg);
            }

            // Ensure not jagged
            if (next.length != expected) {
                final String msg = "expected row of length " + expected + "; got row of length " + next.length
                        + " at line " + idx;
                error(msg);
                throw new MatrixParseException(msg);
            }

            res[idx - offset] = next;
        }

        return res;
    }

    /**
     * Splits a line with this reader's separator and quote settings and
     * converts the resulting tokens to doubles.
     * @param row the line to convert
     * @return the numeric values of the line
     * @throws NumberFormatException if a token is neither numeric nor one
     * of the recognized NaN/infinity spellings
     */
    private double[] tokenize(String row) throws NumberFormatException {
        return tokenize(getTokens(row, setup.separator, setup.single_quotes));
    }

    /**
     * Static tokenizing method to move a row of strings into a double
     * array. A token that {@link Double#parseDouble(String)} rejects is
     * trimmed and lower-cased, then matched against the NaN, positive
     * infinity and negative infinity spellings; an empty or
     * whitespace-only token therefore becomes NaN.
     * @param row the tokens to convert
     * @return the numeric values, index-aligned with {@code row}
     * @throws NumberFormatException if a token is neither numeric nor one
     * of the recognized spellings
     */
    static double[] tokenize(String[] row) throws NumberFormatException {
        final double[] out = new double[row.length];

        for (int i = 0; i < row.length; i++) {
            final String str = row[i];

            try {
                out[i] = Double.parseDouble(str);
            } catch (NumberFormatException e) {
                // Locale.ROOT keeps the case mapping independent of the
                // default locale (e.g. 'I' must map to 'i' for "INF")
                final String lower = str.trim().toLowerCase(Locale.ROOT);

                if (isNaN(lower))
                    out[i] = Double.NaN;
                else if (isPosInf(lower))
                    out[i] = Double.POSITIVE_INFINITY;
                else if (isNegInf(lower))
                    out[i] = Double.NEGATIVE_INFINITY;
                else
                    throw e;
            }
        }

        return out;
    }

    /** Logs an error message under the parser tag. */
    @Override
    public void error(String msg) {
        final Algo tag = getLoggerTag();
        Log.err(tag, msg);
    }

    /** Logs the exception's message as an error, then throws the exception. */
    @Override
    public void error(RuntimeException thrown) {
        final String message = thrown.getMessage();
        error(message);
        throw thrown;
    }

    /** Records that a warning occurred, then logs it under the parser tag. */
    @Override
    public void warn(String msg) {
        this.hasWarnings = true;
        final Algo tag = getLoggerTag();
        Log.warn(tag, msg);
    }

    /** Logs an informational message under the parser tag. */
    @Override
    public void info(String msg) {
        final Algo tag = getLoggerTag();
        Log.info(tag, msg);
    }

    /** Logs a trace message under the parser tag. */
    @Override
    public void trace(String msg) {
        final Algo tag = getLoggerTag();
        Log.trace(tag, msg);
    }

    /** Logs a debug message under the parser tag. */
    @Override
    public void debug(String msg) {
        final Algo tag = getLoggerTag();
        Log.debug(tag, msg);
    }

    /** Logs how long the parse took, as reported by the timer. */
    @Override
    public void sayBye(LogTimer timer) {
        final String elapsed = timer.toString();
        info("dataset parsed from file in " + elapsed);
    }

    /** @return the tag used for all parser log output */
    @Override
    public Algo getLoggerTag() {
        final Algo tag = parserLoggerTag();
        return tag;
    }

    /** @return true once a warning was logged by this reader or inherited from its setup */
    @Override
    public boolean hasWarnings() {
        return this.hasWarnings;
    }

    /**
     * The logging tag of the parser. Gets called from Setup class as well.
     * @return {@code Algo.PARSER}
     */
    final static Algo parserLoggerTag() {
        final Algo tag = Algo.PARSER;
        return tag;
    }
}