Source code for org.apache.sysml.runtime.io.FrameReaderTextCSV.java

Java tutorial

Introduction

Here is the source code for org.apache.sysml.runtime.io.FrameReaderTextCSV.java:

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.io;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.util.UtilFunctions;

/**
 * Single-threaded frame text csv reader.
 * 
 * Reads a text csv file from HDFS into a {@link FrameBlock}, optionally
 * determining the frame dimensions via an extra scan of the input when
 * they are unknown.
 */
public class FrameReaderTextCSV extends FrameReader {
    //csv format properties (header indicator, delimiter, fill policy)
    protected CSVFileFormatProperties _props = null;

    public FrameReaderTextCSV(CSVFileFormatProperties props) {
        _props = props;
    }

    /**
     * Reads the given csv file into a frame block; if the dimensions are
     * unknown (non-positive), they are computed first via an extra pass
     * over the data.
     * 
     * @param fname  hdfs file name
     * @param schema value types per column (defaults applied if null/short)
     * @param names  column names (defaults applied if null/short)
     * @param rlen   number of rows, or &lt;=0 if unknown
     * @param clen   number of columns, or &lt;=0 if unknown
     * @return frame block holding the file contents
     * @throws DMLRuntimeException if the output frame block cannot be created
     * @throws IOException if the file is missing, empty, or unreadable
     */
    @Override
    public final FrameBlock readFrameFromHDFS(String fname, List<ValueType> schema, List<String> names, long rlen,
            long clen) throws IOException, DMLRuntimeException {
        //prepare file access
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);

        //check existence and non-empty file
        checkValidInputFile(fs, path);

        //compute size if necessary (requires an extra scan of the input)
        if (rlen <= 0 || clen <= 0) {
            Pair<Integer, Integer> size = computeCSVSize(path, job, fs);
            rlen = size.getKey();
            clen = size.getValue();
        }

        //allocate output frame block
        List<ValueType> lschema = createOutputSchema(schema, clen);
        List<String> lnames = createOutputNames(names, clen);
        FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen);

        //core read (sequential/parallel) 
        readCSVFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen);

        return ret;
    }

    /**
     * Sequentially reads all input splits into the destination frame block.
     * 
     * @param path   input file path
     * @param job    hadoop job configuration
     * @param fs     file system handle
     * @param dest   pre-allocated destination frame block (rlen x clen)
     * @param schema value types per column
     * @param names  column names
     * @param rlen   number of rows
     * @param clen   number of columns
     * @throws IOException if the input cannot be read or parsed
     */
    protected void readCSVFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
            List<ValueType> schema, List<String> names, long rlen, long clen) throws IOException {
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        splits = IOUtilFunctions.sortInputSplits(splits);
        //maintain a running row offset so consecutive splits append to the
        //destination instead of overwriting rows written by prior splits
        //(previously a constant offset of 0 was passed for every split)
        int rl = 0;
        for (int i = 0; i < splits.length; i++)
            rl = readCSVFrameFromInputSplit(splits[i], informat, job, dest, schema, names, rlen, clen, rl, i == 0);
    }

    /**
     * Reads a single input split into the destination frame block, starting
     * at row offset rl. The header line (if declared) is consumed only for
     * the first split of the file.
     * 
     * @param split    input split to read
     * @param informat configured text input format
     * @param job      hadoop job configuration
     * @param dest     destination frame block
     * @param schema   value types per column
     * @param names    column names
     * @param rlen     number of rows
     * @param clen     number of columns
     * @param rl       row offset of this split within the destination block
     * @param first    indicator for the first split (drives header handling)
     * @return row offset after the last row written by this split
     * @throws IOException in case of read or parse errors
     */
    protected final int readCSVFrameFromInputSplit(InputSplit split, TextInputFormat informat, JobConf job,
            FrameBlock dest, List<ValueType> schema, List<String> names, long rlen, long clen, int rl,
            boolean first) throws IOException {
        boolean hasHeader = _props.hasHeader();
        boolean isFill = _props.isFill();
        double dfillValue = _props.getFillValue();
        String sfillValue = String.valueOf(_props.getFillValue());
        String delim = _props.getDelim();

        //create record reader
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        LongWritable key = new LongWritable();
        Text value = new Text();
        int row = rl;
        int col = -1;

        //handle header if existing
        if (first && hasHeader) {
            reader.next(key, value); //read header
            //use the csv tokenizer instead of String.split, which interprets
            //the delimiter as a regex (e.g., '|' or '.' would split wrongly)
            List<String> colnames = Arrays.asList(IOUtilFunctions.split(value.toString(), delim));
            dest.setColumnNames(colnames);
        }

        // Read the data
        boolean emptyValuesFound = false;
        try {
            while (reader.next(key, value)) //foreach line
            {
                String cellStr = value.toString().trim();
                emptyValuesFound = false;
                col = 0;
                String[] parts = IOUtilFunctions.split(cellStr, delim);

                for (String part : parts) //foreach cell
                {
                    part = part.trim();
                    if (part.isEmpty()) {
                        //only materialize non-zero fill values
                        //(NOTE(review): assumes zero is the block default — confirm FrameBlock semantics)
                        if (isFill && dfillValue != 0)
                            dest.set(row, col, UtilFunctions.stringToObject(schema.get(col), sfillValue));
                        emptyValuesFound = true;
                    } else {
                        dest.set(row, col, UtilFunctions.stringToObject(schema.get(col), part));
                    }
                    col++;
                }

                //sanity checks for empty values and number of columns
                IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, isFill, emptyValuesFound);
                IOUtilFunctions.checkAndRaiseErrorCSVNumColumns("", cellStr, parts, clen);
                row++;
            }
        } finally {
            IOUtilFunctions.closeSilently(reader);
        }

        //next row offset for subsequent splits
        return row;
    }

    /**
     * Computes the dimensions (number of rows and columns) of the given csv
     * file via a full scan; a declared header line is not counted as a row.
     * 
     * @param path input file path
     * @param job  hadoop job configuration
     * @param fs   file system handle
     * @return pair of (number of rows, number of columns)
     * @throws IOException if the input cannot be read
     */
    protected Pair<Integer, Integer> computeCSVSize(Path path, JobConf job, FileSystem fs) throws IOException {
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);
        splits = IOUtilFunctions.sortInputSplits(splits);

        boolean first = true;
        int ncol = -1;
        int nrow = -1;

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = new LongWritable();
            Text value = new Text();

            try {
                //read header and first data line to determine num columns;
                //use the same tokenizer as the core read (instead of a raw
                //substring count of the delimiter) so the column count always
                //matches the number of cells the parser produces per line
                if (first) {
                    if (_props.hasHeader())
                        reader.next(key, value); //ignore header
                    reader.next(key, value);
                    ncol = IOUtilFunctions.split(value.toString(), _props.getDelim()).length;
                    nrow = 1;
                    first = false;
                }

                //count remaining number of rows
                while (reader.next(key, value))
                    nrow++;
            } finally {
                IOUtilFunctions.closeSilently(reader);
            }
        }

        return new Pair<Integer, Integer>(nrow, ncol);
    }
}