com.ibm.bi.dml.runtime.io.ReaderTextCSV.java Source code

Java tutorial

Introduction

Here is the source code for com.ibm.bi.dml.runtime.io.ReaderTextCSV.java

Source

/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
*/

package com.ibm.bi.dml.runtime.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.matrix.CSVReblockMR;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.util.UtilFunctions;

/**
 * Matrix reader for text CSV input located on a Hadoop {@link FileSystem}.
 *
 * <p>The input path may be a single file or a directory of part files; in the
 * latter case all non-hidden files are read in sorted path order and their
 * rows are stacked into one {@link MatrixBlock}. If the matrix dimensions are
 * unknown (non-positive), they are determined by an additional pass over the
 * input before the data is read.
 */
public class ReaderTextCSV extends MatrixReader {

    /** CSV format settings (header flag, delimiter, fill flag, fill value) used for every read. */
    private final CSVFileFormatProperties _props;

    /**
     * Creates a reader that uses the given CSV format properties.
     *
     * @param props CSV properties queried for header, delimiter, fill and fill value on each read
     */
    public ReaderTextCSV(CSVFileFormatProperties props) {
        _props = props;
    }

    /**
     * Reads the CSV matrix stored under {@code fname} into a single matrix block.
     *
     * @param fname  file or directory name of the CSV input
     * @param rlen   number of rows; if {@code rlen} or {@code clen} is not positive,
     *               the dimensions are derived from the input itself
     * @param clen   number of columns (see {@code rlen})
     * @param brlen  forwarded to the core read, which does not use it
     * @param bclen  forwarded to the core read, which does not use it
     * @param estnnz forwarded to {@code createOutputMatrixBlock} when the output is pre-allocated
     * @return the matrix block holding the parsed values
     * @throws IOException if the input cannot be read or a line fails validation
     * @throws DMLRuntimeException declared by the operations invoked here
     */
    @Override
    public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
            throws IOException, DMLRuntimeException {
        //allocate output matrix block
        //(otherwise the block is created from the computed CSV size for unknown dimensions)
        MatrixBlock ret = null;
        if (rlen > 0 && clen > 0) {
            ret = createOutputMatrixBlock(rlen, clen, estnnz, true, false);
        }

        //prepare file access
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fname);

        //check existence and non-empty file
        checkValidInputFile(fs, path);

        //core read
        ret = readCSVMatrixFromHDFS(path, job, fs, ret, rlen, clen, brlen, bclen, _props.hasHeader(),
                _props.getDelim(), _props.isFill(), _props.getFillValue());

        //finally check if change of sparse/dense block representation required
        //(nnz explicitly maintained during read)
        ret.examSparsity();

        return ret;
    }

    /**
     * Core read: parses all lines of all input files into {@code dest}.
     *
     * @param path      file or directory to read; also used as the file name in column-count errors
     * @param job       job configuration, passed on to the size computation
     * @param fs        file system used to list and open the input
     * @param dest      pre-allocated output block, or {@code null} to create one from the
     *                  dimensions found in the input
     * @param rlen      number of rows (not used by this method)
     * @param clen      expected number of columns; replaced by the computed column count
     *                  when {@code dest} is {@code null}
     * @param brlen     not used by this method
     * @param bclen     not used by this method
     * @param hasHeader if {@code true}, the first line of the first file is skipped
     * @param delim     delimiter handed to {@code IOUtilFunctions.split}
     * @param fill      passed to the empty-field check together with the per-line empty flag
     * @param fillValue value substituted for empty fields
     * @return the filled output block with its non-zero count set
     * @throws IOException if reading fails or a line does not pass the sanity checks
     */
    @SuppressWarnings("unchecked")
    private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen,
            long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
            throws IOException {
        //collect input files (all non-hidden files of a directory in sorted order)
        List<Path> files = new ArrayList<Path>();
        if (fs.isDirectory(path)) {
            for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter)) {
                files.add(stat.getPath());
            }
            Collections.sort(files);
        } else {
            files.add(path);
        }

        //unknown dimensions: derive them from the input and allocate the output accordingly
        if (dest == null) {
            dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
            clen = dest.getNumColumns();
        }

        boolean sparse = dest.isInSparseFormat();
        int row = 0; //global row index across all files
        long lnnz = 0; //number of non-zero cells written

        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            try {
                //header skip is inside the try block so the reader is closed even if it fails
                if (fileNo == 0 && hasHeader) {
                    br.readLine(); //ignore header
                }

                String line;
                while ((line = br.readLine()) != null) //foreach line
                {
                    String cellStr = line.trim();
                    String[] parts = IOUtilFunctions.split(cellStr, delim);

                    //sanity check for number of columns; done before any cell is written so
                    //that an over-long row is rejected instead of being written past the
                    //expected width (NOTE(review): setValueDenseUnsafe presumably performs
                    //no bounds checking - confirm)
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);

                    boolean emptyValuesFound = false;
                    int col = 0;
                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        double cellValue;
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }

                        //only non-zero cells are written and counted
                        if (cellValue != 0) {
                            if (sparse) {
                                dest.appendValue(row, col, cellValue);
                            } else {
                                dest.setValueDenseUnsafe(row, col, cellValue);
                            }
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity check for empty values
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    row++;
                }
            } finally {
                IOUtilFunctions.closeSilently(br);
            }
        }

        //post processing
        dest.setNonZeros(lnnz);

        return dest;
    }

    /**
     * Determines the matrix dimensions by scanning the input files and creates a
     * matching empty output block.
     *
     * <p>The column count is the number of delimiter occurrences in the first
     * (trimmed) data line plus one; the row count is the total number of data
     * lines over all files. The first data line is taken from whichever file
     * provides it first, so an empty or header-only first file no longer yields
     * a wrong row count. If no data line exists at all, both dimensions remain
     * {@code -1}.
     *
     * @param files     input files in read order
     * @param job       not used by this method
     * @param fs        file system used to open the files
     * @param hasHeader if {@code true}, the first line of the first file is skipped
     * @param delim     delimiter counted to derive the number of columns
     * @param fill      not used by this method
     * @param fillValue not used by this method
     * @return a new matrix block with the computed dimensions, created with the sparse flag set
     * @throws IOException if a file cannot be opened or read
     */
    private MatrixBlock computeCSVSize(List<Path> files, JobConf job, FileSystem fs, boolean hasHeader,
            String delim, boolean fill, double fillValue) throws IOException {
        int nrow = -1;
        int ncol = -1;

        for (int fileNo = 0; fileNo < files.size(); fileNo++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
            try {
                if (fileNo == 0 && hasHeader) {
                    br.readLine(); //ignore header
                }

                String line;
                while ((line = br.readLine()) != null) {
                    if (ncol < 0) {
                        //first data line overall: defines the number of columns
                        ncol = StringUtils.countMatches(line.trim(), delim) + 1;
                        nrow = 1;
                    } else {
                        nrow++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(br);
            }
        }

        //create new matrix block (assume sparse for consistency w/ compiler)
        return new MatrixBlock(nrow, ncol, true);
    }
}