Java tutorial: CsvRecordReader
/*
 * Copyright 2015 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datascience.hadoop;

import com.datascience.util.CsvParseException;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

/**
 * CSV record reader.
 * <p>
 * The CSV record reader handles reading CSV records from a {@link java.io.Reader} on behalf of a
 * {@link com.datascience.hadoop.CsvInputFormat}. Internally, the record reader uses a
 * {@link org.apache.commons.csv.CSVParser} to iterate through the provided stream and parse records.
 * <p>
 * This record reader is agnostic about the compression of the provided input stream. Additionally, note that this
 * reader does <em>not</em> currently support input splits. It instead assumes that each provided
 * {@link java.io.Reader stream} represents a full data set.
 *
 * @author <a href="http://github.com/kuujo">Jordan Halterman</a>
 */
public class CsvRecordReader implements RecordReader<LongWritable, ListWritable<Text>> {
  private static final Logger LOGGER = LoggerFactory.getLogger(CsvRecordReader.class);

  // Reusable Text objects, one per column, so a new Text is not allocated for
  // every field of every record. Records with more than 1024 columns would
  // overflow this cache.
  private final Text[] cache = new Text[1024];
  private final CSVParser parser;
  private final Iterator<CSVRecord> iterator;
  private final long length;
  private final boolean strict;
  private long position = 0;
  private Integer colLength;

  public CsvRecordReader(Reader reader, CSVFormat format, long length, boolean strict) throws IOException {
    this.length = length;
    this.strict = strict;
    parser = new CSVParser(reader, format);
    iterator = parser.iterator();
    // If the format declares a header, the expected column count is known up
    // front; otherwise it is inferred from the first record read in next().
    colLength = parser.getHeaderMap() == null ? null : parser.getHeaderMap().size();
  }

  @Override
  public boolean next(LongWritable key, ListWritable<Text> value) throws IOException {
    value.clear();
    try {
      if (iterator.hasNext()) {
        CSVRecord record = iterator.next();
        // position counts records read, not a character offset into the stream.
        position++;
        colLength = colLength == null ? record.size() : colLength;
        if (strict && (!record.isConsistent() || record.size() != colLength)) {
          throw new CsvParseException(String.format("inconsistent record at position %d", position));
        }

        key.set(record.getRecordNumber());
        for (int i = 0; i < record.size(); i++) {
          String item = record.get(i);
          if (item == null) {
            value.add(null);
          } else {
            // Reuse the cached Text for this column rather than allocating a new one.
            Text text = cache[i];
            if (text == null) {
              text = new Text();
              cache[i] = text;
            }
            text.set(item);
            value.add(text);
          }
        }
        return true;
      }
    } catch (Exception e) {
      LOGGER.warn("failed to parse record at position: {}", position, e);
      if (strict) {
        throw e;
      } else {
        // In lenient mode, skip the malformed record and try the next one.
        return next(key, value);
      }
    }
    return false;
  }

  @Override
  public LongWritable createKey() {
    return new LongWritable();
  }

  @Override
  public ListWritable<Text> createValue() {
    return new ListWritable<>(Text.class);
  }

  @Override
  public long getPos() throws IOException {
    return position;
  }

  @Override
  public float getProgress() throws IOException {
    // Approximate progress, capped at 100%.
    return Math.min(1.0f, (float) position / length);
  }

  @Override
  public void close() throws IOException {
    parser.close();
  }
}
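The reader can be exercised on its own, without a full MapReduce job. Below is a minimal sketch, placed in the same com.datascience.hadoop package so it can use ListWritable directly; it assumes ListWritable behaves as a java.util.List of Text (next() above already relies on its clear() and add() methods), and the sample CSV content is illustrative.

package com.datascience.hadoop;

import org.apache.commons.csv.CSVFormat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import java.io.StringReader;

public class CsvRecordReaderExample {
  public static void main(String[] args) throws Exception {
    // Two well-formed rows under a header line; the header fixes the expected column count at 2.
    String csv = "id,name\n1,alice\n2,bob\n";

    CsvRecordReader reader = new CsvRecordReader(
        new StringReader(csv),
        CSVFormat.DEFAULT.withHeader(), // read the header from the first line
        csv.length(),                   // length, used only by getProgress()
        true);                          // strict: inconsistent records raise CsvParseException

    LongWritable key = reader.createKey();
    ListWritable<Text> value = reader.createValue();
    while (reader.next(key, value)) {
      // key holds the record number assigned by the CSV parser; value holds one Text per column.
      System.out.println(key.get() + " -> " + value);
    }
    reader.close();
  }
}

Strict mode is the safer choice here: with strict set to false, next() recurses past each malformed record, which silently drops data and adds a stack frame per skipped record.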