com.datascience.hadoop.CsvInputFormat.java Source code

Introduction

Here is the source code for com.datascience.hadoop.CsvInputFormat.java, a custom Hadoop input format (for the old mapred API) that reads CSV records with Apache Commons CSV. It supports Hadoop's standard compression codecs but does not currently support input splits.
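
Before the listing itself, here is a minimal sketch of a driver that might configure a job to use this input format. The configuration keys come from the constants declared in the class below; the job name, paths, column names, and the CsvJobDriver class are placeholders for illustration rather than part of the library.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

import com.datascience.hadoop.CsvInputFormat;

public class CsvJobDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(CsvJobDriver.class);
        conf.setJobName("csv-read-example"); // illustrative job name

        // Keys defined by CsvInputFormat; the values here are examples only.
        conf.set(CsvInputFormat.CHARSET, "UTF-8");
        conf.setBoolean(CsvInputFormat.STRICT_MODE, true);
        conf.setStrings(CsvInputFormat.CSV_READER_COLUMNS, "id", "name", "email");
        conf.set(CsvInputFormat.CSV_READER_DELIMITER, ",");
        conf.setBoolean(CsvInputFormat.CSV_READER_SKIP_HEADER, true);

        // Wire in the CSV input format; a real job would also set a mapper and
        // output key/value classes (see the mapper sketch after the listing).
        conf.setInputFormat(CsvInputFormat.class);
        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}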

Source

/*
 * Copyright 2015 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datascience.hadoop;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.QuoteMode;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.mapred.*;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * CSV input format.
 * <p>
 * This custom input format supports reading records from a CSV file using <a href="https://commons.apache.org/proper/commons-csv/">Apache Commons CSV</a>.
 * As with more basic Hadoop input formats such as {@link org.apache.hadoop.mapred.TextInputFormat}, this input format
 * supports decompression through Hadoop's standard {@link org.apache.hadoop.io.compress.CompressionCodec codecs}.
 * However, the limitations of the {@link com.datascience.hadoop.CsvRecordReader} mean that this input format <em>does not
 * currently support input splits</em>.
 *
 * @author <a href="http://github.com/kuujo">Jordan Halterman</a>
 */
public class CsvInputFormat extends FileInputFormat<LongWritable, ListWritable<Text>> implements JobConfigurable {
    // Configuration keys controlling the character set of the input and strict parsing mode.
    public static final String CHARSET = "csv.charset";
    public static final String STRICT_MODE = "csv.strict";

    // Configuration keys mapped onto Apache Commons CSV format options in createFormat(Configuration).
    public static final String CSV_READER_COLUMNS = "csv.reader.columns";
    public static final String CSV_READER_SKIP_HEADER = "csv.reader.skip_header";
    public static final String CSV_READER_DELIMITER = "csv.reader.delimiter";
    public static final String CSV_READER_RECORD_SEPARATOR = "csv.reader.record_separator";
    public static final String CSV_READER_QUOTE_CHARACTER = "csv.reader.quote";
    public static final String CSV_READER_QUOTE_MODE = "csv.reader.quote.mode";
    public static final String CSV_READER_ESCAPE_CHARACTER = "csv.reader.escape";
    public static final String CSV_READER_IGNORE_EMPTY_LINES = "csv.reader.ignore_empty_lines";
    public static final String CSV_READER_IGNORE_SURROUNDING_SPACES = "csv.reader.ignore_surrounding_lines";
    public static final String CSV_READER_NULL_STRING = "csv.reader.null";

    private static final boolean DEFAULT_CSV_READER_SKIP_HEADER = CSVFormat.DEFAULT.getSkipHeaderRecord();
    private static final String DEFAULT_CSV_READER_DELIMITER = String.valueOf(CSVFormat.DEFAULT.getDelimiter());
    private static final String DEFAULT_CSV_READER_RECORD_SEPARATOR = CSVFormat.DEFAULT.getRecordSeparator();
    private static final String DEFAULT_CSV_READER_QUOTE_CHARACTER = String
            .valueOf(CSVFormat.DEFAULT.getQuoteCharacter());
    private static final String DEFAULT_CSV_READER_QUOTE_MODE = CSVFormat.DEFAULT.getQuoteMode() != null
            ? CSVFormat.DEFAULT.getQuoteMode().name()
            : null;
    private static final String DEFAULT_CSV_READER_ESCAPE_CHARACTER = null;
    private static final boolean DEFAULT_CSV_READER_IGNORE_EMPTY_LINES = CSVFormat.DEFAULT.getIgnoreEmptyLines();
    private static final boolean DEFAULT_CSV_READER_IGNORE_SURROUNDING_SPACES = CSVFormat.DEFAULT
            .getIgnoreSurroundingSpaces();
    private static final String DEFAULT_CSV_READER_NULL_STRING = CSVFormat.DEFAULT.getNullString();

    private CompressionCodecFactory codecs;

    @Override
    public void configure(JobConf conf) {
        codecs = new CompressionCodecFactory(conf);
    }

    @Override
    protected boolean isSplitable(FileSystem fs, Path path) {
        // TODO: CsvRecordReader cannot currently support input splits.
        // final CompressionCodec codec = codecs.getCodec(path);
        // return codec == null || codec instanceof SplittableCompressionCodec;
        return false;
    }

    @Override
    public RecordReader<LongWritable, ListWritable<Text>> getRecordReader(InputSplit inputSplit, JobConf conf,
            Reporter reporter) throws IOException {
        String charsetName = conf.get(CHARSET);
        Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;

        FileSplit split = (FileSplit) inputSplit;
        Path path = split.getPath();
        FileSystem fs = path.getFileSystem(conf);
        InputStream is = fs.open(path);

        // If the input is compressed, load the compression codec.
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = codecFactory.getCodec(path);
        if (codec != null) {
            Decompressor decompressor = CodecPool.getDecompressor(codec);
            is = codec.createInputStream(is, decompressor);
        }
        return new CsvRecordReader(new InputStreamReader(is, charset), createFormat(conf), split.getLength(),
                conf.getBoolean(STRICT_MODE, true));
    }

    /**
     * Creates a CSV format from a Hadoop configuration.
     */
    private static CSVFormat createFormat(Configuration conf) {
        CSVFormat format = CSVFormat
                .newFormat(conf.get(CSV_READER_DELIMITER, DEFAULT_CSV_READER_DELIMITER).charAt(0))
                .withSkipHeaderRecord(conf.getBoolean(CSV_READER_SKIP_HEADER, DEFAULT_CSV_READER_SKIP_HEADER))
                .withRecordSeparator(conf.get(CSV_READER_RECORD_SEPARATOR, DEFAULT_CSV_READER_RECORD_SEPARATOR))
                .withIgnoreEmptyLines(
                        conf.getBoolean(CSV_READER_IGNORE_EMPTY_LINES, DEFAULT_CSV_READER_IGNORE_EMPTY_LINES))
                .withIgnoreSurroundingSpaces(conf.getBoolean(CSV_READER_IGNORE_SURROUNDING_SPACES,
                        DEFAULT_CSV_READER_IGNORE_SURROUNDING_SPACES))
                .withNullString(conf.get(CSV_READER_NULL_STRING, DEFAULT_CSV_READER_NULL_STRING));

        String[] header = conf.getStrings(CSV_READER_COLUMNS);
        if (header != null && header.length > 0)
            format = format.withHeader(header);

        String escape = conf.get(CSV_READER_ESCAPE_CHARACTER, DEFAULT_CSV_READER_ESCAPE_CHARACTER);
        if (escape != null)
            format = format.withEscape(escape.charAt(0));

        String quote = conf.get(CSV_READER_QUOTE_CHARACTER, DEFAULT_CSV_READER_QUOTE_CHARACTER);
        if (quote != null)
            format = format.withQuote(quote.charAt(0));

        String quoteMode = conf.get(CSV_READER_QUOTE_MODE, DEFAULT_CSV_READER_QUOTE_MODE);
        if (quoteMode != null)
            format = format.withQuoteMode(QuoteMode.valueOf(quoteMode));
        return format;
    }

}
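
To show how the key/value pairs produced above might be consumed, here is a sketch of a mapper for the same old mapred API. It assumes that com.datascience.hadoop.ListWritable, the value type used by this input format, exposes List-style access (size() and get(int)); the field indices and the CsvFieldMapper class are illustrative only.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import com.datascience.hadoop.ListWritable;

public class CsvFieldMapper extends MapReduceBase
        implements Mapper<LongWritable, ListWritable<Text>, Text, Text> {

    @Override
    public void map(LongWritable key, ListWritable<Text> value,
            OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
        // Assumption: ListWritable exposes List-style access to the record's fields.
        // Fields may be null when csv.reader.null is configured, so guard against that.
        if (value.size() < 2) {
            return; // skip short records (illustrative handling only)
        }
        Text first = value.get(0);
        Text second = value.get(1);
        if (first != null && second != null) {
            output.collect(first, second);
        }
    }
}

In the driver sketch from the introduction, this mapper would be registered with conf.setMapperClass(CsvFieldMapper.class), together with conf.setOutputKeyClass(Text.class) and conf.setOutputValueClass(Text.class).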