com.github.jferard.pgloaderutils.sniffer.csv.CSVOptionalHeaderSniffer.java Source code

Java tutorial

Introduction

Here is the source code for com.github.jferard.pgloaderutils.sniffer.csv.CSVOptionalHeaderSniffer.java

Source

/*
 * Some utilities for loading csv data into a PostgreSQL database:
 * detect file encoding, CSV format and populate database
 *
 *     Copyright (C) 2016, 2018 J. Férard <https://github.com/jferard>
 *
 * This file is part of pgLoader Utils.
 *
 * pgLoader Utils is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * pgLoader Utils is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.github.jferard.pgloaderutils.sniffer.csv;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import com.github.jferard.pgloaderutils.sniffer.Sniffer;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.csv.QuoteMode;

/**
 * The CSVOptionalHeaderSniffer class is a Sniffer that checks whether the
 * provided stream starts with a header row. Here are the steps:
 *
 * <ol>
 * <li>if the stream contains no record, or if the first row contains at least
 * one digits-only value, then there is no header in the stream;</li>
 * <li>otherwise, if at least one column has a first cell whose signature is
 * {@code '?'} while the signature of the remaining rows for that column is
 * not {@code '?'} (i.e. a non-digit first cell above digit cells), then the
 * first row is the header;</li>
 * <li>otherwise, no header is reported.</li>
 * </ol>
 *
 * The result of the last call to {@link #sniff(InputStream, int)} is available
 * through {@link #getHeader()}.
 *
 * @author Julien Férard (C) 2016
 *
 */
public class CSVOptionalHeaderSniffer implements Sniffer {
    /** Signature character tested to detect a digits-only value. */
    private static final char DIGITS_SIGNATURE = 'D';

    /**
     * Signature character tested to detect a cell/column without a recognized
     * type (presumably "anything else" -- see RowSignaturesAnalyzer).
     */
    private static final char UNKNOWN_SIGNATURE = '?';

    /**
     * Builds a sniffer from the raw bytes describing the CSV dialect.
     *
     * @param delimiter the field delimiter
     * @param quote     the quote character
     * @param escape    the escape character; ignored when equal to
     *                  {@code quote}
     * @param charset   the charset used to decode the sniffed stream
     * @return a new sniffer using a minimal-quoting format that allows missing
     *         column names
     */
    public static CSVOptionalHeaderSniffer getSniffer(final byte delimiter, final byte quote, final byte escape,
            final Charset charset) {
        CSVFormat csvFormat = CSVFormat.newFormat(toChar(delimiter)).withQuote(toChar(quote))
                .withQuoteMode(QuoteMode.MINIMAL).withAllowMissingColumnNames();
        // an escape identical to the quote means "doubled quote": no separate escape char is set.
        if (escape != quote) {
            csvFormat = csvFormat.withEscape(toChar(escape));
        }
        return new CSVOptionalHeaderSniffer(csvFormat, charset);
    }

    /**
     * Converts a byte to the char having the same unsigned value. A plain
     * {@code (char) b} cast sign-extends negative bytes (0xA7 would become
     * U+FFA7 instead of U+00A7).
     */
    private static char toChar(final byte b) {
        return (char) (b & 0xFF);
    }

    private final Charset charset;

    private final CSVFormat csvFormat;

    /** The detected header cells, or null if no header was detected. */
    private List<String> header;

    private final RowSignaturesAnalyzer rowSignaturesAnalyzer;

    /**
     * @param csvFormat the format used to parse the sniffed stream
     * @param charset   the charset used to decode the sniffed stream
     */
    public CSVOptionalHeaderSniffer(final CSVFormat csvFormat, final Charset charset) {
        this.charset = charset;
        this.csvFormat = csvFormat;
        this.rowSignaturesAnalyzer = new RowSignaturesAnalyzer();
    }

    /**
     * @return the cells of the header found by the last call to
     *         {@link #sniff(InputStream, int)}, or {@code null} if no header
     *         was detected (or if {@code sniff} was never called).
     */
    public List<String> getHeader() {
        return this.header;
    }

    /**
     * Parses the stream as CSV and decides whether its first row is a header.
     * The parser is always closed before this method returns.
     *
     * @param inputStream the stream to analyze, decoded with this sniffer's
     *                    charset
     * @param size        not used by this implementation
     * @throws IOException if the parser can't be created or closed
     */
    @Override
    public void sniff(final InputStream inputStream, final int size) throws IOException {
        // Forget the result of a previous call: without this reset, a stream
        // without header would report the header of the previously sniffed stream.
        this.header = null;

        final Reader streamReader = new InputStreamReader(inputStream, this.charset);
        final CSVParser parser = new CSVParser(streamReader, this.csvFormat);
        try {
            final Iterator<CSVRecord> iterator = parser.iterator();
            if (!iterator.hasNext()) {
                return; // no record at all: no header
            }

            final CSVRecord firstRowRecord = iterator.next();
            final int firstRowSize = firstRowRecord.size();
            final char[] firstRowSignature = this.rowSignaturesAnalyzer.getSignature(firstRowRecord,
                    firstRowSize);
            if (this.containsAtLeastOneOnlyDigitsValue(firstRowSignature)) {
                return; // a digits-only cell in the first row: this is data, not a header
            }

            // the iterator now points to the second row: the analyzer consumes the rest.
            final char[] remainingRowsSignature = this.rowSignaturesAnalyzer
                    .getRemainingRowsSignature(iterator, firstRowSize);
            if (this.containsAtLeastOneColumnWithLetterHeaderAndDigitValues(firstRowSignature,
                    remainingRowsSignature, firstRowSize)) {
                // copy firstRow in header
                final List<String> firstRowCells = new ArrayList<String>(firstRowSize);
                for (final String cell : firstRowRecord) {
                    firstRowCells.add(cell);
                }
                this.header = firstRowCells;
            }
        } finally {
            parser.close();
        }
    }

    /**
     * @param firstRowSignature      the signature of the first row
     * @param remainingRowsSignature the signature of the other rows
     * @param firstRowSize           the number of columns to compare
     * @return true if, for at least one column, the first row signature is
     *         {@code '?'} and the remaining rows signature is not
     */
    private boolean containsAtLeastOneColumnWithLetterHeaderAndDigitValues(final char[] firstRowSignature,
            final char[] remainingRowsSignature, final int firstRowSize) {
        for (int col = 0; col < firstRowSize; col++) {
            // at least one column with non digit first cell and digit next
            // cells
            if (firstRowSignature[col] == UNKNOWN_SIGNATURE && remainingRowsSignature[col] != UNKNOWN_SIGNATURE) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param firstRowSignature the signature of the first row
     * @return true if at least one cell of the signature is {@code 'D'}
     */
    private boolean containsAtLeastOneOnlyDigitsValue(final char[] firstRowSignature) {
        for (final char c : firstRowSignature) {
            if (c == DIGITS_SIGNATURE) {
                return true;
            }
        }
        return false;
    }
}