com.fasterxml.jackson.dataformat.csv.CsvParser.java Source code

Introduction

Here is the source code for com.fasterxml.jackson.dataformat.csv.CsvParser.java
Source

package com.fasterxml.jackson.dataformat.csv;

import java.io.*;
import java.math.BigDecimal;
import java.math.BigInteger;

import com.fasterxml.jackson.core.*;
import com.fasterxml.jackson.core.base.ParserMinimalBase;
import com.fasterxml.jackson.core.io.IOContext;
import com.fasterxml.jackson.core.json.JsonReadContext;
import com.fasterxml.jackson.core.util.BufferRecycler;
import com.fasterxml.jackson.core.util.ByteArrayBuilder;
import com.fasterxml.jackson.dataformat.csv.impl.CsvReader;
import com.fasterxml.jackson.dataformat.csv.impl.TextBuffer;

/**
 * {@link JsonParser} implementation used to expose CSV documents
 * in a form that allows other Jackson functionality to deal
 * with them.
 *<p>
 * Implementation is based on a state machine that pulls information
 * using {@link CsvReader}.
 */
public class CsvParser extends ParserMinimalBase {
    /**
     * On/off switches that are specific to CSV parsing. Every constant
     * knows its default state and the single bit it occupies in a
     * feature bit mask.
     */
    public enum Feature {
        /**
         * Decides whether spaces around separator characters (commas)
         * are automatically trimmed before values are reported.
         * This does NOT force trimming of white space found inside
         * double-quoted values; it only affects white space surrounding
         * unquoted values (white space outside of double-quotes is never
         * included regardless of trimming).
         *<p>
         * Disabled by default, as per
         * <a href="http://tools.ietf.org/html/rfc4180">RFC-4180</a>.
         */
        TRIM_SPACES(false),

        /**
         * Decides how the stream of records (usually CSV lines, but
         * sometimes multiple lines when linefeeds occur within quoted
         * values) is exposed: as a sequence of Objects (false), or as an
         * array of Objects (true).
         * A sequence of Objects is convenient with
         * <code>ObjectMapper.readValues(...)</code>, whereas an array of
         * Objects is convenient when binding to <code>List</code>s or
         * arrays of values.
         *<p>
         * Disabled by default, meaning that a CSV document is exposed as
         * a sequence of root-level Object entries.
         */
        WRAP_AS_ARRAY(false);

        final boolean _defaultState;
        final int _mask;

        Feature(boolean defaultState) {
            _mask = 1 << ordinal();
            _defaultState = defaultState;
        }

        /**
         * Computes the bit set (flags) formed by all features that are
         * enabled by default.
         */
        public static int collectDefaults() {
            int mask = 0;
            for (Feature feature : Feature.values()) {
                mask |= feature.enabledByDefault() ? feature.getMask() : 0;
            }
            return mask;
        }

        /**
         * @return bit that represents this feature within a feature mask
         */
        public int getMask() {
            return _mask;
        }

        /**
         * @return whether this feature is on unless explicitly configured
         */
        public boolean enabledByDefault() {
            return _defaultState;
        }
    }

    private final static CsvSchema EMPTY_SCHEMA;
    static {
        EMPTY_SCHEMA = CsvSchema.emptySchema();
    }

    /*
    /**********************************************************************
    /* State constants
    /**********************************************************************
     */

    /**
     * Initial state before anything is read from document;
     * {@link #nextToken} dispatches to {@link #_handleStartDoc}.
     */
    protected final static int STATE_DOC_START = 0;

    /**
     * State before logical start of a record, in which next
     * token to return will be {@link JsonToken#START_OBJECT}
     * (or if schema has no columns, {@link JsonToken#START_ARRAY});
     * handled by {@link #_handleRecordStart}.
     */
    protected final static int STATE_RECORD_START = 1;

    /**
     * State in which next entry of a record with named columns is
     * read, returning either {@link JsonToken#FIELD_NAME} or, at end
     * of the record, {@link JsonToken#END_OBJECT};
     * handled by {@link #_handleNextEntry}.
     */
    protected final static int STATE_NEXT_ENTRY = 2;

    /**
     * State in which value matching the field name just exposed will
     * be returned; handled by {@link #_handleNamedValue}.
     */
    protected final static int STATE_NAMED_VALUE = 3;

    /**
     * State in which "unnamed" value (entry in an array)
     * will be returned, if one available; otherwise
     * end-array is returned. Handled by {@link #_handleUnnamedValue}.
     */
    protected final static int STATE_UNNAMED_VALUE = 4;

    /**
     * State in which end marker is returned; either
     * null (if parsing context is at root level), or a closing
     * marker ({@link JsonToken#END_ARRAY} for array wrapping).
     * This state is never left, so further calls to
     * {@link #nextToken} keep returning nulls once root level
     * has been reached.
     */
    protected final static int STATE_DOC_END = 5;

    /*
    /**********************************************************************
    /* Configuration
    /**********************************************************************
     */

    /**
     * Codec used for data binding when (if) requested; accessible
     * through {@link #getCodec} and {@link #setCodec}.
     */
    protected ObjectCodec _objectCodec;

    /**
     * Bit mask of enabled CSV-specific {@link Feature}s, combined from
     * {@link Feature#getMask()} values.
     */
    protected int _csvFeatures;

    /**
     * Definition of columns being read. Initialized to "empty" instance, which
     * has default configuration settings.
     */
    protected CsvSchema _schema = EMPTY_SCHEMA;

    /**
     * Number of columns defined by schema; refreshed whenever
     * {@link #setSchema} is called.
     */
    protected int _columnCount = 0;

    /*
    /**********************************************************************
    /* State
    /**********************************************************************
     */

    /**
     * Information about parser context, context in which
     * the next token is to be parsed (root, array, object).
     */
    protected JsonReadContext _parsingContext;

    /**
     * Name of column that we exposed most recently, accessible after
     * {@link JsonToken#FIELD_NAME} as well as value tokens immediately
     * following field name. May also be replaced through
     * {@link #overrideCurrentName}.
     */
    protected String _currentName;

    /**
     * String value most recently read for the current column.
     */
    protected String _currentValue;

    /**
     * Index of the column we are exposing; reset to 0 at the
     * start of every record.
     */
    protected int _columnIndex;

    /**
     * Current logical state of the parser; one of <code>STATE_</code>
     * constants.
     */
    protected int _state = STATE_DOC_START;

    /**
     * We will hold on to decoded binary data, for duration of
     * current event, so that multiple calls to
     * {@link #getBinaryValue} will not need to decode data more
     * than once. Cleared on every call to {@link #nextToken}.
     */
    protected byte[] _binaryValue;

    /*
    /**********************************************************************
    /* Helper objects
    /**********************************************************************
     */

    /**
     * Thing that actually reads the CSV content
     */
    protected final CsvReader _reader;

    /**
     * Buffer that contains contents of all values after processing
     * of doubled-quotes, escaped characters.
     */
    protected final TextBuffer _textBuffer;

    /**
     * Builder used for collecting base64-decoded binary content;
     * created on first use and reset on later uses by
     * {@link #_getByteArrayBuilder}.
     */
    protected ByteArrayBuilder _byteArrayBuilder;

    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */

    public CsvParser(IOContext ctxt, BufferRecycler br, int parserFeatures, int csvFeatures, ObjectCodec codec,
            Reader reader) {
        super(parserFeatures);
        _objectCodec = codec;
        _textBuffer = new TextBuffer(br);
        _csvFeatures = csvFeatures;
        _parsingContext = JsonReadContext.createRootContext();
        _reader = new CsvReader(this, ctxt, reader, _schema, _textBuffer,
                isEnabled(JsonParser.Feature.AUTO_CLOSE_SOURCE), isEnabled(Feature.TRIM_SPACES));
    }

    /*                                                                                       
    /**********************************************************                              
    /* Versioned                                                                             
    /**********************************************************                              
     */

    /**
     * Returns the version of this package, as defined by
     * <code>PackageVersion.VERSION</code>.
     */
    @Override
    public Version version() {
        return PackageVersion.VERSION;
    }

    /*
    /**********************************************************                              
    /* Overridden methods
    /**********************************************************                              
     */

    /**
     * Returns the codec currently associated with this parser.
     */
    @Override
    public ObjectCodec getCodec() {
        return _objectCodec;
    }

    /**
     * Replaces the codec associated with this parser.
     *
     * @param c codec to use from now on
     */
    @Override
    public void setCodec(ObjectCodec c) {
        _objectCodec = c;
    }

    /**
     * Tells whether given schema can be used with this parser: only
     * instances of {@link CsvSchema} are accepted (null is not).
     */
    @Override
    public boolean canUseSchema(FormatSchema schema) {
        return (schema instanceof CsvSchema);
    }

    /**
     * Changes the schema used for reading: a {@link CsvSchema} is taken
     * as is, <code>null</code> resets the parser to the empty schema, and
     * any other schema type is handed to the base class implementation
     * (which presumably rejects it; confirm against base class).
     * Column count and the underlying reader are updated to match.
     *
     * @param schema schema to use; may be null to reset to empty schema
     */
    @Override
    public void setSchema(FormatSchema schema) {
        if (schema instanceof CsvSchema) {
            _schema = (CsvSchema) schema;
        } else if (schema == null) {
            // must reset the field (not the parameter), otherwise the
            // previously configured schema would silently stay active
            _schema = EMPTY_SCHEMA;
        } else {
            super.setSchema(schema);
        }
        _columnCount = _schema.size();
        _reader.setSchema(_schema);
    }

    /**
     * Delegates to the underlying {@link CsvReader}, returning
     * whatever it reports for the given writer.
     */
    @Override
    public int releaseBuffered(Writer out) throws IOException {
        return _reader.releaseBuffered(out);
    }

    /**
     * Reports closed state of the underlying {@link CsvReader}.
     */
    @Override
    public boolean isClosed() {
        return _reader.isClosed();
    }

    /**
     * Closes the underlying {@link CsvReader}.
     */
    @Override
    public void close() throws IOException {
        _reader.close();
    }

    /*
    /***************************************************
    /* Public API, configuration
    /***************************************************
     */

    /**
     * Turns on the given CSV feature
     * (see {@link Feature} for available features).
     *
     * @return this parser, to allow call chaining
     */
    public JsonParser enable(Feature f) {
        final int bit = f.getMask();
        _csvFeatures = _csvFeatures | bit;
        return this;
    }

    /**
     * Turns off the given CSV feature
     * (see {@link Feature} for available features).
     *
     * @return this parser, to allow call chaining
     */
    public JsonParser disable(Feature f) {
        final int bit = f.getMask();
        _csvFeatures = _csvFeatures & ~bit;
        return this;
    }

    /**
     * Turns the given CSV feature on or off, depending on
     * <code>state</code> (see {@link Feature} for available features).
     *
     * @return this parser, to allow call chaining
     */
    public JsonParser configure(Feature f, boolean state) {
        if (!state) {
            disable(f);
            return this;
        }
        enable(f);
        return this;
    }

    /**
     * Checks whether the bit of the given CSV {@link Feature}
     * is set in the current feature mask.
     */
    public boolean isEnabled(Feature f) {
        final int bit = f.getMask();
        return (_csvFeatures & bit) != 0;
    }

    /**
     * Accessor for getting active schema definition: it may be
     * "empty" (no column definitions), but is not expected to be null
     * since it defaults to an empty schema (and default configuration)
     */
    @Override
    public CsvSchema getSchema() {
        return _schema;
    }

    /*
    /**********************************************************
    /* Location info
    /**********************************************************
     */

    /**
     * Returns the current parsing context (root, array or object).
     */
    @Override
    public JsonStreamContext getParsingContext() {
        return _parsingContext;
    }

    /**
     * Returns the token location as reported by the underlying
     * {@link CsvReader}.
     */
    @Override
    public JsonLocation getTokenLocation() {
        return _reader.getTokenLocation();
    }

    /**
     * Returns the current location as reported by the underlying
     * {@link CsvReader}.
     */
    @Override
    public JsonLocation getCurrentLocation() {
        return _reader.getCurrentLocation();
    }

    /**
     * Returns the input source as reported by the underlying
     * {@link CsvReader}.
     */
    @Override
    public Object getInputSource() {
        return _reader.getInputSource();
    }

    /*
    /**********************************************************
    /* Parsing
    /**********************************************************
     */

    /**
     * Returns the name of the column exposed most recently, or the
     * name set through {@link #overrideCurrentName}; may be null.
     */
    @Override
    public String getCurrentName() throws IOException, JsonParseException {
        return _currentName;
    }

    /**
     * Replaces the name that {@link #getCurrentName} returns.
     *
     * @param name name to report from now on
     */
    @Override
    public void overrideCurrentName(String name) {
        _currentName = name;
    }

    /**
     * Advances the parser by one token, dispatching on the current
     * logical state. Any binary value cached for the previous token is
     * dropped first. The token returned (including null at end of
     * input) is also recorded as the current token.
     *
     * @return next token, or null once end of content has been reached
     * @throws IllegalStateException if parser state is not one of the
     *    <code>STATE_</code> constants
     */
    @Override
    public JsonToken nextToken() throws IOException, JsonParseException {
        _binaryValue = null;
        switch (_state) {
        case STATE_DOC_START:
            return (_currToken = _handleStartDoc());
        case STATE_RECORD_START:
            return (_currToken = _handleRecordStart());
        case STATE_NEXT_ENTRY:
            return (_currToken = _handleNextEntry());
        case STATE_NAMED_VALUE:
            return (_currToken = _handleNamedValue());
        case STATE_UNNAMED_VALUE:
            return (_currToken = _handleUnnamedValue());
        case STATE_DOC_END:
            // record the token here too, so that current token does not
            // remain stale (last record's end marker) after end of input
            return (_currToken = _handleDocEnd());
        default:
            throw new IllegalStateException("Unknown parser state: " + _state);
        }
    }

    /**
     * Closes the reader and produces the token for end of document: null
     * at root level, otherwise the close marker of the enclosing context
     * (which is then popped).
     */
    private JsonToken _handleDocEnd() throws IOException, JsonParseException {
        _reader.close();
        if (_parsingContext.inRoot()) {
            return null;
        }
        // should always be in array, actually... but:
        boolean inArray = _parsingContext.inArray();
        _parsingContext = _parsingContext.getParent();
        return inArray ? JsonToken.END_ARRAY : JsonToken.END_OBJECT;
    }

    /*
    /**********************************************************
    /* Parsing, helper methods
    /**********************************************************
     */

    /**
     * Produces the very first token of the document: consumes the header
     * line and/or skips the first data row if schema asks for it, then
     * opens either the wrapper array or the first record. For documents
     * without any (remaining) content the parser goes straight to the
     * end-of-document state.
     *
     * @return first token, or null for an empty document that is not
     *    wrapped in an array
     */
    protected JsonToken _handleStartDoc() throws IOException, JsonParseException {
        // header line, if expected, comes first
        if (_schema.useHeader()) {
            _readHeaderLine();
        }
        // then the optional first data row to ignore
        if (_schema.skipFirstDataRow()) {
            _reader.skipLine();
        }

        final boolean hasContent = _reader.hasMoreInput();
        final boolean wrapInArray = isEnabled(Feature.WRAP_AS_ARRAY);

        if (!hasContent) {
            // empty document (zero bytes left): no entries at all
            _state = STATE_DOC_END;
            if (!wrapInArray) {
                return null;
            }
            // but even an empty sequence is still wrapped in logical array
            _parsingContext = _reader.childArrayContext(_parsingContext);
            return JsonToken.START_ARRAY;
        }
        if (!wrapInArray) {
            // no wrapping: same as starting any other record
            return _handleRecordStart();
        }
        _parsingContext = _reader.childArrayContext(_parsingContext);
        _state = STATE_RECORD_START;
        return JsonToken.START_ARRAY;
    }

    /**
     * Opens a new record: as an Object when schema defines columns,
     * or as an array when it does not. Column index is reset to 0.
     *
     * @return {@link JsonToken#START_OBJECT} or {@link JsonToken#START_ARRAY}
     */
    protected JsonToken _handleRecordStart() throws IOException, JsonParseException {
        _columnIndex = 0;
        final boolean hasColumns = (_columnCount != 0);
        if (hasColumns) {
            // named columns: record is exposed as an Object
            _parsingContext = _reader.childObjectContext(_parsingContext);
            _state = STATE_NEXT_ENTRY;
            return JsonToken.START_OBJECT;
        }
        // no schema columns: record is exposed as an array
        _state = STATE_UNNAMED_VALUE;
        _parsingContext = _reader.childArrayContext(_parsingContext);
        return JsonToken.START_ARRAY;
    }

    /**
     * Reads the next column value of a record that has named columns
     * and exposes the matching field name, or closes the record when
     * no more values are available on the line.
     * A single surplus all-whitespace value is skipped; any other
     * surplus value is reported as an error.
     *
     * @return {@link JsonToken#FIELD_NAME} or {@link JsonToken#END_OBJECT}
     */
    protected JsonToken _handleNextEntry() throws IOException, JsonParseException {
        // NOTE: only called when we do have real Schema
        String value = _reader.nextString();

        if (value == null) { // end of record or input...
            _parsingContext = _parsingContext.getParent();
            // a new line means just end of record; otherwise end of input
            _state = _reader.startNewLine() ? STATE_RECORD_START : STATE_DOC_END;
            return JsonToken.END_OBJECT;
        }
        _state = STATE_NAMED_VALUE;
        _currentValue = value;
        if (_columnIndex < _columnCount) {
            _currentName = _schema.column(_columnIndex).getName();
            return JsonToken.FIELD_NAME;
        }
        _currentName = null;
        /* 14-Mar-2012, tatu: As per [Issue-1], one specific kind of extra
         *  entry is allowed: a single all-whitespace entry can just be
         *  skipped
         */
        if (_columnIndex == _columnCount) {
            value = value.trim();
            if (value.length() == 0) {
                /* still need to verify that end-of-record follows;
                 * simplest way is to call ourselves again...
                 */
                return _handleNextEntry();
            }
        }
        _reportError("Too many entries: expected at most " + _columnCount + " (value #" + _columnCount + " ("
                + value.length() + " chars) \"" + value + "\")");
        _currentName = _schema.column(_columnIndex).getName();
        return JsonToken.FIELD_NAME;
    }

    /**
     * Exposes the value that belongs to the field name returned just
     * before: moves on to the next column and switches back to the
     * state in which the next entry is read.
     *
     * @return always {@link JsonToken#VALUE_STRING}
     */
    protected JsonToken _handleNamedValue() throws IOException, JsonParseException {
        _columnIndex += 1;
        _state = STATE_NEXT_ENTRY;
        return JsonToken.VALUE_STRING;
    }

    /**
     * Reads the next value of a record exposed as an array, or closes
     * the array when no more values are available on the line.
     *
     * @return {@link JsonToken#VALUE_STRING} or {@link JsonToken#END_ARRAY}
     */
    protected JsonToken _handleUnnamedValue() throws IOException, JsonParseException {
        final String value = _reader.nextString();
        if (value != null) {
            // another value; parser state stays as it is
            _currentValue = value;
            ++_columnIndex;
            return JsonToken.VALUE_STRING;
        }
        // end of record or input...
        _parsingContext = _parsingContext.getParent();
        // a new line means just end of record; otherwise end of whole thing
        _state = _reader.startNewLine() ? STATE_RECORD_START : STATE_DOC_END;
        return JsonToken.END_ARRAY;
    }

    /**
     * Method called to process the expected header line.
     * Two separate cases exist:
     *<ul>
     * <li>(a) Schema already has columns: header values are read and discarded</li>
     * <li>(b) Otherwise column definitions are built from the (trimmed)
     *   header values, on top of the settings of the existing schema, and
     *   installed via {@link #setSchema}; an empty header is reported as
     *   an error</li>
     *</ul>
     */
    protected void _readHeaderLine() throws IOException, JsonParseException {
        if (_schema.size() > 0) { // case (a); skip all/any
            while (_reader.nextString() != null) {
                // nothing to do: header values are simply discarded
            }
            return;
        }
        // case (b); read all
        String name;
        // base setting on existing schema, but drop columns
        CsvSchema.Builder builder = _schema.rebuild().clearColumns();

        while ((name = _reader.nextString()) != null) {
            // one more thing: always trim names, regardless of config settings
            name = name.trim();

            // See if "old" schema defined type; if so, use that type...
            CsvSchema.Column prev = _schema.column(name);
            if (prev != null) {
                builder.addColumn(name, prev.getType());
            } else {
                builder.addColumn(name);
            }
        }
        // Ok: did we get any columns?
        CsvSchema newSchema = builder.build();
        int size = newSchema.size();
        if (size < 2) { // 1 just because we may get 'empty' header name
            String first = (size == 0) ? "" : newSchema.column(0).getName().trim();
            if (first.length() == 0) {
                _reportError("Empty header line: can not bind data");
            }
        }
        // otherwise use what we got; reuse the instance that was just
        // validated instead of building an identical schema a second time
        setSchema(newSchema);
    }

    /*
    /**********************************************************
    /* String value handling
    /**********************************************************
     */

    /**
     * Tells whether {@link #getTextCharacters} can serve the current
     * token from the text buffer. Field names are held as
     * <code>String</code>s (never in the text buffer), so the answer is
     * always false for {@link JsonToken#FIELD_NAME}.
     */
    @Override
    public boolean hasTextCharacters() {
        if (_currToken == JsonToken.FIELD_NAME) {
            return false;
        }
        return _textBuffer.hasTextAsCharacters();
    }

    /**
     * Returns textual representation of the current token: the column
     * name for {@link JsonToken#FIELD_NAME}, otherwise the column value
     * read most recently.
     */
    @Override
    public String getText() throws IOException, JsonParseException {
        // the value is read before its name is exposed, so returning
        // it for FIELD_NAME would leak the value in place of the name
        if (_currToken == JsonToken.FIELD_NAME) {
            return _currentName;
        }
        return _currentValue;
    }

    /**
     * Returns characters of the current token's text: those of the
     * column name for {@link JsonToken#FIELD_NAME} (null if name has been
     * overridden with null), otherwise contents of the text buffer.
     */
    @Override
    public char[] getTextCharacters() throws IOException, JsonParseException {
        if (_currToken == JsonToken.FIELD_NAME) {
            // text buffer holds the column value, not its name
            return (_currentName == null) ? null : _currentName.toCharArray();
        }
        return _textBuffer.contentsAsArray();
    }

    /**
     * Returns length of the current token's text: length of the column
     * name for {@link JsonToken#FIELD_NAME} (0 if name has been overridden
     * with null), otherwise size of the text buffer.
     */
    @Override
    public int getTextLength() throws IOException, JsonParseException {
        if (_currToken == JsonToken.FIELD_NAME) {
            // text buffer holds the column value, not its name
            return (_currentName == null) ? 0 : _currentName.length();
        }
        return _textBuffer.size();
    }

    /**
     * Offset of the first text character within the array returned by
     * {@link #getTextCharacters}; always 0 for this parser.
     */
    @Override
    public int getTextOffset() throws IOException, JsonParseException {
        return 0;
    }

    /*
    /**********************************************************************
    /* Binary (base64)
    /**********************************************************************
     */

    /**
     * This parser never exposes embedded objects, so null is
     * always returned.
     */
    @Override
    public Object getEmbeddedObject() throws IOException, JsonParseException {
        return null;
    }

    /**
     * Decodes the current String value as base64 content, using the given
     * variant. Decoded bytes are cached, so that repeated calls for the
     * same token do not decode again.
     *
     * @param variant base64 variant to decode with
     * @return decoded binary content of the current value
     */
    @SuppressWarnings("resource")
    @Override
    public byte[] getBinaryValue(Base64Variant variant) throws IOException, JsonParseException {
        if (_binaryValue != null) {
            // already decoded for this token
            return _binaryValue;
        }
        if (_currToken != JsonToken.VALUE_STRING) {
            _reportError("Current token (" + _currToken + ") not VALUE_STRING, can not access as binary");
        }
        final ByteArrayBuilder decoded = _getByteArrayBuilder();
        _decodeBase64(_currentValue, decoded, variant);
        _binaryValue = decoded.toByteArray();
        return _binaryValue;
    }

    /*
    /**********************************************************************
    /* Number accessors
    /**********************************************************************
     */

    /**
     * Returns the number type as determined by the underlying
     * {@link CsvReader}.
     */
    @Override
    public NumberType getNumberType() throws IOException, JsonParseException {
        return _reader.getNumberType();
    }

    /**
     * Returns the numeric value as produced by the underlying
     * {@link CsvReader}.
     */
    @Override
    public Number getNumberValue() throws IOException, JsonParseException {
        return _reader.getNumberValue();
    }

    /**
     * Returns the <code>int</code> value as produced by the underlying
     * {@link CsvReader}.
     */
    @Override
    public int getIntValue() throws IOException, JsonParseException {
        return _reader.getIntValue();
    }

    /**
     * Returns the <code>long</code> value as produced by the underlying
     * {@link CsvReader}.
     */
    @Override
    public long getLongValue() throws IOException, JsonParseException {
        return _reader.getLongValue();
    }

    /**
     * Returns the {@link BigInteger} value as produced by the underlying
     * {@link CsvReader}.
     */
    @Override
    public BigInteger getBigIntegerValue() throws IOException, JsonParseException {
        return _reader.getBigIntegerValue();
    }

    /**
     * Returns the <code>float</code> value as produced by the underlying
     * {@link CsvReader}.
     */
    @Override
    public float getFloatValue() throws IOException, JsonParseException {
        return _reader.getFloatValue();
    }

    /**
     * Returns the <code>double</code> value as produced by the underlying
     * {@link CsvReader}.
     */
    @Override
    public double getDoubleValue() throws IOException, JsonParseException {
        return _reader.getDoubleValue();
    }

    /**
     * Returns the {@link BigDecimal} value as produced by the underlying
     * {@link CsvReader}.
     */
    @Override
    public BigDecimal getDecimalValue() throws IOException, JsonParseException {
        return _reader.getDecimalValue();
    }

    /*
    /**********************************************************************
    /* Helper methods from base class
    /**********************************************************************
     */

    /**
     * Reports an unexpected end of input, using a message that points
     * at a missing closing quote character.
     */
    @Override
    protected void _handleEOF() throws JsonParseException {
        // EOF is usually not a problem; presumably only reached within quoted content
        _reportInvalidEOF(": expected closing quote character");
    }

    /*
    /**********************************************************************
    /* Helper methods for CsvReader
    /**********************************************************************
     */

    /**
     * Reports a parsing problem with given message, by calling the
     * base class <code>_reportError</code>. Defined as a public method
     * presumably so that {@link CsvReader} (in another package) can
     * call it.
     *
     * @param msg description of the problem
     */
    public void _reportCsvError(String msg) throws JsonParseException {
        super._reportError(msg);
    }

    /**
     * Reports an unexpected character, by calling the base class
     * <code>_reportUnexpectedChar</code>.
     *
     * @param ch character (code) that was encountered
     * @param msg additional description of the problem
     */
    public void _reportUnexpectedCsvChar(int ch, String msg) throws JsonParseException {
        super._reportUnexpectedChar(ch, msg);
    }

    /*
    /**********************************************************************
    /* Internal methods
    /**********************************************************************
     */

    /**
     * Gives access to the reusable {@link ByteArrayBuilder}: the builder
     * is created on first call, and reset (emptied) on every later call.
     *
     * @return builder that is ready to collect new content
     */
    public ByteArrayBuilder _getByteArrayBuilder() {
        if (_byteArrayBuilder != null) {
            _byteArrayBuilder.reset();
            return _byteArrayBuilder;
        }
        _byteArrayBuilder = new ByteArrayBuilder();
        return _byteArrayBuilder;
    }
}