com.epam.dlab.module.ParserCsv.java Source code

Introduction

Here is the source code for com.epam.dlab.module.ParserCsv.java
Source

/***************************************************************************
    
Copyright (c) 2016, EPAM SYSTEMS INC
    
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    
http://www.apache.org/licenses/LICENSE-2.0
    
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    
****************************************************************************/

package com.epam.dlab.module;

import java.util.ArrayList;
import java.util.List;

import javax.validation.constraints.NotNull;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.epam.dlab.core.parser.ParserByLine;
import com.epam.dlab.exceptions.AdapterException;
import com.epam.dlab.exceptions.InitializationException;
import com.epam.dlab.exceptions.ParseException;
import com.fasterxml.jackson.annotation.JsonClassDescription;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.google.common.base.MoreObjects.ToStringHelper;

/** Parser module for delimited text (CSV) source data.
 * <p>
 * Each source line is split into field values by {@link #parseRow(String)} using the configured
 * field separator, field terminator (quote character) and escape character. The header line and
 * the number of leading lines to skip are handled by {@link #parseHeader()}.
 */
@JsonTypeName(ModuleName.PARSER_CSV)
@JsonClassDescription("CSV parser.\n" + "Parse source CSV format to common billing report.\n" + "  - type: "
        + ModuleName.PARSER_CSV + "\n"
        + "    [dataFile: <filename>]           - the file name to store working data of parser.]\n"
        + "    [columnStartDate: <column_name>] - the name of source column with date of data.]\n"
        + "    [columnMapping: >-\n"
        + "                    <targetColumn1=sourceColumnX;targetColumn2=sourceColumnY; ...;\n"
        + "                     tags=sourceColumnK,...,sourceColumnN>]\n"
        + "                                  - columns mapping to target from source columns.\n"
        + "                                    Know target columns: dlab_id, user,\n"
        + "                                    usage_date, product, usage_type, usage, cost,\n"
        + "                                    currency_code, resource_id, tags.\n" + "    [whereCondition: >-\n"
        + "                    <(source_columnX > 0.0 || source_columnY == 'string') &&\n"
        + "                     source_columnZ != 2016>]\n"
        + "                                  - where condition for filtering the source data,\n"
        + "                                    see http://commons.apache.org/proper/commons-jexl/reference/syntax.html#Operators\n"
        + "                                    for detais.\n"
        + "    [aggregate: <none | month | day>] - how to aggregate the data.\n"
        + "    [headerLineNo: <number>]          - the number of header line in source data.\n"
        + "    [skipLines: <numbber>]            - the number of line which will be skipped\n"
        + "                                        (include header).\n"
        + "    [fieldSeparator: <char>]          - char for separate field names and values.\n"
        + "    [fieldTerminator: <char>]         - char for terminate field names and values.\n"
        + "    [escapeChar: <char>]              - escape char.\n"
        + "    [decimalSeparator: <char>]        - char for decimal sign.\n"
        + "    [groupingSeparator: <char>]       - char for thousands separator.\n")
public class ParserCsv extends ParserByLine {
    private static final Logger LOGGER = LoggerFactory.getLogger(ParserCsv.class);

    /** Default character that separates fields in a line. */
    public static final char FIELD_SEPARATOR_DEFAULT = ',';

    /** Default character that encloses (quotes) a field; default for {@code fieldTerminator}. */
    public static final char FIELD_DELIMITER_DEFAULT = '"';

    /** Default escape character. */
    public static final char ESCAPE_CHAR_DEFAULT = '\\';

    /** Character that separates fields in a line. */
    @NotNull
    @JsonProperty
    private char fieldSeparator = FIELD_SEPARATOR_DEFAULT;

    /** Character that encloses (quotes) a field. */
    @NotNull
    @JsonProperty
    private char fieldTerminator = FIELD_DELIMITER_DEFAULT;

    /** Escape character: the character that follows it is taken literally. */
    @NotNull
    @JsonProperty
    private char escapeChar = ESCAPE_CHAR_DEFAULT;

    /** The number of the line that contains the header; a value of 0 or less means no header. */
    @JsonProperty
    private int headerLineNo = 0;

    /** The number of leading lines to skip (including the header). */
    @JsonProperty
    private int skipLines = 0;

    /** Return the character that separates fields in a line. */
    public char getFieldSeparator() {
        return fieldSeparator;
    }

    /** Set the character that separates fields in a line. */
    public void setFieldSeparator(char fieldSeparator) {
        this.fieldSeparator = fieldSeparator;
    }

    /** Return the character that encloses (quotes) a field. */
    public char getFieldTerminator() {
        return fieldTerminator;
    }

    /** Set the character that encloses (quotes) a field. */
    public void setFieldTerminator(char fieldTerminator) {
        this.fieldTerminator = fieldTerminator;
    }

    /** Return the escape character. */
    public char getEscapeChar() {
        return escapeChar;
    }

    /** Set the escape character. */
    public void setEscapeChar(char escapeChar) {
        this.escapeChar = escapeChar;
    }

    /** Return the number of the line that contains the header. */
    public int getHeaderLineNo() {
        return headerLineNo;
    }

    /** Set the number of the line that contains the header. */
    public void setHeaderLineNo(int headerLineNo) {
        this.headerLineNo = headerLineNo;
    }

    /** Return the number of leading lines to skip (including the header). */
    public int getSkipLines() {
        return skipLines;
    }

    /** Set the number of leading lines to skip (including the header). */
    public void setSkipLines(int skipLines) {
        this.skipLines = skipLines;
    }

    /** This parser needs no additional initialization. */
    @Override
    public void initialize() throws InitializationException {
    }

    /** Read and parse the header line, then skip the remaining leading lines.
     * <p>
     * When {@code headerLineNo} is positive, rows are read until the read-row counter of the
     * current statistics reaches {@code headerLineNo}; the last row read is parsed as the header.
     * Afterwards rows are read until the counter reaches {@code skipLines}. Every row consumed
     * here is counted as skipped.
     * NOTE(review): this relies on {@code getNextRow()} advancing the read-row counter, which is
     * not visible in this class - confirm in {@code ParserByLine}.
     *
     * @return the header field names, or {@code null} if {@code headerLineNo} is not positive
     *         or the data ended before the header line was reached.
     * @throws AdapterException declared by the overridden method; presumably raised when the
     *         source data cannot be read.
     * @throws ParseException if the header line is malformed.
     */
    @Override
    public List<String> parseHeader() throws AdapterException, ParseException {
        String line = null;
        List<String> header = null;

        if (headerLineNo > 0) {
            while (getCurrentStatistics().getRowReaded() < headerLineNo) {
                if ((line = getNextRow()) == null) {
                    // The data ended before the header line was reached.
                    return null;
                }
                getCurrentStatistics().incrRowSkipped();
            }
            header = parseRow(line);
        }

        while (getCurrentStatistics().getRowReaded() < skipLines) {
            if (getNextRow() == null) {
                break;
            }
            getCurrentStatistics().incrRowSkipped();
        }

        return header;
    }

    /** Log the parse error together with a caret marker line and construct the exception.
     * @param message the error message.
     * @param pos the 1-based position in the parsed line.
     * @param sourceLine the parsed line.
     * @return ParseException
     */
    private ParseException getParseException(String message, int pos, String sourceLine) {
        String s = String.format("%s at pos %d in line: ", message, pos);
        String fullMessage = s + sourceLine;
        LOGGER.error(fullMessage);
        // Second log line puts '^' under the offending character of the line logged above.
        LOGGER.error(StringUtils.repeat(' ', s.length() + pos - 1) + '^');
        return new ParseException(fullMessage);
    }

    /** Split a source line into field values.
     * <p>
     * Rules applied:
     * <ul>
     * <li>fields are separated by {@code fieldSeparator};</li>
     * <li>a field may be enclosed in {@code fieldTerminator} characters, in which case the
     *     separator inside it is part of the value; the opening terminator must be the first
     *     character of the field and the closing one must be followed by a separator or the
     *     end of the line;</li>
     * <li>{@code escapeChar} is removed and the character following it is taken literally.</li>
     * </ul>
     *
     * @param line the source line.
     * @return the list of field values; always contains at least one element.
     * @throws ParseException if the line ends with an escape character, a terminator appears in
     *         the middle of an unquoted field, a closing terminator is not followed by a separator,
     *         or a quoted field is not closed before the end of the line.
     */
    @Override
    public List<String> parseRow(String line) throws ParseException {
        // realPos is the 1-based position in the original line (used for error reporting);
        // pos is the index in the working buffer, which is truncated as fields are extracted.
        int realPos = 0;
        int pos = 0;
        // Position in the original line of the terminator that opened the current quoted field.
        int quoteStartPos = 0;
        // true while the scanner is inside a quoted field.
        boolean isDelimiter = false;
        StringBuilder sb = new StringBuilder(line);
        List<String> row = new ArrayList<String>();

        while (pos < sb.length()) {
            char c = sb.charAt(pos);
            if (c == escapeChar) {
                realPos++;
                pos++;
                if (pos == sb.length()) {
                    throw getParseException("Invalid escape char", realPos, line);
                }
                // Drop the escape char; the escaped char shifts to pos - 1 and is kept as is.
                sb.delete(pos - 1, pos);
                realPos++;
            } else if (c == fieldTerminator) {
                realPos++;
                if (isDelimiter) {
                    // Closing terminator of a quoted field.
                    realPos++;
                    pos++;
                    if (pos == sb.length()) {
                        // Closing terminator is the last char of the line.
                        sb.delete(pos - 1, pos);
                        isDelimiter = false;
                        break;
                    }
                    if (sb.charAt(pos) == fieldSeparator) {
                        row.add(sb.substring(0, pos - 1));
                        // Remove the value, the closing terminator and the separator.
                        sb.delete(0, pos + 1);
                        pos = 0;
                        isDelimiter = false;
                        continue;
                    }
                    throw getParseException("Invalid field delimiter", realPos, line);
                }

                // Opening terminator is allowed only as the first char of a field.
                if (pos != 0) {
                    throw getParseException("Unterminated field", realPos, line);
                }
                sb.delete(0, 1);
                isDelimiter = true;
                quoteStartPos = realPos;
                continue;
            } else if (c == fieldSeparator) {
                realPos++;
                if (isDelimiter) {
                    // Separator inside a quoted field is part of the value.
                    pos++;
                    continue;
                }
                row.add(sb.substring(0, pos));
                sb.delete(0, pos + 1);
                pos = 0;
            } else {
                realPos++;
                pos++;
            }
        }

        if (isDelimiter) {
            // The line ended inside a quoted field: report it instead of silently
            // returning a partial value.
            throw getParseException("Unterminated quoted field", quoteStartPos, line);
        }
        // The remainder of the buffer is the last field (empty after a trailing separator).
        row.add(sb.toString());

        return row;
    }

    /** Return the string helper of the superclass extended with the properties of this parser. */
    @Override
    public ToStringHelper toStringHelper(Object self) {
        return super.toStringHelper(self).add("fieldSeparator", fieldSeparator)
                .add("fieldTerminator", fieldTerminator).add("escapeChar", escapeChar)
                .add("headerLineNo", headerLineNo).add("skipLines", skipLines);
    }
}