co.cask.hydrator.plugin.CSVParser.java Source code

Introduction

Here is the source code for co.cask.hydrator.plugin.CSVParser.java, a CDAP Hydrator transform plugin that parses a CSV-formatted string field into a structured record.

Source

/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.plugin;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.Schema.Field;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.TransformContext;
import com.google.common.base.Throwables;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;

/**
 * Transform that parses a string field as a CSV record into a {@link StructuredRecord}.
 *
 * <p>
 * CSVParser supports transforming the input into a {@link StructuredRecord}.
 * The following CSV record formats are supported by this transform:
 * <ul>
 *   <li>DEFAULT</li>
 *   <li>EXCEL</li>
 *   <li>RFC4180</li>
 *   <li>MYSQL</li>
 *   <li>TDF</li>
 *   <li>PDL</li>
 * </ul>
 * </p>
 */
@Plugin(type = "transform")
@Name("CSVParser")
@Description("Parses a field as CSV Record into a Structured Record.")
public final class CSVParser extends Transform<StructuredRecord, StructuredRecord> {
    private static final Logger LOG = LoggerFactory.getLogger(CSVParser.class);
    private final Config config;

    // Output Schema associated with transform output. 
    private Schema outSchema;

    // List of fields specified in the schema. 
    private List<Field> fields;

    // Format of CSV.
    private CSVFormat csvFormat = CSVFormat.DEFAULT;

    // Custom pipe-delimited (PDL) CSV format.
    public static final CSVFormat PDL;

    // Initialize the pipe-delimited format: backslash escape, no quote character,
    // '\n' record separator, and surrounding spaces ignored.
    static {
        PDL = CSVFormat.DEFAULT.withDelimiter('|').withEscape('\\').withIgnoreEmptyLines(false)
                .withAllowMissingColumnNames().withQuote((Character) null).withRecordSeparator('\n')
                .withIgnoreSurroundingSpaces();
    }
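
    // For example, with the PDL format the line
    //   Root | Joltie | 32
    // parses into three values: "Root", "Joltie" and "32" (no quote character, backslash
    // escapes, and surrounding spaces around the delimiter ignored).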

    // Used directly only in tests; in a pipeline run, the config is injected by the framework.
    public CSVParser(Config config) {
        this.config = config;
    }

    @Override
    public void configurePipeline(PipelineConfigurer configurer) throws IllegalArgumentException {
        super.configurePipeline(configurer);

        // Check that a format has been specified.
        if (this.config.format == null || this.config.format.isEmpty()) {
            throw new IllegalArgumentException(
                    "Format is not specified. Allowed values are DEFAULT, EXCEL, MYSQL, RFC4180, TDF and PDL.");
        }

        // Check that the format is one of the allowed types.
        if (!this.config.format.equalsIgnoreCase("DEFAULT") && !this.config.format.equalsIgnoreCase("EXCEL")
                && !this.config.format.equalsIgnoreCase("MYSQL") && !this.config.format.equalsIgnoreCase("RFC4180")
                && !this.config.format.equalsIgnoreCase("TDF") && !this.config.format.equalsIgnoreCase("PDL")) {
            throw new IllegalArgumentException(
                    "Format specified is not one of the allowed values. Allowed values are "
                            + "DEFAULT, EXCEL, MYSQL, RFC4180, TDF and PDL.");
        }

        Schema inputSchema = configurer.getStageConfigurer().getInputSchema();
        if (inputSchema != null) {
            Schema.Field inputSchemaField = inputSchema.getField(config.field);
            if (inputSchemaField == null) {
                throw new IllegalArgumentException("Field " + config.field + " is not present in the input schema");
            } else {
                if (!inputSchemaField.getSchema().getType().equals(Schema.Type.STRING)) {
                    throw new IllegalArgumentException("Type for field  " + config.field + " must be String");
                }
            }
        }

        // Check that the schema specified is valid.
        try {
            Schema outputSchema = Schema.parseJson(this.config.schema);
            configurer.getStageConfigurer().setOutputSchema(outputSchema);

            // When an input field is passed through to the output, its name and type must stay
            // the same. If the type differs, we fail.
            if (inputSchema != null) {
                for (Field field : inputSchema.getFields()) {
                    if (outputSchema.getField(field.getName()) != null) {
                        Schema out = outputSchema.getField(field.getName()).getSchema();
                        Schema in = field.getSchema();
                        if (!in.equals(out)) {
                            throw new IllegalArgumentException("Input field '" + field.getName()
                                    + "' does not have same output schema as input.");
                        }
                    }
                }
            }
        } catch (IOException e) {
            throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
        }
    }
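
    // Example value for the 'schema' property (field names here are illustrative):
    //   {"type":"record","name":"rec","fields":[{"name":"first","type":"string"},
    //    {"name":"last","type":"string"}]}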

    @Override
    public void initialize(TransformContext context) throws Exception {
        super.initialize(context);

        String csvFormatString = config.format.toLowerCase();
        switch (csvFormatString) {
        case "default":
            csvFormat = CSVFormat.DEFAULT;
            break;

        case "excel":
            csvFormat = CSVFormat.EXCEL;
            break;

        case "mysql":
            csvFormat = CSVFormat.MYSQL;
            break;

        case "rfc4180":
            csvFormat = CSVFormat.RFC4180;
            break;

        case "tdf":
            csvFormat = CSVFormat.TDF;
            break;

        case "pdl":
            csvFormat = PDL;
            break;

        default:
            throw new IllegalArgumentException(
                    "Format '" + config.format + "' is not one of the allowed formats. Allowed formats "
                            + "are DEFAULT, EXCEL, MYSQL, RFC4180, TDF and PDL.");
        }

        try {
            outSchema = Schema.parseJson(config.schema);
            fields = outSchema.getFields();
        } catch (IOException e) {
            throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
        }
    }

    @Override
    public void transform(StructuredRecord in, Emitter<StructuredRecord> emitter) throws Exception {
        // The field must be of type string to be parsed correctly; other types are rejected in configurePipeline.
        String body = in.get(config.field);

        // Parse the text as CSV and emit it as structured record.
        try {
            org.apache.commons.csv.CSVParser parser = org.apache.commons.csv.CSVParser.parse(body, csvFormat);
            List<CSVRecord> records = parser.getRecords();
            for (CSVRecord record : records) {
                emitter.emit(createStructuredRecord(record, in));
            }
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
    }

    private StructuredRecord createStructuredRecord(CSVRecord record, StructuredRecord in) {
        StructuredRecord.Builder builder = StructuredRecord.builder(outSchema);
        int i = 0;
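        // For example, with input fields {offset, body} and an output schema of
        // {offset, first, last}: "offset" is copied through unchanged, while "first" and
        // "last" are taken from positions 0 and 1 of the parsed CSV record.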
        for (Field field : fields) {
            String name = field.getName();
            // If the field specified in the output schema is present in the input record, its
            // value is copied through directly; otherwise the value is taken from the parsed CSV record.
            if (in.get(name) != null) {
                builder.set(name, in.get(name));
            } else {
                builder.set(name, TypeConvertor.get(record.get(i), field.getSchema().getType()));
                ++i;
            }
        }
        return builder.build();
    }

    /**
     * Configuration for the plugin.
     */
    public static class Config extends PluginConfig {

        @Name("format")
        @Description("Specify one of the predefined formats. DEFAULT, EXCEL, MYSQL, RFC4180, PDL & TDF "
                + "are supported formats.")
        private final String format;

        @Name("field")
        @Description("Specify the field that should be used for parsing into CSV.")
        private final String field;

        @Name("schema")
        @Description("Specifies the schema that has to be output.")
        private final String schema;

        public Config(String format, String field, String schema) {
            this.format = format;
            this.field = field;
            this.schema = schema;
        }
    }
}
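
Usage example

The sketch below shows one way to exercise the transform in a test, using the DEFAULT format on a "body" field containing "first,last" values. MockTransformContext and MockEmitter are hypothetical stand-ins for the test doubles that CDAP's ETL test modules provide; the schema and field names are illustrative, not part of the plugin above.

import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;

public class CSVParserExample {
    public static void main(String[] args) throws Exception {
        // Output schema: "offset" is copied through from the input; "first" and "last"
        // are parsed out of the CSV body.
        Schema output = Schema.recordOf("output",
                Schema.Field.of("offset", Schema.of(Schema.Type.LONG)),
                Schema.Field.of("first", Schema.of(Schema.Type.STRING)),
                Schema.Field.of("last", Schema.of(Schema.Type.STRING)));

        CSVParser parser = new CSVParser(new CSVParser.Config("DEFAULT", "body", output.toString()));
        parser.initialize(new MockTransformContext());       // hypothetical test double

        Schema input = Schema.recordOf("input",
                Schema.Field.of("offset", Schema.of(Schema.Type.LONG)),
                Schema.Field.of("body", Schema.of(Schema.Type.STRING)));

        MockEmitter<StructuredRecord> emitter = new MockEmitter<>();  // hypothetical test double
        parser.transform(StructuredRecord.builder(input)
                .set("offset", 1L)
                .set("body", "Root,Joltie")
                .build(), emitter);

        // Expect one emitted record: {offset=1, first="Root", last="Joltie"}
    }
}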