co.cask.hydrator.plugin.CSVFormatter.java Source code

Introduction

Here is the source code for co.cask.hydrator.plugin.CSVFormatter.java
Source

/*
 * Copyright © 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.plugin;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.Schema.Field;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.TransformContext;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Transform that formats a {@link StructuredRecord} to CSV.
 *
 * <p>Every field value of the incoming record is written, in input-schema order, as one CSV
 * record. The resulting text is emitted as the single string field of the configured output
 * schema. The supported CSV record types are:
 * <ul>
 *   <li>DELIMITED</li>
 *   <li>EXCEL</li>
 *   <li>RFC4180</li>
 *   <li>MYSQL</li>
 *   <li>TDF</li>
 * </ul>
 */
@Plugin(type = "transform")
@Name("CSVFormatter")
@Description("Formats a Structured Record to CSV")
public final class CSVFormatter extends Transform<StructuredRecord, StructuredRecord> {
    private static final Logger LOG = LoggerFactory.getLogger(CSVFormatter.class);

    // Mapping from delimiter name to the character to be used as delimiter.
    private static final Map<String, String> delimMap = Maps.newHashMap();

    // Static collection of delimiter mappings from name to delim.
    // NOTE: the "CARROT" spelling is part of the accepted configuration values; it is kept as-is
    // so that existing pipeline configurations continue to validate.
    static {
        delimMap.put("COMMA", ",");
        delimMap.put("CTRL-A", "\001");
        delimMap.put("TAB", "\t");
        delimMap.put("VBAR", "|");
        delimMap.put("STAR", "*");
        delimMap.put("CARROT", "^");
        delimMap.put("DOLLAR", "$");
        delimMap.put("HASH", "#");
        delimMap.put("TILDE", "~");
        delimMap.put("CTRL-B", "\002");
        delimMap.put("CTRL-C", "\003");
        delimMap.put("CTRL-D", "\004");
        delimMap.put("CTRL-E", "\005");
        delimMap.put("CTRL-F", "\006");
    }

    // Transform configuration.
    private final Config config;

    // Output Schema associated with transform output.
    private Schema outSchema;

    // List of fields specified in the output schema.
    private List<Field> fields;

    // Format of CSV File.
    private CSVFormat csvFileFormat;

    // This is used only for tests, otherwise this is being injected by the ingestion framework.
    public CSVFormatter(Config config) {
        this.config = config;
    }

    /**
     * Validates the plugin configuration and registers the output schema with the stage.
     *
     * <p>Checks, in order: the delimiter name is known, a format is given and is one of the
     * supported names (case-insensitive), and the output schema parses and consists of exactly
     * one field of type string.
     *
     * @param pipelineConfigurer configurer on which the output schema is set
     * @throws IllegalArgumentException if any of the checks above fails
     */
    @Override
    public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException {
        super.configurePipeline(pipelineConfigurer);

        if (!delimMap.containsKey(config.delimiter)) {
            throw new IllegalArgumentException("Unknown delimiter '" + config.delimiter + "' specified. ");
        }

        // Check if the format specified is valid.
        if (config.format == null || config.format.isEmpty()) {
            throw new IllegalArgumentException(
                    "Format is not specified. Allowed values are DELIMITED, EXCEL, MYSQL," + " RFC4180 & TDF");
        }

        if (!isSupportedFormat(config.format)) {
            throw new IllegalArgumentException(
                    "Format specified is not one of the allowed values. Allowed values are "
                            + "DELIMITED, EXCEL, MYSQL, RFC4180 & TDF");
        }

        // Check that the schema parses and has exactly one string field. Requiring exactly one
        // field (instead of "not more than one") also rejects an empty field list, which would
        // otherwise fail on get(0) with an unhelpful exception. The null check is defensive in
        // case the parsed schema carries no field list at all.
        Schema schema = parseOutputSchema();
        List<Schema.Field> schemaFields = schema.getFields();
        if (schemaFields == null || schemaFields.size() != 1) {
            throw new IllegalArgumentException("Output schema should have only one field of type String");
        }
        if (schemaFields.get(0).getSchema().getType() != Schema.Type.STRING) {
            throw new IllegalArgumentException("Output field type should be String");
        }
        pipelineConfigurer.getStageConfigurer().setOutputSchema(schema);
    }

    /**
     * Prepares the transform for execution: parses the output schema, resolves the delimiter
     * and builds the {@link CSVFormat} matching the configured format name.
     *
     * @param context runtime context handed over by the framework
     * @throws IllegalArgumentException if the schema is not parseable, or the delimiter or
     *                                  format name is unknown
     * @throws Exception if the superclass initialization fails
     */
    @Override
    public void initialize(TransformContext context) throws Exception {
        super.initialize(context);

        outSchema = parseOutputSchema();
        fields = outSchema.getFields();

        // Based on the delimiter name specified pick the delimiter to be used for the record.
        // This is only applicable when the format type is chosen as DELIMITED.
        String delimValue = delimMap.get(config.delimiter);
        if (delimValue == null) {
            throw new IllegalArgumentException("Unknown delimiter '" + config.delimiter + "' specified. ");
        }
        char delim = delimValue.charAt(0);

        // Create CSVFileFormat based on the format specified. Locale.ROOT keeps the lower-casing
        // independent of the JVM default locale (e.g. Turkish dotless i in "DELIMITED").
        switch (config.format.toLowerCase(Locale.ROOT)) {
        case "delimited":
            csvFileFormat = CSVFormat.newFormat(delim).withQuote('"').withRecordSeparator("\r\n")
                    .withIgnoreEmptyLines();
            break;

        case "excel":
            csvFileFormat = CSVFormat.Predefined.Excel.getFormat();
            break;

        case "mysql":
            csvFileFormat = CSVFormat.Predefined.MySQL.getFormat();
            break;

        case "tdf":
            csvFileFormat = CSVFormat.Predefined.TDF.getFormat();
            break;

        case "rfc4180":
            csvFileFormat = CSVFormat.Predefined.RFC4180.getFormat();
            break;

        default:
            throw new IllegalArgumentException("Unknown format specified for CSV. Please check the format.");
        }
    }

    /**
     * Writes all field values of {@code record} as one CSV record and emits it as the single
     * field of the output schema.
     *
     * <p>The emitted text is exactly what the {@link CSVPrinter} produced for the record, so it
     * ends with the record separator of the selected format.
     *
     * @param record  input record whose values are serialized in schema order
     * @param emitter receiver of the formatted output record
     * @throws Exception if writing the CSV record fails
     */
    @Override
    public void transform(StructuredRecord record, Emitter<StructuredRecord> emitter) throws Exception {
        List<Object> values = Lists.newArrayList();
        for (Schema.Field field : record.getSchema().getFields()) {
            values.add(record.get(field.getName()));
        }

        // try-with-resources guarantees the printer is closed even when printing fails.
        try (CSVPrinter printer = new CSVPrinter(new StringWriter(), csvFileFormat)) {
            printer.printRecord(values);
            emitter.emit(StructuredRecord.builder(outSchema)
                    .set(fields.get(0).getName(), printer.getOut().toString()).build());
        }
    }

    /**
     * Tells whether {@code format} names one of the supported CSV formats, ignoring case.
     */
    private static boolean isSupportedFormat(String format) {
        return format.equalsIgnoreCase("DELIMITED") || format.equalsIgnoreCase("EXCEL")
                || format.equalsIgnoreCase("MYSQL") || format.equalsIgnoreCase("RFC4180")
                || format.equalsIgnoreCase("TDF");
    }

    /**
     * Parses the configured output schema, keeping the parser failure as the exception cause.
     */
    private Schema parseOutputSchema() {
        try {
            return Schema.parseJson(config.schema);
        } catch (IOException e) {
            throw new IllegalArgumentException(
                    "Format of schema specified is invalid. Please check the format.", e);
        }
    }

    /**
     * Configuration for the plugin.
     */
    public static class Config extends PluginConfig {

        @Name("format")
        @Description("Specify one of the predefined formats. DELIMITED, EXCEL, MYSQL, RFC4180 & TDF are supported formats.")
        private final String format;

        @Name("delimiter")
        @Description("Specify delimiter to be used for separating fields.")
        private final String delimiter;

        @Name("schema")
        @Description("Specifies the schema that has to be output.")
        private final String schema;

        /**
         * @param format    name of the CSV format (DELIMITED, EXCEL, MYSQL, RFC4180 or TDF)
         * @param delimiter name of the delimiter, e.g. COMMA or TAB
         * @param schema    JSON representation of the output schema
         */
        public Config(String format, String delimiter, String schema) {
            this.format = format;
            this.delimiter = delimiter;
            this.schema = schema;
        }
    }
}