Java tutorial: writing a CDAP ETL transform plugin (CSVParser2) that decodes, decompresses, and parses CSV records.
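Before reading the full plugin source below, it may help to see the core primitive the plugin wraps: Apache Commons CSV turning raw text into records. The following standalone sketch is illustrative only; the class name and sample data are hypothetical and not part of the plugin.

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.IOException;

// Minimal sketch of the parsing step the plugin performs: turn a line of
// raw text into CSVRecord objects using one of the predefined formats.
public class CsvParseSketch {
  public static void main(String[] args) throws IOException {
    String body = "1,alice,engineering"; // hypothetical sample payload
    CSVParser parser = CSVParser.parse(body, CSVFormat.DEFAULT);
    for (CSVRecord record : parser.getRecords()) {
      // record.get(i) returns the i-th column as a String.
      System.out.println(record.get(0) + " | " + record.get(1) + " | " + record.get(2));
    }
  }
}

The plugin adds two steps in front of this parse (decoding and decompression), validates its configuration, and maps each parsed record onto an output schema.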
/*
 * Copyright 2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.transforms;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.Schema.Field;
import co.cask.cdap.api.plugin.PluginConfig;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.Transform;
import co.cask.cdap.etl.api.TransformContext;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.binary.Base32;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.xerial.snappy.Snappy;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;

/**
 * A Transform that decodes, decompresses, and parses a text field into CSV fields.
 */
@Plugin(type = "transform")
@Name("CSVParser2")
@Description("Decodes, Decompresses and Parses CSV Records.")
public final class CSVParser2 extends Transform<StructuredRecord, StructuredRecord> {
  private final Config config;

  // Output schema associated with the transform output.
  private Schema outSchema;

  // List of fields specified in the schema.
  private List<Field> fields;

  // Format of CSV.
  private CSVFormat csvFormat = CSVFormat.DEFAULT;

  // This constructor is used only for tests; otherwise the config is injected by the ingestion framework.
  public CSVParser2(Config config) {
    this.config = config;
  }

  @Override
  public void initialize(TransformContext context) throws Exception {
    super.initialize(context);
    String csvFormatString = config.format.toLowerCase();
    switch (csvFormatString) {
      case "default":
        csvFormat = CSVFormat.DEFAULT;
        break;
      case "excel":
        csvFormat = CSVFormat.EXCEL;
        break;
      case "mysql":
        csvFormat = CSVFormat.MYSQL;
        break;
      case "rfc4180":
        csvFormat = CSVFormat.RFC4180;
        break;
      case "tdf":
        csvFormat = CSVFormat.TDF;
        break;
      default:
        throw new IllegalArgumentException("Format '" + config.format + "' specified is not one of the allowed " +
                                             "formats. Allowed formats are DEFAULT, EXCEL, MYSQL, RFC4180 and TDF.");
    }

    if (config.field == null || config.field.isEmpty()) {
      throw new IllegalArgumentException("Field for applying transformation is not specified.");
    }

    try {
      outSchema = Schema.parseJson(config.schema);
      fields = outSchema.getFields();
    } catch (IOException e) {
      throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
    }
  }

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException {
    super.configurePipeline(pipelineConfigurer);

    // Check if the format specified is valid.
    if (config.format == null || config.format.isEmpty()) {
      throw new IllegalArgumentException("Format is not specified. Allowed values are DEFAULT, EXCEL, MYSQL, " +
                                           "RFC4180 and TDF.");
    }

    if (!config.format.equalsIgnoreCase("DEFAULT") && !config.format.equalsIgnoreCase("EXCEL") &&
      !config.format.equalsIgnoreCase("MYSQL") && !config.format.equalsIgnoreCase("RFC4180") &&
      !config.format.equalsIgnoreCase("TDF")) {
      throw new IllegalArgumentException("Format specified is not one of the allowed values. Allowed values are " +
                                           "DEFAULT, EXCEL, MYSQL, RFC4180 and TDF.");
    }

    // Check if the decoder specified is one of the allowed types.
    if (!config.decoder.equalsIgnoreCase("BASE64") && !config.decoder.equalsIgnoreCase("BASE32") &&
      !config.decoder.equalsIgnoreCase("NONE") && !config.decoder.equalsIgnoreCase("HEX")) {
      throw new IllegalArgumentException("Unsupported decoder '" + config.decoder + "' specified. Supported types " +
                                           "are NONE, BASE64, BASE32 and HEX.");
    }

    // Check if the decompressor specified is one of the allowed types.
    if (!config.decompress.equalsIgnoreCase("SNAPPY") && !config.decompress.equalsIgnoreCase("NONE") &&
      !config.decompress.equalsIgnoreCase("GZIP") && !config.decompress.equalsIgnoreCase("ZIP")) {
      throw new IllegalArgumentException("Unsupported decompressor algorithm '" + config.decompress +
                                           "' specified. Currently supports NONE, SNAPPY, GZIP and ZIP.");
    }

    // Check whether the schema specified is valid.
    try {
      Schema.parseJson(config.schema);
    } catch (IOException e) {
      throw new IllegalArgumentException("Format of schema specified is invalid. Please check the format.");
    }
  }

  @Override
  public void transform(StructuredRecord in, Emitter<StructuredRecord> emitter) throws Exception {
    // The field has to be a string to be parsed correctly; other types throw an exception.
    String body = in.get(config.field);

    // If the decoder is not NONE, apply it.
    byte[] decodedPayLoad;
    if (!config.decoder.equalsIgnoreCase("NONE")) {
      decodedPayLoad = decodePayLoad(body);
    } else {
      decodedPayLoad = body.getBytes();
    }

    // If decompress is not NONE, apply the decompressor.
    byte[] uncompressedPayLoad = decodedPayLoad.clone();
    if (!config.decompress.equalsIgnoreCase("NONE")) {
      if (config.decompress.equalsIgnoreCase("SNAPPY")) {
        uncompressedPayLoad = Snappy.uncompress(decodedPayLoad);
      } else if (config.decompress.equalsIgnoreCase("GZIP")) {
        uncompressedPayLoad = ungzip(decodedPayLoad);
      } else if (config.decompress.equalsIgnoreCase("ZIP")) {
        uncompressedPayLoad = unzip(decodedPayLoad);
      }
    }

    // Parse the text as CSV and emit each record as a structured record.
    try {
      CSVParser parser = CSVParser.parse(new String(uncompressedPayLoad), csvFormat);
      List<CSVRecord> records = parser.getRecords();
      for (CSVRecord record : records) {
        if (fields.size() == record.size()) {
          StructuredRecord sRecord = createStructuredRecord(record);
          emitter.emit(sRecord);
        } else {
          // The field count doesn't match the schema; write the record to an error dataset.
        }
      }
    } catch (IOException e) {
      // The payload could not be parsed as CSV; the record is dropped.
    }
  }

  /**
   * Uncompresses the payload using the GZIP algorithm.
   *
   * @param body compressed bytes
   * @return decompressed bytes
   * @throws IOException if reading the compressed stream fails
   */
  private byte[] ungzip(byte[] body) throws IOException {
    ByteArrayInputStream bytein = new ByteArrayInputStream(body);
    GZIPInputStream gzin = new GZIPInputStream(bytein);
    ByteArrayOutputStream byteout = new ByteArrayOutputStream();
    int res = 0;
    byte[] buf = new byte[1024];
    while (res >= 0) {
      res = gzin.read(buf, 0, buf.length);
      if (res > 0) {
        byteout.write(buf, 0, res);
      }
    }
    return byteout.toByteArray();
  }

  /**
   * Uncompresses the payload using the ZIP algorithm.
   *
   * @param body compressed bytes
   * @return decompressed bytes
   * @throws IOException if reading the compressed stream fails
   */
  private byte[] unzip(byte[] body) throws IOException {
    ByteArrayInputStream bytein = new ByteArrayInputStream(body);
    ZipInputStream zipin = new ZipInputStream(bytein);
    ByteArrayOutputStream byteout = new ByteArrayOutputStream();
    // Position the stream at the first entry before reading; without this, read() returns -1 immediately.
    zipin.getNextEntry();
    int res = 0;
    byte[] buf = new byte[1024];
    while (res >= 0) {
      res = zipin.read(buf, 0, buf.length);
      if (res > 0) {
        byteout.write(buf, 0, res);
      }
    }
    return byteout.toByteArray();
  }

  /**
   * Decodes the payload using the configured decoder (BASE64, BASE32 or HEX).
   *
   * @param body encoded payload
   * @return decoded bytes
   */
  private byte[] decodePayLoad(String body) throws DecoderException {
    if (config.decoder.equalsIgnoreCase("base64")) {
      Base64 codec = new Base64();
      return codec.decode(body);
    } else if (config.decoder.equalsIgnoreCase("base32")) {
      Base32 codec = new Base32();
      return codec.decode(body);
    } else if (config.decoder.equalsIgnoreCase("hex")) {
      Hex codec = new Hex();
      return codec.decode(body.getBytes());
    }
    return new byte[0];
  }

  private StructuredRecord createStructuredRecord(CSVRecord record) {
    StructuredRecord.Builder builder = StructuredRecord.builder(outSchema);
    int i = 0;
    for (Field field : fields) {
      // Convert the string column to the type declared in the schema (TypeConvertors is a helper in this package).
      builder.set(field.getName(), TypeConvertors.get(record.get(i), field.getSchema().getType()));
      ++i;
    }
    return builder.build();
  }

  /**
   * Configuration for the plugin.
   */
  public static class Config extends PluginConfig {
    @Name("decode")
    @Description("Specifies the decoder to be applied on the payload.")
    private final String decoder;

    @Name("decompress")
    @Description("Specifies the decompression algorithm to be applied to the decoded payload.")
    private final String decompress;

    @Name("format")
    @Description("Specifies one of the predefined formats. DEFAULT, EXCEL, MYSQL, RFC4180 and TDF are supported.")
    private final String format;

    @Name("field")
    @Description("Specifies the field that should be parsed as CSV.")
    private final String field;

    @Name("schema")
    @Description("Specifies the schema that has to be output.")
    private final String schema;

    public Config(String decoder, String decompress, String format, String field, String schema) {
      this.decoder = decoder;
      this.decompress = decompress;
      this.format = format;
      this.field = field;
      this.schema = schema;
    }
  }
}
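To see the whole decode -> decompress -> parse path in one place, here is a minimal round-trip sketch that mirrors what transform() does when the plugin is configured with decode=BASE64, decompress=GZIP and format=DEFAULT. It deliberately avoids the CDAP classes (in a real pipeline the framework injects Config and calls transform()); the class name and sample data are hypothetical.

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

// Round-trip sketch: gzip and Base64-encode a CSV line, then reverse the
// steps in the same order the transform does (decode, decompress, parse).
public class CsvPipelineSketch {
  public static void main(String[] args) throws IOException {
    String csv = "1,alice,engineering"; // hypothetical sample record

    // Producer side: compress and encode, as an upstream source might.
    ByteArrayOutputStream compressed = new ByteArrayOutputStream();
    GZIPOutputStream gzout = new GZIPOutputStream(compressed);
    gzout.write(csv.getBytes());
    gzout.close();
    String payload = Base64.encodeBase64String(compressed.toByteArray());

    // Consumer side: the decode -> decompress -> parse path of the transform.
    byte[] decoded = new Base64().decode(payload);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    GZIPInputStream gzin = new GZIPInputStream(new ByteArrayInputStream(decoded));
    byte[] buf = new byte[1024];
    int n;
    while ((n = gzin.read(buf)) > 0) {
      out.write(buf, 0, n);
    }

    CSVParser parser = CSVParser.parse(new String(out.toByteArray()), CSVFormat.DEFAULT);
    for (CSVRecord record : parser.getRecords()) {
      System.out.println(record);
    }
  }
}

In the plugin itself, the last loop would instead map each column onto the configured output schema and emit a StructuredRecord, as createStructuredRecord() shows above.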