com.cloudera.recordbreaker.analyzer.CSVDataDescriptor.java Source code


Introduction

Here is the source code for com.cloudera.recordbreaker.analyzer.CSVDataDescriptor.java, a data descriptor from Cloudera's RecordBreaker project that detects comma-separated text files and converts their contents to Avro.

Source

/*
 * Copyright (c) 2013, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import org.apache.avro.Schema;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericData;

import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.List;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;

import au.com.bytecode.opencsv.CSVParser;

/*****************************************************
 * <code>CSVDataDescriptor</code> describes comma-separated
 * textual data.  Based on previous analysis of the file,
 * we know whether the first line should be treated as
 * schema info or not.
 *
 * @author Michael Cafarella
 *****************************************************/
public class CSVDataDescriptor extends GenericDataDescriptor {
    final public static String CSV_TYPE = "csv";
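
    // Heuristic thresholds: a file is treated as CSV when a sample of its
    // first MAX_LINES lines contains at least MIN_LINE_COUNT lines, averages
    // at least MIN_MEAN_ELTS parsed fields per line, and the coefficient of
    // variation (stddev / mean) of the per-line field counts stays below
    // MAX_ALLOWABLE_LINE_STDDEV.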
    private static final int MAX_LINES = 25;
    private static final int MIN_MEAN_ELTS = 3;
    private static final int MIN_LINE_COUNT = 10;
    private static final double MAX_ALLOWABLE_LINE_STDDEV = 0.1;

    /**
     * Test whether a given file is amenable to CSV processing.  A ".csv"
     * extension is accepted immediately; otherwise the file is sampled
     * and the heuristic thresholds above are applied.
     */
    public static boolean isCSV(FileSystem fs, Path p) {
        String fname = p.getName();
        if (fname.endsWith(".csv")) {
            return true;
        }
        CSVParser parser = new CSVParser();
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
            try {
                int lineCount = 0;
                List<Integer> observedEltCounts = new ArrayList<Integer>();
                int totalEltCount = 0;
                int minEltCount = Integer.MAX_VALUE;
                int maxEltCount = -1;

                String line = null;
                while (lineCount < MAX_LINES && ((line = in.readLine()) != null)) {
                    String[] parts = parser.parseLine(line);
                    int numElts = parts.length;
                    minEltCount = Math.min(minEltCount, numElts);
                    maxEltCount = Math.max(maxEltCount, numElts);
                    totalEltCount += numElts;
                    observedEltCounts.add(numElts);

                    lineCount++;
                }
                double meanEltCount = totalEltCount / (1.0 * observedEltCounts.size());
                double totalVariance = 0;
                for (Integer v : observedEltCounts) {
                    totalVariance += Math.pow(v - meanEltCount, 2);
                }
                double variance = totalVariance / observedEltCounts.size();
                double stddev = Math.sqrt(variance);
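                // Example: per-line field counts {3, 3, 4} give mean 3.33,
                // variance 0.22, and stddev 0.47; stddev/mean is 0.14, which
                // exceeds MAX_ALLOWABLE_LINE_STDDEV (0.1), so such a sample
                // would be rejected.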
                if (lineCount >= MIN_LINE_COUNT && meanEltCount >= MIN_MEAN_ELTS
                        && ((stddev / meanEltCount) < MAX_ALLOWABLE_LINE_STDDEV)) {
                    return true;
                }
            } finally {
                in.close();
            }
        } catch (IOException ie) {
            // An unreadable file is simply treated as non-CSV.
        }
        return false;
    }
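
    // A minimal usage sketch (hypothetical paths, assuming a configured
    // Hadoop FileSystem):
    //   FileSystem fs = FileSystem.get(new Configuration());
    //   boolean looksLikeCsv = CSVDataDescriptor.isCSV(fs, new Path("/data/sample.csv"));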

    public CSVDataDescriptor(Path p, FileSystem fs) throws IOException {
        super(p, fs, CSV_TYPE);
        schemas.add(new CSVSchemaDescriptor(this));
    }

    public CSVDataDescriptor(Path p, FileSystem fs, List<String> schemaReprs, List<String> schemaDescs,
            List<byte[]> schemaBlobs) throws IOException {
        super(p, fs, CSV_TYPE, schemaReprs, schemaDescs, schemaBlobs);
    }

    public SchemaDescriptor loadSchemaDescriptor(String schemaRepr, String schemaId, byte[] blob)
            throws IOException {
        return new CSVSchemaDescriptor(this, schemaRepr, blob);
    }

    ///////////////////////////////////
    // GenericDataDescriptor
    //////////////////////////////////
    public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst, Configuration conf)
            throws IOException {
        // Convert the CSV contents into an Avro data file, using the most
        // frequently observed union-free schema for the rows.
        SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
        List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
        Schema schema = unionFreeSchemas.get(0);

        // The schema descriptor's payload holds the hash of the header row,
        // which is used below to skip that row during conversion.
        String headerRowHash = new String(sd.getPayload());
        CSVRowParser rowParser = new CSVRowParser(schema, headerRowHash);

        // Open stream to write out Avro contents
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
        dataFileWriter.create(schema, dstFs.create(dst, true));
        int numRecords = 0;
        final int MAX_RECORDS = 1000;   // cap on the number of rows converted
        try {
            BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
            try {
                String rowStr = null;
                while (((rowStr = in.readLine()) != null) && (numRecords < MAX_RECORDS)) {
                    if (("" + rowStr.hashCode()).compareTo(headerRowHash) == 0) {
                        continue;
                    }
                    GenericData.Record record = rowParser.parseRow(rowStr);
                    if (record == null) {
                        continue;
                    }
                    // Only append rows whose parsed schema exactly matches the
                    // chosen target schema (an exact string comparison, rather
                    // than the original collision-prone hashCode comparison).
                    if (!record.getSchema().toString().equals(schema.toString())) {
                        continue;
                    }
                    dataFileWriter.append(record);
                    numRecords++;
                }
            } finally {
                in.close();
            }
        } finally {
            dataFileWriter.close();
        }
    }
}
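
To show how the pieces above fit together, here is a minimal driver sketch. It is not part of the original file: it assumes the rest of the RecordBreaker classes (GenericDataDescriptor, CSVSchemaDescriptor, SchemaUtils, CSVRowParser) are on the classpath, and the input and output paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.cloudera.recordbreaker.analyzer.CSVDataDescriptor;

// Hypothetical driver; illustrates the intended call sequence only.
public class CsvToAvroDriver {
    public static void main(String[] argv) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path src = new Path("/data/input.csv");    // placeholder input path
        Path dst = new Path("/data/output.avro");  // placeholder output path

        // Run the conversion only when the sampling heuristic accepts the file.
        if (CSVDataDescriptor.isCSV(fs, src)) {
            CSVDataDescriptor desc = new CSVDataDescriptor(src, fs);
            desc.prepareAvroFile(fs, fs, dst, conf);
        }
    }
}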