Java tutorial: detecting CSV files and converting them to Avro with CSVDataDescriptor
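The class below is CSVDataDescriptor from Cloudera's RecordBreaker analyzer package (com.cloudera.recordbreaker.analyzer). It does two jobs: a static heuristic, isCSV, decides whether a file looks like comma-separated data, and prepareAvroFile converts the file's rows into an Avro data file using a previously inferred schema.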
/*
 * Copyright (c) 2013, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.analyzer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import au.com.bytecode.opencsv.CSVParser;

/*****************************************************
 * <code>CSVDataDescriptor</code> describes comma-separated
 * textual data. Based on previous analysis of the file,
 * we know whether the first line should be treated as
 * schema info or not.
 *
 * @author Michael Cafarella
 *****************************************************/
public class CSVDataDescriptor extends GenericDataDescriptor {
  final public static String CSV_TYPE = "csv";

  private static int MAX_LINES = 25;
  private static int MIN_MEAN_ELTS = 3;
  private static int MIN_LINE_COUNT = 10;
  private static double MAX_ALLOWABLE_LINE_STDDEV = 0.1;

  /**
   * Test whether a given file is amenable to CSV processing.
   */
  public static boolean isCSV(FileSystem fs, Path p) {
    String fname = p.getName();
    if (fname.endsWith(".csv")) {
      return true;
    }
    CSVParser parser = new CSVParser();
    try {
      BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(p)));
      try {
        // Sample up to MAX_LINES lines, recording how many CSV fields
        // each line parses into.
        int lineCount = 0;
        List<Integer> observedEltCounts = new ArrayList<Integer>();
        int totalEltCount = 0;
        int minEltCount = Integer.MAX_VALUE;
        int maxEltCount = -1;

        String line = null;
        while (lineCount < MAX_LINES && ((line = in.readLine()) != null)) {
          String[] parts = parser.parseLine(line);
          int numElts = parts.length;
          minEltCount = Math.min(minEltCount, numElts);
          maxEltCount = Math.max(maxEltCount, numElts);
          totalEltCount += numElts;
          observedEltCounts.add(numElts);
          lineCount++;
        }

        // Accept the file as CSV if enough lines were sampled, the mean
        // field count is high enough, and the per-line field counts are
        // tightly clustered (low coefficient of variation).
        double meanEltCount = totalEltCount / (1.0 * observedEltCounts.size());
        double totalVariance = 0;
        for (Integer v : observedEltCounts) {
          totalVariance += Math.pow(v - meanEltCount, 2);
        }
        double variance = totalVariance / observedEltCounts.size();
        double stddev = Math.sqrt(variance);
        if (lineCount >= MIN_LINE_COUNT &&
            meanEltCount >= MIN_MEAN_ELTS &&
            ((stddev / meanEltCount) < MAX_ALLOWABLE_LINE_STDDEV)) {
          return true;
        }
      } finally {
        in.close();
      }
    } catch (IOException ie) {
      // An unreadable file is simply treated as non-CSV.
    }
    return false;
  }

  public CSVDataDescriptor(Path p, FileSystem fs) throws IOException {
    super(p, fs, CSV_TYPE);
    schemas.add(new CSVSchemaDescriptor(this));
  }

  public CSVDataDescriptor(Path p, FileSystem fs, List<String> schemaReprs,
                           List<String> schemaDescs, List<byte[]> schemaBlobs)
      throws IOException {
    super(p, fs, CSV_TYPE, schemaReprs, schemaDescs, schemaBlobs);
  }

  public SchemaDescriptor loadSchemaDescriptor(String schemaRepr, String schemaId,
                                               byte[] blob) throws IOException {
    return new CSVSchemaDescriptor(this, schemaRepr, blob);
  }

  ///////////////////////////////////
  // GenericDataDescriptor
  ///////////////////////////////////
  public void prepareAvroFile(FileSystem srcFs, FileSystem dstFs, Path dst,
                              Configuration conf) throws IOException {
    // Convert the CSV contents into Avro records.
    SchemaDescriptor sd = this.getSchemaDescriptor().get(0);
    List<Schema> unionFreeSchemas = SchemaUtils.getUnionFreeSchemasByFrequency(sd, 100, true);
    Schema schema = unionFreeSchemas.get(0);
    String headerRowHash = new String(sd.getPayload());
    CSVRowParser rowParser = new CSVRowParser(schema, headerRowHash);

    // Open a stream to write out the Avro contents.
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(writer);
    dataFileWriter.create(schema, dstFs.create(dst, true));

    int numRecords = 0;
    int MAX_RECORDS = 1000;
    try {
      BufferedReader in = new BufferedReader(new InputStreamReader(srcFs.open(getFilename())));
      try {
        String rowStr = null;
        while (((rowStr = in.readLine()) != null) && (numRecords < MAX_RECORDS)) {
          // Skip the header row, identified by its stored hash.
          if (("" + rowStr.hashCode()).compareTo(headerRowHash) == 0) {
            continue;
          }
          GenericData.Record record = rowParser.parseRow(rowStr);
          if (record == null) {
            continue;
          }
          // Skip rows whose inferred schema does not match the target schema.
          if (record.getSchema().toString().hashCode() != schema.toString().hashCode()) {
            continue;
          }
          dataFileWriter.append(record);
          numRecords++;
        }
      } finally {
        in.close();
      }
    } finally {
      dataFileWriter.close();
    }
  }
}
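A note on how the isCSV heuristic behaves in practice: when a file lacks a .csv extension, it samples up to 25 lines and accepts the file only if at least 10 lines were read, the mean field count is at least 3, and the coefficient of variation (stddev / mean) of per-line field counts is below 0.1. For example, 25 sampled lines of which 24 parse into 7 fields and one into 8 give a mean of 7.04, a stddev of about 0.20, and a CV of about 0.028, so the file is accepted; a file with three lines of 2, 9, and 4 fields fails both the line-count threshold and the uniformity test.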
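To see the two pieces working together, here is a minimal usage sketch. It is not part of the original source: the input and output paths are hypothetical, the local filesystem is used purely for illustration, and it assumes the one-argument constructor has attached a usable CSVSchemaDescriptor (i.e., that schema inference has already run on the file).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.cloudera.recordbreaker.analyzer.CSVDataDescriptor;

public class CsvToAvroExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);        // local filesystem, for illustration
    Path src = new Path("/tmp/example-input.csv");    // hypothetical input path
    Path dst = new Path("/tmp/example-output.avro");  // hypothetical output path

    // Heuristic check: a .csv extension, or >= 10 sampled lines whose
    // per-line field counts cluster tightly around a mean of >= 3.
    if (CSVDataDescriptor.isCSV(fs, src)) {
      // Assumption: schema inference has run, so the constructor can
      // attach a CSVSchemaDescriptor that prepareAvroFile will read.
      CSVDataDescriptor desc = new CSVDataDescriptor(src, fs);

      // Writes up to 1,000 data rows of src to dst as Avro records,
      // skipping the header row and any rows that fail to parse.
      desc.prepareAvroFile(fs, fs, dst, conf);
    }
  }
}

Passing the same FileSystem for source and destination keeps the sketch simple; in prepareAvroFile's signature the two are separate, so the input could just as well be read from HDFS while the Avro output lands somewhere else.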