Java tutorial
/* * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved. * * Cloudera, Inc. licenses this file to you under the Apache License, * Version 2.0 (the "License"). You may not use this file except in * compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. See the License for * the specific language governing permissions and limitations under the * License. */ package com.cloudera.learnavro.test; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.io.BufferedOutputStream; import java.io.DataOutputStream; import java.io.OutputStreamWriter; import java.io.FileOutputStream; import java.util.Random; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; import org.apache.avro.io.DatumWriter; import org.apache.avro.io.Encoder; import org.apache.avro.io.JsonEncoder; import org.apache.avro.reflect.ReflectDatumWriter; import org.apache.avro.reflect.ReflectData; import org.codehaus.jackson.JsonGenerator; import com.cloudera.recordbreaker.schemadict.TestRecord; import com.cloudera.recordbreaker.schemadict.SchemaSuggest; /********************************************************************* * This class generates a number of test data files for the schema inference program. * It makes data for five different genres: * 1) A Web crawl * 2) An access log * 3) A file listing * 4) Sensor data * 5) Purchase transactions * * We also attempt to generate statistically-plausible data for each * * @author mjc ***********************************************************************/ public class GenerateTestAvro { static long DAY_IN_MILLIS = 1000 * 60 * 60 * 24; static long WEEK_IN_MILLIS = 7 * DAY_IN_MILLIS; static int NUM_ALPHA = 32; static int CAPITAL_A = 65; static int LOWER_A = 97; static int RESULT_CODES[] = { 200, 401, 501, 301, 403 }; static double RESULT_CODE_DIST[] = { .8, .1, .02, .02, .06 }; static Random r = new Random(3333); public GenerateTestAvro() { } /** * Main method for building all the test data files. */ public void generateData(File outDir, int numRecords) throws IOException, InstantiationException { // Create the target dir outDir = outDir.getCanonicalFile(); if (outDir.exists()) { throw new IOException("Directory already exists: " + outDir); } outDir.mkdirs(); // // Emit WebPage data. Note the weird "Instantiator" business that appears as if it could be done // via Class.newInstance(). We can't do that here because newInstance() is incompatible with inner // classes. // Schema webCrawlSchema = ReflectData.get().getSchema(WebPage.class); emitSchema(new File(outDir, "webcrawl.schema"), webCrawlSchema); emitData(new File(outDir, "webcrawl.dat"), webCrawlSchema, numRecords, new Instantiator<WebPage>() { public WebPage create() { return new WebPage(); } }); // // Access log // Schema accessLogSchema = ReflectData.get().getSchema(AccessLog.class); emitSchema(new File(outDir, "accesslog.schema"), accessLogSchema); emitData(new File(outDir, "accesslog.dat"), accessLogSchema, numRecords, new Instantiator<AccessLog>() { public AccessLog create() { return new AccessLog(); } }); // // File listing // Schema fileListingSchema = ReflectData.get().getSchema(FileListing.class); emitSchema(new File(outDir, "filelisting.schema"), fileListingSchema); emitData(new File(outDir, "filelisting.dat"), fileListingSchema, numRecords, new Instantiator<FileListing>() { public FileListing create() { return new FileListing(); } }); // // Sensor data // Schema sensorDataSchema = ReflectData.get().getSchema(SensorData.class); emitSchema(new File(outDir, "sensordata.schema"), sensorDataSchema); emitData(new File(outDir, "sensordata.dat"), sensorDataSchema, numRecords, new Instantiator<SensorData>() { public SensorData create() { return new SensorData(); } }); // // Purchases // Schema purchaseSchema = ReflectData.get().getSchema(Purchase.class); emitSchema(new File(outDir, "purchase.schema"), purchaseSchema); emitData(new File(outDir, "purchase.dat"), purchaseSchema, numRecords, new Instantiator<Purchase>() { public Purchase create() { return new Purchase(); } }); } /////////////////////////// // The sample classes /////////////////////////// /** * Data type #1: WebPage */ public class WebPage { String url; long dateCrawled; double rank; int lastResultCode; int failedAttempts; long nextCrawl; String content; public WebPage() { this.url = "http://" + generateRandomString(10, 100); this.dateCrawled = System.currentTimeMillis() - (Math.abs(r.nextLong()) % WEEK_IN_MILLIS); this.rank = r.nextDouble(); this.lastResultCode = RESULT_CODES[chooseIndex(RESULT_CODE_DIST)]; this.failedAttempts = r.nextInt(3); this.nextCrawl = dateCrawled + WEEK_IN_MILLIS; this.content = generateRandomString(1024, 10 * 1024); } } /** * Data type #2: access log. (Taken from Pavlo, et al, SIGMOD 2009) */ public class AccessLog { String srcIP; String destURL; long visitDate; float adRevenue; String userAgent; String countryCode; String languageCode; String searchWord; int duration; public AccessLog() { this.srcIP = generateRandomString(12, 12); this.destURL = "http://" + generateRandomString(10, 100); this.visitDate = System.currentTimeMillis() - (Math.abs(r.nextLong()) % DAY_IN_MILLIS); this.adRevenue = Math.abs(r.nextFloat()) * 100; this.userAgent = generateRandomString(4, 10); this.countryCode = generateRandomString(2, 2); this.languageCode = generateRandomString(4, 4); this.searchWord = generateRandomString(4, 20); this.duration = r.nextInt(10000); } } /** * Data type #3: file listing */ public class FileListing { String permissions; String user; String group; int size; String month; int day; String time; String filename; public FileListing() { this.permissions = generateRandomString(10, 10); this.user = generateRandomString(2, 8); this.group = generateRandomString(5, 8); this.size = r.nextInt(9086); this.month = generateRandomString(3, 3); this.day = r.nextInt(31); this.time = generateRandomString(5, 5); this.filename = generateRandomString(3, 20); } } /** * Data type #4: sensor data */ public class SensorData { double temp; double lumens; double pressure; long timestamp; int xpos; int ypos; int zpos; public SensorData() { this.temp = r.nextDouble() * 120; this.lumens = r.nextDouble() * 15000; this.pressure = r.nextDouble(); this.timestamp = System.currentTimeMillis() + (Math.abs(r.nextLong()) % DAY_IN_MILLIS); this.xpos = r.nextInt(1000); this.ypos = r.nextInt(1000); this.zpos = r.nextInt(1000); } } /** * Data type #5: purchases */ public class Purchase { long productCode; String productDesc; double price; long timestamp; int quantity; public Purchase() { this.productCode = Math.abs(r.nextLong()); this.productDesc = generateRandomString(15, 25); this.price = r.nextDouble() * 10000; this.timestamp = System.currentTimeMillis() + (Math.abs(r.nextLong()) % DAY_IN_MILLIS); this.quantity = r.nextInt(10); } } ///////////////////////////// // Utility file-handling ////////////////////////////// /** */ void emitSchema(File outSchema, Schema schema) throws IOException { OutputStreamWriter out = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(outSchema))); try { out.write(schema.toString(true)); } finally { out.close(); } } /** */ void emitData(File outData, Schema schema, int numRecords, Instantiator inster) throws IOException, InstantiationException { DatumWriter dout = new ReflectDatumWriter(schema); DataFileWriter out = new DataFileWriter(dout); out = out.create(schema, outData); try { for (int i = 0; i < numRecords; i++) { out.append(inster.create()); } //encoder.flush(); } finally { out.close(); } } ///////////////////////////// // Utility class-handling ////////////////////////////// interface Instantiator<T> { public T create(); } ///////////////////////////// // Utility random-gen ////////////////////////////// String generateRandomString(int minLen, int maxLen) { int target = minLen; if (maxLen - minLen > 0) { target += r.nextInt(maxLen - minLen); } StringBuffer buf = new StringBuffer(); for (int i = 0; i < target; i++) { int rval = r.nextInt(NUM_ALPHA * 2); if (rval < NUM_ALPHA) { buf.append((char) (CAPITAL_A + rval)); } else { buf.append((char) (LOWER_A + rval)); } } return buf.toString(); } /** * We require, but do not test, that the contents of distribution sums to 1.0 */ int chooseIndex(double[] distribution) { double target = r.nextDouble(); for (int i = 0; i < distribution.length; i++) { target -= distribution[i]; if (target <= 0) { return i; } } return distribution.length - 1; } /** */ public static void main(String argv[]) throws IOException, InstantiationException { CommandLine cmd = null; Options options = new Options(); options.addOption("?", false, "Help for command-line"); options.addOption("n", true, "# tuples to emit per file"); try { CommandLineParser parser = new PosixParser(); cmd = parser.parse(options, argv); } catch (ParseException pe) { HelpFormatter fmt = new HelpFormatter(); fmt.printHelp("GenerateTestAvro", options, true); System.exit(-1); } if (cmd.hasOption("?")) { HelpFormatter fmt = new HelpFormatter(); fmt.printHelp("GenerateTestAvro", options, true); System.exit(0); } int numToEmit = 100; if (cmd.hasOption("n")) { try { numToEmit = Integer.parseInt(cmd.getOptionValue("n")); } catch (NumberFormatException nfe) { nfe.printStackTrace(); } } String[] argArray = cmd.getArgs(); if (argArray.length == 0) { HelpFormatter fmt = new HelpFormatter(); fmt.printHelp("GenerateTestAvro", options, true); System.exit(0); } File outputDir = new File(argArray[0]).getCanonicalFile(); GenerateTestAvro gta = new GenerateTestAvro(); gta.generateData(outputDir, numToEmit); } }