// Java tutorial
/* * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved. * * Cloudera, Inc. licenses this file to you under the Apache License, * Version 2.0 (the "License"). You may not use this file except in * compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. See the License for * the specific language governing permissions and limitations under the * License. */ package com.cloudera.recordbreaker.schemadict; import java.io.*; import java.util.*; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.PosixParser; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.file.DataFileReader; import org.apache.avro.reflect.ReflectDatumReader; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericArray; import org.apache.avro.generic.GenericFixed; import org.apache.avro.generic.GenericRecord; /**************************************************************** * SchemaSuggest generates labels for an Avro file with schema elts that are not * usefully named. It uses a dictionary of schemas/data with high-quality labels. * It compares the candidate avro data to everything in the dictionary, finding the * k most-similar entries. It then computes a mapping between the candidate schema * and each of the k best ones. The user can then use the resulting schema to replace * the candidate's badly-labeled one. * * This class is particularly useful when operating on an Avro schema that was * algorithmically-generated (\ie, through learn-avro). 
* * @author mjc ****************************************************************/ public class SchemaSuggest { int NUM_BUCKETS = 20; SchemaDictionary dict; boolean useAttributeLabels = true; List<List<SchemaDictionaryEntry>> dictBySize; /** * Load in the Schema Dictionary from the indicated file. */ public SchemaSuggest(File dataDir) throws IOException { this.dict = new SchemaDictionary(dataDir); // The 'dictBySize' structure allows us to perform schema inference // more quickly, by avoiding examination of schemas that can't possibly // be returned by inferSchemaMapping(). this.dictBySize = new ArrayList<List<SchemaDictionaryEntry>>(); for (int i = 0; i < NUM_BUCKETS; i++) { dictBySize.add(new ArrayList<SchemaDictionaryEntry>()); } for (SchemaDictionaryEntry elt : dict.contents()) { Schema comparisonSchema = elt.getSchema(); int comparisonSchemaSize = comparisonSchema.getFields().size(); if (comparisonSchemaSize < dictBySize.size() - 1) { dictBySize.get(comparisonSchemaSize - 1).add(elt); } else { dictBySize.get(dictBySize.size() - 1).add(elt); } } } /** * Should SchemaSuggest examine attribute labels when trying to find a match? * Typically this should be set to true. However, it can be useful to turn off * this feature for testing purposes. */ public void setUseAttributeLabels(boolean useAttributeLabels) { this.useAttributeLabels = useAttributeLabels; } /** * This method infers new schema labels for each element in the input. It returns a Schema object that * has the identical format as the input file's Schema object, but the labels may be changed. */ public List<DictionaryMapping> inferSchemaMapping(File avroFile, int k) throws IOException { SchemaStatisticalSummary srcSummary = new SchemaStatisticalSummary("input"); Schema srcSchema = srcSummary.createSummaryFromData(avroFile); srcSummary.setUseAttributeLabels(useAttributeLabels); // // Compare the statistics to the database of schema statistics. 
Find the closest matches, both // on a per-attribute basis and structurally. // int schemaSize = srcSchema.getFields().size(); // // We start testing the input database against known schemas that have an identical // number of attributes, which should allow for the best matches. This gives us an // initial set of distances. We then expand the search to schemas of greater or fewer // attributes, as long as a given bucket of size-k schemas has a min-distance of less // than the current top-k matches. // // TreeSet<DictionaryMapping> sorter = new TreeSet<DictionaryMapping>(); int numMatches = 0; List<Integer> seenIndexes = new ArrayList<Integer>(); int searchRadius = 0; boolean seenAllCandidates = false; int srcSchemaSize = srcSchema.getFields().size(); int totalSchemasExamined = 0; while (!seenAllCandidates) { // Examine the relevant schema buckets, compute all matches to those schemas for (int j = Math.max(1, srcSchemaSize - searchRadius); j <= Math.min(NUM_BUCKETS, srcSchemaSize + searchRadius); j++) { if (seenIndexes.contains(j - 1)) { continue; } for (SchemaDictionaryEntry elt : dictBySize.get(j - 1)) { ///////////////////////////// // This is where we instrument the mapping stuff. // If the pair is an interesting one, then emit the mapping that // we discover. Why are good matches going undiscovered? ///////////////////////////// SchemaMapping mapping = srcSummary.getBestMapping(elt.getSummary()); if (srcSchema.getName().equals(elt.getSchema().getName())) { System.err .println("Comparing " + srcSchema.getName() + " with " + elt.getSchema().getName()); System.err.println("Obtained mapping: " + mapping.toString()); } totalSchemasExamined++; sorter.add(new DictionaryMapping(mapping, elt)); numMatches++; } seenIndexes.add(j - 1); } // Have we examined the entire corpus of known schemas? 
if ((srcSchemaSize - searchRadius) <= 1 && (srcSchemaSize + searchRadius) >= NUM_BUCKETS) { seenAllCandidates = true; } else { // Test to see if the best matches are good enough that we can stop looking. // We compare the lowest known match distance to the minimum distance for matches // in the closest non-examined buckets. int lowestSize = srcSchemaSize - searchRadius - 1; int highestSize = srcSchemaSize + searchRadius + 1; double minNearbyDistance = Double.MAX_VALUE; if (lowestSize >= 1) { minNearbyDistance = Math.min(minNearbyDistance, SchemaStatisticalSummary.getMinimumMappingCost(srcSchemaSize, lowestSize)); } if (highestSize <= NUM_BUCKETS) { minNearbyDistance = Math.min(minNearbyDistance, SchemaStatisticalSummary.getMinimumMappingCost(srcSchemaSize, highestSize)); } // Grab from the Sorter the elt that is MIN_ELTS_SUGGESTED into the sorted list if (sorter.size() >= k) { DictionaryMapping testDictMapping = null; int idx = 0; for (DictionaryMapping cur : sorter) { idx++; if (idx == k) { testDictMapping = cur; break; } } if (testDictMapping.getMapping().getDist() < minNearbyDistance) { seenAllCandidates = true; } } } searchRadius++; } // Return the k best schema mappings double smallestDistance = sorter.first().getMapping().getDist(); List<DictionaryMapping> dsts = new ArrayList<DictionaryMapping>(); for (DictionaryMapping dp : sorter) { if (dsts.size() > k && dp.getMapping().getDist() > smallestDistance) { break; } dsts.add(dp); } double pct = totalSchemasExamined / (1.0 * dict.contents().size()); System.err.println("Final search radius of " + searchRadius + " yielded a search over " + pct + " of all known databases."); return dsts; } /** * SchemaSuggest takes an avro file where schema elements may be anonymous. It then attempts to * compute good labels for the anonymous elts. By default, this tool simply prints out the * suggested labels, if any. The user may include a flag to rewrite the input data using * the new labels. 
* * schemaSuggest avroFile * */ public static void main(String argv[]) throws IOException { CommandLine cmd = null; boolean debug = false; Options options = new Options(); options.addOption("?", false, "Help for command-line"); options.addOption("f", true, "Accept suggestions and rewrite input to a new Avro file"); options.addOption("d", false, "Debug mode"); options.addOption("k", true, "How many matches to emit."); try { CommandLineParser parser = new PosixParser(); cmd = parser.parse(options, argv); } catch (ParseException e) { HelpFormatter fmt = new HelpFormatter(); fmt.printHelp("SchemaSuggest", options, true); System.err.println("Required inputs: <schemadictionary> <anonymousAvro>"); System.exit(-1); } if (cmd.hasOption("?")) { HelpFormatter fmt = new HelpFormatter(); fmt.printHelp("SchemaSuggest", options, true); System.err.println("Required inputs: <schemadictionary> <anonymousAvro>"); System.exit(0); } if (cmd.hasOption("d")) { debug = true; } int k = 1; if (cmd.hasOption("k")) { try { k = Integer.parseInt(cmd.getOptionValue("k")); } catch (NumberFormatException nfe) { } } String[] argArray = cmd.getArgs(); if (argArray.length < 2) { HelpFormatter fmt = new HelpFormatter(); fmt.printHelp("SchemaSuggest", options, true); System.err.println("Required inputs: <schemadictionary> <anonymousAvro>"); System.exit(0); } File dataDir = new File(argArray[0]).getCanonicalFile(); File inputData = new File(argArray[1]).getCanonicalFile(); SchemaSuggest ss = new SchemaSuggest(dataDir); List<DictionaryMapping> mappings = ss.inferSchemaMapping(inputData, k); if (!cmd.hasOption("f")) { System.out.println("Ranking of closest known data types, with match-distance (smaller is better):"); int counter = 1; for (DictionaryMapping mapping : mappings) { SchemaMapping sm = mapping.getMapping(); List<SchemaMappingOp> bestOps = sm.getMapping(); System.err.println(); System.err.println(); System.err.println("-------------------------------------------------------------"); 
System.out.println( counter + ". '" + mapping.getDictEntry().getInfo() + "', with distance: " + sm.getDist()); List<SchemaMappingOp> renames = new ArrayList<SchemaMappingOp>(); List<SchemaMappingOp> extraInTarget = new ArrayList<SchemaMappingOp>(); List<SchemaMappingOp> extraInSource = new ArrayList<SchemaMappingOp>(); for (SchemaMappingOp op : bestOps) { if (op.opcode == SchemaMappingOp.CREATE_OP) { extraInTarget.add(op); } else if (op.opcode == SchemaMappingOp.DELETE_OP) { if (op.getS1DatasetLabel().compareTo("input") == 0) { extraInSource.add(op); } else { extraInTarget.add(op); } } else if (op.opcode == SchemaMappingOp.TRANSFORM_OP) { renames.add(op); } } System.err.println(); System.err.println(" DISCOVERED LABELS"); int counterIn = 1; if (renames.size() == 0) { System.err.println(" (None)"); } else { for (SchemaMappingOp op : renames) { System.err.println(" " + counterIn + ". " + "In '" + op.getS1DatasetLabel() + "', label '" + op.getS1FieldLabel() + "' AS " + op.getS2FieldLabel()); if (debug) { if (op.getS1DocStr() != null && op.getS1DocStr().length() > 0) { System.err.println( " '" + op.getS1DocStr() + "' ==> '" + op.getS2DocStr() + "'"); } } counterIn++; } } System.err.println(); System.err.println(" UNMATCHED ITEMS IN TARGET DATA TYPE"); counterIn = 1; if (extraInTarget.size() == 0) { System.err.println(" (None)"); } else { for (SchemaMappingOp op : extraInTarget) { System.err.println(" " + counterIn + ". " + op.getS1FieldLabel()); if (debug) { if (op.getS1DocStr() != null && op.getS1DocStr().length() > 0) { System.err.println(" " + op.getS1DocStr()); } } counterIn++; } } System.err.println(); System.err.println(" UNMATCHED ITEMS IN SOURCE DATA"); counterIn = 1; if (extraInSource.size() == 0) { System.err.println(" (None)"); } else { for (SchemaMappingOp op : extraInSource) { System.err.println(" " + counterIn + ". 
" + op.getS1FieldLabel()); if (debug) { if (op.getS1DocStr() != null && op.getS1DocStr().length() > 0) { System.err.println(" " + op.getS1DocStr()); } } counterIn++; } } counter++; } } } }