Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with this * work for additional information regarding copyright ownership. The ASF * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package org.apache.pig.builtin; import static org.apache.pig.builtin.mock.Storage.resetData; import static org.apache.pig.builtin.mock.Storage.tuple; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import java.util.Iterator; import com.google.common.io.Closeables; import org.apache.avro.file.DataFileStream; import org.apache.avro.generic.GenericContainer; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericData.Record; import org.apache.avro.generic.GenericDatumReader; import org.apache.avro.tool.DataFileWriteTool; import org.apache.avro.tool.Tool; import org.apache.avro.tool.TrevniCreateRandomTool; import org.apache.avro.tool.TrevniToJsonTool; import org.apache.avro.util.Utf8; import org.apache.commons.io.FileUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecJob; import org.apache.pig.backend.executionengine.ExecJob.JOB_STATUS; import org.apache.pig.builtin.mock.Storage.Data; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.impl.util.Utils; import org.apache.pig.impl.util.avro.AvroBagWrapper; import org.apache.pig.impl.util.avro.AvroMapWrapper; import org.apache.pig.impl.util.avro.AvroTupleWrapper; import org.apache.pig.test.Util; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import com.google.common.collect.ImmutableMap; public class TestAvroStorage { final protected static Log LOG = LogFactory.getLog(TestAvroStorage.class); final private static String basedir = "test/org/apache/pig/builtin/avro/"; private static String outbasedir = System.getProperty("user.dir") + "/build/test/TestAvroStorage/"; final private static String[] datadir = { "data/avro", "data/avro/compressed", "data/avro/compressed/snappy", "data/avro/compressed/deflate", "data/avro/uncompressed", "data/avro/uncompressed/testdirectory", "data/json/testdirectory", "data/trevni", "data/trevni/uncompressed", }; final private static String[] avroSchemas = { "arraysAsOutputByPig", "arrays", "recordsAsOutputByPig", "recordsAsOutputByPigWithDates", "records", "recordsOfArrays", "recordsOfStringArrays", "recordsOfArraysOfRecords", "recordsSubSchema", "recordsSubSchemaNullable", "recordsWithDoubleUnderscores", "recordsWithEnums", "recordsWithFixed", "recordsWithMaps", "recordsWithMapsOfRecords", "recordsWithMapsOfArrayOfRecords", "recordsWithNullableUnions", "recordWithRepeatedSubRecords", "recursiveRecord", "projectionTest", "projectionTestWithSchema", "recordsWithSimpleUnion", "recordsWithSimpleUnionOutput", }; final private static String[] trevniSchemas = { "simpleRecordsTrevni", }; private static PigServer pigServerLocal = null; public static final PathFilter hiddenPathFilter = new PathFilter() { @Override public boolean accept(Path p) { String name = p.getName(); return !name.startsWith("_") && !name.startsWith("."); } }; private static String loadFileIntoString(String file) throws IOException { return FileUtils.readFileToString(new File(file)).replace("\n", " "); } @BeforeClass public static void setup() throws Exception { pigServerLocal = new PigServer(Util.getLocalTestMode()); Util.deleteDirectory(new File(outbasedir)); generateInputFiles(); } @AfterClass public static void teardown() throws IOException { if (pigServerLocal != null) { pigServerLocal.shutdown(); } deleteInputFiles(); } /** * Generate input files for test cases */ private static void generateInputFiles() throws IOException { String data; String json; String avro; String trevni; String schema; for (String fn : datadir) { FileUtils.forceMkdir(new File(basedir + fn)); } for (String fn : avroSchemas) { schema = basedir + "schema/" + fn + ".avsc"; json = basedir + "data/json/" + fn + ".json"; avro = basedir + "data/avro/uncompressed/" + fn + ".avro"; LOG.info("creating " + avro); generateAvroFile(schema, json, avro, null); } for (String fn : trevniSchemas) { schema = basedir + "schema/" + fn + ".avsc"; trevni = basedir + "data/trevni/uncompressed/" + fn + ".trevni"; json = basedir + "data/json/" + fn + ".json"; LOG.info("creating " + trevni); generateRandomTrevniFile(schema, "1000", trevni); LOG.info("creating " + json); convertTrevniToJsonFile(schema, trevni, json); schema = basedir + "schema/" + fn + ".avsc"; json = basedir + "data/json/" + fn + ".json"; avro = basedir + "data/avro/uncompressed/" + fn + ".avro"; LOG.info("creating " + avro); generateAvroFile(schema, json, avro, null); } PrintWriter w; int itemSum = 0; int recordCount = 0; int evenFileNameItemSum = 0; int evenFileNameRecordCount = 0; for (int i = 0; i < 8; i++) { schema = basedir + "schema/testDirectory.avsc"; json = basedir + "data/json/testdirectory/part-m-0000" + i; avro = basedir + "data/avro/uncompressed/testdirectory/part-m-0000" + i + ".avro"; LOG.info("creating " + json); w = new PrintWriter(new FileWriter(json)); for (int j = i * 1000; j < (i + 1) * 1000; j++) { itemSum += j; recordCount += 1; evenFileNameItemSum += ((i + 1) % 2) * j; evenFileNameRecordCount += (i + 1) % 2; w.println("{\"item\" : " + j + ", \"timestamp\" : " + System.currentTimeMillis() + " }\n"); } w.close(); generateAvroFile(schema, json, avro, null); } schema = basedir + "schema/testDirectoryCounts.avsc"; json = basedir + "data/json/testDirectoryCounts.json"; avro = basedir + "data/avro/uncompressed/testDirectoryCounts.avro"; data = "{\"itemSum\" : {\"int\" : " + itemSum + "}, \"n\" : {\"int\" : " + recordCount + "} }"; LOG.info("creating " + json); FileUtils.writeStringToFile(new File(json), data); LOG.info("creating " + avro); generateAvroFile(schema, json, avro, null); schema = basedir + "schema/testDirectoryCounts.avsc"; json = basedir + "data/json/evenFileNameTestDirectoryCounts.json"; avro = basedir + "data/avro/uncompressed/evenFileNameTestDirectoryCounts.avro"; data = "{\"itemSum\" : {\"int\" : " + evenFileNameItemSum + "}, \"n\" : {\"int\" : " + evenFileNameRecordCount + "} }"; LOG.info("creating " + json); FileUtils.writeStringToFile(new File(json), data); LOG.info("creating " + avro); generateAvroFile(schema, json, avro, null); for (String codec : new String[] { "deflate", "snappy" }) { for (String fn : new String[] { "records", "recordsAsOutputByPig" }) { schema = basedir + "schema/" + fn + ".avsc"; json = basedir + "data/json/" + fn + ".json"; avro = basedir + "data/avro/compressed/" + codec + "/" + fn + ".avro"; LOG.info("creating " + avro); generateAvroFile(schema, json, avro, codec); } } } /** * Clean up generated input files */ private static void deleteInputFiles() throws IOException { // Delete auto-generated directories for (String fn : datadir) { LOG.info("Deleting " + basedir + fn); FileUtils.deleteQuietly(new File(basedir + fn)); } // Delete auto-generated json files String json; for (String fn : trevniSchemas) { json = basedir + "data/json/" + fn + ".json"; LOG.info("Deleting " + json); FileUtils.deleteQuietly(new File(json)); } json = basedir + "data/json/testDirectoryCounts.json"; LOG.info("Deleting " + json); FileUtils.deleteQuietly(new File(json)); json = basedir + "data/json/evenFileNameTestDirectoryCounts.json"; LOG.info("Deleting " + json); FileUtils.deleteQuietly(new File(json)); } private static void generateAvroFile(String schema, String json, String avro, String codec) throws IOException { Tool tool = new DataFileWriteTool(); List<String> args = new ArrayList<String>(); args.add("--schema-file"); args.add(schema); args.add(json); if (codec != null) { args.add("--codec"); args.add(codec); } try { StringBuffer sb = new StringBuffer(); for (String a : args) { sb.append(a); sb.append(" "); } PrintStream out = new PrintStream(avro); tool.run(System.in, out, System.err, args); } catch (Exception e) { LOG.info("Could not generate avro file: " + avro, e); throw new IOException(); } } private static void generateRandomTrevniFile(String schema, String count, String trevni) throws IOException { Tool tool = new TrevniCreateRandomTool(); List<String> args = new ArrayList<String>(); args.add(schema); args.add(count); args.add(trevni); try { tool.run(System.in, System.out, System.err, args); } catch (Exception e) { LOG.info("Could not generate trevni file: " + trevni, e); throw new IOException(); } } private static void convertTrevniToJsonFile(String schema, String trevni, String json) throws IOException { Tool tool = new TrevniToJsonTool(); List<String> args = new ArrayList<String>(); args.add(trevni); try { PrintStream out = new PrintStream(json); tool.run(System.in, out, System.err, args); } catch (Exception e) { LOG.info("Could not generate json file: " + json, e); throw new IOException(); } } private String createOutputName() { final StackTraceElement[] st = Thread.currentThread().getStackTrace(); if (Util.WINDOWS) { outbasedir = outbasedir.replace('\\', '/'); } int index; if (Utils.isVendorIBM()) { index = 3; } else { index = 2; } return outbasedir + st[index].getMethodName(); } @Test public void testLoadRecordsOfStringArrays() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsOfStringArrays.avro"; testAvroStorage(true, basedir + "code/pig/dump.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsOfStringArrays.avsc", "OUTFILE", createOutputName())); } @Test public void testLoadRecords() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro"; testAvroStorage(true, basedir + "code/pig/identity_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsWithSimpleUnion() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsWithSimpleUnion.avro"; final String check = basedir + "data/avro/uncompressed/recordsWithSimpleUnionOutput.avro"; testAvroStorage(true, basedir + "code/pig/identity_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin -f " + basedir + "schema/recordsSubSchema.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testProjection() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/projectionTest.avro"; testAvroStorage(true, basedir + "code/pig/projection_test.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "projectionTest", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testProjectionWithSchema() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/projectionTestWithSchema.avro"; testAvroStorage(true, basedir + "code/pig/projection_test_with_schema.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_IN_2", "-f " + basedir + "schema/records.avsc", "AVROSTORAGE_OUT_1", "projectionTest", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testDates() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPigWithDates.avro"; testAvroStorage(true, basedir + "code/pig/with_dates.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsSpecifyFullSchema() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro"; final String schema = loadFileIntoString(basedir + "schema/records.avsc"); testAvroStorage(true, basedir + "code/pig/identity_ai1_ao2.pig", ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "AVROSTORAGE_IN_1", schema)); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsSpecifyFullSchemaFromClass() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro"; testAvroStorage(true, basedir + "code/pig/identity.pig", ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_IN_2", "-c org.apache.pig.builtin.avro.code.java.RecordPojo", "AVROSTORAGE_OUT_1", "''", "AVROSTORAGE_OUT_2", "-c org.apache.pig.builtin.avro.code.java.RecordPojo")); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsSpecifyFullSchemaFromFile() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro"; testAvroStorage(true, basedir + "code/pig/identity.pig", ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "AVROSTORAGE_IN_2", "-f " + basedir + "schema/records.avsc")); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsSpecifySubSchema() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsSubSchema.avro"; testAvroStorage(true, basedir + "code/pig/identity_ai1_ao2.pig", ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin -f " + basedir + "schema/recordsSubSchema.avsc", "AVROSTORAGE_IN_1", loadFileIntoString(basedir + "schema/recordsSubSchema.avsc"))); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsSpecifySubSchemaFromFile() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsSubSchema.avro"; testAvroStorage(true, basedir + "code/pig/identity_blank_first_args.pig", ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsSubSchema.avsc", "AVROSTORAGE_IN_2", "-f " + basedir + "schema/recordsSubSchema.avsc")); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsSpecifySubSchemaFromExampleFile() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsSubSchema.avro"; testAvroStorage(true, basedir + "code/pig/identity_blank_first_args.pig", ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsSubSchema.avsc", "AVROSTORAGE_IN_2", "-e " + basedir + "data/avro/uncompressed/recordsSubSchema.avro")); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsOfArrays() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsOfArrays.avro"; final String check = input; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsOfArrays.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsOfArraysOfRecords() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsOfArraysOfRecords.avro"; final String check = input; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsOfArraysOfRecords.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsWithEnums() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsWithEnums.avro"; final String check = input; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsWithEnums.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsWithFixed() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsWithFixed.avro"; final String check = input; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsWithFixed.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsWithMaps() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsWithMaps.avro"; final String check = input; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsWithMaps.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsWithMapsOfRecords() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsWithMapsOfRecords.avro"; final String check = input; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsWithMapsOfRecords.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsWithMapsOfArrayOfRecords() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsWithMapsOfArrayOfRecords.avro"; final String check = input; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsWithMapsOfArrayOfRecords.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsWithNullableUnions() throws Exception { final String input = basedir + "data/avro/uncompressed/recordsWithNullableUnions.avro"; final String check = input; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordsWithNullableUnions.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadDeflateCompressedRecords() throws Exception { final String input = basedir + "data/avro/compressed/deflate/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro"; testAvroStorage(true, basedir + "code/pig/identity_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadSnappyCompressedRecords() throws Exception { final String input = basedir + "data/avro/compressed/snappy/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro"; testAvroStorage(true, basedir + "code/pig/identity_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testStoreDeflateCompressedRecords() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/compressed/deflate/recordsAsOutputByPig.avro"; testAvroStorage(true, basedir + "code/pig/identity_codec.pig", ImmutableMap.<String, String>builder().put("INFILE", input).put("OUTFILE", createOutputName()) .put("AVROSTORAGE_OUT_1", "records") .put("AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin").put("CODEC", "deflate") .put("LEVEL", "6").build()); verifyResults(createOutputName(), check); } @Test public void testStoreSnappyCompressedRecords() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/compressed/snappy/recordsAsOutputByPig.avro"; testAvroStorage(true, basedir + "code/pig/identity_codec.pig", ImmutableMap.<String, String>builder().put("INFILE", input).put("OUTFILE", createOutputName()) .put("AVROSTORAGE_OUT_1", "records") .put("AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin").put("CODEC", "snappy") .put("LEVEL", "6").build()); verifyResults(createOutputName(), check); } @Test public void testLoadRecursiveRecords() throws Exception { final String input = basedir + "data/avro/uncompressed/recursiveRecord.avro"; testAvroStorage(false, basedir + "code/pig/recursive_tests.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_IN_2", "-n this.is.ignored.in.a.loadFunc", "AVROSTORAGE_OUT_1", "recordsSubSchemaNullable", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); } @Test public void testLoadRecursiveRecordsOptionOn() throws Exception { final String input = basedir + "data/avro/uncompressed/recursiveRecord.avro"; final String check = basedir + "data/avro/uncompressed/recordsSubSchemaNullable.avro"; testAvroStorage(true, basedir + "code/pig/recursive_tests.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_IN_2", "-r", "AVROSTORAGE_OUT_1", "recordsSubSchemaNullable", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadRecordsWithRepeatedSubRecords() throws Exception { final String input = basedir + "data/avro/uncompressed/recordWithRepeatedSubRecords.avro"; final String check = basedir + "data/avro/uncompressed/recordWithRepeatedSubRecords.avro"; testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/recordWithRepeatedSubRecords.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadDirectory() throws Exception { final String input = basedir + "data/avro/uncompressed/testdirectory"; final String check = basedir + "data/avro/uncompressed/testDirectoryCounts.avro"; testAvroStorage(true, basedir + "code/pig/directory_test.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "stats", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadGlob() throws Exception { final String input = basedir + "data/avro/uncompressed/testdirectory/part-m-0000*"; final String check = basedir + "data/avro/uncompressed/testDirectoryCounts.avro"; testAvroStorage(true, basedir + "code/pig/directory_test.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "stats", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testPartialLoadGlob() throws Exception { final String input = basedir + "data/avro/uncompressed/testdirectory/part-m-0000{0,2,4,6}.avro"; final String check = basedir + "data/avro/uncompressed/evenFileNameTestDirectoryCounts.avro"; testAvroStorage(true, basedir + "code/pig/directory_test.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "stats", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testSeparatedByComma() throws Exception { final String temp = basedir + "data/avro/uncompressed/testdirectory/part-m-0000"; StringBuffer sb = new StringBuffer(); sb.append(temp + "0.avro"); for (int i = 1; i <= 7; ++i) { sb.append(","); sb.append(temp + String.valueOf(i) + ".avro"); } final String input = sb.toString(); final String check = basedir + "data/avro/uncompressed/testDirectoryCounts.avro"; testAvroStorage(true, basedir + "code/pig/directory_test.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "stats", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testDoubleUnderscore() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; final String check = basedir + "data/avro/uncompressed/recordsWithDoubleUnderscores.avro"; testAvroStorage(true, basedir + "code/pig/namesWithDoubleColons.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "recordsWithDoubleUnderscores", "AVROSTORAGE_OUT_2", "-d -n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testDoubleUnderscoreNoFlag() throws Exception { final String input = basedir + "data/avro/uncompressed/records.avro"; testAvroStorage(false, basedir + "code/pig/namesWithDoubleColons.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "recordsWithDoubleUnderscores", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); } @Test public void testLoadArrays() throws Exception { final String input = basedir + "data/avro/uncompressed/arrays.avro"; final String check = basedir + "data/avro/uncompressed/arraysAsOutputByPig.avro"; testAvroStorage(true, basedir + "code/pig/identity_ao2.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "arrays", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadTrevniRecords() throws Exception { final String input = basedir + "data/trevni/uncompressed/simpleRecordsTrevni.trevni"; final String check = basedir + "data/avro/uncompressed/simpleRecordsTrevni.avro"; testAvroStorage(true, basedir + "code/pig/trevni_to_avro.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/simpleRecordsTrevni.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testLoadAndSaveTrevniRecords() throws Exception { final String input = basedir + "data/trevni/uncompressed/simpleRecordsTrevni.trevni"; final String check = basedir + "data/avro/uncompressed/simpleRecordsTrevni.avro"; testAvroStorage(true, basedir + "code/pig/trevni_to_trevni.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/simpleRecordsTrevni.avsc", "OUTFILE", createOutputName() + "Trevni")); testAvroStorage(true, basedir + "code/pig/trevni_to_avro.pig", ImmutableMap.of("INFILE", createOutputName() + "Trevni", "AVROSTORAGE_OUT_2", "-f " + basedir + "schema/simpleRecordsTrevni.avsc", "OUTFILE", createOutputName())); verifyResults(createOutputName(), check); } @Test public void testRetrieveDataFromMap() throws Exception { pigServerLocal = new PigServer(Util.getLocalTestMode()); Data data = resetData(pigServerLocal); Map<String, String> mapv1 = new HashMap<String, String>(); mapv1.put("key1", "v11"); mapv1.put("key2", "v12"); Map<String, String> mapv2 = new HashMap<String, String>(); mapv2.put("key1", "v21"); mapv2.put("key2", "v22"); data.set("testMap", "maps:map[chararray]", tuple(mapv1), tuple(mapv2)); String schemaDescription = new String("{" + "\"type\": \"record\"," + "\"name\": \"record\"," + "\"fields\" : [" + "{\"name\" : \"maps\", \"type\" :{\"type\" : \"map\", \"values\" : \"string\"}}" + "]" + "}"); pigServerLocal.registerQuery("A = LOAD 'testMap' USING mock.Storage();"); pigServerLocal.registerQuery( "STORE A INTO '" + createOutputName() + "' USING AvroStorage('" + schemaDescription + "');"); pigServerLocal.registerQuery("B = LOAD '" + createOutputName() + "' USING AvroStorage();"); pigServerLocal.registerQuery("C = FOREACH B generate maps#'key1';"); pigServerLocal.registerQuery("STORE C INTO 'out' USING mock.Storage();"); List<Tuple> out = data.get("out"); assertEquals(tuple("v11"), out.get(0)); assertEquals(tuple("v21"), out.get(1)); } @SuppressWarnings("rawtypes") @Test public void testCompareToOfBagWrapper() throws Exception { final String check = basedir + "data/avro/uncompressed/arraysAsOutputByPig.avro"; Set<GenericData.Record> records = getExpected(check); assertEquals(3, records.size()); AvroBagWrapper size0 = null; // [ ] AvroBagWrapper size1 = null; // [ 6 ] AvroBagWrapper size5 = null; // [ 1, 2, 3, 4, 5 ] // 3 arrays in records are in an arbitrary order. We re-order them by // their size. for (GenericData.Record record : records) { AvroTupleWrapper tw = new AvroTupleWrapper<GenericData.Record>(record); if (((AvroBagWrapper) tw.get(0)).size() == 0) { size0 = (AvroBagWrapper) tw.get(0); } else if (((AvroBagWrapper) tw.get(0)).size() == 1) { size1 = (AvroBagWrapper) tw.get(0); } else if (((AvroBagWrapper) tw.get(0)).size() == 5) { size5 = (AvroBagWrapper) tw.get(0); } } assertEquals(0, size0.size()); assertEquals(1, size1.size()); assertEquals(5, size5.size()); assertTrue(size0.compareTo(size0) == 0); assertTrue(size0.compareTo(size1) < 0); assertTrue(size0.compareTo(size5) < 0); assertTrue(size1.compareTo(size0) > 0); // 6 > 1, so size1 > size5 even though size1 is smaller than size5. assertTrue(size1.compareTo(size5) > 0); } @Test public void testUtf8KeyLookupFromMap() throws Exception { Map<CharSequence, Object> tm = new TreeMap<CharSequence, Object>(); tm.put(new Utf8("foo"), "foo"); tm.put(new Utf8("bar"), "bar"); AvroMapWrapper wrapper = new AvroMapWrapper(tm); String v = (String) wrapper.get(new Utf8("foo")); assertEquals("foo", v); v = (String) wrapper.get(new Utf8("bar")); assertEquals("bar", v); } @Test public void testAvroMapWrapper() throws Exception { final Map<CharSequence, Object> m = new HashMap<CharSequence, Object>(); for (String fn : avroSchemas) { final String avro = basedir + "data/avro/uncompressed/" + fn + ".avro"; int i = 0; for (GenericContainer r : readAvroData(avro)) { m.put(new Utf8(fn + i), r); i += 1; } } final AvroMapWrapper amw = new AvroMapWrapper(m); // Test out all the interfaces the AvroMapWrapper supports for (Object o : amw.values()) { assertTrue(isValidPigObject(o)); } for (CharSequence k : amw.keySet()) { assertTrue(isValidPigObject(k)); assertTrue(isValidPigObject(amw.get(k))); } for (Map.Entry<CharSequence, Object> e : amw.entrySet()) { assertTrue(isValidPigObject(e.getKey())); assertTrue(isValidPigObject(e.getValue())); } } private boolean isValidPigObject(Object o) { if (o == null) { return true; } switch (DataType.findType(o)) { case DataType.TUPLE: for (Object inner : ((Tuple) o).getAll()) { if (!isValidPigObject(inner)) { return false; } } return true; case DataType.BAG: final Iterator<Tuple> bi = ((DataBag) o).iterator(); while (bi.hasNext()) { if (!isValidPigObject(bi.next())) { return false; } } return true; case DataType.MAP: for (Object inner : ((Map) o).values()) { if (!isValidPigObject(inner)) { return false; } } return true; case DataType.BIGDECIMAL: case DataType.BIGINTEGER: case DataType.BOOLEAN: case DataType.BYTE: case DataType.BYTEARRAY: case DataType.CHARARRAY: case DataType.DATETIME: case DataType.DOUBLE: case DataType.FLOAT: case DataType.GENERIC_WRITABLECOMPARABLE: case DataType.INTEGER: case DataType.LONG: return true; case DataType.ERROR: default: return false; } } private List<GenericContainer> readAvroData(String path) throws IOException { final FileSystem fs = FileSystem.getLocal(new Configuration()); final Path filePath = new Path(path); assertTrue("File path " + filePath + " does not exists!", fs.exists(filePath)); final GenericDatumReader<GenericContainer> reader = new GenericDatumReader<GenericContainer>(); final DataFileStream<GenericContainer> in = new DataFileStream<GenericContainer>(fs.open(filePath), reader); final List<GenericContainer> avroData = new ArrayList<GenericContainer>(); try { while (in.hasNext()) { GenericContainer obj = in.next(); avroData.add(obj); } } finally { Closeables.closeQuietly(in); } return avroData; } private void testAvroStorage(boolean expectedToSucceed, String scriptFile, Map<String, String> parameterMap) throws IOException { pigServerLocal.setBatchOn(); int numOfFailedJobs = 0; try { pigServerLocal.registerScript(scriptFile, parameterMap); for (ExecJob job : pigServerLocal.executeBatch()) { if (job.getStatus().equals(JOB_STATUS.FAILED)) { numOfFailedJobs++; } } } catch (Exception e) { System.err.printf("Exception caught in testAvroStorage: %s\n", e); numOfFailedJobs++; } if (expectedToSucceed) { assertTrue("There was a failed job!", numOfFailedJobs == 0); } else { assertTrue("There was no failed job!", numOfFailedJobs > 0); } } private void verifyResults(String outPath, String expectedOutpath) throws IOException { verifyResults(outPath, expectedOutpath, null); } private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException { FileSystem fs = FileSystem.getLocal(new Configuration()); /* read in expected results*/ Set<GenericData.Record> expected = getExpected(expectedOutpath); /* read in output results and compare */ Path output = new Path(outPath); assertTrue("Output dir does not exists!", fs.exists(output) && fs.getFileStatus(output).isDir()); Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter)); assertTrue("Split field dirs not found!", paths != null); for (Path path : paths) { Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter)); assertTrue("No files found for path: " + path.toUri().getPath(), files != null); for (Path filePath : files) { assertTrue("This shouldn't be a directory", fs.isFile(filePath)); GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>(); DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(fs.open(filePath), reader); assertEquals("codec", expectedCodec, in.getMetaString("avro.codec")); int count = 0; while (in.hasNext()) { GenericData.Record obj = in.next(); assertTrue( "Avro result object found that's not expected: Found " + (obj != null ? obj.getSchema() : "null") + ", " + obj.toString() + "\nExpected " + (expected != null ? expected.toString() : "null") + "\n", expected.contains(obj)); count++; } in.close(); assertEquals(expected.size(), count); } } } private Set<GenericData.Record> getExpected(String pathstr) throws IOException { Set<GenericData.Record> ret = new TreeSet<GenericData.Record>(new Comparator<GenericData.Record>() { @Override public int compare(Record o1, Record o2) { //return o1.compareTo(o2); throws AvroRuntimeException: Can't compare maps! return o1.equals(o2) ? 0 : o1.toString().compareTo(o2.toString()); } }); FileSystem fs = FileSystem.getLocal(new Configuration()); /* read in output results and compare */ Path output = new Path(pathstr); assertTrue("Expected output does not exists!", fs.exists(output)); Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter)); assertTrue("Split field dirs not found!", paths != null); for (Path path : paths) { Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter)); assertTrue("No files found for path: " + path.toUri().getPath(), files != null); for (Path filePath : files) { assertTrue("This shouldn't be a directory", fs.isFile(filePath)); GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>(); DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(fs.open(filePath), reader); while (in.hasNext()) { GenericData.Record obj = in.next(); ret.add(obj); } in.close(); } } return ret; } }