org.apache.pig.piggybank.test.storage.avro.TestAvroStorage.java Source code

Introduction

Here is the source code for org.apache.pig.piggybank.test.storage.avro.TestAvroStorage.java
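
Every test in this file follows the same pattern: build a local-mode PigServer, run a short Pig Latin script that LOADs and STOREs data through org.apache.pig.piggybank.storage.avro.AvroStorage, and compare the resulting .avro output against checked-in expected files. A minimal standalone sketch of that pattern (assuming the piggybank jar and its Avro dependency are on the classpath; 'in.avro' and 'out' are placeholder paths, not files from this test suite):

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;

public class AvroStorageExample {
    public static void main(String[] args) throws Exception {
        // Local-mode server, mirroring the @BeforeClass setup in the test.
        PigServer pig = new PigServer(ExecType.LOCAL);
        pig.setBatchOn();
        // Loading: the Pig schema is derived from the file's Avro schema.
        pig.registerQuery("in = LOAD 'in.avro' USING"
                + " org.apache.pig.piggybank.storage.avro.AvroStorage();");
        // Storing: without a 'schema' argument, the Avro output schema is
        // derived from the Pig schema of the relation being stored.
        pig.registerQuery("STORE in INTO 'out' USING"
                + " org.apache.pig.piggybank.storage.avro.AvroStorage();");
        pig.executeBatch();
        pig.shutdown();
    }
}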

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.piggybank.test.storage.avro;

import static org.apache.pig.builtin.mock.Storage.resetData;
import static org.apache.pig.builtin.mock.Storage.schema;
import static org.apache.pig.builtin.mock.Storage.tuple;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.pig.ExecType;
import org.apache.pig.LoadFunc;
import org.apache.pig.PigConfiguration;
import org.apache.pig.PigServer;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.executionengine.ExecJob;
import org.apache.pig.backend.executionengine.ExecJob.JOB_STATUS;
import org.apache.pig.backend.hadoop.executionengine.JobCreationException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.builtin.mock.Storage.Data;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.io.FileLocalizer;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.piggybank.storage.avro.PigSchema2Avro;
import org.apache.pig.test.Util;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

public class TestAvroStorage {

    protected static final Log LOG = LogFactory.getLog(TestAvroStorage.class);

    private static PigServer pigServerLocal = null;

    final private static String basedir = "src/test/java/org/apache/pig/piggybank/test/storage/avro/avro_test_files/";

    private static String outbasedir;

    public static final PathFilter hiddenPathFilter = new PathFilter() {
        @Override
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

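    // Resolves one or more comma-separated file names relative to the test data
    // directory (basedir) into absolute paths, rejoined with commas so the
    // result can be used directly in a Pig LOAD statement.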
    private static String getInputFile(String file) {
        String[] locations = LoadFunc.getPathStrings(file);
        if (locations.length == 1)
            return System.getProperty("user.dir") + "/" + basedir + file;
        else {
            ArrayList<String> pathStrings = new ArrayList<String>();
            for (int index = 0; index < locations.length; index++) {
                String f = System.getProperty("user.dir") + "/" + basedir + locations[index].trim();
                pathStrings.add(f);
            }
            return LoadFunc.join(pathStrings, ",");
        }
    }

    final private String testDir1 = getInputFile("test_dir1");
    final private String testDir1AllFiles = getInputFile("test_dir1/*");
    final private String testDir1Files123 = getInputFile("test_dir1/test_glob{1,2,3}.avro");
    final private String testDir1Files321 = getInputFile("test_dir1/test_glob{3,2,1}.avro");
    final private String testDir12AllFiles = getInputFile("{test_dir1,test_dir2}/test_glob*.avro");
    final private String testDir21AllFiles = getInputFile("{test_dir2,test_dir1}/test_glob*.avro");
    final private String testCommaSeparated1 = getInputFile(
            "test_dir1/test_glob1.avro,test_dir1/test_glob2.avro,test_dir1/test_glob3.avro");
    final private String testCommaSeparated2 = getInputFile(
            "test_dir1/test_glob*,test_dir2/test_glob4.avro,test_dir2/test_glob5.avro");
    final private String testNoMatchedFiles = getInputFile("test_dir{1,2}/file_that_does_not_exist*.avro");
    final private String testArrayFile = getInputFile("test_array.avro");
    final private String testArraySchema = getInputFile("test_array.avsc");
    final private String testRecordFile = getInputFile("test_record.avro");
    final private String testRecordSchema = getInputFile("test_record.avsc");
    final private String testGenericUnionFile = getInputFile("test_generic_union.avro");
    final private String testRecursiveRecordInMap = getInputFile("test_recursive_record_in_map.avro");
    final private String testRecursiveRecordInArray = getInputFile("test_recursive_record_in_array.avro");
    final private String testRecursiveRecordInUnion = getInputFile("test_recursive_record_in_union.avro");
    final private String testRecursiveRecordInRecord = getInputFile("test_recursive_record_in_record.avro");
    final private String testRecursiveRecordInUnionSchema = getInputFile("test_recursive_record_in_union.avsc");
    final private String testTextFile = getInputFile("test_record.txt");
    final private String testSingleTupleBagFile = getInputFile("messages.avro");
    final private String testNoExtensionFile = getInputFile("test_no_extension");
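    // The JSON strings below are Avro schemas for records that reference
    // themselves, directly or through a map, array, union, or nested record.
    // Pig schemas cannot express recursion, so the testRecursiveRecord* cases
    // below must hand one of these schemas to AvroStorage explicitly.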
    final private String recursiveRecordInMap = " {" + "   \"type\" : \"record\","
            + "   \"name\" : \"recursive_record\"," + "   \"fields\" : [ {" + "     \"name\" : \"id\","
            + "     \"type\" : \"int\"" + "   }, {" + "     \"name\" : \"nested\","
            + "     \"type\" : [ \"null\", {" + "       \"type\" : \"map\","
            + "       \"values\" : \"recursive_record\"" + "     } ]" + "   } ]" + " }";
    final private String recursiveRecordInArray = " {" + "   \"type\" : \"record\","
            + "   \"name\" : \"recursive_record\"," + "   \"fields\" : [ {" + "     \"name\" : \"id\","
            + "     \"type\" : \"int\"" + "   }, {" + "     \"name\" : \"nested\","
            + "     \"type\" : [ \"null\", {" + "       \"type\" : \"array\","
            + "       \"items\" : \"recursive_record\"" + "     } ]" + "   } ]" + " }";
    final private String recursiveRecordInUnion = " {" + "   \"type\" : \"record\","
            + "   \"name\" : \"recursive_record\"," + "   \"fields\" : [ {" + "     \"name\" : \"value\","
            + "     \"type\" : \"int\"" + "   }, {" + "     \"name\" : \"next\","
            + "     \"type\" : [ \"null\", \"recursive_record\" ]" + "   } ]" + " }";
    final private String recursiveRecordInRecord = " {" + "   \"type\" : \"record\","
            + "   \"name\" : \"recursive_record\"," + "   \"fields\" : [ {" + "     \"name\" : \"id\","
            + "     \"type\" : \"int\"" + "   }, {" + "     \"name\" : \"nested\","
            + "     \"type\" : [ \"null\", {" + "       \"type\" : \"record\","
            + "       \"name\" : \"nested_record\"," + "       \"fields\" : [ {" + "         \"name\" : \"value1\","
            + "         \"type\" : \"string\"" + "       }, {" + "         \"name\" : \"next\","
            + "         \"type\" : \"recursive_record\"" + "       }, {" + "         \"name\" : \"value2\","
            + "         \"type\" : \"string\"" + "       } ]" + "     } ]" + "   } ]" + " }";
    final private String testCorruptedFile = getInputFile("test_corrupted_file.avro");
    final private String testMultipleSchemas1File = getInputFile("test_primitive_types/*");
    final private String testMultipleSchemas2File = getInputFile("test_complex_types/*");
    final private String testMultipleSchemasWithDefaultValue = getInputFile(
            "test_merge_schemas_default/{Employee{3,4,6}.avro}");
    final private String testUserDefinedLoadSchemaFile = getInputFile("test_user_defined_load_schema/*");
    final private String testLoadwithNullValues = getInputFile("test_loadavrowithnulls.avro");

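    // Run all tests against a single local-mode PigServer, with Pig's temporary
    // files redirected under build/test/tmp and a fresh output directory
    // (outbasedir) cleared before the suite starts.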
    @BeforeClass
    public static void setup() throws ExecException, IOException {
        pigServerLocal = new PigServer(ExecType.LOCAL);
        String TMP_DIR = System.getProperty("user.dir") + "/build/test/tmp/";
        pigServerLocal.getPigContext().getProperties().setProperty(PigConfiguration.PIG_TEMP_DIR, TMP_DIR);
        outbasedir = FileLocalizer.getTemporaryPath(pigServerLocal.getPigContext()).toString()
                + "/TestAvroStorage/";
        deleteDirectory(new File(outbasedir));
    }

    @AfterClass
    public static void teardown() {
        if (pigServerLocal != null)
            pigServerLocal.shutdown();
    }

    @Test
    public void testRecursiveRecordInMap() throws IOException {
        // Verify that recursive records in map can be loaded/saved.
        String output = outbasedir + "testRecursiveRecordInMap";
        String expected = testRecursiveRecordInMap;
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInMap)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'schema', '" + recursiveRecordInMap + "' );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecursiveRecordInArray() throws IOException {
        // Verify that recursive records in array can be loaded/saved.
        String output = outbasedir + "testRecursiveRecordInArray";
        String expected = testRecursiveRecordInArray;
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInArray)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'schema', '" + recursiveRecordInArray + "' );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecursiveRecordInUnion() throws IOException {
        // Verify that recursive records in union can be loaded/saved.
        String output = outbasedir + "testRecursiveRecordInUnion";
        String expected = testRecursiveRecordInUnion;
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'schema', '" + recursiveRecordInUnion + "' );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecursiveRecordInRecord() throws IOException {
        // Verify that recursive records in record can be loaded/saved.
        String output = outbasedir + "testRecursiveRecordInRecord";
        String expected = testRecursiveRecordInRecord;
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInRecord)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'schema', '" + Util.encodeEscape(recursiveRecordInRecord)
                        + "' );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecursiveRecordWithSame() throws IOException {
        // Verify that the Avro schema can be specified via an external Avro file
        // instead of a JSON string.
        String output = outbasedir + "testRecursiveRecordWithSame";
        String expected = testRecursiveRecordInUnion;
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'same', '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

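    // The three testRecursiveRecordReference* cases project successively deeper
    // fields ($0, $1.$0, $1.$1.$0) out of the recursive structure, filter out
    // the nulls, and store the surviving values with a plain "int" schema.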
    @Test
    public void testRecursiveRecordReference1() throws IOException {
        // The relation 'in' looks like this:
        //  (1,(2,(3,)))
        //  (2,(3,))
        //  (3,)
        // $0 looks like this:
        //  (1)
        //  (2)
        //  (3)
        // Avro file stored after filtering out nulls looks like this:
        //  1
        //  2
        //  3
        String output = outbasedir + "testRecursiveRecordReference1";
        String expected = basedir + "expected_testRecursiveRecordReference1.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " first = FOREACH in GENERATE $0 AS value;", " filtered = FILTER first BY value is not null;",
                " STORE filtered INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'schema', '\"int\"' );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecursiveRecordReference2() throws IOException {
        // The relation 'in' looks like this:
        //  (1,(2,(3,)))
        //  (2,(3,))
        //  (3,)
        // $1.$0 looks like this:
        //  (2)
        //  (3)
        //  ()
        // Avro file stored after filtering out nulls looks like this:
        //  2
        //  3
        String output = outbasedir + "testRecursiveRecordReference2";
        String expected = basedir + "expected_testRecursiveRecordReference2.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " second = FOREACH in GENERATE $1.$0 AS value;", " filtered = FILTER second BY value is not null;",
                " STORE filtered INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'schema', '\"int\"' );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecursiveRecordReference3() throws IOException {
        // The relation 'in' looks like this:
        //  (1,(2,(3,)))
        //  (2,(3,))
        //  (3,)
        // $1.$1.$0 looks like this:
        //  (3)
        //  ()
        //  ()
        // Avro file stored after filtering out nulls looks like this:
        //  3
        String output = outbasedir + "testRecursiveRecordReference3";
        String expected = basedir + "expected_testRecursiveRecordReference3.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " third = FOREACH in GENERATE $1.$1.$0 AS value;", " filtered = FILTER third BY value is not null;",
                " STORE filtered INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'schema', '\"int\"' );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecursiveRecordWithNoAvroSchema() throws IOException {
        // Verify that recursive records cannot be stored if no Avro schema
        // is specified via either 'schema' or 'same'.
        String output = outbasedir + "testRecursiveRecordWithNoAvroSchema";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check' );" };
        // Since the Avro schema is not specified via the 'schema' parameter, it
        // is derived from the Pig schema. The job is expected to fail because
        // this derived Avro schema (bytes) is not compatible with the data (tuples).
        testAvroStorage(true, queries);
    }

    @Test
    public void testRecursiveRecordWithSchemaCheck() throws IOException {
        // Verify that recursive records cannot be stored if schema check is enabled.
        String output = outbasedir + "testRecursiveWithSchemaCheck";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'schema', '" + recursiveRecordInUnion + "' );" };
        try {
            testAvroStorage(queries);
            Assert.fail("Negative test: an exception was expected, so this should not succeed!");
        } catch (IOException e) {
            // An IOException is thrown by AvroStorage during schema check due to incompatible
            // data types.
            assertTrue(e.getMessage().contains("bytearray is not compatible with avro"));
        }
    }

    @Test
    public void testRecursiveRecordWithSchemaFile() throws IOException {
        // Verify that recursive records cannot be stored if avro schema is specified by 'schema_file'.
        String output = outbasedir + "testRecursiveWithSchemaFile";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'schema_file', '"
                        + Util.encodeEscape(testRecursiveRecordInUnionSchema) + "' );" };
        try {
            testAvroStorage(queries);
            Assert.fail("Negative test: an exception was expected, so this should not succeed!");
        } catch (FrontendException e) {
            // The IOException thrown by AvroSchemaManager for recursive record is caught
            // by the Pig frontend, and FrontendException is re-thrown.
            assertTrue(e.getMessage()
                    .contains("could not instantiate 'org.apache.pig.piggybank.storage.avro.AvroStorage'"));
        }
    }

    @Test
    public void testRecursiveRecordWithData() throws IOException {
        // Verify that recursive records cannot be stored if avro schema is specified by 'data'.
        String output = outbasedir + "testRecursiveWithData";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + " 'no_schema_check'," + " 'data', '" + Util.encodeEscape(testRecursiveRecordInUnion)
                        + "' );" };
        try {
            testAvroStorage(queries);
            Assert.fail("Negative test: an exception was expected, so this should not succeed!");
        } catch (FrontendException e) {
            // The IOException thrown by AvroSchemaManager for recursive record is caught
            // by the Pig frontend, and FrontendException is re-thrown.
            assertTrue(e.getMessage()
                    .contains("could not instantiate 'org.apache.pig.piggybank.storage.avro.AvroStorage'"));
        }
    }

    @Test
    public void testGenericUnion() throws IOException {
        // Verify that a FrontendException is thrown if schema has generic union.
        String output = outbasedir + "testGenericUnion";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testGenericUnionFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" };
        try {
            testAvroStorage(queries);
            Assert.fail("Negative test: an exception was expected, so this should not succeed!");
        } catch (FrontendException e) {
            // The IOException thrown by AvroStorage for generic union is caught
            // by the Pig frontend, and FrontendException is re-thrown.
            assertTrue(e.getMessage().contains("Cannot get schema"));
        }
    }

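    // The 'multiple_schemas' option asks AvroStorage to merge the differing
    // schemas of its input files into a single schema; the next two tests
    // exercise this merge for primitive types and for records.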
    @Test
    public void testMultipleSchemas1() throws IOException {
        // Verify that multiple primitive types can be loaded.
        // Input Avro files have the following schemas:
        //  "int"
        //  "long"
        //  "float"
        //  "double"
        //  "string"
        //  { "type" : "enum", "name" : "foo", "symbols" : [ "6" ] }
        // Merged Avro schema looks like this:
        //  "string"
        // The relation 'in' looks like this: (order of rows can be different.)
        //  (6)
        //  (4.0)
        //  (3.0)
        //  (5)
        //  (2)
        //  (1)
        // Avro file stored after processing looks like this:
        //  "1"
        //  "2"
        //  "3.0"
        //  "4.0"
        //  "5"
        //  "6"
        String output = outbasedir + "testMultipleSchemas1";
        String expected = basedir + "expected_testMultipleSchemas1.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testMultipleSchemas1File)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('multiple_schemas');",
                " s = FOREACH in GENERATE StringConcat($0);", " o = ORDER s BY $0;", " STORE o INTO '" + output
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('schema', '\"string\"');" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testMultipleSchemas2() throws IOException {
        // Verify that multiple complex types (records) can be loaded.
        // Input Avro files have the following schemas:
        //  { "type" : "record", "name" : "r", "fields" : [ { "name" : "i", "type" : "int" } ] }
        //  { "type" : "record", "name" : "r", "fields" : [ { "name" : "l", "type" : "long" } ] }
        //  { "type" : "record", "name" : "r", "fields" : [ { "name" : "f", "type" : "float" } ] }
        //  { "type" : "record", "name" : "r", "fields" : [ { "name" : "d", "type" : "double" } ] }
        //  { "type" : "record", "name" : "r", "fields" : [ { "name" : "s", "type" : "string" } ] }
        //  { "type" : "record", "name" : "r", "fields" : [ { "name" : "e", "type" : {
        //      "type" : "enum", "name" : "foo", "symbols" : [ "6" ] } } ] }
        // Merged Avro schema looks like this:
        //  { "type" : "record",
        //    "name" : "merged",
        //    "fields" : [ { "name" : "i", "type" : "int" },
        //                 { "name" : "l", "type" : "long" },
        //                 { "name" : "f", "type" : "float" },
        //                 { "name" : "d", "type" : "double" },
        //                 { "name" : "s", "type" : "string" },
        //                 { "name" : "e", "type" : {
        //                      "type" : "enum", "name" : "foo", "symbols" : [ "6" ] } }
        //               ]
        //  }
        // The relation 'in' looks like this: (order of rows can be different.)
        //  (,,6,,,)
        //  (,,,,4.0,)
        //  (,,,,,3.0)
        //  (,5,,,,)
        //  (,,,2,,)
        //  (1,,,,,)
        // Avro file stored after processing looks like this:
        //  "1"
        //  "2"
        //  "3.0"
        //  "4.0"
        //  "5"
        //  "6"
        String output = outbasedir + "testMultipleSchemas2";
        String expected = basedir + "expected_testMultipleSchemas2.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testMultipleSchemas2File)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('multiple_schemas');",
                " f = FOREACH in GENERATE ($0 is not null ? (chararray)$0 : ''),"
                        + "                         ($1 is not null ? (chararray)$1 : ''),"
                        + "                         ($2 is not null ? (chararray)$2 : ''),"
                        + "                         ($3 is not null ? (chararray)$3 : ''),"
                        + "                         ($4 is not null ? (chararray)$4 : ''),"
                        + "                         ($5 is not null ? (chararray)$5 : '');",
                " c = FOREACH f GENERATE StringConcat( $0, $1, $2, $3, $4, $5 );", " o = ORDER c BY $0;",
                " STORE o INTO '" + output
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('schema', '\"string\"');" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testUserDefinedLoadSchema() throws IOException {
        PigSchema2Avro.setTupleIndex(2);
        // Verify that a user-specified schema correctly maps to the input schemas
        // Input Avro files have the following schemas:
        //   name:"string", address:[customField1:"int", addressLine:"string"]
        //   address:[addressLine:"string", customField2:"int"], name:"string"
        // User Avro schema looks like this:
        //   name:"string", address:[customField1:"int", customField2:"int", customField3:"int"]
        // This test will confirm that AvroStorage correctly maps fields from writer to reader schema,
        // dropping, adding, and reordering fields where needed.
        String output = outbasedir + "testUserDefinedLoadSchema";
        String expected = basedir + "expected_testUserDefinedLoadSchema.avro";
        String customSchema = "{\"type\": \"record\", \"name\": \"employee\", \"fields\": [ "
                + "{ \"default\": \"***\", \"type\": \"string\", \"name\": \"name\" }, "
                + "{ \"name\": \"address\", \"type\": { "
                + "\"type\": \"record\", \"name\": \"addressDetails\", \"fields\": [ "
                + "{ \"default\": 0, \"type\": \"int\", \"name\": \"customField1\" }, "
                + "{ \"default\": 0, \"type\": \"int\", \"name\": \"customField2\" }, "
                + "{ \"default\": 0, \"type\": \"int\", \"name\": \"customField3\" } " + "] " + "} } " + "] } ";

        deleteDirectory(new File(output));
        String[] queries = { " in = LOAD '" + testUserDefinedLoadSchemaFile
                + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('schema', '" + customSchema + "');",
                " o = ORDER in BY name;",
                " STORE o INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testMultipleSchemasWithDefaultValue() throws IOException {
        //        ==> Employee3.avro <==
        //            {
        //            "type" : "record",
        //            "name" : "employee",
        //            "fields":[
        //                    {"name" : "name", "type" : "string", "default" : "NU"},
        //                    {"name" : "age", "type" : "int", "default" : 0 },
        //                    {"name" : "dept", "type": "string", "default" : "DU"} ] }
        //
        //            ==> Employee4.avro <==
        //            {
        //            "type" : "record",
        //            "name" : "employee",
        //            "fields":[
        //                    {"name" : "name", "type" : "string", "default" : "NU"},
        //                    {"name" : "age", "type" : "int", "default" : 0},
        //                    {"name" : "dept", "type": "string", "default" : "DU"},
        //                    {"name" : "office", "type": "string", "default" : "OU"} ] }
        //
        //            ==> Employee6.avro <==
        //            {
        //            "type" : "record",
        //            "name" : "employee",
        //            "fields":[
        //                    {"name" : "name", "type" : "string", "default" : "NU"},
        //                    {"name" : "lastname", "type": "string", "default" : "LNU"},
        //                    {"name" : "age", "type" : "int","default" : 0},
        //                    {"name" : "salary", "type": "int", "default" : 0},
        //                    {"name" : "dept", "type": "string","default" : "DU"},
        //                    {"name" : "office", "type": "string","default" : "OU"} ] }
        // The relation 'in' looks like this: (order of rows can be different.)
        //      Employee3.avro
        //        (Milo,30,DH)
        //        (Asmya,34,PQ)
        //        (Baljit,23,RS)
        //
        //      Employee4.avro
        //        (Praj,54,RMX,Champaign)
        //        (Buba,767,HD,Sunnyvale)
        //        (Manku,375,MS,New York)
        //
        //      Employee6.avro
        //        (Pune,Warriors,60,5466,Astrophysics,UTA)
        //        (Rajsathan,Royals,20,1378,Biochemistry,Stanford)
        //        (Chennai,Superkings,50,7338,Microbiology,Hopkins)
        //        (Mumbai,Indians,20,4468,Applied Math,UAH)

        // The data file stored afterwards looks like this, with the
        // following schema and data:
        // {name: chararray,age: int,dept: chararray,office: chararray,
        // lastname: chararray,salary: int}
        //(Asmya,34,PQ,OU,LNU,0)
        //(Baljit,23,RS,OU,LNU,0)
        //(Buba,767,HD,Sunnyvale,LNU,0)
        //(Chennai,50,Microbiology,Hopkins,Superkings,7338)
        //(Manku,375,MS,New York,LNU,0)
        //(Milo,30,DH,OU,LNU,0)
        //(Mumbai,20,Applied Math,UAH,Indians,4468)
        //(Praj,54,RMX,Champaign,LNU,0)
        //(Pune,60,Astrophysics,UTA,Warriors,5466)
        //(Rajsathan,20,Biochemistry,Stanford,Royals,1378)

        Data data = resetData(pigServerLocal);
        String output = outbasedir + "testMultipleSchemasWithDefaultValue";
        deleteDirectory(new File(output));
        String expected = basedir + "expected_testMultipleSchemasWithDefaultValue.avro";
        String[] queries = {
                " a = LOAD '" + testMultipleSchemasWithDefaultValue
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('multiple_schemas');",
                " b = foreach a generate name,age,dept,office,lastname,salary;", " c = filter b by age < 40 ;",
                " d = order c by  name;", " STORE d INTO '" + output + "' using mock.Storage();" };
        testAvroStorage(queries);
        List<Tuple> out = data.get(output);
        assertEquals(out + " size", 5, out.size());
        assertEquals(schema(
                "name: chararray,age: int,dept: chararray,office: chararray,lastname: chararray,salary: int"),
                data.getSchema(output));
        assertEquals(tuple("Asmya", 34, "PQ", "OU", "LNU", 0), out.get(0));
        assertEquals(tuple("Baljit", 23, "RS", "OU", "LNU", 0), out.get(1));
        assertEquals(tuple("Milo", 30, "DH", "OU", "LNU", 0), out.get(2));
        assertEquals(tuple("Mumbai", 20, "Applied Math", "UAH", "Indians", 4468), out.get(3));
        assertEquals(tuple("Rajsathan", 20, "Biochemistry", "Stanford", "Royals", 1378), out.get(4));
    }

    @Test
    // Verify the default values specified in the schema in AvroStorage
    // are actually written to the schema in the output avro file
    public void testDefaultValueSchemaWrite() throws IOException {
        String output = outbasedir + "testDefaultValueSchemaWrite";
        String expected = basedir + "expected_testDefaultSchemaWrite.avro";
        Data data = resetData(pigServerLocal);
        data.set("testDefaultValueSchemaWrite", tuple(0, 115, 115000, 115000.1), tuple(1, 116, 116000, 116000.1),
                tuple(2, 117, 117000, 117000.1), tuple(3, 118, 118000, 118000.1), tuple(4, 119, 119000, 119000.1));
        deleteDirectory(new File(output));
        String[] queries = {
                " a = LOAD 'testDefaultValueSchemaWrite' USING mock.Storage as  "
                        + " (id: int, intval:int, longval:long, floatval:float);",
                " b = foreach a generate id, longval, floatval;", " c = order b by id;",
                " STORE c INTO '" + output + "' USING "
                        + " org.apache.pig.piggybank.storage.avro.AvroStorage (' { \"debug\" : 5, \"schema\" : "
                        + " {  \"name\" : \"rmyrecord\", \"type\" : \"record\",  \"fields\" : [ { \"name\" : \"id\", "
                        + " \"type\" : \"int\" , \"default\" : 0 }, {  \"name\" : \"longval\",  \"type\" : \"long\","
                        + " \"default\" : 0 }, { \"name\" : \"floatval\", \"type\" : \"float\", \"default\" : 1.0 } ] } } "
                        + " ');" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testDir() throws IOException {
        // Verify that all files in a directory including its sub-directories are loaded.
        String output = outbasedir + "testDir";
        String expected = basedir + "expected_testDir.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testDir1)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

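    // testGlob1 through testGlob6 exercise Hadoop glob syntax in LOAD paths:
    // '*' wildcards, '{a,b}' alternation, and the failure path taken when a
    // pattern matches no files at all.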
    @Test
    public void testGlob1() throws IOException {
        // Verify that a glob pattern matches files properly.
        String output = outbasedir + "testGlob1";
        String expected = basedir + "expected_testDir.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testDir1AllFiles)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testGlob2() throws IOException {
        // Verify that comma-separated filenames are escaped properly.
        String output = outbasedir + "testGlob2";
        String expected = basedir + "expected_test_dir_1.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testDir1Files123)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testGlob3() throws IOException {
        // Verify that comma-separated filenames are escaped properly.
        String output = outbasedir + "testGlob3";
        String expected = basedir + "expected_test_dir_1.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testDir1Files321)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testGlob4() throws IOException {
        // Verify that comma-separated directory names are escaped properly.
        String output = outbasedir + "testGlob4";
        String expected = basedir + "expected_test_dir_1_2.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testDir12AllFiles)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testGlob5() throws IOException {
        // Verify that comma-separated directory names are escaped properly.
        String output = outbasedir + "testGlob5";
        String expected = basedir + "expected_test_dir_1_2.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testDir21AllFiles)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testGlob6() throws IOException {
        // Verify that an IOException is thrown if no files are matched by the glob pattern.
        String output = outbasedir + "testGlob6";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testNoMatchedFiles)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        try {
            testAvroStorage(queries);
            Assert.fail("Negative test: an exception was expected, so this should not succeed!");
        } catch (JobCreationException e) {
            // The IOException thrown by AvroStorage for input file not found is caught
            // by the Pig backend, and JobCreationException (a subclass of IOException)
            // is re-thrown while creating a job configuration.
            assertEquals("Internal error creating job configuration.", e.getMessage());
        }
    }

    @Test
    public void testComma1() throws IOException {
        // Verify that comma-separated file can be processed
        String output = outbasedir + "testComma1";
        String expected = basedir + "expected_test_dir_1.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testCommaSeparated1)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testComma2() throws IOException {
        // Verify that comma-separated file can be processed
        String output = outbasedir + "testComma2";
        String expected = basedir + "expected_test_dir_1_2.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testCommaSeparated2)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testArrayDefault() throws IOException {
        String output = outbasedir + "testArrayDefault";
        String expected = basedir + "expected_testArrayDefault.avro";

        deleteDirectory(new File(output));

        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testArrayFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testArrayWithSchema() throws IOException {
        String output = outbasedir + "testArrayWithSchema";
        String expected = basedir + "expected_testArrayWithSchema.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testArrayFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ( "
                        + "   'schema', '{\"type\":\"array\",\"items\":\"float\"}'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testArrayWithSchemaURI() throws IOException {
        String output = outbasedir + "testArrayWithSchemaURI";
        String expected = basedir + "expected_testArrayWithSchemaURI.avro"; // doubles (not floats) stored
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testArrayFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ( "
                        + "   'schema_uri', '" + Util.encodeEscape(testArraySchema) + "'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testArrayWithNotNull() throws IOException {
        String output = outbasedir + "testArrayWithNotNull";
        String expected = basedir + "expected_testArrayWithSchema.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testArrayFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ( "
                        + "   '{\"nullable\": false }'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testArrayWithSame() throws IOException {
        String output = outbasedir + "testArrayWithSame";
        String expected = basedir + "expected_testArrayWithSchema.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testArrayFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ( "
                        + "   'same', '" + Util.encodeEscape(testArrayFile) + "'  );" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

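    // Unlike the other tests, this one builds its own PigServer so it can set
    // the Snappy compression properties up front; verifyResults is then asked
    // to confirm that the output files were written with the "snappy" codec.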
    @Test
    public void testArrayWithSnappyCompression() throws IOException {
        String output = outbasedir + "testArrayWithSnappyCompression";
        String expected = basedir + "expected_testArrayDefault.avro";

        deleteDirectory(new File(output));

        Properties properties = new Properties();
        properties.setProperty(MRConfiguration.OUTPUT_COMPRESS, "true");
        properties.setProperty(MRConfiguration.OUTPUT_COMPRESSION_CODEC,
                "org.apache.hadoop.io.compress.SnappyCodec");
        properties.setProperty("avro.output.codec", "snappy");
        PigServer pigServer = new PigServer(ExecType.LOCAL, properties);
        pigServer.setBatchOn();
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testArrayFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" };
        for (String query : queries) {
            pigServer.registerQuery(query);
        }
        pigServer.executeBatch();
        verifyResults(output, expected, "snappy");
    }

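    // The next two tests issue two STOREs in a single script. The 'index'
    // parameter distinguishes the two AvroStorage instances so each STORE can
    // carry its own schema settings, and PigSchema2Avro.setTupleIndex(0) resets
    // the counter used to name generated record types (TUPLE_0, TUPLE_1, ...)
    // so the output schemas stay deterministic across runs.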
    @Test
    public void testRecordWithSplit() throws IOException {
        PigSchema2Avro.setTupleIndex(0);
        String output1 = outbasedir + "testRecordSplit1";
        String output2 = outbasedir + "testRecordSplit2";
        String expected1 = basedir + "expected_testRecordSplit1.avro";
        String expected2 = basedir + "expected_testRecordSplit2.avro";
        deleteDirectory(new File(output1));
        deleteDirectory(new File(output2));
        String[] queries = {
                " avro = LOAD '" + Util.encodeEscape(testRecordFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " groups = GROUP avro BY member_id;",
                " sc = FOREACH groups GENERATE group AS key, COUNT(avro) AS cnt;",
                " STORE sc INTO '" + output1 + "' " + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "'{\"index\": 1, " + "  \"schema\": {\"type\":\"record\", " + " \"name\":\"result\", "
                        + "  \"fields\":[ {\"name\":\"member_id\",\"type\":\"int\"}, "
                        + "{\"name\":\"count\", \"type\":\"long\"} " + "]" + "}" + " }');",
                " STORE sc INTO '" + output2
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('index', '2');" };
        testAvroStorage(queries);
        verifyResults(output1, expected1);
        verifyResults(output2, expected2);
    }

    @Test
    public void testRecordWithSplitFromText() throws IOException {
        PigSchema2Avro.setTupleIndex(0);
        String output1 = outbasedir + "testRecordSplitFromText1";
        String output2 = outbasedir + "testRecordSplitFromText2";
        String expected1 = basedir + "expected_testRecordSplitFromText1.avro";
        String expected2 = basedir + "expected_testRecordSplitFromText2.avro";
        deleteDirectory(new File(output1));
        deleteDirectory(new File(output2));
        String[] queries = { " avro = LOAD '" + Util.encodeEscape(testTextFile)
                + "' AS (member_id:int, browser_id:chararray, tracking_time:long, act_content:bag{inner:tuple(key:chararray, value:chararray)});",
                " groups = GROUP avro BY member_id;",
                " sc = FOREACH groups GENERATE group AS key, COUNT(avro) AS cnt;",
                " STORE sc INTO '" + output1 + "' " + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "'{\"index\": 1, " + "  \"schema\": {\"type\":\"record\", " + " \"name\":\"result\", "
                        + " \"fields\":[ {\"name\":\"member_id\",\"type\":\"int\"}, "
                        + "{\"name\":\"count\", \"type\":\"long\"} " + "]" + "}" + " }');",
                " STORE sc INTO '" + output2
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ('index', '2');" };
        testAvroStorage(queries);
        verifyResults(output1, expected1);
        verifyResults(output2, expected2);
    }

    @Test
    public void testRecordWithFieldSchema() throws IOException {
        PigSchema2Avro.setTupleIndex(1);
        String output = outbasedir + "testRecordWithFieldSchema";
        String expected = basedir + "expected_testRecordWithFieldSchema.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " avro = LOAD '" + Util.encodeEscape(testRecordFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " avro1 = FILTER avro BY member_id > 1211;",
                " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
                " STORE avro2 INTO '" + output + "' " + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "'{\"data\":  \"" + Util.encodeEscape(testRecordFile) + "\" ," + "  \"field0\": \"int\", "
                        + " \"field1\":  \"def:browser_id\", " + "  \"field3\": \"def:act_content\" " + " }');" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecordWithFieldSchemaFromText() throws IOException {
        PigSchema2Avro.setTupleIndex(1);
        String output = outbasedir + "testRecordWithFieldSchemaFromText";
        String expected = basedir + "expected_testRecordWithFieldSchema.avro";
        deleteDirectory(new File(output));
        String[] queries = { " avro = LOAD '" + Util.encodeEscape(testTextFile)
                + "' AS (member_id:int, browser_id:chararray, tracking_time:long, act_content:bag{inner:tuple(key:chararray, value:chararray)});",
                " avro1 = FILTER avro BY member_id > 1211;",
                " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
                " STORE avro2 INTO '" + output + "' " + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "'{\"data\":  \"" + Util.encodeEscape(testRecordFile) + "\" ," + "  \"field0\": \"int\", "
                        + " \"field1\":  \"def:browser_id\", " + "  \"field3\": \"def:act_content\" " + " }');" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testRecordWithFieldSchemaFromTextWithSchemaFile() throws IOException {
        PigSchema2Avro.setTupleIndex(1);
        String output = outbasedir + "testRecordWithFieldSchemaFromTextWithSchemaFile";
        String expected = basedir + "expected_testRecordWithFieldSchema.avro";
        deleteDirectory(new File(output));
        String[] queries = { " avro = LOAD '" + Util.encodeEscape(testTextFile)
                + "' AS (member_id:int, browser_id:chararray, tracking_time:long, act_content:bag{inner:tuple(key:chararray, value:chararray)});",
                " avro1 = FILTER avro BY member_id > 1211;",
                " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
                " STORE avro2 INTO '" + output + "' " + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "'{\"schema_file\":  \"" + Util.encodeEscape(testRecordSchema) + "\" ,"
                        + "  \"field0\": \"int\", " + " \"field1\":  \"def:browser_id\", "
                        + "  \"field3\": \"def:act_content\" " + " }');" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    public void testSingleFieldTuples() throws IOException {
        String output = outbasedir + "testSingleFieldTuples";
        String expected = basedir + "expected_testSingleFieldTuples.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " messages = LOAD '" + Util.encodeEscape(testSingleTupleBagFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " a = foreach (group messages by user_id) { sorted = order messages by message_id DESC; GENERATE group AS user_id, sorted AS messages; };",
                " STORE a INTO '" + output + "' "
                        + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" };
        testAvroStorage(queries);
    }

    @Test
    public void testFileWithNoExtension() throws IOException {
        PigSchema2Avro.setTupleIndex(4);
        String output = outbasedir + "testFileWithNoExtension";
        String expected = basedir + "expected_testFileWithNoExtension.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " avro = LOAD '" + Util.encodeEscape(testNoExtensionFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " avro1 = FILTER avro BY member_id > 1211;",
                " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
                " STORE avro2 INTO '" + output + "' " + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "'{\"data\":  \"" + Util.encodeEscape(testNoExtensionFile) + "\" ,"
                        + "  \"field0\": \"int\", " + " \"field1\":  \"def:browser_id\", "
                        + "  \"field3\": \"def:act_content\" " + " }');" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    // Same as above, just without using json in the constructor
    @Test
    public void testRecordWithFieldSchemaFromTextWithSchemaFile2() throws IOException {
        PigSchema2Avro.setTupleIndex(1);
        String output = outbasedir + "testRecordWithFieldSchemaFromTextWithSchemaFile2";
        String expected = basedir + "expected_testRecordWithFieldSchema.avro";
        deleteDirectory(new File(output));
        String[] queries = { " avro = LOAD '" + Util.encodeEscape(testTextFile)
                + "' AS (member_id:int, browser_id:chararray, tracking_time:long, act_content:bag{inner:tuple(key:chararray, value:chararray)});",
                " avro1 = FILTER avro BY member_id > 1211;",
                " avro2 = FOREACH avro1 GENERATE member_id, browser_id, tracking_time, act_content ;",
                " STORE avro2 INTO '" + output + "' " + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ("
                        + "'schema_file', '" + Util.encodeEscape(testRecordSchema) + "'," + "'field0','int',"
                        + "'field1','def:browser_id'," + "'field3','def:act_content'" + ");" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

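    // The two corrupted-file tests cover both sides of the 'ignore_bad_files'
    // option: without it the job fails on a bad input file; with it the bad
    // file is skipped and the job succeeds with empty output.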
    @Test
    public void testCorruptedFile1() throws IOException {
        // Verify that load fails when bad files are found if ignore_bad_files is disabled.
        String output = outbasedir + "testCorruptedFile1";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testCorruptedFile)
                        + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" };
        // Job is expected to fail for bad files.
        testAvroStorage(true, queries);
    }

    @Test
    public void testCorruptedFile2() throws IOException {
        // Verify that corrupted files are skipped if ignore_bad_files is enabled.
        // Output is expected to be empty.
        String output = outbasedir + "testCorruptedFile2";
        String expected = basedir + "expected_testCorruptedFile.avro";
        deleteDirectory(new File(output));
        String[] queries = {
                " in = LOAD '" + Util.encodeEscape(testCorruptedFile) + "'"
                        + " USING org.apache.pig.piggybank.storage.avro.AvroStorage ('ignore_bad_files');",
                " STORE in INTO '" + output + "' USING org.apache.pig.piggybank.storage.avro.AvroStorage ();" };
        testAvroStorage(queries);
        verifyResults(output, expected);
    }

    @Test
    // Schema for the generated avro file test_loadavrowithnulls.avro
    // ["null",{"type":"record","name":"TUPLE_0",
    // "fields":[
    // {"name":"name","type":["null","string"],"doc":"autogenerated from Pig Field Schema"},
    // {"name":"age","type":["null","int"],"doc":"autogenerated from Pig Field Schema"},
    // {"name":"gpa","type":["null","double"],"doc":"autogenerated from Pig Field Schema"}]}]
    public void testLoadwithNullValues() throws IOException {
        // Input is expected to contain empty (all-null) tuples.
        PigSchema2Avro.setTupleIndex(0);
        Data data = resetData(pigServerLocal);
        String output = outbasedir + "testLoadwithNulls";
        deleteDirectory(new File(output));
        String[] queries = {
                " A = load '" + testLoadwithNullValues + "' USING "
                        + " org.apache.pig.piggybank.storage.avro.AvroStorage(); ",
                " B = order A by name;", " store B into '" + output + "' USING mock.Storage();" };
        testAvroStorage(queries);
        List<Tuple> out = data.get(output);
        assertEquals(out + " size", 4, out.size());

        assertEquals(schema("name:chararray,age:int,gpa:double"), data.getSchema(output));

        // sorted data ordered by name
        assertEquals(tuple((String) null), out.get(0));
        assertEquals(tuple((String) null), out.get(1));
        assertEquals(tuple("calvin ellison", 24, 0.71), out.get(2));
        assertEquals(tuple("wendy johnson", 60, 0.07), out.get(3));

    }

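    // Round-trips two relations through AvroStorage: stores int and chararray
    // tuples from mock.Storage as Avro, loads them back, and checks that both
    // the data and the inferred schemas survive the trip unchanged.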
    @Test
    public void testMultipleLoadStore() throws Exception {
        PigSchema2Avro.setTupleIndex(0);
        Data data = resetData(pigServerLocal);
        data.set("foo", tuple(1, 2, 3), tuple(4, 5, 6), tuple(7, 8, 9));
        data.set("bar", tuple("a", "b", "c"), tuple("d", "e", "f"), tuple("g", "h", "i"));
        String output = outbasedir + "testMultipleLoadStore";
        deleteDirectory(new File(output));
        String[] storeQuery = { "A = LOAD 'foo' USING " + "mock.Storage() as (a1:int, a2:int, a3:int);",
                "B = LOAD 'bar' USING " + "mock.Storage() as (b1:chararray, b2:chararray, b3:chararray);",
                "STORE A into '" + output + "/A' USING " + "org.apache.pig.piggybank.storage.avro.AvroStorage();",
                "STORE B into '" + output + "/B' USING " + "org.apache.pig.piggybank.storage.avro.AvroStorage();" };
        testAvroStorage(storeQuery);
        String[] loadQuery = {
                "C = LOAD '" + output + "/A' USING " + "org.apache.pig.piggybank.storage.avro.AvroStorage();",
                "D = LOAD '" + output + "/B' USING " + "org.apache.pig.piggybank.storage.avro.AvroStorage();",
                "STORE C into 'foo-actual' USING mock.Storage();",
                "STORE D into 'bar-actual' USING mock.Storage();" };
        testAvroStorage(loadQuery);

        assertEquals(data.get("foo"), data.get("foo-actual"));
        assertEquals(data.get("bar"), data.get("bar-actual"));
        assertEquals("{a1: int,a2: int,a3: int}", data.getSchema("foo-actual").toString());
        assertEquals("{b1: chararray,b2: chararray,b3: chararray}", data.getSchema("bar-actual").toString());
    }

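    // Recursively deletes a test output directory and its contents before a
    // test stores into the same path again.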
    private static void deleteDirectory(File path) {
        if (path.exists()) {
            File[] files = path.listFiles();
            if (files != null) {
                for (File file : files) {
                    if (file.isDirectory()) {
                        deleteDirectory(file);
                    } else {
                        file.delete();
                    }
                }
            }
            // Remove the (now empty) directory itself; a leftover output
            // directory would make the subsequent STORE fail.
            path.delete();
        }
    }

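    // Runs the given queries, expecting every job to succeed.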
    private void testAvroStorage(String... queries) throws IOException {
        testAvroStorage(false, queries);
    }

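    // Registers the given queries in batch mode and executes them; asserts
    // that at least one job failed when expectedToFail is set, and that no
    // job failed otherwise.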
    private void testAvroStorage(boolean expectedToFail, String... queries) throws IOException {
        pigServerLocal.setBatchOn();
        for (String query : queries) {
            if (query != null && query.length() > 0) {
                pigServerLocal.registerQuery(query);
            }
        }
        int numOfFailedJobs = 0;
        for (ExecJob job : pigServerLocal.executeBatch()) {
            if (job.getStatus().equals(JOB_STATUS.FAILED)) {
                numOfFailedJobs++;
            }
        }
        if (expectedToFail) {
            assertTrue("There was no failed job!", numOfFailedJobs > 0);
        } else {
            assertTrue("There was a failed job!", numOfFailedJobs == 0);
        }
    }

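    // Convenience overload that expects uncompressed output (i.e. no
    // 'avro.codec' metadata on the output files).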
    private void verifyResults(String outPath, String expectedOutpath) throws IOException {
        verifyResults(outPath, expectedOutpath, null);
    }

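    // Walks every non-hidden file under outPath, checks each file's
    // 'avro.codec' metadata against expectedCodec (null means uncompressed),
    // and asserts that every record read appears in the expected file and
    // that the record counts match.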
    private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {

        FileSystem fs = FileSystem.getLocal(new Configuration());

        /* read in expected results */
        Set<Object> expected = getExpected(expectedOutpath);

        /* read in output results and compare */
        Path output = new Path(outPath);
        assertTrue("Output dir does not exists!", fs.exists(output) && fs.getFileStatus(output).isDir());

        Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
        assertTrue("Split field dirs not found!", paths != null);

        for (Path path : paths) {
            Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
            assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
            for (Path filePath : files) {
                assertTrue("This shouldn't be a directory", fs.isFile(filePath));

                GenericDatumReader<Object> reader = new GenericDatumReader<Object>();

                DataFileStream<Object> in = new DataFileStream<Object>(fs.open(filePath), reader);
                assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
                int count = 0;
                while (in.hasNext()) {
                    Object obj = in.next();
                    assertTrue("Avro result object found that's not expected: " + obj, expected.contains(obj));
                    count++;
                }
                in.close();
                assertEquals(expected.size(), count);
            }
        }
    }

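    // Loads every Avro record found under the given path into a set, used by
    // verifyResults as the ground truth for comparison.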
    private Set<Object> getExpected(String pathstr) throws IOException {

        Set<Object> ret = new HashSet<Object>();
        FileSystem fs = FileSystem.getLocal(new Configuration());

        /* read in the expected results */
        Path output = new Path(pathstr);
        assertTrue("Expected output does not exists!", fs.exists(output));

        Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
        assertTrue("Split field dirs not found!", paths != null);

        for (Path path : paths) {
            Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
            assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
            for (Path filePath : files) {
                assertTrue("This shouldn't be a directory", fs.isFile(filePath));

                GenericDatumReader<Object> reader = new GenericDatumReader<Object>();

                DataFileStream<Object> in = new DataFileStream<Object>(fs.open(filePath), reader);

                while (in.hasNext()) {
                    Object obj = in.next();
                    ret.add(obj);
                }
                in.close();
            }
        }
        return ret;
    }

}