org.apache.pig.builtin.TestAvroStorage.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.builtin.TestAvroStorage.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.pig.builtin;

import static org.apache.pig.builtin.mock.Storage.resetData;
import static org.apache.pig.builtin.mock.Storage.tuple;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Iterator;

import com.google.common.io.Closeables;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.tool.DataFileWriteTool;
import org.apache.avro.tool.Tool;
import org.apache.avro.tool.TrevniCreateRandomTool;
import org.apache.avro.tool.TrevniToJsonTool;
import org.apache.avro.util.Utf8;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.pig.PigServer;
import org.apache.pig.backend.executionengine.ExecJob;
import org.apache.pig.backend.executionengine.ExecJob.JOB_STATUS;
import org.apache.pig.builtin.mock.Storage.Data;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.impl.util.avro.AvroBagWrapper;
import org.apache.pig.impl.util.avro.AvroMapWrapper;
import org.apache.pig.impl.util.avro.AvroTupleWrapper;
import org.apache.pig.test.Util;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import com.google.common.collect.ImmutableMap;

public class TestAvroStorage {

    final protected static Log LOG = LogFactory.getLog(TestAvroStorage.class);

    final private static String basedir = "test/org/apache/pig/builtin/avro/";
    private static String outbasedir = System.getProperty("user.dir") + "/build/test/TestAvroStorage/";
    final private static String[] datadir = { "data/avro", "data/avro/compressed", "data/avro/compressed/snappy",
            "data/avro/compressed/deflate", "data/avro/uncompressed", "data/avro/uncompressed/testdirectory",
            "data/json/testdirectory", "data/trevni", "data/trevni/uncompressed", };
    final private static String[] avroSchemas = { "arraysAsOutputByPig", "arrays", "recordsAsOutputByPig",
            "recordsAsOutputByPigWithDates", "records", "recordsOfArrays", "recordsOfStringArrays",
            "recordsOfArraysOfRecords", "recordsSubSchema", "recordsSubSchemaNullable",
            "recordsWithDoubleUnderscores", "recordsWithEnums", "recordsWithFixed", "recordsWithMaps",
            "recordsWithMapsOfRecords", "recordsWithMapsOfArrayOfRecords", "recordsWithNullableUnions",
            "recordWithRepeatedSubRecords", "recursiveRecord", "projectionTest", "projectionTestWithSchema",
            "recordsWithSimpleUnion", "recordsWithSimpleUnionOutput", };
    final private static String[] trevniSchemas = { "simpleRecordsTrevni", };

    private static PigServer pigServerLocal = null;

    public static final PathFilter hiddenPathFilter = new PathFilter() {
        @Override
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    private static String loadFileIntoString(String file) throws IOException {
        return FileUtils.readFileToString(new File(file)).replace("\n", " ");
    }

    @BeforeClass
    public static void setup() throws Exception {
        pigServerLocal = new PigServer(Util.getLocalTestMode());
        Util.deleteDirectory(new File(outbasedir));
        generateInputFiles();
    }

    @AfterClass
    public static void teardown() throws IOException {
        if (pigServerLocal != null) {
            pigServerLocal.shutdown();
        }
        deleteInputFiles();
    }

    /**
     * Generate input files for test cases
     */
    private static void generateInputFiles() throws IOException {
        String data;
        String json;
        String avro;
        String trevni;
        String schema;

        for (String fn : datadir) {
            FileUtils.forceMkdir(new File(basedir + fn));
        }

        for (String fn : avroSchemas) {
            schema = basedir + "schema/" + fn + ".avsc";
            json = basedir + "data/json/" + fn + ".json";
            avro = basedir + "data/avro/uncompressed/" + fn + ".avro";
            LOG.info("creating " + avro);
            generateAvroFile(schema, json, avro, null);
        }

        for (String fn : trevniSchemas) {
            schema = basedir + "schema/" + fn + ".avsc";
            trevni = basedir + "data/trevni/uncompressed/" + fn + ".trevni";
            json = basedir + "data/json/" + fn + ".json";
            LOG.info("creating " + trevni);
            generateRandomTrevniFile(schema, "1000", trevni);
            LOG.info("creating " + json);
            convertTrevniToJsonFile(schema, trevni, json);

            schema = basedir + "schema/" + fn + ".avsc";
            json = basedir + "data/json/" + fn + ".json";
            avro = basedir + "data/avro/uncompressed/" + fn + ".avro";
            LOG.info("creating " + avro);
            generateAvroFile(schema, json, avro, null);
        }

        PrintWriter w;
        int itemSum = 0;
        int recordCount = 0;
        int evenFileNameItemSum = 0;
        int evenFileNameRecordCount = 0;

        for (int i = 0; i < 8; i++) {
            schema = basedir + "schema/testDirectory.avsc";
            json = basedir + "data/json/testdirectory/part-m-0000" + i;
            avro = basedir + "data/avro/uncompressed/testdirectory/part-m-0000" + i + ".avro";
            LOG.info("creating " + json);
            w = new PrintWriter(new FileWriter(json));
            for (int j = i * 1000; j < (i + 1) * 1000; j++) {
                itemSum += j;
                recordCount += 1;
                evenFileNameItemSum += ((i + 1) % 2) * j;
                evenFileNameRecordCount += (i + 1) % 2;
                w.println("{\"item\" : " + j + ", \"timestamp\" : " + System.currentTimeMillis() + " }\n");
            }
            w.close();
            generateAvroFile(schema, json, avro, null);
        }

        schema = basedir + "schema/testDirectoryCounts.avsc";
        json = basedir + "data/json/testDirectoryCounts.json";
        avro = basedir + "data/avro/uncompressed/testDirectoryCounts.avro";
        data = "{\"itemSum\" : {\"int\" : " + itemSum + "}, \"n\" : {\"int\" : " + recordCount + "} }";
        LOG.info("creating " + json);
        FileUtils.writeStringToFile(new File(json), data);
        LOG.info("creating " + avro);
        generateAvroFile(schema, json, avro, null);

        schema = basedir + "schema/testDirectoryCounts.avsc";
        json = basedir + "data/json/evenFileNameTestDirectoryCounts.json";
        avro = basedir + "data/avro/uncompressed/evenFileNameTestDirectoryCounts.avro";
        data = "{\"itemSum\" : {\"int\" : " + evenFileNameItemSum + "}, \"n\" : {\"int\" : "
                + evenFileNameRecordCount + "} }";
        LOG.info("creating " + json);
        FileUtils.writeStringToFile(new File(json), data);
        LOG.info("creating " + avro);
        generateAvroFile(schema, json, avro, null);

        for (String codec : new String[] { "deflate", "snappy" }) {
            for (String fn : new String[] { "records", "recordsAsOutputByPig" }) {
                schema = basedir + "schema/" + fn + ".avsc";
                json = basedir + "data/json/" + fn + ".json";
                avro = basedir + "data/avro/compressed/" + codec + "/" + fn + ".avro";
                LOG.info("creating " + avro);
                generateAvroFile(schema, json, avro, codec);
            }
        }
    }

    /**
     * Clean up generated input files
     */
    private static void deleteInputFiles() throws IOException {
        // Delete auto-generated directories
        for (String fn : datadir) {
            LOG.info("Deleting " + basedir + fn);
            FileUtils.deleteQuietly(new File(basedir + fn));
        }
        // Delete auto-generated json files
        String json;
        for (String fn : trevniSchemas) {
            json = basedir + "data/json/" + fn + ".json";
            LOG.info("Deleting " + json);
            FileUtils.deleteQuietly(new File(json));
        }
        json = basedir + "data/json/testDirectoryCounts.json";
        LOG.info("Deleting " + json);
        FileUtils.deleteQuietly(new File(json));
        json = basedir + "data/json/evenFileNameTestDirectoryCounts.json";
        LOG.info("Deleting " + json);
        FileUtils.deleteQuietly(new File(json));
    }

    private static void generateAvroFile(String schema, String json, String avro, String codec) throws IOException {
        Tool tool = new DataFileWriteTool();
        List<String> args = new ArrayList<String>();
        args.add("--schema-file");
        args.add(schema);
        args.add(json);
        if (codec != null) {
            args.add("--codec");
            args.add(codec);
        }
        try {
            StringBuffer sb = new StringBuffer();
            for (String a : args) {
                sb.append(a);
                sb.append(" ");
            }
            PrintStream out = new PrintStream(avro);
            tool.run(System.in, out, System.err, args);
        } catch (Exception e) {
            LOG.info("Could not generate avro file: " + avro, e);
            throw new IOException();
        }
    }

    private static void generateRandomTrevniFile(String schema, String count, String trevni) throws IOException {
        Tool tool = new TrevniCreateRandomTool();
        List<String> args = new ArrayList<String>();
        args.add(schema);
        args.add(count);
        args.add(trevni);
        try {
            tool.run(System.in, System.out, System.err, args);
        } catch (Exception e) {
            LOG.info("Could not generate trevni file: " + trevni, e);
            throw new IOException();
        }
    }

    private static void convertTrevniToJsonFile(String schema, String trevni, String json) throws IOException {
        Tool tool = new TrevniToJsonTool();
        List<String> args = new ArrayList<String>();
        args.add(trevni);
        try {
            PrintStream out = new PrintStream(json);
            tool.run(System.in, out, System.err, args);
        } catch (Exception e) {
            LOG.info("Could not generate json file: " + json, e);
            throw new IOException();
        }
    }

    private String createOutputName() {
        final StackTraceElement[] st = Thread.currentThread().getStackTrace();
        if (Util.WINDOWS) {
            outbasedir = outbasedir.replace('\\', '/');
        }

        int index;
        if (Utils.isVendorIBM()) {
            index = 3;
        } else {
            index = 2;
        }

        return outbasedir + st[index].getMethodName();
    }

    @Test
    public void testLoadRecordsOfStringArrays() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsOfStringArrays.avro";
        testAvroStorage(true, basedir + "code/pig/dump.pig", ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                "-f " + basedir + "schema/recordsOfStringArrays.avsc", "OUTFILE", createOutputName()));
    }

    @Test
    public void testLoadRecords() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro";
        testAvroStorage(true, basedir + "code/pig/identity_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsWithSimpleUnion() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsWithSimpleUnion.avro";
        final String check = basedir + "data/avro/uncompressed/recordsWithSimpleUnionOutput.avro";
        testAvroStorage(true, basedir + "code/pig/identity_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin -f " + basedir + "schema/recordsSubSchema.avsc", "OUTFILE",
                        createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testProjection() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/projectionTest.avro";
        testAvroStorage(true, basedir + "code/pig/projection_test.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "projectionTest", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testProjectionWithSchema() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/projectionTestWithSchema.avro";
        testAvroStorage(true, basedir + "code/pig/projection_test_with_schema.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_IN_2", "-f " + basedir + "schema/records.avsc",
                        "AVROSTORAGE_OUT_1", "projectionTest", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testDates() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPigWithDates.avro";
        testAvroStorage(true, basedir + "code/pig/with_dates.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsSpecifyFullSchema() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro";
        final String schema = loadFileIntoString(basedir + "schema/records.avsc");
        testAvroStorage(true, basedir + "code/pig/identity_ai1_ao2.pig",
                ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_1", "records",
                        "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "AVROSTORAGE_IN_1", schema));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsSpecifyFullSchemaFromClass() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro";
        testAvroStorage(true, basedir + "code/pig/identity.pig",
                ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_IN_2",
                        "-c org.apache.pig.builtin.avro.code.java.RecordPojo", "AVROSTORAGE_OUT_1", "''",
                        "AVROSTORAGE_OUT_2", "-c org.apache.pig.builtin.avro.code.java.RecordPojo"));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsSpecifyFullSchemaFromFile() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro";
        testAvroStorage(true, basedir + "code/pig/identity.pig",
                ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_1", "records",
                        "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "AVROSTORAGE_IN_2",
                        "-f " + basedir + "schema/records.avsc"));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsSpecifySubSchema() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsSubSchema.avro";
        testAvroStorage(true, basedir + "code/pig/identity_ai1_ao2.pig",
                ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_1", "records",
                        "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin -f " + basedir + "schema/recordsSubSchema.avsc",
                        "AVROSTORAGE_IN_1", loadFileIntoString(basedir + "schema/recordsSubSchema.avsc")));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsSpecifySubSchemaFromFile() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsSubSchema.avro";
        testAvroStorage(true, basedir + "code/pig/identity_blank_first_args.pig",
                ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsSubSchema.avsc", "AVROSTORAGE_IN_2",
                        "-f " + basedir + "schema/recordsSubSchema.avsc"));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsSpecifySubSchemaFromExampleFile() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsSubSchema.avro";
        testAvroStorage(true, basedir + "code/pig/identity_blank_first_args.pig",
                ImmutableMap.of("INFILE", input, "OUTFILE", createOutputName(), "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsSubSchema.avsc", "AVROSTORAGE_IN_2",
                        "-e " + basedir + "data/avro/uncompressed/recordsSubSchema.avro"));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsOfArrays() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsOfArrays.avro";
        final String check = input;
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsOfArrays.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsOfArraysOfRecords() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsOfArraysOfRecords.avro";
        final String check = input;
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsOfArraysOfRecords.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsWithEnums() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsWithEnums.avro";
        final String check = input;
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsWithEnums.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsWithFixed() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsWithFixed.avro";
        final String check = input;
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsWithFixed.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsWithMaps() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsWithMaps.avro";
        final String check = input;
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsWithMaps.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsWithMapsOfRecords() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsWithMapsOfRecords.avro";
        final String check = input;
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsWithMapsOfRecords.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsWithMapsOfArrayOfRecords() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsWithMapsOfArrayOfRecords.avro";
        final String check = input;
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsWithMapsOfArrayOfRecords.avsc", "OUTFILE",
                        createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsWithNullableUnions() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordsWithNullableUnions.avro";
        final String check = input;
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordsWithNullableUnions.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadDeflateCompressedRecords() throws Exception {
        final String input = basedir + "data/avro/compressed/deflate/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro";
        testAvroStorage(true, basedir + "code/pig/identity_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadSnappyCompressedRecords() throws Exception {
        final String input = basedir + "data/avro/compressed/snappy/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsAsOutputByPig.avro";
        testAvroStorage(true, basedir + "code/pig/identity_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "records", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testStoreDeflateCompressedRecords() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/compressed/deflate/recordsAsOutputByPig.avro";
        testAvroStorage(true, basedir + "code/pig/identity_codec.pig",
                ImmutableMap.<String, String>builder().put("INFILE", input).put("OUTFILE", createOutputName())
                        .put("AVROSTORAGE_OUT_1", "records")
                        .put("AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin").put("CODEC", "deflate")
                        .put("LEVEL", "6").build());
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testStoreSnappyCompressedRecords() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/compressed/snappy/recordsAsOutputByPig.avro";
        testAvroStorage(true, basedir + "code/pig/identity_codec.pig",
                ImmutableMap.<String, String>builder().put("INFILE", input).put("OUTFILE", createOutputName())
                        .put("AVROSTORAGE_OUT_1", "records")
                        .put("AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin").put("CODEC", "snappy")
                        .put("LEVEL", "6").build());
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecursiveRecords() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recursiveRecord.avro";
        testAvroStorage(false, basedir + "code/pig/recursive_tests.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_IN_2", "-n this.is.ignored.in.a.loadFunc",
                        "AVROSTORAGE_OUT_1", "recordsSubSchemaNullable", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
    }

    @Test
    public void testLoadRecursiveRecordsOptionOn() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recursiveRecord.avro";
        final String check = basedir + "data/avro/uncompressed/recordsSubSchemaNullable.avro";
        testAvroStorage(true, basedir + "code/pig/recursive_tests.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_IN_2", "-r", "AVROSTORAGE_OUT_1",
                        "recordsSubSchemaNullable", "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin",
                        "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadRecordsWithRepeatedSubRecords() throws Exception {
        final String input = basedir + "data/avro/uncompressed/recordWithRepeatedSubRecords.avro";
        final String check = basedir + "data/avro/uncompressed/recordWithRepeatedSubRecords.avro";
        testAvroStorage(true, basedir + "code/pig/identity_just_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/recordWithRepeatedSubRecords.avsc", "OUTFILE",
                        createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadDirectory() throws Exception {
        final String input = basedir + "data/avro/uncompressed/testdirectory";
        final String check = basedir + "data/avro/uncompressed/testDirectoryCounts.avro";
        testAvroStorage(true, basedir + "code/pig/directory_test.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "stats", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadGlob() throws Exception {
        final String input = basedir + "data/avro/uncompressed/testdirectory/part-m-0000*";
        final String check = basedir + "data/avro/uncompressed/testDirectoryCounts.avro";
        testAvroStorage(true, basedir + "code/pig/directory_test.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "stats", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testPartialLoadGlob() throws Exception {
        final String input = basedir + "data/avro/uncompressed/testdirectory/part-m-0000{0,2,4,6}.avro";
        final String check = basedir + "data/avro/uncompressed/evenFileNameTestDirectoryCounts.avro";
        testAvroStorage(true, basedir + "code/pig/directory_test.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "stats", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testSeparatedByComma() throws Exception {
        final String temp = basedir + "data/avro/uncompressed/testdirectory/part-m-0000";
        StringBuffer sb = new StringBuffer();
        sb.append(temp + "0.avro");
        for (int i = 1; i <= 7; ++i) {
            sb.append(",");
            sb.append(temp + String.valueOf(i) + ".avro");
        }
        final String input = sb.toString();
        final String check = basedir + "data/avro/uncompressed/testDirectoryCounts.avro";
        testAvroStorage(true, basedir + "code/pig/directory_test.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "stats", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testDoubleUnderscore() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        final String check = basedir + "data/avro/uncompressed/recordsWithDoubleUnderscores.avro";
        testAvroStorage(true, basedir + "code/pig/namesWithDoubleColons.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "recordsWithDoubleUnderscores",
                        "AVROSTORAGE_OUT_2", "-d -n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testDoubleUnderscoreNoFlag() throws Exception {
        final String input = basedir + "data/avro/uncompressed/records.avro";
        testAvroStorage(false, basedir + "code/pig/namesWithDoubleColons.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "recordsWithDoubleUnderscores",
                        "AVROSTORAGE_OUT_2", "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
    }

    @Test
    public void testLoadArrays() throws Exception {
        final String input = basedir + "data/avro/uncompressed/arrays.avro";
        final String check = basedir + "data/avro/uncompressed/arraysAsOutputByPig.avro";
        testAvroStorage(true, basedir + "code/pig/identity_ao2.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_1", "arrays", "AVROSTORAGE_OUT_2",
                        "-n org.apache.pig.test.builtin", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadTrevniRecords() throws Exception {
        final String input = basedir + "data/trevni/uncompressed/simpleRecordsTrevni.trevni";
        final String check = basedir + "data/avro/uncompressed/simpleRecordsTrevni.avro";
        testAvroStorage(true, basedir + "code/pig/trevni_to_avro.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/simpleRecordsTrevni.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testLoadAndSaveTrevniRecords() throws Exception {
        final String input = basedir + "data/trevni/uncompressed/simpleRecordsTrevni.trevni";
        final String check = basedir + "data/avro/uncompressed/simpleRecordsTrevni.avro";

        testAvroStorage(true, basedir + "code/pig/trevni_to_trevni.pig",
                ImmutableMap.of("INFILE", input, "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/simpleRecordsTrevni.avsc", "OUTFILE",
                        createOutputName() + "Trevni"));

        testAvroStorage(true, basedir + "code/pig/trevni_to_avro.pig",
                ImmutableMap.of("INFILE", createOutputName() + "Trevni", "AVROSTORAGE_OUT_2",
                        "-f " + basedir + "schema/simpleRecordsTrevni.avsc", "OUTFILE", createOutputName()));
        verifyResults(createOutputName(), check);
    }

    @Test
    public void testRetrieveDataFromMap() throws Exception {
        pigServerLocal = new PigServer(Util.getLocalTestMode());
        Data data = resetData(pigServerLocal);
        Map<String, String> mapv1 = new HashMap<String, String>();
        mapv1.put("key1", "v11");
        mapv1.put("key2", "v12");
        Map<String, String> mapv2 = new HashMap<String, String>();
        mapv2.put("key1", "v21");
        mapv2.put("key2", "v22");
        data.set("testMap", "maps:map[chararray]", tuple(mapv1), tuple(mapv2));
        String schemaDescription = new String("{" + "\"type\": \"record\"," + "\"name\": \"record\","
                + "\"fields\" : ["
                + "{\"name\" : \"maps\", \"type\" :{\"type\" : \"map\", \"values\" : \"string\"}}" + "]" + "}");
        pigServerLocal.registerQuery("A = LOAD 'testMap' USING mock.Storage();");
        pigServerLocal.registerQuery(
                "STORE A INTO '" + createOutputName() + "' USING AvroStorage('" + schemaDescription + "');");
        pigServerLocal.registerQuery("B = LOAD '" + createOutputName() + "' USING AvroStorage();");
        pigServerLocal.registerQuery("C = FOREACH B generate maps#'key1';");
        pigServerLocal.registerQuery("STORE C INTO 'out' USING mock.Storage();");

        List<Tuple> out = data.get("out");
        assertEquals(tuple("v11"), out.get(0));
        assertEquals(tuple("v21"), out.get(1));
    }

    @SuppressWarnings("rawtypes")
    @Test
    public void testCompareToOfBagWrapper() throws Exception {
        final String check = basedir + "data/avro/uncompressed/arraysAsOutputByPig.avro";

        Set<GenericData.Record> records = getExpected(check);
        assertEquals(3, records.size());

        AvroBagWrapper size0 = null; // [ ]
        AvroBagWrapper size1 = null; // [ 6 ]
        AvroBagWrapper size5 = null; // [ 1, 2, 3, 4, 5 ]

        // 3 arrays in records are in an arbitrary order. We re-order them by
        // their size.
        for (GenericData.Record record : records) {
            AvroTupleWrapper tw = new AvroTupleWrapper<GenericData.Record>(record);
            if (((AvroBagWrapper) tw.get(0)).size() == 0) {
                size0 = (AvroBagWrapper) tw.get(0);
            } else if (((AvroBagWrapper) tw.get(0)).size() == 1) {
                size1 = (AvroBagWrapper) tw.get(0);
            } else if (((AvroBagWrapper) tw.get(0)).size() == 5) {
                size5 = (AvroBagWrapper) tw.get(0);
            }
        }

        assertEquals(0, size0.size());
        assertEquals(1, size1.size());
        assertEquals(5, size5.size());

        assertTrue(size0.compareTo(size0) == 0);
        assertTrue(size0.compareTo(size1) < 0);
        assertTrue(size0.compareTo(size5) < 0);
        assertTrue(size1.compareTo(size0) > 0);
        // 6 > 1, so size1 > size5 even though size1 is smaller than size5.
        assertTrue(size1.compareTo(size5) > 0);
    }

    @Test
    public void testUtf8KeyLookupFromMap() throws Exception {
        Map<CharSequence, Object> tm = new TreeMap<CharSequence, Object>();
        tm.put(new Utf8("foo"), "foo");
        tm.put(new Utf8("bar"), "bar");

        AvroMapWrapper wrapper = new AvroMapWrapper(tm);
        String v = (String) wrapper.get(new Utf8("foo"));
        assertEquals("foo", v);
        v = (String) wrapper.get(new Utf8("bar"));
        assertEquals("bar", v);
    }

    @Test
    public void testAvroMapWrapper() throws Exception {
        final Map<CharSequence, Object> m = new HashMap<CharSequence, Object>();
        for (String fn : avroSchemas) {
            final String avro = basedir + "data/avro/uncompressed/" + fn + ".avro";
            int i = 0;
            for (GenericContainer r : readAvroData(avro)) {
                m.put(new Utf8(fn + i), r);
                i += 1;
            }
        }
        final AvroMapWrapper amw = new AvroMapWrapper(m);
        // Test out all the interfaces the AvroMapWrapper supports
        for (Object o : amw.values()) {
            assertTrue(isValidPigObject(o));
        }
        for (CharSequence k : amw.keySet()) {
            assertTrue(isValidPigObject(k));
            assertTrue(isValidPigObject(amw.get(k)));
        }
        for (Map.Entry<CharSequence, Object> e : amw.entrySet()) {
            assertTrue(isValidPigObject(e.getKey()));
            assertTrue(isValidPigObject(e.getValue()));
        }
    }

    private boolean isValidPigObject(Object o) {
        if (o == null) {
            return true;
        }
        switch (DataType.findType(o)) {
        case DataType.TUPLE:
            for (Object inner : ((Tuple) o).getAll()) {
                if (!isValidPigObject(inner)) {
                    return false;
                }
            }
            return true;
        case DataType.BAG:
            final Iterator<Tuple> bi = ((DataBag) o).iterator();
            while (bi.hasNext()) {
                if (!isValidPigObject(bi.next())) {
                    return false;
                }
            }
            return true;
        case DataType.MAP:
            for (Object inner : ((Map) o).values()) {
                if (!isValidPigObject(inner)) {
                    return false;
                }
            }
            return true;
        case DataType.BIGDECIMAL:
        case DataType.BIGINTEGER:
        case DataType.BOOLEAN:
        case DataType.BYTE:
        case DataType.BYTEARRAY:
        case DataType.CHARARRAY:
        case DataType.DATETIME:
        case DataType.DOUBLE:
        case DataType.FLOAT:
        case DataType.GENERIC_WRITABLECOMPARABLE:
        case DataType.INTEGER:
        case DataType.LONG:
            return true;
        case DataType.ERROR:
        default:
            return false;
        }
    }

    private List<GenericContainer> readAvroData(String path) throws IOException {
        final FileSystem fs = FileSystem.getLocal(new Configuration());
        final Path filePath = new Path(path);
        assertTrue("File path " + filePath + " does not exists!", fs.exists(filePath));
        final GenericDatumReader<GenericContainer> reader = new GenericDatumReader<GenericContainer>();
        final DataFileStream<GenericContainer> in = new DataFileStream<GenericContainer>(fs.open(filePath), reader);
        final List<GenericContainer> avroData = new ArrayList<GenericContainer>();
        try {
            while (in.hasNext()) {
                GenericContainer obj = in.next();
                avroData.add(obj);
            }
        } finally {
            Closeables.closeQuietly(in);
        }
        return avroData;
    }

    private void testAvroStorage(boolean expectedToSucceed, String scriptFile, Map<String, String> parameterMap)
            throws IOException {
        pigServerLocal.setBatchOn();

        int numOfFailedJobs = 0;

        try {
            pigServerLocal.registerScript(scriptFile, parameterMap);
            for (ExecJob job : pigServerLocal.executeBatch()) {
                if (job.getStatus().equals(JOB_STATUS.FAILED)) {
                    numOfFailedJobs++;
                }
            }
        } catch (Exception e) {
            System.err.printf("Exception caught in testAvroStorage: %s\n", e);
            numOfFailedJobs++;
        }

        if (expectedToSucceed) {
            assertTrue("There was a failed job!", numOfFailedJobs == 0);
        } else {
            assertTrue("There was no failed job!", numOfFailedJobs > 0);
        }
    }

    private void verifyResults(String outPath, String expectedOutpath) throws IOException {
        verifyResults(outPath, expectedOutpath, null);
    }

    private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {
        FileSystem fs = FileSystem.getLocal(new Configuration());

        /* read in expected results*/
        Set<GenericData.Record> expected = getExpected(expectedOutpath);

        /* read in output results and compare */
        Path output = new Path(outPath);
        assertTrue("Output dir does not exists!", fs.exists(output) && fs.getFileStatus(output).isDir());

        Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
        assertTrue("Split field dirs not found!", paths != null);

        for (Path path : paths) {
            Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
            assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
            for (Path filePath : files) {
                assertTrue("This shouldn't be a directory", fs.isFile(filePath));

                GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();

                DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(fs.open(filePath),
                        reader);
                assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
                int count = 0;
                while (in.hasNext()) {
                    GenericData.Record obj = in.next();
                    assertTrue(
                            "Avro result object found that's not expected: Found "
                                    + (obj != null ? obj.getSchema() : "null") + ", " + obj.toString()
                                    + "\nExpected " + (expected != null ? expected.toString() : "null") + "\n",
                            expected.contains(obj));
                    count++;
                }
                in.close();
                assertEquals(expected.size(), count);
            }
        }
    }

    private Set<GenericData.Record> getExpected(String pathstr) throws IOException {

        Set<GenericData.Record> ret = new TreeSet<GenericData.Record>(new Comparator<GenericData.Record>() {
            @Override
            public int compare(Record o1, Record o2) {
                //return o1.compareTo(o2); throws AvroRuntimeException: Can't compare maps!
                return o1.equals(o2) ? 0 : o1.toString().compareTo(o2.toString());
            }
        });
        FileSystem fs = FileSystem.getLocal(new Configuration());

        /* read in output results and compare */
        Path output = new Path(pathstr);
        assertTrue("Expected output does not exists!", fs.exists(output));

        Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
        assertTrue("Split field dirs not found!", paths != null);

        for (Path path : paths) {
            Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
            assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
            for (Path filePath : files) {
                assertTrue("This shouldn't be a directory", fs.isFile(filePath));

                GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();

                DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(fs.open(filePath),
                        reader);

                while (in.hasNext()) {
                    GenericData.Record obj = in.next();
                    ret.add(obj);
                }
                in.close();
            }
        }
        return ret;
    }

}