com.cloudera.sqoop.TestAvroImport.java Source code

Introduction

Here is the source code for com.cloudera.sqoop.TestAvroImport.java, a JUnit test case from Apache Sqoop that exercises the --as-avrodatafile import option. It imports HSQLDB tables into Avro data files and then verifies the generated Avro schema, the imported field values, null handling, compression codecs, and --map-column-java type overrides.
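
The test verifies each import by opening the mapper output file and walking its records with Avro's generic API, exactly as the read() helper near the end of the listing does. The following stand-alone sketch (the output path is a hypothetical placeholder, not something the test uses) shows the same read-back technique outside the test harness:

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.FsInput;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class ReadAvroOutputSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical location of a single mapper's output; the test derives the
        // real path from getTablePath() instead.
        Path outputFile = new Path("/tmp/warehouse/MYTABLE/part-m-00000.avro");
        FsInput input = new FsInput(outputFile, new Configuration());
        try (DataFileReader<GenericRecord> reader =
                new DataFileReader<GenericRecord>(input, new GenericDatumReader<GenericRecord>())) {
            System.out.println("schema: " + reader.getSchema());
            for (GenericRecord record : reader) {
                System.out.println(record);
            }
        }
    }
}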

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop;

import com.cloudera.sqoop.testutil.BaseSqoopTestCase;
import com.cloudera.sqoop.testutil.CommonArgs;
import com.cloudera.sqoop.testutil.HsqldbTestServer;
import com.cloudera.sqoop.testutil.ImportJobTestCase;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileConstants;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.util.Utf8;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.joda.time.DateMidnight;
import org.joda.time.DateTime;
import org.joda.time.LocalDate;
import org.joda.time.LocalTime;
import org.joda.time.format.DateTimeFormat;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Tests --as-avrodatafile.
 */
public class TestAvroImport extends ImportJobTestCase {

    public static final Log LOG = LogFactory.getLog(TestAvroImport.class.getName());

    /**
     * Create the argv to pass to Sqoop.
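     * The arguments correspond to a Sqoop command line of roughly the form
     * {@code --table <table> --connect <hsqldb-url> --warehouse-dir <dir>
     * --split-by INTFIELD1 --as-avrodatafile}, plus any extra arguments.
     *
     * @param includeHadoopFlags whether to prepend the common Hadoop test flags
     * @param extraArgs extra command-line arguments to append, or null for none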
     *
     * @return the argv as an array of strings.
     */
    protected String[] getOutputArgv(boolean includeHadoopFlags, String[] extraArgs) {
        ArrayList<String> args = new ArrayList<String>();

        if (includeHadoopFlags) {
            CommonArgs.addHadoopFlags(args);
        }

        args.add("--table");
        args.add(getTableName());
        args.add("--connect");
        args.add(HsqldbTestServer.getUrl());
        args.add("--warehouse-dir");
        args.add(getWarehouseDir());
        args.add("--split-by");
        args.add("INTFIELD1");
        args.add("--as-avrodatafile");
        if (extraArgs != null) {
            args.addAll(Arrays.asList(extraArgs));
        }

        return args.toArray(new String[0]);
    }

    public void testAvroImport() throws IOException {
        avroImportTestHelper(null, null);
    }

    public void testDeflateCompressedAvroImport() throws IOException {
        avroImportTestHelper(new String[] { "--compression-codec", "org.apache.hadoop.io.compress.DefaultCodec", },
                "deflate");
    }

    public void testDefaultCompressedAvroImport() throws IOException {
        avroImportTestHelper(new String[] { "--compress", }, "deflate");
    }

    public void testUnsupportedCodec() throws IOException {
        try {
            avroImportTestHelper(new String[] { "--compression-codec", "foobar", }, null);
            fail("Expected IOException");
        } catch (IOException e) {
            // Exception is expected
        }
    }

    public void testAvroImportNulls() throws IOException {
        String[] types = { "BIT", "INTEGER", "BIGINT", "REAL", "DOUBLE", "VARCHAR(6)", "VARBINARY(2)", "DATE",
                "TIMESTAMP" };
        String[] vals = new String[types.length]; // a single row with every column NULL
        createTableWithColTypes(types, vals);

        runImport(getOutputArgv(true, null));

        Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
        DataFileReader<GenericRecord> reader = read(outputFile);
        Schema schema = reader.getSchema();
        assertEquals(Schema.Type.RECORD, schema.getType());
        List<Field> fields = schema.getFields();
        assertEquals(types.length, fields.size());

        checkField(fields.get(0), "DATA_COL0", Schema.Type.BOOLEAN);
        checkField(fields.get(1), "DATA_COL1", Schema.Type.INT);
        checkField(fields.get(2), "DATA_COL2", Schema.Type.LONG);
        checkField(fields.get(3), "DATA_COL3", Schema.Type.FLOAT);
        checkField(fields.get(4), "DATA_COL4", Schema.Type.DOUBLE);
        checkField(fields.get(5), "DATA_COL5", Schema.Type.STRING);
        checkField(fields.get(6), "DATA_COL6", Schema.Type.BYTES);
        checkField(fields.get(7), "DATA_COL7", Schema.Type.STRING);
        checkField(fields.get(8), "DATA_COL8", Schema.Type.STRING);

        GenericRecord record1 = reader.next();
        for (int i = 0; i < types.length; ++i) {
            assertNull("DATA_COL" + i, record1.get("DATA_COL" + i));
        }
    }

    /**
     * Helper method that runs an import using Avro with optional command line
     * arguments and checks that the created file matches the expectations.
     * <p/>
     * This can be used to test various extra options that are implemented for
     * the Avro input.
     *
     * @param extraArgs extra command line arguments to pass to Sqoop in addition
     *                  to those that {@link #getOutputArgv(boolean, String[])}
     *                  returns
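     * @param codec     the codec name expected in the Avro file metadata, or
     *                  null to skip the codec check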
     */
    private void avroImportTestHelper(String[] extraArgs, String codec) throws IOException {
        String[] types = { "BIT", "INTEGER", "BIGINT", "REAL", "DOUBLE", "VARCHAR(6)", "VARBINARY(2)", "DATE",
                "TIMESTAMP", "TIME" };
        DateMidnight testDate = new LocalDate().toDateMidnight();
        LocalTime testTime = new LocalTime().withMillisOfSecond(0);
        // Dec 31, 2012 falls into ISO week 1 of 2013, so this value exposes an
        // accidental week-year ("YYYY" in SimpleDateFormat) conversion of the timestamp.
        DateTime testTimestamp = testTime.toDateTime(new DateTime(2012, 12, 31, 0, 0, 0, 0));
        String timestampPattern = "YYYY-MM-dd HH:mm:ss.SSS";
        String datePattern = "YYYY-MM-dd";
        String[] vals = { "true", "100", "200", "1.0", "2.0", "'s'", "'0102'",
                singleQuote(DateTimeFormat.forPattern(datePattern).print(testDate)),
                singleQuote(DateTimeFormat.forPattern(timestampPattern).print(testTimestamp)),
                singleQuote(DateTimeFormat.forPattern("HH:mm:ss").print(testTime)) };
        createTableWithColTypes(types, vals);

        runImport(getOutputArgv(true, extraArgs));

        Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
        DataFileReader<GenericRecord> reader = read(outputFile);
        Schema schema = reader.getSchema();
        assertEquals(Schema.Type.RECORD, schema.getType());
        List<Field> fields = schema.getFields();
        assertEquals(types.length, fields.size());

        checkField(fields.get(0), "DATA_COL0", Schema.Type.BOOLEAN);
        checkField(fields.get(1), "DATA_COL1", Schema.Type.INT);
        checkField(fields.get(2), "DATA_COL2", Schema.Type.LONG);
        checkField(fields.get(3), "DATA_COL3", Schema.Type.FLOAT);
        checkField(fields.get(4), "DATA_COL4", Schema.Type.DOUBLE);
        checkField(fields.get(5), "DATA_COL5", Schema.Type.STRING);
        checkField(fields.get(6), "DATA_COL6", Schema.Type.BYTES);
        checkField(fields.get(7), "DATA_COL7", Schema.Type.STRING);
        checkField(fields.get(8), "DATA_COL8", Schema.Type.STRING);
        checkField(fields.get(9), "DATA_COL9", Schema.Type.STRING);

        GenericRecord record1 = reader.next();
        assertEquals("DATA_COL0", true, record1.get("DATA_COL0"));
        assertEquals("DATA_COL1", 100, record1.get("DATA_COL1"));
        assertEquals("DATA_COL2", 200L, record1.get("DATA_COL2"));
        assertEquals("DATA_COL3", 1.0f, record1.get("DATA_COL3"));
        assertEquals("DATA_COL4", 2.0, record1.get("DATA_COL4"));
        assertEquals("DATA_COL5", new Utf8("s"), record1.get("DATA_COL5"));
        Object object = record1.get("DATA_COL6");
        assertTrue(object instanceof ByteBuffer);
        ByteBuffer b = ((ByteBuffer) object);
        assertEquals((byte) 1, b.get(0));
        assertEquals((byte) 2, b.get(1));
        assertEquals("DATA_COL7", testDate.toDateTime(),
                DateTimeFormat.forPattern(datePattern).parseDateTime(record1.get("DATA_COL7").toString()));
        assertEquals("DATA_COL8", testTimestamp,
                DateTimeFormat.forPattern(timestampPattern).parseDateTime(record1.get("DATA_COL8").toString()));
        assertEquals("DATA_COL9", testTime, DateTimeFormat.forPattern("HH:mm:ss.SSS")
                .parseDateTime(record1.get("DATA_COL9").toString()).toLocalTime());
        if (codec != null) {
            assertEquals(codec, reader.getMetaString(DataFileConstants.CODEC));
        }
    }

    private String singleQuote(String string) {
        return "'" + string + "'";
    }

    public void testOverrideTypeMapping() throws IOException {
        String[] types = { "INT" };
        String[] vals = { "10" };
        createTableWithColTypes(types, vals);

        String[] extraArgs = { "--map-column-java", "DATA_COL0=String" };

        runImport(getOutputArgv(true, extraArgs));

        Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
        DataFileReader<GenericRecord> reader = read(outputFile);
        Schema schema = reader.getSchema();
        assertEquals(Schema.Type.RECORD, schema.getType());
        List<Field> fields = schema.getFields();
        assertEquals(types.length, fields.size());

        checkField(fields.get(0), "DATA_COL0", Schema.Type.STRING);

        GenericRecord record1 = reader.next();
        assertEquals("DATA_COL0", new Utf8("10"), record1.get("DATA_COL0"));
    }

    private void checkField(Field field, String name, Type type) {
        assertEquals(name, field.name());
        assertEquals(Schema.Type.UNION, field.schema().getType());
        assertEquals(type, field.schema().getTypes().get(0).getType());
        assertEquals(Schema.Type.NULL, field.schema().getTypes().get(1).getType());
    }

    public void testNullableAvroImport() throws IOException, SQLException {
        String[] types = { "INT" };
        String[] vals = { null };
        createTableWithColTypes(types, vals);

        runImport(getOutputArgv(true, null));

        Path outputFile = new Path(getTablePath(), "part-m-00000.avro");
        DataFileReader<GenericRecord> reader = read(outputFile);

        GenericRecord record1 = reader.next();
        assertNull(record1.get("DATA_COL0"));

    }

    private DataFileReader<GenericRecord> read(Path filename) throws IOException {
        Configuration conf = new Configuration();
        if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
            conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
        }
        FsInput fsInput = new FsInput(filename, conf);
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
        return new DataFileReader<GenericRecord>(fsInput, datumReader);
    }

}
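
checkField() captures the schema shape that Sqoop gives every imported column: a two-branch union whose first branch is the column's Avro type and whose second branch is "null", which is what makes each field nullable. Here is a minimal sketch (not part of the Sqoop sources) of building and inspecting that union shape with the Avro API:

import java.util.Arrays;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;

public class NullableUnionSketch {
    public static void main(String[] args) {
        // The per-column schema layout that checkField() asserts: [type, "null"].
        Schema nullableInt = Schema.createUnion(Arrays.asList(
                Schema.create(Type.INT),     // position 0: the column's Avro type
                Schema.create(Type.NULL)));  // position 1: NULL, making the field nullable
        System.out.println(nullableInt);                              // ["int","null"]
        System.out.println(nullableInt.getTypes().get(0).getType());  // INT
        System.out.println(nullableInt.getTypes().get(1).getType());  // NULL
    }
}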