com.cloudera.sqoop.hive.TestHiveImport.java Source code

Introduction

Here is the source code for com.cloudera.sqoop.hive.TestHiveImport.java, a JUnit test class from the Sqoop project that exercises Hive import functionality after an import to HDFS.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop.hive;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.SqoopOptions.InvalidOptionsException;
import com.cloudera.sqoop.testutil.CommonArgs;
import com.cloudera.sqoop.testutil.HsqldbTestServer;
import com.cloudera.sqoop.testutil.ImportJobTestCase;
import com.cloudera.sqoop.tool.BaseSqoopTool;
import com.cloudera.sqoop.tool.CodeGenTool;
import com.cloudera.sqoop.tool.CreateHiveTableTool;
import com.cloudera.sqoop.tool.ImportTool;
import com.cloudera.sqoop.tool.SqoopTool;
import org.apache.commons.cli.ParseException;

/**
 * Test HiveImport capability after an import to HDFS.
 */
public class TestHiveImport extends ImportJobTestCase {

    public static final Log LOG = LogFactory.getLog(TestHiveImport.class.getName());

    public void setUp() {
        super.setUp();
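        // Route Hive invocations through the mock hive shell used by these tests
        // (see runImportTest()) rather than a real Hive installation.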
        HiveImport.setTestMode(true);
    }

    public void tearDown() {
        super.tearDown();
        HiveImport.setTestMode(false);
    }

    /**
     * Sets the expected number of columns in the table being manipulated
     * by the test. Under the hood, this sets the expected column names
     * to DATA_COLi for 0 <= i < numCols.
     * @param numCols the number of columns to be created.
     */
    protected void setNumCols(int numCols) {
        String[] cols = new String[numCols];
        for (int i = 0; i < numCols; i++) {
            cols[i] = "DATA_COL" + i;
        }

        setColNames(cols);
    }

    protected String[] getTypesNewLineTest() {
        String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
        return types;
    }

    /**
     * Create the argv to pass to Sqoop.
     * @return the argv as an array of strings.
     */
    protected String[] getArgv(boolean includeHadoopFlags, String[] moreArgs) {
        ArrayList<String> args = new ArrayList<String>();

        if (includeHadoopFlags) {
            CommonArgs.addHadoopFlags(args);
        }

        if (null != moreArgs) {
            for (String arg : moreArgs) {
                args.add(arg);
            }
        }

        args.add("--table");
        args.add(getTableName());
        args.add("--warehouse-dir");
        args.add(getWarehouseDir());
        args.add("--connect");
        args.add(getConnectString());
        args.add("--hive-import");
        String[] colNames = getColNames();
        if (null != colNames) {
            args.add("--split-by");
            args.add(colNames[0]);
        } else {
            fail("Could not determine column names.");
        }

        args.add("--num-mappers");
        args.add("1");

        for (String a : args) {
            LOG.debug("ARG : " + a);
        }

        return args.toArray(new String[0]);
    }

    /**
     * @return the argv to supply to a create-table only job for Hive imports.
     */
    protected String[] getCreateTableArgv(boolean includeHadoopFlags, String[] moreArgs) {

        ArrayList<String> args = new ArrayList<String>();

        if (null != moreArgs) {
            for (String arg : moreArgs) {
                args.add(arg);
            }
        }

        args.add("--table");
        args.add(getTableName());
        args.add("--connect");
        args.add(getConnectString());

        return args.toArray(new String[0]);
    }

    /**
     * @return the argv to supply to a code-gen only job for Hive imports.
     */
    protected String[] getCodeGenArgs() {
        ArrayList<String> args = new ArrayList<String>();

        args.add("--table");
        args.add(getTableName());
        args.add("--connect");
        args.add(getConnectString());
        args.add("--hive-import");

        return args.toArray(new String[0]);
    }

    /**
     * @return the argv to supply to a ddl-executing-only job for Hive imports.
     */
    protected String[] getCreateHiveTableArgs(String[] extraArgs) {
        ArrayList<String> args = new ArrayList<String>();

        args.add("--table");
        args.add(getTableName());
        args.add("--connect");
        args.add(getConnectString());

        if (null != extraArgs) {
            for (String arg : extraArgs) {
                args.add(arg);
            }
        }

        return args.toArray(new String[0]);
    }

    /**
     * Parse the given arguments with the supplied SqoopTool, failing the test
     * if the arguments cannot be parsed.
     */
    private SqoopOptions getSqoopOptions(String[] args, SqoopTool tool) {
        SqoopOptions opts = null;
        try {
            opts = tool.parseArguments(args, null, null, true);
        } catch (Exception e) {
            fail("Invalid options: " + e.toString());
        }

        return opts;
    }

    /**
     * Create a table with the given column types and values, then run the
     * supplied Sqoop tool and verify the Hive script it generates against the
     * named verification script.
     */
    private void runImportTest(String tableName, String[] types, String[] values, String verificationScript,
            String[] args, SqoopTool tool) throws IOException {

        // create a table and populate it with a row...
        createTableWithColTypes(types, values);

        // set up our mock hive shell to compare our generated script
        // against the correct expected one.
        SqoopOptions options = getSqoopOptions(args, tool);
        String hiveHome = options.getHiveHome();
        assertNotNull("hive.home was not set", hiveHome);
        String testDataPath = new Path(new Path(hiveHome), "scripts/" + verificationScript).toString();
        System.setProperty("expected.script", new File(testDataPath).getAbsolutePath());

        // verify that we can import it correctly into hive.
        runImport(tool, args);
    }

    /** Test that we can generate a file containing the DDL and not import. */
    @Test
    public void testGenerateOnly() throws IOException {
        final String TABLE_NAME = "GenerateOnly";
        setCurTableName(TABLE_NAME);
        setNumCols(1);

        // Figure out where our target generated .q file is going to be.
        SqoopOptions options = getSqoopOptions(getArgv(false, null), new ImportTool());
        Path ddlFile = new Path(new Path(options.getCodeOutputDir()), TABLE_NAME + ".q");
        FileSystem fs = FileSystem.getLocal(new Configuration());

        // If it's already there, remove it before running the test to ensure
        // that it's the current test that generated the file.
        if (fs.exists(ddlFile)) {
            if (!fs.delete(ddlFile, false)) {
                LOG.warn("Could not delete previous ddl file: " + ddlFile);
            }
        }

        // Run a basic import, but specify that we're just generating definitions.
        String[] types = { "INTEGER" };
        String[] vals = { "42" };
        runImportTest(TABLE_NAME, types, vals, null, getCodeGenArgs(), new CodeGenTool());

        // Test that the generated definition file exists.
        assertTrue("Couldn't find expected ddl file", fs.exists(ddlFile));

        Path hiveImportPath = new Path(new Path(options.getWarehouseDir()), TABLE_NAME);
        assertFalse("Import actually happened!", fs.exists(hiveImportPath));
    }

    /** Test that strings and ints are handled in the normal fashion. */
    @Test
    public void testNormalHiveImport() throws IOException {
        final String TABLE_NAME = "NORMAL_HIVE_IMPORT";
        setCurTableName(TABLE_NAME);
        setNumCols(3);
        String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
        String[] vals = { "'test'", "42", "'somestring'" };
        runImportTest(TABLE_NAME, types, vals, "normalImport.q", getArgv(false, null), new ImportTool());
    }

    /** Test that the table is created in Hive with no data import. */
    @Test
    public void testCreateOnlyHiveImport() throws IOException {
        final String TABLE_NAME = "CREATE_ONLY_HIVE_IMPORT";
        setCurTableName(TABLE_NAME);
        setNumCols(3);
        String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
        String[] vals = { "'test'", "42", "'somestring'" };
        runImportTest(TABLE_NAME, types, vals, "createOnlyImport.q", getCreateHiveTableArgs(null),
                new CreateHiveTableTool());
    }

    /**
     * Test that the table is created in Hive, replacing the existing table if
     * any.
     */
    @Test
    public void testCreateOverwriteHiveImport() throws IOException {
        final String TABLE_NAME = "CREATE_OVERWRITE_HIVE_IMPORT";
        setCurTableName(TABLE_NAME);
        setNumCols(3);
        String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
        String[] vals = { "'test'", "42", "'somestring'" };
        String[] extraArgs = { "--hive-overwrite", "--create-hive-table" };
        runImportTest(TABLE_NAME, types, vals, "createOverwriteImport.q", getCreateHiveTableArgs(extraArgs),
                new CreateHiveTableTool());
    }

    /** Test that dates are coerced properly to strings. */
    @Test
    public void testDate() throws IOException {
        final String TABLE_NAME = "DATE_HIVE_IMPORT";
        setCurTableName(TABLE_NAME);
        setNumCols(2);
        String[] types = { "VARCHAR(32)", "DATE" };
        String[] vals = { "'test'", "'2009-05-12'" };
        runImportTest(TABLE_NAME, types, vals, "dateImport.q", getArgv(false, null), new ImportTool());
    }

    /** Test that NUMERICs are coerced to doubles. */
    @Test
    public void testNumeric() throws IOException {
        final String TABLE_NAME = "NUMERIC_HIVE_IMPORT";
        setCurTableName(TABLE_NAME);
        setNumCols(2);
        String[] types = { "NUMERIC", "CHAR(64)" };
        String[] vals = { "3.14159", "'foo'" };
        runImportTest(TABLE_NAME, types, vals, "numericImport.q", getArgv(false, null), new ImportTool());
    }

    /** If bin/hive returns an error exit status, we should get an IOException. */
    @Test
    public void testHiveExitFails() {
        // The expected script differs from the one this import would generate,
        // so the mock hive shell exits with an error and we expect an IOException.
        final String TABLE_NAME = "FAILING_HIVE_IMPORT";
        setCurTableName(TABLE_NAME);
        setNumCols(2);
        String[] types = { "NUMERIC", "CHAR(64)" };
        String[] vals = { "3.14159", "'foo'" };
        try {
            runImportTest(TABLE_NAME, types, vals, "failingImport.q", getArgv(false, null), new ImportTool());
            // If we get here, then the run succeeded -- which is incorrect.
            fail("FAILING_HIVE_IMPORT test should have thrown IOException");
        } catch (IOException ioe) {
            // expected; ok.
        }
    }

    /** Test that we can set delimiters how we want them. */
    @Test
    public void testCustomDelimiters() throws IOException {
        final String TABLE_NAME = "CUSTOM_DELIM_IMPORT";
        setCurTableName(TABLE_NAME);
        setNumCols(3);
        String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)" };
        String[] vals = { "'test'", "42", "'somestring'" };
        String[] extraArgs = { "--fields-terminated-by", ",", "--lines-terminated-by", "|", };
        runImportTest(TABLE_NAME, types, vals, "customDelimImport.q", getArgv(false, extraArgs), new ImportTool());
    }

    /**
     * Test Hive import of a row that contains newlines, with the offending
     * delimiter characters dropped.
     */
    @Test
    public void testFieldWithHiveDelims() throws IOException, InterruptedException {
        final String TABLE_NAME = "FIELD_WITH_NL_HIVE_IMPORT";

        LOG.info("Doing import of single row into FIELD_WITH_NL_HIVE_IMPORT table");
        setCurTableName(TABLE_NAME);
        setNumCols(3);
        String[] types = getTypesNewLineTest();
        String[] vals = { "'test with \n new lines \n'", "42",
                "'oh no " + '\01' + " field delims " + '\01' + "'", };
        String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG };

        runImportTest(TABLE_NAME, types, vals, "fieldWithNewlineImport.q", getArgv(false, moreArgs),
                new ImportTool());

        LOG.info("Validating data in single row is present in: " + "FIELD_WITH_NL_HIVE_IMPORT table");

        // Ideally, we would invoke Hive itself to verify that records containing
        // record and field delimiters had their values replaced and that we
        // produced the proper number of Hive records. Unfortunately, that is a
        // non-trivial task, better dealt with at the integration-test level.
        //
        // Instead, this test assumes the path of the generated table and just
        // validates the map job output.

        // Get and read the raw output file
        String whDir = getWarehouseDir();
        File f = new File(new File(whDir, TABLE_NAME), "part-m-00000");
        FileReader fr = new FileReader(f);
        BufferedReader br = new BufferedReader(fr);
        try {
            // verify the output
            assertEquals(br.readLine(), "test with  new lines " + '\01' + "42" + '\01' + "oh no  field delims ");
            assertEquals(br.readLine(), null); // should only be one line
        } catch (IOException ioe) {
            fail("Unable to read files generated from hive");
        } finally {
            br.close();
        }
    }

    /**
     * Test Hive import of a row that contains newlines, with the offending
     * delimiter characters replaced.
     */
    @Test
    public void testFieldWithHiveDelimsReplacement() throws IOException, InterruptedException {
        final String TABLE_NAME = "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT";

        LOG.info("Doing import of single row into " + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");
        setCurTableName(TABLE_NAME);
        setNumCols(3);
        String[] types = getTypesNewLineTest();
        String[] vals = { "'test with\nnew lines\n'", "42", "'oh no " + '\01' + " field delims " + '\01' + "'", };
        String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " " };

        runImportTest(TABLE_NAME, types, vals, "fieldWithNewlineReplacementImport.q", getArgv(false, moreArgs),
                new ImportTool());

        LOG.info("Validating data in single row is present in: " + "FIELD_WITH_NL_REPLACEMENT_HIVE_IMPORT table");

        // Ideally, we would invoke Hive itself to verify that records containing
        // record and field delimiters had their values replaced and that we
        // produced the proper number of Hive records. Unfortunately, that is a
        // non-trivial task, better dealt with at the integration-test level.
        //
        // Instead, this test assumes the path of the generated table and just
        // validates the map job output.

        // Get and read the raw output file
        String whDir = getWarehouseDir();
        File f = new File(new File(whDir, TABLE_NAME), "part-m-00000");
        FileReader fr = new FileReader(f);
        BufferedReader br = new BufferedReader(fr);
        try {
            // verify the output
            assertEquals(br.readLine(), "test with new lines " + '\01' + "42" + '\01' + "oh no   field delims  ");
            assertEquals(br.readLine(), null); // should only be one line
        } catch (IOException ioe) {
            fail("Unable to read files generated from hive");
        } finally {
            br.close();
        }
    }

    /**
     * Test hive drop and replace option validation.
     */
    @Test
    public void testHiveDropAndReplaceOptionValidation() throws ParseException {
        LOG.info("Testing conflicting Hive delimiter drop/replace options");

        setNumCols(3);
        String[] moreArgs = { "--" + BaseSqoopTool.HIVE_DELIMS_REPLACEMENT_ARG, " ",
                "--" + BaseSqoopTool.HIVE_DROP_DELIMS_ARG, };

        ImportTool tool = new ImportTool();
        try {
            tool.validateOptions(tool.parseArguments(getArgv(false, moreArgs), null, null, true));
            fail("Expected InvalidOptionsException");
        } catch (InvalidOptionsException ex) {
            /* success */
        }
    }

    /**
     * Test Hive import into a partitioned Hive table.
     */
    @Test
    public void testImportHiveWithPartitions() throws IOException, InterruptedException {
        final String TABLE_NAME = "PARTITION_HIVE_IMPORT";

        LOG.info("Doing import of single row into PARTITION_HIVE_IMPORT table");
        setCurTableName(TABLE_NAME);
        setNumCols(3);
        String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
        String[] vals = { "'whoop'", "42", "'I am a row in a partition'", };
        String[] moreArgs = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, "ds",
                "--" + BaseSqoopTool.HIVE_PARTITION_VALUE_ARG, "20110413", };

        runImportTest(TABLE_NAME, types, vals, "partitionImport.q", getArgv(false, moreArgs), new ImportTool());
    }

    /**
     * If the partition key is set to one of the imported columns, we should
     * get an IOException.
     */
    @Test
    public void testImportWithBadPartitionKey() {
        final String TABLE_NAME = "FAILING_PARTITION_HIVE_IMPORT";

        LOG.info("Doing import of single row into " + TABLE_NAME + " table");
        setCurTableName(TABLE_NAME);
        setNumCols(3);
        String[] types = { "VARCHAR(32)", "INTEGER", "CHAR(64)", };
        String[] vals = { "'key'", "42", "'I am a row in a partition'", };

        String partitionKey = getColNames()[0];

        // Specify 1st column as partition key and import every column of the
        // table by default (i.e. no --columns option).
        String[] moreArgs1 = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, partitionKey, };

        // Specify 1st column as both partition key and importing column.
        String[] moreArgs2 = { "--" + BaseSqoopTool.HIVE_PARTITION_KEY_ARG, partitionKey,
                "--" + BaseSqoopTool.COLUMNS_ARG, partitionKey, };

        // Test hive-import with the 1st args.
        try {
            runImportTest(TABLE_NAME, types, vals, "partitionImport.q", getArgv(false, moreArgs1),
                    new ImportTool());
            fail(TABLE_NAME + " test should have thrown IOException");
        } catch (IOException ioe) {
            // expected; ok.
        }

        // Test hive-import with the 2nd args.
        try {
            runImportTest(TABLE_NAME, types, vals, "partitionImport.q", getArgv(false, moreArgs2),
                    new ImportTool());
            fail(TABLE_NAME + " test should have thrown IOException");
        } catch (IOException ioe) {
            // expected; ok.
        }

        // Test create-hive-table with the 1st args.
        try {
            runImportTest(TABLE_NAME, types, vals, "partitionImport.q", getCreateTableArgv(false, moreArgs1),
                    new CreateHiveTableTool());
            fail(TABLE_NAME + " test should have thrown IOException");
        } catch (IOException ioe) {
            // expected; ok.
        }
    }
}
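
Example

The argument arrays assembled by getArgv() mirror what would be passed to Sqoop's command-line entry point. The sketch below shows how a comparable Hive import could be launched programmatically; it assumes com.cloudera.sqoop.Sqoop.runTool is available in the Sqoop version on the classpath, and the connection string, table name, and split column are placeholders rather than values taken from the test above.

import com.cloudera.sqoop.Sqoop;

public class HiveImportExample {

    public static void main(String[] args) {
        // Placeholder connection and table settings; substitute real values.
        String[] sqoopArgs = {
            "import",
            "--connect", "jdbc:hsqldb:hsql://localhost/testdb",
            "--table", "EMPLOYEES",
            "--warehouse-dir", "/user/hive/warehouse",
            "--hive-import",
            "--split-by", "ID",
            "--num-mappers", "1",
        };

        // runTool() reads the tool name ("import") from the first argument,
        // parses the remaining options, runs the job, and returns its exit status.
        int status = Sqoop.runTool(sqoopArgs);
        System.exit(status);
    }
}

Running this is equivalent to invoking "sqoop import ..." with the same flags from the shell.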