Java tutorial: importing to Parquet with Sqoop (TestParquetImport)
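The listing below is TestParquetImport, the Apache Sqoop test class for the --as-parquetfile import option. It imports rows from an in-memory HSQLDB table and then uses the Kite SDK Datasets API to check the Avro schema Sqoop generates, the compression codec, and the imported records.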
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.sqoop;

import com.cloudera.sqoop.testutil.CommonArgs;
import com.cloudera.sqoop.testutil.HsqldbTestServer;
import com.cloudera.sqoop.testutil.ImportJobTestCase;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.kitesdk.data.CompressionType;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Datasets;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Tests --as-parquetfile.
 */
public class TestParquetImport extends ImportJobTestCase {

  public static final Log LOG = LogFactory
      .getLog(TestParquetImport.class.getName());

  /**
   * Create the argv to pass to Sqoop.
   *
   * @return the argv as an array of strings.
   */
  protected String[] getOutputArgv(boolean includeHadoopFlags,
          String[] extraArgs) {
    ArrayList<String> args = new ArrayList<String>();

    if (includeHadoopFlags) {
      CommonArgs.addHadoopFlags(args);
    }

    args.add("--table");
    args.add(getTableName());
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--warehouse-dir");
    args.add(getWarehouseDir());
    args.add("--m");
    args.add("1");
    args.add("--split-by");
    args.add("INTFIELD1");
    args.add("--as-parquetfile");
    if (extraArgs != null) {
      args.addAll(Arrays.asList(extraArgs));
    }

    return args.toArray(new String[args.size()]);
  }

  protected String[] getOutputQueryArgv(boolean includeHadoopFlags,
          String[] extraArgs) {
    ArrayList<String> args = new ArrayList<String>();

    if (includeHadoopFlags) {
      CommonArgs.addHadoopFlags(args);
    }

    args.add("--query");
    args.add("SELECT * FROM " + getTableName() + " WHERE $CONDITIONS");
    args.add("--connect");
    args.add(HsqldbTestServer.getUrl());
    args.add("--target-dir");
    args.add(getWarehouseDir() + "/" + getTableName());
    args.add("--m");
    args.add("1");
    args.add("--split-by");
    args.add("INTFIELD1");
    args.add("--as-parquetfile");
    if (extraArgs != null) {
      args.addAll(Arrays.asList(extraArgs));
    }

    return args.toArray(new String[args.size()]);
  }

  public void testSnappyCompression() throws IOException {
    runParquetImportTest("snappy");
  }

  public void testDeflateCompression() throws IOException {
    runParquetImportTest("deflate");
  }

  private void runParquetImportTest(String codec) throws IOException {
    String[] types = {"BIT", "INTEGER", "BIGINT", "REAL", "DOUBLE",
        "VARCHAR(6)", "VARBINARY(2)", };
    String[] vals = {"true", "100", "200", "1.0", "2.0", "'s'", "'0102'", };
    createTableWithColTypes(types, vals);

    String[] extraArgs = {"--compression-codec", codec};
    runImport(getOutputArgv(true, extraArgs));

    assertEquals(CompressionType.forName(codec), getCompressionType());

    Schema schema = getSchema();
    assertEquals(Type.RECORD, schema.getType());
    List<Field> fields = schema.getFields();
    assertEquals(types.length, fields.size());
    checkField(fields.get(0), "DATA_COL0", Type.BOOLEAN);
    checkField(fields.get(1), "DATA_COL1", Type.INT);
    checkField(fields.get(2), "DATA_COL2", Type.LONG);
    checkField(fields.get(3), "DATA_COL3", Type.FLOAT);
    checkField(fields.get(4), "DATA_COL4", Type.DOUBLE);
    checkField(fields.get(5), "DATA_COL5", Type.STRING);
    checkField(fields.get(6), "DATA_COL6", Type.BYTES);

    DatasetReader<GenericRecord> reader = getReader();
    try {
      GenericRecord record1 = reader.next();
      assertNotNull(record1);
      assertEquals("DATA_COL0", true, record1.get("DATA_COL0"));
      assertEquals("DATA_COL1", 100, record1.get("DATA_COL1"));
      assertEquals("DATA_COL2", 200L, record1.get("DATA_COL2"));
      assertEquals("DATA_COL3", 1.0f, record1.get("DATA_COL3"));
      assertEquals("DATA_COL4", 2.0, record1.get("DATA_COL4"));
      assertEquals("DATA_COL5", "s", record1.get("DATA_COL5"));
      Object object = record1.get("DATA_COL6");
      assertTrue(object instanceof ByteBuffer);
      ByteBuffer b = ((ByteBuffer) object);
      assertEquals((byte) 1, b.get(0));
      assertEquals((byte) 2, b.get(1));
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  public void testOverrideTypeMapping() throws IOException {
    String[] types = { "INT" };
    String[] vals = { "10" };
    createTableWithColTypes(types, vals);

    String[] extraArgs = { "--map-column-java", "DATA_COL0=String" };
    runImport(getOutputArgv(true, extraArgs));

    Schema schema = getSchema();
    assertEquals(Type.RECORD, schema.getType());
    List<Field> fields = schema.getFields();
    assertEquals(types.length, fields.size());
    checkField(fields.get(0), "DATA_COL0", Type.STRING);

    DatasetReader<GenericRecord> reader = getReader();
    try {
      assertTrue(reader.hasNext());
      GenericRecord record1 = reader.next();
      assertEquals("DATA_COL0", "10", record1.get("DATA_COL0"));
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  public void testFirstUnderscoreInColumnName() throws IOException {
    String[] names = { "_NAME" };
    String[] types = { "INT" };
    String[] vals = { "1987" };
    createTableWithColTypesAndNames(names, types, vals);

    runImport(getOutputArgv(true, null));

    Schema schema = getSchema();
    assertEquals(Type.RECORD, schema.getType());
    List<Field> fields = schema.getFields();
    assertEquals(types.length, fields.size());
    checkField(fields.get(0), "__NAME", Type.INT);

    DatasetReader<GenericRecord> reader = getReader();
    try {
      assertTrue(reader.hasNext());
      GenericRecord record1 = reader.next();
      assertEquals("__NAME", 1987, record1.get("__NAME"));
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  public void testNonIdentCharactersInColumnName() throws IOException {
    String[] names = { "test_p-a+r/quet" };
    String[] types = { "INT" };
    String[] vals = { "2015" };
    createTableWithColTypesAndNames(names, types, vals);

    runImport(getOutputArgv(true, null));

    Schema schema = getSchema();
    assertEquals(Type.RECORD, schema.getType());
    List<Field> fields = schema.getFields();
    assertEquals(types.length, fields.size());
    checkField(fields.get(0), "TEST_P_A_R_QUET", Type.INT);

    DatasetReader<GenericRecord> reader = getReader();
    try {
      assertTrue(reader.hasNext());
      GenericRecord record1 = reader.next();
      assertEquals("TEST_P_A_R_QUET", 2015, record1.get("TEST_P_A_R_QUET"));
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  public void testNullableParquetImport() throws IOException, SQLException {
    String[] types = { "INT" };
    String[] vals = { null };
    createTableWithColTypes(types, vals);

    runImport(getOutputArgv(true, null));

    DatasetReader<GenericRecord> reader = getReader();
    try {
      assertTrue(reader.hasNext());
      GenericRecord record1 = reader.next();
      assertNull(record1.get("DATA_COL0"));
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  public void testQueryImport() throws IOException, SQLException {
    String[] types = { "INT" };
    String[] vals = { "1" };
    createTableWithColTypes(types, vals);

    runImport(getOutputQueryArgv(true, null));

    DatasetReader<GenericRecord> reader = getReader();
    try {
      assertTrue(reader.hasNext());
      GenericRecord record1 = reader.next();
      assertEquals(1, record1.get("DATA_COL0"));
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  public void testIncrementalParquetImport() throws IOException, SQLException {
    String[] types = { "INT" };
    String[] vals = { "1" };
    createTableWithColTypes(types, vals);

    runImport(getOutputArgv(true, null));
    runImport(getOutputArgv(true, new String[] { "--append" }));

    DatasetReader<GenericRecord> reader = getReader();
    try {
      assertTrue(reader.hasNext());
      GenericRecord record1 = reader.next();
      assertEquals(1, record1.get("DATA_COL0"));
      record1 = reader.next();
      assertEquals(1, record1.get("DATA_COL0"));
      assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  public void testOverwriteParquetDatasetFail()
      throws IOException, SQLException {
    String[] types = { "INT" };
    String[] vals = {};
    createTableWithColTypes(types, vals);

    runImport(getOutputArgv(true, null));
    try {
      runImport(getOutputArgv(true, null));
      fail("");
    } catch (IOException ex) {
      // ok
    }
  }

  private CompressionType getCompressionType() {
    return getDataset().getDescriptor().getCompressionType();
  }

  private Schema getSchema() {
    return getDataset().getDescriptor().getSchema();
  }
  private DatasetReader<GenericRecord> getReader() {
    return getDataset().newReader();
  }

  private Dataset<GenericRecord> getDataset() {
    String uri = "dataset:file:" + getTablePath();
    return Datasets.load(uri, GenericRecord.class);
  }

  @Override
  public void tearDown() {
    super.tearDown();
    String uri = "dataset:file:" + getTablePath();
    if (Datasets.exists(uri)) {
      Datasets.delete(uri);
    }
  }

  private void checkField(Field field, String name, Type type) {
    assertEquals(name, field.name());
    assertEquals(Type.UNION, field.schema().getType());
    assertEquals(Type.NULL, field.schema().getTypes().get(0).getType());
    assertEquals(type, field.schema().getTypes().get(1).getType());
  }

}
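For readers who want to inspect an imported dataset outside of JUnit, the helpers at the bottom of the class (getDataset() and getReader()) boil down to two Kite SDK calls: Datasets.load(uri, GenericRecord.class) and newReader(). Below is a minimal, hypothetical sketch of a standalone reader built from those same calls; the class name and the dataset URI are illustrative assumptions, not part of Sqoop or its tests.

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.Datasets;

public class DumpParquetDataset {

  public static void main(String[] args) {
    // Hypothetical URI: the location a previous Sqoop --as-parquetfile
    // import wrote to (warehouse dir plus table name); adjust as needed.
    String uri = "dataset:file:/tmp/sqoop-warehouse/mytable";

    // Same calls as getDataset()/getReader() in the test class above.
    Dataset<GenericRecord> dataset = Datasets.load(uri, GenericRecord.class);
    DatasetReader<GenericRecord> reader = dataset.newReader();
    try {
      // Each imported row is exposed as an Avro GenericRecord.
      while (reader.hasNext()) {
        GenericRecord record = reader.next();
        System.out.println(record);
      }
    } finally {
      reader.close();
    }
  }
}

Closing the reader in a finally block mirrors the cleanup pattern used throughout the tests, and the hasNext()/next() loop is the same iteration the assertions rely on.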