/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.hadoop.inputformat;

import static org.apache.beam.sdk.io.common.IOITHelper.executeWithRetry;
import static org.apache.beam.sdk.io.common.IOITHelper.readIOTestPipelineOptions;
import static org.apache.beam.sdk.io.common.TestRow.DeterministicallyConstructTestRowFn;
import static org.apache.beam.sdk.io.common.TestRow.SelectNameFn;
import static org.apache.beam.sdk.io.common.TestRow.getExpectedHashForRowCount;
import static org.apache.beam.sdk.io.hadoop.inputformat.TestRowDBWritable.PrepareStatementFromTestRow;

import java.sql.SQLException;
import org.apache.beam.sdk.io.GenerateSequence;
import org.apache.beam.sdk.io.common.DatabaseTestHelper;
import org.apache.beam.sdk.io.common.HashingFn;
import org.apache.beam.sdk.io.common.PostgresIOTestPipelineOptions;
import org.apache.beam.sdk.io.common.TestRow;
import org.apache.beam.sdk.io.hadoop.SerializableConfiguration;
import org.apache.beam.sdk.io.jdbc.JdbcIO;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Reshuffle;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.values.PCollection;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.postgresql.ds.PGSimpleDataSource;

/**
 * A test of {@link org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIO} on an
 * independent postgres instance.
 *
 * <p>This test requires a running instance of Postgres. Pass in connection information using
 * PipelineOptions:
 *
 * <pre>
 * ./gradlew integrationTest -p sdks/java/io/hadoop/input-format/
 *  -DintegrationTestPipelineOptions='[
 *  "--postgresServerName=1.2.3.4",
 *  "--postgresUsername=postgres",
 *  "--postgresDatabaseName=myfancydb",
 *  "--postgresPassword=mypass",
 *  "--postgresSsl=false",
 *  "--numberOfRecords=1000" ]'
 *  --tests org.apache.beam.sdk.io.hadoop.inputformat.HadoopInputFormatIOIT
 *  -DintegrationTestRunner=direct
 * </pre>
 *
 * <p>Please see 'build_rules.gradle' file for instructions regarding running this test using Beam
 * performance testing framework.
 */
public class HadoopInputFormatIOIT {

  private static PGSimpleDataSource dataSource;
  private static Integer numberOfRows;
  private static String tableName;
  private static SerializableConfiguration hadoopConfiguration;

  @Rule public TestPipeline writePipeline = TestPipeline.create();

  @Rule public TestPipeline readPipeline = TestPipeline.create();

  @BeforeClass
  public static void setUp() throws Exception {
    PostgresIOTestPipelineOptions options =
        readIOTestPipelineOptions(PostgresIOTestPipelineOptions.class);

    dataSource = DatabaseTestHelper.getPostgresDataSource(options);
    numberOfRows = options.getNumberOfRecords();
    tableName = DatabaseTestHelper.getTestTableName("HadoopInputFormatIOIT");

    executeWithRetry(HadoopInputFormatIOIT::createTable);
    setupHadoopConfiguration(options);
  }

  private static void createTable() throws SQLException {
    DatabaseTestHelper.createTable(dataSource, tableName);
  }

  private static void setupHadoopConfiguration(PostgresIOTestPipelineOptions options) {
    Configuration conf = new Configuration();
    // Point DBInputFormat at the Postgres test table and describe the columns it will read.
    DBConfiguration.configureDB(
        conf,
        "org.postgresql.Driver",
        DatabaseTestHelper.getPostgresDBUrl(options),
        options.getPostgresUsername(),
        options.getPostgresPassword());
    conf.set(DBConfiguration.INPUT_TABLE_NAME_PROPERTY, tableName);
    conf.setStrings(DBConfiguration.INPUT_FIELD_NAMES_PROPERTY, "id", "name");
    conf.set(DBConfiguration.INPUT_ORDER_BY_PROPERTY, "id ASC");
    conf.setClass(DBConfiguration.INPUT_CLASS_PROPERTY, TestRowDBWritable.class, DBWritable.class);

    // Key, value, and InputFormat classes consumed by HadoopInputFormatIO.
    conf.setClass("key.class", LongWritable.class, Object.class);
    conf.setClass("value.class", TestRowDBWritable.class, Object.class);
    conf.setClass("mapreduce.job.inputformat.class", DBInputFormat.class, InputFormat.class);

    hadoopConfiguration = new SerializableConfiguration(conf);
  }

  @AfterClass
  public static void tearDown() throws Exception {
    executeWithRetry(HadoopInputFormatIOIT::deleteTable);
  }

  private static void deleteTable() throws SQLException {
    DatabaseTestHelper.deleteTable(dataSource, tableName);
  }

  @Test
  public void readUsingHadoopInputFormat() {
    // Populate the test table with deterministic rows using JdbcIO.
    writePipeline
        .apply("Generate sequence", GenerateSequence.from(0).to(numberOfRows))
        .apply("Produce db rows", ParDo.of(new DeterministicallyConstructTestRowFn()))
        .apply("Prevent fusion before writing", Reshuffle.viaRandomKey())
        .apply(
            "Write using JDBCIO",
            JdbcIO.<TestRow>write()
                .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(dataSource))
                .withStatement(String.format("insert into %s values(?, ?)", tableName))
                .withPreparedStatementSetter(new PrepareStatementFromTestRow()));

    writePipeline.run().waitUntilFinish();

    // Read the rows back through HadoopInputFormatIO (via DBInputFormat) and verify their
    // combined hash against the expected value for this row count.
    PCollection<String> consolidatedHashcode =
        readPipeline
            .apply(
                "Read using HadoopInputFormat",
                HadoopInputFormatIO.<LongWritable, TestRowDBWritable>read()
                    .withConfiguration(hadoopConfiguration.get()))
            .apply("Get values only", Values.create())
            .apply("Values as string", ParDo.of(new SelectNameFn()))
            .apply("Calculate hashcode", Combine.globally(new HashingFn()));

    PAssert.thatSingleton(consolidatedHashcode).isEqualTo(getExpectedHashForRowCount(numberOfRows));

    readPipeline.run().waitUntilFinish();
  }
}