Java tutorial
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package fr.jetoile.hadoopunit.sample;

import com.ninja_squad.dbsetup.Operations;
import com.ninja_squad.dbsetup.operation.Operation;
import fr.jetoile.hadoopunit.HadoopUnitConfig;
import fr.jetoile.hadoopunit.exception.BootstrapException;
import fr.jetoile.hadoopunit.exception.NotFoundServiceException;
import fr.jetoile.hadoopunit.test.hdfs.HdfsUtils;
import fr.jetoile.hadoopunit.test.hive.HiveConnectionUtils;
import fr.jetoile.hadoopunit.test.hive.HiveSetup;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.junit.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import static com.ninja_squad.dbsetup.Operations.sequenceOf;
import static com.ninja_squad.dbsetup.Operations.sql;
import static org.fest.assertions.Assertions.assertThat;

@Ignore
public class SparkJobTest {

    static private Logger LOGGER = LoggerFactory.getLogger(SparkJobTest.class);

    private static Configuration configuration;

    public static Operation CREATE_TABLES = null;
    public static Operation DROP_TABLES = null;

    @BeforeClass
    public static void setUp() throws BootstrapException, SQLException, ClassNotFoundException, NotFoundServiceException {
        try {
            configuration = new PropertiesConfiguration(HadoopUnitConfig.DEFAULT_PROPS_FILE);
        } catch (ConfigurationException e) {
            throw new BootstrapException("bad config", e);
        }

        CREATE_TABLES = sequenceOf(
                sql("CREATE EXTERNAL TABLE IF NOT EXISTS default.test(id INT, value STRING) " +
                        " ROW FORMAT DELIMITED FIELDS TERMINATED BY ';'" +
                        " STORED AS TEXTFILE" +
                        " LOCATION 'hdfs://localhost:" + configuration.getInt(HadoopUnitConfig.HDFS_NAMENODE_PORT_KEY) + "/khanh/test'"),
                sql("CREATE EXTERNAL TABLE IF NOT EXISTS default.test_parquet(id INT, value STRING) " +
                        " ROW FORMAT DELIMITED FIELDS TERMINATED BY ';'" +
                        " STORED AS PARQUET" +
                        " LOCATION 'hdfs://localhost:" + configuration.getInt(HadoopUnitConfig.HDFS_NAMENODE_PORT_KEY) + "/khanh/test_parquet'"));

        DROP_TABLES = sequenceOf(
                sql("DROP TABLE IF EXISTS default.test"),
                sql("DROP TABLE IF EXISTS default.test_parquet"));
    }

    @Before
    public void before() throws IOException, URISyntaxException {
        FileSystem fileSystem = HdfsUtils.INSTANCE.getFileSystem();

        fileSystem.mkdirs(new Path("/khanh/test"));
        fileSystem.mkdirs(new Path("/khanh/test_parquet"));
        fileSystem.copyFromLocalFile(new Path(SparkJobTest.class.getClassLoader().getResource("test.csv").toURI()), new Path("/khanh/test/test.csv"));

        new HiveSetup(HiveConnectionUtils.INSTANCE.getDestination(), Operations.sequenceOf(CREATE_TABLES)).launch();
    }

    @After
    public void clean() throws IOException {
        new HiveSetup(HiveConnectionUtils.INSTANCE.getDestination(), Operations.sequenceOf(DROP_TABLES)).launch();

        FileSystem fileSystem = HdfsUtils.INSTANCE.getFileSystem();
        fileSystem.delete(new Path("/khanh"), true);
    }

    @Test
    public void upload_file_into_hdfs_and_map_hive_should_success() throws SQLException {
        Statement stmt = HiveConnectionUtils.INSTANCE.getConnection().createStatement();

        String select = "SELECT * FROM default.test";
        ResultSet resultSet = stmt.executeQuery(select);
        while (resultSet.next()) {
            int id = resultSet.getInt(1);
            String value = resultSet.getString(2);
            assertThat(id).isNotNull();
            assertThat(value).isNotNull();
        }
    }

    @Test
    public void spark_should_read_hive() throws SQLException {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("test");
        JavaSparkContext context = new JavaSparkContext(conf);

        SparkJob sparkJob = new SparkJob(context);
        DataFrame sql = sparkJob.run();

//        sql.printSchema();

        Row[] rows = sql.collect();
        for (int i = 1; i < 4; i++) {
            Row row = rows[i - 1];
            assertThat("value" + i).isEqualToIgnoringCase(row.getString(1));
            assertThat(i).isEqualTo(row.getInt(0));
        }

        context.close();
    }

    @Test
    public void spark_should_create_a_parquet_file() throws SQLException, IOException {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("test");
        JavaSparkContext context = new JavaSparkContext(conf);

        SparkJob sparkJob = new SparkJob(context);
        DataFrame sql = sparkJob.run();

        // write the DataFrame read from Hive as a Parquet file on HDFS
        sql.write().parquet("hdfs://localhost:" + configuration.getInt(HadoopUnitConfig.HDFS_NAMENODE_PORT_KEY) + "/khanh/test_parquet/file.parquet");

        FileSystem fileSystem = HdfsUtils.INSTANCE.getFileSystem();
        assertThat(fileSystem.exists(new Path("hdfs://localhost:" + configuration.getInt(HadoopUnitConfig.HDFS_NAMENODE_PORT_KEY) + "/khanh/test_parquet/file.parquet"))).isTrue();

        context.close();
    }

    @Test
    public void spark_should_read_parquet_file() throws IOException {
        //given
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("test");
        JavaSparkContext context = new JavaSparkContext(conf);

        SparkJob sparkJob = new SparkJob(context);
        DataFrame sql = sparkJob.run();

        sql.write().parquet("hdfs://localhost:" + configuration.getInt(HadoopUnitConfig.HDFS_NAMENODE_PORT_KEY) + "/khanh/test_parquet/file.parquet");

        FileSystem fileSystem = HdfsUtils.INSTANCE.getFileSystem();
        assertThat(fileSystem.exists(new Path("hdfs://localhost:" + configuration.getInt(HadoopUnitConfig.HDFS_NAMENODE_PORT_KEY) + "/khanh/test_parquet/file.parquet"))).isTrue();

        context.close();

        //when
        JavaSparkContext context1 = new JavaSparkContext(conf);
        SparkJob sparkJob1 = new SparkJob(context1);
        DataFrame sql1 = sparkJob1.run();
        DataFrame select = sql1.select("id", "value");

        Row[] rows = select.collect();

        //then
        for (int i = 1; i < 4; i++) {
            Row row = rows[i - 1];
            assertThat("value" + i).isEqualToIgnoringCase(row.getString(1));
            assertThat(i).isEqualTo(row.getInt(0));
        }

        // close the second context opened for the read-back step (the first one is already closed)
        context1.close();
    }
}
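
The tests above drive a SparkJob class whose source is not listed in this tutorial. Below is a minimal sketch of what such a class could look like, assuming it simply wraps a Spark 1.x HiveContext and reads back the default.test table that before() mapped onto /khanh/test/test.csv. The class name, constructor, and run() signature are taken from the test; the body is an assumption, not the project's actual implementation. For the assertions to pass, the test.csv resource would contain three semicolon-delimited rows along the lines of 1;value1, 2;value2, 3;value3.

package fr.jetoile.hadoopunit.sample;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.hive.HiveContext;

// Hypothetical sketch only: the SparkJob shipped with the sample may differ.
public class SparkJob {

    private final JavaSparkContext context;

    public SparkJob(JavaSparkContext context) {
        this.context = context;
    }

    // Reads the Hive table mapped onto /khanh/test/test.csv and returns it as a
    // DataFrame, ordered by id so the tests can index rows 1..3 deterministically.
    public DataFrame run() {
        HiveContext hiveContext = new HiveContext(context.sc());
        return hiveContext.sql("SELECT id, value FROM default.test ORDER BY id");
    }
}

This assumes the HiveContext is configured (for example through hive-site.xml on the test classpath) to point at the metastore started by Hadoop Unit, which is environment-specific and not shown here.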