Java tutorial: a Spark data quality checker for Kylo

This walkthrough covers DataQualityChecker, a Spring-wired Spark job from the kylo-spark-job-dataquality-app module. The skeleton boots a Spring application context, resolves the checker bean, and runs a row count against a schema.table in Hive.
package com.thinkbiganalytics.spark.dataquality.checker;

/*-
 * #%L
 * kylo-spark-job-dataquality-app
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.spark.DataSet;
import com.thinkbiganalytics.spark.SparkContextService;

import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;
import org.springframework.stereotype.Component;

import java.util.List;

/**
 * Performs data quality checks.
 */
/*
 TODO: Implement full functionality.
 This implementation provides the skeleton layout to enable implementation of the
 full functionality. It demonstrates using Spark to run a row count for a
 schema.table in Hive. Tested on both Spark 1 and Spark 2.
 Please refer to the README for the commands to run the application.
 */
@Component
public class DataQualityChecker {

    private static final Logger log = LoggerFactory.getLogger(DataQualityChecker.class);

    @Autowired
    private SparkContextService scs;

    private HiveContext hiveContext;
    private String categoryName;
    private String feedName;

    public static void main(String[] args) {
        log.info("Running DataQualityChecker with these command line args: {}", StringUtils.join(args, ","));

        if (args.length < 2) {
            System.out.println("Expected command line args: <hive-schema-name> <hive-table-name>");
            System.exit(1);
        }

        try {
            // Scan the com.thinkbiganalytics.spark package for Spring components
            ApplicationContext ctx = new AnnotationConfigApplicationContext("com.thinkbiganalytics.spark");
            DataQualityChecker app = ctx.getBean(DataQualityChecker.class);
            app.setArguments(args[0], args[1]);
            app.doDataQualityChecks();
        } catch (Exception e) {
            log.error("Failed to perform data quality checks", e);
            System.exit(1);
        }

        log.info("DataQualityChecker has finished.");
    }

    public void setArguments(String categoryName, String feedName) {
        this.categoryName = categoryName;
        this.feedName = feedName;
    }

    protected HiveContext getHiveContext() {
        return hiveContext;
    }

    public void doDataQualityChecks() {
        try {
            SparkContext sparkContext = SparkContext.getOrCreate();
            hiveContext = new HiveContext(sparkContext);

            // NOTE: the schema and table names come straight from the command line;
            // a production version should validate them before building SQL.
            String sql = "SELECT COUNT(*) FROM " + categoryName + "." + feedName;
            log.info("Executing query {}", sql);

            DataSet dataFrame = scs.sql(getHiveContext(), sql);
            List<Row> resultRows = dataFrame.collectAsList();

            long rowCount = 0;
            if (!resultRows.isEmpty()) {
                rowCount = resultRows.get(0).getLong(0);
            }
            log.info("Total rows in {}.{}: {}", categoryName, feedName, rowCount);
        } catch (Exception e) {
            log.error("An error occurred while running data quality checks", e);
            System.exit(1);
        }
    }
}
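The skeleton only runs a row count, so a natural next step is to add further checks using the same SparkContextService.sql(...) pattern. Below is a minimal sketch of one such check, a null count for a single column, assuming it is added as a method of the DataQualityChecker class above. The countNulls name and its columnName parameter are hypothetical additions for illustration, not part of the Kylo codebase.

    // Hypothetical extension: count rows where the given column is NULL.
    // Assumes scs and hiveContext have been initialized as in
    // doDataQualityChecks() above; columnName should be validated before
    // being concatenated into SQL, just like the schema and table names.
    protected long countNulls(String columnName) {
        String sql = "SELECT COUNT(*) FROM " + categoryName + "." + feedName
                     + " WHERE " + columnName + " IS NULL";
        log.info("Executing query {}", sql);

        DataSet dataFrame = scs.sql(getHiveContext(), sql);
        List<Row> resultRows = dataFrame.collectAsList();
        return resultRows.isEmpty() ? 0 : resultRows.get(0).getLong(0);
    }

Calling, say, countNulls("customer_id") from doDataQualityChecks() would then report how many rows are missing a key column, and each additional check can follow the same query-collect-inspect shape.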