com.thinkbiganalytics.spark.dataquality.checker.DataQualityChecker.java Source code

Java tutorial

Introduction

Here is the source code for com.thinkbiganalytics.spark.dataquality.checker.DataQualityChecker.java

Source

package com.thinkbiganalytics.spark.dataquality.checker;

/*-
 * #%L
 * kylo-spark-job-dataquality-app
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.spark.DataSet;
import com.thinkbiganalytics.spark.SparkContextService;

import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.hive.HiveContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;
import org.springframework.stereotype.Component;

import java.util.List;

/**
 * Perform data quality checks
 */

/*
TODO: Implement full functionality
    
This implementation provides the skeleton layout to enable implementation of the full functionality.
 It demonstrates using Spark to run a row count for a schema.table in Hive.
 Tested on both Spark 1 and Spark 2.
 Please refer to README for commands to run application.
 */

@Component
public class DataQualityChecker {

    private static final Logger log = LoggerFactory.getLogger(DataQualityChecker.class);

    @Autowired
    private SparkContextService scs;

    private HiveContext hiveContext;
    private String categoryName;
    private String feedName;

    public static void main(String[] args) {

        log.info("Running DataQualityChecker with these command line args: " + StringUtils.join(args, ","));

        if (args.length < 2) {
            System.out.println("Expected command line args: <hive-schema-name> <hive-table-name>");
            System.exit(1);
        }

        try {
            ApplicationContext ctx = new AnnotationConfigApplicationContext("com.thinkbiganalytics.spark");
            DataQualityChecker app = ctx.getBean(DataQualityChecker.class);
            app.setArguments(args[0], args[1]);
            app.doDataQualityChecks();
        } catch (Exception e) {
            log.error("Failed to perform data quality checks: {}", e.getMessage());
            System.exit(1);
        }

        log.info("DataQualityChecker has finished.");
    }

    public void setArguments(String categoryName, String feedName) {
        this.categoryName = categoryName;
        this.feedName = feedName;
    }

    protected HiveContext getHiveContext() {
        return hiveContext;
    }

    public void doDataQualityChecks() {
        try {
            SparkContext sparkContext = SparkContext.getOrCreate();
            hiveContext = new org.apache.spark.sql.hive.HiveContext(sparkContext);

            String sql = "SELECT COUNT(*) FROM " + categoryName + "." + feedName;
            log.info("Executing query {}", sql);
            DataSet dataFrame = scs.sql(getHiveContext(), sql);
            List<Row> resultRows = dataFrame.collectAsList();
            long rowCount = 0;
            if (resultRows.size() > 0) {
                rowCount = resultRows.get(0).getLong(0);
            }
            log.info("Total rows in {}.{}: {}", categoryName, feedName, rowCount);
        } catch (Exception e) {
            log.error("An error occurred during running data quality checks: {}", e.getMessage());
            System.exit(1);
        }
    }

}