Multiple Vertical CSV File Analysis In Spark SQL - Java Big Data

Java examples for Big Data:apache spark

Description

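Multiple Vertical CSV File Analysis In Spark SQL: two CSV files that each hold a subset of the columns are joined on their shared "id" column and then queried with Spark SQL.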

Demo Code



import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

/**
 * Demo that loads two CSV files, joins them on their shared {@code id}
 * column, and queries the combined rows with Spark SQL.
 */
public class MultipleVerticleCSVFileAnalysisInSparkSQL {

    /**
     * Runs the demo: reads both CSV parts, joins them on {@code id}, registers
     * a raw and a typed temporary view, and prints the rows matching the
     * temperature / CO2 filter.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final SparkSession sparkSession = SparkSession.builder()
                .appName("Spark CSV Analysis Demo").master("local[5]")
                .getOrCreate();

        try {
            // Use the reader returned by option() so the header setting is
            // visibly applied to both csv() calls below.
            final DataFrameReader dataFrameReader = sparkSession.read()
                    .option("header", "true");

            final Dataset<Row> csvDataFramePart1 = dataFrameReader
                    .csv("src/main/resources/data-part1.csv");
            final Dataset<Row> csvDataFramePart2 = dataFrameReader
                    .csv("src/main/resources/data-part2.csv");

            // Join on the single shared key column.
            final Dataset<Row> csvDataFrame = csvDataFramePart1.join(
                    csvDataFramePart2, "id");

            csvDataFrame.printSchema();
            csvDataFrame.createOrReplaceTempView("ROOM_OCCUPANCY_RAW");

            // The final query filters on Temperature and CO2, so both must be
            // part of this projection; otherwise ROOM_OCCUPANCY would not
            // contain them and the query below would fail to resolve.
            // They are cast to double so the numeric comparisons do not rely
            // on implicit string coercion.
            // NOTE(review): assumes Temperature and CO2 are columns in one of
            // the two CSV parts - confirm against the data files.
            final Dataset<Row> roomOccupancyData = sparkSession
                    .sql("SELECT CAST(id as int) id, CAST(date as string) date, "
                            + "CAST(Temperature as double) Temperature, "
                            + "CAST(CO2 as double) CO2, "
                            + "Occupancy FROM ROOM_OCCUPANCY_RAW");

            roomOccupancyData.printSchema();
            roomOccupancyData.createOrReplaceTempView("ROOM_OCCUPANCY");
            sparkSession
                    .sql("SELECT * FROM ROOM_OCCUPANCY WHERE Temperature >= 23.6  "
                            + "AND CO2 BETWEEN 920 and 950").show();
        } finally {
            // Always stop the session, even when a read or query fails.
            sparkSession.stop();
        }
    }
}

Related Tutorials