Process a custom data object using Apache Spark - Java Big Data

Java examples for Big Data: Apache Spark

Description

Process a custom data object using Apache Spark

Demo Code



import java.io.Serializable;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SaveMode;

/**
 * Spark SQL demo (Spark 1.x API): builds a DataFrame from a JavaRDD of a
 * custom bean, then queries the same data through three sources — a text
 * file, a Parquet file, and JSON (file and in-memory RDD).
 */
public class Basic {
    /**
     * Serializable JavaBean used as the element type of a {@code JavaRDD};
     * Spark reflects on its getters to infer the DataFrame schema
     * (columns {@code name} and {@code age}).
     */
    public static class Person implements Serializable {
        private static final long serialVersionUID = 1L;

        private String name;
        private int age;

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        public int getAge() {
            return age;
        }

        public void setAge(int age) {
            this.age = age;
        }
    }

    /** Formats the first (string) column of a result row as "Name: &lt;value&gt;". */
    private static final Function<Row, String> NAME_FORMATTER =
            new Function<Row, String>() {
                @Override
                public String call(Row row) {
                    return "Name: " + row.getString(0);
                }
            };

    /**
     * Collects a single-string-column DataFrame to the driver and prints
     * each formatted row. Shared by the three teenager queries below.
     */
    private static void printNames(DataFrame result) {
        List<String> names = result.toJavaRDD().map(NAME_FORMATTER).collect();
        for (String name : names) {
            System.out.println(name);
        }
    }

    public static void main(String[] args) throws Exception {
        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL")
                .setMaster("local[*]");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
        SQLContext sqlContext = new SQLContext(ctx);

        // Parse "name, age" text lines into Person beans.
        JavaRDD<Person> people = ctx.textFile(
                "examples/src/main/resources/people.txt").map(
                new Function<String, Person>() {
                    @Override
                    public Person call(String line) {
                        String[] parts = line.split(",");

                        Person person = new Person();
                        person.setName(parts[0]);
                        person.setAge(Integer.parseInt(parts[1].trim()));

                        return person;
                    }
                });

        // Schema is inferred from the Person bean's getters.
        DataFrame schemaPeople = sqlContext.createDataFrame(people,
                Person.class);
        schemaPeople.registerTempTable("people");

        printNames(sqlContext
                .sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"));

        System.out.println("=== Data source: Parquet File ===");
        // Overwrite so re-running the demo does not fail with
        // "path people.parquet already exists".
        schemaPeople.write().mode(SaveMode.Overwrite).parquet("people.parquet");

        DataFrame parquetFile = sqlContext.read().parquet("people.parquet");

        parquetFile.registerTempTable("parquetFile");
        printNames(sqlContext
                .sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19"));

        System.out.println("=== Data source: JSON Dataset ===");
        String path = "examples/src/main/resources/people.json";
        DataFrame peopleFromJsonFile = sqlContext.read().json(path);

        peopleFromJsonFile.printSchema();
        // Note: re-registering "people" replaces the text-file-backed table.
        peopleFromJsonFile.registerTempTable("people");
        printNames(sqlContext
                .sql("SELECT name FROM people WHERE age >= 13 AND age <= 19"));

        // JSON loaded from an in-memory RDD; the nested "address" object
        // becomes a struct column addressable as address.city.
        List<String> jsonData = Arrays
                .asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
        JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
        DataFrame peopleFromJsonRDD = sqlContext.read().json(
                anotherPeopleRDD.rdd());

        peopleFromJsonRDD.printSchema();

        peopleFromJsonRDD.registerTempTable("people2");

        DataFrame peopleWithCity = sqlContext
                .sql("SELECT name, address.city FROM people2");
        List<String> nameAndCity = peopleWithCity.toJavaRDD()
                .map(new Function<Row, String>() {
                    @Override
                    public String call(Row row) {
                        return "Name: " + row.getString(0) + ", City: "
                                + row.getString(1);
                    }
                }).collect();
        for (String name : nameAndCity) {
            System.out.println(name);
        }

        ctx.stop();
    }
}

Related Tutorials