Java examples for Big Data: Apache Spark
Process a custom data object with Apache Spark: map text lines to a JavaBean, build a DataFrame from it, and query it with Spark SQL alongside Parquet and JSON data sources.
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class Basic {

    // Simple JavaBean used as the element type of the RDD; Spark SQL infers
    // the schema (name, age) from its getters and setters.
    public static class Person implements Serializable {
        private String name;
        private int age;

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        public int getAge() {
            return age;
        }

        public void setAge(int age) {
            this.age = age;
        }
    }

    public static void main(String[] args) throws Exception {
        SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL")
                .setMaster("local[*]");
        JavaSparkContext ctx = new JavaSparkContext(sparkConf);
        SQLContext sqlContext = new SQLContext(ctx);

        // Load a text file and map each "name,age" line to a Person bean.
        JavaRDD<Person> people = ctx.textFile(
                "examples/src/main/resources/people.txt").map(
                new Function<String, Person>() {
                    @Override
                    public Person call(String line) {
                        String[] parts = line.split(",");
                        Person person = new Person();
                        person.setName(parts[0]);
                        person.setAge(Integer.parseInt(parts[1].trim()));
                        return person;
                    }
                });

        // Create a DataFrame from the RDD of beans, register it as a
        // temporary table, and query it with SQL.
        DataFrame schemaPeople = sqlContext.createDataFrame(people, Person.class);
        schemaPeople.registerTempTable("people");

        DataFrame teenagers = sqlContext
                .sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
        List<String> teenagerNames = teenagers.toJavaRDD()
                .map(new Function<Row, String>() {
                    @Override
                    public String call(Row row) {
                        return "Name: " + row.getString(0);
                    }
                }).collect();
        for (String name : teenagerNames) {
            System.out.println(name);
        }

        System.out.println("=== Data source: Parquet File ===");
        // Write the DataFrame out as Parquet, read it back, and query it.
        schemaPeople.write().parquet("people.parquet");
        DataFrame parquetFile = sqlContext.read().parquet("people.parquet");
        parquetFile.registerTempTable("parquetFile");
        DataFrame teenagers2 = sqlContext
                .sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
        teenagerNames = teenagers2.toJavaRDD()
                .map(new Function<Row, String>() {
                    @Override
                    public String call(Row row) {
                        return "Name: " + row.getString(0);
                    }
                }).collect();
        for (String name : teenagerNames) {
            System.out.println(name);
        }

        System.out.println("=== Data source: JSON Dataset ===");
        // Load a JSON file; the schema is inferred from the JSON documents.
        String path = "examples/src/main/resources/people.json";
        DataFrame peopleFromJsonFile = sqlContext.read().json(path);
        peopleFromJsonFile.printSchema();
        peopleFromJsonFile.registerTempTable("people");
        DataFrame teenagers3 = sqlContext
                .sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");
        teenagerNames = teenagers3.toJavaRDD()
                .map(new Function<Row, String>() {
                    @Override
                    public String call(Row row) {
                        return "Name: " + row.getString(0);
                    }
                }).collect();
        for (String name : teenagerNames) {
            System.out.println(name);
        }

        // A DataFrame can also be built from an RDD of JSON strings,
        // including nested objects such as the address field here.
        List<String> jsonData = Arrays
                .asList("{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
        JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
        DataFrame peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
        peopleFromJsonRDD.printSchema();
        peopleFromJsonRDD.registerTempTable("people2");
        DataFrame peopleWithCity = sqlContext
                .sql("SELECT name, address.city FROM people2");
        List<String> nameAndCity = peopleWithCity.toJavaRDD()
                .map(new Function<Row, String>() {
                    @Override
                    public String call(Row row) {
                        return "Name: " + row.getString(0) + ", City: "
                                + row.getString(1);
                    }
                }).collect();
        for (String name : nameAndCity) {
            System.out.println(name);
        }

        ctx.stop();
    }
}
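The program expects the sample data files that ship with the Spark distribution under examples/src/main/resources. Their exact contents depend on the Spark release; the following is an assumption shown only for illustration of the expected format:

people.txt (one "name, age" pair per line):
Michael, 29
Andy, 30
Justin, 19

people.json (one JSON document per line):
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

With input like that, the teenager queries print only Justin, and the final JSON-string example prints "Name: Yin, City: Columbus". Note that this code targets the Spark 1.x Java API (SQLContext, DataFrame, registerTempTable); Spark 2.x replaced these with SparkSession, Dataset<Row>, and createOrReplaceTempView.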