Java examples for Big Data: Apache Spark
Aggregate a JavaRDD by key with Apache Spark (per-key sum over CSV input)
import java.util.Iterator; import java.util.ArrayList; import java.util.List; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.function.Function; import scala.Tuple2; import java.util.Arrays; import java.util.regex.Pattern; public class Main { private static final Pattern COMMA = Pattern.compile(","); public static void main(String[] args) throws Exception { String inputPath = args[0];/* ww w . j a v a 2 s .co m*/ String outputPath = args[1]; JavaSparkContext sc = new JavaSparkContext(); JavaRDD<String> lines = sc.textFile(inputPath); // Get key pair: key has two components (state, category) JavaRDD<String> stateCategory = lines.flatMap(line -> { String[] data = COMMA.split(line); List<String> results = new ArrayList<>(); String state = data[1]; String category = data[3]; results.add(state + " " + category); return results.iterator(); }); // Get the amount for each key pair JavaRDD<Integer> amount = lines.flatMap(line -> { String[] data = COMMA.split(line); List<Integer> results = new ArrayList<>(); int returnAmount = (int) Float.parseFloat(data[4]); results.add(returnAmount); return results.iterator(); }); JavaPairRDD<String, Integer> stateCategoryAmount = stateCategory.zip(amount); stateCategory.collect(); JavaPairRDD<String, Integer> counts = stateCategoryAmount.reduceByKey((x, y) -> x + y); JavaPairRDD<String, Integer> sorted = counts.sortByKey(); sorted.coalesce(1).saveAsTextFile(outputPath); sc.stop(); } }