flat map JavaRDD for apache spark - Java Big Data

Java examples for Big Data:apache spark

Description

flat map JavaRDD for apache spark

Demo Code



import java.util.Iterator;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.Function;
import scala.Tuple2;
import java.util.Arrays;
import java.util.regex.Pattern;

public class Main {

  private static final Pattern COMMA = Pattern.compile(",");

  public static void main(String[] args) throws Exception {

    String inputPath = args[0];/* ww w  .  j  a v a 2  s .co  m*/
    String outputPath = args[1];

    JavaSparkContext sc = new JavaSparkContext();
    JavaRDD<String> lines = sc.textFile(inputPath);

    // Get key pair: key has two components (state, category)
    JavaRDD<String> stateCategory = lines.flatMap(line -> {
      String[] data = COMMA.split(line);
      List<String> results = new ArrayList<>();

      String state = data[1];
      String category = data[3];
      results.add(state + " " + category);
      return results.iterator();
    });

    // Get the amount for each key pair
    JavaRDD<Integer> amount = lines.flatMap(line -> {
      String[] data = COMMA.split(line);
      List<Integer> results = new ArrayList<>();

      int returnAmount = (int) Float.parseFloat(data[4]);
      results.add(returnAmount);
      return results.iterator();
    });

    JavaPairRDD<String, Integer> stateCategoryAmount = stateCategory.zip(amount);
    stateCategory.collect();
    JavaPairRDD<String, Integer> counts = stateCategoryAmount.reduceByKey((x, y) -> x + y);
    JavaPairRDD<String, Integer> sorted = counts.sortByKey();

    sorted.coalesce(1).saveAsTextFile(outputPath);
    sc.stop();
  }
}

Related Tutorials