Create an RDD backed by a MongoDB collection using Apache Spark - Java Big Data

Java examples for Big Data: Apache Spark

Description

Create an RDD backed by a MongoDB collection using Apache Spark. The demo below reads road-accident documents from the incidenti_new.incidenti collection through the MongoDB Hadoop Connector, counts them per municipality (the "Gruppo" field), and writes the totals back to another MongoDB collection.

Demo Code

    import com.mongodb.hadoop.MongoInputFormat;
    import com.mongodb.hadoop.MongoOutputFormat;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaPairRDD;
    import org.apache.spark.api.java.JavaSparkContext;
    import org.bson.BSONObject;
    import org.bson.BasicBSONObject;

    import scala.Tuple2;

    import java.util.ArrayList;
    import java.util.List;

    public class IncidentiMunicipi {

        public void run() {
            JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setMaster("local[4]").setAppName("AccidentRome"));

            // Set configuration options for the MongoDB Hadoop Connector.
            Configuration mongodbConfig = new Configuration();
            // MongoInputFormat allows us to read from a live MongoDB instance.
            // We could also use BSONFileInputFormat to read BSON snapshots
            // (a sketch of that variant follows the demo).
            mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");

            // MongoDB connection string naming a collection to use.
            // If using BSON, use "mapred.input.dir" to configure the directory
            // where BSON files are located instead.
            mongodbConfig.set("mongo.input.uri",
                "mongodb://localhost:27017/incidenti_new.incidenti");

            // Create an RDD backed by the MongoDB collection. Each record is a
            // pair of the document's _id and the document itself.
            JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
                mongodbConfig,            // Configuration
                MongoInputFormat.class,   // InputFormat: read from a live cluster.
                Object.class,             // Key class
                BSONObject.class          // Value class
                );
            // Emit a (municipality, 1) pair for every document whose "Gruppo"
            // field is present and non-empty. Check for null before calling
            // trim(), otherwise a document missing the field would throw a
            // NullPointerException.
            JavaPairRDD<String, Integer> municipality = documents.flatMapToPair(
                t -> {
                    List<Tuple2<String, Integer>> temp = new ArrayList<Tuple2<String, Integer>>();
                    Object raw = t._2.get("Gruppo");
                    if (raw != null) {
                        String group = raw.toString().trim();
                        if (!group.isEmpty()) {
                            temp.add(new Tuple2<String, Integer>(group, 1));
                        }
                    }
                    return temp;
                });

            // Sum the 1s per municipality, e.g. ("Municipio I", 1) and
            // ("Municipio I", 1) combine into ("Municipio I", 2).
            JavaPairRDD<String, Integer> counts = municipality.reduceByKey(
                (a, b) -> a + b);

            // Wrap each (municipality, count) pair in a BSON document. Every
            // input pair yields exactly one output pair, so mapToPair suffices;
            // the null key leaves _id assignment to MongoDB on insert.
            JavaPairRDD<Object, BSONObject> fin = counts.mapToPair(
                t -> {
                    BSONObject bo = new BasicBSONObject();
                    bo.put("Municipio", t._1);
                    bo.put("NumIncidenti", t._2);
                    return new Tuple2<Object, BSONObject>(null, bo);
                });

            // Create a separate Configuration for saving data back to MongoDB.
            Configuration outputConfig = new Configuration();
            outputConfig.set("mongo.output.uri",
                "mongodb://localhost:27017/incidenti_new.incidenti_municipi");

            // Save this RDD as a Hadoop "file".
            // The path argument is unused; all documents will go to "mongo.output.uri".
            fin.saveAsNewAPIHadoopFile(
                "file:///this-is-completely-unused",
                Object.class,
                BSONObject.class,
                MongoOutputFormat.class,
                outputConfig);
        }

        public static void main(final String[] args) {
            new IncidentiMunicipi().run();
        }
    }
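
The comments in the demo mention BSONFileInputFormat as an alternative input for static BSON snapshots. Below is a minimal sketch of that variant, assuming the same sc as in run() and a hypothetical dump directory; depending on the connector version, the class literal may need an asSubclass(FileInputFormat.class) cast to satisfy Java's generics.

    // Requires: import com.mongodb.hadoop.BSONFileInputFormat;
    // Read static .bson dump files (e.g. mongodump output) instead of a
    // live MongoDB instance. The directory below is a hypothetical placeholder.
    Configuration bsonConfig = new Configuration();
    bsonConfig.set("mapred.input.dir", "file:///path/to/bson/dumps");

    JavaPairRDD<Object, BSONObject> snapshots = sc.newAPIHadoopRDD(
        bsonConfig,
        BSONFileInputFormat.class,   // BSON snapshots rather than a live collection
        Object.class,
        BSONObject.class);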

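To check the job's output, the aggregated collection can be read back through the same input format. Here is a quick sketch that collects the (small) result set to the driver and prints it; checkConfig and results are illustrative names, and sc is again the context from run().

    // Read the output collection back and print each (Municipio, NumIncidenti) pair.
    Configuration checkConfig = new Configuration();
    checkConfig.set("mongo.input.uri",
        "mongodb://localhost:27017/incidenti_new.incidenti_municipi");

    JavaPairRDD<Object, BSONObject> results = sc.newAPIHadoopRDD(
        checkConfig, MongoInputFormat.class, Object.class, BSONObject.class);

    // collect() pulls the aggregated RDD to the driver for printing.
    for (Tuple2<Object, BSONObject> t : results.collect()) {
        System.out.println(t._2.get("Municipio") + ": " + t._2.get("NumIncidenti"));
    }
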