List of usage examples for org.apache.hadoop.conf.Configuration.set

public void set(String name, String value)

Sets the value of the name property.
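Before the longer examples below, a minimal sketch of the basic call pattern; the property names and values here are made up purely for illustration:

import org.apache.hadoop.conf.Configuration;

public class ConfigurationSetExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // set(name, value) stores the value under the given property name
        conf.set("my.example.property", "some-value");            // hypothetical key
        // get(name) returns the stored value, or null if the name is unset
        System.out.println(conf.get("my.example.property"));       // prints "some-value"
        // get(name, defaultValue) falls back to the default for unset names
        System.out.println(conf.get("missing.property", "fallback"));
    }
}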
From source file:JaqlShell.java
License:Apache License
/**
 * @param mrc
 * @param conf
 * @throws Exception
 */
private static void setupOverride(MiniMRCluster mrc, Configuration conf) throws Exception {
    File overrideDir = new File(System.getProperty("hadoop.conf.override"));
    if (!overrideDir.exists()) {
        overrideDir.mkdirs();
    }

    // write out the JobConf from MiniMR to the override dir
    JobConf jc = mrc.createJobConf();
    conf.set("mapred.job.tracker", jc.get("mapred.job.tracker", null));
    String name = "mapred.job.tracker.info.port";
    String addr = jc.get(name, null);
    if (addr == null) {
        name = "mapred.job.tracker.http.address";
        addr = jc.get(name, null);
    }
    conf.set(name, addr);

    OutputStream outCore = new FileOutputStream(
            overrideDir.getCanonicalPath() + File.separator + "core-default.xml");
    OutputStream outMapred = new FileOutputStream(
            overrideDir.getCanonicalPath() + File.separator + "mapred-default.xml");
    OutputStream outHdfs = new FileOutputStream(
            overrideDir.getCanonicalPath() + File.separator + "hdfs-default.xml");
    conf.writeXml(outCore);
    conf.writeXml(outMapred);
    conf.writeXml(outHdfs);
    outCore.close();
    outMapred.close();
    outHdfs.close();
}
From source file:HoopRemoteTask.java
License:Open Source License
/**
 * Copies the HoopLink settings into the Hadoop Configuration so that
 * map and reduce tasks can read them from the job configuration.
 */
public static void transferConf(Configuration conf) {
    conf.set("useHadoop", new Boolean(HoopLink.useHadoop).toString());
    conf.set("casefold", new Boolean(HoopLink.casefold).toString());
    conf.set("stopwords", new Boolean(HoopLink.stopwords).toString());
    conf.set("stemming", new Boolean(HoopLink.stemming).toString());
    conf.set("cleanoutput", new Boolean(HoopLink.cleanoutput).toString());
    conf.set("dbglocal", new Boolean(HoopLink.dbglocal).toString());
    conf.set("minstemsize", new Integer(HoopLink.minstemsize).toString());
    conf.set("splitsize", new Long(HoopLink.splitsize).toString());
    conf.set("nrshards", new Integer(HoopLink.nrshards).toString());
    conf.set("shardtype", HoopLink.shardtype);
    conf.set("shardcount", new Integer(HoopLink.shardcount).toString());
    conf.set("shardcreate", new Boolean(HoopLink.shardcreate).toString());
    conf.set("task", HoopLink.task);
    conf.set("monitorHost", HoopLink.monitorHost);
}
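Settings pushed into the Configuration this way are only useful if the tasks read them back. A minimal sketch of the consuming side, using the older org.apache.hadoop.mapred API that HoopRemoteTask itself uses; the class name and default values are hypothetical:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class SettingsAwareMapper extends MapReduceBase implements Mapper<Text, Text, Text, Text> {
    private boolean casefold;
    private int minStemSize;

    @Override
    public void configure(JobConf job) {
        // Values stored with conf.set(...) arrive as strings; getBoolean/getInt
        // parse them and fall back to the supplied default when the key is unset.
        casefold = job.getBoolean("casefold", false);
        minStemSize = job.getInt("minstemsize", 3);
    }

    @Override
    public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        // ... casefold and minStemSize would be used here ...
        output.collect(key, value);
    }
}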
From source file:HoopRemoteTask.java
License:Open Source License
/**
 * Command-line entry point: parses the arguments and either runs the built-in
 * mapper or configures and submits a Hadoop job.
 */
public static void main(String args[]) throws Exception {
    // run the HoopLink constructor; We need this to have a global settings registry
    @SuppressWarnings("unused")
    HoopLink link = new HoopLink();

    dbg("main ()");

    showTimeStamp();

    /**
     * I've taken out the statistics portion since it relies on code that isn't distributed
     * The next version will have this solved. I might try the solution in:
     * http://stackoverflow.com/questions/7443074/initialize-public-static-variable-in-hadoop-through-arguments
     * Although chances are I will switch to using Hoop to collect much better performance and distribution
     * statistics. See Hoop.java for more information
     */
    HoopPerformanceMeasure metrics = new HoopPerformanceMeasure();
    metrics.setMarker("main");
    HoopLink.metrics.getDataSet().add(metrics);

    if (parseArgs(args) == false) {
        usage();
        return;
    }

    if (HoopLink.postonly == true) {
        postOnly();
        return;
    }

    if (HoopLink.task.equals("none") == true) {
        dbg("No task defined, please use the commandline option -task <task>");
        return;
    }

    dbg("Starting system ...");

    HoopRemoteTask driver = new HoopRemoteTask();

    if (HoopLink.useHadoop == false) {
        dbg("Starting built-in mapper ...");
        driver.indexDocuments();
    } else {
        dbg("Starting hadoop job ...");

        Configuration conf = new Configuration();

        // TRANSFER SETTINGS FROM HoopLink to Configuration!!!
        transferConf(conf); // Now we're feeling much better

        HoopRemoteTask.hdfs = FileSystem.get(conf);

        if (HoopLink.dbglocal == true) {
            dbg("Enabling local debugging ...");
            conf.set("mapred.job.tracker", "local");
        } else
            dbg("Disabling local debugging");

        JobConf job = new JobConf(conf, HoopRemoteTask.class);
        job.setJobName(driver.getClassName());

        driver.setJob(job);

        @SuppressWarnings("unused")
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        job.setJarByClass(HoopRemoteTask.class);

        if (HoopLink.task.equals("invert") == true) {
            dbg("Configuring job for invert task ...");

            job.setReducerClass(HoopInvertedListReducer.class);
            job.setMapperClass(HoopInvertedListMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
        }

        if (HoopLink.task.equals("wordcount") == true) {
            dbg("Configuring job for wordcount task ...");

            job.setReducerClass(HoopWordCountReducer.class);
            job.setMapperClass(HoopWordCountMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
        }

        dbg("Using input path: " + HoopLink.datapath);
        dbg("Using output path: " + HoopLink.outputpath);

        FileInputFormat.addInputPath(job, new Path(HoopLink.datapath));
        FileOutputFormat.setOutputPath(job, new Path(HoopLink.outputpath));

        job.setInputFormat(HoopWholeFileInputFormat.class);

        if ((HoopLink.shardcreate.equals("mos") == true) && (HoopLink.nrshards > 1)) {
            dbg("Setting output to sharded output streams class ...");
            job.setOutputFormat(HoopShardedOutputFormat.class);
        } else
            job.setOutputFormat(TextOutputFormat.class);

        /**
         * Temporarily commented out for testing purposes
         */
        //job.setPartitionerClass (HoopPartitioner.class);

        driver.register("Main");

        JobClient.runJob(job);

        postProcess(conf);
    }

    showTimeStamp();

    metrics.closeMarker();

    long timeTaken = metrics.getYValue();
    //long timeTaken=metrics.getMarkerRaw ();
    metrics.printMetrics(timeTaken);

    driver.unregister();

    /**
     * I've taken out the statistics portion since it relies on code that isn't distributed
     * The next version will have this solved. I might try the solution in:
     * http://stackoverflow.com/questions/7443074/initialize-public-static-variable-in-hadoop-through-arguments
     * Although chances are I will switch to using Hoop to collect much better performance and distribution
     * statistics. See Hoop.java for more information
     */
    //stats.calcStatistics();
    //dbg (stats.printStatistics());
}
From source file:TestBAM.java
License:Open Source License
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    conf.set(MyOutputFormat.HEADER_FROM_FILE, args[0]);
    DistributedCache.addFileToClassPath(
            new Path("hdfs:///libjars/hadoop-bam-7.0.0-jar-with-dependencies.jar"), conf);

    final Job job = new Job(conf);

    job.setJarByClass(TestBAM.class);
    job.setMapperClass(TestBAMMapper.class);
    job.setReducerClass(TestBAMReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(SAMRecordWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SAMRecordWritable.class);

    job.setInputFormatClass(AnySAMInputFormat.class);
    job.setOutputFormatClass(TestBAM.MyOutputFormat.class);

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.submit();

    if (!job.waitForCompletion(true)) {
        System.err.println("sort :: Job failed.");
        return 1;
    }
    return 0;
}
From source file:TestParascaleFileSystem.java
License:Apache License
public void testInitialize() throws IOException, URISyntaxException {
    final Configuration conf = getConf();
    // comma separated username + groups
    conf.set("hadoop.job.ugi", "hadoop, hadoop");
    final ParascaleFileSystem fs = new ParascaleFileSystem();
    fs.initialize(new URI(conf.get(FS_DEFAULT_NAME)), conf);
    assertEquals("psdfs://filesystem@10.200.2.10/user/hadoop", fs.getHomeDirectory().toString());
    assertEquals("psdfs://filesystem@10.200.2.10/user/hadoop", fs.getWorkingDirectory().toString());
}
From source file:BigBWA.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    for (String argumento : args) {
        LOG.info("Arg: " + argumento);
    }

    String inputPath = "";
    String outputPath = "";
    boolean useReducer = false;

    BwaOptions options = new BwaOptions(args);

    // We set the timeout and establish the bwa library to call BWA methods
    conf.set("mapreduce.task.timeout", "0");
    conf.set("mapreduce.map.env", "LD_LIBRARY_PATH=./bwa.zip/");

    //==================Algorithm election==================
    // One of the algorithms is going to be in use, because the default is always specified.
    if (options.isMemAlgorithm()) {
        // Case of the mem algorithm
        conf.set("mem", "true");
        conf.set("aln", "false");
        conf.set("bwasw", "false");
    } else if (options.isAlnAlgorithm()) {
        // Case of aln algorithm
        conf.set("mem", "false");
        conf.set("aln", "true");
        conf.set("bwasw", "false");
    } else if (options.isBwaswAlgorithm()) {
        // Case of bwasw algorithm
        conf.set("mem", "false");
        conf.set("aln", "false");
        conf.set("bwasw", "true");
    }

    //==================Index election==================
    if (options.getIndexPath() != "") {
        conf.set("indexRoute", options.getIndexPath());
    } else {
        System.err.println("No index has been found. Aborting.");
        System.exit(1);
    }

    //==================Type of reads election==================
    // There is always going to be a type of reads, because default is paired
    if (options.isPairedReads()) {
        conf.set("paired", "true");
        conf.set("single", "false");
    } else if (options.isSingleReads()) {
        conf.set("paired", "false");
        conf.set("single", "true");
    }

    //==================Use of reducer==================
    if (options.isUseReducer()) {
        useReducer = true;
        conf.set("useReducer", "true");
    } else {
        conf.set("useReducer", "false");
    }

    //==================Number of threads per map==================
    if (options.getNumThreads() != "0") {
        conf.set("bwathreads", options.getNumThreads());
    }

    //==================RG Header===================
    if (options.getReadgroupHeader() != "") {
        conf.set("rgheader", options.getReadgroupHeader());
    }

    //==================Input and output paths==================
    inputPath = options.getInputPath();
    outputPath = options.getOutputPath();

    conf.set("outputGenomics", outputPath);

    //==================Partition number==================
    if (options.getPartitionNumber() != 0) {
        try {
            FileSystem fs = FileSystem.get(conf);

            Path inputFilePath = new Path(inputPath);

            ContentSummary cSummary = fs.getContentSummary(inputFilePath);

            long length = cSummary.getLength();

            fs.close();

            conf.set("mapreduce.input.fileinputformat.split.maxsize",
                    String.valueOf((length) / options.getPartitionNumber()));
            conf.set("mapreduce.input.fileinputformat.split.minsize",
                    String.valueOf((length) / options.getPartitionNumber()));
        } catch (IOException e) {
            e.printStackTrace();
            LOG.error(e.toString());
            System.exit(1);
        }
    }

    //Job job = new Job(conf,"BigBWA_"+outputPath);
    Job job = Job.getInstance(conf, "BigBWA_" + outputPath);
    job.setJarByClass(BigBWA.class);
    job.setMapperClass(BigBWAMap.class);
    //job.setCombinerClass(BigBWACombiner.class);

    if (useReducer) {
        job.setReducerClass(BigBWAReducer.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setNumReduceTasks(1);
    } else {
        job.setNumReduceTasks(0);
    }

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return (job.waitForCompletion(true) ? 0 : 1);
}
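A quick illustration of what the two split-size keys above accomplish; the input size is made up for the sake of the arithmetic:

public class SplitSizeArithmetic {
    public static void main(String[] args) {
        // Illustrative numbers only; the 4 GiB input size is hypothetical.
        long length = 4L * 1024 * 1024 * 1024;     // what fs.getContentSummary(input).getLength() might return
        int partitionNumber = 8;                   // what options.getPartitionNumber() might return
        long splitSize = length / partitionNumber;
        // Setting both ...split.maxsize and ...split.minsize to this value makes
        // FileInputFormat cut the input into splits of this size, which yields
        // roughly partitionNumber map tasks.
        System.out.println(splitSize);             // 536870912 bytes, i.e. 512 MiB
    }
}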
From source file:a.TestConcatExample.java
License:Apache License
@Test
public void concatIsPermissive() throws IOException, URISyntaxException {
    MiniDFSCluster cluster = null;
    final Configuration conf = WebHdfsTestUtil.createConf();
    conf.set("dfs.namenode.fs-limits.min-block-size", "1000"); // Allow tiny blocks for the test
    try {
        cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
        cluster.waitActive();
        final FileSystem webHdfs = WebHdfsTestUtil.getWebHdfsFileSystem(conf, WebHdfsFileSystem.SCHEME);
        final FileSystem dfs = cluster.getFileSystem();

        final FileSystem fs = dfs; // WebHDFS has a bug in getLocatedBlocks

        Path root = new Path("/dir");
        fs.mkdirs(root);

        short origRep = 3;
        short secondRep = (short) (origRep - 1);
        Path f1 = new Path("/dir/f1");
        long size1 = writeFile(fs, f1, /* blocksize */ 4096, origRep, 5);
        long f1NumBlocks = fs.getFileBlockLocations(f1, 0, size1).length;
        assertEquals(5, f1NumBlocks);

        Path f2 = new Path("/dir/f2");
        long size2 = writeFile(fs, f2, /* blocksize (must divide 512 for checksum) */ 4096 - 512, secondRep, 4);
        long f2NumBlocks = fs.getFileBlockLocations(f2, 0, size2).length;
        assertEquals(5, f2NumBlocks);

        fs.concat(f1, new Path[] { f2 });

        FileStatus[] fileStatuses = fs.listStatus(root);

        // Only one file should remain
        assertEquals(1, fileStatuses.length);
        FileStatus fileStatus = fileStatuses[0];

        // And it should be named after the first file
        assertEquals("f1", fileStatus.getPath().getName());

        // The entire file takes the replication of the first argument
        assertEquals(origRep, fileStatus.getReplication());

        // As expected, the new concated file is the length of both the previous files
        assertEquals(size1 + size2, fileStatus.getLen());

        // And we should have the same number of blocks
        assertEquals(f1NumBlocks + f2NumBlocks,
                fs.getFileBlockLocations(fileStatus.getPath(), 0, size1 + size2).length);
    } finally {
        if (cluster != null) {
            cluster.shutdown();
        }
    }
}
From source file:abtGlobals.TestConnection.java
public static void main(String[] args) throws IOException {
    // Instantiating configuration class
    Configuration config = HBaseConfiguration.create();

    config.set("hbase.zookeeper.quorum", "hortonnodo1");
    config.set("hbase.zookeeper.property.clientport", "2181");
    config.set("zookeeper.znode.parent", "/hbase-unsecure"); // this is what most people miss :)
}
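The snippet above only prepares the client configuration. A minimal sketch of actually opening a connection with it; the table name is hypothetical, and note that HBase's own constant for the client port key is hbase.zookeeper.property.clientPort (capital P):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;

public class TestConnectionUsage {
    public static void main(String[] args) throws IOException {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "hortonnodo1");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        config.set("zookeeper.znode.parent", "/hbase-unsecure");

        // Open a connection with the configuration and look up a table (name is illustrative)
        try (Connection connection = ConnectionFactory.createConnection(config);
             Table table = connection.getTable(TableName.valueOf("example_table"))) {
            System.out.println("Connected, table: " + table.getName());
        }
    }
}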
From source file:accumulo.AccumuloStuff.java
License:Apache License
private static void setCoreSite(MiniAccumuloClusterImpl cluster) throws Exception {
    File csFile = new File(cluster.getConfig().getConfDir(), "core-site.xml");
    if (csFile.exists())
        throw new RuntimeException(csFile + " already exist");

    Configuration coreSite = new Configuration(false);
    coreSite.set("fs.file.impl", RawLocalFileSystem.class.getName());

    OutputStream out = new BufferedOutputStream(
            new FileOutputStream(new File(cluster.getConfig().getConfDir(), "core-site.xml")));
    coreSite.writeXml(out);
    out.close();
}
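The core-site.xml written by writeXml(...) can later be loaded back into a Configuration with addResource(...). A minimal sketch, using a placeholder path in place of the real conf dir:

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class LoadCoreSiteExample {
    public static void main(String[] args) {
        // Path to the core-site.xml produced by setCoreSite(...); adjust to the actual conf dir
        File csFile = new File("/path/to/conf/core-site.xml");

        // false = do not load the default resources from the classpath
        Configuration conf = new Configuration(false);
        conf.addResource(new Path(csFile.toURI()));

        // Properties from the XML file are now visible through get(...)
        System.out.println(conf.get("fs.file.impl"));
    }
}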
From source file:ai.grakn.kb.internal.computer.GraknSparkComputer.java
License:Open Source License
@SuppressWarnings("PMD.UnusedFormalParameter") private Future<ComputerResult> submitWithExecutor() { jobGroupId = Integer.toString(ThreadLocalRandom.current().nextInt(Integer.MAX_VALUE)); String jobDescription = this.vertexProgram == null ? this.mapReducers.toString() : this.vertexProgram + "+" + this.mapReducers; // Use different output locations this.sparkConfiguration.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, this.sparkConfiguration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION) + "/" + jobGroupId); updateConfigKeys(sparkConfiguration); final Future<ComputerResult> result = computerService.submit(() -> { final long startTime = System.currentTimeMillis(); // apache and hadoop configurations that are used throughout the graph computer computation final org.apache.commons.configuration.Configuration graphComputerConfiguration = new HadoopConfiguration( this.sparkConfiguration); if (!graphComputerConfiguration.containsKey(Constants.SPARK_SERIALIZER)) { graphComputerConfiguration.setProperty(Constants.SPARK_SERIALIZER, GryoSerializer.class.getCanonicalName()); }// ww w . ja v a2 s . c o m graphComputerConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, this.persist.equals(GraphComputer.Persist.EDGES)); final Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(graphComputerConfiguration); final Storage fileSystemStorage = FileSystemStorage.open(hadoopConfiguration); final boolean inputFromHDFS = FileInputFormat.class.isAssignableFrom( hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class)); final boolean inputFromSpark = PersistedInputRDD.class.isAssignableFrom( hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class)); final boolean outputToHDFS = FileOutputFormat.class.isAssignableFrom( hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class)); final boolean outputToSpark = PersistedOutputRDD.class.isAssignableFrom( hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class)); final boolean skipPartitioner = graphComputerConfiguration .getBoolean(Constants.GREMLIN_SPARK_SKIP_PARTITIONER, false); final boolean skipPersist = graphComputerConfiguration .getBoolean(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE, false); if (inputFromHDFS) { String inputLocation = Constants .getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION), fileSystemStorage) .orElse(null); if (null != inputLocation) { try { graphComputerConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath() .toString()); hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath() .toString()); } catch (final IOException e) { throw new IllegalStateException(e.getMessage(), e); } } } final InputRDD inputRDD; final OutputRDD outputRDD; final boolean filtered; try { inputRDD = InputRDD.class.isAssignableFrom( hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class)) ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputRDD.class, InputRDD.class).newInstance() : InputFormatRDD.class.newInstance(); outputRDD = OutputRDD.class.isAssignableFrom( hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class)) ? 
hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputRDD.class, OutputRDD.class).newInstance() : OutputFormatRDD.class.newInstance(); // if the input class can filter on load, then set the filters if (inputRDD instanceof InputFormatRDD && GraphFilterAware.class.isAssignableFrom(hadoopConfiguration.getClass( Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class))) { GraphFilterAware.storeGraphFilter(graphComputerConfiguration, hadoopConfiguration, this.graphFilter); filtered = false; } else if (inputRDD instanceof GraphFilterAware) { ((GraphFilterAware) inputRDD).setGraphFilter(this.graphFilter); filtered = false; } else filtered = this.graphFilter.hasFilter(); } catch (final InstantiationException | IllegalAccessException e) { throw new IllegalStateException(e.getMessage(), e); } // create the spark context from the graph computer configuration final JavaSparkContext sparkContext = new JavaSparkContext(Spark.create(hadoopConfiguration)); final Storage sparkContextStorage = SparkContextStorage.open(); sparkContext.setJobGroup(jobGroupId, jobDescription); GraknSparkMemory memory = null; // delete output location final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null); if (null != outputLocation) { if (outputToHDFS && fileSystemStorage.exists(outputLocation)) { fileSystemStorage.rm(outputLocation); } if (outputToSpark && sparkContextStorage.exists(outputLocation)) { sparkContextStorage.rm(outputLocation); } } // the Spark application name will always be set by SparkContextStorage, // thus, INFO the name to make it easier to debug logger.debug(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX + (null == this.vertexProgram ? "No VertexProgram" : this.vertexProgram) + "[" + this.mapReducers + "]"); // add the project jars to the cluster this.loadJars(hadoopConfiguration, sparkContext); updateLocalConfiguration(sparkContext, hadoopConfiguration); // create a message-passing friendly rdd from the input rdd boolean partitioned = false; JavaPairRDD<Object, VertexWritable> loadedGraphRDD = inputRDD.readGraphRDD(graphComputerConfiguration, sparkContext); // if there are vertex or edge filters, filter the loaded graph rdd prior to partitioning and persisting if (filtered) { this.logger.debug("Filtering the loaded graphRDD: " + this.graphFilter); loadedGraphRDD = GraknSparkExecutor.applyGraphFilter(loadedGraphRDD, this.graphFilter); } // if the loaded graph RDD is already partitioned use that partitioner, // else partition it with HashPartitioner if (loadedGraphRDD.partitioner().isPresent()) { this.logger.debug("Using the existing partitioner associated with the loaded graphRDD: " + loadedGraphRDD.partitioner().get()); } else { if (!skipPartitioner) { final Partitioner partitioner = new HashPartitioner( this.workersSet ? 
this.workers : loadedGraphRDD.partitions().size()); this.logger.debug("Partitioning the loaded graphRDD: " + partitioner); loadedGraphRDD = loadedGraphRDD.partitionBy(partitioner); partitioned = true; assert loadedGraphRDD.partitioner().isPresent(); } else { // no easy way to test this with a test case assert skipPartitioner == !loadedGraphRDD.partitioner().isPresent(); this.logger.debug("Partitioning has been skipped for the loaded graphRDD via " + Constants.GREMLIN_SPARK_SKIP_PARTITIONER); } } // if the loaded graphRDD was already partitioned previous, // then this coalesce/repartition will not take place if (this.workersSet) { // ensures that the loaded graphRDD does not have more partitions than workers if (loadedGraphRDD.partitions().size() > this.workers) { loadedGraphRDD = loadedGraphRDD.coalesce(this.workers); } else { // ensures that the loaded graphRDD does not have less partitions than workers if (loadedGraphRDD.partitions().size() < this.workers) { loadedGraphRDD = loadedGraphRDD.repartition(this.workers); } } } // persist the vertex program loaded graph as specified by configuration // or else use default cache() which is MEMORY_ONLY if (!skipPersist && (!inputFromSpark || partitioned || filtered)) { loadedGraphRDD = loadedGraphRDD.persist(StorageLevel.fromString( hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY"))); } // final graph with view // (for persisting and/or mapReducing -- may be null and thus, possible to save space/time) JavaPairRDD<Object, VertexWritable> computedGraphRDD = null; try { //////////////////////////////// // process the vertex program // //////////////////////////////// if (null != this.vertexProgram) { memory = new GraknSparkMemory(this.vertexProgram, this.mapReducers, sparkContext); ///////////////// // if there is a registered VertexProgramInterceptor, use it to bypass the GraphComputer semantics if (graphComputerConfiguration .containsKey(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)) { try { final GraknSparkVertexProgramInterceptor<VertexProgram> interceptor = (GraknSparkVertexProgramInterceptor) Class .forName(graphComputerConfiguration .getString(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)) .newInstance(); computedGraphRDD = interceptor.apply(this.vertexProgram, loadedGraphRDD, memory); } catch (final ClassNotFoundException | IllegalAccessException | InstantiationException e) { throw new IllegalStateException(e.getMessage()); } } else { // standard GraphComputer semantics // get a configuration that will be propagated to all workers final HadoopConfiguration vertexProgramConfiguration = new HadoopConfiguration(); this.vertexProgram.storeState(vertexProgramConfiguration); // set up the vertex program and wire up configurations this.vertexProgram.setup(memory); JavaPairRDD<Object, ViewIncomingPayload<Object>> viewIncomingRDD = null; memory.broadcastMemory(sparkContext); // execute the vertex program while (true) { if (Thread.interrupted()) { sparkContext.cancelAllJobs(); throw new TraversalInterruptedException(); } memory.setInExecute(true); viewIncomingRDD = GraknSparkExecutor.executeVertexProgramIteration(loadedGraphRDD, viewIncomingRDD, memory, graphComputerConfiguration, vertexProgramConfiguration); memory.setInExecute(false); if (this.vertexProgram.terminate(memory)) { break; } else { memory.incrIteration(); memory.broadcastMemory(sparkContext); } } // if the graph will be continued to be used (persisted or mapreduced), // then generate a view+graph if ((null != outputRDD && 
!this.persist.equals(Persist.NOTHING)) || !this.mapReducers.isEmpty()) { computedGraphRDD = GraknSparkExecutor.prepareFinalGraphRDD(loadedGraphRDD, viewIncomingRDD, this.vertexProgram.getVertexComputeKeys()); assert null != computedGraphRDD && computedGraphRDD != loadedGraphRDD; } else { // ensure that the computedGraphRDD was not created assert null == computedGraphRDD; } } ///////////////// memory.complete(); // drop all transient memory keys // write the computed graph to the respective output (rdd or output format) if (null != outputRDD && !this.persist.equals(Persist.NOTHING)) { // the logic holds that a computeGraphRDD must be created at this point assert null != computedGraphRDD; outputRDD.writeGraphRDD(graphComputerConfiguration, computedGraphRDD); } } final boolean computedGraphCreated = computedGraphRDD != null && computedGraphRDD != loadedGraphRDD; if (!computedGraphCreated) { computedGraphRDD = loadedGraphRDD; } final Memory.Admin finalMemory = null == memory ? new MapMemory() : new MapMemory(memory); ////////////////////////////// // process the map reducers // ////////////////////////////// if (!this.mapReducers.isEmpty()) { // create a mapReduceRDD for executing the map reduce jobs on JavaPairRDD<Object, VertexWritable> mapReduceRDD = computedGraphRDD; if (computedGraphCreated && !outputToSpark) { // drop all the edges of the graph as they are not used in mapReduce processing mapReduceRDD = computedGraphRDD.mapValues(vertexWritable -> { vertexWritable.get().dropEdges(Direction.BOTH); return vertexWritable; }); // if there is only one MapReduce to execute, don't bother wasting the clock cycles. if (this.mapReducers.size() > 1) { mapReduceRDD = mapReduceRDD.persist(StorageLevel.fromString(hadoopConfiguration .get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY"))); } } for (final MapReduce mapReduce : this.mapReducers) { // execute the map reduce job final HadoopConfiguration newApacheConfiguration = new HadoopConfiguration( graphComputerConfiguration); mapReduce.storeState(newApacheConfiguration); // map final JavaPairRDD mapRDD = GraknSparkExecutor.executeMap(mapReduceRDD, mapReduce, newApacheConfiguration); // combine final JavaPairRDD combineRDD = mapReduce.doStage(MapReduce.Stage.COMBINE) ? GraknSparkExecutor.executeCombine(mapRDD, newApacheConfiguration) : mapRDD; // reduce final JavaPairRDD reduceRDD = mapReduce.doStage(MapReduce.Stage.REDUCE) ? 
GraknSparkExecutor.executeReduce(combineRDD, mapReduce, newApacheConfiguration) : combineRDD; // write the map reduce output back to disk and computer result memory if (null != outputRDD) { mapReduce.addResultToMemory(finalMemory, outputRDD.writeMemoryRDD( graphComputerConfiguration, mapReduce.getMemoryKey(), reduceRDD)); } } // if the mapReduceRDD is not simply the computed graph, unpersist the mapReduceRDD if (computedGraphCreated && !outputToSpark) { assert loadedGraphRDD != computedGraphRDD; assert mapReduceRDD != computedGraphRDD; mapReduceRDD.unpersist(); } else { assert mapReduceRDD == computedGraphRDD; } } // unpersist the loaded graph if it will not be used again (no PersistedInputRDD) // if the graphRDD was loaded from Spark, but then partitioned or filtered, its a different RDD if (!inputFromSpark || partitioned || filtered) { loadedGraphRDD.unpersist(); } // unpersist the computed graph if it will not be used again (no PersistedOutputRDD) // if the computed graph is the loadedGraphRDD because it was not mutated and not-unpersisted, // then don't unpersist the computedGraphRDD/loadedGraphRDD if ((!outputToSpark || this.persist.equals(GraphComputer.Persist.NOTHING)) && computedGraphCreated) { computedGraphRDD.unpersist(); } // delete any file system or rdd data if persist nothing if (null != outputLocation && this.persist.equals(GraphComputer.Persist.NOTHING)) { if (outputToHDFS) { fileSystemStorage.rm(outputLocation); } if (outputToSpark) { sparkContextStorage.rm(outputLocation); } } // update runtime and return the newly computed graph finalMemory.setRuntime(System.currentTimeMillis() - startTime); // clear properties that should not be propagated in an OLAP chain graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_GRAPH_FILTER); graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR); graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE); graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_PARTITIONER); return new DefaultComputerResult(InputOutputHelper.getOutputGraph(graphComputerConfiguration, this.resultGraph, this.persist), finalMemory.asImmutable()); } catch (Exception e) { // So it throws the same exception as tinker does throw new RuntimeException(e); } }); computerService.shutdown(); return result; }