List of usage examples for org.apache.hadoop.conf Configuration getClass
public Class<?> getClass(String name, Class<?> defaultValue)
Gets the value of the name property as a Class; if no such property is specified, then defaultValue is returned.
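For orientation, here is a minimal, self-contained sketch of the lookup-then-instantiate pattern most of the examples below share. The property key "example.codec.class" is purely illustrative (it is not taken from any of the source files); the getClass and ReflectionUtils calls are the standard Hadoop API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class GetClassExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // "example.codec.class" is a hypothetical key; since it is unset here,
        // getClass returns the supplied default, DefaultCodec.class.
        Class<?> codecClass = conf.getClass("example.codec.class", DefaultCodec.class);
        // Common follow-up seen in the examples on this page: instantiate the resolved class.
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        System.out.println("Resolved codec: " + codec.getClass().getName());
    }
}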
From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper.java
License:Apache License
public static void executeMapReduceJob(final MapReduce mapReduce, final Memory.Admin memory,
        final Configuration configuration) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration newConfiguration = new Configuration(configuration);
    final boolean vertexProgramExists = newConfiguration.get(VertexProgram.VERTEX_PROGRAM, null) != null;
    if (vertexProgramExists) {
        newConfiguration.set(Constants.GREMLIN_HADOOP_GRAPH_READER,
                InputOutputHelper.getInputFormat(
                        (Class) newConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class))
                        .getCanonicalName());
        newConfiguration.unset(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
    final BaseConfiguration apacheConfiguration = new BaseConfiguration();
    apacheConfiguration.setDelimiterParsingDisabled(true);
    mapReduce.storeState(apacheConfiguration);
    ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, newConfiguration);
    final Optional<Comparator<?>> mapSort = mapReduce.getMapKeySort();
    final Optional<Comparator<?>> reduceSort = mapReduce.getReduceKeySort();
    newConfiguration.setClass(Constants.GREMLIN_HADOOP_MAP_REDUCE_CLASS, mapReduce.getClass(), MapReduce.class);
    final Job job = Job.getInstance(newConfiguration, mapReduce.toString());
    HadoopGraph.LOGGER.info(Constants.GREMLIN_HADOOP_JOB_PREFIX + mapReduce.toString());
    job.setJarByClass(HadoopGraph.class);
    if (mapSort.isPresent())
        job.setSortComparatorClass(ObjectWritableComparator.ObjectWritableMapComparator.class);
    job.setMapperClass(HadoopMap.class);
    if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
        if (mapReduce.doStage(MapReduce.Stage.COMBINE))
            job.setCombinerClass(HadoopCombine.class);
        job.setReducerClass(HadoopReduce.class);
    } else {
        if (mapSort.isPresent()) {
            job.setReducerClass(Reducer.class);
            job.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        } else {
            job.setNumReduceTasks(0);
        }
    }
    job.setMapOutputKeyClass(ObjectWritable.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(ObjectWritable.class);
    job.setOutputValueClass(ObjectWritable.class);
    job.setInputFormatClass(GraphFilterInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // if there is no vertex program, then grab the graph from the input location
    final Path graphPath;
    if (vertexProgramExists) {
        graphPath = new Path(
                Constants.getGraphLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
    } else {
        graphPath = new Path(newConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
    }
    Path memoryPath = new Path(
            Constants.getMemoryLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION),
                    (reduceSort.isPresent() ? mapReduce.getMemoryKey() + "-temp" : mapReduce.getMemoryKey())));
    if (FileSystem.get(newConfiguration).exists(memoryPath)) {
        FileSystem.get(newConfiguration).delete(memoryPath, true);
    }
    FileInputFormat.setInputPaths(job, graphPath);
    FileOutputFormat.setOutputPath(job, memoryPath);
    job.waitForCompletion(true);
    // if there is a reduce sort, we need to run another identity MapReduce job
    if (reduceSort.isPresent()) {
        final Job reduceSortJob = Job.getInstance(newConfiguration, "ReduceKeySort");
        reduceSortJob.setSortComparatorClass(ObjectWritableComparator.ObjectWritableReduceComparator.class);
        reduceSortJob.setMapperClass(Mapper.class);
        reduceSortJob.setReducerClass(Reducer.class);
        reduceSortJob.setMapOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setMapOutputValueClass(ObjectWritable.class);
        reduceSortJob.setOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setOutputValueClass(ObjectWritable.class);
        reduceSortJob.setInputFormatClass(SequenceFileInputFormat.class);
        reduceSortJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        reduceSortJob.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        FileInputFormat.setInputPaths(reduceSortJob, memoryPath);
        final Path sortedMemoryPath = new Path(Constants.getMemoryLocation(
                newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), mapReduce.getMemoryKey()));
        FileOutputFormat.setOutputPath(reduceSortJob, sortedMemoryPath);
        reduceSortJob.waitForCompletion(true);
        FileSystem.get(newConfiguration).delete(memoryPath, true); // delete the temporary memory path
        memoryPath = sortedMemoryPath;
    }
    mapReduce.addResultToMemory(memory, new ObjectWritableIterator(newConfiguration, memoryPath));
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.util.ConfUtil.java
License:Apache License
public static InputFormat<NullWritable, VertexWritable> getReaderAsInputFormat(
        final Configuration hadoopConfiguration) {
    final Class<?> readerClass = hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER,
            Object.class);
    try {
        return InputFormat.class.isAssignableFrom(readerClass)
                ? (InputFormat<NullWritable, VertexWritable>) readerClass.newInstance()
                : (InputFormat<NullWritable, VertexWritable>) Class
                        .forName("org.apache.tinkerpop.gremlin.spark.structure.io.InputRDDFormat").newInstance();
    } catch (final Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
From source file:org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer.java
License:Apache License
private Future<ComputerResult> submitWithExecutor(Executor exec) {
    // create the completable future
    return computerService.submit(() -> {
        final long startTime = System.currentTimeMillis();
        //////////////////////////////////////////////////
        /////// PROCESS SHIM AND SYSTEM PROPERTIES ///////
        //////////////////////////////////////////////////
        ConfigurationUtils.copy(this.hadoopGraph.configuration(), this.sparkConfiguration);
        final String shimService = KryoSerializer.class.getCanonicalName()
                .equals(this.sparkConfiguration.getString(Constants.SPARK_SERIALIZER, null))
                        ? UnshadedKryoShimService.class.getCanonicalName()
                        : HadoopPoolShimService.class.getCanonicalName();
        this.sparkConfiguration.setProperty(KryoShimServiceLoader.KRYO_SHIM_SERVICE, shimService);
        ///////////
        final StringBuilder params = new StringBuilder();
        this.sparkConfiguration.getKeys().forEachRemaining(key -> {
            if (KEYS_PASSED_IN_JVM_SYSTEM_PROPERTIES.contains(key)) {
                params.append(" -D").append("tinkerpop.").append(key).append("=")
                        .append(this.sparkConfiguration.getProperty(key));
                System.setProperty("tinkerpop." + key, this.sparkConfiguration.getProperty(key).toString());
            }
        });
        if (params.length() > 0) {
            this.sparkConfiguration.setProperty(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS,
                    (this.sparkConfiguration.getString(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS, "")
                            + params.toString()).trim());
            this.sparkConfiguration.setProperty(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS,
                    (this.sparkConfiguration.getString(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, "")
                            + params.toString()).trim());
        }
        KryoShimServiceLoader.applyConfiguration(this.sparkConfiguration);
        //////////////////////////////////////////////////
        //////////////////////////////////////////////////
        //////////////////////////////////////////////////
        // apache and hadoop configurations that are used throughout the graph computer computation
        final org.apache.commons.configuration.Configuration graphComputerConfiguration = new HadoopConfiguration(
                this.sparkConfiguration);
        if (!graphComputerConfiguration.containsKey(Constants.SPARK_SERIALIZER)) {
            graphComputerConfiguration.setProperty(Constants.SPARK_SERIALIZER,
                    KryoSerializer.class.getCanonicalName());
            if (!graphComputerConfiguration.containsKey(Constants.SPARK_KRYO_REGISTRATOR))
                graphComputerConfiguration.setProperty(Constants.SPARK_KRYO_REGISTRATOR,
                        GryoRegistrator.class.getCanonicalName());
        }
        graphComputerConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES,
                this.persist.equals(GraphComputer.Persist.EDGES));
        final Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(graphComputerConfiguration);
        final Storage fileSystemStorage = FileSystemStorage.open(hadoopConfiguration);
        final Storage sparkContextStorage = SparkContextStorage.open(graphComputerConfiguration);
        final boolean inputFromHDFS = FileInputFormat.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean inputFromSpark = PersistedInputRDD.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean outputToHDFS = FileOutputFormat.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean outputToSpark = PersistedOutputRDD.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean skipPartitioner = graphComputerConfiguration
                .getBoolean(Constants.GREMLIN_SPARK_SKIP_PARTITIONER, false);
        final boolean skipPersist = graphComputerConfiguration
                .getBoolean(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE, false);
        String inputLocation = null;
        if (inputFromSpark)
            inputLocation = Constants
                    .getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION),
                            sparkContextStorage)
                    .orElse(null);
        else if (inputFromHDFS)
            inputLocation = Constants
                    .getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION),
                            fileSystemStorage)
                    .orElse(null);
        if (null == inputLocation)
            inputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION);
        if (null != inputLocation && inputFromHDFS) {
            try {
                graphComputerConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
                        FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath()
                                .toString());
                hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, FileSystem
                        .get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath().toString());
            } catch (final IOException e) {
                throw new IllegalStateException(e.getMessage(), e);
            }
        }
        final InputRDD inputRDD;
        final OutputRDD outputRDD;
        final boolean filtered;
        try {
            inputRDD = InputRDD.class.isAssignableFrom(
                    hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class))
                            ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER,
                                    InputRDD.class, InputRDD.class).newInstance()
                            : InputFormatRDD.class.newInstance();
            outputRDD = OutputRDD.class.isAssignableFrom(
                    hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class))
                            ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER,
                                    OutputRDD.class, OutputRDD.class).newInstance()
                            : OutputFormatRDD.class.newInstance();
            // if the input class can filter on load, then set the filters
            if (inputRDD instanceof InputFormatRDD
                    && GraphFilterAware.class.isAssignableFrom(hadoopConfiguration.getClass(
                            Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class))) {
                GraphFilterAware.storeGraphFilter(graphComputerConfiguration, hadoopConfiguration,
                        this.graphFilter);
                filtered = false;
            } else if (inputRDD instanceof GraphFilterAware) {
                ((GraphFilterAware) inputRDD).setGraphFilter(this.graphFilter);
                filtered = false;
            } else if (this.graphFilter.hasFilter()) {
                filtered = true;
            } else {
                filtered = false;
            }
        } catch (final InstantiationException | IllegalAccessException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }
        SparkMemory memory = null;
        // delete output location
        final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
        if (null != outputLocation) {
            if (outputToHDFS && fileSystemStorage.exists(outputLocation))
                fileSystemStorage.rm(outputLocation);
            if (outputToSpark && sparkContextStorage.exists(outputLocation))
                sparkContextStorage.rm(outputLocation);
        }
        // the Spark application name will always be set by SparkContextStorage, thus, INFO the name to make it easier to debug
        logger.debug(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX
                + (null == this.vertexProgram ? "No VertexProgram" : this.vertexProgram) + "["
                + this.mapReducers + "]");
        // create the spark configuration from the graph computer configuration
        final SparkConf sparkConfiguration = new SparkConf();
        hadoopConfiguration.forEach(entry -> sparkConfiguration.set(entry.getKey(), entry.getValue()));
        // execute the vertex program and map reducers and if there is a failure, auto-close the spark context
        try {
            final JavaSparkContext sparkContext = new JavaSparkContext(
                    SparkContext.getOrCreate(sparkConfiguration));
            this.loadJars(hadoopConfiguration, sparkContext); // add the project jars to the cluster
            Spark.create(sparkContext.sc()); // this is the context RDD holder that prevents GC
            updateLocalConfiguration(sparkContext, sparkConfiguration);
            // create a message-passing friendly rdd from the input rdd
            boolean partitioned = false;
            JavaPairRDD<Object, VertexWritable> loadedGraphRDD = inputRDD
                    .readGraphRDD(graphComputerConfiguration, sparkContext);
            // if there are vertex or edge filters, filter the loaded graph rdd prior to partitioning and persisting
            if (filtered) {
                this.logger.debug("Filtering the loaded graphRDD: " + this.graphFilter);
                loadedGraphRDD = SparkExecutor.applyGraphFilter(loadedGraphRDD, this.graphFilter);
            }
            // if the loaded graph RDD is already partitioned use that partitioner, else partition it with HashPartitioner
            if (loadedGraphRDD.partitioner().isPresent())
                this.logger.debug("Using the existing partitioner associated with the loaded graphRDD: "
                        + loadedGraphRDD.partitioner().get());
            else {
                if (!skipPartitioner) {
                    final Partitioner partitioner = new HashPartitioner(
                            this.workersSet ? this.workers : loadedGraphRDD.partitions().size());
                    this.logger.debug("Partitioning the loaded graphRDD: " + partitioner);
                    loadedGraphRDD = loadedGraphRDD.partitionBy(partitioner);
                    partitioned = true;
                    assert loadedGraphRDD.partitioner().isPresent();
                } else {
                    assert skipPartitioner == !loadedGraphRDD.partitioner().isPresent(); // no easy way to test this with a test case
                    this.logger.debug("Partitioning has been skipped for the loaded graphRDD via "
                            + Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
                }
            }
            // if the loaded graphRDD was already partitioned previous, then this coalesce/repartition will not take place
            if (this.workersSet) {
                if (loadedGraphRDD.partitions().size() > this.workers) // ensures that the loaded graphRDD does not have more partitions than workers
                    loadedGraphRDD = loadedGraphRDD.coalesce(this.workers);
                else if (loadedGraphRDD.partitions().size() < this.workers) // ensures that the loaded graphRDD does not have less partitions than workers
                    loadedGraphRDD = loadedGraphRDD.repartition(this.workers);
            }
            // persist the vertex program loaded graph as specified by configuration or else use default cache() which is MEMORY_ONLY
            if (!skipPersist && (!inputFromSpark || partitioned || filtered))
                loadedGraphRDD = loadedGraphRDD.persist(StorageLevel.fromString(
                        hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
            // final graph with view (for persisting and/or mapReducing -- may be null and thus, possible to save space/time)
            JavaPairRDD<Object, VertexWritable> computedGraphRDD = null;
            ////////////////////////////////
            // process the vertex program //
            ////////////////////////////////
            if (null != this.vertexProgram) {
                memory = new SparkMemory(this.vertexProgram, this.mapReducers, sparkContext);
                /////////////////
                // if there is a registered VertexProgramInterceptor, use it to bypass the GraphComputer semantics
                if (graphComputerConfiguration
                        .containsKey(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)) {
                    try {
                        final SparkVertexProgramInterceptor<VertexProgram> interceptor = (SparkVertexProgramInterceptor) Class
                                .forName(graphComputerConfiguration
                                        .getString(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR))
                                .newInstance();
                        computedGraphRDD = interceptor.apply(this.vertexProgram, loadedGraphRDD, memory);
                    } catch (final ClassNotFoundException | IllegalAccessException | InstantiationException e) {
                        throw new IllegalStateException(e.getMessage());
                    }
                } else { // standard GraphComputer semantics
                    // get a configuration that will be propagated to all workers
                    final HadoopConfiguration vertexProgramConfiguration = new HadoopConfiguration();
                    this.vertexProgram.storeState(vertexProgramConfiguration);
                    // set up the vertex program and wire up configurations
                    this.vertexProgram.setup(memory);
                    JavaPairRDD<Object, ViewIncomingPayload<Object>> viewIncomingRDD = null;
                    memory.broadcastMemory(sparkContext);
                    // execute the vertex program
                    while (true) {
                        if (Thread.interrupted()) {
                            sparkContext.cancelAllJobs();
                            throw new TraversalInterruptedException();
                        }
                        memory.setInExecute(true);
                        viewIncomingRDD = SparkExecutor.executeVertexProgramIteration(loadedGraphRDD,
                                viewIncomingRDD, memory, graphComputerConfiguration,
                                vertexProgramConfiguration);
                        memory.setInExecute(false);
                        if (this.vertexProgram.terminate(memory))
                            break;
                        else {
                            memory.incrIteration();
                            memory.broadcastMemory(sparkContext);
                        }
                    }
                    // if the graph will be continued to be used (persisted or mapreduced), then generate a view+graph
                    if ((null != outputRDD && !this.persist.equals(Persist.NOTHING))
                            || !this.mapReducers.isEmpty()) {
                        computedGraphRDD = SparkExecutor.prepareFinalGraphRDD(loadedGraphRDD, viewIncomingRDD,
                                this.vertexProgram.getVertexComputeKeys());
                        assert null != computedGraphRDD && computedGraphRDD != loadedGraphRDD;
                    } else {
                        // ensure that the computedGraphRDD was not created
                        assert null == computedGraphRDD;
                    }
                }
                /////////////////
                memory.complete(); // drop all transient memory keys
                // write the computed graph to the respective output (rdd or output format)
                if (null != outputRDD && !this.persist.equals(Persist.NOTHING)) {
                    assert null != computedGraphRDD; // the logic holds that a computeGraphRDD must be created at this point
                    outputRDD.writeGraphRDD(graphComputerConfiguration, computedGraphRDD);
                }
            }
            final boolean computedGraphCreated = computedGraphRDD != null && computedGraphRDD != loadedGraphRDD;
            if (!computedGraphCreated)
                computedGraphRDD = loadedGraphRDD;
            final Memory.Admin finalMemory = null == memory ? new MapMemory() : new MapMemory(memory);
            //////////////////////////////
            // process the map reducers //
            //////////////////////////////
            if (!this.mapReducers.isEmpty()) {
                // create a mapReduceRDD for executing the map reduce jobs on
                JavaPairRDD<Object, VertexWritable> mapReduceRDD = computedGraphRDD;
                if (computedGraphCreated && !outputToSpark) {
                    // drop all the edges of the graph as they are not used in mapReduce processing
                    mapReduceRDD = computedGraphRDD.mapValues(vertexWritable -> {
                        vertexWritable.get().dropEdges(Direction.BOTH);
                        return vertexWritable;
                    });
                    // if there is only one MapReduce to execute, don't bother wasting the clock cycles.
                    if (this.mapReducers.size() > 1)
                        mapReduceRDD = mapReduceRDD.persist(StorageLevel.fromString(hadoopConfiguration
                                .get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
                }
                for (final MapReduce mapReduce : this.mapReducers) {
                    // execute the map reduce job
                    final HadoopConfiguration newApacheConfiguration = new HadoopConfiguration(
                            graphComputerConfiguration);
                    mapReduce.storeState(newApacheConfiguration);
                    // map
                    final JavaPairRDD mapRDD = SparkExecutor.executeMap((JavaPairRDD) mapReduceRDD, mapReduce,
                            newApacheConfiguration);
                    // combine
                    final JavaPairRDD combineRDD = mapReduce.doStage(MapReduce.Stage.COMBINE)
                            ? SparkExecutor.executeCombine(mapRDD, newApacheConfiguration)
                            : mapRDD;
                    // reduce
                    final JavaPairRDD reduceRDD = mapReduce.doStage(MapReduce.Stage.REDUCE)
                            ? SparkExecutor.executeReduce(combineRDD, mapReduce, newApacheConfiguration)
                            : combineRDD;
                    // write the map reduce output back to disk and computer result memory
                    if (null != outputRDD)
                        mapReduce.addResultToMemory(finalMemory, outputRDD.writeMemoryRDD(
                                graphComputerConfiguration, mapReduce.getMemoryKey(), reduceRDD));
                }
                // if the mapReduceRDD is not simply the computed graph, unpersist the mapReduceRDD
                if (computedGraphCreated && !outputToSpark) {
                    assert loadedGraphRDD != computedGraphRDD;
                    assert mapReduceRDD != computedGraphRDD;
                    mapReduceRDD.unpersist();
                } else {
                    assert mapReduceRDD == computedGraphRDD;
                }
            }
            // unpersist the loaded graph if it will not be used again (no PersistedInputRDD)
            // if the graphRDD was loaded from Spark, but then partitioned or filtered, its a different RDD
            if (!inputFromSpark || partitioned || filtered)
                loadedGraphRDD.unpersist();
            // unpersist the computed graph if it will not be used again (no PersistedOutputRDD)
            // if the computed graph is the loadedGraphRDD because it was not mutated and not-unpersisted, then don't unpersist the computedGraphRDD/loadedGraphRDD
            if ((!outputToSpark || this.persist.equals(GraphComputer.Persist.NOTHING)) && computedGraphCreated)
                computedGraphRDD.unpersist();
            // delete any file system or rdd data if persist nothing
            if (null != outputLocation && this.persist.equals(GraphComputer.Persist.NOTHING)) {
                if (outputToHDFS)
                    fileSystemStorage.rm(outputLocation);
                if (outputToSpark)
                    sparkContextStorage.rm(outputLocation);
            }
            // update runtime and return the newly computed graph
            finalMemory.setRuntime(System.currentTimeMillis() - startTime);
            // clear properties that should not be propagated in an OLAP chain
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            return new DefaultComputerResult(InputOutputHelper.getOutputGraph(graphComputerConfiguration,
                    this.resultGraph, this.persist), finalMemory.asImmutable());
        } finally {
            if (!graphComputerConfiguration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false))
                Spark.close();
        }
    });
}
From source file:org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter.java
License:Open Source License
public SequenceFileSpillWriter(FileSystem fileSystem, Configuration conf, Path outputFilePath,
        Class<KeyType> keyClass, Class<ValueType> valueClass,
        SequenceFileIndexWriter<KeyType, ValueType> optionalIndexWriter, boolean compress) throws IOException {
    _indexWriter = optionalIndexWriter;
    _spillBufferSize = conf.getInt(SPILL_WRITER_BUFFER_SIZE_PARAM, DEFAULT_SPILL_BUFFER_SIZE);
    _outputStream = fileSystem.create(outputFilePath);
    // allocate buffer ...
    _activeBuffer = ByteBuffer.allocate(_spillBufferSize);
    if (compress) {
        Class codecClass = conf.getClass("mapred.output.compression.codec", DefaultCodec.class);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.BLOCK,
                codec);
    } else {
        writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.NONE,
                null);
    }
    _writerThread = new Thread(new Runnable() {

        @Override
        public void run() {
            // LOG.info("Writer Thread Starting");
            while (true) {
                QueuedBufferItem queuedBufferItem = null;
                try {
                    queuedBufferItem = _bufferQueue.take();
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                if (queuedBufferItem._buffer == null) {
                    // LOG.info("Writer Thread received empty buffer item. Exiting");
                    return;
                } else {
                    ByteBuffer theBuffer = queuedBufferItem._buffer;
                    // LOG.info("Writer Thread received item. Limit:" + theBuffer.limit());
                    // get byte pointer
                    byte[] bufferAsBytes = theBuffer.array();
                    int itemsWritten = 0;
                    long timeStart = System.currentTimeMillis();
                    while (theBuffer.remaining() != 0) {
                        // now read in key length
                        int keyLen = theBuffer.getInt();
                        // mark key position
                        int keyPos = theBuffer.position();
                        // now skip past key length
                        theBuffer.position(keyPos + keyLen);
                        // read value length
                        int valueLen = theBuffer.getInt();
                        // mark value position
                        int valuePosition = theBuffer.position();
                        // now skip past it ...
                        theBuffer.position(valuePosition + valueLen);
                        // now write this out to the sequence file ...
                        try {
                            spillRawRecord2(bufferAsBytes, keyPos, keyLen, bufferAsBytes, valuePosition,
                                    valueLen);
                        } catch (IOException e) {
                            LOG.error("Writer Thread Failed with Error:" + StringUtils.stringifyException(e));
                            _writerException = e;
                            return;
                        }
                        itemsWritten++;
                    }
                    // LOG.info("Writer Thread Finished With Buffer. Wrote:" + itemsWritten
                    //     + " in:" + (System.currentTimeMillis() - timeStart));
                }
            }
        }
    });
    _writerThread.start();
}
From source file:org.hypertable.FsBroker.hadoop.HadoopBroker.java
License:Open Source License
/**
 * Returns a brand new instance of the FileSystem. It does not use
 * the FileSystem.Cache. In newer versions of HDFS, we can directly
 * invoke FileSystem.newInstance(Configuration).
 *
 * @param conf Configuration
 * @return A new instance of the filesystem
 */
private static FileSystem newInstanceFileSystem(Configuration conf) throws IOException {
    URI uri = FileSystem.getDefaultUri(conf);
    Class<?> clazz = conf.getClass("fs." + uri.getScheme() + ".impl", null);
    if (clazz == null) {
        throw new IOException("No FileSystem for scheme: " + uri.getScheme());
    }
    FileSystem fs = (FileSystem) ReflectionUtils.newInstance(clazz, conf);
    fs.initialize(uri, conf);
    return fs;
}
From source file:org.kiji.mapreduce.IntegrationTestJobHistoryKijiTable.java
License:Apache License
/** Test of all the basic information recorded by a mapper. */
@Test
public void testMappers() throws Exception {
    createAndPopulateFooTable();
    final Configuration jobConf = getConf();
    // Set a value in the configuration. We'll check to be sure we can retrieve it later.
    jobConf.set("conf.test.animal.string", "squirrel");
    final Kiji kiji = Kiji.Factory.open(getKijiURI());
    final KijiTable fooTable = kiji.openTable("foo");
    final JobHistoryKijiTable jobHistory = JobHistoryKijiTable.open(kiji);
    // Construct a Producer for this table.
    final KijiProduceJobBuilder builder = KijiProduceJobBuilder.create().withConf(jobConf)
            .withInputTable(fooTable).withProducer(EmailDomainProducer.class)
            .withOutput(new DirectKijiTableMapReduceJobOutput(fooTable));
    MapReduceJob mrJob = builder.build();
    // Record the jobId and run the job.
    String jobName = mrJob.getHadoopJob().getJobName();
    LOG.info("About to run job: " + jobName);
    assertTrue(mrJob.run());
    String jobId = mrJob.getHadoopJob().getJobID().toString();
    LOG.info("Job was run with id: " + jobId);
    // Retrieve the recorded values and sanity test them.
    KijiRowData jobRecord = jobHistory.getJobDetails(jobId);
    assertTrue(jobRecord.containsColumn("info", "jobName"));
    assertEquals(jobRecord.getMostRecentValue("info", "jobName").toString(), jobName);
    assertTrue(jobRecord.containsColumn("info", "jobId"));
    assertEquals(jobRecord.getMostRecentValue("info", "jobId").toString(), jobId);
    assertTrue(jobRecord.containsColumn("info", "startTime"));
    assertTrue(jobRecord.containsColumn("info", "endTime"));
    assertTrue(jobRecord.<Long>getMostRecentValue("info", "startTime") < jobRecord
            .<Long>getMostRecentValue("info", "endTime"));
    // Check counters. We don't know the exact number of rows in the foo table, so just check if
    // it's greater than 0.
    assertTrue(jobRecord.containsColumn("info", "counters"));
    final String countersString = jobRecord.getMostRecentValue("info", "counters").toString();
    final Pattern countersPattern = Pattern.compile("PRODUCER_ROWS_PROCESSED=(\\d+)");
    final Matcher countersMatcher = countersPattern.matcher(countersString);
    assertTrue(countersMatcher.find());
    assertTrue(Integer.parseInt(countersMatcher.group(1)) > 0);
    // Test to make sure the Configuration has the correct producer class, and records the value
    // we set previously.
    assertTrue(jobRecord.containsColumn("info", "configuration"));
    final String configString = jobRecord.getMostRecentValue("info", "configuration").toString();
    final Configuration config = new Configuration();
    config.addResource(new ByteArrayInputStream(configString.getBytes()));
    assertTrue(EmailDomainProducer.class == config.getClass(KijiConfKeys.KIJI_PRODUCER_CLASS, null));
    assertEquals("Couldn't retrieve configuration field from deserialized configuration.", "squirrel",
            config.get("conf.test.animal.string"));
    fooTable.close();
    jobHistory.close();
    kiji.release();
}
From source file:org.kiji.scoring.batch.impl.ScoreFunctionMapper.java
License:Apache License
/** {@inheritDoc} */
@Override
@SuppressWarnings("unchecked")
protected void setup(final Context context) throws IOException {
    super.setup(context);
    Preconditions.checkState(null == mFreshenerContext);
    final Configuration conf = context.getConfiguration();
    final Class<? extends ScoreFunction<?>> scoreFunctionClass = (Class<? extends ScoreFunction<?>>) conf
            .getClass(ScoreFunctionJobBuilder.SCORE_FUNCTION_CLASS_CONF_KEY, null);
    if (null == scoreFunctionClass) {
        throw new IOException("ScoreFunction class could not be found in configuration.");
    }
    mScoreFunction = ReflectionUtils.newInstance(scoreFunctionClass, conf);
    mAttachedColumn = new KijiColumnName(
            conf.get(ScoreFunctionJobBuilder.SCORE_FUNCTION_ATTACHED_COLUMN_CONF_KEY));
    mParameters = GSON.fromJson(conf.get(ScoreFunctionJobBuilder.SCORE_FUNCTION_PARAMETERS_CONF_KEY),
            Map.class);
    final KeyValueStoreReaderFactory factory = KeyValueStoreReaderFactory.create(conf);
    mClientDataRequest = getClientDataRequestFromConf(conf);
    mFreshenerContext = InternalFreshenerContext.create(mClientDataRequest, mAttachedColumn, mParameters,
            Maps.<String, String>newHashMap(), factory);
    mTableContext = KijiTableContextFactory.create(context);
    mScoreFunction.setup(mFreshenerContext);
}
From source file:org.kitesdk.data.mapreduce.DatasetKeyInputFormat.java
License:Apache License
@SuppressWarnings({ "deprecation", "unchecked" })
private static <E> View<E> load(Configuration conf) {
    Class<E> type;
    try {
        type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
    } catch (RuntimeException e) {
        if (e.getCause() instanceof ClassNotFoundException) {
            throw new TypeNotFoundException(String.format(
                    "The Java class %s for the entity type could not be found", conf.get(KITE_TYPE)),
                    e.getCause());
        } else {
            throw e;
        }
    }
    String schemaStr = conf.get(KITE_READER_SCHEMA);
    Schema projection = null;
    if (schemaStr != null) {
        projection = new Schema.Parser().parse(schemaStr);
    }
    String inputUri = conf.get(KITE_INPUT_URI);
    if (projection != null) {
        return Datasets.load(inputUri).asSchema(projection).asType(type);
    } else {
        return Datasets.load(inputUri, type);
    }
}
From source file:org.kitesdk.data.mapreduce.DatasetKeyOutputFormat.java
License:Apache License
@SuppressWarnings("unchecked")
private static <E> Class<E> getType(JobContext jobContext) {
    Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
    Class<E> type;
    try {
        type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
    } catch (RuntimeException e) {
        if (e.getCause() instanceof ClassNotFoundException) {
            throw new TypeNotFoundException(String.format(
                    "The Java class %s for the entity type could not be found", conf.get(KITE_TYPE)),
                    e.getCause());
        } else {
            throw e;
        }
    }
    return type;
}
From source file:org.mrgeo.buildpyramid.BuildPyramidMapper.java
License:Apache License
@SuppressWarnings("rawtypes")
@Override
public void setup(Mapper.Context context) {
    Configuration conf = context.getConfiguration();
    tolevel = conf.getInt(BuildPyramidDriver.TO_LEVEL, 0);
    fromlevel = conf.getInt(BuildPyramidDriver.FROM_LEVEL, 0);
    try {
        Map<String, MrsImagePyramidMetadata> meta = HadoopUtils.getMetadata(context.getConfiguration());
        metadata = meta.values().iterator().next();
        aggregator = (Aggregator) ReflectionUtils
                .newInstance(conf.getClass(BuildPyramidDriver.AGGREGATOR, MeanAggregator.class), conf);
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
    tileCounter = context.getCounter("Build Pyramid Mapper", "Source Tiles Processed");
}