List of usage examples for org.apache.hadoop.conf Configuration getClass
public Class<?> getClass(String name, Class<?> defaultValue)
Gets the value of the name property as a Class; if no such property is specified, then defaultValue is returned.
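For orientation, here is a minimal, self-contained sketch of the lookup-then-instantiate pattern most of the examples below share. The property key "example.codec.class" is purely illustrative (it is not taken from any of the source files); the getClass and ReflectionUtils calls are the standard Hadoop API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class GetClassExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // "example.codec.class" is a hypothetical key; since it is unset here,
        // getClass returns the supplied default, DefaultCodec.class.
        Class<?> codecClass = conf.getClass("example.codec.class", DefaultCodec.class);
        // Common follow-up seen in the examples on this page: instantiate the resolved class.
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        System.out.println("Resolved codec: " + codec.getClass().getName());
    }
}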
From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper.java
License:Apache License
public static void executeMapReduceJob(final MapReduce mapReduce, final Memory.Admin memory,
        final Configuration configuration) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration newConfiguration = new Configuration(configuration);
    final boolean vertexProgramExists = newConfiguration.get(VertexProgram.VERTEX_PROGRAM, null) != null;
    if (vertexProgramExists) {
        newConfiguration.set(Constants.GREMLIN_HADOOP_GRAPH_READER,
                InputOutputHelper.getInputFormat(
                        (Class) newConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class))
                        .getCanonicalName());
        newConfiguration.unset(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
    final BaseConfiguration apacheConfiguration = new BaseConfiguration();
    apacheConfiguration.setDelimiterParsingDisabled(true);
    mapReduce.storeState(apacheConfiguration);
    ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, newConfiguration);
    final Optional<Comparator<?>> mapSort = mapReduce.getMapKeySort();
    final Optional<Comparator<?>> reduceSort = mapReduce.getReduceKeySort();
    newConfiguration.setClass(Constants.GREMLIN_HADOOP_MAP_REDUCE_CLASS, mapReduce.getClass(), MapReduce.class);
    final Job job = Job.getInstance(newConfiguration, mapReduce.toString());
    HadoopGraph.LOGGER.info(Constants.GREMLIN_HADOOP_JOB_PREFIX + mapReduce.toString());
    job.setJarByClass(HadoopGraph.class);
    if (mapSort.isPresent())
        job.setSortComparatorClass(ObjectWritableComparator.ObjectWritableMapComparator.class);
    job.setMapperClass(HadoopMap.class);
    if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
        if (mapReduce.doStage(MapReduce.Stage.COMBINE))
            job.setCombinerClass(HadoopCombine.class);
        job.setReducerClass(HadoopReduce.class);
    } else {
        if (mapSort.isPresent()) {
            job.setReducerClass(Reducer.class);
            job.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        } else {
            job.setNumReduceTasks(0);
        }
    }
    job.setMapOutputKeyClass(ObjectWritable.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(ObjectWritable.class);
    job.setOutputValueClass(ObjectWritable.class);
    job.setInputFormatClass(GraphFilterInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // if there is no vertex program, then grab the graph from the input location
    final Path graphPath;
    if (vertexProgramExists) {
        graphPath = new Path(
                Constants.getGraphLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
    } else {
        graphPath = new Path(newConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
    }
    Path memoryPath = new Path(
            Constants.getMemoryLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION),
                    (reduceSort.isPresent() ? mapReduce.getMemoryKey() + "-temp" : mapReduce.getMemoryKey())));
    if (FileSystem.get(newConfiguration).exists(memoryPath)) {
        FileSystem.get(newConfiguration).delete(memoryPath, true);
    }
    FileInputFormat.setInputPaths(job, graphPath);
    FileOutputFormat.setOutputPath(job, memoryPath);
    job.waitForCompletion(true);
    // if there is a reduce sort, we need to run another identity MapReduce job
    if (reduceSort.isPresent()) {
        final Job reduceSortJob = Job.getInstance(newConfiguration, "ReduceKeySort");
        reduceSortJob.setSortComparatorClass(ObjectWritableComparator.ObjectWritableReduceComparator.class);
        reduceSortJob.setMapperClass(Mapper.class);
        reduceSortJob.setReducerClass(Reducer.class);
        reduceSortJob.setMapOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setMapOutputValueClass(ObjectWritable.class);
        reduceSortJob.setOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setOutputValueClass(ObjectWritable.class);
        reduceSortJob.setInputFormatClass(SequenceFileInputFormat.class);
        reduceSortJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        reduceSortJob.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        FileInputFormat.setInputPaths(reduceSortJob, memoryPath);
        final Path sortedMemoryPath = new Path(Constants.getMemoryLocation(
                newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), mapReduce.getMemoryKey()));
        FileOutputFormat.setOutputPath(reduceSortJob, sortedMemoryPath);
        reduceSortJob.waitForCompletion(true);
        FileSystem.get(newConfiguration).delete(memoryPath, true); // delete the temporary memory path
        memoryPath = sortedMemoryPath;
    }
    mapReduce.addResultToMemory(memory, new ObjectWritableIterator(newConfiguration, memoryPath));
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.util.ConfUtil.java
License:Apache License
public static InputFormat<NullWritable, VertexWritable> getReaderAsInputFormat(
        final Configuration hadoopConfiguration) {
    final Class<?> readerClass = hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER,
            Object.class);
    try {
        return InputFormat.class.isAssignableFrom(readerClass)
                ? (InputFormat<NullWritable, VertexWritable>) readerClass.newInstance()
                : (InputFormat<NullWritable, VertexWritable>) Class
                        .forName("org.apache.tinkerpop.gremlin.spark.structure.io.InputRDDFormat").newInstance();
    } catch (final Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
From source file:org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer.java
License:Apache License
private Future<ComputerResult> submitWithExecutor(Executor exec) {
    // create the completable future
    return computerService.submit(() -> {
        final long startTime = System.currentTimeMillis();
        //////////////////////////////////////////////////
        /////// PROCESS SHIM AND SYSTEM PROPERTIES ///////
        //////////////////////////////////////////////////
        ConfigurationUtils.copy(this.hadoopGraph.configuration(), this.sparkConfiguration);
        final String shimService = KryoSerializer.class.getCanonicalName()
                .equals(this.sparkConfiguration.getString(Constants.SPARK_SERIALIZER, null))
                        ? UnshadedKryoShimService.class.getCanonicalName()
                        : HadoopPoolShimService.class.getCanonicalName();
        this.sparkConfiguration.setProperty(KryoShimServiceLoader.KRYO_SHIM_SERVICE, shimService);
        ///////////
        final StringBuilder params = new StringBuilder();
        this.sparkConfiguration.getKeys().forEachRemaining(key -> {
            if (KEYS_PASSED_IN_JVM_SYSTEM_PROPERTIES.contains(key)) {
                params.append(" -D").append("tinkerpop.").append(key).append("=")
                        .append(this.sparkConfiguration.getProperty(key));
                System.setProperty("tinkerpop." + key, this.sparkConfiguration.getProperty(key).toString());
            }
        });
        if (params.length() > 0) {
            this.sparkConfiguration.setProperty(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS,
                    (this.sparkConfiguration.getString(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS, "")
                            + params.toString()).trim());
            this.sparkConfiguration.setProperty(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS,
                    (this.sparkConfiguration.getString(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, "")
                            + params.toString()).trim());
        }
        KryoShimServiceLoader.applyConfiguration(this.sparkConfiguration);
        //////////////////////////////////////////////////
        //////////////////////////////////////////////////
        //////////////////////////////////////////////////
        // apache and hadoop configurations that are used throughout the graph computer computation
        final org.apache.commons.configuration.Configuration graphComputerConfiguration = new HadoopConfiguration(
                this.sparkConfiguration);
        if (!graphComputerConfiguration.containsKey(Constants.SPARK_SERIALIZER)) {
            graphComputerConfiguration.setProperty(Constants.SPARK_SERIALIZER,
                    KryoSerializer.class.getCanonicalName());
            if (!graphComputerConfiguration.containsKey(Constants.SPARK_KRYO_REGISTRATOR))
                graphComputerConfiguration.setProperty(Constants.SPARK_KRYO_REGISTRATOR,
                        GryoRegistrator.class.getCanonicalName());
        }
        graphComputerConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES,
                this.persist.equals(GraphComputer.Persist.EDGES));
        final Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(graphComputerConfiguration);
        final Storage fileSystemStorage = FileSystemStorage.open(hadoopConfiguration);
        final Storage sparkContextStorage = SparkContextStorage.open(graphComputerConfiguration);
        final boolean inputFromHDFS = FileInputFormat.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean inputFromSpark = PersistedInputRDD.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean outputToHDFS = FileOutputFormat.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean outputToSpark = PersistedOutputRDD.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean skipPartitioner = graphComputerConfiguration
                .getBoolean(Constants.GREMLIN_SPARK_SKIP_PARTITIONER, false);
        final boolean skipPersist = graphComputerConfiguration
                .getBoolean(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE, false);
        String inputLocation = null;
        if (inputFromSpark)
            inputLocation = Constants
                    .getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION),
                            sparkContextStorage)
                    .orElse(null);
        else if (inputFromHDFS)
            inputLocation = Constants
                    .getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION),
                            fileSystemStorage)
                    .orElse(null);
        if (null == inputLocation)
            inputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION);
        if (null != inputLocation && inputFromHDFS) {
            try {
                graphComputerConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
                        FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath()
                                .toString());
                hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, FileSystem
                        .get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath().toString());
            } catch (final IOException e) {
                throw new IllegalStateException(e.getMessage(), e);
            }
        }
        final InputRDD inputRDD;
        final OutputRDD outputRDD;
        final boolean filtered;
        try {
            inputRDD = InputRDD.class.isAssignableFrom(
                    hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class))
                            ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER,
                                    InputRDD.class, InputRDD.class).newInstance()
                            : InputFormatRDD.class.newInstance();
            outputRDD = OutputRDD.class.isAssignableFrom(
                    hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class))
                            ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER,
                                    OutputRDD.class, OutputRDD.class).newInstance()
                            : OutputFormatRDD.class.newInstance();
            // if the input class can filter on load, then set the filters
            if (inputRDD instanceof InputFormatRDD
                    && GraphFilterAware.class.isAssignableFrom(hadoopConfiguration.getClass(
                            Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class))) {
                GraphFilterAware.storeGraphFilter(graphComputerConfiguration, hadoopConfiguration,
                        this.graphFilter);
                filtered = false;
            } else if (inputRDD instanceof GraphFilterAware) {
                ((GraphFilterAware) inputRDD).setGraphFilter(this.graphFilter);
                filtered = false;
            } else if (this.graphFilter.hasFilter()) {
                filtered = true;
            } else {
                filtered = false;
            }
        } catch (final InstantiationException | IllegalAccessException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }
        SparkMemory memory = null;
        // delete output location
        final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
        if (null != outputLocation) {
            if (outputToHDFS && fileSystemStorage.exists(outputLocation))
                fileSystemStorage.rm(outputLocation);
            if (outputToSpark && sparkContextStorage.exists(outputLocation))
                sparkContextStorage.rm(outputLocation);
        }
        // the Spark application name will always be set by SparkContextStorage, thus, INFO the name to make it easier to debug
        logger.debug(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX
                + (null == this.vertexProgram ? "No VertexProgram" : this.vertexProgram) + "["
                + this.mapReducers + "]");
        // create the spark configuration from the graph computer configuration
        final SparkConf sparkConfiguration = new SparkConf();
        hadoopConfiguration.forEach(entry -> sparkConfiguration.set(entry.getKey(), entry.getValue()));
        // execute the vertex program and map reducers and if there is a failure, auto-close the spark context
        try {
            final JavaSparkContext sparkContext = new JavaSparkContext(
                    SparkContext.getOrCreate(sparkConfiguration));
            this.loadJars(hadoopConfiguration, sparkContext); // add the project jars to the cluster
            Spark.create(sparkContext.sc()); // this is the context RDD holder that prevents GC
            updateLocalConfiguration(sparkContext, sparkConfiguration);
            // create a message-passing friendly rdd from the input rdd
            boolean partitioned = false;
            JavaPairRDD<Object, VertexWritable> loadedGraphRDD = inputRDD
                    .readGraphRDD(graphComputerConfiguration, sparkContext);
            // if there are vertex or edge filters, filter the loaded graph rdd prior to partitioning and persisting
            if (filtered) {
                this.logger.debug("Filtering the loaded graphRDD: " + this.graphFilter);
                loadedGraphRDD = SparkExecutor.applyGraphFilter(loadedGraphRDD, this.graphFilter);
            }
            // if the loaded graph RDD is already partitioned use that partitioner, else partition it with HashPartitioner
            if (loadedGraphRDD.partitioner().isPresent())
                this.logger.debug("Using the existing partitioner associated with the loaded graphRDD: "
                        + loadedGraphRDD.partitioner().get());
            else {
                if (!skipPartitioner) {
                    final Partitioner partitioner = new HashPartitioner(
                            this.workersSet ? this.workers : loadedGraphRDD.partitions().size());
                    this.logger.debug("Partitioning the loaded graphRDD: " + partitioner);
                    loadedGraphRDD = loadedGraphRDD.partitionBy(partitioner);
                    partitioned = true;
                    assert loadedGraphRDD.partitioner().isPresent();
                } else {
                    assert skipPartitioner == !loadedGraphRDD.partitioner().isPresent(); // no easy way to test this with a test case
                    this.logger.debug("Partitioning has been skipped for the loaded graphRDD via "
                            + Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
                }
            }
            // if the loaded graphRDD was already partitioned previous, then this coalesce/repartition will not take place
            if (this.workersSet) {
                if (loadedGraphRDD.partitions().size() > this.workers) // ensures that the loaded graphRDD does not have more partitions than workers
                    loadedGraphRDD = loadedGraphRDD.coalesce(this.workers);
                else if (loadedGraphRDD.partitions().size() < this.workers) // ensures that the loaded graphRDD does not have less partitions than workers
                    loadedGraphRDD = loadedGraphRDD.repartition(this.workers);
            }
            // persist the vertex program loaded graph as specified by configuration or else use default cache() which is MEMORY_ONLY
            if (!skipPersist && (!inputFromSpark || partitioned || filtered))
                loadedGraphRDD = loadedGraphRDD.persist(StorageLevel.fromString(
                        hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
            // final graph with view (for persisting and/or mapReducing -- may be null and thus, possible to save space/time)
            JavaPairRDD<Object, VertexWritable> computedGraphRDD = null;
            ////////////////////////////////
            // process the vertex program //
            ////////////////////////////////
            if (null != this.vertexProgram) {
                memory = new SparkMemory(this.vertexProgram, this.mapReducers, sparkContext);
                /////////////////
                // if there is a registered VertexProgramInterceptor, use it to bypass the GraphComputer semantics
                if (graphComputerConfiguration
                        .containsKey(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)) {
                    try {
                        final SparkVertexProgramInterceptor<VertexProgram> interceptor = (SparkVertexProgramInterceptor) Class
                                .forName(graphComputerConfiguration
                                        .getString(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR))
                                .newInstance();
                        computedGraphRDD = interceptor.apply(this.vertexProgram, loadedGraphRDD, memory);
                    } catch (final ClassNotFoundException | IllegalAccessException | InstantiationException e) {
                        throw new IllegalStateException(e.getMessage());
                    }
                } else { // standard GraphComputer semantics
                    // get a configuration that will be propagated to all workers
                    final HadoopConfiguration vertexProgramConfiguration = new HadoopConfiguration();
                    this.vertexProgram.storeState(vertexProgramConfiguration);
                    // set up the vertex program and wire up configurations
                    this.vertexProgram.setup(memory);
                    JavaPairRDD<Object, ViewIncomingPayload<Object>> viewIncomingRDD = null;
                    memory.broadcastMemory(sparkContext);
                    // execute the vertex program
                    while (true) {
                        if (Thread.interrupted()) {
                            sparkContext.cancelAllJobs();
                            throw new TraversalInterruptedException();
                        }
                        memory.setInExecute(true);
                        viewIncomingRDD = SparkExecutor.executeVertexProgramIteration(loadedGraphRDD,
                                viewIncomingRDD, memory, graphComputerConfiguration,
                                vertexProgramConfiguration);
                        memory.setInExecute(false);
                        if (this.vertexProgram.terminate(memory))
                            break;
                        else {
                            memory.incrIteration();
                            memory.broadcastMemory(sparkContext);
                        }
                    }
                    // if the graph will be continued to be used (persisted or mapreduced), then generate a view+graph
                    if ((null != outputRDD && !this.persist.equals(Persist.NOTHING))
                            || !this.mapReducers.isEmpty()) {
                        computedGraphRDD = SparkExecutor.prepareFinalGraphRDD(loadedGraphRDD, viewIncomingRDD,
                                this.vertexProgram.getVertexComputeKeys());
                        assert null != computedGraphRDD && computedGraphRDD != loadedGraphRDD;
                    } else {
                        // ensure that the computedGraphRDD was not created
                        assert null == computedGraphRDD;
                    }
                }
                /////////////////
                memory.complete(); // drop all transient memory keys
                // write the computed graph to the respective output (rdd or output format)
                if (null != outputRDD && !this.persist.equals(Persist.NOTHING)) {
                    assert null != computedGraphRDD; // the logic holds that a computeGraphRDD must be created at this point
                    outputRDD.writeGraphRDD(graphComputerConfiguration, computedGraphRDD);
                }
            }
            final boolean computedGraphCreated = computedGraphRDD != null && computedGraphRDD != loadedGraphRDD;
            if (!computedGraphCreated)
                computedGraphRDD = loadedGraphRDD;
            final Memory.Admin finalMemory = null == memory ? new MapMemory() : new MapMemory(memory);
            //////////////////////////////
            // process the map reducers //
            //////////////////////////////
            if (!this.mapReducers.isEmpty()) {
                // create a mapReduceRDD for executing the map reduce jobs on
                JavaPairRDD<Object, VertexWritable> mapReduceRDD = computedGraphRDD;
                if (computedGraphCreated && !outputToSpark) {
                    // drop all the edges of the graph as they are not used in mapReduce processing
                    mapReduceRDD = computedGraphRDD.mapValues(vertexWritable -> {
                        vertexWritable.get().dropEdges(Direction.BOTH);
                        return vertexWritable;
                    });
                    // if there is only one MapReduce to execute, don't bother wasting the clock cycles.
                    if (this.mapReducers.size() > 1)
                        mapReduceRDD = mapReduceRDD.persist(StorageLevel.fromString(hadoopConfiguration
                                .get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
                }
                for (final MapReduce mapReduce : this.mapReducers) {
                    // execute the map reduce job
                    final HadoopConfiguration newApacheConfiguration = new HadoopConfiguration(
                            graphComputerConfiguration);
                    mapReduce.storeState(newApacheConfiguration);
                    // map
                    final JavaPairRDD mapRDD = SparkExecutor.executeMap((JavaPairRDD) mapReduceRDD, mapReduce,
                            newApacheConfiguration);
                    // combine
                    final JavaPairRDD combineRDD = mapReduce.doStage(MapReduce.Stage.COMBINE)
                            ? SparkExecutor.executeCombine(mapRDD, newApacheConfiguration)
                            : mapRDD;
                    // reduce
                    final JavaPairRDD reduceRDD = mapReduce.doStage(MapReduce.Stage.REDUCE)
                            ? SparkExecutor.executeReduce(combineRDD, mapReduce, newApacheConfiguration)
                            : combineRDD;
                    // write the map reduce output back to disk and computer result memory
                    if (null != outputRDD)
                        mapReduce.addResultToMemory(finalMemory, outputRDD.writeMemoryRDD(
                                graphComputerConfiguration, mapReduce.getMemoryKey(), reduceRDD));
                }
                // if the mapReduceRDD is not simply the computed graph, unpersist the mapReduceRDD
                if (computedGraphCreated && !outputToSpark) {
                    assert loadedGraphRDD != computedGraphRDD;
                    assert mapReduceRDD != computedGraphRDD;
                    mapReduceRDD.unpersist();
                } else {
                    assert mapReduceRDD == computedGraphRDD;
                }
            }
            // unpersist the loaded graph if it will not be used again (no PersistedInputRDD)
            // if the graphRDD was loaded from Spark, but then partitioned or filtered, its a different RDD
            if (!inputFromSpark || partitioned || filtered)
                loadedGraphRDD.unpersist();
            // unpersist the computed graph if it will not be used again (no PersistedOutputRDD)
            // if the computed graph is the loadedGraphRDD because it was not mutated and not-unpersisted, then don't unpersist the computedGraphRDD/loadedGraphRDD
            if ((!outputToSpark || this.persist.equals(GraphComputer.Persist.NOTHING)) && computedGraphCreated)
                computedGraphRDD.unpersist();
            // delete any file system or rdd data if persist nothing
            if (null != outputLocation && this.persist.equals(GraphComputer.Persist.NOTHING)) {
                if (outputToHDFS)
                    fileSystemStorage.rm(outputLocation);
                if (outputToSpark)
                    sparkContextStorage.rm(outputLocation);
            }
            // update runtime and return the newly computed graph
            finalMemory.setRuntime(System.currentTimeMillis() - startTime);
            // clear properties that should not be propagated in an OLAP chain
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            return new DefaultComputerResult(InputOutputHelper.getOutputGraph(graphComputerConfiguration,
                    this.resultGraph, this.persist), finalMemory.asImmutable());
        } finally {
            if (!graphComputerConfiguration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false))
                Spark.close();
        }
    });
}
From source file:org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter.java
License:Open Source License
public SequenceFileSpillWriter(FileSystem fileSystem, Configuration conf, Path outputFilePath,
        Class<KeyType> keyClass, Class<ValueType> valueClass,
        SequenceFileIndexWriter<KeyType, ValueType> optionalIndexWriter, boolean compress) throws IOException {
    _indexWriter = optionalIndexWriter;
    _spillBufferSize = conf.getInt(SPILL_WRITER_BUFFER_SIZE_PARAM, DEFAULT_SPILL_BUFFER_SIZE);
    _outputStream = fileSystem.create(outputFilePath);
    // allocate buffer ...
    _activeBuffer = ByteBuffer.allocate(_spillBufferSize);
    if (compress) {
        Class codecClass = conf.getClass("mapred.output.compression.codec", DefaultCodec.class);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.BLOCK,
                codec);
    } else {
        writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.NONE,
                null);
    }
    _writerThread = new Thread(new Runnable() {

        @Override
        public void run() {
            // LOG.info("Writer Thread Starting");
            while (true) {
                QueuedBufferItem queuedBufferItem = null;
                try {
                    queuedBufferItem = _bufferQueue.take();
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                if (queuedBufferItem._buffer == null) {
                    // LOG.info("Writer Thread received empty buffer item. Exiting");
                    return;
                } else {
                    ByteBuffer theBuffer = queuedBufferItem._buffer;
                    // LOG.info("Writer Thread received item. Limit:" + theBuffer.limit());
                    // get byte pointer
                    byte[] bufferAsBytes = theBuffer.array();
                    int itemsWritten = 0;
                    long timeStart = System.currentTimeMillis();
                    while (theBuffer.remaining() != 0) {
                        // now read in key length
                        int keyLen = theBuffer.getInt();
                        // mark key position
                        int keyPos = theBuffer.position();
                        // now skip past key length
                        theBuffer.position(keyPos + keyLen);
                        // read value length
                        int valueLen = theBuffer.getInt();
                        // mark value position
                        int valuePosition = theBuffer.position();
                        // now skip past it ...
                        theBuffer.position(valuePosition + valueLen);
                        // now write this out to the sequence file ...
                        try {
                            spillRawRecord2(bufferAsBytes, keyPos, keyLen, bufferAsBytes, valuePosition,
                                    valueLen);
                        } catch (IOException e) {
                            LOG.error("Writer Thread Failed with Error:" + StringUtils.stringifyException(e));
                            _writerException = e;
                            return;
                        }
                        itemsWritten++;
                    }
                    // LOG.info("Writer Thread Finished With Buffer. Wrote:" + itemsWritten
                    //     + " in:" + (System.currentTimeMillis() - timeStart));
                }
            }
        }
    });
    _writerThread.start();
}
From source file:org.hypertable.FsBroker.hadoop.HadoopBroker.java
License:Open Source License
/**
 * Returns a brand new instance of the FileSystem. It does not use
 * the FileSystem.Cache. In newer versions of HDFS, we can directly
 * invoke FileSystem.newInstance(Configuration).
 *
 * @param conf Configuration
 * @return A new instance of the filesystem
 */
private static FileSystem newInstanceFileSystem(Configuration conf) throws IOException {
    URI uri = FileSystem.getDefaultUri(conf);
    Class<?> clazz = conf.getClass("fs." + uri.getScheme() + ".impl", null);
    if (clazz == null) {
        throw new IOException("No FileSystem for scheme: " + uri.getScheme());
    }
    FileSystem fs = (FileSystem) ReflectionUtils.newInstance(clazz, conf);
    fs.initialize(uri, conf);
    return fs;
}
From source file:org.kiji.mapreduce.IntegrationTestJobHistoryKijiTable.java
License:Apache License
/** Test of all the basic information recorded by a mapper. */
@Test
public void testMappers() throws Exception {
    createAndPopulateFooTable();
    final Configuration jobConf = getConf();
    // Set a value in the configuration. We'll check to be sure we can retrieve it later.
    jobConf.set("conf.test.animal.string", "squirrel");
    final Kiji kiji = Kiji.Factory.open(getKijiURI());
    final KijiTable fooTable = kiji.openTable("foo");
    final JobHistoryKijiTable jobHistory = JobHistoryKijiTable.open(kiji);
    // Construct a Producer for this table.
    final KijiProduceJobBuilder builder = KijiProduceJobBuilder.create().withConf(jobConf)
            .withInputTable(fooTable).withProducer(EmailDomainProducer.class)
            .withOutput(new DirectKijiTableMapReduceJobOutput(fooTable));
    MapReduceJob mrJob = builder.build();
    // Record the jobId and run the job.
    String jobName = mrJob.getHadoopJob().getJobName();
    LOG.info("About to run job: " + jobName);
    assertTrue(mrJob.run());
    String jobId = mrJob.getHadoopJob().getJobID().toString();
    LOG.info("Job was run with id: " + jobId);
    // Retrieve the recorded values and sanity test them.
    KijiRowData jobRecord = jobHistory.getJobDetails(jobId);
    assertTrue(jobRecord.containsColumn("info", "jobName"));
    assertEquals(jobRecord.getMostRecentValue("info", "jobName").toString(), jobName);
    assertTrue(jobRecord.containsColumn("info", "jobId"));
    assertEquals(jobRecord.getMostRecentValue("info", "jobId").toString(), jobId);
    assertTrue(jobRecord.containsColumn("info", "startTime"));
    assertTrue(jobRecord.containsColumn("info", "endTime"));
    assertTrue(jobRecord.<Long>getMostRecentValue("info", "startTime") < jobRecord
            .<Long>getMostRecentValue("info", "endTime"));
    // Check counters. We don't know the exact number of rows in the foo table, so just check if
    // it's greater than 0.
    assertTrue(jobRecord.containsColumn("info", "counters"));
    final String countersString = jobRecord.getMostRecentValue("info", "counters").toString();
    final Pattern countersPattern = Pattern.compile("PRODUCER_ROWS_PROCESSED=(\\d+)");
    final Matcher countersMatcher = countersPattern.matcher(countersString);
    assertTrue(countersMatcher.find());
    assertTrue(Integer.parseInt(countersMatcher.group(1)) > 0);
    // Test to make sure the Configuration has the correct producer class, and records the value
    // we set previously.
    assertTrue(jobRecord.containsColumn("info", "configuration"));
    final String configString = jobRecord.getMostRecentValue("info", "configuration").toString();
    final Configuration config = new Configuration();
    config.addResource(new ByteArrayInputStream(configString.getBytes()));
    assertTrue(EmailDomainProducer.class == config.getClass(KijiConfKeys.KIJI_PRODUCER_CLASS, null));
    assertEquals("Couldn't retrieve configuration field from deserialized configuration.", "squirrel",
            config.get("conf.test.animal.string"));
    fooTable.close();
    jobHistory.close();
    kiji.release();
}
From source file:org.kiji.scoring.batch.impl.ScoreFunctionMapper.java
License:Apache License
/** {@inheritDoc} */
@Override
@SuppressWarnings("unchecked")
protected void setup(final Context context) throws IOException {
    super.setup(context);
    Preconditions.checkState(null == mFreshenerContext);
    final Configuration conf = context.getConfiguration();
    final Class<? extends ScoreFunction<?>> scoreFunctionClass = (Class<? extends ScoreFunction<?>>) conf
            .getClass(ScoreFunctionJobBuilder.SCORE_FUNCTION_CLASS_CONF_KEY, null);
    if (null == scoreFunctionClass) {
        throw new IOException("ScoreFunction class could not be found in configuration.");
    }
    mScoreFunction = ReflectionUtils.newInstance(scoreFunctionClass, conf);
    mAttachedColumn = new KijiColumnName(
            conf.get(ScoreFunctionJobBuilder.SCORE_FUNCTION_ATTACHED_COLUMN_CONF_KEY));
    mParameters = GSON.fromJson(conf.get(ScoreFunctionJobBuilder.SCORE_FUNCTION_PARAMETERS_CONF_KEY),
            Map.class);
    final KeyValueStoreReaderFactory factory = KeyValueStoreReaderFactory.create(conf);
    mClientDataRequest = getClientDataRequestFromConf(conf);
    mFreshenerContext = InternalFreshenerContext.create(mClientDataRequest, mAttachedColumn, mParameters,
            Maps.<String, String>newHashMap(), factory);
    mTableContext = KijiTableContextFactory.create(context);
    mScoreFunction.setup(mFreshenerContext);
}
From source file:org.kitesdk.data.mapreduce.DatasetKeyInputFormat.java
License:Apache License
@SuppressWarnings({ "deprecation", "unchecked" })
private static <E> View<E> load(Configuration conf) {
    Class<E> type;
    try {
        type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
    } catch (RuntimeException e) {
        if (e.getCause() instanceof ClassNotFoundException) {
            throw new TypeNotFoundException(String.format(
                    "The Java class %s for the entity type could not be found", conf.get(KITE_TYPE)),
                    e.getCause());
        } else {
            throw e;
        }
    }
    String schemaStr = conf.get(KITE_READER_SCHEMA);
    Schema projection = null;
    if (schemaStr != null) {
        projection = new Schema.Parser().parse(schemaStr);
    }
    String inputUri = conf.get(KITE_INPUT_URI);
    if (projection != null) {
        return Datasets.load(inputUri).asSchema(projection).asType(type);
    } else {
        return Datasets.load(inputUri, type);
    }
}
From source file:org.kitesdk.data.mapreduce.DatasetKeyOutputFormat.java
License:Apache License
@SuppressWarnings("unchecked")
private static <E> Class<E> getType(JobContext jobContext) {
    Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
    Class<E> type;
    try {
        type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
    } catch (RuntimeException e) {
        if (e.getCause() instanceof ClassNotFoundException) {
            throw new TypeNotFoundException(String.format(
                    "The Java class %s for the entity type could not be found", conf.get(KITE_TYPE)),
                    e.getCause());
        } else {
            throw e;
        }
    }
    return type;
}
From source file:org.mrgeo.buildpyramid.BuildPyramidMapper.java
License:Apache License
@SuppressWarnings("rawtypes")
@Override
public void setup(Mapper.Context context) {
    Configuration conf = context.getConfiguration();
    tolevel = conf.getInt(BuildPyramidDriver.TO_LEVEL, 0);
    fromlevel = conf.getInt(BuildPyramidDriver.FROM_LEVEL, 0);
    try {
        Map<String, MrsImagePyramidMetadata> meta = HadoopUtils.getMetadata(context.getConfiguration());
        metadata = meta.values().iterator().next();
        aggregator = (Aggregator) ReflectionUtils
                .newInstance(conf.getClass(BuildPyramidDriver.AGGREGATOR, MeanAggregator.class), conf);
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
    tileCounter = context.getCounter("Build Pyramid Mapper", "Source Tiles Processed");
}