Example usage for org.apache.hadoop.conf Configuration getClass

Introduction

On this page you can find example usages of org.apache.hadoop.conf.Configuration.getClass, collected from open-source projects.

Prototype

public Class<?> getClass(String name, Class<?> defaultValue) 

Document

Get the value of the name property as a Class. If no such property exists, then defaultValue is returned.
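
The snippet below is a minimal, self-contained sketch of how getClass is typically used (the configuration key "my.codec.class" and the codec choices are illustrative, not taken from the examples that follow): resolve a class from a property, fall back to a default, and instantiate it with ReflectionUtils. It also shows the three-argument overload getClass(name, defaultValue, xface), which several of the usage examples below rely on for a typed result.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class GetClassExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // "my.codec.class" is a hypothetical key used only for this sketch.
        conf.setClass("my.codec.class", GzipCodec.class, CompressionCodec.class);

        // Two-argument form: returns the configured class, or DefaultCodec.class
        // when the property is not set.
        Class<?> codecClass = conf.getClass("my.codec.class", DefaultCodec.class);
        System.out.println("Resolved (untyped): " + codecClass.getName());

        // Three-argument form: same lookup, but the result is checked against and
        // typed to the given interface.
        Class<? extends CompressionCodec> typedCodecClass =
                conf.getClass("my.codec.class", DefaultCodec.class, CompressionCodec.class);

        // Classes resolved this way are usually instantiated via ReflectionUtils,
        // which also passes the Configuration to Configurable implementations.
        CompressionCodec codec = ReflectionUtils.newInstance(typedCodecClass, conf);
        System.out.println("Instantiated codec: " + codec.getClass().getName());
    }
}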

Usage

From source file: org.apache.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper.java

License: Apache License

public static void executeMapReduceJob(final MapReduce mapReduce, final Memory.Admin memory,
        final Configuration configuration) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration newConfiguration = new Configuration(configuration);
    final boolean vertexProgramExists = newConfiguration.get(VertexProgram.VERTEX_PROGRAM, null) != null;
    if (vertexProgramExists) {
        newConfiguration.set(Constants.GREMLIN_HADOOP_GRAPH_READER, InputOutputHelper.getInputFormat(
                (Class) newConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class))
                .getCanonicalName());
        newConfiguration.unset(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
    final BaseConfiguration apacheConfiguration = new BaseConfiguration();
    apacheConfiguration.setDelimiterParsingDisabled(true);
    mapReduce.storeState(apacheConfiguration);
    ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, newConfiguration);

    final Optional<Comparator<?>> mapSort = mapReduce.getMapKeySort();
    final Optional<Comparator<?>> reduceSort = mapReduce.getReduceKeySort();
    newConfiguration.setClass(Constants.GREMLIN_HADOOP_MAP_REDUCE_CLASS, mapReduce.getClass(), MapReduce.class);
    final Job job = Job.getInstance(newConfiguration, mapReduce.toString());
    HadoopGraph.LOGGER.info(Constants.GREMLIN_HADOOP_JOB_PREFIX + mapReduce.toString());
    job.setJarByClass(HadoopGraph.class);
    if (mapSort.isPresent())
        job.setSortComparatorClass(ObjectWritableComparator.ObjectWritableMapComparator.class);
    job.setMapperClass(HadoopMap.class);
    if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
        if (mapReduce.doStage(MapReduce.Stage.COMBINE))
            job.setCombinerClass(HadoopCombine.class);
        job.setReducerClass(HadoopReduce.class);
    } else {
        if (mapSort.isPresent()) {
            job.setReducerClass(Reducer.class);
            job.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        } else {
            job.setNumReduceTasks(0);
        }
    }
    job.setMapOutputKeyClass(ObjectWritable.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(ObjectWritable.class);
    job.setOutputValueClass(ObjectWritable.class);
    job.setInputFormatClass(GraphFilterInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // if there is no vertex program, then grab the graph from the input location
    final Path graphPath;
    if (vertexProgramExists) {
        graphPath = new Path(
                Constants.getGraphLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
    } else {
        graphPath = new Path(newConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
    }

    Path memoryPath = new Path(
            Constants.getMemoryLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION),
                    (reduceSort.isPresent() ? mapReduce.getMemoryKey() + "-temp" : mapReduce.getMemoryKey())));
    if (FileSystem.get(newConfiguration).exists(memoryPath)) {
        FileSystem.get(newConfiguration).delete(memoryPath, true);
    }
    FileInputFormat.setInputPaths(job, graphPath);
    FileOutputFormat.setOutputPath(job, memoryPath);
    job.waitForCompletion(true);

    // if there is a reduce sort, we need to run another identity MapReduce job
    if (reduceSort.isPresent()) {
        final Job reduceSortJob = Job.getInstance(newConfiguration, "ReduceKeySort");
        reduceSortJob.setSortComparatorClass(ObjectWritableComparator.ObjectWritableReduceComparator.class);
        reduceSortJob.setMapperClass(Mapper.class);
        reduceSortJob.setReducerClass(Reducer.class);
        reduceSortJob.setMapOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setMapOutputValueClass(ObjectWritable.class);
        reduceSortJob.setOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setOutputValueClass(ObjectWritable.class);
        reduceSortJob.setInputFormatClass(SequenceFileInputFormat.class);
        reduceSortJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        reduceSortJob.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        FileInputFormat.setInputPaths(reduceSortJob, memoryPath);
        final Path sortedMemoryPath = new Path(Constants.getMemoryLocation(
                newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), mapReduce.getMemoryKey()));
        FileOutputFormat.setOutputPath(reduceSortJob, sortedMemoryPath);
        reduceSortJob.waitForCompletion(true);
        FileSystem.get(newConfiguration).delete(memoryPath, true); // delete the temporary memory path
        memoryPath = sortedMemoryPath;
    }
    mapReduce.addResultToMemory(memory, new ObjectWritableIterator(newConfiguration, memoryPath));
}

From source file: org.apache.tinkerpop.gremlin.hadoop.structure.util.ConfUtil.java

License: Apache License

public static InputFormat<NullWritable, VertexWritable> getReaderAsInputFormat(
        final Configuration hadoopConfiguration) {
    final Class<?> readerClass = hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER,
            Object.class);
    try {
        return InputFormat.class.isAssignableFrom(readerClass)
                ? (InputFormat<NullWritable, VertexWritable>) readerClass.newInstance()
                : (InputFormat<NullWritable, VertexWritable>) Class
                        .forName("org.apache.tinkerpop.gremlin.spark.structure.io.InputRDDFormat")
                        .newInstance();
    } catch (final Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}

From source file: org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer.java

License: Apache License

private Future<ComputerResult> submitWithExecutor(Executor exec) {
    // create the completable future
    return computerService.submit(() -> {
        final long startTime = System.currentTimeMillis();
        //////////////////////////////////////////////////
        /////// PROCESS SHIM AND SYSTEM PROPERTIES ///////
        //////////////////////////////////////////////////
        ConfigurationUtils.copy(this.hadoopGraph.configuration(), this.sparkConfiguration);
        final String shimService = KryoSerializer.class.getCanonicalName()
                .equals(this.sparkConfiguration.getString(Constants.SPARK_SERIALIZER, null))
                        ? UnshadedKryoShimService.class.getCanonicalName()
                        : HadoopPoolShimService.class.getCanonicalName();
        this.sparkConfiguration.setProperty(KryoShimServiceLoader.KRYO_SHIM_SERVICE, shimService);
        ///////////
        final StringBuilder params = new StringBuilder();
        this.sparkConfiguration.getKeys().forEachRemaining(key -> {
            if (KEYS_PASSED_IN_JVM_SYSTEM_PROPERTIES.contains(key)) {
                params.append(" -D").append("tinkerpop.").append(key).append("=")
                        .append(this.sparkConfiguration.getProperty(key));
                System.setProperty("tinkerpop." + key, this.sparkConfiguration.getProperty(key).toString());
            }
        });
        if (params.length() > 0) {
            this.sparkConfiguration.setProperty(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS,
                    (this.sparkConfiguration.getString(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS, "")
                            + params.toString()).trim());
            this.sparkConfiguration.setProperty(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS,
                    (this.sparkConfiguration.getString(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, "")
                            + params.toString()).trim());
        }
        KryoShimServiceLoader.applyConfiguration(this.sparkConfiguration);
        //////////////////////////////////////////////////
        //////////////////////////////////////////////////
        //////////////////////////////////////////////////
        // apache and hadoop configurations that are used throughout the graph computer computation
        final org.apache.commons.configuration.Configuration graphComputerConfiguration = new HadoopConfiguration(
                this.sparkConfiguration);
        if (!graphComputerConfiguration.containsKey(Constants.SPARK_SERIALIZER)) {
            graphComputerConfiguration.setProperty(Constants.SPARK_SERIALIZER,
                    KryoSerializer.class.getCanonicalName());
            if (!graphComputerConfiguration.containsKey(Constants.SPARK_KRYO_REGISTRATOR))
                graphComputerConfiguration.setProperty(Constants.SPARK_KRYO_REGISTRATOR,
                        GryoRegistrator.class.getCanonicalName());
        }
        graphComputerConfiguration.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES,
                this.persist.equals(GraphComputer.Persist.EDGES));
        final Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(graphComputerConfiguration);
        final Storage fileSystemStorage = FileSystemStorage.open(hadoopConfiguration);
        final Storage sparkContextStorage = SparkContextStorage.open(graphComputerConfiguration);
        final boolean inputFromHDFS = FileInputFormat.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean inputFromSpark = PersistedInputRDD.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class));
        final boolean outputToHDFS = FileOutputFormat.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean outputToSpark = PersistedOutputRDD.class.isAssignableFrom(
                hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class));
        final boolean skipPartitioner = graphComputerConfiguration
                .getBoolean(Constants.GREMLIN_SPARK_SKIP_PARTITIONER, false);
        final boolean skipPersist = graphComputerConfiguration
                .getBoolean(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE, false);
        String inputLocation = null;
        if (inputFromSpark)
            inputLocation = Constants
                    .getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION),
                            sparkContextStorage)
                    .orElse(null);
        else if (inputFromHDFS)
            inputLocation = Constants
                    .getSearchGraphLocation(hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION),
                            fileSystemStorage)
                    .orElse(null);
        if (null == inputLocation)
            inputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION);

        if (null != inputLocation && inputFromHDFS) {
            try {
                graphComputerConfiguration.setProperty(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
                        FileSystem.get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath()
                                .toString());
                hadoopConfiguration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR, FileSystem
                        .get(hadoopConfiguration).getFileStatus(new Path(inputLocation)).getPath().toString());
            } catch (final IOException e) {
                throw new IllegalStateException(e.getMessage(), e);
            }
        }
        final InputRDD inputRDD;
        final OutputRDD outputRDD;
        final boolean filtered;
        try {
            inputRDD = InputRDD.class.isAssignableFrom(
                    hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, Object.class))
                            ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER,
                                    InputRDD.class, InputRDD.class).newInstance()
                            : InputFormatRDD.class.newInstance();
            outputRDD = OutputRDD.class.isAssignableFrom(
                    hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, Object.class))
                            ? hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER,
                                    OutputRDD.class, OutputRDD.class).newInstance()
                            : OutputFormatRDD.class.newInstance();
            // if the input class can filter on load, then set the filters
            if (inputRDD instanceof InputFormatRDD
                    && GraphFilterAware.class.isAssignableFrom(hadoopConfiguration.getClass(
                            Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class))) {
                GraphFilterAware.storeGraphFilter(graphComputerConfiguration, hadoopConfiguration,
                        this.graphFilter);
                filtered = false;
            } else if (inputRDD instanceof GraphFilterAware) {
                ((GraphFilterAware) inputRDD).setGraphFilter(this.graphFilter);
                filtered = false;
            } else if (this.graphFilter.hasFilter()) {
                filtered = true;
            } else {
                filtered = false;
            }
        } catch (final InstantiationException | IllegalAccessException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }

        SparkMemory memory = null;
        // delete output location
        final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, null);
        if (null != outputLocation) {
            if (outputToHDFS && fileSystemStorage.exists(outputLocation))
                fileSystemStorage.rm(outputLocation);
            if (outputToSpark && sparkContextStorage.exists(outputLocation))
                sparkContextStorage.rm(outputLocation);
        }

        // the Spark application name will always be set by SparkContextStorage, thus, INFO the name to make it easier to debug
        logger.debug(Constants.GREMLIN_HADOOP_SPARK_JOB_PREFIX
                + (null == this.vertexProgram ? "No VertexProgram" : this.vertexProgram) + "["
                + this.mapReducers + "]");

        // create the spark configuration from the graph computer configuration
        final SparkConf sparkConfiguration = new SparkConf();
        hadoopConfiguration.forEach(entry -> sparkConfiguration.set(entry.getKey(), entry.getValue()));
        // execute the vertex program and map reducers and if there is a failure, auto-close the spark context
        try {
            final JavaSparkContext sparkContext = new JavaSparkContext(
                    SparkContext.getOrCreate(sparkConfiguration));
            this.loadJars(hadoopConfiguration, sparkContext); // add the project jars to the cluster
            Spark.create(sparkContext.sc()); // this is the context RDD holder that prevents GC
            updateLocalConfiguration(sparkContext, sparkConfiguration);
            // create a message-passing friendly rdd from the input rdd
            boolean partitioned = false;
            JavaPairRDD<Object, VertexWritable> loadedGraphRDD = inputRDD
                    .readGraphRDD(graphComputerConfiguration, sparkContext);
            // if there are vertex or edge filters, filter the loaded graph rdd prior to partitioning and persisting
            if (filtered) {
                this.logger.debug("Filtering the loaded graphRDD: " + this.graphFilter);
                loadedGraphRDD = SparkExecutor.applyGraphFilter(loadedGraphRDD, this.graphFilter);
            }
            // if the loaded graph RDD is already partitioned use that partitioner, else partition it with HashPartitioner
            if (loadedGraphRDD.partitioner().isPresent())
                this.logger.debug("Using the existing partitioner associated with the loaded graphRDD: "
                        + loadedGraphRDD.partitioner().get());
            else {
                if (!skipPartitioner) {
                    final Partitioner partitioner = new HashPartitioner(
                            this.workersSet ? this.workers : loadedGraphRDD.partitions().size());
                    this.logger.debug("Partitioning the loaded graphRDD: " + partitioner);
                    loadedGraphRDD = loadedGraphRDD.partitionBy(partitioner);
                    partitioned = true;
                    assert loadedGraphRDD.partitioner().isPresent();
                } else {
                    assert skipPartitioner == !loadedGraphRDD.partitioner().isPresent(); // no easy way to test this with a test case
                    this.logger.debug("Partitioning has been skipped for the loaded graphRDD via "
                            + Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
                }
            }
            // if the loaded graphRDD was already partitioned previous, then this coalesce/repartition will not take place
            if (this.workersSet) {
                if (loadedGraphRDD.partitions().size() > this.workers) // ensures that the loaded graphRDD does not have more partitions than workers
                    loadedGraphRDD = loadedGraphRDD.coalesce(this.workers);
                else if (loadedGraphRDD.partitions().size() < this.workers) // ensures that the loaded graphRDD does not have less partitions than workers
                    loadedGraphRDD = loadedGraphRDD.repartition(this.workers);
            }
            // persist the vertex program loaded graph as specified by configuration or else use default cache() which is MEMORY_ONLY
            if (!skipPersist && (!inputFromSpark || partitioned || filtered))
                loadedGraphRDD = loadedGraphRDD.persist(StorageLevel.fromString(
                        hadoopConfiguration.get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));

            // final graph with view (for persisting and/or mapReducing -- may be null and thus, possible to save space/time)
            JavaPairRDD<Object, VertexWritable> computedGraphRDD = null;
            ////////////////////////////////
            // process the vertex program //
            ////////////////////////////////
            if (null != this.vertexProgram) {
                memory = new SparkMemory(this.vertexProgram, this.mapReducers, sparkContext);
                /////////////////
                // if there is a registered VertexProgramInterceptor, use it to bypass the GraphComputer semantics
                if (graphComputerConfiguration
                        .containsKey(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR)) {
                    try {
                        final SparkVertexProgramInterceptor<VertexProgram> interceptor = (SparkVertexProgramInterceptor) Class
                                .forName(graphComputerConfiguration
                                        .getString(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR))
                                .newInstance();
                        computedGraphRDD = interceptor.apply(this.vertexProgram, loadedGraphRDD, memory);
                    } catch (final ClassNotFoundException | IllegalAccessException | InstantiationException e) {
                        throw new IllegalStateException(e.getMessage());
                    }
                } else { // standard GraphComputer semantics
                    // get a configuration that will be propagated to all workers
                    final HadoopConfiguration vertexProgramConfiguration = new HadoopConfiguration();
                    this.vertexProgram.storeState(vertexProgramConfiguration);
                    // set up the vertex program and wire up configurations
                    this.vertexProgram.setup(memory);
                    JavaPairRDD<Object, ViewIncomingPayload<Object>> viewIncomingRDD = null;
                    memory.broadcastMemory(sparkContext);
                    // execute the vertex program
                    while (true) {
                        if (Thread.interrupted()) {
                            sparkContext.cancelAllJobs();
                            throw new TraversalInterruptedException();
                        }
                        memory.setInExecute(true);
                        viewIncomingRDD = SparkExecutor.executeVertexProgramIteration(loadedGraphRDD,
                                viewIncomingRDD, memory, graphComputerConfiguration,
                                vertexProgramConfiguration);
                        memory.setInExecute(false);
                        if (this.vertexProgram.terminate(memory))
                            break;
                        else {
                            memory.incrIteration();
                            memory.broadcastMemory(sparkContext);
                        }
                    }
                    // if the graph will be continued to be used (persisted or mapreduced), then generate a view+graph
                    if ((null != outputRDD && !this.persist.equals(Persist.NOTHING))
                            || !this.mapReducers.isEmpty()) {
                        computedGraphRDD = SparkExecutor.prepareFinalGraphRDD(loadedGraphRDD, viewIncomingRDD,
                                this.vertexProgram.getVertexComputeKeys());
                        assert null != computedGraphRDD && computedGraphRDD != loadedGraphRDD;
                    } else {
                        // ensure that the computedGraphRDD was not created
                        assert null == computedGraphRDD;
                    }
                }
                /////////////////
                memory.complete(); // drop all transient memory keys
                // write the computed graph to the respective output (rdd or output format)
                if (null != outputRDD && !this.persist.equals(Persist.NOTHING)) {
                    assert null != computedGraphRDD; // the logic holds that a computeGraphRDD must be created at this point
                    outputRDD.writeGraphRDD(graphComputerConfiguration, computedGraphRDD);
                }
            }

            final boolean computedGraphCreated = computedGraphRDD != null && computedGraphRDD != loadedGraphRDD;
            if (!computedGraphCreated)
                computedGraphRDD = loadedGraphRDD;

            final Memory.Admin finalMemory = null == memory ? new MapMemory() : new MapMemory(memory);

            //////////////////////////////
            // process the map reducers //
            //////////////////////////////
            if (!this.mapReducers.isEmpty()) {
                // create a mapReduceRDD for executing the map reduce jobs on
                JavaPairRDD<Object, VertexWritable> mapReduceRDD = computedGraphRDD;
                if (computedGraphCreated && !outputToSpark) {
                    // drop all the edges of the graph as they are not used in mapReduce processing
                    mapReduceRDD = computedGraphRDD.mapValues(vertexWritable -> {
                        vertexWritable.get().dropEdges(Direction.BOTH);
                        return vertexWritable;
                    });
                    // if there is only one MapReduce to execute, don't bother wasting the clock cycles.
                    if (this.mapReducers.size() > 1)
                        mapReduceRDD = mapReduceRDD.persist(StorageLevel.fromString(hadoopConfiguration
                                .get(Constants.GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
                }

                for (final MapReduce mapReduce : this.mapReducers) {
                    // execute the map reduce job
                    final HadoopConfiguration newApacheConfiguration = new HadoopConfiguration(
                            graphComputerConfiguration);
                    mapReduce.storeState(newApacheConfiguration);
                    // map
                    final JavaPairRDD mapRDD = SparkExecutor.executeMap((JavaPairRDD) mapReduceRDD, mapReduce,
                            newApacheConfiguration);
                    // combine
                    final JavaPairRDD combineRDD = mapReduce.doStage(MapReduce.Stage.COMBINE)
                            ? SparkExecutor.executeCombine(mapRDD, newApacheConfiguration)
                            : mapRDD;
                    // reduce
                    final JavaPairRDD reduceRDD = mapReduce.doStage(MapReduce.Stage.REDUCE)
                            ? SparkExecutor.executeReduce(combineRDD, mapReduce, newApacheConfiguration)
                            : combineRDD;
                    // write the map reduce output back to disk and computer result memory
                    if (null != outputRDD)
                        mapReduce.addResultToMemory(finalMemory, outputRDD.writeMemoryRDD(
                                graphComputerConfiguration, mapReduce.getMemoryKey(), reduceRDD));
                }
                // if the mapReduceRDD is not simply the computed graph, unpersist the mapReduceRDD
                if (computedGraphCreated && !outputToSpark) {
                    assert loadedGraphRDD != computedGraphRDD;
                    assert mapReduceRDD != computedGraphRDD;
                    mapReduceRDD.unpersist();
                } else {
                    assert mapReduceRDD == computedGraphRDD;
                }
            }

            // unpersist the loaded graph if it will not be used again (no PersistedInputRDD)
            // if the graphRDD was loaded from Spark, but then partitioned or filtered, its a different RDD
            if (!inputFromSpark || partitioned || filtered)
                loadedGraphRDD.unpersist();
            // unpersist the computed graph if it will not be used again (no PersistedOutputRDD)
            // if the computed graph is the loadedGraphRDD because it was not mutated and not-unpersisted, then don't unpersist the computedGraphRDD/loadedGraphRDD
            if ((!outputToSpark || this.persist.equals(GraphComputer.Persist.NOTHING)) && computedGraphCreated)
                computedGraphRDD.unpersist();
            // delete any file system or rdd data if persist nothing
            if (null != outputLocation && this.persist.equals(GraphComputer.Persist.NOTHING)) {
                if (outputToHDFS)
                    fileSystemStorage.rm(outputLocation);
                if (outputToSpark)
                    sparkContextStorage.rm(outputLocation);
            }
            // update runtime and return the newly computed graph
            finalMemory.setRuntime(System.currentTimeMillis() - startTime);
            // clear properties that should not be propagated in an OLAP chain
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_HADOOP_VERTEX_PROGRAM_INTERCEPTOR);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_GRAPH_CACHE);
            graphComputerConfiguration.clearProperty(Constants.GREMLIN_SPARK_SKIP_PARTITIONER);
            return new DefaultComputerResult(InputOutputHelper.getOutputGraph(graphComputerConfiguration,
                    this.resultGraph, this.persist), finalMemory.asImmutable());
        } finally {
            if (!graphComputerConfiguration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false))
                Spark.close();
        }
    });
}

From source file: org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter.java

License: Open Source License

public SequenceFileSpillWriter(FileSystem fileSystem, Configuration conf, Path outputFilePath,
        Class<KeyType> keyClass, Class<ValueType> valueClass,
        SequenceFileIndexWriter<KeyType, ValueType> optionalIndexWriter, boolean compress) throws IOException {

    _indexWriter = optionalIndexWriter;
    _spillBufferSize = conf.getInt(SPILL_WRITER_BUFFER_SIZE_PARAM, DEFAULT_SPILL_BUFFER_SIZE);
    _outputStream = fileSystem.create(outputFilePath);

    // allocate buffer ...
    _activeBuffer = ByteBuffer.allocate(_spillBufferSize);

    if (compress) {
        Class codecClass = conf.getClass("mapred.output.compression.codec", DefaultCodec.class);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);

        writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.BLOCK,
                codec);
    } else {
        writer = SequenceFile.createWriter(conf, _outputStream, keyClass, valueClass, CompressionType.NONE,
                null);
    }

    _writerThread = new Thread(new Runnable() {

        @Override
        public void run() {
            // LOG.info("Writer Thread Starting");

            while (true) {

                QueuedBufferItem queuedBufferItem = null;

                try {
                    queuedBufferItem = _bufferQueue.take();
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
                if (queuedBufferItem._buffer == null) {
                    // LOG.info("Writer Thread received empty buffer item. Exiting");
                    return;
                } else {

                    ByteBuffer theBuffer = queuedBufferItem._buffer;

                    // LOG.info("Writer Thread received item. Limit:" +
                    // theBuffer.limit());

                    // get byte pointer
                    byte[] bufferAsBytes = theBuffer.array();

                    int itemsWritten = 0;
                    long timeStart = System.currentTimeMillis();

                    while (theBuffer.remaining() != 0) {

                        // now read in key length
                        int keyLen = theBuffer.getInt();
                        // mark key position
                        int keyPos = theBuffer.position();
                        // now skip past key length
                        theBuffer.position(keyPos + keyLen);
                        // read value length
                        int valueLen = theBuffer.getInt();
                        // mark value position
                        int valuePosition = theBuffer.position();
                        // now skip past it ...
                        theBuffer.position(valuePosition + valueLen);
                        // now write this out to the sequence file ...

                        try {
                            spillRawRecord2(bufferAsBytes, keyPos, keyLen, bufferAsBytes, valuePosition,
                                    valueLen);
                        } catch (IOException e) {
                            LOG.error("Writer Thread Failed with Error:" + StringUtils.stringifyException(e));
                            _writerException = e;
                            return;
                        }
                        itemsWritten++;
                    }
                    // LOG.info("Writer Thread Finished With Buffer. Wrote:"+
                    // itemsWritten + " in:" + (System.currentTimeMillis() -
                    // timeStart));
                }
            }
        }

    });
    _writerThread.start();
}

From source file: org.hypertable.FsBroker.hadoop.HadoopBroker.java

License: Open Source License

/**
 * Returns a brand new instance of the FileSystem. It does not use
 * the FileSystem.Cache. In newer versions of HDFS, we can directly
 * invoke FileSystem.newInstance(Configuration).
 *
 * @param conf Configuration
 * @return A new instance of the filesystem
 */
private static FileSystem newInstanceFileSystem(Configuration conf) throws IOException {
    URI uri = FileSystem.getDefaultUri(conf);
    Class<?> clazz = conf.getClass("fs." + uri.getScheme() + ".impl", null);
    if (clazz == null) {
        throw new IOException("No FileSystem for scheme: " + uri.getScheme());
    }
    FileSystem fs = (FileSystem) ReflectionUtils.newInstance(clazz, conf);
    fs.initialize(uri, conf);
    return fs;
}

From source file: org.kiji.mapreduce.IntegrationTestJobHistoryKijiTable.java

License: Apache License

/**
 * Test of all the basic information recorded by a mapper.
 */
@Test
public void testMappers() throws Exception {
    createAndPopulateFooTable();
    final Configuration jobConf = getConf();
    // Set a value in the configuration. We'll check to be sure we can retrieve it later.
    jobConf.set("conf.test.animal.string", "squirrel");
    final Kiji kiji = Kiji.Factory.open(getKijiURI());
    final KijiTable fooTable = kiji.openTable("foo");
    final JobHistoryKijiTable jobHistory = JobHistoryKijiTable.open(kiji);

    // Construct a Producer for this table.
    final KijiProduceJobBuilder builder = KijiProduceJobBuilder.create().withConf(jobConf)
            .withInputTable(fooTable).withProducer(EmailDomainProducer.class)
            .withOutput(new DirectKijiTableMapReduceJobOutput(fooTable));
    MapReduceJob mrJob = builder.build();

    // Record the jobId and run the job.
    String jobName = mrJob.getHadoopJob().getJobName();
    LOG.info("About to run job: " + jobName);
    assertTrue(mrJob.run());
    String jobId = mrJob.getHadoopJob().getJobID().toString();
    LOG.info("Job was run with id: " + jobId);

    // Retrieve the recorded values and sanity test them.
    KijiRowData jobRecord = jobHistory.getJobDetails(jobId);
    assertTrue(jobRecord.containsColumn("info", "jobName"));
    assertEquals(jobRecord.getMostRecentValue("info", "jobName").toString(), jobName);
    assertTrue(jobRecord.containsColumn("info", "jobId"));
    assertEquals(jobRecord.getMostRecentValue("info", "jobId").toString(), jobId);

    assertTrue(jobRecord.containsColumn("info", "startTime"));
    assertTrue(jobRecord.containsColumn("info", "endTime"));
    assertTrue(jobRecord.<Long>getMostRecentValue("info", "startTime") < jobRecord
            .<Long>getMostRecentValue("info", "endTime"));

    // Check counters. We don't know the exact number of rows in the foo table, so just check if
    // it's greater than 0.
    assertTrue(jobRecord.containsColumn("info", "counters"));
    final String countersString = jobRecord.getMostRecentValue("info", "counters").toString();
    final Pattern countersPattern = Pattern.compile("PRODUCER_ROWS_PROCESSED=(\\d+)");
    final Matcher countersMatcher = countersPattern.matcher(countersString);
    assertTrue(countersMatcher.find());
    assertTrue(Integer.parseInt(countersMatcher.group(1)) > 0);

    // Test to make sure the Configuration has the correct producer class, and records the value
    // we set previously.
    assertTrue(jobRecord.containsColumn("info", "configuration"));
    final String configString = jobRecord.getMostRecentValue("info", "configuration").toString();
    final Configuration config = new Configuration();
    config.addResource(new ByteArrayInputStream(configString.getBytes()));
    assertTrue(EmailDomainProducer.class == config.getClass(KijiConfKeys.KIJI_PRODUCER_CLASS, null));
    assertEquals("Couldn't retrieve configuration field from deserialized configuration.", "squirrel",
            config.get("conf.test.animal.string"));

    fooTable.close();
    jobHistory.close();
    kiji.release();
}

From source file: org.kiji.scoring.batch.impl.ScoreFunctionMapper.java

License: Apache License

/** {@inheritDoc} */
@Override
@SuppressWarnings("unchecked")
protected void setup(final Context context) throws IOException {
    super.setup(context);
    Preconditions.checkState(null == mFreshenerContext);
    final Configuration conf = context.getConfiguration();
    final Class<? extends ScoreFunction<?>> scoreFunctionClass = (Class<? extends ScoreFunction<?>>) conf
            .getClass(ScoreFunctionJobBuilder.SCORE_FUNCTION_CLASS_CONF_KEY, null);
    if (null == scoreFunctionClass) {
        throw new IOException("ScoreFunction class could not be found in configuration.");
    }
    mScoreFunction = ReflectionUtils.newInstance(scoreFunctionClass, conf);
    mAttachedColumn = new KijiColumnName(
            conf.get(ScoreFunctionJobBuilder.SCORE_FUNCTION_ATTACHED_COLUMN_CONF_KEY));
    mParameters = GSON.fromJson(conf.get(ScoreFunctionJobBuilder.SCORE_FUNCTION_PARAMETERS_CONF_KEY),
            Map.class);
    final KeyValueStoreReaderFactory factory = KeyValueStoreReaderFactory.create(conf);
    mClientDataRequest = getClientDataRequestFromConf(conf);
    mFreshenerContext = InternalFreshenerContext.create(mClientDataRequest, mAttachedColumn, mParameters,
            Maps.<String, String>newHashMap(), factory);
    mTableContext = KijiTableContextFactory.create(context);
    mScoreFunction.setup(mFreshenerContext);
}

From source file: org.kitesdk.data.mapreduce.DatasetKeyInputFormat.java

License: Apache License

@SuppressWarnings({ "deprecation", "unchecked" })
private static <E> View<E> load(Configuration conf) {
    Class<E> type;
    try {
        type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
    } catch (RuntimeException e) {
        if (e.getCause() instanceof ClassNotFoundException) {
            throw new TypeNotFoundException(String
                    .format("The Java class %s for the entity type could not be found", conf.get(KITE_TYPE)),
                    e.getCause());
        } else {
            throw e;
        }
    }

    String schemaStr = conf.get(KITE_READER_SCHEMA);
    Schema projection = null;
    if (schemaStr != null) {
        projection = new Schema.Parser().parse(schemaStr);
    }

    String inputUri = conf.get(KITE_INPUT_URI);
    if (projection != null) {
        return Datasets.load(inputUri).asSchema(projection).asType(type);
    } else {
        return Datasets.load(inputUri, type);
    }
}

From source file: org.kitesdk.data.mapreduce.DatasetKeyOutputFormat.java

License: Apache License

@SuppressWarnings("unchecked")
private static <E> Class<E> getType(JobContext jobContext) {
    Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
    Class<E> type;
    try {
        type = (Class<E>) conf.getClass(KITE_TYPE, GenericData.Record.class);
    } catch (RuntimeException e) {
        if (e.getCause() instanceof ClassNotFoundException) {
            throw new TypeNotFoundException(String
                    .format("The Java class %s for the entity type could not be found", conf.get(KITE_TYPE)),
                    e.getCause());
        } else {
            throw e;
        }
    }
    return type;
}

From source file: org.mrgeo.buildpyramid.BuildPyramidMapper.java

License: Apache License

@SuppressWarnings("rawtypes")
@Override
public void setup(Mapper.Context context) {
    Configuration conf = context.getConfiguration();

    tolevel = conf.getInt(BuildPyramidDriver.TO_LEVEL, 0);
    fromlevel = conf.getInt(BuildPyramidDriver.FROM_LEVEL, 0);

    try {
        Map<String, MrsImagePyramidMetadata> meta = HadoopUtils.getMetadata(context.getConfiguration());
        metadata = meta.values().iterator().next();

        aggregator = (Aggregator) ReflectionUtils
                .newInstance(conf.getClass(BuildPyramidDriver.AGGREGATOR, MeanAggregator.class), conf);

    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
    tileCounter = context.getCounter("Build Pyramid Mapper", "Source Tiles Processed");
}