Example usage for org.apache.hadoop.conf Configuration set

Introduction

On this page you can find usage examples for org.apache.hadoop.conf Configuration.set.

Prototype

public void set(String name, String value) 

Document

Set the value of the name property.
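
A minimal sketch of the call itself, independent of the Cascading examples below; the property names used here are hypothetical:

import org.apache.hadoop.conf.Configuration;

public class ConfigurationSetExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // set stores the value under the given name; later get calls return it as a String.
        conf.set("example.greeting", "hello"); // hypothetical property name

        System.out.println(conf.get("example.greeting"));            // prints "hello"
        System.out.println(conf.get("example.missing", "fallback")); // prints "fallback" (default)
    }
}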

Usage

From source file: cascading.flow.tez.Hadoop2TezFlowStep.java

License: Open Source License

private static void setWorkingDirectory(Configuration conf) {
    String name = conf.get(JobContext.WORKING_DIR);

    if (name != null)
        return;

    try {
        Path dir = FileSystem.get(conf).getWorkingDirectory();
        conf.set(JobContext.WORKING_DIR, dir.toString());
    } catch (IOException exception) {
        throw new RuntimeException(exception);
    }
}

From source file: cascading.flow.tez.Hadoop2TezFlowStep.java

License: Open Source License

protected Map<FlowElement, Configuration> initFromSources(FlowNode flowNode,
        FlowProcess<TezConfiguration> flowProcess, Configuration conf,
        Map<String, LocalResource> taskLocalResources) {
    Set<? extends FlowElement> accumulatedSources = flowNode.getSourceElements(StreamMode.Accumulated);

    for (FlowElement element : accumulatedSources) {
        if (element instanceof Tap) {
            JobConf current = new JobConf(conf);
            Tap tap = (Tap) element;

            if (tap.getIdentifier() == null)
                throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

            tap.sourceConfInit(flowProcess, current);

            Collection<String> paths = current.getStringCollection(CASCADING_LOCAL_RESOURCES + Tap.id(tap));

            if (!paths.isEmpty()) {
                String flowStagingPath = ((Hadoop2TezFlow) getFlow()).getFlowStagingPath();
                String resourceSubPath = Tap.id(tap);
                Map<Path, Path> pathMap = TezUtil.addToClassPath(current, flowStagingPath, resourceSubPath,
                        paths, LocalResourceType.FILE, taskLocalResources, null);

                current.setStrings(CASCADING_REMOTE_RESOURCES + Tap.id(tap),
                        taskLocalResources.keySet().toArray(new String[taskLocalResources.size()]));

                allLocalResources.putAll(taskLocalResources);
                syncPaths.putAll(pathMap);
            }

            Map<String, String> map = flowProcess.diffConfigIntoMap(new TezConfiguration(conf),
                    new TezConfiguration(current));
            conf.set("cascading.node.accumulated.source.conf." + Tap.id(tap), pack(map, conf));

            setLocalMode(conf, current, tap);
        }
    }

    Set<FlowElement> sources = new HashSet<>(flowNode.getSourceElements());

    sources.removeAll(accumulatedSources);

    if (sources.isEmpty())
        throw new IllegalStateException("all sources marked as accumulated");

    Map<FlowElement, Configuration> configs = new HashMap<>();

    for (FlowElement element : sources) {
        JobConf current = new JobConf(conf);

        String id = FlowElements.id(element);

        current.set("cascading.node.source", id);

        if (element instanceof Tap) {
            Tap tap = (Tap) element;

            if (tap.getIdentifier() == null)
                throw new IllegalStateException("tap may not have null identifier: " + tap.toString());

            tap.sourceConfInit(flowProcess, current);

            setLocalMode(conf, current, tap);
        }

        configs.put(element, current);
    }

    return configs;
}

From source file: cascading.flow.tez.Hadoop2TezFlowStep.java

License: Open Source License

private Vertex newVertex(FlowNode flowNode, Configuration conf, int parallelism) {
    conf.set(FlowNode.CASCADING_FLOW_NODE, pack(flowNode, conf)); // todo: pack into payload directly

    ProcessorDescriptor descriptor = ProcessorDescriptor.create(FlowProcessor.class.getName());

    descriptor.setUserPayload(getPayload(conf));

    Vertex vertex = Vertex.create(flowNode.getID(), descriptor, parallelism);

    if (environment != null)
        vertex.setTaskEnvironment(environment);

    return vertex;
}

From source file: cascading.flow.tez.planner.Hadoop2TezPlanner.java

License: Open Source License

public static void copyProperties(Configuration jobConf, Map<Object, Object> properties) {
    if (properties instanceof Properties) {
        Properties props = (Properties) properties;
        Set<String> keys = props.stringPropertyNames();

        for (String key : keys)
            jobConf.set(key, props.getProperty(key));
    } else {
        for (Map.Entry<Object, Object> entry : properties.entrySet()) {
            if (entry.getValue() != null)
                jobConf.set(entry.getKey().toString(), entry.getValue().toString());
        }
    }
}
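
A hedged usage sketch for the method above; the wrapper method and property below are hypothetical, while copyProperties is the planner method shown:

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import cascading.flow.tez.planner.Hadoop2TezPlanner;

public static Configuration configurationFrom(Properties props) {
    Configuration jobConf = new Configuration();

    // Each entry in props is applied to jobConf via jobConf.set(key, value).
    Hadoop2TezPlanner.copyProperties(jobConf, props);

    return jobConf;
}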

From source file: cascading.flow.tez.util.TezUtil.java

License: Open Source License

public static void setSourcePathForSplit(MRInput input, MRReader reader, Configuration configuration) {
    Path path = null;

    if (Util.returnInstanceFieldIfExistsSafe(input, "useNewApi")) {
        org.apache.hadoop.mapreduce.InputSplit newInputSplit = (org.apache.hadoop.mapreduce.InputSplit) reader
                .getSplit();

        if (newInputSplit instanceof org.apache.hadoop.mapreduce.lib.input.FileSplit)
            path = ((org.apache.hadoop.mapreduce.lib.input.FileSplit) newInputSplit).getPath();
    } else {
        org.apache.hadoop.mapred.InputSplit oldInputSplit = (org.apache.hadoop.mapred.InputSplit) reader
                .getSplit();

        if (oldInputSplit instanceof org.apache.hadoop.mapred.FileSplit)
            path = ((org.apache.hadoop.mapred.FileSplit) oldInputSplit).getPath();
    }

    if (path != null)
        configuration.set(MultiInputSplit.CASCADING_SOURCE_PATH, path.toString());
}

From source file: cascading.flow.tez.util.TezUtil.java

License: Open Source License

public static void setMRProperties(ProcessorContext context, Configuration config, boolean isMapperOutput) {
    TaskAttemptID taskAttemptId = org.apache.tez.mapreduce.hadoop.mapreduce.TaskAttemptContextImpl
            .createMockTaskAttemptID(context.getApplicationId().getClusterTimestamp(),
                    context.getTaskVertexIndex(), context.getApplicationId().getId(), context.getTaskIndex(),
                    context.getTaskAttemptNumber(), isMapperOutput);

    config.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
    config.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
    config.setBoolean(JobContext.TASK_ISMAP, isMapperOutput);
    config.setInt(JobContext.TASK_PARTITION, taskAttemptId.getTaskID().getId());
}

From source file: cascading.platform.tez.Hadoop2TezPlatform.java

License: Open Source License

@Override
public synchronized void setUp() throws IOException {
    if (configuration != null)
        return;

    if (!isUseCluster()) {
        // Current usage requirements:
        // 1. Clients need to set "tez.local.mode" to true when creating a TezClient instance. (For the examples this can be done via -Dtez.local.mode=true)
        // 2. fs.defaultFS must be set to "file:///"
        // 2.1 If running examples - this must be set in tez-site.xml (so that it's picked up by the client, as well as the conf instances used to configure the Inputs / Outputs).
        // 2.2 If using programmatically (without a tez-site.xml present), all configuration instances used (to create the client / configure Inputs / Outputs) must have this property set.
        // 3. tez.runtime.optimize.local.fetch needs to be set to true (either via tez-site.xml or in all configurations used to create the job (similar to fs.defaultFS in step 2))
        // 4. tez.staging-dir must be set (either programmatically or via tez-site.xml).
        // Until TEZ-1337 goes in - the staging-dir for the job is effectively the root of the filesystem (and where inputs are read from / written to if relative paths are used).

        LOG.info("not using cluster");
        configuration = new Configuration();

        configuration.setInt(FlowRuntimeProps.GATHER_PARTITIONS, getNumGatherPartitions());
        //      configuration.setInt( FlowRuntimeProps.GATHER_PARTITIONS, 1 ); // deadlocks if larger than 1

        configuration.set(TezConfiguration.TEZ_LOCAL_MODE, "true");
        configuration.set("fs.defaultFS", "file:///");
        configuration.set("tez.runtime.optimize.local.fetch", "true");

        // hack to prevent deadlocks where downstream processors are scheduled before upstream
        configuration.setInt("tez.am.inline.task.execution.max-tasks", 3); // testHashJoinMergeIntoHashJoinAccumulatedAccumulatedMerge fails if set to 2

        configuration.set(TezConfiguration.TEZ_IGNORE_LIB_URIS, "true"); // in local mode, use local classpath
        configuration.setInt(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, -1);
        configuration.set(TezConfiguration.TEZ_GENERATE_DEBUG_ARTIFACTS, "true");

        configuration.set("tez.am.mode.session", "true"); // allows multiple TezClient instances to be used in a single jvm

        if (!Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
            configuration.set("hadoop.tmp.dir", System.getProperty("hadoop.tmp.dir"));
        else
            configuration.set("hadoop.tmp.dir", "build/test/tmp");

        fileSys = FileSystem.get(configuration);
    } else {
        LOG.info("using cluster");

        if (Util.isEmpty(System.getProperty("hadoop.log.dir")))
            System.setProperty("hadoop.log.dir", "build/test/log");

        if (Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
            System.setProperty("hadoop.tmp.dir", "build/test/tmp");

        new File(System.getProperty("hadoop.log.dir")).mkdirs(); // ignored
        new File(System.getProperty("hadoop.tmp.dir")).mkdirs(); // ignored

        Configuration defaultConf = new Configuration();

        defaultConf.setInt(FlowRuntimeProps.GATHER_PARTITIONS, getNumGatherPartitions());

        defaultConf.setInt(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, -1);

        //      defaultConf.set( TezConfiguration.TEZ_AM_LOG_LEVEL, "DEBUG" );
        //      defaultConf.set( TezConfiguration.TEZ_TASK_LOG_LEVEL, "DEBUG" );

        defaultConf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
        defaultConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
        defaultConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, System.getProperty("hadoop.tmp.dir"));

        miniDFSCluster = new MiniDFSCluster.Builder(defaultConf).numDataNodes(4).format(true).racks(null)
                .build();

        fileSys = miniDFSCluster.getFileSystem();

        Configuration tezConf = new Configuration(defaultConf);
        tezConf.set("fs.defaultFS", fileSys.getUri().toString()); // use HDFS
        tezConf.set(MRJobConfig.MR_AM_STAGING_DIR, "/apps_staging_dir");

        // see MiniTezClusterWithTimeline as alternate
        miniTezCluster = new MiniTezCluster(getClass().getName(), 4, 1, 1); // todo: set to 4
        miniTezCluster.init(tezConf);
        miniTezCluster.start();

        configuration = miniTezCluster.getConfig();

        // stats won't work after completion unless ATS is used
        if (setTimelineStore(configuration)) // true if ats can be loaded and configured for this hadoop version
        {
            configuration.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS,
                    ATSHistoryLoggingService.class.getName());
            configuration.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_ADDRESS, "localhost:10200");
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS, "localhost:8188");
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_HTTPS_ADDRESS, "localhost:8190");

            yarnHistoryServer = new ApplicationHistoryServer();
            yarnHistoryServer.init(configuration);
            yarnHistoryServer.start();
        }
    }

    configuration.setInt(TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS, 1);
    configuration.setInt(TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS, 1);
    configuration.setInt(TezConfiguration.TEZ_AM_MAX_TASK_FAILURES_PER_NODE, 1);

    Map<Object, Object> globalProperties = getGlobalProperties();

    if (logger != null)
        globalProperties.put("log4j.logger", logger);

    FlowProps.setJobPollingInterval(globalProperties, 10); // should speed up tests

    Hadoop2TezPlanner.copyProperties(configuration, globalProperties); // copy any external properties

    Hadoop2TezPlanner.copyConfiguration(properties, configuration); // put all properties on the jobconf

    ExitUtil.disableSystemExit();

    //    forbidSystemExitCall();
}

From source file: cascading.plumber.grids.AbstractGridTest.java

License: Apache License

@Test
public void shouldCopyConfigurationIntoProperties() {
    Configuration configuration = new Configuration();
    configuration.set(KEY, VALUE);
    new MockGrid().createFlowConnector(configuration);
}

From source file: cascading.scheme.DeprecatedAvroScheme.java

License: Apache License

/**
 * sinkConfInit is called by cascading to set up the sinks. This happens on the client side before the
 * job is distributed.
 * There is a check for the presence of a schema and an exception is thrown if none has been provided.
 * After the schema check the conf object is given the options that Avro needs.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param tap         The cascading Tap object. Should be passed in by cascading automatically.
 * @param conf        The Hadoop JobConf object. This is passed in by cascading automatically.
 * @throws RuntimeException If no schema is present this halts the entire process.
 */
@Override
public void sinkConfInit(FlowProcess<? extends Configuration> flowProcess,
        Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {

    if (schema == null) {
        throw new RuntimeException("Must provide sink schema");
    }
    // Set the output schema and output format class
    conf.set(AvroJob.OUTPUT_SCHEMA, schema.toString());
    conf.set("mapred.output.format.class", "org.apache.avro.mapred.AvroOutputFormat");

    // add AvroSerialization to io.serializations
    addAvroSerializations(conf);
}

From source file: cascading.scheme.DeprecatedAvroScheme.java

License: Apache License

/**
 * sourceConfInit is called by cascading to set up the sources. This happens on the client side before the
 * job is distributed.
 * There is a check for the presence of a schema and, if none has been provided, the data is peeked at to get a schema.
 * After the schema check the conf object is given the options that Avro needs.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param tap         The cascading Tap object. Should be passed in by cascading automatically.
 * @param conf        The Hadoop JobConf object. This is passed in by cascading automatically.
 * @throws RuntimeException If no schema is present this halts the entire process.
 */
@Override
public void sourceConfInit(FlowProcess<? extends Configuration> flowProcess,
        Tap<Configuration, RecordReader, OutputCollector> tap, Configuration conf) {

    retrieveSourceFields(flowProcess, tap);
    // Set the input schema and input class
    conf.set(AvroJob.INPUT_SCHEMA, schema.toString());
    conf.set("mapred.input.format.class", "org.apache.avro.mapred.AvroInputFormat");

    // add AvroSerialization to io.serializations
    addAvroSerializations(conf);
}