Example usage for org.apache.hadoop.conf Configuration setBoolean

List of usage examples for org.apache.hadoop.conf Configuration setBoolean

Introduction

On this page you can find example usage of org.apache.hadoop.conf Configuration setBoolean.

Prototype

public void setBoolean(String name, boolean value) 

Document

Set the value of the name property to a boolean.
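
Before the project examples, here is a minimal, self-contained sketch of the set/get round trip; the property name my.feature.enabled is made up purely for illustration.

import org.apache.hadoop.conf.Configuration;

public class SetBooleanDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // The flag is stored as the string "true" under the given property name.
        conf.setBoolean("my.feature.enabled", true);
        // getBoolean reads it back; the second argument is the default returned
        // when the property is unset or not a parsable boolean.
        boolean enabled = conf.getBoolean("my.feature.enabled", false);
        System.out.println(enabled); // prints: true
    }
}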

Usage

From source file:kogiri.common.hadoop.io.format.fasta.FastaReadDescriptionInputFormat.java

License:Open Source License

public static void setSplitable(Configuration conf, boolean splitable) {
    conf.setBoolean(CONF_SPLITABLE, splitable);
}
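
The natural counterpart reads the flag back with getBoolean, typically inside the input format. A hedged sketch, assuming CONF_SPLITABLE is the same String constant used above and that splitting defaults to enabled when the flag was never set:

public static boolean isSplitable(Configuration conf) {
    // Assumption: splitting is allowed unless explicitly disabled via setSplitable.
    return conf.getBoolean(CONF_SPLITABLE, true);
}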

From source file:ml.shifu.guagua.hadoop.io.GuaguaOptionsParser.java

License:Apache License

/**
 * Modify configuration according to user-specified generic options.
 *
 * @param conf
 *            Configuration to be modified
 * @param line
 *            User-specified generic options
 */
private void processGeneralOptions(Configuration conf, CommandLine line) throws IOException {
    if (line.hasOption("fs")) {
        FileSystem.setDefaultUri(conf, line.getOptionValue("fs"));
    }

    if (line.hasOption("jt")) {
        conf.set("mapred.job.tracker", line.getOptionValue("jt"));
    }
    if (line.hasOption("conf")) {
        String[] values = line.getOptionValues("conf");
        for (String value : values) {
            conf.addResource(new Path(value));
        }
    }
    if (line.hasOption("libjars")) {
        conf.set("tmpjars", validateFiles(line.getOptionValue("libjars"), conf));
        // setting libjars in client classpath
        URL[] libjars = getLibJars(conf);
        if (libjars != null && libjars.length > 0) {
            conf.setClassLoader(new URLClassLoader(libjars, conf.getClassLoader()));
            Thread.currentThread().setContextClassLoader(
                    new URLClassLoader(libjars, Thread.currentThread().getContextClassLoader()));
        }
    }
    if (line.hasOption("files")) {
        conf.set("tmpfiles", validateFiles(line.getOptionValue("files"), conf));
    }
    if (line.hasOption("archives")) {
        conf.set("tmparchives", validateFiles(line.getOptionValue("archives"), conf));
    }
    if (line.hasOption('D')) {
        String[] property = line.getOptionValues('D');
        for (String prop : property) {
            String[] keyval = prop.split("=", 2);
            if (keyval.length == 2) {
                conf.set(keyval[0], keyval[1]);
            }
        }
    }
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    // tokensFile
    if (line.hasOption("tokenCacheFile")) {
        String fileName = line.getOptionValue("tokenCacheFile");
        // check if the local file exists
        try {
            FileSystem localFs = FileSystem.getLocal(conf);
            Path p = new Path(fileName);
            if (!localFs.exists(p)) {
                throw new FileNotFoundException("File " + fileName + " does not exist.");
            }

            LOG.debug("setting conf tokensFile: {}", fileName);
            conf.set("mapreduce.job.credentials.json", localFs.makeQualified(p).toString());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
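
For context, a hedged sketch of how such generic options reach the Configuration; the option values are illustrative, and only the constructor usage shown elsewhere on this page is assumed:

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // The parser consumes the generic options and mutates conf in place.
    new GuaguaOptionsParser(conf, new String[] {
            "-D", "mapred.task.timeout=600000", // becomes conf.set("mapred.task.timeout", "600000")
            "-libjars", "myudf.jar" });         // validated and stored under "tmpjars"
    // processGeneralOptions also records that generic options were applied:
    System.out.println(conf.getBoolean("mapred.used.genericoptionsparser", false)); // true
}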

From source file:ml.shifu.guagua.mapreduce.example.nn.NNMapReduceClient.java

License:Apache License

public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 6) {
        throw new IllegalArgumentException(
                "NNMapReduceClient: Must have at least 6 arguments <guagua.iteration.count> <guagua.zk.servers> <nn.test.scale> <nn.record.scales> <input path or folder> <guagua.nn.output>.");
    }
    conf.set(GuaguaConstants.WORKER_COMPUTABLE_CLASS, NNWorker.class.getName());
    conf.set(GuaguaConstants.MASTER_COMPUTABLE_CLASS, NNMaster.class.getName());
    conf.set(GuaguaConstants.GUAGUA_ITERATION_COUNT, otherArgs[0]);

    conf.set(GuaguaConstants.GUAGUA_ZK_SERVERS, otherArgs[1]);

    conf.set(NNConstants.NN_TEST_SCALE, otherArgs[2]);
    conf.set(NNConstants.NN_RECORD_SCALE, otherArgs[3]);

    conf.set(GuaguaConstants.GUAGUA_MASTER_RESULT_CLASS, NNParams.class.getName());
    conf.set(GuaguaConstants.GUAGUA_WORKER_RESULT_CLASS, NNParams.class.getName());

    conf.setInt(NNConstants.GUAGUA_NN_INPUT_NODES, NNConstants.GUAGUA_NN_DEFAULT_INPUT_NODES);
    conf.setInt(NNConstants.GUAGUA_NN_HIDDEN_NODES, NNConstants.GUAGUA_NN_DEFAULT_HIDDEN_NODES);
    conf.setInt(NNConstants.GUAGUA_NN_OUTPUT_NODES, NNConstants.GUAGUA_NN_DEFAULT_OUTPUT_NODES);
    conf.set(NNConstants.GUAGUA_NN_ALGORITHM, NNConstants.GUAGUA_NN_DEFAULT_ALGORITHM);
    conf.setInt(NNConstants.GUAGUA_NN_THREAD_COUNT, NNConstants.GUAGUA_NN_DEFAULT_THREAD_COUNT);
    conf.set(NNConstants.GUAGUA_NN_LEARNING_RATE, NNConstants.GUAGUA_NN_DEFAULT_LEARNING_RATE);

    conf.set(NNConstants.GUAGUA_NN_OUTPUT, otherArgs[5]);

    conf.set(GuaguaConstants.GUAGUA_MASTER_INTERCEPTERS, NNOutput.class.getName());

    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, false);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, false);
    conf.setInt(GuaguaMapReduceConstants.MAPRED_TASK_TIMEOUT, 3600000);
    conf.setInt(GuaguaMapReduceConstants.IO_SORT_MB, 0);

    Job job = new Job(conf, "Guagua NN Master-Workers Job");
    job.setJarByClass(NNMapReduceClient.class);
    job.setMapperClass(GuaguaMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormatClass(NNInputFormat.class);
    job.setOutputFormatClass(GuaguaOutputFormat.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, new Path(otherArgs[4]));
    job.waitForCompletion(true);
}

From source file:ml.shifu.guagua.mapreduce.example.sum.SumMapReduceClient.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 3) {
        throw new IllegalArgumentException(
                "SumMapReduceClient: Must have at least 3 arguments <guagua.iteration.count> <guagua.zk.servers> <input path or folder>.");
    }
    conf.set(GuaguaConstants.WORKER_COMPUTABLE_CLASS, SumWorker.class.getName());
    conf.set(GuaguaConstants.MASTER_COMPUTABLE_CLASS, SumMaster.class.getName());
    conf.set(GuaguaConstants.GUAGUA_ITERATION_COUNT, otherArgs[0]);

    conf.set(GuaguaConstants.GUAGUA_ZK_SERVERS, otherArgs[1]);
    conf.setInt(GuaguaConstants.GUAGUA_ZK_SESSION_TIMEOUT, 300 * 1000);
    conf.setInt(GuaguaConstants.GUAGUA_ZK_MAX_ATTEMPTS, 5);
    conf.setInt(GuaguaConstants.GUAGUA_ZK_RETRY_WAIT_MILLS, 1000);

    // if you set the result class to a Hadoop Writable, you must use GuaguaWritableSerializer;
    // this can be avoided by using GuaguaMapReduceClient
    conf.set(GuaguaConstants.GUAGUA_MASTER_RESULT_CLASS, LongWritable.class.getName());
    conf.set(GuaguaConstants.GUAGUA_WORKER_RESULT_CLASS, LongWritable.class.getName());
    conf.set(GuaguaConstants.GUAGUA_MASTER_IO_SERIALIZER, "ml.shifu.guagua.mapreduce.GuaguaWritableSerializer");
    conf.set(GuaguaConstants.GUAGUA_WORKER_IO_SERIALIZER, "ml.shifu.guagua.mapreduce.GuaguaWritableSerializer");

    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, false);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, false);
    conf.setInt(GuaguaMapReduceConstants.MAPRED_TASK_TIMEOUT, 3600000);
    conf.setInt(GuaguaMapReduceConstants.IO_SORT_MB, 0);

    Job job = new Job(conf, "Guagua Sum Master-Workers Job");
    job.setJarByClass(SumMapReduceClient.class);
    job.setMapperClass(GuaguaMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormatClass(GuaguaInputFormat.class);
    job.setOutputFormatClass(GuaguaOutputFormat.class);
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, new Path(otherArgs[2]));
    job.waitForCompletion(true);
}

From source file:ml.shifu.guagua.mapreduce.GuaguaMapReduceClient.java

License:Apache License

/**
 * Create a Hadoop job according to the arguments from main.
 */
@SuppressWarnings("deprecation")
public synchronized Job createJob(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Set it here so that it can be overridden. Set the task timeout to a long period (30 minutes).
    conf.setInt(GuaguaMapReduceConstants.MAPRED_TASK_TIMEOUT,
            conf.getInt(GuaguaMapReduceConstants.MAPRED_TASK_TIMEOUT, 1800000));
    GuaguaOptionsParser parser = new GuaguaOptionsParser(conf, args);

    // work around a bug on HDP 2.2.4
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (hdpVersion != null && hdpVersion.length() != 0) {
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }
    CommandLine cmdLine = parser.getCommandLine();
    checkInputSetting(conf, cmdLine);
    checkZkServerSetting(conf, cmdLine);
    checkWorkerClassSetting(conf, cmdLine);
    checkMasterClassSetting(conf, cmdLine);
    checkIterationCountSetting(conf, cmdLine);
    checkResultClassSetting(conf, cmdLine);
    String name = checkMapReduceNameSetting(cmdLine);
    @SuppressWarnings("rawtypes")
    Class<? extends InputFormat> inputFormatClass = checkInputFormatSetting(cmdLine);

    // set MapReduce parameters for the specified master-workers architecture;
    // speculative execution must be disabled
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, false);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, false);
    // set mapreduce.job.max.split.locations to 100 to suppress warnings
    int maxSplits = conf.getInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 100);
    if (maxSplits < 100) {
        maxSplits = 100;
    }
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, maxSplits);

    // Set the map-side sort buffer (io.sort.mb) to 0; no map output needs to be sorted.
    conf.setInt(GuaguaMapReduceConstants.IO_SORT_MB, 0);
    // Most users hopefully won't hit this limit; it can be set higher if desired
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_COUNTERS_LIMIT,
            conf.getInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_COUNTERS_LIMIT, 512));
    conf.setInt(GuaguaMapReduceConstants.MAPRED_JOB_REDUCE_MEMORY_MB, 0);

    // append concurrent GC options to avoid long stop-the-world GC pauses
    String childJavaOpts = conf.get(GuaguaMapReduceConstants.MAPRED_CHILD_JAVA_OPTS, "");
    if (childJavaOpts == null || childJavaOpts.length() == 0) {
        conf.set(GuaguaMapReduceConstants.MAPRED_CHILD_JAVA_OPTS,
                GuaguaMapReduceConstants.MAPRED_DEFAULT_CHILD_JAVA_OPTS);
    } else {
        String newChildJavaOpts = GuaguaMapReduceConstants.MAPRED_DEFAULT_CHILD_JAVA_OPTS + " " + childJavaOpts;
        conf.set(GuaguaMapReduceConstants.MAPRED_CHILD_JAVA_OPTS, newChildJavaOpts.trim());
    }

    Job job = new Job(conf, name);
    job.setJarByClass(GuaguaMapReduceClient.class);
    job.setMapperClass(GuaguaMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(GuaguaOutputFormat.class);
    job.setNumReduceTasks(0);
    return job;
}
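
A hedged usage sketch for createJob; the no-argument constructor is an assumption, while the submission mirrors the clients shown earlier:

public static void main(String[] args) throws Exception {
    GuaguaMapReduceClient client = new GuaguaMapReduceClient(); // assumed no-arg constructor
    Job job = client.createJob(args); // args carry the -D/-libjars and guagua options
    if (!job.waitForCompletion(true)) {
        throw new RuntimeException("Guagua job failed.");
    }
}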

From source file:ml.shifu.shifu.core.processor.InitModelProcessor.java

License:Apache License

private Map<Integer, Long> getApproxDistinctCountByMRJob()
        throws IOException, InterruptedException, ClassNotFoundException {
    SourceType source = this.modelConfig.getDataSet().getSource();
    Configuration conf = new Configuration();

    // add jars to hadoop mapper and reducer
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });

    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME,
            Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 30);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set("mapred.reduce.slowstart.completed.maps",
            Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.9"));
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version should be set and configuration files should be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Column Type Auto Checking Job : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(AutoTypeDistinctCountMapper.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.modelConfig.getDataSetRawPath())));

    job.setReducerClass(AutoTypeDistinctCountReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    String autoTypePath = super.getPathFinder().getAutoTypeFilePath(source);
    FileOutputFormat.setOutputPath(job, new Path(autoTypePath));

    // clean the output path first
    ShifuFileUtils.deleteFile(autoTypePath, source);

    // submit job
    if (job.waitForCompletion(true)) {
        return getDistinctCountMap(source, autoTypePath);
    } else {
        throw new RuntimeException("MapReduce Job Auto Type Distinct Count failed.");
    }
}

From source file:ml.shifu.shifu.core.processor.PostTrainModelProcessor.java

License:Apache License

private void runMRBinAvgScoreJob(SourceType source, String postTrainOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Configuration conf = new Configuration();
    // add jars to hadoop mapper and reducer
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });

    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);

    conf.set(Constants.SHIFU_STATS_EXLCUDE_MISSING,
            Environment.getProperty(Constants.SHIFU_STATS_EXLCUDE_MISSING, "true"));

    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME,
            Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
    // set mapreduce.job.max.split.locations to 5000 to suppress warnings
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);

    conf.set("mapred.reduce.slowstart.completed.maps",
            Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.8"));
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version should be set and configuration files should be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }

    // one can set guagua conf in shifuconfig
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {
        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Post Train : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(PostTrainMapper.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(FeatureStatsWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.modelConfig.getDataSetRawPath())));

    MultipleOutputs.addNamedOutput(job, Constants.POST_TRAIN_OUTPUT_SCORE, TextOutputFormat.class,
            NullWritable.class, Text.class);

    job.setReducerClass(PostTrainReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileOutputFormat.setOutputPath(job, new Path(postTrainOutputPath));

    // clean the output path first
    ShifuFileUtils.deleteFile(postTrainOutputPath, source);

    // submit job
    if (!job.waitForCompletion(true)) {
        throw new RuntimeException("Post-train bin average score MapReduce job failed.");
    }
}

From source file:ml.shifu.shifu.core.processor.PostTrainModelProcessor.java

License:Apache License

private void runMRFeatureImportanceJob(SourceType source, String output)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Configuration conf = new Configuration();
    // add jars to hadoop mapper and reducer
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() });

    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);

    conf.set(Constants.SHIFU_STATS_EXLCUDE_MISSING,
            Environment.getProperty(Constants.SHIFU_STATS_EXLCUDE_MISSING, "true"));

    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME,
            Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
    // set mapreduce.job.max.split.locations to 5000 to suppress warnings
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);

    conf.set("mapred.reduce.slowstart.completed.maps",
            Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.8"));
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version should be set and configuration files should be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }

    // one can set guagua conf in shifuconfig
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {
        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Post Train FeatureImportance : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(FeatureImportanceMapper.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.modelConfig.getDataSetRawPath())));

    job.setReducerClass(FeatureImportanceReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileOutputFormat.setOutputPath(job, new Path(output));

    // clean the output path first
    ShifuFileUtils.deleteFile(output, source);

    // submit job
    if (!job.waitForCompletion(true)) {
        throw new RuntimeException("Post-train feature importance MapReduce job failed.");
    }
}

From source file:ml.shifu.shifu.core.processor.stats.MapReducerStatsWorker.java

License:Apache License

private void prepareJobConf(RawSourceData.SourceType source, final Configuration conf, String filePath)
        throws IOException {
    // add jars to hadoop mapper and reducer
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars(), "-files", filePath });

    conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, true);
    conf.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);

    conf.set(Constants.SHIFU_STATS_EXLCUDE_MISSING,
            Environment.getProperty(Constants.SHIFU_STATS_EXLCUDE_MISSING, "true"));

    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_MAP_SPECULATIVE, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPREDUCE_REDUCE_SPECULATIVE, true);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(this.pathFinder.getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(this.pathFinder.getColumnConfigPath(source))).toString());
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME,
            Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());

    // set mapreduce.job.max.split.locations to 5000 to suppress warnings
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 5000);
    conf.set("mapred.reduce.slowstart.completed.maps",
            Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.8"));

    conf.set(Constants.SHIFU_STATS_FILTER_EXPRESSIONS, super.modelConfig.getSegmentFilterExpressionsAsString());
    log.info("segment expressions are {}", super.modelConfig.getSegmentFilterExpressionsAsString());

    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version should be set and configuration files should be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }

    // one can set guagua conf in shifuconfig
    CommonUtils.injectHadoopShifuEnvironments(new ValueVisitor() {
        @Override
        public void inject(Object key, Object value) {
            conf.set(key.toString(), value.toString());
        }
    });
}
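
A hedged sketch of how a caller inside the same class might use prepareJobConf before submitting the stats job; the method wrapper, job name, and elided wiring are assumptions modeled on the other examples on this page:

private void runStatsJob(RawSourceData.SourceType source, String filePath)
        throws IOException, InterruptedException, ClassNotFoundException {
    final Configuration conf = new Configuration();
    prepareJobConf(source, conf, filePath);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Stats Job"); // job name is an assumption
    // ... wire mapper, reducer, input and output formats as in the other examples ...
    if (!job.waitForCompletion(true)) {
        throw new RuntimeException("Stats MapReduce job failed.");
    }
}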

From source file:ml.shifu.shifu.core.processor.StatsModelProcessor.java

License:Apache License

private void prepareJobConf(SourceType source, Configuration conf, String filePath) throws IOException {
    // add jars to hadoop mapper and reducer
    new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars(), "-files", filePath });

    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true);
    conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true);
    conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString());
    conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString());
    conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME,
            Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default"));
    conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString());
    // set mapreduce.job.max.split.locations to 30 to suppress warnings
    conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 30);
    conf.set("mapred.reduce.slowstart.completed.maps",
            Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.9"));
    String hdpVersion = HDPUtils.getHdpVersionForHDP224();
    if (StringUtils.isNotBlank(hdpVersion)) {
        // for HDP 2.2.4, hdp.version should be set and configuration files should be added to the container classpath
        conf.set("hdp.version", hdpVersion);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf);
        HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf);
    }
}