List of usage examples for org.apache.hadoop.conf.Configuration.setBoolean

public void setBoolean(String name, boolean value)

Sets the value of the name property to a boolean.
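Before the examples from real projects, here is a minimal, self-contained sketch of how setBoolean is typically paired with getBoolean. The property name "example.feature.enabled" is invented for illustration and is not a Hadoop configuration key.

    import org.apache.hadoop.conf.Configuration;

    public class SetBooleanExample {
        public static void main(String[] args) {
            Configuration conf = new Configuration();

            // Store a boolean under a (hypothetical) property name.
            conf.setBoolean("example.feature.enabled", true);

            // Read it back; the second argument is the default returned if the key is unset.
            boolean enabled = conf.getBoolean("example.feature.enabled", false);
            System.out.println("example.feature.enabled = " + enabled);
        }
    }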
From source file:com.cloudera.oryx.computation.common.JobStep.java
License:Open Source License
/**
 * Creates a new {@link MRPipeline} instance that contains common configuration settings.
 *
 * @return a new {@link MRPipeline} instance, suitably configured
 */
protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException {
    Configuration conf = OryxConfiguration.get(getConf());

    conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
    conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    conf.setBoolean("mapred.output.compress", true);
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    // Set old-style equivalents for Avro/Crunch's benefit
    conf.set("avro.output.codec", "snappy");

    conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
    conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true);
    conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true);
    conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1);

    //conf.setBoolean("crunch.disable.deep.copy", true);
    // Giving one mapper a lot of data can cause issues in some stages, so default to disable this
    conf.setBoolean("crunch.disable.combine.file", true);

    Config appConfig = ConfigUtils.getDefaultConfig();

    conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir"));

    int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb");
    log.info("Mapper memory: {}", mapMemoryMB);
    int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB);
    if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS,
                conf.get(MRJobConfig.MAP_JAVA_OPTS));
    }
    conf.set(MRJobConfig.MAP_JAVA_OPTS,
            "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS));
    // See comment below on CM
    conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB);

    int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb");
    log.info("Reducer memory: {}", reduceMemoryMB);
    if (isHighMemoryStep()) {
        reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor");
        log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    }
    conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);

    int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB);
    if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS,
                conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    }
    conf.set(MRJobConfig.REDUCE_JAVA_OPTS,
            "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in
    // -Xmx appended to opts above, which is at worst redundant
    conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB);

    conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128);
    conf.setInt("yarn.app.mapreduce.am.resource.mb", 384);

    // Pass total config state
    conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render());

    // Make sure to set any args to conf above this line!
    setConf(conf);

    Job job = Job.getInstance(conf);

    // Basic File IO settings
    FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

    log.info("Created pipeline configuration {}", job.getConfiguration());

    return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration());
}
From source file:com.cloudera.science.quince.LoadVariantsTool.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    JCommander jc = new JCommander(this);
    try {
        jc.parse(args);
    } catch (ParameterException e) {
        jc.usage();
        return 1;
    }

    if (paths == null || paths.size() != 2) {
        jc.usage();
        return 1;
    }

    String inputPath = paths.get(0);
    String outputPath = paths.get(1);

    Configuration conf = getConf();
    // Copy records to avoid problem with Parquet string statistics not being correct.
    // This can be removed from parquet 1.8.0
    // (see https://issues.apache.org/jira/browse/PARQUET-251).
    conf.setBoolean(DatasetKeyOutputFormat.KITE_COPY_RECORDS, true);

    Path path = new Path(inputPath);

    if (path.getName().endsWith(".vcf")) {
        int size = 500000;
        byte[] bytes = new byte[size];
        InputStream inputStream = path.getFileSystem(conf).open(path);
        inputStream.read(bytes, 0, size);
        conf.set(VariantContextToVariantFn.VARIANT_HEADER, Base64.encodeBase64String(bytes));
    }

    Pipeline pipeline = new MRPipeline(getClass(), conf);
    PCollection<Variant> records = readVariants(path, conf, pipeline);

    PCollection<FlatVariant> flatRecords = records.parallelDo(new FlattenVariantFn(),
            Avros.specifics(FlatVariant.class));

    DatasetDescriptor desc = new DatasetDescriptor.Builder().schema(FlatVariant.getClassSchema())
            .partitionStrategy(buildPartitionStrategy(segmentSize)).format(Formats.PARQUET)
            .compressionType(CompressionType.Uncompressed).build();

    View<FlatVariant> dataset;
    if (Datasets.exists(outputPath)) {
        dataset = Datasets.load(outputPath, FlatVariant.class).getDataset().with("sample_group", sampleGroup);
    } else {
        dataset = Datasets.create(outputPath, desc, FlatVariant.class).getDataset().with("sample_group",
                sampleGroup);
    }

    int numReducers = conf.getInt("mapreduce.job.reduces", 1);
    System.out.println("Num reducers: " + numReducers);

    final Schema sortKeySchema = SchemaBuilder.record("sortKey").fields().requiredString("sampleId")
            .endRecord();

    PCollection<FlatVariant> partitioned = CrunchDatasets.partitionAndSort(flatRecords, dataset,
            new FlatVariantRecordMapFn(sortKeySchema), sortKeySchema, numReducers, 1);

    try {
        Target.WriteMode writeMode = overwrite ? Target.WriteMode.OVERWRITE : Target.WriteMode.DEFAULT;
        pipeline.write(partitioned, CrunchDatasets.asTarget(dataset), writeMode);
    } catch (CrunchRuntimeException e) {
        LOG.error("Crunch runtime error", e);
        return 1;
    }

    PipelineResult result = pipeline.done();
    return result.succeeded() ? 0 : 1;
}
From source file:com.cloudera.sqoop.mapreduce.MySQLDumpImportJob.java
License:Apache License
/**
 * Configure the inputformat to use for the job.
 */
protected void configureInputFormat(Job job, String tableName, String tableClassName, String splitByCol)
        throws ClassNotFoundException, IOException {
    if (null == tableName) {
        LOG.error("mysqldump-based import cannot support free-form query imports.");
        LOG.error("Do not use --direct and --query together for MySQL.");
        throw new IOException("null tableName for MySQLDumpImportJob.");
    }

    ConnManager mgr = getContext().getConnManager();
    String username = options.getUsername();
    if (null == username || username.length() == 0) {
        DBConfiguration.configureDB(job.getConfiguration(), mgr.getDriverClass(), options.getConnectString());
    } else {
        DBConfiguration.configureDB(job.getConfiguration(), mgr.getDriverClass(), options.getConnectString(),
                username, options.getPassword());
    }

    String[] colNames = options.getColumns();
    if (null == colNames) {
        colNames = mgr.getColumnNames(tableName);
    }

    String[] sqlColNames = null;
    if (null != colNames) {
        sqlColNames = new String[colNames.length];
        for (int i = 0; i < colNames.length; i++) {
            sqlColNames[i] = mgr.escapeColName(colNames[i]);
        }
    }

    // It's ok if the where clause is null in DBInputFormat.setInput.
    String whereClause = options.getWhereClause();

    // We can't set the class properly in here, because we may not have the
    // jar loaded in this JVM. So we start by calling setInput() with
    // DBWritable and then overriding the string manually.

    // Note that mysqldump also does *not* want a quoted table name.
    DataDrivenDBInputFormat.setInput(job, DBWritable.class, tableName, whereClause,
            mgr.escapeColName(splitByCol), sqlColNames);

    Configuration conf = job.getConfiguration();
    conf.setInt(MySQLUtils.OUTPUT_FIELD_DELIM_KEY, options.getOutputFieldDelim());
    conf.setInt(MySQLUtils.OUTPUT_RECORD_DELIM_KEY, options.getOutputRecordDelim());
    conf.setInt(MySQLUtils.OUTPUT_ENCLOSED_BY_KEY, options.getOutputEnclosedBy());
    conf.setInt(MySQLUtils.OUTPUT_ESCAPED_BY_KEY, options.getOutputEscapedBy());
    conf.setBoolean(MySQLUtils.OUTPUT_ENCLOSE_REQUIRED_KEY, options.isOutputEncloseRequired());
    String[] extraArgs = options.getExtraArgs();
    if (null != extraArgs) {
        conf.setStrings(MySQLUtils.EXTRA_ARGS_KEY, extraArgs);
    }

    LOG.debug("Using InputFormat: " + inputFormatClass);
    job.setInputFormatClass(getInputFormatClass());
}
From source file:com.cloudera.sqoop.mapreduce.MySQLExportJob.java
License:Apache License
@Override
/**
 * Configure the inputformat to use for the job.
 */
protected void configureInputFormat(Job job, String tableName, String tableClassName, String splitByCol)
        throws ClassNotFoundException, IOException {
    // Configure the delimiters, etc.
    Configuration conf = job.getConfiguration();
    conf.setInt(MySQLUtils.OUTPUT_FIELD_DELIM_KEY, options.getOutputFieldDelim());
    conf.setInt(MySQLUtils.OUTPUT_RECORD_DELIM_KEY, options.getOutputRecordDelim());
    conf.setInt(MySQLUtils.OUTPUT_ENCLOSED_BY_KEY, options.getOutputEnclosedBy());
    conf.setInt(MySQLUtils.OUTPUT_ESCAPED_BY_KEY, options.getOutputEscapedBy());
    conf.setBoolean(MySQLUtils.OUTPUT_ENCLOSE_REQUIRED_KEY, options.isOutputEncloseRequired());
    String[] extraArgs = options.getExtraArgs();
    if (null != extraArgs) {
        conf.setStrings(MySQLUtils.EXTRA_ARGS_KEY, extraArgs);
    }

    ConnManager mgr = context.getConnManager();
    String username = options.getUsername();
    if (null == username || username.length() == 0) {
        DBConfiguration.configureDB(job.getConfiguration(), mgr.getDriverClass(), options.getConnectString());
    } else {
        DBConfiguration.configureDB(job.getConfiguration(), mgr.getDriverClass(), options.getConnectString(),
                username, options.getPassword());
    }

    String[] colNames = options.getColumns();
    if (null == colNames) {
        colNames = mgr.getColumnNames(tableName);
    }

    String[] sqlColNames = null;
    if (null != colNames) {
        sqlColNames = new String[colNames.length];
        for (int i = 0; i < colNames.length; i++) {
            sqlColNames[i] = mgr.escapeColName(colNames[i]);
        }
    }

    // Note that mysqldump also does *not* want a quoted table name.
    DataDrivenDBInputFormat.setInput(job, DBWritable.class, tableName, null, null, sqlColNames);

    // Configure the actual InputFormat to use.
    super.configureInputFormat(job, tableName, tableClassName, splitByCol);
}
From source file:com.cloudy.mapred.base.JobUtil.java
License:Apache License
public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat, Configuration conf)
        throws IOException {

    Job job = new Job(new Configuration(conf));
    Configuration jobConf = job.getConfiguration();

    if (mapper.equals(Mapper.class)) {
        throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
    }
    job.setJarByClass(mapper);

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    job.setMapOutputKeyClass(mapperKey);
    job.setMapOutputValueClass(mapperValue);
    job.setOutputKeyClass(mapperKey);
    job.setOutputValueClass(mapperValue);
    jobConf.setBoolean("mapred.compress.map.output", true);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}
From source file:com.datasalt.pangool.tuplemr.serialization.TupleSerialization.java
License:Apache License
/**
 * see {@link #CONF_SCHEMA_VALIDATION}
 */
public static void enableSchemaValidation(Configuration conf) {
    conf.setBoolean(CONF_SCHEMA_VALIDATION, true);
}
From source file:com.datasalt.pangool.tuplemr.serialization.TupleSerialization.java
License:Apache License
/**
 * see {@link #CONF_SCHEMA_VALIDATION}
 */
public static void disableSchemaValidation(Configuration conf) {
    conf.setBoolean(CONF_SCHEMA_VALIDATION, false);
}
From source file:com.datascience.cascading.scheme.CsvScheme.java
License:Apache License
/**
 * Configures the Hadoop configuration for the given CSV format.
 */
private void configureReaderFormat(CSVFormat format, Configuration conf) {
    conf.set(CsvOutputFormat.CHARSET, charset);

    // If the format header was explicitly provided by the user then forward it to the record reader.
    // If skipHeaderRecord is enabled then that indicates that field names were detected. We need to
    // ensure that headers are defined in order for the CSV reader to skip the header record.
    conf.setBoolean(CsvInputFormat.STRICT_MODE, strict);
    if (format.getHeader() != null) {
        conf.setStrings(CsvInputFormat.CSV_READER_COLUMNS, format.getHeader());
    } else if (format.getSkipHeaderRecord()) {
        Fields fields = getSourceFields();
        String[] columns = new String[fields.size()];
        for (int i = 0; i < fields.size(); i++) {
            columns[i] = fields.get(i).toString();
        }
        conf.setStrings(CsvInputFormat.CSV_READER_COLUMNS, columns);
    }

    conf.setBoolean(CsvInputFormat.CSV_READER_SKIP_HEADER, format.getSkipHeaderRecord());
    conf.set(CsvInputFormat.CSV_READER_DELIMITER, String.valueOf(format.getDelimiter()));

    if (format.getRecordSeparator() != null) {
        conf.set(CsvInputFormat.CSV_READER_RECORD_SEPARATOR, format.getRecordSeparator());
    }

    if (format.getQuoteCharacter() != null) {
        conf.set(CsvInputFormat.CSV_READER_QUOTE_CHARACTER, String.valueOf(format.getQuoteCharacter()));
    }

    if (format.getQuoteMode() != null) {
        conf.set(CsvInputFormat.CSV_READER_QUOTE_MODE, format.getQuoteMode().name());
    }

    if (format.getEscapeCharacter() != null) {
        conf.set(CsvInputFormat.CSV_READER_ESCAPE_CHARACTER, String.valueOf(format.getEscapeCharacter()));
    }

    conf.setBoolean(CsvInputFormat.CSV_READER_IGNORE_EMPTY_LINES, format.getIgnoreEmptyLines());
    conf.setBoolean(CsvInputFormat.CSV_READER_IGNORE_SURROUNDING_SPACES, format.getIgnoreSurroundingSpaces());

    if (format.getNullString() != null) {
        conf.set(CsvInputFormat.CSV_READER_NULL_STRING, format.getNullString());
    }
}
From source file:com.datascience.cascading.scheme.CsvScheme.java
License:Apache License
/**
 * Configures the Hadoop configuration for the given CSV format.
 */
private void configureWriterFormat(CSVFormat format, Configuration conf) {
    conf.set(CsvOutputFormat.CHARSET, charset);

    // Apache CSV doesn't really handle the skipHeaderRecord flag correctly when writing output. If the
    // skip flag is set and headers are configured, headers will always be written to the output. Since
    // we always have headers and/or fields configured, we need to use the skipHeaderRecord flag to
    // determine whether headers should be written.
    if (!format.getSkipHeaderRecord()) {
        if (format.getHeader() != null && format.getHeader().length != 0) {
            conf.setStrings(CsvOutputFormat.CSV_WRITER_COLUMNS, format.getHeader());
        } else {
            Fields fields = getSinkFields();
            String[] columns = new String[fields.size()];
            for (int i = 0; i < fields.size(); i++) {
                columns[i] = fields.get(i).toString();
            }
            conf.setStrings(CsvOutputFormat.CSV_WRITER_COLUMNS, columns);
        }
    }

    conf.setBoolean(CsvOutputFormat.CSV_WRITER_SKIP_HEADER, format.getSkipHeaderRecord());
    conf.set(CsvOutputFormat.CSV_WRITER_DELIMITER, String.valueOf(format.getDelimiter()));

    if (format.getRecordSeparator() != null) {
        conf.set(CsvOutputFormat.CSV_WRITER_RECORD_SEPARATOR, format.getRecordSeparator());
    }

    if (format.getQuoteCharacter() != null) {
        conf.set(CsvOutputFormat.CSV_WRITER_QUOTE_CHARACTER, String.valueOf(format.getQuoteCharacter()));
    }

    if (format.getQuoteMode() != null) {
        conf.set(CsvOutputFormat.CSV_WRITER_QUOTE_MODE, format.getQuoteMode().name());
    }

    if (format.getEscapeCharacter() != null) {
        conf.set(CsvOutputFormat.CSV_WRITER_ESCAPE_CHARACTER, String.valueOf(format.getEscapeCharacter()));
    }

    conf.setBoolean(CsvOutputFormat.CSV_WRITER_IGNORE_EMPTY_LINES, format.getIgnoreEmptyLines());
    conf.setBoolean(CsvOutputFormat.CSV_WRITER_IGNORE_SURROUNDING_SPACES, format.getIgnoreSurroundingSpaces());

    if (format.getNullString() != null) {
        conf.set(CsvOutputFormat.CSV_WRITER_NULL_STRING, format.getNullString());
    }
}
From source file:com.datatorrent.stram.client.StramClientUtilsTest.java
License:Apache License
@Test
public void testRMWebAddress() throws UnknownHostException {
    Configuration conf = new YarnConfiguration(new Configuration(false)) {
        @Override
        public InetSocketAddress getSocketAddr(String name, String defaultAddress, int defaultPort) {
            String rmId = get(ConfigUtils.RM_HA_ID);
            if (rmId != null) {
                name = name + "." + rmId;
            }
            return super.getSocketAddr(name, defaultAddress, defaultPort);
        }
    };

    // basic test
    conf.setBoolean(CommonConfigurationKeysPublic.HADOOP_SSL_ENABLED_KEY, false);
    conf.set(YarnConfiguration.RM_WEBAPP_ADDRESS, "192.168.1.1:8032");
    conf.set(YarnConfiguration.RM_WEBAPP_HTTPS_ADDRESS, "192.168.1.2:8032");
    Assert.assertEquals(getHostString("192.168.1.1") + ":8032",
            StramClientUtils.getSocketConnectString(StramClientUtils.getRMWebAddress(conf, null)));
    List<InetSocketAddress> addresses = StramClientUtils.getRMAddresses(conf);
    Assert.assertEquals(1, addresses.size());
    Assert.assertEquals(getHostString("192.168.1.1") + ":8032",
            StramClientUtils.getSocketConnectString(addresses.get(0)));

    conf.setBoolean(CommonConfigurationKeysPublic.HADOOP_SSL_ENABLED_KEY, true);
    Assert.assertEquals(getHostString("192.168.1.2") + ":8032",
            StramClientUtils.getSocketConnectString(StramClientUtils.getRMWebAddress(conf, null)));
    addresses = StramClientUtils.getRMAddresses(conf);
    Assert.assertEquals(1, addresses.size());
    Assert.assertEquals(getHostString("192.168.1.2") + ":8032",
            StramClientUtils.getSocketConnectString(addresses.get(0)));

    // set localhost if host is unknown
    conf.set(YarnConfiguration.RM_WEBAPP_HTTPS_ADDRESS, "someunknownhost.:8032");
    Assert.assertEquals(InetAddress.getLocalHost().getCanonicalHostName() + ":8032",
            StramClientUtils.getSocketConnectString(StramClientUtils.getRMWebAddress(conf, null)));

    // set localhost
    conf.set(YarnConfiguration.RM_WEBAPP_HTTPS_ADDRESS, "127.0.0.1:8032");
    Assert.assertEquals(InetAddress.getLocalHost().getCanonicalHostName() + ":8032",
            StramClientUtils.getSocketConnectString(StramClientUtils.getRMWebAddress(conf, null)));

    // test when HA is enabled
    conf.setBoolean(ConfigUtils.RM_HA_ENABLED, true);
    conf.set(YarnConfiguration.RM_WEBAPP_HTTPS_ADDRESS + ".rm1", "192.168.1.1:8032");
    conf.set(YarnConfiguration.RM_WEBAPP_HTTPS_ADDRESS + ".rm2", "192.168.1.2:8032");
    conf.set("yarn.resourcemanager.ha.rm-ids", "rm1,rm2");
    Assert.assertEquals(getHostString("192.168.1.1") + ":8032",
            StramClientUtils.getSocketConnectString(StramClientUtils.getRMWebAddress(conf, "rm1")));
    Assert.assertEquals(getHostString("192.168.1.2") + ":8032",
            StramClientUtils.getSocketConnectString(StramClientUtils.getRMWebAddress(conf, "rm2")));
    addresses = StramClientUtils.getRMAddresses(conf);
    Assert.assertEquals(2, addresses.size());
    Assert.assertEquals(getHostString("192.168.1.1") + ":8032",
            StramClientUtils.getSocketConnectString(addresses.get(0)));
    Assert.assertEquals(getHostString("192.168.1.2") + ":8032",
            StramClientUtils.getSocketConnectString(addresses.get(1)));
}