List of usage examples for org.apache.hadoop.conf.Configuration.get
public String get(String name)

Gets the value of the name property, or null if no such property exists.
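Before the per-project examples, a minimal standalone sketch of this behavior. It is not taken from any of the source files below; the key names are hypothetical, and it also shows the two-argument get(name, defaultValue) overload for comparison.

import org.apache.hadoop.conf.Configuration;

public class ConfigurationGetExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("example.key", "example-value"); // hypothetical key names

        // Returns the stored value.
        String present = conf.get("example.key");

        // Returns null because the property was never set.
        String missing = conf.get("example.missing.key");

        // The two-argument overload substitutes a default instead of returning null.
        String withDefault = conf.get("example.missing.key", "fallback");

        System.out.println(present + " / " + missing + " / " + withDefault);
    }
}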
From source file:com.cloudera.oryx.als.computation.similar.DistributeSimilarWorkReduceFn.java
License:Open Source License
@Override
public void initialize() {
    super.initialize();
    Configuration conf = getConfiguration();
    String yKey = conf.get(DistributeSimilarWorkStep.Y_KEY_KEY);
    try {
        partialY = ComputationDataUtils.loadPartialY(getPartition(), getNumPartitions(), yKey, conf);
    } catch (IOException e) {
        throw new CrunchRuntimeException(e);
    }
    numSimilar = ConfigUtils.getDefaultConfig().getInt("model.item-similarity.how-many");
    Preconditions.checkArgument(numSimilar > 0, "# similar must be positive: %s", numSimilar);
}
From source file:com.cloudera.oryx.common.servcomp.OryxConfiguration.java
License:Open Source License
private static void configure(Configuration conf) {
    if (!Namespaces.isLocalComputation() || !Namespaces.isLocalData()) {
        File hadoopConfDir = findHadoopConfDir();
        addResource(hadoopConfDir, "core-site.xml", conf);
        addResource(hadoopConfDir, "hdfs-site.xml", conf);
        addResource(hadoopConfDir, "mapred-site.xml", conf);
        addResource(hadoopConfDir, "yarn-site.xml", conf);
        String fsDefaultFS = conf.get(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY);
        if (fsDefaultFS == null || fsDefaultFS.equals(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_DEFAULT)) {
            // Standard config generated by Hadoop 2.0.x seemed to set fs.default.name instead of fs.defaultFS?
            conf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, conf.get("fs.default.name"));
        }
        fixLzoCodecIssue(conf);
    }
}
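The fallback above checks get(name) for null explicitly. When a simple substitute value is enough, the two-argument get(name, defaultValue) overload can express a similar fallback more compactly. A minimal standalone sketch, not part of OryxConfiguration: it covers only the missing-key case (not the original's comparison against FS_DEFAULT_NAME_DEFAULT), and the file:/// fallback is just an illustrative value.

import org.apache.hadoop.conf.Configuration;

public class DefaultValueExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // If fs.defaultFS is unset, fall back to the legacy fs.default.name key,
        // and finally to an illustrative local-filesystem default.
        String fsDefaultFS = conf.get("fs.defaultFS",
                conf.get("fs.default.name", "file:///"));
        System.out.println(fsDefaultFS);
    }
}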
From source file:com.cloudera.oryx.common.servcomp.OryxConfiguration.java
License:Open Source License
/**
 * Removes {@code LzoCodec} and {@code LzopCodec} from key {@code io.compression.codecs}.
 * Implementations aren't shipped with Hadoop, but are in some cases instantiated anyway even when unused.
 * So, try to erase them.
 */
private static void fixLzoCodecIssue(Configuration conf) {
    String codecsProperty = conf.get("io.compression.codecs");
    if (codecsProperty != null && codecsProperty.contains(".lzo.Lzo")) {
        List<String> codecs = Lists.newArrayList(Splitter.on(',').split(codecsProperty));
        for (Iterator<String> it = codecs.iterator(); it.hasNext();) {
            if (it.next().contains(".lzo.Lzo")) {
                it.remove();
            }
        }
        conf.set("io.compression.codecs", Joiner.on(',').join(codecs));
    }
}
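To make the before/after effect concrete, a hypothetical standalone illustration: the codec class names are assumptions rather than values from any particular cluster config, and the filtering loop is repeated inline here only because fixLzoCodecIssue is private to OryxConfiguration.

import java.util.Iterator;
import java.util.List;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;

public class LzoCodecRemovalExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Hypothetical starting value containing an LZO codec entry.
        conf.set("io.compression.codecs",
                "org.apache.hadoop.io.compress.GzipCodec,com.hadoop.compression.lzo.LzoCodec");

        // Same filtering idea as in fixLzoCodecIssue above.
        String codecsProperty = conf.get("io.compression.codecs");
        if (codecsProperty != null && codecsProperty.contains(".lzo.Lzo")) {
            List<String> codecs = Lists.newArrayList(Splitter.on(',').split(codecsProperty));
            for (Iterator<String> it = codecs.iterator(); it.hasNext();) {
                if (it.next().contains(".lzo.Lzo")) {
                    it.remove();
                }
            }
            conf.set("io.compression.codecs", Joiner.on(',').join(codecs));
        }

        // Prints only the non-LZO entry: org.apache.hadoop.io.compress.GzipCodec
        System.out.println(conf.get("io.compression.codecs"));
    }
}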
From source file:com.cloudera.oryx.computation.common.JobStep.java
License:Open Source License
/**
 * Creates a new {@link MRPipeline} instance that contains common configuration settings.
 *
 * @return a new {@link MRPipeline} instance, suitably configured
 */
protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException {
    Configuration conf = OryxConfiguration.get(getConf());

    conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
    conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    conf.setBoolean("mapred.output.compress", true);
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    // Set old-style equivalents for Avro/Crunch's benefit
    conf.set("avro.output.codec", "snappy");

    conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
    conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true);
    conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true);
    conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1);

    //conf.setBoolean("crunch.disable.deep.copy", true);
    // Giving one mapper a lot of data can cause issues in some stages, so default to disable this
    conf.setBoolean("crunch.disable.combine.file", true);

    Config appConfig = ConfigUtils.getDefaultConfig();

    conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir"));

    int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb");
    log.info("Mapper memory: {}", mapMemoryMB);
    int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB);
    if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS,
                conf.get(MRJobConfig.MAP_JAVA_OPTS));
    }
    conf.set(MRJobConfig.MAP_JAVA_OPTS,
            "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS));
    // See comment below on CM
    conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB);

    int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb");
    log.info("Reducer memory: {}", reduceMemoryMB);
    if (isHighMemoryStep()) {
        reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor");
        log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    }
    conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB);
    if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS,
                conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    }
    conf.set(MRJobConfig.REDUCE_JAVA_OPTS,
            "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in
    // -Xmx appended to opts above, which is at worst redundant
    conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB);

    conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128);
    conf.setInt("yarn.app.mapreduce.am.resource.mb", 384);

    // Pass total config state
    conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render());

    // Make sure to set any args to conf above this line!
    setConf(conf);

    Job job = Job.getInstance(conf);

    // Basic File IO settings
    FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

    log.info("Created pipeline configuration {}", job.getConfiguration());

    return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration());
}
From source file:com.cloudera.recordservice.hcatalog.mapreduce.HCatRSBaseInputFormat.java
License:Apache License
private static HCatSchema getOutputSchema(Configuration conf) throws IOException {
    String os = conf.get(HCatConstants.HCAT_KEY_OUTPUT_SCHEMA);
    if (os == null) {
        return getTableSchema(conf);
    } else {
        return (HCatSchema) HCatUtil.deserialize(os);
    }
}
From source file:com.cloudera.recordservice.hcatalog.mapreduce.HCatRSBaseInputFormat.java
License:Apache License
/**
 * Gets the InputJobInfo object by reading the Configuration and deserializing
 * the string. If InputJobInfo is not present in the configuration, throws an
 * exception since that means HCatInputFormat.setInput has not been called.
 * @param conf the Configuration object
 * @return the InputJobInfo object
 * @throws IOException the exception
 */
private static InputJobInfo getJobInfo(Configuration conf) throws IOException {
    String jobString = conf.get(HCatConstants.HCAT_KEY_JOB_INFO);
    if (jobString == null) {
        throw new IOException(
                "job information not found in JobContext." + " HCatInputFormat.setInput() not called?");
    }
    return (InputJobInfo) HCatRSUtil.deserialize(jobString);
}
From source file:com.cloudera.recordservice.hcatalog.mapreduce.HCatRSInputFormat.java
License:Apache License
/**
 * Initializes the input with a provided filter.
 * See {@link #setInput(Configuration, String, String, String)}
 */
public static HCatRSInputFormat setInput(Job job, String location, String filter) throws IOException {
    Configuration conf = job.getConfiguration();
    String kerberosPrincipal = conf.get(ConfVars.KERBEROS_PRINCIPAL_CONF.name);
    Pair<String, String> dbTablePair = HCatUtil.getDbAndTableName(location);
    dbTablePair = HCatRSUtil.cleanQueryPair(dbTablePair);
    String dbName = dbTablePair.first;
    String tableName = dbTablePair.second;
    if (location.toLowerCase().startsWith("select")) {
        RecordServiceConfig.setInputQuery(conf, location);
    } else {
        RecordServiceConfig.setInputTable(conf, dbName, tableName);
    }
    Credentials credentials = job.getCredentials();
    RecordServicePlannerClient.Builder builder = PlanUtil.getBuilder(conf);
    List<NetworkAddress> plannerHosts = PlanUtil.getPlannerHostPorts(conf);
    RecordServicePlannerClient planner = PlanUtil.getPlanner(conf, builder, plannerHosts, kerberosPrincipal,
            credentials);
    try {
        if (planner.isKerberosAuthenticated()) {
            Token<DelegationTokenIdentifier> delegationToken = TokenUtils
                    .fromTDelegationToken(planner.getDelegationToken(""));
            credentials.addToken(DelegationTokenIdentifier.DELEGATION_KIND, delegationToken);
        }
    } catch (RecordServiceException e) {
        throw new IOException(e);
    } finally {
        if (planner != null) planner.close();
    }
    job.setInputFormatClass(HCatRSInputFormat.class);
    return setInput(conf, dbName, tableName, filter);
}
From source file:com.cloudera.recordservice.mr.PlanUtil.java
License:Apache License
/**
 * Generates a request from the configs set in jobConf.
 */
public static Request getRequest(Configuration jobConf) throws IOException {
    LOG.debug("Generating input splits.");
    String tblName = jobConf.get(ConfVars.TBL_NAME_CONF.name);
    String inputDir = jobConf.get(FileInputFormat.INPUT_DIR);
    String sqlQuery = jobConf.get(ConfVars.QUERY_NAME_CONF.name);
    int numSet = 0;
    if (tblName != null) ++numSet;
    if (inputDir != null) ++numSet;
    if (sqlQuery != null) ++numSet;
    if (numSet == 0) {
        throw new IllegalArgumentException("No input specified. Specify either '" + ConfVars.TBL_NAME_CONF.name
                + "', '" + ConfVars.QUERY_NAME_CONF.name + "' or '" + FileInputFormat.INPUT_DIR + "'");
    }
    if (numSet > 1) {
        throw new IllegalArgumentException("More than one input specified. Can " + "only specify one of '"
                + ConfVars.TBL_NAME_CONF.name + "'=" + tblName + ", '" + FileInputFormat.INPUT_DIR + "'=" + inputDir
                + ", '" + ConfVars.QUERY_NAME_CONF.name + "'=" + sqlQuery);
    }

    String[] colNames = jobConf.getStrings(ConfVars.COL_NAMES_CONF.name);
    if (colNames == null) colNames = new String[0];

    if (tblName == null && colNames.length > 0) {
        // TODO: support this.
        throw new IllegalArgumentException("Column projections can only be specified with table inputs.");
    }

    Request request = null;
    if (tblName != null) {
        if (colNames.length == 0) {
            // If length of colNames = 0, return all possible columns
            // TODO: this has slightly different meaning than createProjectionRequest()
            // which treats empty columns as an empty projection. i.e. select * vs count(*)
            // Reconcile this.
            request = Request.createTableScanRequest(tblName);
        } else {
            List<String> projection = new ArrayList<String>();
            for (String c : colNames) {
                if (c == null || c.isEmpty()) {
                    throw new IllegalArgumentException(
                            "Cannot specify projection with null or empty column name.");
                }
                projection.add(c);
            }
            request = Request.createProjectionRequest(tblName, projection);
        }
    } else if (inputDir != null) {
        // TODO: inputDir is a comma separate list of paths. The service needs to
        // handle that.
        if (inputDir.contains(",")) {
            throw new IllegalArgumentException("Only reading a single directory is currently supported.");
        }
        request = Request.createPathRequest(inputDir);
    } else if (sqlQuery != null) {
        request = Request.createSqlRequest(sqlQuery);
    } else {
        Preconditions.checkState(false);
    }
    return request;
}
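The projection handling above relies on Configuration.getStrings(name), which splits a comma-delimited property into a String array and returns null when the property is unset. A minimal standalone sketch of that behavior; the key names are hypothetical, not the RecordService ConfVars used above.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;

public class GetStringsExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setStrings("example.columns", "id", "name", "price"); // hypothetical key

        // Stored as a single comma-delimited string ...
        System.out.println(conf.get("example.columns"));
        // ... and read back as an array.
        System.out.println(Arrays.toString(conf.getStrings("example.columns")));

        // Unset properties come back as null, matching the colNames null check above.
        System.out.println(conf.getStrings("example.missing.columns"));
    }
}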
From source file:com.cloudera.recordservice.mr.PlanUtil.java
License:Apache License
/**
 * Given a configuration, returns a list of network addresses for all the planners.
 * This first tries to use the planner auto discovery feature and use ZooKeeper
 * to find all the available planners. If that is not feasible, it tries to use
 * the hardcoded planner host/port lists in the configuration. In case that is
 * not feasible as well, it uses the default planner host/port.
 * @param conf the hadoop job configuration
 * @return a list of network addresses for all the available planners
 */
public static List<NetworkAddress> getPlannerHostPorts(Configuration conf) throws IOException {
    List<NetworkAddress> plannerHostPorts = null;
    if (isPlannerDiscoveryEnabled(conf)) {
        try {
            LOG.info("Using planner auto discovery on ZK connection string {}",
                    conf.get(ConfVars.ZOOKEEPER_CONNECTION_STRING_CONF.name));
            plannerHostPorts = ZooKeeperUtil.getPlanners(conf);
        } catch (IOException e) {
            LOG.warn("Planner discovery failed. Now fall back to use " + ConfVars.PLANNER_HOSTPORTS_CONF.name
                    + " in the job configuration.", e);
        }
    }
    if (plannerHostPorts == null || plannerHostPorts.isEmpty()) {
        plannerHostPorts = RecordServiceConfig.getPlannerHostPort(
                conf.get(ConfVars.PLANNER_HOSTPORTS_CONF.name, RecordServiceConfig.DEFAULT_PLANNER_HOSTPORTS));
    }
    return plannerHostPorts;
}
From source file:com.cloudera.recordservice.mr.PlanUtil.java
License:Apache License
/**
 * Returns the kerberos principal to connect with.
 */
public static String getKerberosPrincipal(Configuration conf) {
    return conf.get(ConfVars.KERBEROS_PRINCIPAL_CONF.name);
}