public String get(String name) 

Gets the value of the name property, or null if no such property exists.


From source file:com.cloudera.oryx.als.computation.similar.DistributeSimilarWorkReduceFn.java

License:Open Source License

public void initialize() {
    Configuration conf = getConfiguration();
    String yKey = conf.get(DistributeSimilarWorkStep.Y_KEY_KEY);
    try {/* w w w . ja v a 2 s.  c  o m*/
        partialY = ComputationDataUtils.loadPartialY(getPartition(), getNumPartitions(), yKey, conf);
    } catch (IOException e) {
        throw new CrunchRuntimeException(e);
    numSimilar = ConfigUtils.getDefaultConfig().getInt("model.item-similarity.how-many");
    Preconditions.checkArgument(numSimilar > 0, "# similar must be positive: %s", numSimilar);

From source file:com.cloudera.oryx.common.servcomp.OryxConfiguration.java

License:Open Source License

/**
 * Adds the Hadoop site files to {@code conf} when computation or data is not local, and
 * fills in the default file system key when it is unset or still at its default value.
 *
 * <p>NOTE(review): this excerpt ends inside the outer {@code if}; the closing braces and any
 * statements that follow are not visible here.
 *
 * @param conf configuration that receives the resources and settings
 */
private static void configure(Configuration conf) {
    // Skipped only when both computation and data are local.
    if (!Namespaces.isLocalComputation() || !Namespaces.isLocalData()) {
        File hadoopConfDir = findHadoopConfDir();
        addResource(hadoopConfDir, "core-site.xml", conf);
        addResource(hadoopConfDir, "hdfs-site.xml", conf);
        addResource(hadoopConfDir, "mapred-site.xml", conf);
        addResource(hadoopConfDir, "yarn-site.xml", conf);

        String fsDefaultFS = conf.get(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY);
        if (fsDefaultFS == null || fsDefaultFS.equals(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_DEFAULT)) {
            // Standard config generated by Hadoop 2.0.x seemed to set fs.default.name instead of fs.defaultFS?
            // NOTE(review): if "fs.default.name" is also unset, a null value is passed to
            // conf.set() here -- confirm that Configuration accepts that.
            conf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, conf.get("fs.default.name"));
        }


From source file:com.cloudera.oryx.common.servcomp.OryxConfiguration.java

License:Open Source License

 * Removes {@code LzoCodec} and {@code LzopCodec} from key {@code io.compression.codecs}.
 * Implementations aren't shipped with Hadoop, but are in some cases instantiated anyway even when unused.
 * So, try to erase them./*from www. j  av  a  2 s. co m*/
private static void fixLzoCodecIssue(Configuration conf) {
    String codecsProperty = conf.get("io.compression.codecs");
    if (codecsProperty != null && codecsProperty.contains(".lzo.Lzo")) {
        List<String> codecs = Lists.newArrayList(Splitter.on(',').split(codecsProperty));
        for (Iterator<String> it = codecs.iterator(); it.hasNext();) {
            if (it.next().contains(".lzo.Lzo")) {
        conf.set("io.compression.codecs", Joiner.on(',').join(codecs));

From source file:com.cloudera.oryx.computation.common.JobStep.java

License:Open Source License

/**
 * Creates a new {@link MRPipeline} instance that contains common configuration
 * settings.
 *
 * @return a new {@link MRPipeline} instance, suitably configured
 */
protected final MRPipeline createBasicPipeline(Class<?> jarClass) throws IOException {
    // NOTE(review): several lines of this method are missing from this excerpt (unclosed 'if'
    // blocks below, no closing brace). The comments describe only what is visible.
    Configuration conf = OryxConfiguration.get(getConf());

    // Compress intermediate map output with Snappy.
    conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
    conf.setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);

    // Compress job output too: BLOCK compression type, Snappy codec.
    conf.setBoolean("mapred.output.compress", true);
    conf.set("mapred.output.compression.type", "BLOCK");
    conf.setClass("mapred.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    // Set old-style equivalents for Avro/Crunch's benefit
    conf.set("avro.output.codec", "snappy");

    // Speculative execution is switched on for both map and reduce tasks.
    conf.setBoolean(MRJobConfig.MAP_SPECULATIVE, true);
    conf.setBoolean(MRJobConfig.REDUCE_SPECULATIVE, true);
    conf.setBoolean(TTConfig.TT_OUTOFBAND_HEARBEAT, true);
    // NOTE(review): presumably -1 means "no limit on tasks per JVM" -- confirm against Hadoop docs.
    conf.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1);

    //conf.setBoolean("crunch.disable.deep.copy", true);
    // Giving one mapper a lot of data can cause issues in some stages, so default to disable this
    conf.setBoolean("crunch.disable.combine.file", true);

    Config appConfig = ConfigUtils.getDefaultConfig();

    conf.set("crunch.tmp.dir", appConfig.getString("computation-layer.tmp-dir"));

    // Mapper sizing: heap is the configured mapper memory divided by 1.3, truncated to an int.
    int mapMemoryMB = appConfig.getInt("computation-layer.mapper-memory-mb");
    log.info("Mapper memory: {}", mapMemoryMB);
    int mapHeapMB = (int) (mapMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Mappers have {}MB heap and can access {}MB RAM", mapHeapMB, mapMemoryMB);
    // NOTE(review): lines appear to be missing here. As shown, the "Overriding" message is given
    // the new -Xmx string as its "previous" value, the 'if' is never closed, and nothing visible
    // assigns MAP_JAVA_OPTS before it is logged as set.
    if (conf.get(MRJobConfig.MAP_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.MAP_JAVA_OPTS,
            "-Xmx" + mapHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.MAP_JAVA_OPTS, conf.get(MRJobConfig.MAP_JAVA_OPTS));
    // See comment below on CM
    conf.setInt("mapreduce.map.java.opts.max.heap", mapHeapMB);

    // Reducer sizing: same scheme, with the memory multiplied by a configured factor for
    // high-memory steps.
    int reduceMemoryMB = appConfig.getInt("computation-layer.reducer-memory-mb");
    log.info("Reducer memory: {}", reduceMemoryMB);
    // NOTE(review): the closing brace of this 'if' is missing from the excerpt.
    if (isHighMemoryStep()) {
        reduceMemoryMB *= appConfig.getInt("computation-layer.worker-high-memory-factor");
        log.info("Increasing {} to {} for high-memory step", MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);
    conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, reduceMemoryMB);

    int reduceHeapMB = (int) (reduceMemoryMB / 1.3); // Matches Hadoop's default
    log.info("Reducers have {}MB heap and can access {}MB RAM", reduceHeapMB, reduceMemoryMB);
    // NOTE(review): same truncation as the mapper section above -- the statement that assigns
    // REDUCE_JAVA_OPTS and the closing brace of the 'if' are not visible.
    if (conf.get(MRJobConfig.REDUCE_JAVA_OPTS) != null) {
        log.info("Overriding previous setting of {}, which was '{}'", MRJobConfig.REDUCE_JAVA_OPTS,
            "-Xmx" + reduceHeapMB + "m -XX:+UseCompressedOops -XX:+UseParallelGC -XX:+UseParallelOldGC");
    log.info("Set {} to '{}'", MRJobConfig.REDUCE_JAVA_OPTS, conf.get(MRJobConfig.REDUCE_JAVA_OPTS));
    // I see this in CM but not in Hadoop docs; probably won't hurt as it's supposed to result in
    // -Xmx appended to opts above, which is at worst redundant
    conf.setInt("mapreduce.reduce.java.opts.max.heap", reduceHeapMB);

    conf.setInt("yarn.scheduler.capacity.minimum-allocation-mb", 128);
    conf.setInt("yarn.app.mapreduce.am.resource.mb", 384);

    // Pass total config state
    conf.set(CONFIG_SERIALIZATION_KEY, ConfigUtils.getDefaultConfig().root().render());

    // Make sure to set any args to conf above this line!


    // From here on, settings are applied through the Job and read back via job.getConfiguration().
    Job job = Job.getInstance(conf);

    // Basic File IO settings
    FileInputFormat.setMaxInputSplitSize(job, 1L << 28); // ~268MB
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

    log.info("Created pipeline configuration {}", job.getConfiguration());

    // The pipeline is built from the Job's configuration, not from 'conf' directly.
    return new MRPipeline(jarClass, getCustomJobName(), job.getConfiguration());

From source file:com.cloudera.recordservice.hcatalog.mapreduce.HCatRSBaseInputFormat.java

License:Apache License

private static HCatSchema getOutputSchema(Configuration conf) throws IOException {
    String os = conf.get(HCatConstants.HCAT_KEY_OUTPUT_SCHEMA);
    if (os == null) {
        return getTableSchema(conf);
    } else {/*from w  w w  . j a  v a  2s .co m*/
        return (HCatSchema) HCatUtil.deserialize(os);

From source file:com.cloudera.recordservice.hcatalog.mapreduce.HCatRSBaseInputFormat.java

License:Apache License

 * Gets the InputJobInfo object by reading the Configuration and deserializing
 * the string. If InputJobInfo is not present in the configuration, throws an
 * exception since that means HCatInputFormat.setInput has not been called.
 * @param conf the Configuration object// www  . j  a  v a2  s . c o  m
 * @return the InputJobInfo object
 * @throws IOException the exception
private static InputJobInfo getJobInfo(Configuration conf) throws IOException {
    String jobString = conf.get(HCatConstants.HCAT_KEY_JOB_INFO);
    if (jobString == null) {
        throw new IOException(
                "job information not found in JobContext." + " HCatInputFormat.setInput() not called?");

    return (InputJobInfo) HCatRSUtil.deserialize(jobString);

From source file:com.cloudera.recordservice.hcatalog.mapreduce.HCatRSInputFormat.java

License:Apache License

/**
 * Initializes the input with a provided filter.
 * See {@link #setInput(Configuration, String, String, String)}
 */
public static HCatRSInputFormat setInput(Job job, String location, String filter) throws IOException {
    // NOTE(review): this excerpt is missing lines (unterminated calls, unclosed blocks, no
    // closing brace). The comments describe only what is visible.
    Configuration conf = job.getConfiguration();
    String kerberosPrincipal = conf.get(ConfVars.KERBEROS_PRINCIPAL_CONF.name);
    // Derive the database/table pair from the location, then pass it through cleanQueryPair.
    Pair<String, String> dbTablePair = HCatUtil.getDbAndTableName(location);
    dbTablePair = HCatRSUtil.cleanQueryPair(dbTablePair);
    String dbName = dbTablePair.first;
    String tableName = dbTablePair.second;
    // A location that starts with "select" (case-insensitive) is recorded as a query;
    // anything else is recorded as a table input.
    if (location.toLowerCase().startsWith("select")) {
        RecordServiceConfig.setInputQuery(conf, location);
    } else {
        RecordServiceConfig.setInputTable(conf, dbName, tableName);
    Credentials credentials = job.getCredentials();
    RecordServicePlannerClient.Builder builder = PlanUtil.getBuilder(conf);
    List<NetworkAddress> plannerHosts = PlanUtil.getPlannerHostPorts(conf);
    // NOTE(review): the remaining arguments of this getPlanner call are not visible.
    RecordServicePlannerClient planner = PlanUtil.getPlanner(conf, builder, plannerHosts, kerberosPrincipal,
    try {
        // When the planner connection is Kerberos-authenticated, a delegation token is added
        // to the job's credentials.
        if (planner.isKerberosAuthenticated()) {
            // NOTE(review): the expression that produces the token is cut off after 'TokenUtils'.
            Token<DelegationTokenIdentifier> delegationToken = TokenUtils
            credentials.addToken(DelegationTokenIdentifier.DELEGATION_KIND, delegationToken);
    } catch (RecordServiceException e) {
        // Service failures are reported to callers as IOException, keeping the cause.
        throw new IOException(e);
    } finally {
        // NOTE(review): the statement guarded by this null check is missing -- presumably it
        // releases the planner; confirm against the original source.
        if (planner != null)
    return setInput(conf, dbName, tableName, filter);

From source file:com.cloudera.recordservice.mr.PlanUtil.java

License:Apache License

/**
 * Generates a request from the configs set in jobConf.
 */
public static Request getRequest(Configuration jobConf) throws IOException {
    // NOTE(review): this excerpt is missing lines (empty 'if' bodies, unclosed blocks, no
    // closing brace). The comments describe only what is visible.
    LOG.debug("Generating input splits.");

    // The three possible input sources: a table name, an input directory, or a SQL query.
    String tblName = jobConf.get(ConfVars.TBL_NAME_CONF.name);
    String inputDir = jobConf.get(FileInputFormat.INPUT_DIR);
    String sqlQuery = jobConf.get(ConfVars.QUERY_NAME_CONF.name);

    // NOTE(review): the statements guarded by the next three 'if's are missing; presumably each
    // one increments numSet so that it counts the configured sources -- confirm.
    int numSet = 0;
    if (tblName != null)
    if (inputDir != null)
    if (sqlQuery != null)

    // Exactly one input source must be configured.
    if (numSet == 0) {
        throw new IllegalArgumentException("No input specified. Specify either '" + ConfVars.TBL_NAME_CONF.name
                + "', '" + ConfVars.QUERY_NAME_CONF.name + "' or '" + FileInputFormat.INPUT_DIR + "'");
    if (numSet > 1) {
        throw new IllegalArgumentException("More than one input specified. Can " + "only specify one of '"
                + ConfVars.TBL_NAME_CONF.name + "'=" + tblName + ", '" + FileInputFormat.INPUT_DIR + "'="
                + inputDir + ", '" + ConfVars.QUERY_NAME_CONF.name + "'=" + sqlQuery);

    // An unset column list is treated the same as an empty one.
    String[] colNames = jobConf.getStrings(ConfVars.COL_NAMES_CONF.name);
    if (colNames == null)
        colNames = new String[0];

    if (tblName == null && colNames.length > 0) {
        // TODO: support this.
        throw new IllegalArgumentException("Column projections can only be specified with table inputs.");

    // Build the request that matches whichever source is set.
    Request request = null;
    if (tblName != null) {
        if (colNames.length == 0) {
            // If length of colNames = 0, return all possible columns
            // TODO: this has slightly different meaning than createProjectionRequest()
            // which treats empty columns as an empty projection. i.e. select * vs count(*)
            // Reconcile this.
            request = Request.createTableScanRequest(tblName);
        } else {
            // NOTE(review): nothing visible adds the validated names to 'projection'; that
            // statement appears to be among the missing lines.
            List<String> projection = new ArrayList<String>();
            for (String c : colNames) {
                if (c == null || c.isEmpty()) {
                    throw new IllegalArgumentException(
                            "Cannot specify projection with null or empty column name.");
            request = Request.createProjectionRequest(tblName, projection);
    } else if (inputDir != null) {
        // TODO: inputDir is a comma separate list of paths. The service needs to
        // handle that.
        if (inputDir.contains(",")) {
            throw new IllegalArgumentException("Only reading a single directory is currently supported.");
        request = Request.createPathRequest(inputDir);
    } else if (sqlQuery != null) {
        request = Request.createSqlRequest(sqlQuery);
    } else {
    // NOTE(review): the body of the final 'else' above is not visible in this excerpt.
    return request;

From source file:com.cloudera.recordservice.mr.PlanUtil.java

License:Apache License

/**
 * Given a configuration, returns a list of network addresses for all the planners.
 * This first tries to use the planner auto discovery feature and use ZooKeeper
 * to find all the available planners. If that is not feasible, it tries to use
 * the hardcoded planner host/port lists in the configuration. In case that is
 * not feasible as well, it uses the default planner host/port.
 *
 * @param conf the hadoop job configuration
 * @return a list of network addresses for all the available planners
 */
public static List<NetworkAddress> getPlannerHostPorts(Configuration conf) throws IOException {
    // NOTE(review): this excerpt is missing lines (an unterminated LOG.info call, unclosed
    // blocks, no closing brace). The comments describe only what is visible.
    List<NetworkAddress> plannerHostPorts = null;
    if (isPlannerDiscoveryEnabled(conf)) {
        try {
            // NOTE(review): the argument for the {} placeholder is not visible.
            LOG.info("Using planner auto discovery on ZK connection string {}",
            plannerHostPorts = ZooKeeperUtil.getPlanners(conf);
        } catch (IOException e) {
            // A discovery failure is logged rather than rethrown, so the check below still runs.
            LOG.warn("Planner discovery failed. Now fall back to use " + ConfVars.PLANNER_HOSTPORTS_CONF.name
                    + " in the job configuration.", e);
    // With no discovered planners, use the configured host/port list, or the default when unset.
    if (plannerHostPorts == null || plannerHostPorts.isEmpty()) {
        plannerHostPorts = RecordServiceConfig.getPlannerHostPort(
                conf.get(ConfVars.PLANNER_HOSTPORTS_CONF.name, RecordServiceConfig.DEFAULT_PLANNER_HOSTPORTS));
    return plannerHostPorts;

From source file:com.cloudera.recordservice.mr.PlanUtil.java

License:Apache License

/**
 * Returns the Kerberos principal to connect with.
 */
public static String getKerberosPrincipal(Configuration conf) {
    return conf.get(ConfVars.KERBEROS_PRINCIPAL_CONF.name);