Example usage for org.apache.hadoop.conf Configuration setStrings

List of usage examples for org.apache.hadoop.conf Configuration setStrings

Introduction

On this page you can find example usage for org.apache.hadoop.conf Configuration setStrings.

Prototype

public void setStrings(String name, String... values) 

Document

Set the array of string values for the name property as comma delimited values.
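
For reference, here is a minimal stand-alone sketch of how setStrings round-trips values through a Configuration; the property name "example.fields" and the class name are made up for illustration:

import org.apache.hadoop.conf.Configuration;

public class SetStringsExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store several values under one property; they are written as a
        // single comma-delimited string, e.g. "alpha,beta,gamma".
        conf.setStrings("example.fields", "alpha", "beta", "gamma");

        // The raw property holds the comma-delimited form...
        System.out.println(conf.get("example.fields"));

        // ...and getStrings() splits it back into the original array.
        for (String value : conf.getStrings("example.fields")) {
            System.out.println(value);
        }
    }
}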

Usage

From source file: com.twitter.elephanttwin.lucene.indexing.HadoopSplitIndexingJob.java

License: Apache License

/**
 * Override and extend this in implementations to add custom settings to the Job and Conf to
 * create lucene-based indexes that will point you at what splits contain values you are looking for.
 * You are on your own for filtering the splits appropriately before creating an MR job, but
 * check out how this was done over MapFile-based indexes in
 * com.twitter.elephanttwin.indexing.AbstractIndexesFileInputFormat
 */
@Override
protected void setupJob(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set("mapred.child.java.opts", "-Xmx4g");
    List<String> fieldNames = Lists.newArrayList();
    for (IndexedField field : getIndexedFields()) {
        fieldNames.add(field.getFieldName());
        conf.set(HadoopSplitIndexingMapper.FIELD_VALUE_EXTRACTOR_PREFIX + field.getFieldName(),
                getExtractorClassName(field.getFieldName()));
    }
    conf.setStrings(HadoopSplitIndexingMapper.FIELDS_TO_INDEX_KEY, fieldNames.toArray(new String[] {}));
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(HadoopSplitDocument.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setInputFormatClass(getInputFormatClass());

    job.setMapperClass(HadoopSplitIndexingMapper.class);
    job.setReducerClass(HadoopSplitIndexingReducer.class);
}

From source file: com.twitter.hraven.etl.JobFileProcessor.java

License: Apache License

public int run(String[] args) throws Exception {

    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    // Number of parallel threads to use
    int threadCount = 1;
    if (commandLine.hasOption("t")) {
        try {
            threadCount = Integer.parseInt(commandLine.getOptionValue("t"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "Provided thread-count argument (-t) is not a number: " + commandLine.getOptionValue("t"),
                    nfe);
        }
        if (threadCount < 1) {
            throw new IllegalArgumentException(
                    "Cannot run fewer than 1 thread. Provided thread-count argument (-t): " + threadCount);
        }
    }
    LOG.info("threadCount=" + threadCount);

    boolean reprocess = commandLine.hasOption("r");
    LOG.info("reprocess=" + reprocess);

    // Grab the batch-size argument
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "batch size option -b is is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                    "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                            + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    // Grab the costfile argument

    String costFilePath = commandLine.getOptionValue("zf");
    LOG.info("cost properties file on hdfs=" + costFilePath);
    if (costFilePath == null)
        costFilePath = Constants.COST_PROPERTIES_HDFS_DIR;
    Path hdfsPath = new Path(costFilePath + Constants.COST_PROPERTIES_FILENAME);
    // add to distributed cache
    DistributedCache.addCacheFile(hdfsPath.toUri(), hbaseConf);

    // Grab the machine type argument
    String machineType = commandLine.getOptionValue("m");
    // set it as part of conf so that the
    // hRaven job can access it in the mapper
    hbaseConf.set(Constants.HRAVEN_MACHINE_TYPE, machineType);

    // check if re-aggregate option is forced on
    // if yes, we need to aggregate for this job in spite of the
    // job having an aggregation-done status in the raw table
    boolean reAggregateFlagValue = false;
    if (commandLine.hasOption("ra")) {
        String reaggregateFlag = commandLine.getOptionValue("ra");
        // set it as part of conf so that the
        // hRaven jobProcessor can access it in the mapper
        if (StringUtils.isNotBlank(reaggregateFlag)) {
            LOG.info(" reaggregateFlag is: " + reaggregateFlag);
            if (StringUtils.equalsIgnoreCase(reaggregateFlag, Boolean.TRUE.toString())) {
                reAggregateFlagValue = true;
            }
        }
    }
    LOG.info(AggregationConstants.RE_AGGREGATION_FLAG_NAME + "=" + reAggregateFlagValue);
    hbaseConf.setBoolean(AggregationConstants.RE_AGGREGATION_FLAG_NAME, reAggregateFlagValue);

    // set aggregation to off by default
    boolean aggFlagValue = false;
    if (commandLine.hasOption("a")) {
        String aggregateFlag = commandLine.getOptionValue("a");
        // set it as part of conf so that the
        // hRaven jobProcessor can access it in the mapper
        if (StringUtils.isNotBlank(aggregateFlag)) {
            LOG.info(" aggregateFlag is: " + aggregateFlag);
            if (StringUtils.equalsIgnoreCase(aggregateFlag, Boolean.TRUE.toString())) {
                aggFlagValue = true;
            }
        }
    }
    if (reprocess) {
        // turn off aggregation if reprocessing is true
        // we don't want to inadvertently aggregate again while re-processing
        // re-aggregation needs to be a conscious setting
        aggFlagValue = false;
    }
    LOG.info(AggregationConstants.AGGREGATION_FLAG_NAME + "=" + aggFlagValue);
    hbaseConf.setBoolean(AggregationConstants.AGGREGATION_FLAG_NAME, aggFlagValue);

    String processFileSubstring = null;
    if (commandLine.hasOption("p")) {
        processFileSubstring = commandLine.getOptionValue("p");
    }
    LOG.info("processFileSubstring=" + processFileSubstring);

    // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have
    // history files exceeding that. Disable limit.
    hbaseConf.setInt("hbase.client.keyvalue.maxsize", 0);

    // Shove this into the jobConf so that we can get it out on the task side.
    hbaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster);

    boolean success = false;
    if (reprocess) {
        success = reProcessRecords(hbaseConf, cluster, batchSize, threadCount);
    } else {
        success = processRecords(hbaseConf, cluster, batchSize, threadCount, processFileSubstring);
    }

    // Return the status
    return success ? 0 : 1;
}

From source file: com.twitter.hraven.etl.JobFileRawLoader.java

License: Apache License

public int run(String[] args) throws ParseException, IOException, ClassNotFoundException, InterruptedException {

    Configuration myHBaseConf = HBaseConfiguration.create(getConf());
    hdfs = FileSystem.get(myHBaseConf);

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(myHBaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    String input = null;
    boolean inputSpecified = commandLine.hasOption("i");
    if (inputSpecified) {
        // Grab the input path argument
        input = commandLine.getOptionValue("i");
        LOG.info("input=" + input);
    } else {
        LOG.info("Processing input from HBase ProcessRecords");
    }

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    String processFileSubstring = null;
    if (commandLine.hasOption("p")) {
        processFileSubstring = commandLine.getOptionValue("p");
    }
    LOG.info("processFileSubstring=" + processFileSubstring);

    boolean forceReprocess = commandLine.hasOption("f");
    LOG.info("forceReprocess: " + forceReprocess);

    // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have
    // history files exceeding that. Disable limit.
    myHBaseConf.setInt("hbase.client.keyvalue.maxsize", 0);

    // Shove this into the jobConf so that we can get it out on the task side.
    myHBaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster);

    boolean success = processRecordsFromHBase(myHBaseConf, cluster, processFileSubstring, forceReprocess);

    // Return the status
    return success ? 0 : 1;
}

From source file: com.vertica.hadoop.VerticaConfiguration.java

License: Apache License

/**
  * Sets the Vertica database connection information in the {@link
  * Configuration}.
  * 
  * @param conf
  *          the configuration
  * @param hostnames
  *          one or more hosts in the Vertica cluster
  * @param database
  *          the name of the Vertica database
  * @param username
  *          Vertica database username
  * @param password
  *          Vertica database password
  * @param port
  *          Vertica database port         
  */
public static void configureVertica(Configuration conf, String[] hostnames, String database, String port,
        String username, String password) {
    conf.setBoolean(MAP_SPECULATIVE_EXEC, false);
    conf.setBoolean(REDUCE_SPECULATIVE_EXEC, false);

    conf.setStrings(HOSTNAMES_PROP, hostnames);
    conf.set(DATABASE_PROP, database);
    conf.set(USERNAME_PROP, username);
    conf.set(PASSWORD_PROP, password);
    conf.set(PORT_PROP, port);
}

From source file: com.vertica.hadoop.VerticaConfiguration.java

License: Apache License

/**
  * Sets the Vertica database connection information in the {@link
  * Configuration}.
  * 
  * @param conf
  *          the configuration
  * @param hostnames
  *          one or more hosts in the source Cluster
  * @param database
  *          the name of the source Vertica database
  * @param username
  *          for the source Vertica database
  * @param password
  *          for the source Vertica database
  * @param port
  *          for the source Vertica database
  * @param level
  *          JDBC logging level        
  * @param logpath
  *          JDBC debug logging - location of logs to be written; the default is the current directory
  * @param output_hostnames
  *          one or more hosts in the output Cluster
  * @param output_database
  *          the name of the output VerticaDatabase
  * @param output_username
  *          for the target Vertica database
  * @param output_password
  *          for the target Vertica database
  * @param output_port
  *          for the target Vertica database         
  */
public static void configureVertica(Configuration conf, String[] hostnames, String database, String port,
        String username, String password, String level, String logpath, String[] output_hostnames,
        String output_database, String output_port, String output_username, String output_password) {
    configureVertica(conf, hostnames, database, port, username, password, level, logpath);
    conf.setStrings(OUTPUT_HOSTNAMES_PROP, output_hostnames);
    conf.set(OUTPUT_DATABASE_PROP, output_database);
    conf.set(OUTPUT_PORT_PROP, output_port);
    conf.set(OUTPUT_USERNAME_PROP, output_username);
    conf.set(OUTPUT_PASSWORD_PROP, output_password);
}

From source file: com.willetinc.hadoop.mapreduce.dynamodb.DynamoDBQueryInputFormat.java

License: Apache License

public static void setRangeKeyValues(Configuration conf, Types type, Collection<AttributeValue> values) {
    setInterpolateAcrossRangeKeyValues(conf, false);
    setRangeKeyType(conf, type);
    List<String> attrValues = new ArrayList<String>();
    for (AttributeValue attr : values) {
        attrValues.add(AttributeValueIOUtils.toString(type, attr));
    }

    conf.setStrings(DynamoDBConfiguration.RANGE_KEY_VALUES_PROPERTY, attrValues.toArray(new String[] {}));
}

From source file: com.willetinc.hadoop.mapreduce.dynamodb.DynamoDBQueryInputFormatTest.java

License: Apache License

@Test
public void testSetRangeKeyValues() {
    Configuration conf = createMock(Configuration.class);
    final String[] VALUES = new String[] { "TEST1", "TEST2" };
    Types type = Types.STRING;

    List<AttributeValue> attrs = new ArrayList<AttributeValue>();
    for (String value : VALUES) {
        attrs.add(new AttributeValue().withS(value));
    }

    conf.setBoolean(DynamoDBConfiguration.RANGE_KEY_INTERPOLATE_PROPERTY, false);
    conf.setInt(DynamoDBConfiguration.RANGE_KEY_TYPE_PROPERTY, type.ordinal());
    conf.setStrings(DynamoDBConfiguration.RANGE_KEY_VALUES_PROPERTY, VALUES);

    replay(conf);

    DynamoDBQueryInputFormat.setRangeKeyValues(conf, type, attrs);

    verify(conf);
}

From source file: com.willetinc.hadoop.mapreduce.dynamodb.DynamoDBQueryInputFormatTest.java

License: Apache License

@Test
public void testSetRangeKeyCondition() {
    Configuration conf = createMock(Configuration.class);
    final String[] VALUES = new String[] { "TEST1", "TEST2" };
    Types type = Types.STRING;
    ComparisonOperator operator = ComparisonOperator.BETWEEN;

    List<AttributeValue> attrs = new ArrayList<AttributeValue>();
    for (String value : VALUES) {
        attrs.add(new AttributeValue().withS(value));
    }

    conf.setBoolean(DynamoDBConfiguration.RANGE_KEY_INTERPOLATE_PROPERTY, false);
    conf.setInt(DynamoDBConfiguration.RANGE_KEY_OPERATOR_PROPERTY, ComparisonOperator.BETWEEN.ordinal());
    conf.setInt(DynamoDBConfiguration.RANGE_KEY_TYPE_PROPERTY, type.ordinal());
    conf.setStrings(DynamoDBConfiguration.RANGE_KEY_VALUES_PROPERTY, VALUES);

    replay(conf);

    DynamoDBQueryInputFormat.setRangeKeyCondition(conf, type, operator, attrs);

    verify(conf);
}

From source file: com.yahoo.glimmer.indexing.preprocessor.PrepTool.java

License: Open Source License

@Override
public int run(String[] args) throws Exception {

    SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG,
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O',
                            ONTOLOGY_ARG),
                    new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
                            JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG),
                    new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the out data."), });

    JSAPResult jsapResult = jsap.parse(args);
    if (!jsapResult.success()) {
        System.err.print(jsap.getUsage());
        System.exit(1);
    }

    Configuration config = getConf();

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts);

    // The ontology if any...
    String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG);
    if (ontologyFilename != null) {
        // Load the ontology
        InputStream ontologyInputStream = new FileInputStream(ontologyFilename);
        OWLOntology ontology = OntologyLoader.load(ontologyInputStream);
        System.out.println(
                "Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms.");

        ArrayList<String> ontologyClasses = new ArrayList<String>();
        for (OWLClass owlClass : ontology.getClassesInSignature()) {
            ontologyClasses.add(owlClass.getIRI().toString());
        }
        System.out.println("Adding " + ontologyClasses.size() + " classes from ontology.");
        config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0]));
    } else {
        System.out.println("No ontology filename set in conf.  No ontology has been loaded.");
    }

    Job job = Job.getInstance(config);
    job.setJarByClass(PrepTool.class);

    job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis());
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TuplesToResourcesMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT);
    job.setNumReduceTasks(reducerCount);
    if (reducerCount == 1) {
        // We assign 'global' ids in the reducer. For this to work, there
        // can be only one. But using just one reducer, we run out of local disk space during the
        // pre-reduce merge with big data sets like WCC.

        job.setReducerClass(ResourcesReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Object.class);
        job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class);
    } else {
        /*
         * TODO: Take the functionality of the reducer and move it to run on
         * the gateway. We then use n identity reducers, the output of which
         * will be read and merged as streams on the gateway.
         */
    }

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG)));

    Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG));
    FileOutputFormat.setOutputPath(job, outputDir);

    if (!job.waitForCompletion(true)) {
        System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG));
        return 1;
    }

    // IF THERE WAS ONLY ONE REDUCER WE NOW HAVE
    // One file per reducer containing lists of URLs (resources) for
    // subjects, predicates, objects and contexts.
    // One file per reducer that contains all resources. subjects +
    // predicates + objects + contexts.
    // One file per reducer that contains the subjects + all <predicate>
    // <object>|"Literal" <context> on that subject.

    // IF THERE WAS MORE THAN ONE REDUCER WE NOW HAVE N FILES THAT NEED TO BE MERGED ON THE GATEWAY. TODO.

    return 0;
}

From source file: com.yahoo.glimmer.indexing.RDFDocumentFactory.java

License: Open Source License

protected static void setupConf(Configuration conf, IndexType type, boolean withContexts, String resourcesHash,
        String resourceIdPrefix, String... fields) {
    conf.setEnum(CONF_INDEX_TYPE_KEY, type);
    conf.setBoolean(CONF_WITH_CONTEXTS_KEY, withContexts);
    if (resourcesHash != null) {
        conf.set(CONF_RESOURCES_HASH_KEY, resourcesHash);
    }
    conf.set(CONF_RESOURCE_ID_PREFIX_KEY, resourceIdPrefix);
    conf.setStrings(CONF_FIELDNAMES_KEY, fields);
}