List of usage examples for org.apache.hadoop.conf.Configuration.setStrings

public void setStrings(String name, String... values)

Sets the array of string values for the name property as comma-delimited values.
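Before the project-specific examples, here is a minimal self-contained sketch (not taken from any of the source files below; the property name is made up) of the round trip: setStrings joins the varargs into a single comma-delimited string, and getStrings splits it back into an array on the reading side.

import org.apache.hadoop.conf.Configuration;

public class SetStringsRoundTrip {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setStrings("example.fields", "id", "name", "timestamp");

        // The values are stored under one key as "id,name,timestamp".
        System.out.println(conf.get("example.fields"));

        // Typically read back on the consuming side (e.g. in a mapper) as an array.
        for (String field : conf.getStrings("example.fields")) {
            System.out.println(field);
        }
        // Caveat: setStrings does not escape commas inside individual values,
        // so a value that itself contains a comma comes back as two entries.
    }
}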
From source file:com.twitter.elephanttwin.lucene.indexing.HadoopSplitIndexingJob.java
License:Apache License
/**
 * Override and extend this in implementations to add custom settings to the Job and Conf to
 * create lucene-based indexes that will point you at what splits contain values you are looking for.
 * You are on your own for filtering the splits appropriately before creating an MR job.. but
 * check out how this was done over MapFile-based indexes in
 * com.twitter.elephanttwin.indexing.AbstractIndexesFileInputFormat
 */
@Override
protected void setupJob(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set("mapred.child.java.opts", "-Xmx4g");
    List<String> fieldNames = Lists.newArrayList();
    for (IndexedField field : getIndexedFields()) {
        fieldNames.add(field.getFieldName());
        conf.set(HadoopSplitIndexingMapper.FIELD_VALUE_EXTRACTOR_PREFIX + field.getFieldName(),
                getExtractorClassName(field.getFieldName()));
    }
    conf.setStrings(HadoopSplitIndexingMapper.FIELDS_TO_INDEX_KEY, fieldNames.toArray(new String[] {}));
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(HadoopSplitDocument.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setInputFormatClass(getInputFormatClass());
    job.setMapperClass(HadoopSplitIndexingMapper.class);
    job.setReducerClass(HadoopSplitIndexingReducer.class);
}
From source file:com.twitter.hraven.etl.JobFileProcessor.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    // Number of parallel threads to use
    int threadCount = 1;
    if (commandLine.hasOption("t")) {
        try {
            threadCount = Integer.parseInt(commandLine.getOptionValue("t"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "Provided thread-count argument (-t) is not a number: " + commandLine.getOptionValue("t"),
                    nfe);
        }
        if (threadCount < 1) {
            throw new IllegalArgumentException(
                    "Cannot run fewer than 1 thread. Provided thread-count argument (-t): " + threadCount);
        }
    }
    LOG.info("threadCount=" + threadCount);

    boolean reprocess = commandLine.hasOption("r");
    LOG.info("reprocess=" + reprocess);

    // Grab the batch-size argument
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "batch size option -b is is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                    "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                            + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    // Grab the costfile argument
    String costFilePath = commandLine.getOptionValue("zf");
    LOG.info("cost properties file on hdfs=" + costFilePath);
    if (costFilePath == null)
        costFilePath = Constants.COST_PROPERTIES_HDFS_DIR;
    Path hdfsPath = new Path(costFilePath + Constants.COST_PROPERTIES_FILENAME);
    // add to distributed cache
    DistributedCache.addCacheFile(hdfsPath.toUri(), hbaseConf);

    // Grab the machine type argument
    String machineType = commandLine.getOptionValue("m");
    // set it as part of conf so that the
    // hRaven job can access it in the mapper
    hbaseConf.set(Constants.HRAVEN_MACHINE_TYPE, machineType);

    // check if re-aggregate option is forced on
    // if yes, we need to aggregate for this job inspite of
    // job having aggregation done status in raw table
    boolean reAggregateFlagValue = false;
    if (commandLine.hasOption("ra")) {
        String reaggregateFlag = commandLine.getOptionValue("ra");
        // set it as part of conf so that the
        // hRaven jobProcessor can access it in the mapper
        if (StringUtils.isNotBlank(reaggregateFlag)) {
            LOG.info(" reaggregateFlag is: " + reaggregateFlag);
            if (StringUtils.equalsIgnoreCase(reaggregateFlag, Boolean.TRUE.toString())) {
                reAggregateFlagValue = true;
            }
        }
    }
    LOG.info(AggregationConstants.RE_AGGREGATION_FLAG_NAME + "=" + reAggregateFlagValue);
    hbaseConf.setBoolean(AggregationConstants.RE_AGGREGATION_FLAG_NAME, reAggregateFlagValue);

    // set aggregation to off by default
    boolean aggFlagValue = false;
    if (commandLine.hasOption("a")) {
        String aggregateFlag = commandLine.getOptionValue("a");
        // set it as part of conf so that the
        // hRaven jobProcessor can access it in the mapper
        if (StringUtils.isNotBlank(aggregateFlag)) {
            LOG.info(" aggregateFlag is: " + aggregateFlag);
            if (StringUtils.equalsIgnoreCase(aggregateFlag, Boolean.TRUE.toString())) {
                aggFlagValue = true;
            }
        }
    }
    if (reprocess) {
        // turn off aggregation if reprocessing is true
        // we don't want to inadvertently aggregate again while re-processing
        // re-aggregation needs to be a conscious setting
        aggFlagValue = false;
    }
    LOG.info(AggregationConstants.AGGREGATION_FLAG_NAME + "=" + aggFlagValue);
    hbaseConf.setBoolean(AggregationConstants.AGGREGATION_FLAG_NAME, aggFlagValue);

    String processFileSubstring = null;
    if (commandLine.hasOption("p")) {
        processFileSubstring = commandLine.getOptionValue("p");
    }
    LOG.info("processFileSubstring=" + processFileSubstring);

    // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have
    // history files exceeding that. Disable limit.
    hbaseConf.setInt("hbase.client.keyvalue.maxsize", 0);

    // Shove this into the jobConf so that we can get it out on the task side.
    hbaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster);

    boolean success = false;
    if (reprocess) {
        success = reProcessRecords(hbaseConf, cluster, batchSize, threadCount);
    } else {
        success = processRecords(hbaseConf, cluster, batchSize, threadCount, processFileSubstring);
    }

    // Return the status
    return success ? 0 : 1;
}
From source file:com.twitter.hraven.etl.JobFileRawLoader.java
License:Apache License
public int run(String[] args)
        throws ParseException, IOException, ClassNotFoundException, InterruptedException {
    Configuration myHBaseConf = HBaseConfiguration.create(getConf());
    hdfs = FileSystem.get(myHBaseConf);

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(myHBaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    String input = null;
    boolean inputSpecified = commandLine.hasOption("i");
    if (inputSpecified) {
        // Grab the input path argument
        input = commandLine.getOptionValue("i");
        LOG.info("input=" + input);
    } else {
        LOG.info("Processing input from HBase ProcessRecords");
    }

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    String processFileSubstring = null;
    if (commandLine.hasOption("p")) {
        processFileSubstring = commandLine.getOptionValue("p");
    }
    LOG.info("processFileSubstring=" + processFileSubstring);

    boolean forceReprocess = commandLine.hasOption("f");
    LOG.info("forceReprocess: " + forceReprocess);

    // hbase.client.keyvalue.maxsize somehow defaults to 10 MB and we have
    // history files exceeding that. Disable limit.
    myHBaseConf.setInt("hbase.client.keyvalue.maxsize", 0);

    // Shove this into the jobConf so that we can get it out on the task side.
    myHBaseConf.setStrings(Constants.CLUSTER_JOB_CONF_KEY, cluster);

    boolean success = processRecordsFromHBase(myHBaseConf, cluster, processFileSubstring, forceReprocess);

    // Return the status
    return success ? 0 : 1;
}
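Both hRaven drivers above store a single value, the cluster name, with setStrings so that (as their comments note) it can be pulled back out on the task side. The task-side code is not part of this listing; the following is a hypothetical sketch of how such a value would typically be read back in a mapper. The key literal is made up and stands in for Constants.CLUSTER_JOB_CONF_KEY.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ClusterAwareMapper extends Mapper<LongWritable, Text, Text, Text> {
    // Hypothetical stand-in for Constants.CLUSTER_JOB_CONF_KEY.
    private static final String CLUSTER_JOB_CONF_KEY = "example.cluster.jobconf.key";

    private String cluster;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // setStrings() was called with a single value, so the first element is the cluster name.
        String[] values = context.getConfiguration().getStrings(CLUSTER_JOB_CONF_KEY);
        cluster = (values != null && values.length > 0) ? values[0] : null;
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Tag every output record with the cluster the job was configured for.
        context.write(new Text(cluster), value);
    }
}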
From source file:com.vertica.hadoop.VerticaConfiguration.java
License:Apache License
/**
 * Sets the Vertica database connection information in the {@link Configuration}
 *
 * @param conf
 *          the configuration
 * @param hostnames
 *          one or more hosts in the Vertica cluster
 * @param database
 *          the name of the Vertica database
 * @param username
 *          Vertica database username
 * @param password
 *          Vertica database password
 * @param port
 *          Vertica database port
 */
public static void configureVertica(Configuration conf, String[] hostnames, String database, String port,
        String username, String password) {
    conf.setBoolean(MAP_SPECULATIVE_EXEC, false);
    conf.setBoolean(REDUCE_SPECULATIVE_EXEC, false);
    conf.setStrings(HOSTNAMES_PROP, hostnames);
    conf.set(DATABASE_PROP, database);
    conf.set(USERNAME_PROP, username);
    conf.set(PASSWORD_PROP, password);
    conf.set(PORT_PROP, port);
}
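A short usage sketch for the helper above (the host names, database, port, and credentials are made up): only the host array goes through setStrings, so HOSTNAMES_PROP ends up holding a single comma-delimited list while the remaining settings are stored as plain string properties.

import org.apache.hadoop.conf.Configuration;
import com.vertica.hadoop.VerticaConfiguration;

public class VerticaSetupExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        String[] hosts = { "vertica01.example.com", "vertica02.example.com" };

        // HOSTNAMES_PROP now holds "vertica01.example.com,vertica02.example.com".
        VerticaConfiguration.configureVertica(conf, hosts, "exampledb", "5433", "dbadmin", "secret");
    }
}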
From source file:com.vertica.hadoop.VerticaConfiguration.java
License:Apache License
/**
 * Sets the Vertica database connection information in the {@link Configuration}
 *
 * @param conf
 *          the configuration
 * @param hostnames
 *          one or more hosts in the source cluster
 * @param database
 *          the name of the source Vertica database
 * @param username
 *          for the source Vertica database
 * @param password
 *          for the source Vertica database
 * @param port
 *          for the source Vertica database
 * @param level
 *          JDBC logging level
 * @param logpath
 *          JDBC debug logging - location of logs to be written, default is current directory
 * @param output_hostnames
 *          one or more hosts in the output cluster
 * @param output_database
 *          the name of the output Vertica database
 * @param output_username
 *          for the target Vertica database
 * @param output_password
 *          for the target Vertica database
 * @param output_port
 *          for the target Vertica database
 */
public static void configureVertica(Configuration conf, String[] hostnames, String database, String port,
        String username, String password, String level, String logpath, String[] output_hostnames,
        String output_database, String output_port, String output_username, String output_password) {
    configureVertica(conf, hostnames, database, port, username, password, level, logpath);
    conf.setStrings(OUTPUT_HOSTNAMES_PROP, output_hostnames);
    conf.set(OUTPUT_DATABASE_PROP, output_database);
    conf.set(OUTPUT_PORT_PROP, output_port);
    conf.set(OUTPUT_USERNAME_PROP, output_username);
    conf.set(OUTPUT_PASSWORD_PROP, output_password);
}
From source file:com.willetinc.hadoop.mapreduce.dynamodb.DynamoDBQueryInputFormat.java
License:Apache License
public static void setRangeKeyValues(Configuration conf, Types type, Collection<AttributeValue> values) {
    setInterpolateAcrossRangeKeyValues(conf, false);
    setRangeKeyType(conf, type);

    List<String> attrValues = new ArrayList<String>();
    for (AttributeValue attr : values) {
        attrValues.add(AttributeValueIOUtils.toString(type, attr));
    }

    conf.setStrings(DynamoDBConfiguration.RANGE_KEY_VALUES_PROPERTY, attrValues.toArray(new String[] {}));
}
From source file:com.willetinc.hadoop.mapreduce.dynamodb.DynamoDBQueryInputFormatTest.java
License:Apache License
@Test
public void testSetRangeKeyValues() {
    Configuration conf = createMock(Configuration.class);
    final String[] VALUES = new String[] { "TEST1", "TEST2" };
    Types type = Types.STRING;

    List<AttributeValue> attrs = new ArrayList<AttributeValue>();
    for (String value : VALUES) {
        attrs.add(new AttributeValue().withS(value));
    }

    conf.setBoolean(DynamoDBConfiguration.RANGE_KEY_INTERPOLATE_PROPERTY, false);
    conf.setInt(DynamoDBConfiguration.RANGE_KEY_TYPE_PROPERTY, type.ordinal());
    conf.setStrings(DynamoDBConfiguration.RANGE_KEY_VALUES_PROPERTY, VALUES);

    replay(conf);
    DynamoDBQueryInputFormat.setRangeKeyValues(conf, type, attrs);
    verify(conf);
}
From source file:com.willetinc.hadoop.mapreduce.dynamodb.DynamoDBQueryInputFormatTest.java
License:Apache License
@Test
public void testSetRangeKeyCondition() {
    Configuration conf = createMock(Configuration.class);
    final String[] VALUES = new String[] { "TEST1", "TEST2" };
    Types type = Types.STRING;
    ComparisonOperator operator = ComparisonOperator.BETWEEN;

    List<AttributeValue> attrs = new ArrayList<AttributeValue>();
    for (String value : VALUES) {
        attrs.add(new AttributeValue().withS(value));
    }

    conf.setBoolean(DynamoDBConfiguration.RANGE_KEY_INTERPOLATE_PROPERTY, false);
    conf.setInt(DynamoDBConfiguration.RANGE_KEY_OPERATOR_PROPERTY, ComparisonOperator.BETWEEN.ordinal());
    conf.setInt(DynamoDBConfiguration.RANGE_KEY_TYPE_PROPERTY, type.ordinal());
    conf.setStrings(DynamoDBConfiguration.RANGE_KEY_VALUES_PROPERTY, VALUES);

    replay(conf);
    DynamoDBQueryInputFormat.setRangeKeyCondition(conf, type, operator, attrs);
    verify(conf);
}
From source file:com.yahoo.glimmer.indexing.preprocessor.PrepTool.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG, "Don't process the contexts for each tuple."),
                    new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O', ONTOLOGY_ARG),
                    new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG),
                    new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location for the input data."),
                    new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location for the out data."), });

    JSAPResult jsapResult = jsap.parse(args);
    if (!jsapResult.success()) {
        System.err.print(jsap.getUsage());
        System.exit(1);
    }

    Configuration config = getConf();

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts);

    // The ontology if any...
    String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG);
    if (ontologyFilename != null) {
        // Load the ontology
        InputStream ontologyInputStream = new FileInputStream(ontologyFilename);
        OWLOntology ontology = OntologyLoader.load(ontologyInputStream);
        System.out.println(
                "Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms.");

        ArrayList<String> ontologyClasses = new ArrayList<String>();
        for (OWLClass owlClass : ontology.getClassesInSignature()) {
            ontologyClasses.add(owlClass.getIRI().toString());
        }
        System.out.println("Adding " + ontologyClasses.size() + " classes from ontology.");
        config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0]));
    } else {
        System.out.println("No ontology filename set in conf. No ontology has been loaded.");
    }

    Job job = Job.getInstance(config);
    job.setJarByClass(PrepTool.class);
    job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis());
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TuplesToResourcesMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT);
    job.setNumReduceTasks(reducerCount);
    if (reducerCount == 1) {
        // We assign 'global' ids in the reducer. For this to work, there
        // can be only one. But using just one reducer, we run out of local disk space during the
        // pre-reduce merge with big data sets like WCC.
        job.setReducerClass(ResourcesReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Object.class);
        job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class);
    } else {
        /*
         * TODO: Take the functionality of the reducer and move it to run on
         * the gateway. We then use n identity reducers, the output of which
         * will be read and merged as streams on the gateway.
         */
    }

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG)));
    Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG));
    FileOutputFormat.setOutputPath(job, outputDir);

    if (!job.waitForCompletion(true)) {
        System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG));
        return 1;
    }

    // IF THERE WAS ONLY ONE REDUCER WE NOW HAVE
    // One file per reducer containing lists of urls(recourses) for
    // subjects, predicates, objects and contexts.
    // One file per reducer that contains all resources. subjects +
    // predicates + objects + contexts.
    // One file per reducer that contains the subjects + all <predicate>
    // <object>|"Literal" <context> on that subject.
    // IF THERE WAS MORE THAN ONE REDUCER WE NOW HAVE N FILES THAT NEED TO BE MERGED ON THE GATEWAY. TODO.
    return 0;
}
From source file:com.yahoo.glimmer.indexing.RDFDocumentFactory.java
License:Open Source License
protected static void setupConf(Configuration conf, IndexType type, boolean withContexts, String resourcesHash,
        String resourceIdPrefix, String... fields) {
    conf.setEnum(CONF_INDEX_TYPE_KEY, type);
    conf.setBoolean(CONF_WITH_CONTEXTS_KEY, withContexts);
    if (resourcesHash != null) {
        conf.set(CONF_RESOURCES_HASH_KEY, resourcesHash);
    }
    conf.set(CONF_RESOURCE_ID_PREFIX_KEY, resourceIdPrefix);
    conf.setStrings(CONF_FIELDNAMES_KEY, fields);
}