List of usage examples for org.apache.hadoop.conf Configuration getLong
public long getLong(String name, long defaultValue)
Parameter: name - the property name.
Returns: the value of the name property as a long, or defaultValue if the property is not set.
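Before the library examples below, here is a minimal standalone sketch of the call. The property name "my.app.timeout.ms" and its values are hypothetical, chosen only to illustrate how the default is applied when a property is missing.

import org.apache.hadoop.conf.Configuration;

public class GetLongExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Hypothetical property, set only for this illustration.
    conf.setLong("my.app.timeout.ms", 30000L);

    // Returns the configured value (30000); if the property were unset,
    // the supplied default (60000) would be returned instead.
    long timeout = conf.getLong("my.app.timeout.ms", 60000L);
    System.out.println("timeout = " + timeout);
  }
}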
From source file:org.apache.crunch.kafka.inputformat.KafkaRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException {
  if (!(inputSplit instanceof KafkaInputSplit)) {
    throw new CrunchRuntimeException("InputSplit for RecordReader is not valid split type.");
  }
  Properties kafkaConnectionProperties = filterConnectionProperties(
      getKafkaConnectionProperties(taskAttemptContext.getConfiguration()));

  consumer = new KafkaConsumer<>(kafkaConnectionProperties);
  KafkaInputSplit split = (KafkaInputSplit) inputSplit;
  TopicPartition topicPartition = split.getTopicPartition();

  consumer.assign(Collections.singletonList(topicPartition));

  //suggested hack to gather info without gathering data
  consumer.poll(0);

  //now seek to the desired start location
  startingOffset = split.getStartingOffset();
  consumer.seek(topicPartition, startingOffset);

  currentOffset = startingOffset - 1;
  endingOffset = split.getEndingOffset();

  maxNumberOfRecords = endingOffset - startingOffset;
  if (LOG.isInfoEnabled()) {
    LOG.info("Reading data from {} between {} and {}",
        new Object[] { topicPartition, startingOffset, endingOffset });
  }

  Configuration config = taskAttemptContext.getConfiguration();
  consumerPollTimeout = config.getLong(CONSUMER_POLL_TIMEOUT_KEY, CONSUMER_POLL_TIMEOUT_DEFAULT);
  maxNumberAttempts = config.getInt(KAFKA_RETRY_ATTEMPTS_KEY, KAFKA_RETRY_ATTEMPTS_DEFAULT);
  maxConcurrentEmptyResponses = config.getInt(KAFKA_EMPTY_RETRY_ATTEMPTS_KEY,
      KAFKA_RETRY_EMPTY_ATTEMPTS_DEFAULT);
  concurrentEmptyResponses = 0;
}
From source file:org.apache.crunch.kafka.record.KafkaInputFormat.java
License:Apache License
/**
 * Reads the {@code configuration} to determine which topics, partitions, and offsets should be used
 * for reading data.
 *
 * @param configuration the configuration to derive the data to read.
 * @return a map of {@link TopicPartition} to a pair of start and end offsets.
 * @throws IllegalStateException if the {@code configuration} does not have the start and end offsets
 *     set properly for a partition.
 */
public static Map<TopicPartition, Pair<Long, Long>> getOffsets(Configuration configuration) {
  Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
  //find configuration for all of the topics with defined partitions
  Map<String, String> topicPartitionKeys = configuration.getValByRegex(TOPIC_KEY_REGEX);

  //for each topic start to process its partitions
  for (String key : topicPartitionKeys.keySet()) {
    String topic = getTopicFromKey(key);
    int[] partitions = configuration.getInts(key);
    //for each partition find and add the start/end offset
    for (int partitionId : partitions) {
      TopicPartition topicPartition = new TopicPartition(topic, partitionId);
      long start = configuration.getLong(generatePartitionStartKey(topic, partitionId), Long.MIN_VALUE);
      long end = configuration.getLong(generatePartitionEndKey(topic, partitionId), Long.MIN_VALUE);

      if (start == Long.MIN_VALUE || end == Long.MIN_VALUE) {
        throw new IllegalStateException("The " + topicPartition + " has an invalid start:" + start
            + " or end:" + end + " offset configured.");
      }

      offsets.put(topicPartition, Pair.of(start, end));
    }
  }

  return offsets;
}
From source file:org.apache.crunch.kafka.record.KafkaRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException {
  if (!(inputSplit instanceof KafkaInputSplit)) {
    throw new CrunchRuntimeException("InputSplit for RecordReader is not valid split type.");
  }
  kafkaConnectionProperties = filterConnectionProperties(
      getKafkaConnectionProperties(taskAttemptContext.getConfiguration()));

  consumer = new KafkaConsumer<>(kafkaConnectionProperties);
  KafkaInputSplit split = (KafkaInputSplit) inputSplit;
  topicPartition = split.getTopicPartition();

  consumer.assign(Collections.singletonList(topicPartition));

  //suggested hack to gather info without gathering data
  consumer.poll(0);

  //now seek to the desired start location
  startingOffset = split.getStartingOffset();
  consumer.seek(topicPartition, startingOffset);

  currentOffset = startingOffset - 1;
  endingOffset = split.getEndingOffset();

  maxNumberOfRecords = endingOffset - startingOffset;
  if (LOG.isInfoEnabled()) {
    LOG.info("Reading data from {} between {} and {}",
        new Object[] { topicPartition, startingOffset, endingOffset });
  }

  Configuration config = taskAttemptContext.getConfiguration();
  consumerPollTimeout = config.getLong(CONSUMER_POLL_TIMEOUT_KEY, CONSUMER_POLL_TIMEOUT_DEFAULT);
  maxNumberAttempts = config.getInt(KAFKA_RETRY_ATTEMPTS_KEY, KAFKA_RETRY_ATTEMPTS_DEFAULT);
  maxConcurrentEmptyResponses = config.getInt(KAFKA_EMPTY_RETRY_ATTEMPTS_KEY,
      KAFKA_RETRY_EMPTY_ATTEMPTS_DEFAULT);
  concurrentEmptyResponses = 0;
}
From source file:org.apache.crunch.util.PartitionUtils.java
License:Apache License
public static <T> int getRecommendedPartitions(PCollection<T> pcollection, Configuration conf) {
  long bytesPerTask = conf.getLong(BYTES_PER_REDUCE_TASK, DEFAULT_BYTES_PER_REDUCE_TASK);
  return 1 + (int) (pcollection.getSize() / bytesPerTask);
}
From source file:org.apache.druid.indexer.hadoop.DatasourceInputFormat.java
License:Apache License
public static long getMaxSplitSize(final Configuration conf, final String dataSource) {
  return conf.getLong(StringUtils.format("%s.%s", CONF_MAX_SPLIT_SIZE, dataSource), 0L);
}
From source file:org.apache.giraph.conf.LongConfOption.java
License:Apache License
/**
 * Lookup value, use passed in default value if not found.
 *
 * @param conf Configuration
 * @param val default value to use
 * @return set for key, or default value passed in
 */
public long getWithDefault(Configuration conf, long val) {
  return conf.getLong(getKey(), val);
}
From source file:org.apache.giraph.examples.RandomWalkWorkerContext.java
License:Apache License
/**
 * Initialize sources for Random Walk with Restart. First option
 * (preferential) is single source given from the command line as a parameter.
 * Second option is a file with a list of vertex IDs, one per line. In this
 * second case the preference vector is a uniform distribution over these
 * vertexes.
 *
 * @param configuration The configuration.
 * @return a (possibly empty) set of source vertices
 */
private ImmutableSet<Long> initializeSources(Configuration configuration) {
  ImmutableSet.Builder<Long> builder = ImmutableSet.builder();
  long sourceVertex = configuration.getLong(SOURCE_VERTEX, Long.MIN_VALUE);
  if (sourceVertex != Long.MIN_VALUE) {
    return ImmutableSet.of(sourceVertex);
  } else {
    Path sourceFile = null;
    try {
      Path[] cacheFiles = DistributedCache.getLocalCacheFiles(configuration);
      if (cacheFiles == null || cacheFiles.length == 0) {
        // empty set if no source vertices configured
        return ImmutableSet.of();
      }
      sourceFile = cacheFiles[0];
      FileSystem fs = FileSystem.getLocal(configuration);
      BufferedReader in = new BufferedReader(
          new InputStreamReader(fs.open(sourceFile), Charset.defaultCharset()));
      String line;
      while ((line = in.readLine()) != null) {
        builder.add(Long.parseLong(line));
      }
      in.close();
    } catch (IOException e) {
      getContext().setStatus("Could not load local cache files: " + sourceFile);
      LOG.error("Could not load local cache files: " + sourceFile, e);
    }
  }
  return builder.build();
}
From source file:org.apache.gobblin.runtime.TaskExecutor.java
License:Apache License
/**
 * Constructor to work with Hadoop {@link org.apache.hadoop.conf.Configuration}.
 */
public TaskExecutor(Configuration conf) {
  this(conf.getInt(ConfigurationKeys.TASK_EXECUTOR_THREADPOOL_SIZE_KEY,
          ConfigurationKeys.DEFAULT_TASK_EXECUTOR_THREADPOOL_SIZE),
      conf.getInt(ConfigurationKeys.TASK_RETRY_THREAD_POOL_CORE_SIZE_KEY,
          ConfigurationKeys.DEFAULT_TASK_RETRY_THREAD_POOL_CORE_SIZE),
      conf.getLong(ConfigurationKeys.TASK_RETRY_INTERVAL_IN_SEC_KEY,
          ConfigurationKeys.DEFAULT_TASK_RETRY_INTERVAL_IN_SEC),
      conf.getInt(ConfigurationKeys.QUEUED_TASK_TIME_MAX_SIZE,
          ConfigurationKeys.DEFAULT_QUEUED_TASK_TIME_MAX_SIZE),
      conf.getLong(ConfigurationKeys.QUEUED_TASK_TIME_MAX_AGE,
          ConfigurationKeys.DEFAULT_QUEUED_TASK_TIME_MAX_AGE),
      conf.getInt(ConfigurationKeys.METRIC_TIMER_WINDOW_SIZE_IN_MINUTES,
          ConfigurationKeys.DEFAULT_METRIC_TIMER_WINDOW_SIZE_IN_MINUTES));
}
From source file:org.apache.hadoop.examples.RandomTextWriter.java
License:Apache License
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
  if (args.length == 0) {
    return printUsage();
  }

  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have " + BYTES_PER_MAP + " set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
      numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);

  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormatClass(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(RandomTextMapper.class);

  Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormatClass(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));

  System.out.println("Running " + numMaps + " maps.");

  // reducer NONE
  job.setNumReduceTasks(0);

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return ret;
}
From source file:org.apache.hadoop.examples.RandomWriter.java
License:Apache License
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }

  Path outDir = new Path(args[0]);
  Configuration conf = getConf();
  JobClient client = new JobClient(conf);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = conf.getInt(MAPS_PER_HOST, 10);
  long numBytesToWritePerMap = conf.getLong(BYTES_PER_MAP, 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have" + BYTES_PER_MAP + " set to 0");
    return -2;
  }
  long totalBytesToWrite = conf.getLong(TOTAL_BYTES,
      numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    conf.setLong(BYTES_PER_MAP, totalBytesToWrite);
  }
  conf.setInt(MRJobConfig.NUM_MAPS, numMaps);

  Job job = Job.getInstance(conf);

  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);
  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);
  job.setInputFormatClass(RandomInputFormat.class);
  job.setMapperClass(RandomMapper.class);
  job.setReducerClass(Reducer.class);
  job.setOutputFormatClass(SequenceFileOutputFormat.class);

  System.out.println("Running " + numMaps + " maps.");

  // reducer NONE
  job.setNumReduceTasks(0);

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  int ret = job.waitForCompletion(true) ? 0 : 1;
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return ret;
}