/*
 * Druid - a distributed column store.
 * Copyright 2012 - 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.druid.indexer;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.primitives.Longs;
import com.metamx.common.IAE;
import com.metamx.common.ISE;
import com.metamx.common.logger.Logger;
import io.druid.collections.StupidPool;
import io.druid.data.input.InputRow;
import io.druid.data.input.Row;
import io.druid.data.input.Rows;
import io.druid.indexer.hadoop.SegmentInputRow;
import io.druid.offheap.OffheapBufferPool;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.segment.IndexIO;
import io.druid.segment.IndexMaker;
import io.druid.segment.LoggingProgressIndicator;
import io.druid.segment.ProgressIndicator;
import io.druid.segment.QueryableIndex;
import io.druid.segment.incremental.IncrementalIndex;
import io.druid.segment.incremental.IncrementalIndexSchema;
import io.druid.segment.incremental.OffheapIncrementalIndex;
import io.druid.segment.incremental.OnheapIncrementalIndex;
import io.druid.timeline.DataSegment;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.joda.time.DateTime;
import org.joda.time.Interval;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * Runs the Hadoop job that builds Druid segments: the mapper groups rows into
 * time/shard buckets, the optional combiner pre-aggregates them, and the reducer
 * builds and persists the segments, writing a descriptor file for each one.
 */
public class IndexGeneratorJob implements Jobby
{
  private static final Logger log = new Logger(IndexGeneratorJob.class);

  public static List<DataSegment> getPublishedSegments(HadoopDruidIndexerConfig config)
  {
    final Configuration conf = JobHelper.injectSystemProperties(new Configuration());
    final ObjectMapper jsonMapper = HadoopDruidIndexerConfig.jsonMapper;

    ImmutableList.Builder<DataSegment> publishedSegmentsBuilder = ImmutableList.builder();

    final Path descriptorInfoDir = config.makeDescriptorInfoDir();

    try {
      FileSystem fs = descriptorInfoDir.getFileSystem(conf);
      for (FileStatus status : fs.listStatus(descriptorInfoDir)) {
        final DataSegment segment = jsonMapper.readValue(fs.open(status.getPath()), DataSegment.class);
        publishedSegmentsBuilder.add(segment);
        log.info("Adding segment %s to the list of published segments", segment.getIdentifier());
      }
    }
    catch (FileNotFoundException e) {
      log.error(
          "[%s] SegmentDescriptorInfo is not found. This usually happens when the indexing process did not produce"
          + " any segments, meaning either there was no input data to process or all the input events were discarded"
          + " due to some error",
          e.getMessage()
      );
      Throwables.propagate(e);
    }
    catch (IOException e) {
      throw Throwables.propagate(e);
    }

    List<DataSegment> publishedSegments = publishedSegmentsBuilder.build();

    return publishedSegments;
  }

  private final HadoopDruidIndexerConfig config;
  private IndexGeneratorStats jobStats;

  public IndexGeneratorJob(HadoopDruidIndexerConfig config)
  {
    this.config = config;
    this.jobStats = new IndexGeneratorStats();
  }

  protected void setReducerClass(final Job job)
  {
    job.setReducerClass(IndexGeneratorReducer.class);
  }

  public IndexGeneratorStats getJobStats()
  {
    return jobStats;
  }

  public boolean run()
  {
    try {
      Job job = Job.getInstance(
          new Configuration(),
          String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals())
      );

      job.getConfiguration().set("io.sort.record.percent", "0.23");

      JobHelper.injectSystemProperties(job);
      config.addJobProperties(job);

      job.setMapperClass(IndexGeneratorMapper.class);
      job.setMapOutputValueClass(BytesWritable.class);

      SortableBytes.useSortableBytesAsMapOutputKey(job);

      int numReducers = Iterables.size(config.getAllBuckets().get());
      if (numReducers == 0) {
        throw new RuntimeException("No buckets?? seems there is no data to index.");
      }

      if (config.getSchema().getTuningConfig().getUseCombiner()) {
        job.setCombinerClass(IndexGeneratorCombiner.class);
        job.setCombinerKeyGroupingComparatorClass(BytesWritable.Comparator.class);
      }

      job.setNumReduceTasks(numReducers);
      job.setPartitionerClass(IndexGeneratorPartitioner.class);

      setReducerClass(job);
      job.setOutputKeyClass(BytesWritable.class);
      job.setOutputValueClass(Text.class);
      job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
      FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

      config.addInputPaths(job);

      // hack to get druid.processing.bitmap property passed down to hadoop job.
      // once IndexIO doesn't rely on globally injected properties, we can move this into the HadoopTuningConfig.
      final String bitmapProperty = "druid.processing.bitmap.type";
      final String bitmapType = HadoopDruidIndexerConfig.properties.getProperty(bitmapProperty);
      if (bitmapType != null) {
        for (String property : new String[]{"mapreduce.reduce.java.opts", "mapreduce.map.java.opts"}) {
          // prepend property to allow overriding using hadoop.xxx properties by JobHelper.injectSystemProperties above
          String value = Strings.nullToEmpty(job.getConfiguration().get(property));
          job.getConfiguration().set(property, String.format("-D%s=%s %s", bitmapProperty, bitmapType, value));
        }
      }

      config.intoConfiguration(job);

      JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), job);

      job.submit();
      log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

      boolean success = job.waitForCompletion(true);

      Counter invalidRowCount = job.getCounters()
                                   .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
      jobStats.setInvalidRowCount(invalidRowCount.getValue());

      return success;
    }
    catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  private static IncrementalIndex makeIncrementalIndex(
      Bucket theBucket,
      AggregatorFactory[] aggs,
      HadoopDruidIndexerConfig config,
      boolean isOffHeap,
      StupidPool bufferPool
  )
  {
    final HadoopTuningConfig tuningConfig = config.getSchema().getTuningConfig();
    final IncrementalIndexSchema indexSchema = new IncrementalIndexSchema.Builder()
        .withMinTimestamp(theBucket.time.getMillis())
        .withDimensionsSpec(config.getSchema().getDataSchema().getParser())
        .withQueryGranularity(config.getSchema().getDataSchema().getGranularitySpec().getQueryGranularity())
        .withMetrics(aggs)
        .build();

    if (isOffHeap) {
      return new OffheapIncrementalIndex(
          indexSchema,
          bufferPool,
          true,
          tuningConfig.getBufferSize()
      );
    } else {
      return new OnheapIncrementalIndex(
          indexSchema,
          tuningConfig.getRowFlushBoundary()
      );
    }
  }

  public static class IndexGeneratorMapper extends HadoopDruidIndexerMapper<BytesWritable, BytesWritable>
  {
    private static final HashFunction hashFunction = Hashing.murmur3_128();

    private AggregatorFactory[] aggregators;
    private AggregatorFactory[] combiningAggs;

    @Override
    protected void setup(Context context)
        throws IOException, InterruptedException
    {
      super.setup(context);
      aggregators = config.getSchema().getDataSchema().getAggregators();
      combiningAggs = new AggregatorFactory[aggregators.length];
      for (int i = 0; i < aggregators.length; ++i) {
        combiningAggs[i] = aggregators[i].getCombiningFactory();
      }
    }

    @Override
    protected void innerMap(
        InputRow inputRow,
        Object value,
        Context context
    ) throws IOException, InterruptedException
    {
      // Group by bucket, sort by timestamp
      final Optional<Bucket> bucket = getConfig().getBucket(inputRow);

      if (!bucket.isPresent()) {
        throw new ISE("WTF?! No bucket found for row: %s", inputRow);
      }

      final long truncatedTimestamp = granularitySpec.getQueryGranularity().truncate(inputRow.getTimestampFromEpoch());
      final byte[] hashedDimensions = hashFunction.hashBytes(
          HadoopDruidIndexerConfig.jsonMapper.writeValueAsBytes(
              Rows.toGroupKey(
                  truncatedTimestamp,
                  inputRow
              )
          )
      ).asBytes();

      // type SegmentInputRow serves as a marker that these InputRow instances have already been combined
      // and they contain the columns as they show up in the segment after ingestion, not what you would see in raw
      // data
      byte[] serializedInputRow = inputRow instanceof SegmentInputRow
                                  ? InputRowSerde.toBytes(inputRow, combiningAggs)
                                  : InputRowSerde.toBytes(inputRow, aggregators);

      context.write(
          new SortableBytes(
              bucket.get().toGroupKey(),
              // sort rows by truncated timestamp and hashed dimensions to help reduce spilling on the reducer side
              ByteBuffer.allocate(Longs.BYTES + hashedDimensions.length)
                        .putLong(truncatedTimestamp)
                        .put(hashedDimensions)
                        .array()
          ).toBytesWritable(),
          new BytesWritable(serializedInputRow)
      );
    }
  }

  public static class IndexGeneratorCombiner extends Reducer<BytesWritable, BytesWritable, BytesWritable, BytesWritable>
  {
    private HadoopDruidIndexerConfig config;
    private AggregatorFactory[] aggregators;
    private AggregatorFactory[] combiningAggs;

    @Override
    protected void setup(Context context)
        throws IOException, InterruptedException
    {
      config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());

      aggregators = config.getSchema().getDataSchema().getAggregators();
      combiningAggs = new AggregatorFactory[aggregators.length];
      for (int i = 0; i < aggregators.length; ++i) {
        combiningAggs[i] = aggregators[i].getCombiningFactory();
      }
    }

    @Override
    protected void reduce(final BytesWritable key, Iterable<BytesWritable> values, final Context context)
        throws IOException, InterruptedException
    {
      Iterator<BytesWritable> iter = values.iterator();
      BytesWritable first = iter.next();

      if (iter.hasNext()) {
        SortableBytes keyBytes = SortableBytes.fromBytesWritable(key);
        Bucket bucket = Bucket.fromGroupKey(keyBytes.getGroupKey()).lhs;
        IncrementalIndex index = makeIncrementalIndex(bucket, combiningAggs, config, false, null);
        index.add(InputRowSerde.fromBytes(first.getBytes(), aggregators));

        while (iter.hasNext()) {
          context.progress();
          InputRow value = InputRowSerde.fromBytes(iter.next().getBytes(), aggregators);

          if (!index.canAppendRow()) {
            log.info("current index full due to [%s]. creating new index.", index.getOutOfRowsReason());
            flushIndexToContextAndClose(key, index, context);
            index = makeIncrementalIndex(bucket, combiningAggs, config, false, null);
          }

          index.add(value);
        }

        flushIndexToContextAndClose(key, index, context);
      } else {
        context.write(key, first);
      }
    }

    private void flushIndexToContextAndClose(BytesWritable key, IncrementalIndex index, Context context)
        throws IOException, InterruptedException
    {
      Iterator<Row> rows = index.iterator();
      while (rows.hasNext()) {
        context.progress();
        Row row = rows.next();
        InputRow inputRow = getInputRowFromRow(row, index.getDimensions());
        context.write(
            key,
            new BytesWritable(InputRowSerde.toBytes(inputRow, combiningAggs))
        );
      }
      index.close();
    }

    private InputRow getInputRowFromRow(final Row row, final List<String> dimensions)
    {
      return new InputRow()
      {
        @Override
        public List<String> getDimensions()
        {
          return dimensions;
        }

        @Override
        public long getTimestampFromEpoch()
        {
          return row.getTimestampFromEpoch();
        }

        @Override
        public DateTime getTimestamp()
        {
          return row.getTimestamp();
        }

        @Override
        public List<String> getDimension(String dimension)
        {
          return row.getDimension(dimension);
        }

        @Override
        public Object getRaw(String dimension)
        {
          return row.getRaw(dimension);
        }

        @Override
        public float getFloatMetric(String metric)
        {
          return row.getFloatMetric(metric);
        }

        @Override
        public long getLongMetric(String metric)
        {
          return row.getLongMetric(metric);
        }

        @Override
        public int compareTo(Row o)
        {
          return row.compareTo(o);
        }
      };
    }
  }

  public static class IndexGeneratorPartitioner extends Partitioner<BytesWritable, Writable> implements Configurable
  {
    private Configuration config;

    @Override
    public int getPartition(BytesWritable bytesWritable, Writable value, int numPartitions)
    {
      final ByteBuffer bytes = ByteBuffer.wrap(bytesWritable.getBytes());
      bytes.position(4); // Skip length added by SortableBytes
      int shardNum = bytes.getInt();
      if (config.get("mapred.job.tracker").equals("local")) {
        return shardNum % numPartitions;
      } else {
        if (shardNum >= numPartitions) {
          throw new ISE("Not enough partitions, shard[%,d] >= numPartitions[%,d]", shardNum, numPartitions);
        }
        return shardNum;
      }
    }

    @Override
    public Configuration getConf()
    {
      return config;
    }

    @Override
    public void setConf(Configuration config)
    {
      this.config = config;
    }
  }

  public static class IndexGeneratorReducer extends Reducer<BytesWritable, BytesWritable, BytesWritable, Text>
  {
    protected HadoopDruidIndexerConfig config;
    private List<String> metricNames = Lists.newArrayList();
    private AggregatorFactory[] aggregators;
    private AggregatorFactory[] combiningAggs;

    protected ProgressIndicator makeProgressIndicator(final Context context)
    {
      return new LoggingProgressIndicator("IndexGeneratorJob")
      {
        @Override
        public void progress()
        {
          context.progress();
        }
      };
    }

    protected File persist(
        final IncrementalIndex index,
        final Interval interval,
        final File file,
        final ProgressIndicator progressIndicator
    ) throws IOException
    {
      return IndexMaker.persist(index, interval, file, null, config.getIndexSpec(), progressIndicator);
    }

    protected File mergeQueryableIndex(
        final List<QueryableIndex> indexes,
        final AggregatorFactory[] aggs,
        final File file,
        ProgressIndicator progressIndicator
    ) throws IOException
    {
      return IndexMaker.mergeQueryableIndex(indexes, aggs, file, config.getIndexSpec(), progressIndicator);
    }

    @Override
    protected void setup(Context context)
        throws IOException, InterruptedException
    {
      config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());

      aggregators = config.getSchema().getDataSchema().getAggregators();
      combiningAggs = new AggregatorFactory[aggregators.length];
      for (int i = 0; i < aggregators.length; ++i) {
        metricNames.add(aggregators[i].getName());
        combiningAggs[i] = aggregators[i].getCombiningFactory();
      }
    }

    @Override
    protected void reduce(BytesWritable key, Iterable<BytesWritable> values, final Context context)
        throws IOException, InterruptedException
    {
      SortableBytes keyBytes = SortableBytes.fromBytesWritable(key);
      Bucket bucket = Bucket.fromGroupKey(keyBytes.getGroupKey()).lhs;

      final Interval interval = config.getGranularitySpec().bucketInterval(bucket.time).get();
      final int maxTotalBufferSize = config.getSchema().getTuningConfig().getBufferSize();
      final int aggregationBufferSize = (int) ((double) maxTotalBufferSize
                                               * config.getSchema().getTuningConfig().getAggregationBufferRatio());

      final StupidPool<ByteBuffer> bufferPool = new OffheapBufferPool(aggregationBufferSize);
      IncrementalIndex index = makeIncrementalIndex(
          bucket,
          combiningAggs,
          config,
          config.getSchema().getTuningConfig().isIngestOffheap(),
          bufferPool
      );
      try {
        File baseFlushFile = File.createTempFile("base", "flush");
        baseFlushFile.delete();
        baseFlushFile.mkdirs();

        Set<File> toMerge = Sets.newTreeSet();
        int indexCount = 0;
        int lineCount = 0;
        int runningTotalLineCount = 0;
        long startTime = System.currentTimeMillis();

        Set<String> allDimensionNames = Sets.newHashSet();
        final ProgressIndicator progressIndicator = makeProgressIndicator(context);

        for (final BytesWritable bw : values) {
          context.progress();

          final InputRow inputRow = index.formatRow(InputRowSerde.fromBytes(bw.getBytes(), aggregators));
          allDimensionNames.addAll(inputRow.getDimensions());

          int numRows = index.add(inputRow);
          ++lineCount;

          if (!index.canAppendRow()) {
            log.info(index.getOutOfRowsReason());
            log.info(
                "%,d lines to %,d rows in %,d millis",
                lineCount - runningTotalLineCount,
                numRows,
                System.currentTimeMillis() - startTime
            );
            runningTotalLineCount = lineCount;

            final File file = new File(baseFlushFile, String.format("index%,05d", indexCount));
            toMerge.add(file);

            context.progress();
            persist(index, interval, file, progressIndicator);
            // close this index and make a new one, reusing same buffer
            index.close();
            index = makeIncrementalIndex(
                bucket,
                combiningAggs,
                config,
                config.getSchema().getTuningConfig().isIngestOffheap(),
                bufferPool
            );

            startTime = System.currentTimeMillis();
            ++indexCount;
          }
        }

        log.info("%,d lines completed.", lineCount);

        List<QueryableIndex> indexes = Lists.newArrayListWithCapacity(indexCount);
        final File mergedBase;

        if (toMerge.size() == 0) {
          if (index.isEmpty()) {
            throw new IAE("If you try to persist empty indexes you are going to have a bad time");
          }

          mergedBase = new File(baseFlushFile, "merged");
          persist(index, interval, mergedBase, progressIndicator);
        } else {
          if (!index.isEmpty()) {
            final File finalFile = new File(baseFlushFile, "final");
            persist(index, interval, finalFile, progressIndicator);
            toMerge.add(finalFile);
          }

          for (File file : toMerge) {
            indexes.add(IndexIO.loadIndex(file));
          }
          mergedBase = mergeQueryableIndex(
              indexes, aggregators, new File(baseFlushFile, "merged"), progressIndicator
          );
        }

        final FileSystem outputFS = new Path(config.getSchema().getIOConfig().getSegmentOutputPath())
            .getFileSystem(context.getConfiguration());
        final DataSegment segment = JobHelper.serializeOutIndex(
            new DataSegment(
                config.getDataSource(),
                interval,
                config.getSchema().getTuningConfig().getVersion(),
                null,
                ImmutableList.copyOf(allDimensionNames),
                metricNames,
                config.getShardSpec(bucket).getActualSpec(),
                -1,
                -1
            ),
            context.getConfiguration(),
            context,
            context.getTaskAttemptID(),
            mergedBase,
            JobHelper.makeSegmentOutputPath(
                new Path(config.getSchema().getIOConfig().getSegmentOutputPath()),
                outputFS,
                config.getSchema().getDataSchema().getDataSource(),
                config.getSchema().getTuningConfig().getVersion(),
                config.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
                bucket.partitionNum
            )
        );

        Path descriptorPath = config.makeDescriptorInfoPath(segment);
        descriptorPath = JobHelper.prependFSIfNullScheme(
            FileSystem.get(descriptorPath.toUri(), context.getConfiguration()),
            descriptorPath
        );

        log.info("Writing descriptor to path[%s]", descriptorPath);
        JobHelper.writeSegmentDescriptor(
            config.makeDescriptorInfoDir().getFileSystem(context.getConfiguration()),
            segment,
            descriptorPath,
            context
        );
        for (File file : toMerge) {
          FileUtils.deleteDirectory(file);
        }
      }
      finally {
        index.close();
      }
    }
  }

  public static class IndexGeneratorOutputFormat extends TextOutputFormat
  {
    @Override
    public void checkOutputSpecs(JobContext job) throws IOException
    {
      Path outDir = getOutputPath(job);
      if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set.");
      }
    }
  }

  public static class IndexGeneratorStats
  {
    private long invalidRowCount = 0;

    public long getInvalidRowCount()
    {
      return invalidRowCount;
    }

    public void setInvalidRowCount(long invalidRowCount)
    {
      this.invalidRowCount = invalidRowCount;
    }
  }
}
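/*
 * A minimal sketch of how this job can be driven, using only the pieces defined
 * above (IndexGeneratorJob, IndexGeneratorStats, getPublishedSegments). The
 * loadConfig() helper is hypothetical; in practice a HadoopDruidIndexerConfig is
 * built from an ingestion spec elsewhere in the indexer.
 *
 *   HadoopDruidIndexerConfig config = loadConfig(specFile); // hypothetical helper
 *   IndexGeneratorJob job = new IndexGeneratorJob(config);
 *   if (job.run()) {
 *     // Each reducer wrote one descriptor file per segment; collect them all.
 *     for (DataSegment segment : IndexGeneratorJob.getPublishedSegments(config)) {
 *       System.out.println("Published segment: " + segment.getIdentifier());
 *     }
 *   }
 *   System.out.println("Invalid rows skipped: " + job.getJobStats().getInvalidRowCount());
 */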