Java tutorial: DruidStorageHandlerUtils, the utility class behind the Apache Hive Druid storage handler
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.druid;

import io.druid.common.utils.JodaUtils;
import io.druid.jackson.DefaultObjectMapper;
import io.druid.metadata.MetadataStorageTablesConfig;
import io.druid.metadata.SQLMetadataConnector;
import io.druid.metadata.storage.mysql.MySQLConnector;
import io.druid.query.BaseQuery;
import io.druid.query.select.SelectQueryConfig;
import io.druid.segment.IndexIO;
import io.druid.segment.IndexMergerV9;
import io.druid.segment.column.ColumnConfig;
import io.druid.segment.loading.DataSegmentPusher;
import io.druid.segment.loading.DataSegmentPusherUtil;
import io.druid.segment.realtime.appenderator.SegmentIdentifier;
import io.druid.storage.hdfs.HdfsDataSegmentPusher;
import io.druid.storage.hdfs.HdfsDataSegmentPusherConfig;
import io.druid.timeline.DataSegment;
import io.druid.timeline.TimelineObjectHolder;
import io.druid.timeline.VersionedIntervalTimeline;
import io.druid.timeline.partition.LinearShardSpec;
import io.druid.timeline.partition.NoneShardSpec;
import io.druid.timeline.partition.NumberedShardSpec;
import io.druid.timeline.partition.PartitionChunk;
import io.druid.timeline.partition.ShardSpec;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.druid.serde.HiveDruidSerializationModule;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryProxy;
import org.apache.hadoop.util.StringUtils;

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;
import com.fasterxml.jackson.dataformat.smile.SmileFactory;
import com.google.common.base.Function;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Interner;
import com.google.common.collect.Interners;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Ordering;
import com.google.common.io.CharStreams;
import com.metamx.common.MapUtils;
import com.metamx.emitter.EmittingLogger;
import com.metamx.emitter.core.NoopEmitter;
import com.metamx.emitter.service.ServiceEmitter;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.Request;
import com.metamx.http.client.response.InputStreamResponseHandler;

import org.jboss.netty.handler.codec.http.HttpHeaders;
import org.jboss.netty.handler.codec.http.HttpMethod;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.skife.jdbi.v2.FoldController;
import org.skife.jdbi.v2.Folder3;
import org.skife.jdbi.v2.Handle;
import org.skife.jdbi.v2.PreparedBatch;
import org.skife.jdbi.v2.Query;
import org.skife.jdbi.v2.ResultIterator;
import org.skife.jdbi.v2.StatementContext;
import org.skife.jdbi.v2.TransactionCallback;
import org.skife.jdbi.v2.TransactionStatus;
import org.skife.jdbi.v2.exceptions.CallbackFailedException;
import org.skife.jdbi.v2.tweak.HandleCallback;
import org.skife.jdbi.v2.util.ByteArrayMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;

import javax.annotation.Nullable;

/**
 * Utility class for the Druid storage handler.
 */
public final class DruidStorageHandlerUtils {

  private static final Logger LOG = LoggerFactory.getLogger(DruidStorageHandlerUtils.class);

  private static final String SMILE_CONTENT_TYPE = "application/x-jackson-smile";

  /**
   * Mapper used to serialize/deserialize Druid objects (JSON).
   */
  public static final ObjectMapper JSON_MAPPER = new DefaultObjectMapper();

  /**
   * Mapper used to serialize/deserialize Druid objects (SMILE).
   */
  public static final ObjectMapper SMILE_MAPPER = new DefaultObjectMapper(new SmileFactory());

  static {
    // Needed for serde of PagingSpec, which uses JacksonInject to inject a SelectQueryConfig
    InjectableValues.Std injectableValues = new InjectableValues.Std()
            .addValue(SelectQueryConfig.class, new SelectQueryConfig(false));
    JSON_MAPPER.setInjectableValues(injectableValues);
    SMILE_MAPPER.setInjectableValues(injectableValues);
    HiveDruidSerializationModule hiveDruidSerializationModule = new HiveDruidSerializationModule();
    JSON_MAPPER.registerModule(hiveDruidSerializationModule);
    SMILE_MAPPER.registerModule(hiveDruidSerializationModule);
  }

  private static final int NUM_RETRIES = 8;

  private static final int SECONDS_BETWEEN_RETRIES = 2;

  private static final int DEFAULT_FS_BUFFER_SIZE = 1 << 18; // 256 KB

  private static final int DEFAULT_STREAMING_RESULT_SIZE = 100;

  /**
   * Used by Druid to perform IO on indexes.
   */
  public static final IndexIO INDEX_IO = new IndexIO(JSON_MAPPER, new ColumnConfig() {
    @Override
    public int columnCacheSizeBytes() {
      return 0;
    }
  });

  /**
   * Used by Druid to merge indexes.
   */
  public static final IndexMergerV9 INDEX_MERGER_V9 = new IndexMergerV9(JSON_MAPPER,
          DruidStorageHandlerUtils.INDEX_IO);

  /**
   * Interner used to deduplicate DataSegment objects read from the metadata storage.
   */
  public static final Interner<DataSegment> DATA_SEGMENT_INTERNER = Interners.newWeakInterner();

  static {
    // Register the shard sub type to be used by the mapper
    JSON_MAPPER.registerSubtypes(new NamedType(LinearShardSpec.class, "linear"));
    // Set the timezone of the object mapper.
    // NOTE: this does not take effect; the workaround is to set it via JVM opts: -Duser.timezone="UTC"
    JSON_MAPPER.setTimeZone(TimeZone.getTimeZone("UTC"));
    try {
      // A no-op emitter is registered since some internal Druid classes expect one.
      EmittingLogger.registerEmitter(
              new ServiceEmitter("druid-hive-indexer", InetAddress.getLocalHost().getHostName(),
                      new NoopEmitter()));
    } catch (UnknownHostException e) {
      throw Throwables.propagate(e);
    }
  }

  /**
   * Creates a request for a Druid query (serialized using SMILE).
   *
   * @param address address (host:port) of the Druid node to query
   * @param query Druid query object
   *
   * @return HTTP POST request ready to be submitted
   *
   * @throws IOException if the query cannot be serialized
   */
  public static Request createRequest(String address, BaseQuery<?> query) throws IOException {
    return new Request(HttpMethod.POST, new URL(String.format("%s/druid/v2/", "http://" + address)))
            .setContent(SMILE_MAPPER.writeValueAsBytes(query))
            .setHeader(HttpHeaders.Names.CONTENT_TYPE, SMILE_CONTENT_TYPE);
  }

  /**
   * Submits a request to an HTTP address and retrieves the result.
   * The caller is responsible for closing the stream once it finishes consuming it.
   *
   * @param client HTTP client to use
   * @param request request to submit
   *
   * @return the response stream
   *
   * @throws IOException if the request fails or is interrupted
   */
  public static InputStream submitRequest(HttpClient client, Request request) throws IOException {
    InputStream response;
    try {
      response = client.go(request, new InputStreamResponseHandler()).get();
    } catch (ExecutionException e) {
      throw new IOException(e.getCause());
    } catch (InterruptedException e) {
      throw new IOException(e.getCause());
    }
    return response;
  }

  /**
   * Performs an HTTP GET against the given URL and returns the response body as a string.
   */
  public static String getURL(HttpClient client, URL url) throws IOException {
    try (Reader reader = new InputStreamReader(
            DruidStorageHandlerUtils.submitRequest(client, new Request(HttpMethod.GET, url)))) {
      return CharStreams.toString(reader);
    }
  }

  /**
   * @param taskDir path to the directory containing the segments descriptor info;
   *                the descriptor path will be
   *                .../workingPath/task_id/{@link DruidStorageHandler#SEGMENTS_DESCRIPTOR_DIR_NAME}/*.json
   * @param conf hadoop conf to get the file system
   *
   * @return list of DataSegments
   *
   * @throws IOException may be thrown e.g. when no data was produced
   */
  public static List<DataSegment> getPublishedSegments(Path taskDir, Configuration conf)
          throws IOException {
    ImmutableList.Builder<DataSegment> publishedSegmentsBuilder = ImmutableList.builder();
    FileSystem fs = taskDir.getFileSystem(conf);
    for (FileStatus fileStatus : fs.listStatus(taskDir)) {
      final DataSegment segment = JSON_MAPPER
              .readValue(fs.open(fileStatus.getPath()), DataSegment.class);
      publishedSegmentsBuilder.add(segment);
    }
    List<DataSegment> publishedSegments = publishedSegmentsBuilder.build();
    return publishedSegments;
  }

  /**
   * Writes the serialized form of a segment descriptor to the file system;
   * if the descriptor file already exists it will be replaced.
   *
   * @param outputFS filesystem
   * @param segment DataSegment object
   * @param descriptorPath path of the descriptor file to write
   *
   * @throws IOException if the descriptor cannot be written
   */
  public static void writeSegmentDescriptor(final FileSystem outputFS, final DataSegment segment,
          final Path descriptorPath) throws IOException {
    final DataPusher descriptorPusher = (DataPusher) RetryProxy.create(DataPusher.class,
            new DataPusher() {
              @Override
              public long push() throws IOException {
                try {
                  if (outputFS.exists(descriptorPath)) {
                    if (!outputFS.delete(descriptorPath, false)) {
                      throw new IOException(
                              String.format("Failed to delete descriptor at [%s]", descriptorPath));
                    }
                  }
                  try (final OutputStream descriptorOut = outputFS
                          .create(descriptorPath, true, DEFAULT_FS_BUFFER_SIZE)) {
                    JSON_MAPPER.writeValue(descriptorOut, segment);
                    descriptorOut.flush();
                  }
                } catch (RuntimeException | IOException ex) {
                  throw ex;
                }
                return -1;
              }
            },
            RetryPolicies.exponentialBackoffRetry(NUM_RETRIES, SECONDS_BETWEEN_RETRIES,
                    TimeUnit.SECONDS));
    descriptorPusher.push();
  }

  /**
   * @param connector SQL metadata connector to the metadata storage
   * @param metadataStorageTablesConfig table config
   *
   * @return all the active data sources in the metadata storage
   */
  public static Collection<String> getAllDataSourceNames(SQLMetadataConnector connector,
          final MetadataStorageTablesConfig metadataStorageTablesConfig) {
    return connector.getDBI().withHandle(new HandleCallback<List<String>>() {
      @Override
      public List<String> withHandle(Handle handle) throws Exception {
        return handle
                .createQuery(String.format("SELECT DISTINCT(datasource) FROM %s WHERE used = true",
                        metadataStorageTablesConfig.getSegmentsTable()))
                .fold(Lists.<String>newArrayList(),
                        new Folder3<ArrayList<String>, Map<String, Object>>() {
                          @Override
                          public ArrayList<String> fold(ArrayList<String> druidDataSources,
                                  Map<String, Object> stringObjectMap, FoldController foldController,
                                  StatementContext statementContext) throws SQLException {
                            druidDataSources.add(MapUtils.getString(stringObjectMap, "datasource"));
                            return druidDataSources;
                          }
                        });
      }
    });
  }

  /**
   * @param connector SQL connector to metadata
   * @param metadataStorageTablesConfig tables configuration
   * @param dataSource name of the data source
   *
   * @return true if the data source was successfully disabled, false otherwise
   */
  public static boolean disableDataSource(SQLMetadataConnector connector,
          final MetadataStorageTablesConfig metadataStorageTablesConfig, final String dataSource) {
    try {
      if (!getAllDataSourceNames(connector, metadataStorageTablesConfig).contains(dataSource)) {
        LOG.warn("Cannot delete data source {}, does not exist", dataSource);
        return false;
      }
      connector.getDBI().withHandle(new HandleCallback<Void>() {
        @Override
        public Void withHandle(Handle handle) throws Exception {
          disableDataSourceWithHandle(handle, metadataStorageTablesConfig, dataSource);
          return null;
        }
      });
    } catch (Exception e) {
      LOG.error(String.format("Error removing dataSource %s", dataSource), e);
      return false;
    }
    return true;
  }

  public static void publishSegments(final SQLMetadataConnector connector,
          final MetadataStorageTablesConfig metadataStorageTablesConfig, final String dataSource,
          final List<DataSegment> segments, boolean overwrite, String segmentDirectory,
          Configuration conf) {
    try {
      connector.getDBI().inTransaction(new TransactionCallback<Void>() {
        @Override
        public Void inTransaction(Handle handle, TransactionStatus transactionStatus)
                throws Exception {
          final List<DataSegment> finalSegmentsToPublish = Lists.newArrayList();
          VersionedIntervalTimeline<String, DataSegment> timeline;
          if (overwrite) {
            disableDataSourceWithHandle(handle, metadataStorageTablesConfig, dataSource);
            // When overwriting, start with an empty timeline, as we are overwriting segments with new versions
            timeline = new VersionedIntervalTimeline<>(Ordering.natural());
          } else {
            // Append mode - build a timeline of existing segments in metadata storage.
            Interval indexedInterval = JodaUtils.umbrellaInterval(
                    Iterables.transform(segments, new Function<DataSegment, Interval>() {
                      @Override
                      public Interval apply(@Nullable DataSegment input) {
                        return input.getInterval();
                      }
                    }));
            timeline = getTimelineForIntervalWithHandle(handle, dataSource, indexedInterval,
                    metadataStorageTablesConfig);
          }
          for (DataSegment segment : segments) {
            List<TimelineObjectHolder<String, DataSegment>> existingChunks = timeline
                    .lookup(segment.getInterval());
            if (existingChunks.size() > 1) {
              // Not possible to expand since we have more than one chunk with a single segment.
              // This is the case when the user wants to append a segment with coarser granularity,
              // e.g. if the metadata storage already has segments with granularity HOUR and the
              // segments to append have granularity DAY. Druid shard specs do not support multiple
              // partitions for the same interval with different granularity.
              throw new IllegalStateException(String.format(
                      "Cannot allocate new segment for dataSource[%s], interval[%s], already have [%,d] chunks. Not possible to append new segment.",
                      dataSource, segment.getInterval(), existingChunks.size()));
            }
            // Find the segment with the latest version and maximum partition number
            SegmentIdentifier max = null;
            final ShardSpec newShardSpec;
            final String newVersion;
            if (!existingChunks.isEmpty()) {
              // Some chunks exist; find the one with the max partition number
              TimelineObjectHolder<String, DataSegment> existingHolder = Iterables
                      .getOnlyElement(existingChunks);
              for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
                if (max == null || max.getShardSpec().getPartitionNum() < existing.getObject()
                        .getShardSpec().getPartitionNum()) {
                  max = SegmentIdentifier.fromDataSegment(existing.getObject());
                }
              }
            }
            if (max == null) {
              // No existing shard present in the database, use the current version.
              newShardSpec = segment.getShardSpec();
              newVersion = segment.getVersion();
            } else {
              // Use the version of the existing max segment to generate the new shard spec
              newShardSpec = getNextPartitionShardSpec(max.getShardSpec());
              newVersion = max.getVersion();
            }
            DataSegment publishedSegment = publishSegmentWithShardSpec(segment, newShardSpec,
                    newVersion, segmentDirectory, getPath(segment).getFileSystem(conf));
            finalSegmentsToPublish.add(publishedSegment);
            timeline.add(publishedSegment.getInterval(), publishedSegment.getVersion(),
                    publishedSegment.getShardSpec().createChunk(publishedSegment));
          }
          // Publish new segments to metadata storage
          final PreparedBatch batch = handle.prepareBatch(String.format(
                  "INSERT INTO %1$s (id, dataSource, created_date, start, \"end\", partitioned, version, used, payload) "
                          + "VALUES (:id, :dataSource, :created_date, :start, :end, :partitioned, :version, :used, :payload)",
                  metadataStorageTablesConfig.getSegmentsTable()));
          for (final DataSegment segment : finalSegmentsToPublish) {
            batch.add(new ImmutableMap.Builder<String, Object>()
                    .put("id", segment.getIdentifier())
                    .put("dataSource", segment.getDataSource())
                    .put("created_date", new DateTime().toString())
                    .put("start", segment.getInterval().getStart().toString())
                    .put("end", segment.getInterval().getEnd().toString())
                    .put("partitioned", !(segment.getShardSpec() instanceof NoneShardSpec))
                    .put("version", segment.getVersion())
                    .put("used", true)
                    .put("payload", JSON_MAPPER.writeValueAsBytes(segment))
                    .build());
            LOG.info("Published {}", segment.getIdentifier());
          }
          batch.execute();
          return null;
        }
      });
    } catch (CallbackFailedException e) {
      LOG.error("Exception while publishing segments", e.getCause());
      throw Throwables.propagate(e.getCause());
    }
  }

  public static void disableDataSourceWithHandle(Handle handle,
          MetadataStorageTablesConfig metadataStorageTablesConfig, String dataSource) {
    handle.createStatement(
            String.format("UPDATE %s SET used=false WHERE dataSource = :dataSource",
                    metadataStorageTablesConfig.getSegmentsTable()))
            .bind("dataSource", dataSource).execute();
  }

  /**
   * @param connector SQL connector to metadata
   * @param metadataStorageTablesConfig tables configuration
   * @param dataSource name of the data source
   *
   * @return list of all data segments that are part of the given data source
   */
  public static List<DataSegment> getDataSegmentList(final SQLMetadataConnector connector,
          final MetadataStorageTablesConfig metadataStorageTablesConfig, final String dataSource) {
    List<DataSegment> segmentList = connector
            .retryTransaction(new TransactionCallback<List<DataSegment>>() {
              @Override
              public List<DataSegment> inTransaction(Handle handle, TransactionStatus status)
                      throws Exception {
                return handle
                        .createQuery(String.format(
                                "SELECT payload FROM %s WHERE dataSource = :dataSource",
                                metadataStorageTablesConfig.getSegmentsTable()))
                        .setFetchSize(getStreamingFetchSize(connector))
                        .bind("dataSource", dataSource)
                        .map(ByteArrayMapper.FIRST)
                        .fold(new ArrayList<DataSegment>(),
                                new Folder3<List<DataSegment>, byte[]>() {
                                  @Override
                                  public List<DataSegment> fold(List<DataSegment> accumulator,
                                          byte[] payload, FoldController control,
                                          StatementContext ctx) throws SQLException {
                                    try {
                                      final DataSegment segment = DATA_SEGMENT_INTERNER
                                              .intern(JSON_MAPPER.readValue(payload, DataSegment.class));
                                      accumulator.add(segment);
                                      return accumulator;
                                    } catch (Exception e) {
                                      throw new SQLException(e.toString());
                                    }
                                  }
                                });
              }
            }, 3, SQLMetadataConnector.DEFAULT_MAX_TRIES);
    return segmentList;
  }

  /**
   * @param connector SQL metadata connector
   *
   * @return streaming fetch size.
   */
  private static int getStreamingFetchSize(SQLMetadataConnector connector) {
    if (connector instanceof MySQLConnector) {
      return Integer.MIN_VALUE;
    }
    return DEFAULT_STREAMING_RESULT_SIZE;
  }

  /**
   * @param pushedSegment the pushed data segment
   * @param segmentsDescriptorDir directory that holds the segment descriptors
   *
   * @return the descriptor path with a sanitized file name
   */
  public static Path makeSegmentDescriptorOutputPath(DataSegment pushedSegment,
          Path segmentsDescriptorDir) {
    return new Path(segmentsDescriptorDir,
            String.format("%s.json", pushedSegment.getIdentifier().replace(":", "")));
  }

  /**
   * Simple interface for retry operations.
   */
  public interface DataPusher {
    long push() throws IOException;
  }

  // Thanks, HBase Storage handler
  public static void addDependencyJars(Configuration conf, Class<?>... classes)
          throws IOException {
    FileSystem localFs = FileSystem.getLocal(conf);
    Set<String> jars = new HashSet<String>();
    jars.addAll(conf.getStringCollection("tmpjars"));
    for (Class<?> clazz : classes) {
      if (clazz == null) {
        continue;
      }
      String path = Utilities.jarFinderGetJar(clazz);
      if (path == null) {
        throw new RuntimeException(
                "Could not find jar for class " + clazz + " in order to ship it to the cluster.");
      }
      if (!localFs.exists(new Path(path))) {
        throw new RuntimeException("Could not validate jar file " + path + " for class " + clazz);
      }
      jars.add(path.toString());
    }
    if (jars.isEmpty()) {
      return;
    }
    conf.set("tmpjars", StringUtils.arrayToString(jars.toArray(new String[jars.size()])));
  }

  private static VersionedIntervalTimeline<String, DataSegment> getTimelineForIntervalWithHandle(
          final Handle handle, final String dataSource, final Interval interval,
          final MetadataStorageTablesConfig dbTables) throws IOException {
    Query<Map<String, Object>> sql = handle.createQuery(String.format(
            "SELECT payload FROM %s WHERE used = true AND dataSource = ? AND start <= ? AND \"end\" >= ?",
            dbTables.getSegmentsTable()))
            .bind(0, dataSource)
            .bind(1, interval.getEnd().toString())
            .bind(2, interval.getStart().toString());
    final VersionedIntervalTimeline<String, DataSegment> timeline = new VersionedIntervalTimeline<>(
            Ordering.natural());
    final ResultIterator<byte[]> dbSegments = sql.map(ByteArrayMapper.FIRST).iterator();
    try {
      while (dbSegments.hasNext()) {
        final byte[] payload = dbSegments.next();
        DataSegment segment = JSON_MAPPER.readValue(payload, DataSegment.class);
        timeline.add(segment.getInterval(), segment.getVersion(),
                segment.getShardSpec().createChunk(segment));
      }
    } finally {
      dbSegments.close();
    }
    return timeline;
  }

  public static DataSegmentPusher createSegmentPusherForDirectory(String segmentDirectory,
          Configuration configuration) throws IOException {
    final HdfsDataSegmentPusherConfig hdfsDataSegmentPusherConfig = new HdfsDataSegmentPusherConfig();
    hdfsDataSegmentPusherConfig.setStorageDirectory(segmentDirectory);
    return new HdfsDataSegmentPusher(hdfsDataSegmentPusherConfig, configuration, JSON_MAPPER);
  }

  public static DataSegment publishSegmentWithShardSpec(DataSegment segment, ShardSpec shardSpec,
          String version, String segmentDirectory, FileSystem fs) throws IOException {
    boolean retry = true;
    DataSegment.Builder dataSegmentBuilder = new DataSegment.Builder(segment).version(version);
    Path finalPath = null;
    while (retry) {
      retry = false;
      dataSegmentBuilder.shardSpec(shardSpec);
      final Path intermediatePath = getPath(segment);
      finalPath = finalPathForSegment(segmentDirectory, dataSegmentBuilder.build());
      // Create the parent if it does not exist; recreation is not an error
      fs.mkdirs(finalPath.getParent());
      if (!fs.rename(intermediatePath, finalPath)) {
        if (fs.exists(finalPath)) {
          // Someone else is also trying to append
          shardSpec = getNextPartitionShardSpec(shardSpec);
          retry = true;
        } else {
          throw new IOException(String.format(
                  "Failed to rename intermediate segment [%s] to final segment [%s], and the final segment is not present.",
                  intermediatePath, finalPath));
        }
      }
    }
    DataSegment dataSegment = dataSegmentBuilder
            .loadSpec(ImmutableMap.<String, Object>of("type", "hdfs", "path", finalPath.toString()))
            .build();
    writeSegmentDescriptor(fs, dataSegment, new Path(finalPath.getParent(), "descriptor.json"));
    return dataSegment;
  }

  public static Path finalPathForSegment(String segmentDirectory, DataSegment segment) {
    return new Path(String.format("%s/%s/index.zip", segmentDirectory,
            DataSegmentPusherUtil.getHdfsStorageDir(segment)));
  }
  private static ShardSpec getNextPartitionShardSpec(ShardSpec shardSpec) {
    if (shardSpec instanceof LinearShardSpec) {
      return new LinearShardSpec(shardSpec.getPartitionNum() + 1);
    } else if (shardSpec instanceof NumberedShardSpec) {
      // Bump the partition number so the appended segment does not collide with the existing max
      return new NumberedShardSpec(shardSpec.getPartitionNum() + 1,
              ((NumberedShardSpec) shardSpec).getPartitions());
    } else {
      // Druid only supports appending more partitions to Linear and Numbered ShardSpecs.
      throw new IllegalStateException(String.format("Cannot expand shard spec [%s]", shardSpec));
    }
  }

  public static Path getPath(DataSegment dataSegment) {
    return new Path(String.valueOf(dataSegment.getLoadSpec().get("path")));
  }
}
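
To see how a few of these utilities fit together, here is a minimal usage sketch. It is not part of the Hive code base: the class name SegmentDescriptorExample and the two directory paths are illustrative assumptions, and only methods defined in the class above are called.

// Hypothetical example (not part of the original file): read the segment
// descriptors an indexing task published under its working directory and
// re-write each descriptor under a separate descriptor directory, reusing
// the retrying writeSegmentDescriptor helper shown above.
package org.apache.hadoop.hive.druid.examples;

import io.druid.timeline.DataSegment;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.druid.DruidStorageHandlerUtils;

import java.io.IOException;
import java.util.List;

public final class SegmentDescriptorExample {

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Both paths are assumptions for the sake of the example, not values
    // mandated by the storage handler.
    Path taskDir = new Path("/tmp/druid-task/segmentsDescriptorDir");
    Path descriptorDir = new Path("/tmp/druid-segments/descriptors");
    FileSystem fs = descriptorDir.getFileSystem(conf);

    // Deserialize every descriptor JSON file the task wrote under taskDir.
    List<DataSegment> segments = DruidStorageHandlerUtils.getPublishedSegments(taskDir, conf);

    for (DataSegment segment : segments) {
      // Derive a collision-free file name from the segment identifier and
      // write the descriptor with exponential-backoff retries.
      Path out = DruidStorageHandlerUtils.makeSegmentDescriptorOutputPath(segment, descriptorDir);
      DruidStorageHandlerUtils.writeSegmentDescriptor(fs, segment, out);
    }
  }
}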