Java tutorial: Hive's DruidStorageHandler

The class below, org.apache.hadoop.hive.druid.DruidStorageHandler, is the HiveStorageHandler implementation that lets Hive create, load, and drop Druid-backed tables. The full source is reproduced first; a few illustrative usage sketches follow it.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.druid;

import io.druid.metadata.MetadataStorageConnectorConfig;
import io.druid.metadata.MetadataStorageTablesConfig;
import io.druid.metadata.SQLMetadataConnector;
import io.druid.metadata.storage.mysql.MySQLConnector;
import io.druid.metadata.storage.postgresql.PostgreSQLConnector;
import io.druid.segment.loading.SegmentLoadingException;
import io.druid.timeline.DataSegment;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.druid.io.DruidOutputFormat;
import org.apache.hadoop.hive.druid.io.DruidQueryBasedInputFormat;
import org.apache.hadoop.hive.druid.io.DruidRecordWriter;
import org.apache.hadoop.hive.druid.serde.DruidSerDe;
import org.apache.hadoop.hive.metastore.DefaultHiveMetaHook;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider;
import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hive.common.util.ShutdownHookManager;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.base.Throwables;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.metamx.common.RetryUtils;
import com.metamx.common.lifecycle.Lifecycle;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.HttpClientConfig;
import com.metamx.http.client.HttpClientInit;

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.Period;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;

import javax.annotation.Nullable;

/**
 * DruidStorageHandler provides a HiveStorageHandler implementation for Druid.
 */
@SuppressWarnings({ "deprecation", "rawtypes" })
public class DruidStorageHandler extends DefaultHiveMetaHook implements HiveStorageHandler {

  protected static final Logger LOG = LoggerFactory.getLogger(DruidStorageHandler.class);

  protected static final SessionState.LogHelper console = new SessionState.LogHelper(LOG);

  public static final String SEGMENTS_DESCRIPTOR_DIR_NAME = "segmentsDescriptorDir";

  public static final String INTERMEDIATE_SEGMENT_DIR_NAME = "intermediateSegmentDir";

  private static final HttpClient HTTP_CLIENT;

  static {
    final Lifecycle lifecycle = new Lifecycle();
    try {
      lifecycle.start();
    } catch (Exception e) {
      LOG.error("Issues with lifecycle start", e);
    }
    HTTP_CLIENT = makeHttpClient(lifecycle);
    ShutdownHookManager.addShutdownHook(() -> lifecycle.stop());
  }

  private final SQLMetadataConnector connector;

  private final MetadataStorageTablesConfig druidMetadataStorageTablesConfig;

  private String uniqueId = null;

  private String rootWorkingDir = null;

  private Configuration conf;

  public DruidStorageHandler() {
    //this is the default value in druid
    final String base = HiveConf.getVar(SessionState.getSessionConf(),
        HiveConf.ConfVars.DRUID_METADATA_BASE);
    final String dbType = HiveConf.getVar(SessionState.getSessionConf(),
        HiveConf.ConfVars.DRUID_METADATA_DB_TYPE);
    final String username = HiveConf.getVar(SessionState.getSessionConf(),
        HiveConf.ConfVars.DRUID_METADATA_DB_USERNAME);
    final String password = HiveConf.getVar(SessionState.getSessionConf(),
        HiveConf.ConfVars.DRUID_METADATA_DB_PASSWORD);
    final String uri = HiveConf.getVar(SessionState.getSessionConf(),
        HiveConf.ConfVars.DRUID_METADATA_DB_URI);
    druidMetadataStorageTablesConfig = MetadataStorageTablesConfig.fromBase(base);

    final Supplier<MetadataStorageConnectorConfig> storageConnectorConfigSupplier = Suppliers
        .<MetadataStorageConnectorConfig>ofInstance(new MetadataStorageConnectorConfig() {
          @Override
          public String getConnectURI() {
            return uri;
          }

          @Override
          public String getUser() {
            return username;
          }

          @Override
          public String getPassword() {
            return password;
          }
        });

    if (dbType.equals("mysql")) {
      connector = new MySQLConnector(storageConnectorConfigSupplier,
          Suppliers.ofInstance(druidMetadataStorageTablesConfig));
    } else if (dbType.equals("postgresql")) {
      connector = new PostgreSQLConnector(storageConnectorConfigSupplier,
          Suppliers.ofInstance(druidMetadataStorageTablesConfig));
    } else {
      throw new IllegalStateException(String.format("Unknown metadata storage type [%s]", dbType));
    }
  }

  @VisibleForTesting
  public DruidStorageHandler(SQLMetadataConnector connector,
      MetadataStorageTablesConfig druidMetadataStorageTablesConfig) {
    this.connector = connector;
    this.druidMetadataStorageTablesConfig = druidMetadataStorageTablesConfig;
  }

  @Override
  public Class<? extends InputFormat> getInputFormatClass() {
    return DruidQueryBasedInputFormat.class;
  }

  @Override
  public Class<? extends OutputFormat> getOutputFormatClass() {
    return DruidOutputFormat.class;
  }
  @Override
  public Class<? extends AbstractSerDe> getSerDeClass() {
    return DruidSerDe.class;
  }

  @Override
  public HiveMetaHook getMetaHook() {
    return this;
  }

  @Override
  public HiveAuthorizationProvider getAuthorizationProvider() throws HiveException {
    return new DefaultHiveAuthorizationProvider();
  }

  @Override
  public void configureInputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
  }

  @Override
  public void configureInputJobCredentials(TableDesc tableDesc, Map<String, String> jobSecrets) {
  }

  @Override
  public void preCreateTable(Table table) throws MetaException {
    // Do safety checks
    if (MetaStoreUtils.isExternalTable(table) && !StringUtils.isEmpty(table.getSd().getLocation())) {
      throw new MetaException("LOCATION may not be specified for Druid");
    }
    if (table.getPartitionKeysSize() != 0) {
      throw new MetaException("PARTITIONED BY may not be specified for Druid");
    }
    if (table.getSd().getBucketColsSize() != 0) {
      throw new MetaException("CLUSTERED BY may not be specified for Druid");
    }
    String dataSourceName = table.getParameters().get(Constants.DRUID_DATA_SOURCE);
    if (MetaStoreUtils.isExternalTable(table)) {
      return;
    }
    // If it is not an external table we need to check the metadata
    try {
      connector.createSegmentTable();
    } catch (Exception e) {
      LOG.error("Exception while trying to create druid segments table", e);
      throw new MetaException(e.getMessage());
    }
    Collection<String> existingDataSources = DruidStorageHandlerUtils
        .getAllDataSourceNames(connector, druidMetadataStorageTablesConfig);
    LOG.debug("pre-create data source with name {}", dataSourceName);
    if (existingDataSources.contains(dataSourceName)) {
      throw new MetaException(String.format("Data source [%s] already existing", dataSourceName));
    }
  }

  @Override
  public void rollbackCreateTable(Table table) throws MetaException {
    if (MetaStoreUtils.isExternalTable(table)) {
      return;
    }
    final Path segmentDescriptorDir = getSegmentDescriptorDir();
    try {
      List<DataSegment> dataSegmentList = DruidStorageHandlerUtils
          .getPublishedSegments(segmentDescriptorDir, getConf());
      for (DataSegment dataSegment : dataSegmentList) {
        try {
          deleteSegment(dataSegment);
        } catch (SegmentLoadingException e) {
          LOG.error(String.format("Error while trying to clean the segment [%s]", dataSegment), e);
        }
      }
    } catch (IOException e) {
      LOG.error("Exception while rollback", e);
      throw Throwables.propagate(e);
    } finally {
      cleanWorkingDir();
    }
  }

  @Override
  public void commitCreateTable(Table table) throws MetaException {
    LOG.debug("commit create table {}", table.getTableName());
    publishSegments(table, true);
  }
  public void publishSegments(Table table, boolean overwrite) throws MetaException {
    if (MetaStoreUtils.isExternalTable(table)) {
      return;
    }
    Lifecycle lifecycle = new Lifecycle();
    LOG.info("Committing table {} to the druid metastore", table.getDbName());
    final Path tableDir = getSegmentDescriptorDir();
    try {
      List<DataSegment> segmentList = DruidStorageHandlerUtils.getPublishedSegments(tableDir, getConf());
      LOG.info("Found {} segments under path {}", segmentList.size(), tableDir);
      final String dataSourceName = table.getParameters().get(Constants.DRUID_DATA_SOURCE);
      final String segmentDirectory =
          table.getParameters().get(Constants.DRUID_SEGMENT_DIRECTORY) != null
              ? table.getParameters().get(Constants.DRUID_SEGMENT_DIRECTORY)
              : HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_SEGMENT_DIRECTORY);
      DruidStorageHandlerUtils.publishSegments(
          connector,
          druidMetadataStorageTablesConfig,
          dataSourceName,
          segmentList,
          overwrite,
          segmentDirectory,
          getConf()
      );
      final String coordinatorAddress = HiveConf.getVar(getConf(),
          HiveConf.ConfVars.HIVE_DRUID_COORDINATOR_DEFAULT_ADDRESS);
      int maxTries = HiveConf.getIntVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_MAX_TRIES);
      LOG.info("checking load status from coordinator {}", coordinatorAddress);

      String coordinatorResponse = null;
      try {
        coordinatorResponse = RetryUtils.retry(new Callable<String>() {
          @Override
          public String call() throws Exception {
            return DruidStorageHandlerUtils.getURL(getHttpClient(),
                new URL(String.format("http://%s/status", coordinatorAddress)));
          }
        }, new Predicate<Throwable>() {
          @Override
          public boolean apply(@Nullable Throwable input) {
            return input instanceof IOException;
          }
        }, maxTries);
      } catch (Exception e) {
        console.printInfo("Will skip waiting for data loading");
        return;
      }
      if (Strings.isNullOrEmpty(coordinatorResponse)) {
        console.printInfo("Will skip waiting for data loading");
        return;
      }
      console.printInfo(String.format("Waiting for the loading of [%s] segments", segmentList.size()));
      long passiveWaitTimeMs = HiveConf.getLongVar(getConf(),
          HiveConf.ConfVars.HIVE_DRUID_PASSIVE_WAIT_TIME);
      ImmutableSet<URL> setOfUrls = FluentIterable.from(segmentList)
          .transform(new Function<DataSegment, URL>() {
            @Override
            public URL apply(DataSegment dataSegment) {
              try {
                //Need to make sure that we are using UTC since most of the druid cluster use UTC by default
                return new URL(String.format("http://%s/druid/coordinator/v1/datasources/%s/segments/%s",
                    coordinatorAddress, dataSourceName,
                    DataSegment.makeDataSegmentIdentifier(dataSegment.getDataSource(),
                        new DateTime(dataSegment.getInterval().getStartMillis(), DateTimeZone.UTC),
                        new DateTime(dataSegment.getInterval().getEndMillis(), DateTimeZone.UTC),
                        dataSegment.getVersion(),
                        dataSegment.getShardSpec())));
              } catch (MalformedURLException e) {
                Throwables.propagate(e);
              }
              return null;
            }
          }).toSet();

      int numRetries = 0;
      while (numRetries++ < maxTries && !setOfUrls.isEmpty()) {
        setOfUrls = ImmutableSet.copyOf(Sets.filter(setOfUrls, new Predicate<URL>() {
          @Override
          public boolean apply(URL input) {
            try {
              String result = DruidStorageHandlerUtils.getURL(getHttpClient(), input);
              LOG.debug("Checking segment {} response is {}", input, result);
              return Strings.isNullOrEmpty(result);
            } catch (IOException e) {
              LOG.error(String.format("Error while checking URL [%s]", input), e);
              return true;
            }
          }
        }));
        try {
          if (!setOfUrls.isEmpty()) {
            Thread.sleep(passiveWaitTimeMs);
          }
        } catch (InterruptedException e) {
          Thread.interrupted();
          Throwables.propagate(e);
        }
      }
      if (!setOfUrls.isEmpty()) {
        // We are not Throwing an exception since it might be a transient issue that is blocking loading
        console.printError(String.format(
            "Wait time exhausted and we have [%s] out of [%s] segments not loaded yet",
            setOfUrls.size(), segmentList.size()));
      }
    } catch (IOException e) {
      LOG.error("Exception while commit", e);
      Throwables.propagate(e);
    } finally {
      cleanWorkingDir();
      lifecycle.stop();
    }
  }
  @VisibleForTesting
  protected void deleteSegment(DataSegment segment) throws SegmentLoadingException {
    final Path path = DruidStorageHandlerUtils.getPath(segment);
    LOG.info("removing segment {}, located at path {}", segment.getIdentifier(), path);
    try {
      if (path.getName().endsWith(".zip")) {
        final FileSystem fs = path.getFileSystem(getConf());
        if (!fs.exists(path)) {
          LOG.warn("Segment Path {} does not exist. It appears to have been deleted already.", path);
          return;
        }
        // path format -- > .../dataSource/interval/version/partitionNum/xxx.zip
        Path partitionNumDir = path.getParent();
        if (!fs.delete(partitionNumDir, true)) {
          throw new SegmentLoadingException("Unable to kill segment, failed to delete dir [%s]",
              partitionNumDir.toString());
        }
        //try to delete other directories if possible
        Path versionDir = partitionNumDir.getParent();
        if (safeNonRecursiveDelete(fs, versionDir)) {
          Path intervalDir = versionDir.getParent();
          if (safeNonRecursiveDelete(fs, intervalDir)) {
            Path dataSourceDir = intervalDir.getParent();
            safeNonRecursiveDelete(fs, dataSourceDir);
          }
        }
      } else {
        throw new SegmentLoadingException("Unknown file type[%s]", path);
      }
    } catch (IOException e) {
      throw new SegmentLoadingException(e, "Unable to kill segment");
    }
  }

  private static boolean safeNonRecursiveDelete(FileSystem fs, Path path) {
    try {
      return fs.delete(path, false);
    } catch (Exception ex) {
      return false;
    }
  }

  @Override
  public void preDropTable(Table table) throws MetaException {
    // Nothing to do
  }

  @Override
  public void rollbackDropTable(Table table) throws MetaException {
    // Nothing to do
  }

  @Override
  public void commitDropTable(Table table, boolean deleteData) throws MetaException {
    if (MetaStoreUtils.isExternalTable(table)) {
      return;
    }
    String dataSourceName = Preconditions.checkNotNull(
        table.getParameters().get(Constants.DRUID_DATA_SOURCE), "DataSource name is null !");

    if (deleteData) {
      LOG.info("Dropping with purge all the data for data source {}", dataSourceName);
      List<DataSegment> dataSegmentList = DruidStorageHandlerUtils.getDataSegmentList(connector,
          druidMetadataStorageTablesConfig, dataSourceName);
      if (dataSegmentList.isEmpty()) {
        LOG.info("Nothing to delete for data source {}", dataSourceName);
        return;
      }
      for (DataSegment dataSegment : dataSegmentList) {
        try {
          deleteSegment(dataSegment);
        } catch (SegmentLoadingException e) {
          LOG.error(String.format("Error while deleting segment [%s]", dataSegment.getIdentifier()), e);
        }
      }
    }
    if (DruidStorageHandlerUtils.disableDataSource(connector, druidMetadataStorageTablesConfig,
        dataSourceName)) {
      LOG.info("Successfully dropped druid data source {}", dataSourceName);
    }
  }

  @Override
  public void commitInsertTable(Table table, boolean overwrite) throws MetaException {
    LOG.debug("commit insert into table {} overwrite {}", table.getTableName(), overwrite);
    this.publishSegments(table, overwrite);
  }

  @Override
  public void preInsertTable(Table table, boolean overwrite) throws MetaException {
  }

  @Override
  public void rollbackInsertTable(Table table, boolean overwrite) throws MetaException {
    // do nothing
  }

  @Override
  public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
    jobProperties.put(Constants.DRUID_SEGMENT_VERSION, new DateTime().toString());
    jobProperties.put(Constants.DRUID_JOB_WORKING_DIRECTORY, getStagingWorkingDir().toString());
    // DruidOutputFormat will write segments in an intermediate directory
    jobProperties.put(Constants.DRUID_SEGMENT_INTERMEDIATE_DIRECTORY,
        getIntermediateSegmentDir().toString());
  }

  @Override
  public void configureTableJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
  }

  @Override
  public void configureJobConf(TableDesc tableDesc, JobConf jobConf) {
    try {
      DruidStorageHandlerUtils.addDependencyJars(jobConf, DruidRecordWriter.class);
    } catch (IOException e) {
      Throwables.propagate(e);
    }
  }
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public String toString() {
    return Constants.DRUID_HIVE_STORAGE_HANDLER_ID;
  }

  public String getUniqueId() {
    if (uniqueId == null) {
      uniqueId = Preconditions.checkNotNull(
          Strings.emptyToNull(HiveConf.getVar(getConf(), HiveConf.ConfVars.HIVEQUERYID)),
          "Hive query id is null");
    }
    return uniqueId;
  }

  private Path getStagingWorkingDir() {
    return new Path(getRootWorkingDir(), makeStagingName());
  }

  @VisibleForTesting
  protected String makeStagingName() {
    return ".staging-".concat(getUniqueId().replace(":", ""));
  }

  private Path getSegmentDescriptorDir() {
    return new Path(getStagingWorkingDir(), SEGMENTS_DESCRIPTOR_DIR_NAME);
  }

  private Path getIntermediateSegmentDir() {
    return new Path(getStagingWorkingDir(), INTERMEDIATE_SEGMENT_DIR_NAME);
  }

  private void cleanWorkingDir() {
    final FileSystem fileSystem;
    try {
      fileSystem = getStagingWorkingDir().getFileSystem(getConf());
      fileSystem.delete(getStagingWorkingDir(), true);
    } catch (IOException e) {
      LOG.error("Got Exception while cleaning working directory", e);
    }
  }

  private String getRootWorkingDir() {
    if (Strings.isNullOrEmpty(rootWorkingDir)) {
      rootWorkingDir = HiveConf.getVar(getConf(), HiveConf.ConfVars.DRUID_WORKING_DIR);
    }
    return rootWorkingDir;
  }

  private static HttpClient makeHttpClient(Lifecycle lifecycle) {
    final int numConnection = HiveConf.getIntVar(SessionState.getSessionConf(),
        HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
    final Period readTimeout = new Period(
        HiveConf.getVar(SessionState.getSessionConf(), HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
    LOG.info("Creating Druid HTTP client with {} max parallel connections and {}ms read timeout",
        numConnection, readTimeout.toStandardDuration().getMillis());
    return HttpClientInit.createClient(
        HttpClientConfig.builder().withNumConnections(numConnection)
            .withReadTimeout(new Period(readTimeout).toStandardDuration()).build(),
        lifecycle);
  }

  public static HttpClient getHttpClient() {
    return HTTP_CLIENT;
  }
}
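
The zero-argument constructor wires the Druid metadata storage connector from the current Hive session configuration: the metadata table base name, the database type, the JDBC URI, and the credentials. As a minimal sketch, and assuming a MySQL-backed metadata store with placeholder connection values, the relevant HiveConf variables can be set programmatically like this (in practice they usually come from hive-site.xml or SET commands; the handler reads them through SessionState.getSessionConf()):

import org.apache.hadoop.hive.conf.HiveConf;

// Sketch only: placeholder connection values for a MySQL-backed Druid metadata store.
public class DruidMetadataConfigExample {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    HiveConf.setVar(conf, HiveConf.ConfVars.DRUID_METADATA_DB_TYPE, "mysql");         // or "postgresql"
    HiveConf.setVar(conf, HiveConf.ConfVars.DRUID_METADATA_DB_URI,
        "jdbc:mysql://metadata-host:3306/druid");                                     // assumed URI
    HiveConf.setVar(conf, HiveConf.ConfVars.DRUID_METADATA_DB_USERNAME, "druid");     // assumed user
    HiveConf.setVar(conf, HiveConf.ConfVars.DRUID_METADATA_DB_PASSWORD, "druid");     // assumed password
    HiveConf.setVar(conf, HiveConf.ConfVars.DRUID_METADATA_BASE, "druid");            // assumed base name for Druid's metadata tables
    // The constructor reads these through SessionState.getSessionConf(), so this conf
    // would have to back the active Hive session for the handler to pick them up.
    System.out.println(HiveConf.getVar(conf, HiveConf.ConfVars.DRUID_METADATA_DB_URI));
  }
}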
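
After publishing the segment metadata, publishSegments polls the Druid coordinator: it first fetches http://<coordinator>/status with RetryUtils.retry, retrying only on IOException, and then repeatedly filters the per-segment status URLs until every segment reports as loaded or the retry budget is exhausted. Below is a standalone sketch of that first retry step, under the assumptions that the coordinator address and retry count are placeholders and that an active Hive session backs the shared HTTP client (the static initializer reads the session conf):

package org.apache.hadoop.hive.druid;

import com.google.common.base.Predicate;
import com.metamx.common.RetryUtils;

import java.io.IOException;
import java.net.URL;
import java.util.concurrent.Callable;

// Sketch of the retry idiom from publishSegments: ask the coordinator for its
// status, retrying only on IOException, up to a fixed number of attempts.
public class CoordinatorStatusCheckExample {
  public static void main(String[] args) throws Exception {
    final String coordinatorAddress = "localhost:8081"; // assumed coordinator host:port
    final int maxTries = 5;                             // assumed retry budget
    String response = RetryUtils.retry(
        new Callable<String>() {
          @Override
          public String call() throws Exception {
            // getHttpClient() is the shared client created in DruidStorageHandler's static block.
            return DruidStorageHandlerUtils.getURL(DruidStorageHandler.getHttpClient(),
                new URL(String.format("http://%s/status", coordinatorAddress)));
          }
        },
        new Predicate<Throwable>() {
          @Override
          public boolean apply(Throwable input) {
            return input instanceof IOException; // only transient I/O failures are retried
          }
        },
        maxTries);
    System.out.println("Coordinator status: " + response);
  }
}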
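
deleteSegment relies on the deep-storage layout noted in its comment, .../dataSource/interval/version/partitionNum/xxx.zip: it deletes the partitionNum directory recursively and then walks up the parents, removing each ancestor only if it is already empty (safeNonRecursiveDelete). A small sketch with an assumed, simplified path showing which parent corresponds to which level:

import org.apache.hadoop.fs.Path;

// Sketch of the parent traversal used by deleteSegment; the segment path below is an
// assumed example of the dataSource/interval/version/partitionNum/xxx.zip layout.
public class SegmentPathLayoutExample {
  public static void main(String[] args) {
    Path segmentZip = new Path("/druid/segments/wikipedia/20160101_20160102/v1/0/index.zip");
    Path partitionNumDir = segmentZip.getParent();   // .../v1/0              (deleted recursively)
    Path versionDir = partitionNumDir.getParent();   // .../v1                (deleted only if empty)
    Path intervalDir = versionDir.getParent();       // .../20160101_20160102 (deleted only if empty)
    Path dataSourceDir = intervalDir.getParent();    // .../wikipedia         (deleted only if empty)
    System.out.println(partitionNumDir);
    System.out.println(versionDir);
    System.out.println(intervalDir);
    System.out.println(dataSourceDir);
  }
}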