/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.gobblin.data.management.conversion.hive.dataset;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import lombok.Getter;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;

import org.apache.gobblin.dataset.DatasetConstants;
import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.Table;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.typesafe.config.Config;

import org.apache.gobblin.data.management.conversion.hive.entities.StageableTableMetadata;
import org.apache.gobblin.data.management.copy.hive.HiveDataset;
import org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder;
import org.apache.gobblin.hive.HiveMetastoreClientPool;
import org.apache.gobblin.util.ConfigUtils;


/**
 * <p>
 * A {@link HiveDataset} that can be converted from one source format to several destination formats.
 * This class holds the {@link ConversionConfig}s required for conversion into each destination format.
 * The {@link ConversionConfig} for a destination format can be accessed by calling
 * {@link #getConversionConfigForFormat(String)}.
 * </p>
 *
 * <p>
 * <b>Instantiation</b>
 * <ul>
 * <li> The constructor takes in a dataset {@link Config} which MUST have a comma-separated list of destination
 * formats at the key {@value #DESTINATION_CONVERSION_FORMATS_KEY}
 * <li> Conversion configuration for a format can be set by using that destination format as a prefix.
 * <li> E.g. if {@value #DESTINATION_CONVERSION_FORMATS_KEY}=flattenedOrc,nestedOrc,<br>
 * the destination table name for flattened ORC is set at flattenedOrc.tableName<br>
 * and the destination table name for nested ORC is set at nestedOrc.tableName
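 * <li> For illustration, a dataset config following this layout might look like the snippet below; the format
 * names match the example above and the table name values are placeholders, not defaults:
 * <pre>
 *   destinationFormats=flattenedOrc,nestedOrc
 *   flattenedOrc.tableName=tracking_event_flattened
 *   nestedOrc.tableName=tracking_event_nested
 * </pre>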
 * </ul>
 * </p>
 *
 * @see ConversionConfig
 */
@ToString
@Slf4j
public class ConvertibleHiveDataset extends HiveDataset {

  public static final String DESTINATION_CONVERSION_FORMATS_KEY = "destinationFormats";

  // Destination formats
  @Getter
  private final Set<String> destFormats;

  // Mapping from destination format to its ConversionConfig
  private final Map<String, ConversionConfig> destConversionConfigs;

  // Source dataset descriptor
  @Getter
  private final DatasetDescriptor sourceDataset;

  // List of destination dataset descriptors
  @Getter
  private final List<DatasetDescriptor> destDatasets;

  /**
   * <ul>
   * <li> The constructor takes in a dataset {@link Config} which MUST have a comma-separated list of destination
   * formats at the key {@value #DESTINATION_CONVERSION_FORMATS_KEY}
   * <li> Conversion configuration for a format can be set by using that destination format as a prefix.
   * <li> E.g. if {@value #DESTINATION_CONVERSION_FORMATS_KEY}=flattenedOrc,nestedOrc,<br>
   * the destination table name for flattened ORC is set at flattenedOrc.tableName<br>
   * and the destination table name for nested ORC is set at nestedOrc.tableName
   * </ul>
   * @param fs the {@link FileSystem} where the table data resides
   * @param clientPool the {@link HiveMetastoreClientPool} used to talk to the metastore
   * @param table the source Hive {@link Table}
   * @param jobProps the job {@link Properties}
   * @param config the dataset {@link Config}
   */
  public ConvertibleHiveDataset(FileSystem fs, HiveMetastoreClientPool clientPool, Table table, Properties jobProps,
      Config config) {
    super(fs, clientPool, table, jobProps, config);

    Preconditions.checkArgument(config.hasPath(DESTINATION_CONVERSION_FORMATS_KEY), String.format(
        "At least one destination format should be specified at %s.%s. If you do not intend to convert dataset %s set %s.%s to true",
        super.properties.getProperty(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, ""),
        DESTINATION_CONVERSION_FORMATS_KEY, table.getCompleteName(),
        super.properties.getProperty(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, ""),
        HiveDatasetFinder.HIVE_DATASET_IS_BLACKLISTED_KEY));

    // The value at DESTINATION_CONVERSION_FORMATS_KEY can be a TypeSafe list or a comma-separated string
    this.destFormats =
        Sets.newHashSet(ConfigUtils.getStringList(this.datasetConfig, DESTINATION_CONVERSION_FORMATS_KEY));

    // For each format, create a ConversionConfig and store it in a Map<format, conversionConfig>
    this.destConversionConfigs = Maps.newHashMap();
    for (String format : this.destFormats) {
      if (this.datasetConfig.hasPath(format)) {
        log.debug("Found destination format: " + format);
        this.destConversionConfigs.put(format,
            new ConversionConfig(this.datasetConfig.getConfig(format), table, format));
      }
    }

    this.sourceDataset = createSourceDataset();
    this.destDatasets = createDestDatasets();
  }
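  // A minimal usage sketch (hypothetical wiring; the format names come from the class Javadoc example, not from
  // anything this class defines):
  //
  //   ConvertibleHiveDataset dataset = new ConvertibleHiveDataset(fs, clientPool, table, jobProps, config);
  //   for (String format : dataset.getDestFormats()) {
  //     Optional<ConversionConfig> conversionConfig = dataset.getConversionConfigForFormat(format);
  //     // absent() when a format is listed at destinationFormats but has no "<format>.*" config block
  //   }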
  private List<DatasetDescriptor> createDestDatasets() {
    List<DatasetDescriptor> destDatasets = new ArrayList<>();
    for (String format : getDestFormats()) {
      Optional<ConversionConfig> conversionConfigForFormat = getConversionConfigForFormat(format);
      if (!conversionConfigForFormat.isPresent()) {
        continue;
      }
      String destTable = conversionConfigForFormat.get().getDestinationDbName() + "."
          + conversionConfigForFormat.get().getDestinationTableName();
      DatasetDescriptor dest = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, destTable);
      String destLocation = conversionConfigForFormat.get().getDestinationDataPath() + Path.SEPARATOR + "final";
      dest.addMetadata(DatasetConstants.FS_SCHEME, getSourceDataset().getMetadata().get(DatasetConstants.FS_SCHEME));
      dest.addMetadata(DatasetConstants.FS_LOCATION, destLocation);
      destDatasets.add(dest);
    }
    return destDatasets;
  }

  private DatasetDescriptor createSourceDataset() {
    try {
      String sourceTable = getTable().getDbName() + "." + getTable().getTableName();
      DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, sourceTable);
      Path sourcePath = getTable().getDataLocation();
      log.info(String.format("[%s]Source path %s being used in conversion", this.getClass().getName(), sourcePath));
      String sourceLocation = Path.getPathWithoutSchemeAndAuthority(sourcePath).toString();
      FileSystem sourceFs = sourcePath.getFileSystem(new Configuration());
      source.addMetadata(DatasetConstants.FS_SCHEME, sourceFs.getScheme());
      source.addMetadata(DatasetConstants.FS_LOCATION, sourceLocation);
      return source;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Return the {@link ConversionConfig} for a destination format if available; otherwise return
   * {@link Optional#absent()}.
   * @param format the format for which the {@link ConversionConfig} is to be returned
   */
  public Optional<ConversionConfig> getConversionConfigForFormat(String format) {
    return Optional.fromNullable(this.destConversionConfigs.get(format));
  }

  /**
   * The conversion configuration for converting from the source format to each destination format.
   * <p>
   * <b>Required properties</b>
   * <ul>
   * <li>{@value #DESTINATION_DB_KEY}
   * <li>{@value #DESTINATION_TABLE_KEY}
   * <li>{@value #DESTINATION_DATA_PATH_KEY}
   * </ul>
   * <b>Optional properties</b>
   * <ul>
   * <li>{@value #CLUSTER_BY_KEY}
   * <li>{@value #NUM_BUCKETS_KEY}
   * <li>{@value #HIVE_RUNTIME_PROPERTIES_LIST_KEY} can be used to provide a list of hive properties to be set
   * before conversion. The value can be an array of keys and values or a comma-separated string of keys and
   * values, e.g. [key1,value1,key2,value2] or key1,value1,key2,value2
   * <li>{@value #DESTINATION_TABLE_PROPERTIES_LIST_KEY} can be used to provide a list of table properties to be
   * set on the destination table. The value can be an array of keys and values or a comma-separated string of
   * keys and values, e.g. [key1,value1,key2,value2] or key1,value1,key2,value2
   * </ul>
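   * <p>
   * For illustration, a complete conversion config block for a hypothetical format named flattenedOrc might look
   * like the snippet below (the db, table, and path values are placeholders, not defaults):
   * <pre>
   *   flattenedOrc.{@value #DESTINATION_DB_KEY}=events_orc
   *   flattenedOrc.{@value #DESTINATION_TABLE_KEY}=tracking_event_flattened
   *   flattenedOrc.{@value #DESTINATION_DATA_PATH_KEY}=/data/events_orc/tracking_event_flattened
   * </pre>
   * </p>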
   */
  @Getter
  @ToString
  public static class ConversionConfig extends StageableTableMetadata {
    public static final String DESTINATION_VIEW_KEY = "destination.viewName";
    public static final String UPDATE_VIEW_ALWAYS_ENABLED = "updateViewAlways.enabled";

    private final String destinationFormat;
    // destinationViewName: if specified, a view named 'destinationViewName' is created over destinationTableName
    // if it does not already exist
    private final Optional<String> destinationViewName;
    // updateViewAlwaysEnabled: if false, 'destinationViewName' is only updated when the schema evolves; if true,
    // 'destinationViewName' is updated on every publish
    private final boolean updateViewAlwaysEnabled;

    private ConversionConfig(Config config, Table table, String destinationFormat) {
      super(config, table);

      // Required
      this.destinationFormat = destinationFormat;

      // Optional
      this.destinationViewName =
          Optional.fromNullable(resolveTemplate(ConfigUtils.getString(config, DESTINATION_VIEW_KEY, null), table));
      this.updateViewAlwaysEnabled = ConfigUtils.getBoolean(config, UPDATE_VIEW_ALWAYS_ENABLED, true);
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }
      if (!super.equals(o)) {
        return false;
      }

      ConversionConfig that = (ConversionConfig) o;

      if (isUpdateViewAlwaysEnabled() != that.isUpdateViewAlwaysEnabled()) {
        return false;
      }
      if (!getDestinationFormat().equals(that.getDestinationFormat())) {
        return false;
      }
      return getDestinationViewName().equals(that.getDestinationViewName());
    }

    @Override
    public int hashCode() {
      int result = super.hashCode();
      result = 31 * result + getDestinationFormat().hashCode();
      result = 31 * result + getDestinationViewName().hashCode();
      result = 31 * result + (isUpdateViewAlwaysEnabled() ? 1 : 0);
      return result;
    }
  }
}