/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.source.extractor.extract;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.MDC;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

import gobblin.config.client.ConfigClient;
import gobblin.config.client.ConfigClientCache;
import gobblin.config.client.api.ConfigStoreFactoryDoesNotExistsException;
import gobblin.config.client.api.VersionStabilityPolicy;
import gobblin.config.store.api.ConfigStoreCreationException;
import gobblin.config.store.api.VersionDoesNotExistException;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.configuration.WorkUnitState.WorkingState;
import gobblin.source.extractor.JobCommitPolicy;
import gobblin.source.extractor.partition.Partition;
import gobblin.source.extractor.partition.Partitioner;
import gobblin.source.extractor.utils.Utils;
import gobblin.source.workunit.Extract;
import gobblin.source.workunit.Extract.TableType;
import gobblin.source.workunit.MultiWorkUnit;
import gobblin.source.workunit.WorkUnit;
import gobblin.util.ConfigUtils;
import gobblin.util.DatasetFilterUtils;
import gobblin.util.PathUtils;
import gobblin.util.dataset.DatasetUtils;

import lombok.Data;
import lombok.extern.slf4j.Slf4j;


/**
 * A base implementation of {@link gobblin.source.Source} for query-based sources.
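 *
 * <p>As a rough usage sketch (illustrative only; {@code MySource} and {@code MyExtractor}
 * are hypothetical names), a concrete subclass typically only needs to supply an
 * {@link gobblin.source.extractor.Extractor}, since work-unit generation is handled here:
 * <pre>{@code
 * public class MySource extends QueryBasedSource<String, String> {
 *   @Override
 *   public Extractor<String, String> getExtractor(WorkUnitState state) throws IOException {
 *     return new MyExtractor(state);
 *   }
 * }
 * }</pre>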
 */
@Slf4j
public abstract class QueryBasedSource<S, D> extends AbstractSource<S, D> {

  public static final String ENTITY_BLACKLIST = "entity.blacklist";
  public static final String ENTITY_WHITELIST = "entity.whitelist";

  public static final String SOURCE_OBTAIN_TABLE_PROPS_FROM_CONFIG_STORE =
      "source.obtain_table_props_from_config_store";
  public static final boolean DEFAULT_SOURCE_OBTAIN_TABLE_PROPS_FROM_CONFIG_STORE = false;

  private static final String QUERY_BASED_SOURCE = "query_based_source";

  public static final String WORK_UNIT_STATE_VERSION_KEY = "source.querybased.workUnitState.version";

  /**
   * WorkUnit Version 3:
   *   SOURCE_ENTITY = as specified in job config
   *   EXTRACT_TABLE_NAME_KEY = as specified in job config, or sanitized version of SOURCE_ENTITY
   * WorkUnit Version 2 (implicit):
   *   SOURCE_ENTITY = sanitized version of SOURCE_ENTITY in job config
   *   EXTRACT_TABLE_NAME_KEY = as specified in job config
   * WorkUnit Version 1 (implicit):
   *   SOURCE_ENTITY = as specified in job config
   *   EXTRACT_TABLE_NAME_KEY = as specified in job config
   */
  public static final Integer CURRENT_WORK_UNIT_STATE_VERSION = 3;

  /** A class that encapsulates a source entity (aka dataset) to be processed. */
  @Data
  public static final class SourceEntity {
    /**
     * The name of the source entity (as specified in the source) to be processed. For example,
     * this can be a table name.
     */
    private final String sourceEntityName;
    /**
     * The destination table name. This is explicitly specified in the config or is derived from
     * the sourceEntityName.
     */
    private final String destTableName;

    /** A string that identifies the source entity. */
    public String getDatasetName() {
      return this.sourceEntityName;
    }

    static String sanitizeEntityName(String entity) {
      return Utils.escapeSpecialCharacters(entity, ConfigurationKeys.ESCAPE_CHARS_IN_TABLE_NAME, "_");
    }

    public static SourceEntity fromSourceEntityName(String sourceEntityName) {
      return new SourceEntity(sourceEntityName, sanitizeEntityName(sourceEntityName));
    }

    public static Optional<SourceEntity> fromState(State state) {
      String sourceEntityName;
      String destTableName;
      if (state.contains(ConfigurationKeys.SOURCE_ENTITY)) {
        sourceEntityName = state.getProp(ConfigurationKeys.SOURCE_ENTITY);
        destTableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY,
            sanitizeEntityName(sourceEntityName));
      } else if (state.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY)) {
        destTableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
        sourceEntityName = destTableName;
      } else {
        return Optional.absent();
      }
      return Optional.of(new SourceEntity(sourceEntityName, destTableName));
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (obj == null || getClass() != obj.getClass()) {
        return false;
      }
      SourceEntity other = (SourceEntity) obj;
      if (getDatasetName() == null) {
        return other.getDatasetName() == null;
      }
      return getDatasetName().equals(other.getDatasetName());
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result + ((getDatasetName() == null) ? 0 : getDatasetName().hashCode());
      return result;
    }
  }
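
  /**
   * {@inheritDoc}
   *
   * <p>Discovers the filtered set of source entities, merges any table-specific properties
   * (from state or, optionally, a config store), computes each entity's previous watermark,
   * generates work units per entity, appends incomplete work units from the previous run,
   * and packs everything into at most {@link ConfigurationKeys#MR_JOB_MAX_MAPPERS_KEY}
   * {@link MultiWorkUnit}s.
   */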
  @Override
  public List<WorkUnit> getWorkunits(SourceState state) {
    initLogger(state);

    List<WorkUnit> workUnits = Lists.newArrayList();

    Set<SourceEntity> entities = getFilteredSourceEntities(state);

    Map<SourceEntity, State> tableSpecificPropsMap = shouldObtainTablePropsFromConfigStore(state)
        ? getTableSpecificPropsFromConfigStore(entities, state)
        : getTableSpecificPropsFromState(entities, state);
    Map<SourceEntity, Long> prevWatermarksByTable = getPreviousWatermarksForAllTables(state);

    for (SourceEntity sourceEntity : Sets.union(entities, prevWatermarksByTable.keySet())) {
      log.info("Source entity to be processed: {}, carry-over from previous state: {}",
          sourceEntity, !entities.contains(sourceEntity));

      SourceState combinedState = getCombinedState(state, tableSpecificPropsMap.get(sourceEntity));
      long previousWatermark = prevWatermarksByTable.containsKey(sourceEntity)
          ? prevWatermarksByTable.get(sourceEntity)
          : ConfigurationKeys.DEFAULT_WATERMARK_VALUE;

      // If an entity exists in prevWatermarksByTable (i.e., it has a previous watermark) but is
      // not in the current set of entities, create an empty workunit for it, so that its previous
      // watermark is preserved. This is done by overriding the high watermark to be the same as
      // the previous watermark.
      if (!entities.contains(sourceEntity)) {
        combinedState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE, previousWatermark);
      }
      workUnits.addAll(generateWorkUnits(sourceEntity, combinedState, previousWatermark));
    }

    log.info("Total number of workunits for the current run: " + workUnits.size());
    List<WorkUnit> previousWorkUnits = this.getPreviousWorkUnitsForRetry(state);
    log.info("Total number of incomplete tasks from the previous run: " + previousWorkUnits.size());
    workUnits.addAll(previousWorkUnits);

    int numOfMultiWorkunits =
        state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);

    return pack(workUnits, numOfMultiWorkunits);
  }
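
  /**
   * Generate work units for the given source entity: one work unit per watermark
   * {@link Partition} computed by the {@link Partitioner}, all sharing a single
   * {@link Extract} whose table name is the entity's destination table name.
   */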
  protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
    List<WorkUnit> workUnits = Lists.newArrayList();

    String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    TableType tableType =
        TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());

    List<Partition> partitions = new Partitioner(state).getPartitionList(previousWatermark);
    Collections.sort(partitions, Partitioner.ascendingComparator);

    // {@link ConfigurationKeys#EXTRACT_TABLE_NAME_KEY} specifies the output table name of the Extract
    String outputTableName = sourceEntity.getDestTableName();
    log.info("Creating extract output with table name " + outputTableName);
    Extract extract = createExtract(tableType, nameSpaceName, outputTableName);

    // Setting current time for the full extract
    if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) {
      extract.setFullTrue(System.currentTimeMillis());
    }

    for (Partition partition : partitions) {
      WorkUnit workunit = WorkUnit.create(extract);
      workunit.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName());
      workunit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName());
      workunit.setProp(WORK_UNIT_STATE_VERSION_KEY, CURRENT_WORK_UNIT_STATE_VERSION);
      partition.serialize(workunit);
      workUnits.add(workunit);
    }

    return workUnits;
  }

  protected Set<SourceEntity> getFilteredSourceEntities(SourceState state) {
    Set<SourceEntity> unfilteredEntities = getSourceEntities(state);
    return getFilteredSourceEntitiesHelper(state, unfilteredEntities);
  }

  static Set<SourceEntity> getFilteredSourceEntitiesHelper(SourceState state,
      Iterable<SourceEntity> unfilteredEntities) {
    Set<SourceEntity> entities = new HashSet<>();
    List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, ENTITY_BLACKLIST);
    List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, ENTITY_WHITELIST);
    for (SourceEntity entity : unfilteredEntities) {
      if (DatasetFilterUtils.survived(entity.getSourceEntityName(), blacklist, whitelist)) {
        entities.add(entity);
      }
    }
    return entities;
  }

  public static Map<SourceEntity, State> getTableSpecificPropsFromState(Iterable<SourceEntity> entities,
      SourceState state) {
    Map<String, SourceEntity> sourceEntityByName = new HashMap<>();
    for (SourceEntity entity : entities) {
      sourceEntityByName.put(entity.getDatasetName(), entity);
    }
    Map<String, State> datasetProps =
        DatasetUtils.getDatasetSpecificProps(sourceEntityByName.keySet(), state);
    Map<SourceEntity, State> res = new HashMap<>();
    for (Map.Entry<String, State> entry : datasetProps.entrySet()) {
      res.put(sourceEntityByName.get(entry.getKey()), entry.getValue());
    }
    return res;
  }

  protected Set<SourceEntity> getSourceEntities(State state) {
    return getSourceEntitiesHelper(state);
  }

  static Set<SourceEntity> getSourceEntitiesHelper(State state) {
    if (state.contains(ConfigurationKeys.SOURCE_ENTITIES)) {
      log.info("Using entity names in " + ConfigurationKeys.SOURCE_ENTITIES);
      HashSet<SourceEntity> res = new HashSet<>();
      for (String sourceEntityName : state.getPropAsList(ConfigurationKeys.SOURCE_ENTITIES)) {
        res.add(SourceEntity.fromSourceEntityName(sourceEntityName));
      }
      return res;
    } else if (state.contains(ConfigurationKeys.SOURCE_ENTITY)
        || state.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY)) {
      Optional<SourceEntity> sourceEntity = SourceEntity.fromState(state);
      // Guaranteed to be present
      log.info("Using entity name in " + sourceEntity.get());
      return ImmutableSet.of(sourceEntity.get());
    }
    throw new IllegalStateException(String.format("One of the following properties must be specified: %s, %s.",
        ConfigurationKeys.SOURCE_ENTITIES, ConfigurationKeys.SOURCE_ENTITY));
  }

  private static boolean shouldObtainTablePropsFromConfigStore(SourceState state) {
    return state.getPropAsBoolean(SOURCE_OBTAIN_TABLE_PROPS_FROM_CONFIG_STORE,
        DEFAULT_SOURCE_OBTAIN_TABLE_PROPS_FROM_CONFIG_STORE);
  }

  private static Map<SourceEntity, State> getTableSpecificPropsFromConfigStore(Collection<SourceEntity> tables,
      State state) {
    ConfigClient client = ConfigClientCache.getClient(VersionStabilityPolicy.STRONG_LOCAL_STABILITY);
    String configStoreUri = state.getProp(ConfigurationKeys.CONFIG_MANAGEMENT_STORE_URI);
    Preconditions.checkNotNull(configStoreUri);

    Map<SourceEntity, State> result = Maps.newHashMap();

    for (SourceEntity table : tables) {
      try {
        result.put(table, ConfigUtils.configToState(client.getConfig(
            PathUtils.combinePaths(configStoreUri, QUERY_BASED_SOURCE, table.getDatasetName()).toUri())));
      } catch (VersionDoesNotExistException | ConfigStoreFactoryDoesNotExistsException
          | ConfigStoreCreationException e) {
        throw new RuntimeException("Unable to get table config for " + table, e);
      }
    }

    return result;
  }

  private static SourceState getCombinedState(SourceState state, State tableSpecificState) {
    if (tableSpecificState == null) {
      return state;
    }
    SourceState combinedState =
        new SourceState(state, state.getPreviousDatasetStatesByUrns(), state.getPreviousWorkUnitStates());
    combinedState.addAll(tableSpecificState);
    return combinedState;
  }

  /**
   * Pack the list of {@code WorkUnit}s into {@code MultiWorkUnit}s.
   *
   * TODO: this is currently a simple round-robin packing. More sophisticated bin packing may be necessary
   * if the round-robin approach leads to mapper skew.
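   *
   * For example (illustrative only): packing 5 work units into 2 {@code MultiWorkUnit}s assigns
   * units 0, 2, 4 to the first and units 1, 3 to the second.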
   */
  private static List<WorkUnit> pack(List<WorkUnit> workUnits, int numOfMultiWorkunits) {
    Preconditions.checkArgument(numOfMultiWorkunits > 0);

    if (workUnits.size() <= numOfMultiWorkunits) {
      return workUnits;
    }
    List<WorkUnit> result = Lists.newArrayListWithCapacity(numOfMultiWorkunits);
    for (int i = 0; i < numOfMultiWorkunits; i++) {
      result.add(MultiWorkUnit.createEmpty());
    }
    for (int i = 0; i < workUnits.size(); i++) {
      ((MultiWorkUnit) result.get(i % numOfMultiWorkunits)).addWorkUnit(workUnits.get(i));
    }
    return result;
  }

  @Override
  public void shutdown(SourceState state) {
  }

  /**
   * For each table, return the lowest low watermark among all previous {@code WorkUnitState}s of the
   * table if the job commit policy is to commit on full success and the table has failed tasks in the
   * previous run, or if the table processed no data in the previous run. Otherwise, return the highest
   * high watermark among all previous {@code WorkUnitState}s of the table.
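   *
   * For example (illustrative only): if a table's previous work units carried low watermarks
   * {10, 20} and actual high watermarks {30, 40}, the returned watermark is 10 when the run must
   * be redone (failed tasks under the full-success commit policy, or no data processed), and 40
   * otherwise.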
   */
  static Map<SourceEntity, Long> getPreviousWatermarksForAllTables(SourceState state) {
    Map<SourceEntity, Long> result = Maps.newHashMap();
    Map<SourceEntity, Long> prevLowWatermarksByTable = Maps.newHashMap();
    Map<SourceEntity, Long> prevActualHighWatermarksByTable = Maps.newHashMap();
    Set<SourceEntity> tablesWithFailedTasks = Sets.newHashSet();
    Set<SourceEntity> tablesWithNoUpdatesOnPreviousRun = Sets.newHashSet();
    boolean commitOnFullSuccess = JobCommitPolicy.getCommitPolicy(state) == JobCommitPolicy.COMMIT_ON_FULL_SUCCESS;

    for (WorkUnitState previousWus : state.getPreviousWorkUnitStates()) {
      Optional<SourceEntity> sourceEntity = SourceEntity.fromState(previousWus);
      if (!sourceEntity.isPresent()) {
        log.warn("Missing source entity for WorkUnit state: " + previousWus);
        continue;
      }
      SourceEntity table = sourceEntity.get();

      long lowWm = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
      LongWatermark waterMarkObj = previousWus.getWorkunit().getLowWatermark(LongWatermark.class);
      // New job state file (version 0.2.1270), watermark format:
      // "watermark.interval.value": "{\"low.watermark.to.json\":{\"value\":20160101000000},\"expected.watermark.to.json\":{\"value\":20160715230234}}"
      if (waterMarkObj != null) {
        lowWm = waterMarkObj.getValue();
      }
      // Old job state file (version 0.2.805):
      // "workunit.low.water.mark": "20160711000000",
      // "workunit.state.runtime.high.water.mark": "20160716140338"
      else if (previousWus.getProperties().containsKey(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY)) {
        lowWm = Long.parseLong(
            previousWus.getProperties().getProperty(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY));
        log.warn("Cannot find low watermark in JSON format; getting value from "
            + ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY + ": " + lowWm);
      }

      if (!prevLowWatermarksByTable.containsKey(table)) {
        prevLowWatermarksByTable.put(table, lowWm);
      } else {
        prevLowWatermarksByTable.put(table, Math.min(prevLowWatermarksByTable.get(table), lowWm));
      }

      long highWm = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
      waterMarkObj = previousWus.getActualHighWatermark(LongWatermark.class);
      if (waterMarkObj != null) {
        highWm = waterMarkObj.getValue();
      } else if (previousWus.getProperties().containsKey(ConfigurationKeys.WORK_UNIT_STATE_RUNTIME_HIGH_WATER_MARK)) {
        highWm = Long.parseLong(
            previousWus.getProperties().getProperty(ConfigurationKeys.WORK_UNIT_STATE_RUNTIME_HIGH_WATER_MARK));
        log.warn("Cannot find high watermark in JSON format; getting value from "
            + ConfigurationKeys.WORK_UNIT_STATE_RUNTIME_HIGH_WATER_MARK + ": " + highWm);
      }

      if (!prevActualHighWatermarksByTable.containsKey(table)) {
        prevActualHighWatermarksByTable.put(table, highWm);
      } else {
        prevActualHighWatermarksByTable.put(table, Math.max(prevActualHighWatermarksByTable.get(table), highWm));
      }

      if (commitOnFullSuccess && !isSuccessfulOrCommitted(previousWus)) {
        tablesWithFailedTasks.add(table);
      }

      if (!isAnyDataProcessed(previousWus)) {
        tablesWithNoUpdatesOnPreviousRun.add(table);
      }
    }

    for (Map.Entry<SourceEntity, Long> entry : prevLowWatermarksByTable.entrySet()) {
      if (tablesWithFailedTasks.contains(entry.getKey())) {
        log.info("Resetting low watermark to {} because previous run failed.", entry.getValue());
        result.put(entry.getKey(), entry.getValue());
      } else if (tablesWithNoUpdatesOnPreviousRun.contains(entry.getKey())) {
        log.info("Resetting low watermark to {} because previous run processed no data.", entry.getValue());
        result.put(entry.getKey(), entry.getValue());
      } else {
        result.put(entry.getKey(), prevActualHighWatermarksByTable.get(entry.getKey()));
      }
    }

    return result;
  }

  private static boolean isSuccessfulOrCommitted(WorkUnitState wus) {
    return wus.getWorkingState() == WorkingState.SUCCESSFUL || wus.getWorkingState() == WorkingState.COMMITTED;
  }

  private static boolean isAnyDataProcessed(WorkUnitState wus) {
    return wus.getPropAsLong(ConfigurationKeys.EXTRACTOR_ROWS_EXPECTED, 0) > 0;
  }

  /**
   * Initialize the logger MDC with the source schema and entity names.
   *
   * @param state source state
   */
  private static void initLogger(SourceState state) {
    StringBuilder sb = new StringBuilder();
    sb.append("[");
    sb.append(StringUtils.stripToEmpty(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA)));
    sb.append("_");
    sb.append(StringUtils.stripToEmpty(state.getProp(ConfigurationKeys.SOURCE_ENTITY)));
    sb.append("]");
    MDC.put("sourceInfo", sb.toString());
  }
}