gobblin.data.management.copy.CopySource.java Source code

Introduction

Here is the source code for gobblin.data.management.copy.CopySource.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.data.management.copy;

import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

import javax.annotation.Nullable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Predicates;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
import gobblin.data.management.copy.extractor.EmptyExtractor;
import gobblin.data.management.copy.extractor.FileAwareInputStreamExtractor;
import gobblin.data.management.copy.prioritization.FileSetComparator;
import gobblin.data.management.copy.publisher.CopyEventSubmitterHelper;
import gobblin.data.management.copy.watermark.CopyableFileWatermarkGenerator;
import gobblin.data.management.copy.watermark.CopyableFileWatermarkHelper;
import gobblin.data.management.dataset.DatasetUtils;
import gobblin.data.management.partition.CopyableDatasetRequestor;
import gobblin.data.management.partition.FileSet;
import gobblin.data.management.partition.FileSetResourceEstimator;
import gobblin.dataset.Dataset;
import gobblin.dataset.DatasetsFinder;
import gobblin.dataset.IterableDatasetFinder;
import gobblin.dataset.IterableDatasetFinderImpl;
import gobblin.instrumented.Instrumented;
import gobblin.metrics.GobblinMetrics;
import gobblin.metrics.MetricContext;
import gobblin.metrics.Tag;
import gobblin.metrics.event.EventSubmitter;
import gobblin.metrics.event.sla.SlaEventKeys;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.WatermarkInterval;
import gobblin.source.extractor.extract.AbstractSource;
import gobblin.source.workunit.Extract;
import gobblin.source.workunit.WorkUnit;
import gobblin.source.workunit.WorkUnitWeighter;
import gobblin.util.ExecutorsUtils;
import gobblin.util.HadoopUtils;
import gobblin.util.WriterUtils;
import gobblin.util.binpacking.FieldWeighter;
import gobblin.util.binpacking.WorstFitDecreasingBinPacking;
import gobblin.util.deprecation.DeprecationUtils;
import gobblin.util.executors.IteratorExecutor;
import gobblin.util.guid.Guid;
import gobblin.util.request_allocation.GreedyAllocator;
import gobblin.util.request_allocation.HierarchicalAllocator;
import gobblin.util.request_allocation.HierarchicalPrioritizer;
import gobblin.util.request_allocation.RequestAllocator;
import gobblin.util.request_allocation.RequestAllocatorConfig;
import gobblin.util.request_allocation.RequestAllocatorUtils;

import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;

/**
 * {@link gobblin.source.Source} that generates work units from {@link gobblin.data.management.copy.CopyableDataset}s.
 *
 */
@Slf4j
public class CopySource extends AbstractSource<String, FileAwareInputStream> {

    public static final String DEFAULT_DATASET_PROFILE_CLASS_KEY = CopyableGlobDatasetFinder.class
            .getCanonicalName();
    public static final String SERIALIZED_COPYABLE_FILE = CopyConfiguration.COPY_PREFIX
            + ".serialized.copyable.file";
    public static final String COPY_ENTITY_CLASS = CopyConfiguration.COPY_PREFIX + ".copy.entity.class";
    public static final String SERIALIZED_COPYABLE_DATASET = CopyConfiguration.COPY_PREFIX
            + ".serialized.copyable.datasets";
    public static final String WORK_UNIT_GUID = CopyConfiguration.COPY_PREFIX + ".work.unit.guid";
    public static final String MAX_CONCURRENT_LISTING_SERVICES = CopyConfiguration.COPY_PREFIX
            + ".max.concurrent.listing.services";
    public static final int DEFAULT_MAX_CONCURRENT_LISTING_SERVICES = 20;
    public static final String MAX_FILES_COPIED_KEY = CopyConfiguration.COPY_PREFIX + ".max.files.copied";
    public static final String SIMULATE = CopyConfiguration.COPY_PREFIX + ".simulate";
    public static final String MAX_SIZE_MULTI_WORKUNITS = CopyConfiguration.COPY_PREFIX
            + ".binPacking.maxSizePerBin";
    public static final String MAX_WORK_UNITS_PER_BIN = CopyConfiguration.COPY_PREFIX
            + ".binPacking.maxWorkUnitsPerBin";

    private static final String WORK_UNIT_WEIGHT = CopyConfiguration.COPY_PREFIX + ".workUnitWeight";

    private final WorkUnitWeighter weighter = new FieldWeighter(WORK_UNIT_WEIGHT);

    public MetricContext metricContext;

    /**
     * Does the following:
     * <ul>
     * <li>Instantiate a {@link DatasetsFinder}.
     * <li>Find all {@link Dataset}s using the {@link DatasetsFinder}.
     * <li>For each {@link CopyableDataset}, get all {@link CopyEntity}s.
     * <li>Create a {@link WorkUnit} per {@link CopyEntity}.
     * </ul>
     *
     * <p>
     * In this implementation, one workunit is created for every {@link CopyEntity} found, but the extractors,
     * converters, and writers are built to support multiple {@link CopyEntity}s per workunit.
     * </p>
     *
     * @param state see {@link gobblin.configuration.SourceState}
     * @return Work units for copying files.
     */
    @Override
    public List<WorkUnit> getWorkunits(final SourceState state) {

        this.metricContext = Instrumented.getMetricContext(state, CopySource.class);

        try {

            DeprecationUtils.renameDeprecatedKeys(state,
                    CopyConfiguration.MAX_COPY_PREFIX + "." + CopyResourcePool.ENTITIES_KEY,
                    Lists.newArrayList(MAX_FILES_COPIED_KEY));

            final FileSystem sourceFs = getSourceFileSystem(state);
            final FileSystem targetFs = getTargetFileSystem(state);

            log.info("Identified source file system at {} and target file system at {}.", sourceFs.getUri(),
                    targetFs.getUri());

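            // Bin-packing knobs: maxSizePerBin caps the total weight (bytes) per
            // multi-workunit, maxWorkUnitsPerMultiWorkUnit caps how many work units a
            // bin may hold, and the derived minimum work-unit weight enforces that cap.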
            long maxSizePerBin = state.getPropAsLong(MAX_SIZE_MULTI_WORKUNITS, 0);
            long maxWorkUnitsPerMultiWorkUnit = state.getPropAsLong(MAX_WORK_UNITS_PER_BIN, 50);
            final long minWorkUnitWeight = Math.max(1, maxSizePerBin / maxWorkUnitsPerMultiWorkUnit);
            final Optional<CopyableFileWatermarkGenerator> watermarkGenerator = CopyableFileWatermarkHelper
                    .getCopyableFileWatermarkGenerator(state);
            int maxThreads = state.getPropAsInt(MAX_CONCURRENT_LISTING_SERVICES,
                    DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);

            final CopyConfiguration copyConfiguration = CopyConfiguration.builder(targetFs, state.getProperties())
                    .build();

            DatasetsFinder<CopyableDatasetBase> datasetFinder = DatasetUtils.instantiateDatasetFinder(
                    state.getProperties(), sourceFs, DEFAULT_DATASET_PROFILE_CLASS_KEY,
                    new EventSubmitter.Builder(this.metricContext, CopyConfiguration.COPY_PREFIX).build(), state);

            IterableDatasetFinder<CopyableDatasetBase> iterableDatasetFinder = datasetFinder instanceof IterableDatasetFinder
                    ? (IterableDatasetFinder<CopyableDatasetBase>) datasetFinder
                    : new IterableDatasetFinderImpl<>(datasetFinder);

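            // Lazily wrap each discovered dataset in a requestor that can generate its
            // file set requests; nulls (datasets that could not be wrapped) are dropped.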
            Iterator<CopyableDatasetRequestor> requestorIteratorWithNulls = Iterators.transform(
                    iterableDatasetFinder.getDatasetsIterator(),
                    new CopyableDatasetRequestor.Factory(targetFs, copyConfiguration, log));
            Iterator<CopyableDatasetRequestor> requestorIterator = Iterators.filter(requestorIteratorWithNulls,
                    Predicates.<CopyableDatasetRequestor>notNull());

            final SetMultimap<FileSet<CopyEntity>, WorkUnit> workUnitsMap = Multimaps
                    .<FileSet<CopyEntity>, WorkUnit>synchronizedSetMultimap(
                            HashMultimap.<FileSet<CopyEntity>, WorkUnit>create());

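            // Have the allocator pick the file sets that fit within the configured copy
            // budget (maxToCopy), honoring the prioritizer when one is configured.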
            RequestAllocator<FileSet<CopyEntity>> allocator = createRequestAllocator(copyConfiguration, maxThreads);
            Iterator<FileSet<CopyEntity>> prioritizedFileSets = allocator.allocateRequests(requestorIterator,
                    copyConfiguration.getMaxToCopy());

            Iterator<Callable<Void>> callableIterator = Iterators.transform(prioritizedFileSets,
                    new Function<FileSet<CopyEntity>, Callable<Void>>() {
                        @Nullable
                        @Override
                        public Callable<Void> apply(FileSet<CopyEntity> input) {
                            return new FileSetWorkUnitGenerator((CopyableDatasetBase) input.getDataset(), input,
                                    state, workUnitsMap, watermarkGenerator, minWorkUnitWeight);
                        }
                    });

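            // Generate work units for all selected file sets in parallel on a bounded
            // daemon pool; a failure in one dataset is logged and skipped.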
            try {
                List<Future<Void>> futures = new IteratorExecutor<>(callableIterator, maxThreads, ExecutorsUtils
                        .newDaemonThreadFactory(Optional.of(log), Optional.of("Copy-file-listing-pool-%d")))
                                .execute();

                for (Future<Void> future : futures) {
                    try {
                        future.get();
                    } catch (ExecutionException exc) {
                        log.error("Failed to get work units for dataset.", exc.getCause());
                    }
                }
            } catch (InterruptedException ie) {
                log.error("Retrieval of work units was interrupted. Aborting.");
                return Lists.newArrayList();
            }

            log.info(String.format("Created %s workunits ", workUnitsMap.size()));

            copyConfiguration.getCopyContext().logCacheStatistics();

            if (state.contains(SIMULATE) && state.getPropAsBoolean(SIMULATE)) {
                log.info("Simulate mode enabled. Will not execute the copy.");
                for (Map.Entry<FileSet<CopyEntity>, Collection<WorkUnit>> entry : workUnitsMap.asMap().entrySet()) {
                    log.info(String.format("Actions for dataset %s file set %s.",
                            entry.getKey().getDataset().datasetURN(), entry.getKey().getName()));
                    for (WorkUnit workUnit : entry.getValue()) {
                        CopyEntity copyEntity = deserializeCopyEntity(workUnit);
                        log.info(copyEntity.explain());
                    }
                }
                return Lists.newArrayList();
            }

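            // Pack individual work units into multi-workunits (worst-fit decreasing) so
            // that each bin stays under maxSizePerBin.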
            List<? extends WorkUnit> workUnits = new WorstFitDecreasingBinPacking(maxSizePerBin)
                    .pack(Lists.newArrayList(workUnitsMap.values()), this.weighter);
            log.info(String.format(
                    "Bin packed work units. Initial work units: %d, packed work units: %d, max weight per bin: %d, "
                            + "max work units per bin: %d.",
                    workUnitsMap.size(), workUnits.size(), maxSizePerBin, maxWorkUnitsPerMultiWorkUnit));
            return ImmutableList.copyOf(workUnits);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

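    /**
     * Chooses the {@link RequestAllocator}: greedy when no prioritizer is configured,
     * hierarchical when the prioritizer is a {@link HierarchicalPrioritizer}, otherwise
     * inferred from the configuration.
     */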
    private RequestAllocator<FileSet<CopyEntity>> createRequestAllocator(CopyConfiguration copyConfiguration,
            int maxThreads) {
        Optional<FileSetComparator> prioritizer = copyConfiguration.getPrioritizer();

        RequestAllocatorConfig.Builder<FileSet<CopyEntity>> configBuilder = RequestAllocatorConfig
                .builder(new FileSetResourceEstimator()).allowParallelization(maxThreads)
                .withLimitedScopeConfig(copyConfiguration.getPrioritizationConfig());

        if (!prioritizer.isPresent()) {
            return new GreedyAllocator<>(configBuilder.build());
        }

        configBuilder.withPrioritizer(prioritizer.get());

        if (prioritizer.get() instanceof HierarchicalPrioritizer) {
            return new HierarchicalAllocator.Factory().createRequestAllocator(configBuilder.build());
        } else {
            return RequestAllocatorUtils.inferFromConfig(configBuilder.build());
        }
    }

    /**
     * {@link Callable} that generates the copy listing for one {@link CopyableDataset}.
     */
    @AllArgsConstructor
    private class FileSetWorkUnitGenerator implements Callable<Void> {

        private final CopyableDatasetBase copyableDataset;
        private final FileSet<CopyEntity> fileSet;
        private final State state;
        private final SetMultimap<FileSet<CopyEntity>, WorkUnit> workUnitList;
        private final Optional<CopyableFileWatermarkGenerator> watermarkGenerator;
        private final long minWorkUnitWeight;

        @Override
        public Void call() {

            try {
                String extractId = fileSet.getName().replace(':', '_');

                Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, CopyConfiguration.COPY_PREFIX,
                        extractId);
                List<WorkUnit> workUnitsForPartition = Lists.newArrayList();
                for (CopyEntity copyEntity : fileSet.getFiles()) {

                    CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(this.copyableDataset);
                    CopyEntity.DatasetAndPartition datasetAndPartition = copyEntity
                            .getDatasetAndPartition(metadata);

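                    // One work unit per copy entity: it carries the full job state plus
                    // the serialized entity and dataset metadata for downstream tasks.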
                    WorkUnit workUnit = new WorkUnit(extract);
                    workUnit.addAll(this.state);
                    serializeCopyEntity(workUnit, copyEntity);
                    serializeCopyableDataset(workUnit, metadata);
                    GobblinMetrics.addCustomTagToState(workUnit,
                            new Tag<>(CopyEventSubmitterHelper.DATASET_ROOT_METADATA_NAME,
                                    this.copyableDataset.datasetURN()));
                    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, datasetAndPartition.toString());
                    workUnit.setProp(SlaEventKeys.DATASET_URN_KEY, this.copyableDataset.datasetURN());
                    workUnit.setProp(SlaEventKeys.PARTITION_KEY, copyEntity.getFileSet());
                    setWorkUnitWeight(workUnit, copyEntity, minWorkUnitWeight);
                    setWorkUnitWatermark(workUnit, watermarkGenerator, copyEntity);
                    computeAndSetWorkUnitGuid(workUnit);
                    workUnitsForPartition.add(workUnit);
                }

                this.workUnitList.putAll(this.fileSet, workUnitsForPartition);

                return null;
            } catch (IOException ioe) {
                throw new RuntimeException(
                        "Failed to generate work units for dataset " + this.copyableDataset.datasetURN(), ioe);
            }
        }
    }

    /**
     * @param state a {@link gobblin.configuration.WorkUnitState} carrying properties needed by the returned
     *          {@link Extractor}
     * @return a {@link FileAwareInputStreamExtractor} when the work unit carries a {@link CopyableFile},
     *          otherwise an {@link EmptyExtractor}.
     * @throws IOException if the {@link Extractor} cannot be created.
     */
    @Override
    public Extractor<String, FileAwareInputStream> getExtractor(WorkUnitState state) throws IOException {

        Class<?> copyEntityClass = getCopyEntityClass(state);

        if (CopyableFile.class.isAssignableFrom(copyEntityClass)) {
            CopyableFile copyEntity = (CopyableFile) deserializeCopyEntity(state);
            return extractorForCopyableFile(getSourceFileSystem(state), copyEntity, state);
        }
        return new EmptyExtractor<>("empty");
    }

    protected Extractor<String, FileAwareInputStream> extractorForCopyableFile(FileSystem fs, CopyableFile cf,
            WorkUnitState state) throws IOException {
        return new FileAwareInputStreamExtractor(fs, cf, state);
    }

    @Override
    public void shutdown(SourceState state) {
    }

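    /**
     * Builds the source {@link FileSystem} from {@link ConfigurationKeys#SOURCE_FILEBASED_FS_URI}
     * (defaulting to the local file system), optionally wrapped with throttling.
     */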
    protected FileSystem getSourceFileSystem(State state) throws IOException {
        Configuration conf = HadoopUtils.getConfFromState(state,
                Optional.of(ConfigurationKeys.SOURCE_FILEBASED_ENCRYPTED_CONFIG_PATH));
        String uri = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
        return HadoopUtils.getOptionallyThrottledFileSystem(FileSystem.get(URI.create(uri), conf), state);
    }

    private static FileSystem getTargetFileSystem(State state) throws IOException {
        return HadoopUtils.getOptionallyThrottledFileSystem(WriterUtils.getWriterFS(state, 1, 0), state);
    }

    private static void setWorkUnitWeight(WorkUnit workUnit, CopyEntity copyEntity, long minWeight) {
        long weight = 0;
        if (copyEntity instanceof CopyableFile) {
            weight = ((CopyableFile) copyEntity).getOrigin().getLen();
        }
        weight = Math.max(weight, minWeight);
        workUnit.setProp(WORK_UNIT_WEIGHT, Long.toString(weight));
    }

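    /**
     * Derive a replicable guid from the work unit's converter classes and its serialized
     * {@link CopyEntity}, so re-planning the same copy yields the same guid.
     */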
    private static void computeAndSetWorkUnitGuid(WorkUnit workUnit) throws IOException {
        Guid guid = Guid.fromStrings(workUnit.contains(ConfigurationKeys.CONVERTER_CLASSES_KEY)
                ? workUnit.getProp(ConfigurationKeys.CONVERTER_CLASSES_KEY)
                : "");
        setWorkUnitGuid(workUnit, guid.append(deserializeCopyEntity(workUnit)));
    }

    /**
     * Set a unique, replicable guid for this work unit. Used for recovering partially successful work units.
     * @param state {@link State} where guid should be written.
     * @param guid the {@link Guid} to write.
     */
    public static void setWorkUnitGuid(State state, Guid guid) {
        state.setProp(WORK_UNIT_GUID, guid.toString());
    }

    /**
     * Get the guid in this state if available. This is the reverse operation of {@link #setWorkUnitGuid}.
     * @param state State from which the guid should be extracted.
     * @return the {@link Guid}, if present.
     * @throws IOException if the stored guid cannot be deserialized.
     */
    public static Optional<Guid> getWorkUnitGuid(State state) throws IOException {
        if (state.contains(WORK_UNIT_GUID)) {
            return Optional.of(Guid.deserialize(state.getProp(WORK_UNIT_GUID)));
        }
        return Optional.absent();
    }

    /**
     * Serialize a {@link CopyEntity} into a {@link State} at {@link #SERIALIZED_COPYABLE_FILE}, recording its
     * class name at {@link #COPY_ENTITY_CLASS}.
     */
    public static void serializeCopyEntity(State state, CopyEntity copyEntity) {
        state.setProp(SERIALIZED_COPYABLE_FILE, CopyEntity.serialize(copyEntity));
        state.setProp(COPY_ENTITY_CLASS, copyEntity.getClass().getName());
    }

    public static Class<?> getCopyEntityClass(State state) throws IOException {
        try {
            return Class.forName(state.getProp(COPY_ENTITY_CLASS));
        } catch (ClassNotFoundException cnfe) {
            throw new IOException(cnfe);
        }
    }

    /**
     * Deserialize a {@link CopyEntity} from a {@link State} at {@link #SERIALIZED_COPYABLE_FILE}
     */
    public static CopyEntity deserializeCopyEntity(State state) {
        return CopyEntity.deserialize(state.getProp(SERIALIZED_COPYABLE_FILE));
    }

    /**
     * Serialize a {@link CopyableDatasetMetadata} into a {@link State} at {@link #SERIALIZED_COPYABLE_DATASET}
     */
    public static void serializeCopyableDataset(State state, CopyableDatasetMetadata copyableDataset) {
        state.setProp(SERIALIZED_COPYABLE_DATASET, copyableDataset.serialize());
    }

    /**
     * Deserialize a {@link CopyableDatasetMetadata} from a {@link State} at {@link #SERIALIZED_COPYABLE_DATASET}
     */
    public static CopyableDatasetMetadata deserializeCopyableDataset(State state) {
        return CopyableDatasetMetadata.deserialize(state.getProp(SERIALIZED_COPYABLE_DATASET));
    }

    private void setWorkUnitWatermark(WorkUnit workUnit,
            Optional<CopyableFileWatermarkGenerator> watermarkGenerator, CopyEntity copyEntity) throws IOException {
        if (copyEntity instanceof CopyableFile) {
            Optional<WatermarkInterval> watermarkIntervalOptional = CopyableFileWatermarkHelper
                    .getCopyableFileWatermark((CopyableFile) copyEntity, watermarkGenerator);
            if (watermarkIntervalOptional.isPresent()) {
                workUnit.setWatermarkInterval(watermarkIntervalOptional.get());
            }
        }
    }
}
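
Usage

A minimal sketch of the guid round trip that CopySource uses to recover partially successful work units, assuming the classes in the listing above are on the classpath. The converter-class string below is purely illustrative.

import java.io.IOException;

import com.google.common.base.Optional;

import gobblin.configuration.State;
import gobblin.data.management.copy.CopySource;
import gobblin.util.guid.Guid;

public class WorkUnitGuidDemo {
    public static void main(String[] args) throws IOException {
        State state = new State();

        // Build a guid from identifying strings, as computeAndSetWorkUnitGuid does
        // with the converter classes and the serialized CopyEntity.
        Guid guid = Guid.fromStrings("some.example.ConverterClass");

        // Store it under CopySource.WORK_UNIT_GUID ...
        CopySource.setWorkUnitGuid(state, guid);

        // ... and read it back, as a recovering task would.
        Optional<Guid> recovered = CopySource.getWorkUnitGuid(state);
        System.out.println("guid present: " + recovered.isPresent());
    }
}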