org.apache.accumulo.core.clientImpl.BulkImport.java Source code

Introduction

Here is the source code for org.apache.accumulo.core.clientImpl.BulkImport.java, the client-side implementation of Accumulo's bulk import fluent API (the object returned by TableOperations.importDirectory).
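
For context, a typical invocation of the fluent API this class implements looks like the following minimal sketch. It assumes the Accumulo 2.x client entry point (Accumulo.newClient()); the properties file, directory, and table name are placeholders.

import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;

public class BulkImportExample {
    public static void main(String[] args) throws Exception {
        try (AccumuloClient client = Accumulo.newClient().from("client.properties").build()) {
            client.tableOperations().importDirectory("hdfs://namenode/tmp/bulk-dir")
                    .to("mytable")   // returns ImportMappingOptions
                    .tableTime()     // optional: assign tablet server time to the loaded keys
                    .threads(4)      // optional: parallelism used to map files to tablets
                    .load();         // validates the directory, computes and writes the
                                     // file-to-tablet mapping, then runs TABLE_BULK_IMPORT2
        }
    }
}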

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.core.clientImpl;

import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.stream.Collectors.groupingBy;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Stream;

import org.apache.accumulo.core.Constants;
import org.apache.accumulo.core.client.AccumuloException;
import org.apache.accumulo.core.client.AccumuloSecurityException;
import org.apache.accumulo.core.client.NamespaceExistsException;
import org.apache.accumulo.core.client.NamespaceNotFoundException;
import org.apache.accumulo.core.client.TableExistsException;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.client.admin.TableOperations.ImportDestinationArguments;
import org.apache.accumulo.core.client.admin.TableOperations.ImportMappingOptions;
import org.apache.accumulo.core.clientImpl.Bulk.FileInfo;
import org.apache.accumulo.core.clientImpl.Bulk.Files;
import org.apache.accumulo.core.clientImpl.Table.ID;
import org.apache.accumulo.core.conf.AccumuloConfiguration;
import org.apache.accumulo.core.conf.ClientProperty;
import org.apache.accumulo.core.conf.ConfigurationCopy;
import org.apache.accumulo.core.conf.ConfigurationTypeHelper;
import org.apache.accumulo.core.crypto.CryptoServiceFactory;
import org.apache.accumulo.core.data.ByteSequence;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.LoadPlan;
import org.apache.accumulo.core.data.LoadPlan.Destination;
import org.apache.accumulo.core.data.LoadPlan.RangeType;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.dataImpl.KeyExtent;
import org.apache.accumulo.core.file.FileOperations;
import org.apache.accumulo.core.file.FileSKVIterator;
import org.apache.accumulo.core.file.blockfile.impl.CachableBlockFile;
import org.apache.accumulo.core.master.thrift.FateOperation;
import org.apache.accumulo.core.spi.crypto.CryptoService;
import org.apache.accumulo.core.util.CachedConfiguration;
import org.apache.accumulo.core.volume.VolumeConfiguration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.Sets;

/**
 * Client-side implementation of the bulk import fluent API: computes a file-to-tablet mapping for
 * the files in a directory (or derives it from a {@link LoadPlan}), writes that mapping into the
 * directory, and then submits a TABLE_BULK_IMPORT2 FATE operation to load the files.
 */
public class BulkImport implements ImportDestinationArguments, ImportMappingOptions {

    private static final Logger log = LoggerFactory.getLogger(BulkImport.class);

    private boolean setTime = false;
    private Executor executor = null;
    private final String dir;
    private int numThreads = -1;

    private final ClientContext context;
    private String tableName;

    private LoadPlan plan = null;

    BulkImport(String directory, ClientContext context) {
        this.context = context;
        this.dir = Objects.requireNonNull(directory);
    }

    @Override
    public ImportMappingOptions tableTime() {
        this.setTime = true;
        return this;
    }

    @Override
    public void load() throws TableNotFoundException, IOException, AccumuloException, AccumuloSecurityException {

        Table.ID tableId = Tables.getTableId(context, tableName);

        Map<String, String> props = context.instanceOperations().getSystemConfiguration();
        AccumuloConfiguration conf = new ConfigurationCopy(props);

        FileSystem fs = VolumeConfiguration.getVolume(dir, CachedConfiguration.getInstance(), conf).getFileSystem();

        Path srcPath = checkPath(fs, dir);

        SortedMap<KeyExtent, Bulk.Files> mappings;
        if (plan == null) {
            mappings = computeMappingFromFiles(fs, tableId, srcPath);
        } else {
            mappings = computeMappingFromPlan(fs, tableId, srcPath);
        }

        BulkSerialize.writeLoadMapping(mappings, srcPath.toString(), fs::create);

        List<ByteBuffer> args = Arrays.asList(ByteBuffer.wrap(tableId.getUtf8()),
                ByteBuffer.wrap(srcPath.toString().getBytes(UTF_8)),
                ByteBuffer.wrap((setTime + "").getBytes(UTF_8)));
        doFateOperation(FateOperation.TABLE_BULK_IMPORT2, args, Collections.emptyMap(), tableName);
    }

    /**
     * Checks that the bulk import directory exists, is a directory, and is writable.
     */
    private Path checkPath(FileSystem fs, String dir) throws IOException, AccumuloException {
        Path ret;

        if (dir.contains(":")) {
            ret = new Path(dir);
        } else {
            ret = fs.makeQualified(new Path(dir));
        }

        try {
            if (!fs.getFileStatus(ret).isDirectory()) {
                throw new AccumuloException("Bulk import directory " + dir + " is not a directory!");
            }
            Path tmpFile = new Path(ret, "isWritable");
            if (fs.createNewFile(tmpFile))
                fs.delete(tmpFile, true);
            else
                throw new AccumuloException("Bulk import directory " + dir + " is not writable.");
        } catch (FileNotFoundException fnf) {
            throw new AccumuloException("Bulk import directory " + dir + " does not exist or has bad permissions",
                    fnf);
        }

        // TODO ensure dir does not contain bulk load mapping

        return ret;
    }

    @Override
    public ImportMappingOptions executor(Executor service) {
        this.executor = Objects.requireNonNull(service);
        return this;
    }

    @Override
    public ImportMappingOptions threads(int numThreads) {
        Preconditions.checkArgument(numThreads > 0, "Non-positive number of threads given: %s", numThreads);
        this.numThreads = numThreads;
        return this;
    }

    @Override
    public ImportMappingOptions plan(LoadPlan plan) {
        this.plan = plan;
        return this;
    }

    @Override
    public ImportMappingOptions to(String tableName) {
        this.tableName = Objects.requireNonNull(tableName);
        return this;
    }

    private static final byte[] byte0 = { 0 };

    // Mutable long holder, used to tally index entries per extent.
    private static class MLong {
        public MLong(long i) {
            l = i;
        }

        long l;
    }

    /**
     * Estimates how much of a file falls within each extent by counting the file's index entries
     * per extent and prorating the file size accordingly.
     */
    public static Map<KeyExtent, Long> estimateSizes(AccumuloConfiguration acuConf, Path mapFile, long fileSize,
            Collection<KeyExtent> extents, FileSystem ns, Cache<String, Long> fileLenCache, CryptoService cs)
            throws IOException {

        if (extents.size() == 1) {
            return Collections.singletonMap(extents.iterator().next(), fileSize);
        }

        long totalIndexEntries = 0;
        Map<KeyExtent, MLong> counts = new TreeMap<>();
        for (KeyExtent keyExtent : extents)
            counts.put(keyExtent, new MLong(0));

        Text row = new Text();

        FileSKVIterator index = FileOperations.getInstance().newIndexReaderBuilder()
                .forFile(mapFile.toString(), ns, ns.getConf(), cs).withTableConfiguration(acuConf)
                .withFileLenCache(fileLenCache).build();

        try {
            while (index.hasTop()) {
                Key key = index.getTopKey();
                totalIndexEntries++;
                key.getRow(row);

                // TODO this could use a binary search
                for (Entry<KeyExtent, MLong> entry : counts.entrySet())
                    if (entry.getKey().contains(row))
                        entry.getValue().l++;

                index.next();
            }
        } finally {
            try {
                index.close();
            } catch (IOException e) {
                log.debug("Failed to close " + mapFile, e);
            }
        }

        Map<KeyExtent, Long> results = new TreeMap<>();
        for (KeyExtent keyExtent : extents) {
            double numEntries = counts.get(keyExtent).l;
            if (numEntries == 0)
                numEntries = 1;
            long estSize = (long) ((numEntries / totalIndexEntries) * fileSize);
            results.put(keyExtent, estSize);
        }
        return results;
    }
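
    // Worked example for estimateSizes: for a 1000-byte file whose index holds 10 entries, an
    // extent containing 4 of those entries is estimated at (4.0 / 10) * 1000 = 400 bytes; an
    // extent matching no index entries is counted as one entry, so its estimate is never zero.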

    /** Looks up (and caches) the tablet extent containing a given row. */
    public interface KeyExtentCache {
        KeyExtent lookup(Text row)
                throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException;
    }

    /**
     * Determines which tablets a file overlaps by repeatedly seeking the reader to a row, looking
     * up the extent containing that row, and then seeking just past that extent's end row.
     */
    public static List<KeyExtent> findOverlappingTablets(ClientContext context, KeyExtentCache extentCache,
            FileSKVIterator reader)
            throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {

        List<KeyExtent> result = new ArrayList<>();
        Collection<ByteSequence> columnFamilies = Collections.emptyList();
        // start scanning from the beginning of the file
        Text row = new Text();
        while (true) {
            reader.seek(new Range(row, null), columnFamilies, false);
            if (!reader.hasTop()) {
                break;
            }
            row = reader.getTopKey().getRow();
            KeyExtent extent = extentCache.lookup(row);
            result.add(extent);
            row = extent.getEndRow();
            if (row != null) {
                row = nextRow(row);
            } else {
                break;
            }
        }

        return result;
    }

    // Returns the smallest possible row that sorts after the given row, by appending a zero byte.
    private static Text nextRow(Text row) {
        Text next = new Text(row);
        next.append(byte0, 0, byte0.length);
        return next;
    }

    public static List<KeyExtent> findOverlappingTablets(ClientContext context, KeyExtentCache extentCache,
            Path file, FileSystem fs, Cache<String, Long> fileLenCache, CryptoService cs)
            throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {
        try (FileSKVIterator reader = FileOperations.getInstance().newReaderBuilder()
                .forFile(file.toString(), fs, fs.getConf(), cs).withTableConfiguration(context.getConfiguration())
                .withFileLenCache(fileLenCache).seekToBeginning().build()) {
            return findOverlappingTablets(context, extentCache, reader);
        }
    }

    private static Map<String, Long> getFileLenMap(List<FileStatus> statuses) {
        HashMap<String, Long> fileLens = new HashMap<>();
        for (FileStatus status : statuses) {
            fileLens.put(status.getPath().getName(), status.getLen());
        }

        return fileLens;
    }

    private static Cache<String, Long> getPopulatedFileLenCache(Path dir, List<FileStatus> statuses) {
        Map<String, Long> fileLens = getFileLenMap(statuses);

        Map<String, Long> absFileLens = new HashMap<>();
        fileLens.forEach((k, v) -> {
            absFileLens.put(CachableBlockFile.pathToCacheId(new Path(dir, k)), v);
        });

        Cache<String, Long> fileLenCache = CacheBuilder.newBuilder().build();

        fileLenCache.putAll(absFileLens);

        return fileLenCache;
    }

    /**
     * Computes the file-to-tablet mapping from the user-supplied {@link LoadPlan} rather than by
     * inspecting the files themselves.
     */
    private SortedMap<KeyExtent, Files> computeMappingFromPlan(FileSystem fs, ID tableId, Path srcPath)
            throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {

        Map<String, List<Destination>> fileDestinations = plan.getDestinations().stream()
                .collect(groupingBy(Destination::getFileName));

        List<FileStatus> statuses = filterInvalid(
                fs.listStatus(srcPath, p -> !p.getName().equals(Constants.BULK_LOAD_MAPPING)));

        Map<String, Long> fileLens = getFileLenMap(statuses);

        if (!fileDestinations.keySet().equals(fileLens.keySet())) {
            throw new IllegalArgumentException(
                    "Load plan files differ from directory files, symmetric difference : "
                            + Sets.symmetricDifference(fileDestinations.keySet(), fileLens.keySet()));
        }

        KeyExtentCache extentCache = new ConcurrentKeyExtentCache(tableId, context);

        // Pre-populate cache by looking up all end rows in sorted order. Doing this in sorted order
        // leverages read ahead.
        fileDestinations.values().stream().flatMap(List::stream)
                .filter(dest -> dest.getRangeType() == RangeType.FILE)
                .flatMap(dest -> Stream.of(dest.getStartRow(), dest.getEndRow())).filter(row -> row != null)
                .map(Text::new).sorted().distinct().forEach(row -> {
                    try {
                        extentCache.lookup(row);
                    } catch (Exception e) {
                        throw new RuntimeException(e);
                    }
                });

        SortedMap<KeyExtent, Files> mapping = new TreeMap<>();

        for (Entry<String, List<Destination>> entry : fileDestinations.entrySet()) {
            String fileName = entry.getKey();
            List<Destination> destinations = entry.getValue();
            Set<KeyExtent> extents = mapDestinationsToExtents(tableId, extentCache, destinations);

            long estSize = (long) (fileLens.get(fileName) / (double) extents.size());

            for (KeyExtent keyExtent : extents) {
                mapping.computeIfAbsent(keyExtent, k -> new Files()).add(new FileInfo(fileName, estSize, 0));
            }
        }

        return mergeOverlapping(mapping);
    }

    private Text toText(byte[] row) {
        return row == null ? null : new Text(row);
    }

    private Set<KeyExtent> mapDestinationsToExtents(Table.ID tableId, KeyExtentCache kec,
            List<Destination> destinations)
            throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException {
        Set<KeyExtent> extents = new HashSet<>();

        for (Destination dest : destinations) {

            if (dest.getRangeType() == RangeType.TABLE) {
                extents.add(new KeyExtent(tableId, toText(dest.getEndRow()), toText(dest.getStartRow())));
            } else if (dest.getRangeType() == RangeType.FILE) {
                Text startRow = new Text(dest.getStartRow());
                Text endRow = new Text(dest.getEndRow());

                KeyExtent extent = kec.lookup(startRow);

                extents.add(extent);

                while (!extent.contains(endRow) && extent.getEndRow() != null) {
                    extent = kec.lookup(nextRow(extent.getEndRow()));
                    extents.add(extent);
                }

            } else {
                throw new IllegalStateException();
            }
        }

        return extents;
    }

    /**
     * Computes the file-to-tablet mapping by examining the files, using the configured executor or
     * a thread pool created just for this call.
     */
    private SortedMap<KeyExtent, Bulk.Files> computeMappingFromFiles(FileSystem fs, Table.ID tableId, Path dirPath)
            throws IOException {

        Executor executor;
        ExecutorService service = null;

        if (this.executor != null) {
            executor = this.executor;
        } else if (numThreads > 0) {
            executor = service = Executors.newFixedThreadPool(numThreads);
        } else {
            String threads = context.getConfiguration().get(ClientProperty.BULK_LOAD_THREADS.getKey());
            executor = service = Executors.newFixedThreadPool(ConfigurationTypeHelper.getNumThreads(threads));
        }

        try {
            return computeFileToTabletMappings(fs, tableId, dirPath, executor, context);
        } finally {
            if (service != null) {
                service.shutdown();
            }
        }
    }

    private static List<FileStatus> filterInvalid(FileStatus[] files) {
        ArrayList<FileStatus> fileList = new ArrayList<>(files.length);

        for (FileStatus fileStatus : files) {

            String fname = fileStatus.getPath().getName();

            if (fname.equals("_SUCCESS") || fname.equals("_logs")) {
                log.debug("Ignoring file likely created by map reduce : {}", fileStatus.getPath());
                continue;
            }

            if (fileStatus.isDirectory()) {
                log.warn("{} is a directory, ignoring.", fileStatus.getPath());
                continue;
            }

            String[] sa = fname.split("\\.");
            String extension = "";
            if (sa.length > 1) {
                extension = sa[sa.length - 1];
            }

            if (!FileOperations.getValidExtensions().contains(extension)) {
                log.warn("{} does not have a valid extension, ignoring", fileStatus.getPath());
                continue;
            }

            fileList.add(fileStatus);
        }

        return fileList;
    }

    /**
     * Examines each file in parallel to find the tablets it overlaps and to estimate its size per
     * tablet, then combines the per-file results into a single mapping.
     */
    public static SortedMap<KeyExtent, Bulk.Files> computeFileToTabletMappings(FileSystem fs, Table.ID tableId,
            Path dirPath, Executor executor, ClientContext context) throws IOException {

        KeyExtentCache extentCache = new ConcurrentKeyExtentCache(tableId, context);

        List<FileStatus> files = filterInvalid(
                fs.listStatus(dirPath, p -> !p.getName().equals(Constants.BULK_LOAD_MAPPING)));

        // we know all of the file lens, so construct a cache and populate it in order to avoid later
        // trips to the namenode
        Cache<String, Long> fileLensCache = getPopulatedFileLenCache(dirPath, files);

        List<CompletableFuture<Map<KeyExtent, Bulk.FileInfo>>> futures = new ArrayList<>();

        CryptoService cs = CryptoServiceFactory.newDefaultInstance();

        for (FileStatus fileStatus : files) {
            CompletableFuture<Map<KeyExtent, Bulk.FileInfo>> future = CompletableFuture.supplyAsync(() -> {
                try {
                    long t1 = System.currentTimeMillis();
                    List<KeyExtent> extents = findOverlappingTablets(context, extentCache, fileStatus.getPath(), fs,
                            fileLensCache, cs);
                    Map<KeyExtent, Long> estSizes = estimateSizes(context.getConfiguration(), fileStatus.getPath(),
                            fileStatus.getLen(), extents, fs, fileLensCache, cs);
                    Map<KeyExtent, Bulk.FileInfo> pathLocations = new HashMap<>();
                    for (KeyExtent ke : extents) {
                        pathLocations.put(ke,
                                new Bulk.FileInfo(fileStatus.getPath(), estSizes.getOrDefault(ke, 0L)));
                    }
                    long t2 = System.currentTimeMillis();
                    log.trace("Mapped {} to {} tablets in {}ms", fileStatus.getPath(), pathLocations.size(),
                            t2 - t1);
                    return pathLocations;
                } catch (Exception e) {
                    throw new CompletionException(e);
                }
            }, executor);

            futures.add(future);
        }

        SortedMap<KeyExtent, Bulk.Files> mappings = new TreeMap<>();

        for (CompletableFuture<Map<KeyExtent, Bulk.FileInfo>> future : futures) {
            try {
                Map<KeyExtent, Bulk.FileInfo> pathMapping = future.get();
                pathMapping.forEach((extent, fileInfo) -> {
                    mappings.computeIfAbsent(extent, k -> new Bulk.Files()).add(fileInfo);
                });
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            } catch (ExecutionException e) {
                throw new RuntimeException(e);
            }
        }

        return mergeOverlapping(mappings);
    }

    // Handles splits that happen while files are being examined: mappings for smaller tablets are
    // merged into the mapping of the larger tablet that contains them. For example, if the map
    // contains an entry for (a,c] as well as its post-split children (a,b] and (b,c], the
    // children's files are merged into the (a,c] entry.
    static SortedMap<KeyExtent, Bulk.Files> mergeOverlapping(SortedMap<KeyExtent, Bulk.Files> mappings) {
        List<KeyExtent> extents = new ArrayList<>(mappings.keySet());

        for (KeyExtent ke : extents) {
            Set<KeyExtent> overlapping = KeyExtent.findOverlapping(ke, mappings);
            for (KeyExtent oke : overlapping) {
                if (ke.equals(oke)) {
                    continue;
                }

                boolean containsPrevRow = ke.getPrevEndRow() == null
                        || (oke.getPrevEndRow() != null && ke.getPrevEndRow().compareTo(oke.getPrevEndRow()) <= 0);
                boolean containsEndRow = ke.getEndRow() == null
                        || (oke.getEndRow() != null && ke.getEndRow().compareTo(oke.getEndRow()) >= 0);

                if (containsPrevRow && containsEndRow) {
                    mappings.get(ke).merge(mappings.remove(oke));
                } else {
                    throw new RuntimeException("TODO handle merges");
                }
            }
        }

        return mappings;
    }

    private String doFateOperation(FateOperation op, List<ByteBuffer> args, Map<String, String> opts,
            String tableName) throws AccumuloSecurityException, AccumuloException {
        try {
            return new TableOperationsImpl(context).doFateOperation(op, args, opts, tableName);
        } catch (TableExistsException | TableNotFoundException | NamespaceNotFoundException
                | NamespaceExistsException e) {
            // should not happen
            throw new AssertionError(e);
        }
    }
}
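
When the rows in each file are already known, a LoadPlan can be supplied through plan(LoadPlan), so that load() uses computeMappingFromPlan above instead of opening and inspecting every file. Below is a minimal sketch, assuming the 2.x LoadPlan builder and a loadFileTo overload that accepts string rows; the file names, rows, directory, and table are placeholders.

import org.apache.accumulo.core.client.Accumulo;
import org.apache.accumulo.core.client.AccumuloClient;
import org.apache.accumulo.core.data.LoadPlan;
import org.apache.accumulo.core.data.LoadPlan.RangeType;

public class LoadPlanExample {
    public static void main(String[] args) throws Exception {
        // RangeType.FILE ranges give the first and last rows present in a file; RangeType.TABLE
        // ranges name an existing tablet's (prevEndRow, endRow] range.
        LoadPlan loadPlan = LoadPlan.builder()
                .loadFileTo("f1.rf", RangeType.FILE, "row1", "row3")
                .loadFileTo("f2.rf", RangeType.TABLE, "row3", "row7")
                .build();

        try (AccumuloClient client = Accumulo.newClient().from("client.properties").build()) {
            client.tableOperations().importDirectory("hdfs://namenode/tmp/bulk-dir")
                    .to("mytable").plan(loadPlan).load();
        }
    }
}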