org.apache.apex.malhar.lib.fs.s3.S3BlockUploadOperator.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.apex.malhar.lib.fs.s3.S3BlockUploadOperator.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.apex.malhar.lib.fs.s3;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.validation.constraints.NotNull;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.apex.malhar.lib.io.block.AbstractBlockReader;
import org.apache.apex.malhar.lib.io.block.BlockMetadata;
import org.apache.apex.malhar.lib.wal.FSWindowDataManager;
import org.apache.apex.malhar.lib.wal.WindowDataManager;
import org.apache.hadoop.classification.InterfaceStability;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PartETag;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.s3.model.PutObjectResult;
import com.amazonaws.services.s3.model.UploadPartRequest;
import com.esotericsoftware.kryo.serializers.FieldSerializer;
import com.esotericsoftware.kryo.serializers.JavaSerializer;
import com.google.common.base.Preconditions;

import com.datatorrent.api.Context;
import com.datatorrent.api.DefaultInputPort;
import com.datatorrent.api.DefaultOutputPort;
import com.datatorrent.api.Operator;
import com.datatorrent.netlet.util.Slice;

/**
 * This operator uploads the blocks of a file into an S3 bucket, using either the multi-part upload feature or the putObject API.
 * The multi-part feature is used only when a file consists of more than one block; a file with a single block is uploaded with putObject.
 * This operator is useful in the context of the S3 Output Module.
 *
 * @since 3.7.0
 */

@InterfaceStability.Evolving
public class S3BlockUploadOperator
        implements Operator, Operator.CheckpointNotificationListener, Operator.IdleTimeHandler {
    private static final Logger LOG = LoggerFactory.getLogger(S3BlockUploadOperator.class);
    // Name of the S3 bucket the blocks are uploaded into.
    @NotNull
    private String bucketName;
    // AWS access key used by createClient() to build the credentials.
    @NotNull
    private String accessKey;
    // AWS secret access key used by createClient() to build the credentials.
    @NotNull
    private String secretAccessKey;
    // Optional S3 endpoint; createClient() applies it to the client only when it is non-null.
    private String endPoint;
    // Unique block id (see getUniqueBlockIdFromFile) -> S3 upload details of that block.
    // Filled by processUploadFileMetadata(); entries are removed in endWindow() and replay().
    private Map<String, S3BlockMetaData> blockInfo = new HashMap<>();
    // Block id -> file path, as received on blockMetadataInput. Cleared at the end of every window.
    private transient Map<Long, String> blockIdToFilePath = new HashMap<>();
    // Stores the per-window upload results (saved in endWindow(), read back in replay()).
    private WindowDataManager windowDataManager = new FSWindowDataManager();
    // S3 client, created in setup() through createClient().
    protected transient AmazonS3 s3Client;
    // Id of the window currently being processed; set in beginWindow().
    private transient long currentWindowId;
    // Block tuples that could not be uploaded yet because their block metadata or file upload
    // metadata was missing; retried by processWaitBlocks().
    private transient List<AbstractBlockReader.ReaderRecord<Slice>> waitingTuples;
    // Unique block id -> upload result for the blocks uploaded in the current window;
    // saved to the window data manager and cleared in endWindow().
    private transient Map<String, UploadBlockMetadata> currentWindowRecoveryState;
    // Emits the upload result of every uploaded block; results are emitted again from replay().
    public final transient DefaultOutputPort<UploadBlockMetadata> output = new DefaultOutputPort<>();

    /**
     * Input port for block data. Every received record is passed straight to
     * {@link #uploadBlockIntoS3}.
     */
    public final transient DefaultInputPort<AbstractBlockReader.ReaderRecord<Slice>> blockInput =
            new DefaultInputPort<AbstractBlockReader.ReaderRecord<Slice>>() {
                @Override
                public void process(AbstractBlockReader.ReaderRecord<Slice> blockRecord) {
                    uploadBlockIntoS3(blockRecord);
                }
            };

    /**
     * Input port for block metadata. Outside of replayed windows it remembers which file
     * path each received block id belongs to.
     */
    public final transient DefaultInputPort<BlockMetadata.FileBlockMetadata> blockMetadataInput =
            new DefaultInputPort<BlockMetadata.FileBlockMetadata>() {
                @Override
                public void process(BlockMetadata.FileBlockMetadata blockMetadata) {
                    // Windows up to the largest completed window are replayed from saved state,
                    // so block metadata arriving in them is ignored.
                    boolean replayedWindow = currentWindowId <= windowDataManager.getLargestCompletedWindow();
                    if (!replayedWindow) {
                        blockIdToFilePath.put(blockMetadata.getBlockId(), blockMetadata.getFilePath());
                        LOG.debug("received blockId {} for file {} ", blockMetadata.getBlockId(),
                                blockMetadata.getFilePath());
                    }
                }
            };

    /**
     * Input port for the upload metadata of a file. Every received tuple is passed to
     * {@link #processUploadFileMetadata}.
     */
    public final transient DefaultInputPort<S3InitiateFileUploadOperator.UploadFileMetadata> uploadMetadataInput =
            new DefaultInputPort<S3InitiateFileUploadOperator.UploadFileMetadata>() {
                @Override
                public void process(S3InitiateFileUploadOperator.UploadFileMetadata fileUploadMetadata) {
                    processUploadFileMetadata(fileUploadMetadata);
                }
            };

    /**
     * Registers an {@link S3BlockMetaData} entry for each block of the given file and marks
     * the file's last block.
     * <p>
     * Blocks are registered in order, with part numbers starting at 1. Registration stops at
     * the first block that is already present in {@code blockInfo}. Because of that early
     * stop, the last block of the file is not guaranteed to be present afterwards (its entry
     * may never have been added, or may already have been removed after upload), so it is
     * flagged as last block only when an entry for it exists.
     *
     * @param tuple UploadFileMetadata carrying the file metadata, key name and upload id
     */
    protected void processUploadFileMetadata(S3InitiateFileUploadOperator.UploadFileMetadata tuple) {
        long[] blocks = tuple.getFileMetadata().getBlockIds();
        if (blocks == null || blocks.length == 0) {
            // Nothing to register for a file without blocks.
            return;
        }
        String filePath = tuple.getFileMetadata().getFilePath();
        for (int i = 0; i < blocks.length; i++) {
            String blockId = getUniqueBlockIdFromFile(blocks[i], filePath);
            if (blockInfo.get(blockId) != null) {
                break;
            }
            // S3 part numbers are 1-based.
            blockInfo.put(blockId, new S3BlockMetaData(tuple.getKeyName(), tuple.getUploadId(), i + 1));
        }
        S3BlockMetaData lastBlock = blockInfo.get(getUniqueBlockIdFromFile(blocks[blocks.length - 1], filePath));
        if (lastBlock != null) {
            lastBlock.setLastBlock(true);
        }
    }

    /**
     * Builds the key that identifies one block of one file: the decimal block id, followed
     * by a {@code '|'} separator, followed by the file path.
     *
     * @param blockId Id of the block
     * @param filepath path of the file the block belongs to
     * @return unique block id of the form {@code <blockId>|<filepath>}
     */
    public static String getUniqueBlockIdFromFile(long blockId, String filepath) {
        StringBuilder uniqueId = new StringBuilder();
        uniqueId.append(blockId);
        uniqueId.append("|");
        uniqueId.append(filepath);
        return uniqueId.toString();
    }

    /**
     * Initializes the transient per-run state, sets up the window data manager and creates
     * the S3 client.
     *
     * @param context operator context, handed to the window data manager
     */
    @Override
    public void setup(Context.OperatorContext context) {
        // Fresh transient bookkeeping for this run.
        currentWindowRecoveryState = new HashMap<>();
        waitingTuples = new ArrayList<>();

        windowDataManager.setup(context);
        s3Client = createClient();
    }

    /**
     * Builds the AmazonS3 client from the configured access key and secret access key.
     * The configured end point is applied only when one has been set.
     *
     * @return AmazonS3
     */
    protected AmazonS3 createClient() {
        BasicAWSCredentials credentials = new BasicAWSCredentials(accessKey, secretAccessKey);
        AmazonS3 s3 = new AmazonS3Client(credentials);
        if (endPoint == null) {
            return s3;
        }
        s3.setEndpoint(endPoint);
        return s3;
    }

    /**
     * Records the id of the window being started and, when that window is not newer than
     * the largest completed window, replays its saved state.
     *
     * @param windowId id of the window being started
     */
    @Override
    public void beginWindow(long windowId) {
        currentWindowId = windowId;
        boolean alreadyCompleted = windowDataManager.getLargestCompletedWindow() >= windowId;
        if (alreadyCompleted) {
            replay(windowId);
        }
    }

    /**
     * Replays the given window from the state saved by the window data manager: every saved
     * upload result is emitted again and the matching entry is dropped from {@code blockInfo}.
     * Nothing happens when no state was saved for the window.
     *
     * @param windowId replay window Id
     * @throws RuntimeException if the saved state cannot be retrieved
     */
    protected void replay(long windowId) {
        try {
            @SuppressWarnings("unchecked")
            Map<String, UploadBlockMetadata> savedUploads = (Map<String, UploadBlockMetadata>) windowDataManager
                    .retrieve(windowId);
            if (savedUploads != null) {
                for (Map.Entry<String, UploadBlockMetadata> saved : savedUploads.entrySet()) {
                    UploadBlockMetadata uploadResult = saved.getValue();
                    String uniqueBlockId = saved.getKey();
                    output.emit(uploadResult);
                    blockInfo.remove(uniqueBlockId);
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Finishes the current window:
     * <ol>
     * <li>retries the waiting blocks,</li>
     * <li>drops the bookkeeping of every block uploaded in this window,</li>
     * <li>logs and forgets the block ids whose block was not uploaded,</li>
     * <li>saves the upload results of this window, unless the window is a replayed one,</li>
     * <li>clears the per-window upload results.</li>
     * </ol>
     *
     * @throws RuntimeException if the upload results cannot be saved
     */
    @Override
    public void endWindow() {
        if (!waitingTuples.isEmpty()) {
            processWaitBlocks();
        }

        for (String uploadedKey : currentWindowRecoveryState.keySet()) {
            // The key has the form "<blockId>|<filePath>"; the block id ends at the first '|'.
            int separatorIndex = uploadedKey.indexOf("|");
            long blockId = Long.parseLong(uploadedKey.substring(0, separatorIndex));
            LOG.debug("Successfully uploaded {} block", blockId);
            blockIdToFilePath.remove(blockId);
            blockInfo.remove(uploadedKey);
        }

        // Whatever is still mapped belongs to blocks that were not uploaded in this window.
        if (!blockIdToFilePath.isEmpty()) {
            for (Long pendingBlockId : blockIdToFilePath.keySet()) {
                LOG.info("Unable to uploaded {} block", pendingBlockId);
            }
            blockIdToFilePath.clear();
        }

        boolean notYetCompleted = currentWindowId > windowDataManager.getLargestCompletedWindow();
        if (notYetCompleted) {
            try {
                windowDataManager.save(currentWindowRecoveryState, currentWindowId);
            } catch (IOException e) {
                throw new RuntimeException("Unable to save recovery", e);
            }
        }
        currentWindowRecoveryState.clear();
    }

    /**
     * Tears down the window data manager. No other resource is released here.
     */
    @Override
    public void teardown() {
        windowDataManager.teardown();
    }

    /**
     * Retries the waiting blocks. A waiting block is handed to {@link #uploadBlockIntoS3}
     * and taken off the waiting list once both its file path and its S3 block metadata are
     * known; all other blocks stay on the list.
     */
    private void processWaitBlocks() {
        for (Iterator<AbstractBlockReader.ReaderRecord<Slice>> pending = waitingTuples.iterator(); pending
                .hasNext();) {
            AbstractBlockReader.ReaderRecord<Slice> candidate = pending.next();
            String path = blockIdToFilePath.get(candidate.getBlockId());
            if (path == null) {
                // Block metadata has not been received yet.
                continue;
            }
            String uniqueBlockId = getUniqueBlockIdFromFile(candidate.getBlockId(), path);
            if (blockInfo.get(uniqueBlockId) == null) {
                // File upload metadata has not been received yet.
                continue;
            }
            uploadBlockIntoS3(candidate);
            pending.remove();
        }
    }

    /**
     * Uploads one block into the S3 bucket and emits the resulting {@link UploadBlockMetadata}.
     * <p>
     * Nothing is done for windows that are being replayed. When either the file path of the
     * block or the S3 metadata of the block is not known yet, the tuple is put on the waiting
     * list (once) and retried later by {@code processWaitBlocks()}.
     * <p>
     * A block that is both the first and the last block of its file is stored with a single
     * putObject call; every other block is uploaded as a part of the file's multi-part upload.
     * Only the {@code [offset, offset + length)} region of the record's backing buffer is
     * uploaded, which matches the part size / content length sent to S3.
     *
     * @param tuple block data
     * @throws RuntimeException if uploading a part of a multi-part upload fails
     */
    protected void uploadBlockIntoS3(AbstractBlockReader.ReaderRecord<Slice> tuple) {
        if (currentWindowId <= windowDataManager.getLargestCompletedWindow()) {
            return;
        }
        // Resolve the block metadata (file path) and then the file upload metadata.
        String filePath = blockIdToFilePath.get(tuple.getBlockId());
        String uniqueBlockId = null;
        S3BlockMetaData metaData = null;
        if (filePath != null) {
            uniqueBlockId = getUniqueBlockIdFromFile(tuple.getBlockId(), filePath);
            metaData = blockInfo.get(uniqueBlockId);
        }
        if (metaData == null) {
            // One of the two pieces of metadata is still missing: park the block for a retry.
            if (!waitingTuples.contains(tuple)) {
                waitingTuples.add(tuple);
            }
            return;
        }
        Slice block = tuple.getRecord();
        long partSize = block.length;
        PartETag partETag;
        // The slice may start anywhere inside its backing buffer, so honour offset and length.
        try (ByteArrayInputStream bis = new ByteArrayInputStream(block.buffer, block.offset, block.length)) {
            // Check if it is a Single block of a file
            if (metaData.isLastBlock() && metaData.getPartNo() == 1) {
                ObjectMetadata omd = createObjectMetadata();
                omd.setContentLength(partSize);
                PutObjectResult result = s3Client
                        .putObject(new PutObjectRequest(bucketName, metaData.getKeyName(), bis, omd));
                partETag = new PartETag(1, result.getETag());
            } else {
                // Else upload use multi-part feature
                try {
                    // Create request to upload a part.
                    UploadPartRequest uploadRequest = new UploadPartRequest().withBucketName(bucketName)
                            .withKey(metaData.getKeyName()).withUploadId(metaData.getUploadId())
                            .withPartNumber(metaData.getPartNo()).withInputStream(bis).withPartSize(partSize);
                    partETag = s3Client.uploadPart(uploadRequest).getPartETag();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }
        } catch (IOException e) {
            // Only closing the stream can raise this.
            throw new RuntimeException(e);
        }
        UploadBlockMetadata uploadmetadata = new UploadBlockMetadata(partETag, metaData.getKeyName());
        output.emit(uploadmetadata);
        // Remember the result so that it can be saved for this window in endWindow().
        currentWindowRecoveryState.put(uniqueBlockId, uploadmetadata);
    }

    /**
     * Creates an empty object metadata instance. Within this class it is used by
     * {@code uploadBlockIntoS3} for the putObject request of a single-block file; the caller
     * sets the content length on the returned object.
     * @return the ObjectMetadata
     */
    public ObjectMetadata createObjectMetadata() {
        return new ObjectMetadata();
    }

    /**
     * Checkpoint notification; this operator takes no action before a checkpoint.
     *
     * @param windowId window id passed with the notification (unused)
     */
    @Override
    public void beforeCheckpoint(long windowId) {
        // Intentionally empty.
    }

    /**
     * Checkpoint notification; this operator takes no action when a checkpoint is taken.
     *
     * @param windowId window id passed with the notification (unused)
     */
    @Override
    public void checkpointed(long windowId) {
        // Intentionally empty.
    }

    /**
     * Forwards the committed notification to the window data manager.
     *
     * @param windowId id of the committed window
     * @throws RuntimeException if the window data manager fails with an IOException
     */
    @Override
    public void committed(long windowId) {
        try {
            windowDataManager.committed(windowId);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Idle-time callback: retries the waiting blocks, if there are any.
     */
    @Override
    public void handleIdleTime() {
        if (waitingTuples.isEmpty()) {
            return;
        }
        processWaitBlocks();
    }

    /**
     * Upload block metadata consists of partETag and key name.
     * <p>
     * Note that only {@link #hashCode()} is overridden; {@code equals} keeps the identity
     * semantics inherited from {@link Object}.
     */
    public static class UploadBlockMetadata {
        // PartETag is bound to Kryo's JavaSerializer for serialization.
        @FieldSerializer.Bind(JavaSerializer.class)
        private PartETag partETag;
        private String keyName;

        // For Kryo
        public UploadBlockMetadata() {
        }

        /**
         * @param partETag part ETag returned by S3 for the uploaded block
         * @param keyName key name of the file the block belongs to
         */
        public UploadBlockMetadata(PartETag partETag, String keyName) {
            this.partETag = partETag;
            this.keyName = keyName;
        }

        /**
         * Get the partETag of the block
         * @return the partETag
         */
        public PartETag getPartETag() {
            return partETag;
        }

        /**
         * Return the key name of the file
         * @return key name
         */
        public String getKeyName() {
            return keyName;
        }

        /**
         * Hash code derived from the key name only.
         * @return hash code of the key name, or 0 when no key name is set (for example on an
         *         instance created through the no-arg constructor)
         */
        @Override
        public int hashCode() {
            return keyName == null ? 0 : keyName.hashCode();
        }
    }

    /**
     * Upload details of a single block: the key name of the target file, the upload id, the
     * part number of the block and a flag telling whether it is the last block of the file.
     */
    public static class S3BlockMetaData {
        private String keyName;
        private String uploadId;
        private Integer partNo;
        private boolean isLastBlock;

        /**
         * No-arg constructor, needed for Kryo Serialization.
         */
        public S3BlockMetaData() {
        }

        /**
         * Creates the metadata of a block that is not (yet) marked as last block.
         *
         * @param keyName key name of the file
         * @param uploadId upload id of the block
         * @param partNo part number of the block
         */
        public S3BlockMetaData(String keyName, String uploadId, Integer partNo) {
            this.partNo = partNo;
            this.uploadId = uploadId;
            this.keyName = keyName;
            // isLastBlock stays at its default (false) until setLastBlock(true) is called.
        }

        /**
         * @return key name of the file
         */
        public String getKeyName() {
            return this.keyName;
        }

        /**
         * @return upload id of the block
         */
        public String getUploadId() {
            return this.uploadId;
        }

        /**
         * @return part number of the block
         */
        public Integer getPartNo() {
            return this.partNo;
        }

        /**
         * @return true when the block is the last block, false otherwise
         */
        public boolean isLastBlock() {
            return this.isLastBlock;
        }

        /**
         * Marks the block as last block or not.
         * @param lastBlock Specifies whether the block is last or not
         */
        public void setLastBlock(boolean lastBlock) {
            this.isLastBlock = lastBlock;
        }
    }

    /**
     * Returns the name of the S3 bucket into which the blocks are uploaded.
     * @return bucket name
     */
    public String getBucketName() {
        return bucketName;
    }

    /**
     * Sets the name of the S3 bucket into which the blocks are uploaded.
     * @param bucketName bucket name, must not be null
     */
    public void setBucketName(@NotNull String bucketName) {
        // Reject null before touching the field.
        Preconditions.checkNotNull(bucketName);
        this.bucketName = bucketName;
    }

    /**
     * Returns the AWS access key.
     * @return access key
     */
    public String getAccessKey() {
        return accessKey;
    }

    /**
     * Sets the AWS access key used to create the S3 client.
     * @param accessKey access key, must not be null
     */
    public void setAccessKey(@NotNull String accessKey) {
        // Reject null before touching the field.
        Preconditions.checkNotNull(accessKey);
        this.accessKey = accessKey;
    }

    /**
     * Returns the AWS secret access key.
     * @return secret access key
     */
    public String getSecretAccessKey() {
        return secretAccessKey;
    }

    /**
     * Sets the AWS secret access key used to create the S3 client.
     * @param secretAccessKey secret access key, must not be null
     */
    public void setSecretAccessKey(@NotNull String secretAccessKey) {
        // Reject null before touching the field.
        Preconditions.checkNotNull(secretAccessKey);
        this.secretAccessKey = secretAccessKey;
    }

    /**
     * Returns the configured S3 end point.
     * @return S3 end point, or null when none has been set
     */
    public String getEndPoint() {
        return endPoint;
    }

    /**
     * Sets the S3 end point. The value is applied to the S3 client by {@code createClient()},
     * and only when it is not null.
     * @param endPoint S3 end point, may be null
     */
    public void setEndPoint(String endPoint) {
        this.endPoint = endPoint;
    }
}