org.commoncrawl.util.S3BulkTransferUtil.java Source code

Introduction

Here is the source code for org.commoncrawl.util.S3BulkTransferUtil.java, a CommonCrawl utility that lists objects under a set of S3 path prefixes, downloads them in parallel into HDFS, verifies file sizes against the S3 listing, and re-queues any failed or mismatched transfers.
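
As a quick orientation before the listing: the tool is driven from the command line through main(). The sketch below is a hypothetical invocation; the jar name and every argument value are placeholders, while the option names (-awsKey, -awsSecret, -bucket, -outputPath, -paths) come from the source itself.

    hadoop jar commoncrawl-tools.jar org.commoncrawl.util.S3BulkTransferUtil \
        -awsKey <AWS_ACCESS_KEY> \
        -awsSecret <AWS_SECRET_KEY> \
        -bucket <BUCKET_NAME> \
        -outputPath /data/s3import \
        -paths '["path/to/segment/"]'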

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.IOException;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.ConcurrentSkipListMap;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.util.Tuples.Pair;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.collect.ImmutableList;
import com.google.gson.JsonArray;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
import com.google.gson.stream.JsonReader;

/** 
 * Utility used to transfer data in bulk from S3 down to the colo
 * 
 * @author rana
 *
 */
public class S3BulkTransferUtil implements S3Downloader.Callback {

    private static final Log LOG = LogFactory.getLog(S3BulkTransferUtil.class);

    S3Downloader _downloader;

    Configuration _conf;
    FileSystem _fs;
    int _totalQueuedItemsCount;
    int _totalCompletedItemsCount = 0;
    ConcurrentSkipListMap<String, Path> _pathMapping = new ConcurrentSkipListMap<String, Path>();

    S3BulkTransferUtil(String bucketName, String s3AccessKeyId, String s3SecretKey, JsonArray pathList,
            final Path outputPath) throws IOException {
        _conf = new Configuration();
        _fs = FileSystem.get(_conf);
        LOG.info("Initializing Downloader");
        _downloader = new S3Downloader(bucketName, s3AccessKeyId, s3SecretKey, false);
        _downloader.setMaxParallelStreams(150);
        _downloader.initialize(this);

        LOG.info("Got JSON Array with:" + pathList.size() + " elements");
        for (int i = 0; i < pathList.size(); ++i) {
            LOG.info("Collecting files from path:" + pathList.get(i).toString());
            List<S3ObjectSummary> metadataFiles = getPaths(s3AccessKeyId, s3SecretKey, bucketName,
                    pathList.get(i).getAsString());
            LOG.info("Got:" + metadataFiles.size() + " total files");
            for (S3ObjectSummary metadataFile : metadataFiles) {

                Path s3Path = new Path("/" + metadataFile.getKey());
                Path finalPath = new Path(outputPath, s3Path.getName());

                FileStatus fileStatus = null;
                try {
                    fileStatus = _fs.getFileStatus(finalPath);
                } catch (Exception e) {
                    // expected when the destination does not exist yet; leaving
                    // fileStatus null causes the item to be queued below
                }

                if (fileStatus != null && fileStatus.getLen() != metadataFile.getSize()) {
                    LOG.error("SRC-DEST SIZE MISMATCH!! SRC:" + metadataFile + " SRC-SIZE:" + metadataFile.getSize()
                            + " DEST:" + finalPath + " DEST-SIZE:" + fileStatus.getLen());

                    // ok delete the destination 
                    _fs.delete(finalPath, false);
                    // null file status so that the item gets requeued ... 
                    fileStatus = null;
                }

                if (fileStatus == null) {
                    LOG.info("Queueing Item:" + metadataFile);
                    ++_totalQueuedItemsCount;
                    _pathMapping.put(metadataFile.getKey(), finalPath);
                    _downloader.fetchItem(metadataFile.getKey());
                } else {
                    LOG.info("Skipping Already Download Item:" + metadataFile + " Found at:" + finalPath);
                }
            }
        }
        LOG.info("Waiting for shutdown event");
        _downloader.waitForCompletion();
    }

    public static List<S3ObjectSummary> getPaths(String s3AccessKeyId, String s3SecretKey, String bucketName,
            String segmentPath) throws IOException {

        AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(s3AccessKeyId, s3SecretKey));

        ImmutableList.Builder<S3ObjectSummary> listBuilder = new ImmutableList.Builder<S3ObjectSummary>();

        ObjectListing response = s3Client
                .listObjects(new ListObjectsRequest().withBucketName(bucketName).withPrefix(segmentPath));

        // page through the listing; S3 marks the response truncated
        // when more keys remain under the prefix
        do {
            LOG.info("Response Key Count:" + response.getObjectSummaries().size());

            for (S3ObjectSummary entry : response.getObjectSummaries()) {
                listBuilder.add(entry);
            }

            if (response.isTruncated()) {
                response = s3Client.listNextBatchOfObjects(response);
            } else {
                break;
            }
        } while (true);

        return listBuilder.build();
    }

    ConcurrentSkipListMap<String, Pair<Path, FSDataOutputStream>> _pathToStreamMap = new ConcurrentSkipListMap<String, Pair<Path, FSDataOutputStream>>();

    @Override
    public boolean contentAvailable(NIOHttpConnection connection, int itemId, String itemKey,
            NIOBufferList contentBuffer) {

        Pair<Path, FSDataOutputStream> downloadTuple = _pathToStreamMap.get(itemKey);
        if (downloadTuple != null) {
            try {
                while (contentBuffer.available() != 0) {
                    ByteBuffer bufferForRead = contentBuffer.read();
                    if (bufferForRead != null) {
                        // write the readable region of the buffer; note this
                        // assumes a heap-backed buffer whose arrayOffset() is zero
                        downloadTuple.e1.write(bufferForRead.array(), bufferForRead.position(),
                                bufferForRead.remaining());
                    }
                }
                return true;
            } catch (Exception e) {
                LOG.error("Error during contentAvailable for Key:" + itemKey + " Exception:"
                        + CCStringUtils.stringifyException(e));
            }
        }
        return false;
    }

    @Override
    public void downloadComplete(NIOHttpConnection connection, int itemId, String itemKey) {
        LOG.info("Received Download Complete Event for Key:" + itemKey);
        Pair<Path, FSDataOutputStream> downloadTuple = _pathToStreamMap.remove(itemKey);
        boolean downloadSuccessful = false;

        if (downloadTuple == null) {
            LOG.error("Excepected Download Tuple for key:" + itemKey + " GOT NULL!");
        } else {
            try {
                // ok close the stream first ... 
                LOG.info("Flushing Stream for key:" + itemKey);
                downloadTuple.e1.flush();
                downloadTuple.e1.close();
                downloadTuple.e1 = null;

                downloadSuccessful = true;
            } catch (Exception e) {
                LOG.error("Error completing download for item:" + itemKey + " Exception:"
                        + CCStringUtils.stringifyException(e));
            } finally {
                if (downloadTuple.e1 != null) {
                    try {
                        downloadTuple.e1.close();
                    } catch (IOException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                    }
                }
            }
        }

        if (!downloadSuccessful) {
            LOG.error("Download for Key:" + itemKey + " Unsuccessful. Requeueing");
            try {
                _downloader.fetchItem(itemKey);
            } catch (IOException e) {
                LOG.fatal("Failed to Requeue Item:" + itemKey);
            }
        }
    }

    @Override
    public void downloadFailed(NIOHttpConnection connection, int itemId, String itemKey, String errorCode) {
        LOG.info("Received Download Failed Event for Key:" + itemKey);
        Pair<Path, FSDataOutputStream> downloadTuple = _pathToStreamMap.remove(itemKey);

        if (downloadTuple == null) {
            LOG.error("Excepected Download Tuple for Failed Download key:" + itemKey + " GOT NULL!");
        } else {
            try {
                if (downloadTuple.e1 != null) {
                    downloadTuple.e1.close();
                    downloadTuple.e1 = null;
                }
                LOG.info("Deleting Temp File:" + downloadTuple.e0 + " for Key:" + itemKey);
                _fs.delete(downloadTuple.e0, false);
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
        }
        LOG.error("Download for Key:" + itemKey + " Unsuccessful. Requeueing");
        try {
            _downloader.fetchItem(itemKey);
        } catch (IOException e) {
            LOG.fatal("Failed to Requeue Item:" + itemKey);
        }
    }

    @Override
    public boolean downloadStarting(NIOHttpConnection connection, int itemId, String itemKey, long contentLength) {
        LOG.info("Received Download Start Event for Key:" + itemKey);

        boolean continueDownload = false;

        Path outputFilePath = _pathMapping.get(itemKey);

        if (outputFilePath != null) {
            try {
                _fs.mkdirs(outputFilePath.getParent());
                Pair<Path, FSDataOutputStream> tupleOut = new Pair<Path, FSDataOutputStream>(outputFilePath,
                        _fs.create(outputFilePath));
                LOG.info("Created Stream for Key:" + itemKey + " temp Path:" + outputFilePath);
                _pathToStreamMap.put(itemKey, tupleOut);
                continueDownload = true;
            } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
            }
        } else {
            LOG.error("Unable to extract metadata filename parts from name:" + itemKey);
        }
        return continueDownload;
    }

    static Options options = new Options();
    static {

        options.addOption(OptionBuilder.withArgName("awsKey").hasArg().withDescription("AWS Key").isRequired()
                .create("awsKey"));

        options.addOption(OptionBuilder.withArgName("awsSecret").hasArg().withDescription("AWS Secret").isRequired()
                .create("awsSecret"));

        options.addOption(OptionBuilder.withArgName("bucket").hasArg().withDescription("S3 bucket name")
                .isRequired().create("bucket"));

        options.addOption(OptionBuilder.withArgName("outputPath").hasArg().isRequired()
                .withDescription("HDFS output path").create("outputPath"));

        options.addOption(
                OptionBuilder.withArgName("path").hasArg().withDescription("S3 path prefix").create("path"));

        options.addOption(OptionBuilder.withArgName("paths").hasArg().withDescription("S3 paths as a JSON Array")
                .create("paths"));

    }

    static void printUsage() {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("S3BulkTransferUtil", options);
    }

    public static void main(String[] args) throws IOException {

        CommandLineParser parser = new GnuParser();

        try {
            // parse the command line arguments
            CommandLine cmdLine = parser.parse(options, args);

            String s3AccessKey = cmdLine.getOptionValue("awsKey");
            String s3Secret = cmdLine.getOptionValue("awsSecret");
            String s3Bucket = cmdLine.getOptionValue("bucket");
            Path hdfsOutputPath = new Path(cmdLine.getOptionValue("outputPath"));
            JsonArray paths = new JsonArray();

            if (cmdLine.hasOption("path")) {
                String[] values = cmdLine.getOptionValues("path");
                for (String value : values) {
                    paths.add(new JsonPrimitive(value));
                }
            }
            if (cmdLine.hasOption("paths")) {
                JsonParser jsonParser = new JsonParser();
                JsonReader reader = new JsonReader(new StringReader(cmdLine.getOptionValue("paths")));
                reader.setLenient(true);
                JsonArray array = jsonParser.parse(reader).getAsJsonArray();
                if (array != null) {
                    paths.addAll(array);
                }
            }

            if (paths.size() == 0) {
                throw new IOException("No Input Paths Specified!");
            }

            LOG.info("Bucket:" + s3Bucket + " Target Paths:" + paths.toString());

            // the constructor performs the entire transfer and blocks until
            // all downloads complete, so no reference needs to be retained
            new S3BulkTransferUtil(s3Bucket, s3AccessKey, s3Secret, paths, hdfsOutputPath);
        } catch (Exception e) {
            LOG.error(CCStringUtils.stringifyException(e));
            printUsage();
        }
    }

}
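
Usage

For programmatic use, the following is a minimal sketch, not part of the original source: it assumes the caller is compiled into the org.commoncrawl.util package (the constructor is package-private), and every string value below is a placeholder.

package org.commoncrawl.util;

import org.apache.hadoop.fs.Path;

import com.google.gson.JsonArray;
import com.google.gson.JsonPrimitive;

public class S3BulkTransferExample {
    public static void main(String[] args) throws Exception {
        // build the list of S3 key prefixes to mirror (placeholder value)
        JsonArray paths = new JsonArray();
        paths.add(new JsonPrimitive("path/to/segment/"));

        // the constructor lists every object under each prefix, queues the
        // downloads, and blocks until S3Downloader signals completion
        new S3BulkTransferUtil("<BUCKET_NAME>", "<AWS_ACCESS_KEY>",
                "<AWS_SECRET_KEY>", paths, new Path("/data/s3import"));
    }
}

Because the constructor calls waitForCompletion() before returning, the process can simply exit once it returns.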