org.apache.streams.s3.S3PersistWriter.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.streams.s3.S3PersistWriter.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.streams.s3;

import com.amazonaws.ClientConfiguration;
import com.amazonaws.Protocol;
import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.regions.Region;
import com.amazonaws.regions.Regions;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.S3ClientOptions;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import org.apache.streams.core.*;
import org.apache.streams.jackson.StreamsJacksonMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Persists {@link StreamsDatum}s to S3 as tab-separated lines.
 *
 * <p>Each datum becomes one line of the form
 * {@code id \t timestamp \t metadataJson \t documentJson \n}. Lines are appended to the
 * current output file until the number of bytes written to that file reaches the configured
 * maximum file size (multiplied by 1024 * 1024), at which point the file is closed and a new
 * one is opened.
 *
 * <p>{@link #write(StreamsDatum)}, {@link #prepare(Object)} and the file roll-over helpers all
 * synchronize on this instance.
 */
public class S3PersistWriter implements StreamsPersistWriter, DatumStatusCountable {
    public final static String STREAMS_ID = "S3PersistWriter";

    private final static Logger LOGGER = LoggerFactory.getLogger(S3PersistWriter.class);

    private final static char DELIMITER = '\t';

    /** Encoding used both for the bytes sent to S3 and for the byte accounting. */
    private final static Charset UTF_8 = Charset.forName("UTF-8");

    /** Multiplier applied to the configured maximum file size; long to avoid int overflow. */
    private final static long BYTES_PER_MEGABYTE = 1024L * 1024L;

    private ObjectMapper objectMapper;
    private AmazonS3Client amazonS3Client;
    private S3WriterConfiguration s3WriterConfiguration;
    private final List<String> writtenFiles = new ArrayList<String>();

    private final AtomicLong totalBytesWritten = new AtomicLong();
    private final AtomicLong bytesWrittenThisFile = new AtomicLong();

    private final AtomicInteger totalRecordsWritten = new AtomicInteger();
    private final AtomicInteger fileLineCounter = new AtomicInteger();

    private Map<String, String> objectMetaData = defaultObjectMetaData();

    private OutputStreamWriter currentWriter = null;

    /**
     * Builds the default object metadata, which names the column stored at each position
     * of a written line.
     */
    private static Map<String, String> defaultObjectMetaData() {
        Map<String, String> defaults = new HashMap<String, String>();
        defaults.put("line[0]", "id");
        defaults.put("line[1]", "timeStamp");
        defaults.put("line[2]", "metaData");
        defaults.put("line[3]", "document");
        return defaults;
    }

    /** @return the S3 client in use, or {@code null} if none was supplied and {@link #prepare(Object)} has not run */
    public AmazonS3Client getAmazonS3Client() {
        return this.amazonS3Client;
    }

    /** @return the configuration this writer was constructed with */
    public S3WriterConfiguration getS3WriterConfiguration() {
        return this.s3WriterConfiguration;
    }

    /** @return the live list of paths (writer path + file name) of every file opened so far */
    public List<String> getWrittenFiles() {
        return this.writtenFiles;
    }

    /** @return the metadata map handed to each newly created output stream */
    public Map<String, String> getObjectMetaData() {
        return this.objectMetaData;
    }

    /** @return the mapper used to serialize metadata and documents, or {@code null} if not yet set */
    public ObjectMapper getObjectMapper() {
        return this.objectMapper;
    }

    /** @param mapper mapper to use for serialization; if left unset, {@link #prepare(Object)} installs a default */
    public void setObjectMapper(ObjectMapper mapper) {
        this.objectMapper = mapper;
    }

    /** @param val metadata map handed to each output stream created after this call */
    public void setObjectMetaData(Map<String, String> val) {
        this.objectMetaData = val;
    }

    /**
     * Instantiator with a pre-existing amazonS3Client, this is used to help with re-use.
     * @param amazonS3Client
     * If you have an existing amazonS3Client, it won't bother to create another one
     * @param s3WriterConfiguration
     * Configuration of the write paths and instructions are still required.
     */
    public S3PersistWriter(AmazonS3Client amazonS3Client, S3WriterConfiguration s3WriterConfiguration) {
        this.amazonS3Client = amazonS3Client;
        this.s3WriterConfiguration = s3WriterConfiguration;
    }

    /**
     * Creates a writer that builds its own S3 client from the configuration during
     * {@link #prepare(Object)}.
     * @param s3WriterConfiguration
     * Configuration of the credentials, write paths and instructions.
     */
    public S3PersistWriter(S3WriterConfiguration s3WriterConfiguration) {
        this.s3WriterConfiguration = s3WriterConfiguration;
    }

    /**
     * Serializes the datum to a single line and appends it to the current output file,
     * rolling over to a new file first when no file is open or the current one has reached
     * the configured size limit.
     *
     * <p>A datum that cannot be turned into a line, or that cannot be written, is logged and
     * dropped; it is not included in the byte or record counters.
     *
     * @param streamsDatum the datum to persist
     */
    @Override
    public void write(StreamsDatum streamsDatum) {

        synchronized (this) {
            // Serialize first so that an unusable datum never triggers the creation of a file.
            String line = convertResultToString(streamsDatum);
            if (line == null) {
                LOGGER.warn("Skipping datum that produced no output line, id: {}",
                        streamsDatum == null ? null : streamsDatum.getId());
                return;
            }

            // Check to see if we need to reset the file that we are currently working with
            if (this.currentWriter == null || this.bytesWrittenThisFile
                    .get() >= (this.s3WriterConfiguration.getMaxFileSize() * BYTES_PER_MEGABYTE)) {
                try {
                    LOGGER.info("Resetting the file");
                    this.currentWriter = resetFile();
                } catch (Exception e) {
                    // currentWriter stays null, so the next write attempts the reset again.
                    LOGGER.error("Unable to open a new output file, dropping datum", e);
                    return;
                }
            }

            try {
                this.currentWriter.write(line);
            } catch (IOException e) {
                LOGGER.error("Unable to write datum to the current output file", e);
                return;
            }

            // add the bytes we've written, measured in the same encoding the writer uses
            int recordSize = line.getBytes(UTF_8).length;
            this.totalBytesWritten.addAndGet(recordSize);
            this.bytesWrittenThisFile.addAndGet(recordSize);

            // increment the record count
            this.totalRecordsWritten.incrementAndGet();
            this.fileLineCounter.incrementAndGet();
        }

    }

    /**
     * Closes the current output file (if any) and opens a new one named
     * {@code <prefix>/<millis>.tsv} when chunking is enabled or {@code <prefix>-<millis>.tsv}
     * otherwise.
     *
     * @return the writer to use for subsequent lines
     * @throws Exception if the new output stream cannot be created
     */
    private synchronized OutputStreamWriter resetFile() throws Exception {
        // this will keep it thread safe, so we don't create too many files
        if (this.fileLineCounter.get() == 0 && this.currentWriter != null) {
            return this.currentWriter;
        }

        closeAndDestroyWriter();

        // Create the path for where the file is going to live.
        try {
            // generate a file name
            String fileName = this.s3WriterConfiguration.getWriterFilePrefix()
                    + (this.s3WriterConfiguration.getChunk() ? "/" : "-") + System.currentTimeMillis() + ".tsv";

            // create the output stream
            OutputStream outputStream = new S3OutputStreamWrapper(this.amazonS3Client,
                    this.s3WriterConfiguration.getBucket(), this.s3WriterConfiguration.getWriterPath(), fileName,
                    this.objectMetaData);

            // reset the per-file counters
            this.fileLineCounter.set(0);
            this.bytesWrittenThisFile.set(0L);

            // add this to the list of written files
            writtenFiles.add(this.s3WriterConfiguration.getWriterPath() + fileName);

            // Log that we are creating this file
            LOGGER.info("File Created: Bucket[{}] - {}", this.s3WriterConfiguration.getBucket(),
                    this.s3WriterConfiguration.getWriterPath() + fileName);

            // return the writer, encoding explicitly rather than with the platform default
            return new OutputStreamWriter(outputStream, UTF_8);
        } catch (Exception e) {
            LOGGER.error("Unable to create a new output file", e);
            throw e;
        }
    }

    /**
     * Flushes and closes the current writer, if there is one, and clears the reference so
     * that the next write opens a new file.
     */
    private synchronized void closeAndDestroyWriter() {
        // if there is a current writer, we must close it first.
        if (this.currentWriter != null) {
            this.closeSafely(this.currentWriter);
            this.currentWriter = null;

            // The list is exposed through getWrittenFiles(), so do not assume it is non-empty.
            String lastFile = this.writtenFiles.isEmpty() ? null
                    : this.writtenFiles.get(this.writtenFiles.size() - 1);

            // Logging of information to alert the user to the activities of this class
            LOGGER.debug("File Closed: Records[{}] Bytes[{}] {} ", this.fileLineCounter.get(),
                    this.bytesWrittenThisFile.get(), lastFile);
        }
    }

    /**
     * Flushes and closes the writer without propagating failures. Failures are logged,
     * because a failed flush or close may mean buffered lines were lost.
     *
     * @param writer the writer to close; {@code null} is ignored
     */
    private synchronized void closeSafely(Writer writer) {
        if (writer != null) {
            try {
                writer.flush();
            } catch (Exception e) {
                LOGGER.error("Failed to flush the output file", e);
            } finally {
                // Always attempt the close, even when the flush failed.
                try {
                    writer.close();
                } catch (Exception e) {
                    LOGGER.error("Failed to close the output file", e);
                }
            }
            LOGGER.debug("File Closed");
        }
    }

    /**
     * Converts a datum into one tab-delimited, newline-terminated line.
     *
     * <p>The class name of the document is recorded in the datum's metadata under the key
     * {@code "class"} before the metadata is serialized, so that it is part of the written line.
     *
     * @param entry the datum to convert
     * @return the line to write, or {@code null} if the datum or its document is {@code null}
     *         or the document could not be serialized
     */
    private String convertResultToString(StreamsDatum entry) {
        if (entry == null || entry.getDocument() == null) {
            return null;
        }

        // Save the class name that it came from; must happen before the metadata is serialized
        if (entry.metadata != null) {
            entry.metadata.put("class", entry.getDocument().getClass().getName());
        }

        String metadata = null;
        try {
            metadata = objectMapper.writeValueAsString(entry.getMetadata());
        } catch (JsonProcessingException e) {
            // The line is still written; the metadata column then holds the text "null".
            LOGGER.warn("Unable to serialize metadata for datum, id: " + entry.getId(), e);
        }

        String documentJson = null;
        try {
            documentJson = objectMapper.writeValueAsString(entry.getDocument());
        } catch (JsonProcessingException e) {
            LOGGER.warn("Unable to serialize document for datum, id: " + entry.getId(), e);
        }

        if (Strings.isNullOrEmpty(documentJson)) {
            return null;
        }

        return entry.getId() + DELIMITER + // [0] = Unique id of the verbatim
                entry.getTimestamp() + DELIMITER + // [1] = Timestamp of the item
                metadata + DELIMITER + // [2] = Metadata of the item
                documentJson + "\n"; // [3] = The actual object
    }

    /**
     * Installs a default object mapper if none was set and, if no S3 client was supplied,
     * builds one from the key, secret key, protocol and (optional) region in the configuration.
     *
     * @param configurationObject not used by this implementation
     * @throws IllegalArgumentException if no S3 client is available once preparation finishes
     */
    public void prepare(Object configurationObject) {
        // Connect to S3
        synchronized (this) {

            try {
                // if the user has chosen to not set the object mapper, then set a default object mapper for them.
                if (this.objectMapper == null) {
                    this.objectMapper = new StreamsJacksonMapper();
                }

                // Create the credentials Object
                if (this.amazonS3Client == null) {
                    AWSCredentials credentials = new BasicAWSCredentials(s3WriterConfiguration.getKey(),
                            s3WriterConfiguration.getSecretKey());

                    ClientConfiguration clientConfig = new ClientConfiguration();
                    clientConfig.setProtocol(Protocol.valueOf(s3WriterConfiguration.getProtocol().toString()));

                    // We do not want path style access
                    S3ClientOptions clientOptions = new S3ClientOptions();
                    clientOptions.setPathStyleAccess(false);

                    // Configure a local instance and publish it only once fully set up, so a
                    // failure part-way through cannot leave a half-configured client behind.
                    AmazonS3Client client = new AmazonS3Client(credentials, clientConfig);
                    if (!Strings.isNullOrEmpty(s3WriterConfiguration.getRegion())) {
                        client.setRegion(Region.getRegion(Regions.fromName(s3WriterConfiguration.getRegion())));
                    }
                    client.setS3ClientOptions(clientOptions);
                    this.amazonS3Client = client;
                }
            } catch (Exception e) {
                // Pass the exception as the last argument so SLF4J logs the stack trace.
                LOGGER.error("Exception while preparing the S3 client", e);
            }

            Preconditions.checkArgument(this.amazonS3Client != null, "Unable to create the S3 client");
        }
    }

    /** Flushes and closes the file currently being written, if any. */
    public void cleanUp() {
        closeAndDestroyWriter();
    }

    /**
     * @return a new counter in which both the attempt count and the {@code SUCCESS} count
     *         equal the total number of records written so far
     */
    public DatumStatusCounter getDatumStatusCounter() {
        DatumStatusCounter counters = new DatumStatusCounter();
        counters.incrementAttempt(this.totalRecordsWritten.get());
        counters.incrementStatus(DatumStatus.SUCCESS, this.totalRecordsWritten.get());
        return counters;
    }
}