Java tutorial
/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 * Copyright 2017 Nextdoor.com, Inc
 *
 */

package com.nextdoor.bender.handler.s3;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.io.UnsupportedEncodingException;
import java.net.SocketTimeoutException;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.time.temporal.ChronoUnit;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.Callable;
import java.util.zip.GZIPInputStream;

import org.apache.log4j.Logger;

import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.S3ResponseMetadata;
import com.amazonaws.services.s3.event.S3EventNotification.S3Entity;
import com.amazonaws.services.s3.event.S3EventNotification.S3EventNotificationRecord;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.S3Object;
import com.evanlennick.retry4j.CallExecutor;
import com.evanlennick.retry4j.CallResults;
import com.evanlennick.retry4j.RetryConfig;
import com.evanlennick.retry4j.RetryConfigBuilder;
import com.evanlennick.retry4j.exception.RetriesExhaustedException;
import com.evanlennick.retry4j.exception.UnexpectedException;
import com.nextdoor.bender.InternalEvent;
import com.nextdoor.bender.InternalEventIterator;
import com.nextdoor.bender.LambdaContext;
import com.nextdoor.bender.aws.AmazonS3ClientFactory;

/**
 * Creates a contiguous iterator backed by files in S3. Each file is opened and streamed to an
 * internal iterator which outputs individual lines (records) from the file. When a file has no
 * more lines the next file is automatically opened on the following {@link #next()} or
 * {@link #hasNext()} call; files that contain no lines at all are skipped.
 *
 * <p>
 * It is important for the user to always call the {@link #close()} method as otherwise connection
 * leaking may occur.
 */
public class S3EventIterator implements InternalEventIterator<InternalEvent> {
  private static final Logger logger = Logger.getLogger(S3EventIterator.class);

  private final AmazonS3Client client;
  private final List<S3EventNotificationRecord> records;
  private final LambdaContext context;
  private final RetryConfig config;

  /* Event time of the S3 notification whose object is currently being read. */
  private long arrivalTime;
  /* Index into {@link #records} of the next object that has not been opened yet. */
  private int currentIndex = 0;
  /* Line iterator over the currently open object; null when no object is open. */
  private Iterator<String> lineIterator;
  /* Reader backing {@link #lineIterator}; null when no object is open. */
  private BufferedReader reader;
  private S3Entity currentS3Entity;

  /**
   * @param context Lambda context attached to every event this iterator produces.
   * @param records S3 notification records naming the objects to read, in iteration order.
   * @param s3ClientFactory factory used to obtain the S3 client that downloads the objects.
   */
  public S3EventIterator(LambdaContext context, List<S3EventNotificationRecord> records,
      AmazonS3ClientFactory s3ClientFactory) {
    this.records = records;
    this.context = context;
    this.client = s3ClientFactory.newInstance();

    /*
     * Reads are attempted up to three times, starting with a 100ms delay and backing off
     * exponentially, when they fail with a socket timeout or an unchecked IO error.
     */
    this.config = new RetryConfigBuilder()
        .retryOnSpecificExceptions(SocketTimeoutException.class, UncheckedIOException.class)
        .withMaxNumberOfTries(3)
        .withDelayBetweenTries(100, ChronoUnit.MILLIS)
        .withExponentialBackoff()
        .build();
  }

  /**
   * Reports whether another line is available, opening subsequent S3 objects as needed.
   *
   * <p>
   * Exhausted and empty objects are closed and skipped, so a {@code true} result guarantees that
   * {@link #next()} can return a line. Once every object has been consumed this keeps returning
   * {@code false}.
   *
   * @return true if a line is available from the current or a later object.
   * @throws RuntimeException if reading or opening an object fails (after retries for reads).
   * @throws UnexpectedException if the retry executor hits an exception it is not configured to
   *         retry on.
   */
  @Override
  public boolean hasNext() {
    while (true) {
      if (this.lineIterator != null && currentObjectHasLine()) {
        return true;
      }

      /*
       * If there are no lines then the reader from which the lines came from should be closed.
       */
      closeCurrentReader();

      if (this.currentIndex >= this.records.size()) {
        return false;
      }

      openNextObject();
    }
  }

  /**
   * Returns the next line wrapped in an {@link S3InternalEvent}.
   *
   * @return event holding the line plus the key, bucket and version id of its source object.
   * @throws NoSuchElementException if no lines are left in any of the objects.
   * @throws RuntimeException if reading or opening an object fails (after retries for reads).
   * @throws UnexpectedException if the retry executor hits an exception it is not configured to
   *         retry on.
   */
  @Override
  public InternalEvent next() {
    if (!hasNext()) {
      throw new NoSuchElementException("no more lines available from S3");
    }

    /*
     * Wrap reading next row in retry logic. This is because there is intermittent socket timeouts
     * when reading from S3 that cause the function to hang/fail.
     */
    Callable<String> callable = () -> this.lineIterator.next();
    String nextRow = (String) callWithRetry(callable);

    /*
     * Construct the internal event
     */
    return new S3InternalEvent(nextRow, this.context, this.arrivalTime,
        this.currentS3Entity.getObject().getKey(), this.currentS3Entity.getBucket().getName(),
        this.currentS3Entity.getObject().getVersionId());
  }

  /**
   * Closes the reader of the object currently being read, if any.
   */
  @Override
  public void close() throws IOException {
    closeCurrentReader();
  }

  /**
   * Checks whether the currently open object has another line. The check reads from S3, so it is
   * wrapped in the same retry logic as reading the line itself.
   */
  private boolean currentObjectHasLine() {
    Callable<Boolean> callable = () -> this.lineIterator.hasNext();
    return (Boolean) callWithRetry(callable);
  }

  /**
   * Runs the callable under the retry policy built in the constructor.
   *
   * @return whatever the callable returned on its first successful attempt.
   * @throws RuntimeException wrapping the last failure once all attempts are used up.
   */
  private Object callWithRetry(Callable<?> callable) {
    try {
      CallResults<Object> results = new CallExecutor(this.config).execute(callable);
      return results.getResult();
    } catch (RetriesExhaustedException ree) {
      throw new RuntimeException(ree.getCallResults().getLastExceptionThatCausedRetry());
    }
  }

  /**
   * Opens the S3 object named by the record at {@link #currentIndex}, points the line iterator at
   * it and advances the index. The index only moves once the object was opened successfully.
   */
  private void openNextObject() {
    /*
     * The previous reader must be closed in order to prevent S3 connection leaking
     */
    closeCurrentReader();

    /*
     * Use the S3 trigger event time for arrival time of records in file. This is less precise but
     * avoids making a call to the S3 api to find file creation time. Note that if the
     * deserializer creates a {@link com.nextdoor.bender.deserializer.DeserializedTimeSeriesEvent}
     * then this arrival time is not used.
     */
    S3EventNotificationRecord event = this.records.get(this.currentIndex);
    this.arrivalTime = event.getEventTime().toDate().getTime();
    this.currentS3Entity = event.getS3();

    /*
     * The S3 Object key is URL encoded and must be decoded before it can be used by the
     * AmazonS3Client
     */
    String key;
    try {
      key = URLDecoder.decode(this.currentS3Entity.getObject().getKey(), "UTF-8");
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException(e);
    }

    /*
     * Stream object back from S3 into a reader
     */
    String bucketName = this.currentS3Entity.getBucket().getName();
    logger.debug("opening s3://" + bucketName + "/" + key);
    GetObjectRequest req = new GetObjectRequest(bucketName, key);
    S3Object obj = this.client.getObject(req);

    /*
     * Only look up the response metadata when it will actually be logged, and tolerate the client
     * not having any cached for this request.
     */
    if (logger.isTraceEnabled()) {
      S3ResponseMetadata metadata = this.client.getCachedResponseMetadata(req);
      if (metadata != null) {
        logger.trace("s3 get request id: " + metadata.getRequestId() + " host: "
            + metadata.getHostId() + " cloudfrontid: " + metadata.getCloudFrontId());
      }
    }

    /*
     * If the file is compressed run it through the GZIP decompressor
     */
    // TODO: support different types of compressions
    InputStream content = obj.getObjectContent();
    if (key.endsWith(".gz")) {
      try {
        content = new GZIPInputStream(content);
      } catch (IOException e) {
        /*
         * No reader will ever own this stream, so release the S3 connection here.
         */
        try {
          content.close();
        } catch (IOException closeFailure) {
          e.addSuppressed(closeFailure);
        }
        throw new RuntimeException(e);
      }
    }

    /*
     * Note the BufferedReader is lazy and so is the iterator. The object is directly streamed
     * from S3, fed into an input stream and consumed line by line by the iterator. The charset is
     * pinned to UTF-8 so decoding does not depend on the platform default.
     */
    this.reader = new BufferedReader(new InputStreamReader(content, StandardCharsets.UTF_8));
    this.lineIterator = this.reader.lines().iterator();
    this.currentIndex++;
  }

  /**
   * Closes the current reader, if one is open, and forgets both it and its line iterator so a
   * closed stream is never read from again. Failures to close are logged, not thrown.
   */
  private void closeCurrentReader() {
    this.lineIterator = null;

    if (this.reader == null) {
      return;
    }

    try {
      this.reader.close();
      logger.trace("closed reader");
    } catch (IOException e) {
      logger.warn("Unable to close S3 reader", e);
    } finally {
      this.reader = null;
    }
  }
}