List of usage examples for com.amazonaws.services.s3.model S3ObjectSummary getLastModified
public Date getLastModified()
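A minimal sketch of the call in isolation (not taken from the sources below; the bucket name is a placeholder and default-chain credentials are assumed):

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import java.util.Date;

public class LastModifiedExample {
    public static void main(String[] args) {
        AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();
        for (S3ObjectSummary summary : s3.listObjects("my-bucket").getObjectSummaries()) {
            Date lastModified = summary.getLastModified(); // server-side modification timestamp
            System.out.println(summary.getKey() + " -> " + lastModified);
        }
    }
}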
From source file:com.streamsets.pipeline.stage.origin.s3.AmazonS3Runnable.java
License:Apache License
private S3Offset fetchNextObjectFromSpooler(S3Offset s3Offset, BatchContext batchContext) throws StageException {
    setCurrentObject(null);
    try {
        // The next object found in the queue is usually eligible, since we process objects in chronological
        // order. However, after processing a few files, if the configuration is changed [say, the prefix is
        // relaxed] and an older file gets selected for processing, it must be ignored.
        S3ObjectSummary nextAvailObj = null;
        do {
            if (nextAvailObj != null) {
                LOG.warn("Ignoring object '{}' in spool directory as is lesser than offset object '{}'",
                    nextAvailObj.getKey(), s3Offset.getKey());
            }
            nextAvailObj = spooler.poolForObject(amazonS3Source, s3ConfigBean.basicConfig.maxWaitTime,
                TimeUnit.MILLISECONDS, batchContext);
        } while (!isEligible(nextAvailObj, s3Offset));
        if (nextAvailObj == null) {
            // no object to process
            LOG.debug("No new object available in spool queue after '{}' secs, producing empty batch",
                s3ConfigBean.basicConfig.maxWaitTime / 1000);
        } else {
            setCurrentObject(nextAvailObj);
            // If the current offset object is null, or the object returned by the spooler is greater than the
            // current offset object, take the spooler's object as the new object and set the offset to zero.
            // Otherwise the spooler returned the current object, and we keep processing it from the last
            // offset we processed (known via offset tracking).
            if (s3Offset.getKey() == null || s3Offset.getKey().equals(S3Constants.EMPTY)
                || isLaterThan(nextAvailObj.getKey(), nextAvailObj.getLastModified().getTime(),
                    s3Offset.getKey(), Long.parseLong(s3Offset.getTimestamp()))) {
                s3Offset = new S3Offset(getCurrentObject().getKey(), S3Constants.ZERO, getCurrentObject().getETag(),
                    String.valueOf(getCurrentObject().getLastModified().getTime()));
            }
        }
    } catch (InterruptedException ex) {
        // The spooler was interrupted while waiting for an object; log and return. The pipeline agent will
        // invoke us again to wait for another object.
        LOG.warn("Pooling interrupted");
    } catch (AmazonClientException e) {
        throw new StageException(Errors.S3_SPOOLDIR_23, e.toString(), e);
    }
    return s3Offset;
}
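The isLaterThan helper is not shown in this file. A hypothetical reconstruction of its contract, inferred only from the surrounding comments (newer timestamp wins; on a tie, the key breaks it), might look like this; it is not the actual StreamSets implementation:

// Hypothetical sketch, for illustration only:
private boolean isLaterThan(String key, long timestamp, String offsetKey, long offsetTimestamp) {
    // an object is "later" if its last-modified time is newer, or, on a timestamp tie,
    // if its key sorts lexicographically after the offset key
    return timestamp > offsetTimestamp
        || (timestamp == offsetTimestamp && key.compareTo(offsetKey) > 0);
}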
From source file:com.streamsets.pipeline.stage.origin.s3.AmazonS3Runnable.java
License:Apache License
private boolean isEligible(S3ObjectSummary nextAvailObj, S3Offset s3Offset) {
    ObjectOrdering objectOrdering = s3ConfigBean.s3FileConfig.objectOrdering;
    switch (objectOrdering) {
    case TIMESTAMP:
        return nextAvailObj == null || s3Offset == null
            || nextAvailObj.getLastModified().getTime() >= Long.parseLong(s3Offset.getTimestamp());
    case LEXICOGRAPHICAL:
        return nextAvailObj == null || s3Offset == null || s3Offset.getKey() == null
            || s3Offset.getKey().equals(S3Constants.EMPTY)
            || nextAvailObj.getKey().compareTo(s3Offset.getKey()) > 0;
    default:
        throw new IllegalArgumentException("Unknown ordering: " + objectOrdering.getLabel());
    }
}
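The LEXICOGRAPHICAL branch relies on plain String.compareTo, which compares keys character by character. A quick illustration of the usual pitfall (the keys are made up):

// "file-9.log" sorts AFTER "file-10.log" because '9' > '1' at the first differing character;
// zero-padding the numbers restores the intended numeric order.
System.out.println("file-9.log".compareTo("file-10.log") > 0);   // true
System.out.println("file-09.log".compareTo("file-10.log") > 0);  // false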
From source file:com.streamsets.pipeline.stage.origin.s3.AmazonS3Source.java
License:Apache License
private S3Offset fetchNextObjectFromSpooler(S3Offset s3Offset) throws StageException {
    setCurrentObject(null);
    try {
        // The next object found in the queue is usually eligible, since we process objects in chronological
        // order. However, after processing a few files, if the configuration is changed [say, the prefix is
        // relaxed] and an older file gets selected for processing, it must be ignored.
        S3ObjectSummary nextAvailObj = null;
        do {
            if (nextAvailObj != null) {
                LOG.warn("Ignoring object '{}' in spool directory as is lesser than offset object '{}'",
                    nextAvailObj.getKey(), s3Offset.getKey());
            }
            nextAvailObj = spooler.poolForObject(s3Offset, s3ConfigBean.basicConfig.maxWaitTime,
                TimeUnit.MILLISECONDS);
        } while (!isEligible(nextAvailObj, s3Offset));
        if (nextAvailObj == null) {
            // no object to process
            LOG.debug("No new object available in spool directory after '{}' secs, producing empty batch",
                s3ConfigBean.basicConfig.maxWaitTime / 1000);
        } else {
            setCurrentObject(nextAvailObj);
            // If the current offset object is null, or the object returned by the spooler is greater than the
            // current offset object, take the spooler's object as the new object and set the offset to zero.
            // Otherwise the spooler returned the current object, and we keep processing it from the last
            // offset we processed (known via offset tracking).
            if (s3Offset.getKey() == null
                || isLaterThan(nextAvailObj.getKey(), nextAvailObj.getLastModified().getTime(),
                    s3Offset.getKey(), Long.parseLong(s3Offset.getTimestamp()))) {
                s3Offset = new S3Offset(getCurrentObject().getKey(), ZERO, getCurrentObject().getETag(),
                    String.valueOf(getCurrentObject().getLastModified().getTime()));
            }
        }
    } catch (InterruptedException ex) {
        // The spooler was interrupted while waiting for an object; log and return. The pipeline agent will
        // invoke us again to wait for another object.
        LOG.warn("Pooling interrupted");
    } catch (AmazonClientException e) {
        throw new StageException(Errors.S3_SPOOLDIR_23, e.toString());
    }
    return s3Offset;
}
From source file:com.streamsets.pipeline.stage.origin.s3.AmazonS3Util.java
License:Apache License
/**
 * Lists objects from Amazon S3 in chronological order [lexicographical order if two files have the same
 * timestamp] which are later than or equal to the timestamp of the previous offset object.
 *
 * @param s3Client
 * @param s3ConfigBean
 * @param pathMatcher glob patterns to match file name against
 * @param s3Offset current offset which provides the timestamp of the previous object
 * @param fetchSize number of objects to fetch in one go
 * @return
 * @throws AmazonClientException
 */
static List<S3ObjectSummary> listObjectsChronologically(AmazonS3Client s3Client, S3ConfigBean s3ConfigBean,
    PathMatcher pathMatcher, AmazonS3Source.S3Offset s3Offset, int fetchSize) throws AmazonClientException {
    // Algorithm:
    //  - Full-scan all objects that match the file name pattern and are later than the file in the offset
    //  - Select the oldest "fetchSize" number of files and return them
    TreeSet<S3ObjectSummary> treeSet = new TreeSet<>(new Comparator<S3ObjectSummary>() {
        @Override
        public int compare(S3ObjectSummary o1, S3ObjectSummary o2) {
            int result = o1.getLastModified().compareTo(o2.getLastModified());
            if (result != 0) {
                return result;
            }
            // same modified time; use the key name to break the tie
            return o1.getKey().compareTo(o2.getKey());
        }
    });
    S3Objects s3ObjectSummaries = S3Objects
        .withPrefix(s3Client, s3ConfigBean.s3Config.bucket, s3ConfigBean.s3Config.folder)
        .withBatchSize(BATCH_SIZE);
    for (S3ObjectSummary s : s3ObjectSummaries) {
        String fileName = s.getKey().substring(s3ConfigBean.s3Config.folder.length(), s.getKey().length());
        if (!fileName.isEmpty()) {
            // fileName can be empty.
            // If the user manually creates a folder "myFolder/mySubFolder" in bucket "myBucket" and uploads
            // "myObject", then the first objects returned here are:
            //   myFolder/mySubFolder
            //   myFolder/mySubFolder/myObject
            // All is good when the pipeline is run, but preview returns no data, so we ignore the empty file
            // name as it has no data.
            if (pathMatcher.matches(Paths.get(fileName)) && isEligible(s, s3Offset)) {
                treeSet.add(s);
            }
            if (treeSet.size() > fetchSize) {
                treeSet.pollLast();
            }
        }
    }
    return new ArrayList<>(treeSet);
}
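The anonymous comparator above can be written more compactly with the Java 8 Comparator combinators; an equivalent sketch with identical ordering:

// last-modified first, key as tie-breaker -- same ordering as the anonymous class above
Comparator<S3ObjectSummary> chronological =
    Comparator.comparing(S3ObjectSummary::getLastModified)
              .thenComparing(S3ObjectSummary::getKey);
TreeSet<S3ObjectSummary> treeSet = new TreeSet<>(chronological);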
From source file:com.streamsets.pipeline.stage.origin.s3.AmazonS3Util.java
License:Apache License
private static boolean isEligible(S3ObjectSummary s, AmazonS3Source.S3Offset s3Offset) {
    // The object is eligible if:
    // 1. Its timestamp is greater than that of the current object in the offset
    // 2. The timestamp is the same but the key is lexicographically greater than the current object's
    //    [can happen when multiple objects are uploaded in one go]
    // 3. Same timestamp, same key [same as the current object in the offset]: eligible if it was not
    //    completely processed [offset != -1]
    boolean isEligible = false;
    if (s.getLastModified().compareTo(new Date(Long.parseLong(s3Offset.getTimestamp()))) > 0) {
        isEligible = true;
    } else if (s.getLastModified().compareTo(new Date(Long.parseLong(s3Offset.getTimestamp()))) == 0) {
        // same timestamp; compare keys
        if (s.getKey().compareTo(s3Offset.getKey()) > 0) {
            isEligible = true;
        } else if (s.getKey().compareTo(s3Offset.getKey()) == 0) {
            // same timestamp, same key.
            // If the current offset is not -1, return the file: the previous file was partially processed.
            if (Long.parseLong(s3Offset.getOffset()) != -1) {
                isEligible = true;
            }
        }
    }
    return isEligible;
}
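A worked micro-example of the three conditions, with illustrative values (offset key "logs/a.txt", timestamp 1000, offset "250"):

// summary(lastModified=2000, key="logs/a.txt") -> eligible: newer timestamp (condition 1)
// summary(lastModified=1000, key="logs/b.txt") -> eligible: same timestamp, greater key (condition 2)
// summary(lastModified=1000, key="logs/a.txt") -> eligible only because offset 250 != -1,
//                                                 i.e. the object was partially processed (condition 3)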
From source file:com.streamsets.pipeline.stage.origin.s3.S3Spooler.java
License:Apache License
void addObjectToQueue(S3ObjectSummary objectSummary, boolean checkCurrent) {
    Preconditions.checkNotNull(objectSummary, "file cannot be null");
    if (checkCurrent) {
        Preconditions.checkState(currentObject == null
            || currentObject.getLastModified().compareTo(objectSummary.getLastModified()) < 0);
    }
    if (!objectQueue.contains(objectSummary)) {
        if (objectQueue.size() >= MAX_SPOOL_SIZE) {
            LOG.warn("Exceeded '{}' of queued files", objectQueue.size());
        }
        objectQueue.add(objectSummary);
        spoolQueueMeter.mark(objectQueue.size());
    } else {
        LOG.warn("Object '{}' already in queue, ignoring", objectSummary.getKey());
    }
}
From source file:com.streamsets.pipeline.stage.origin.s3.S3Spooler.java
License:Apache License
public void postProcessOlderObjectIfNeeded(AmazonS3Source.S3Offset s3Offset) {
    // If sdc was shut down after reading an object but before post-processing it, handle it now.
    // The scenario is detected as follows:
    //  1. the current key must not be null
    //  2. the offset must be -1
    //  3. an object with the same key must exist in S3
    //  4. the timestamp of the object in S3 must be the same as the timestamp in the offset [it is possible
    //     that one uploads another object with the same name; we can avoid post-processing it without
    //     producing records by comparing the timestamp on that object]
    if (s3Offset.getKey() != null && "-1".equals(s3Offset.getOffset())) {
        // conditions 1 and 2 are met; check 3 and 4
        S3ObjectSummary objectSummary = AmazonS3Util.getObjectSummary(s3Client, s3ConfigBean.s3Config.bucket,
            s3Offset.getKey());
        if (objectSummary != null && objectSummary.getLastModified()
            .compareTo(new Date(Long.parseLong(s3Offset.getTimestamp()))) == 0) {
            postProcessOrErrorHandle(s3Offset.getKey(), s3ConfigBean.postProcessingConfig.postProcessing,
                s3ConfigBean.postProcessingConfig.postProcessBucket,
                s3ConfigBean.postProcessingConfig.postProcessFolder,
                s3ConfigBean.postProcessingConfig.archivingOption);
        }
    }
    currentObject = null;
}
From source file:com.upplication.s3fs.S3FileSystemProvider.java
License:Open Source License
@Override
public <A extends BasicFileAttributes> A readAttributes(Path path, Class<A> type, LinkOption... options)
    throws IOException {
    Preconditions.checkArgument(path instanceof S3Path, "path must be an instance of %s", S3Path.class.getName());
    S3Path s3Path = (S3Path) path;
    if (type == BasicFileAttributes.class) {
        S3ObjectSummary objectSummary = s3ObjectSummaryLookup.lookup(s3Path);
        // parse the data into BasicFileAttributes
        FileTime lastModifiedTime = null;
        if (objectSummary.getLastModified() != null) {
            lastModifiedTime = FileTime.from(objectSummary.getLastModified().getTime(), TimeUnit.MILLISECONDS);
        }
        long size = objectSummary.getSize();
        boolean directory = false;
        boolean regularFile = false;
        String key = objectSummary.getKey();
        // it is a directory and its key exists at Amazon S3
        if (objectSummary.getKey().equals(s3Path.getKey() + "/") && objectSummary.getKey().endsWith("/")) {
            directory = true;
        }
        // it is a directory but its key does not exist at Amazon S3
        else if ((!objectSummary.getKey().equals(s3Path.getKey()) || "".equals(s3Path.getKey()))
            && objectSummary.getKey().startsWith(s3Path.getKey())) {
            directory = true;
            // no metadata, so we fake one
            size = 0;
            // drop the extra part of the key
            key = s3Path.getKey() + "/";
        }
        // it is a file
        else {
            regularFile = true;
        }
        return type.cast(new S3FileAttributes(key, lastModifiedTime, size, directory, regularFile));
    }
    // attribute class not supported
    throw new UnsupportedOperationException(format("only %s supported", BasicFileAttributes.class));
}
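With this provider registered, the last-modified time surfaces through the standard java.nio.file API. A hedged usage sketch (the URI is a placeholder, and the S3 filesystem is assumed to already be open for it):

import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;

public class ReadS3Attributes {
    public static void main(String[] args) throws java.io.IOException {
        Path s3Path = Paths.get(URI.create("s3://my-bucket/some/key"));
        BasicFileAttributes attrs = Files.readAttributes(s3Path, BasicFileAttributes.class);
        // FileTime built from S3ObjectSummary.getLastModified() in the provider above
        System.out.println(attrs.lastModifiedTime());
    }
}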
From source file:com.yahoo.athenz.zts.store.impl.S3ChangeLogStore.java
License:Apache License
/**
 * List the objects in the ZTS bucket. If the mod time is specified as 0,
 * we want to list all objects; otherwise, we only list objects
 * that are newer than the specified timestamp.
 * @param s3 AWS S3 client object
 * @param domains collection to be updated to include domain names
 * @param modTime only include domains newer than this timestamp
 */
void listObjects(AmazonS3 s3, Collection<String> domains, long modTime) {
    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("listObjects: Retrieving domains from {} with mod time > {}", s3BucketName, modTime);
    }
    ObjectListing objectListing = s3.listObjects(new ListObjectsRequest().withBucketName(s3BucketName));
    String objectName;
    while (objectListing != null) {
        // process each entry in our result set and add the domain
        // name to our return list
        final List<S3ObjectSummary> objectSummaries = objectListing.getObjectSummaries();
        boolean listTruncated = objectListing.isTruncated();
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("listObjects: retrieved {} objects, more objects available - {}",
                objectSummaries.size(), listTruncated);
        }
        for (S3ObjectSummary objectSummary : objectSummaries) {
            // if mod time is specified then make sure we automatically skip
            // any domains older than the specified value
            if (modTime > 0 && objectSummary.getLastModified().getTime() <= modTime) {
                continue;
            }
            // for now skip any folders/objects that start with '.'
            objectName = objectSummary.getKey();
            if (objectName.charAt(0) == '.') {
                continue;
            }
            domains.add(objectName);
        }
        // check if the object listing is truncated or not (break out in this case).
        // Technically we can skip this check and just call listNextBatchOfObjects,
        // since that returns null if the object listing is not truncated, but
        // this direct check here makes the logic easier to follow.
        if (!listTruncated) {
            break;
        }
        objectListing = s3.listNextBatchOfObjects(objectListing);
    }
}
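For reference, the per-page filter above condenses to a stream pipeline with the same semantics (an illustrative rewrite, not from the Athenz sources):

// keep objects newer than modTime (when modTime > 0) and skip hidden '.'-prefixed keys
objectListing.getObjectSummaries().stream()
    .filter(o -> modTime <= 0 || o.getLastModified().getTime() > modTime)
    .map(S3ObjectSummary::getKey)
    .filter(key -> key.charAt(0) != '.')
    .forEach(domains::add);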
From source file:com.yahoo.athenz.zts.store.s3.S3ChangeLogStore.java
License:Apache License
/**
 * List the objects in the ZTS bucket. If the mod time is specified as 0,
 * we want to list all objects; otherwise, we only list objects
 * that are newer than the specified timestamp.
 * @param s3 AWS S3 client object
 * @param domains collection to be updated to include domain names
 * @param modTime only include domains newer than this timestamp
 */
void listObjects(AmazonS3 s3, Collection<String> domains, long modTime) {
    ObjectListing objectListing = s3.listObjects(new ListObjectsRequest().withBucketName(s3BucketName));
    String objectName = null;
    while (objectListing != null) {
        // process each entry in our result set and add the domain
        // name to our return list
        for (S3ObjectSummary objectSummary : objectListing.getObjectSummaries()) {
            // if mod time is specified then make sure we automatically skip
            // any domains older than the specified value
            if (modTime > 0 && objectSummary.getLastModified().getTime() <= modTime) {
                continue;
            }
            // for now skip any folders/objects that start with '.'
            objectName = objectSummary.getKey();
            if (objectName.charAt(0) == '.') {
                continue;
            }
            domains.add(objectName);
        }
        // check if the object listing is truncated or not (break out in this case).
        // Technically we can skip this check and just call listNextBatchOfObjects,
        // since that returns null if the object listing is not truncated, but
        // this direct check here makes the logic easier to follow.
        if (!objectListing.isTruncated()) {
            break;
        }
        objectListing = s3.listNextBatchOfObjects(objectListing);
    }
}