/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.manifoldcf.crawler.connectors.amazons3;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
import org.apache.manifoldcf.connectors.common.amazons3.S3Artifact;
import org.apache.manifoldcf.connectors.common.amazons3.XThreadBuffer;
import org.apache.manifoldcf.core.interfaces.ConfigParams;
import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
import org.apache.manifoldcf.core.interfaces.IPasswordMapperActivity;
import org.apache.manifoldcf.core.interfaces.IPostParameters;
import org.apache.manifoldcf.core.interfaces.IThreadContext;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.SpecificationNode;
import org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector;
import org.apache.manifoldcf.crawler.interfaces.IExistingVersions;
import org.apache.manifoldcf.crawler.interfaces.IProcessActivity;
import org.apache.manifoldcf.crawler.interfaces.ISeedingActivity;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.tika.Tika;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.AccessControlList;
import com.amazonaws.services.s3.model.Bucket;
import com.amazonaws.services.s3.model.CanonicalGrantee;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.Grant;
import com.amazonaws.services.s3.model.Grantee;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.Owner;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;

/**
 * Repository connector that crawls documents stored in Amazon S3 buckets.
 *
 * @author Kuhajeyan
 */
public class AmazonS3Connector extends BaseRepositoryConnector {
  private static final String BUCKET_SPLITTER = ",";

  private static final String TAB_NAME = "TabName";

  private static final String SELECTED_NUM = "SelectedNum";

  private static final String SEQ_NUM = "SeqNum";

  protected final static String ACTIVITY_READ = "read document";

  protected long lastSessionFetch = -1L;

  protected static final long timeToRelease = 300000L;

  protected AmazonS3 amazonS3;

  protected boolean connected = false;

  protected String amazons3ProxyHost = null;
  protected String amazons3ProxyPort = null;
  protected String amazons3ProxyDomain = null;
  protected String amazons3ProxyUserName = null;
  protected String amazons3ProxyPassword = null;

  protected String amazons3AwsAccessKey = null;
  protected String amazons3AwsSecretKey = null;

  private static final String STD_SEPARATOR_BUCKET_AND_KEY = BUCKET_SPLITTER;

  private String[] buckets;

  AutoDetectParser parser;
  BodyContentHandler handler;
  Metadata metadata;
  Tika tika;
  ParseContext context;

  public AmazonS3Connector() {
    parser = new AutoDetectParser();
    handler = new BodyContentHandler(AmazonS3Config.CHARACTER_LIMIT);
    metadata = new Metadata();
    tika = new Tika();
    context = new ParseContext();
  }

  @Override
  public String[] getActivitiesList() {
    return new String[] { ACTIVITY_READ };
  }

  @Override
  public String[] getBinNames(String documentIdentifier) {
    return new String[] { amazons3AwsAccessKey };
  }

  /**
   * Close the connection. Call this before discarding the connection.
   */
  @Override
  public void disconnect() throws ManifoldCFException {
    amazons3AwsAccessKey = null;
    amazons3AwsSecretKey = null;
    amazons3ProxyHost = null;
    amazons3ProxyPort = null;
    amazons3ProxyDomain = null;
    amazons3ProxyUserName = null;
    amazons3ProxyPassword = null;
  }

  /**
   * Connect method initializes the configuration parameters.
   */
  @Override
  public void connect(ConfigParams configParams) {
    super.connect(configParams);

    // aws access and secret keys
    amazons3AwsAccessKey = configParams.getParameter(AmazonS3Config.AWS_ACCESS_KEY);
    amazons3AwsSecretKey = configParams.getObfuscatedParameter(AmazonS3Config.AWS_SECRET_KEY);

    // proxy values
    amazons3ProxyHost = configParams.getParameter(AmazonS3Config.AMAZONS3_PROXY_HOST);
    amazons3ProxyPort = configParams.getParameter(AmazonS3Config.AMAZONS3_PROXY_PORT);
    amazons3ProxyDomain = configParams.getParameter(AmazonS3Config.AMAZONS3_PROXY_DOMAIN);
    amazons3ProxyUserName = configParams.getParameter(AmazonS3Config.AMAZONS3_PROXY_USERNAME);
    amazons3ProxyPassword = configParams.getObfuscatedParameter(AmazonS3Config.AMAZONS3_PROXY_PASSWORD);
  }

  /**
   * Get the Amazon S3 client; the relevant access keys should have been set
   * already.
   * @return the shared AmazonS3 client instance
   */
  protected AmazonS3 getClient() {
    if (amazonS3 == null) {
      try {
        BasicAWSCredentials awsCreds = new BasicAWSCredentials(amazons3AwsAccessKey, amazons3AwsSecretKey);
        amazonS3 = new AmazonS3Client(awsCreds);
      }
      catch (Exception e) {
        Logging.connectors.error("Error while creating Amazon S3 client", e);
      }
    }
    lastSessionFetch = System.currentTimeMillis();
    return amazonS3;
  }

  @Override
  public String check() throws ManifoldCFException {
    // connect with amazons3 client
    Logging.connectors.info("Checking connection");

    try {
      // invokes the check thread
      CheckThread checkThread = new CheckThread(getClient());
      checkThread.start();
      checkThread.join();
      if (checkThread.getException() != null) {
        Throwable thr = checkThread.getException();
        return "Check exception: " + thr.getMessage();
      }
      return checkThread.getResult();
    }
    catch (InterruptedException ex) {
      Logging.connectors.error("Error while checking connection", ex);
      throw new ManifoldCFException(ex.getMessage(), ex, ManifoldCFException.INTERRUPTED);
    }
  }
  @Override
  public boolean isConnected() {
    return amazonS3 != null && amazonS3.getS3AccountOwner() != null;
  }

  @Override
  public void poll() throws ManifoldCFException {
    if (lastSessionFetch == -1L) {
      return;
    }

    long currentTime = System.currentTimeMillis();
    if (currentTime >= lastSessionFetch + timeToRelease) {
      amazonS3 = null;
      lastSessionFetch = -1L;
    }
  }

  @Override
  public int getMaxDocumentRequest() {
    return 1;
  }

  /**
   * Return the list of relationship types that this connector recognizes.
   *
   * @return the list.
   */
  @Override
  public String[] getRelationshipTypes() {
    return new String[] { AmazonS3Config.RELATIONSHIP_RELATED };
  }

  private void fillInServerConfigurationMap(Map<String, Object> newMap, IPasswordMapperActivity mapper,
      ConfigParams parameters) {
    String amazons3AccessKey = parameters.getParameter(AmazonS3Config.AWS_ACCESS_KEY);
    String amazons3SecretKey = parameters.getParameter(AmazonS3Config.AWS_SECRET_KEY);

    // default values
    if (amazons3AccessKey == null)
      amazons3AccessKey = AmazonS3Config.AMAZONS3_AWS_ACCESS_KEY_DEFAULT;
    if (amazons3SecretKey == null)
      amazons3SecretKey = AmazonS3Config.AMAZONS3_AWS_SECRET_KEY_DEFAULT;
    else
      amazons3SecretKey = mapper.mapPasswordToKey(amazons3SecretKey);

    // fill the map
    newMap.put("AMAZONS3_AWS_ACCESS_KEY", amazons3AccessKey);
    newMap.put("AMAZONS3_AWS_SECRET_KEY", amazons3SecretKey);
  }

  private void fillInProxyConfigurationMap(Map<String, Object> newMap, IPasswordMapperActivity mapper,
      ConfigParams parameters) {
    String amazons3ProxyHost = parameters.getParameter(AmazonS3Config.AMAZONS3_PROXY_HOST);
    String amazons3ProxyPort = parameters.getParameter(AmazonS3Config.AMAZONS3_PROXY_PORT);
    String amazons3ProxyDomain = parameters.getParameter(AmazonS3Config.AMAZONS3_PROXY_DOMAIN);
    String amazons3ProxyUserName = parameters.getParameter(AmazonS3Config.AMAZONS3_PROXY_USERNAME);
    String amazons3ProxyPassword = parameters.getObfuscatedParameter(AmazonS3Config.AMAZONS3_PROXY_PASSWORD);

    if (amazons3ProxyHost == null)
      amazons3ProxyHost = AmazonS3Config.AMAZONS3_PROXY_HOST_DEFAULT;
    if (amazons3ProxyPort == null)
      amazons3ProxyPort = AmazonS3Config.AMAZONS3_PROXY_PORT_DEFAULT;
    if (amazons3ProxyDomain == null)
      amazons3ProxyDomain = AmazonS3Config.AMAZONS3_PROXY_DOMAIN_DEFAULT;
    if (amazons3ProxyUserName == null)
      amazons3ProxyUserName = AmazonS3Config.AMAZONS3_PROXY_USERNAME_DEFAULT;
    if (amazons3ProxyPassword == null)
      amazons3ProxyPassword = AmazonS3Config.AMAZONS3_PROXY_PASSWORD_DEFAULT;
    else
      amazons3ProxyPassword = mapper.mapPasswordToKey(amazons3ProxyPassword);

    // fill the map
    newMap.put("AMAZONS3_PROXY_HOST", amazons3ProxyHost);
    newMap.put("AMAZONS3_PROXY_PORT", amazons3ProxyPort);
    newMap.put("AMAZONS3_PROXY_DOMAIN", amazons3ProxyDomain);
    newMap.put("AMAZONS3_PROXY_USERNAME", amazons3ProxyUserName);
    newMap.put("AMAZONS3_PROXY_PWD", amazons3ProxyPassword);
  }

  @Override
  public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, Locale locale,
      ConfigParams parameters) throws ManifoldCFException, IOException {
    Map<String, Object> paramMap = new HashMap<String, Object>();

    // Fill in map from each tab
    fillInServerConfigurationMap(paramMap, out, parameters);
    fillInProxyConfigurationMap(paramMap, out, parameters);

    Messages.outputResourceWithVelocity(out, locale, AmazonS3Config.VIEW_CONFIG_FORWARD, paramMap);
  }

  @Override
  public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out, Locale locale,
      ConfigParams parameters, List<String> tabsArray) throws ManifoldCFException, IOException {
    // Add the Server tab
    tabsArray.add(Messages.getString(locale, AmazonS3Config.AMAZONS3_SERVER_TAB_PROPERTY));
    // Add the Proxy tab
    tabsArray.add(Messages.getString(locale, AmazonS3Config.AMAZONS3_PROXY_TAB_PROPERTY));

    // Map the parameters
    Map<String, Object> paramMap = new HashMap<String, Object>();

    // Fill in the parameters from each tab
    fillInServerConfigurationMap(paramMap, out, parameters);
    fillInProxyConfigurationMap(paramMap, out, parameters);

    // Output the Javascript - only one Velocity template for all tabs
    Messages.outputResourceWithVelocity(out, locale, AmazonS3Config.EDIT_CONFIG_HEADER_FORWARD, paramMap);
  }

  @Override
  public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out, Locale locale,
      ConfigParams parameters, String tabName) throws ManifoldCFException, IOException {
    // Call the Velocity templates for each tab
    Map<String, Object> paramMap = new HashMap<String, Object>();

    // Set the tab name
    paramMap.put(TAB_NAME, tabName);

    // Fill in the parameters
    fillInServerConfigurationMap(paramMap, out, parameters);
    fillInProxyConfigurationMap(paramMap, out, parameters);

    // Server tab
    Messages.outputResourceWithVelocity(out, locale, AmazonS3Config.EDIT_CONFIG_FORWARD_SERVER, paramMap);
    // Proxy tab
    Messages.outputResourceWithVelocity(out, locale, AmazonS3Config.EDIT_CONFIG_FORWARD_PROXY, paramMap);
  }

  private static void fillInBucketsSpecificationMap(Map<String, Object> newMap, Specification ds) {
    String s3Buckets = getBuckets(ds);
    if (s3Buckets == null)
      s3Buckets = AmazonS3Config.AMAZONS3_BUCKETS_DEFAULT;
    newMap.put("AMAZONS3BUCKETS", s3Buckets);
    Logging.connectors.info("resolved s3 bucket values : " + s3Buckets);
  }

  private static String getBuckets(Specification ds) {
    String buckets = null;
    for (int i = 0; i < ds.getChildCount(); i++) {
      SpecificationNode sn = ds.getChild(i);
      if (sn.getType().equals(AmazonS3Config.JOB_STARTPOINT_NODE_TYPE)) {
        buckets = sn.getAttributeValue(AmazonS3Config.JOB_BUCKETS_ATTRIBUTE);
      }
    }
    return buckets;
  }

  @Override
  public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext,
      Locale locale, ConfigParams parameters) throws ManifoldCFException {
    // server tab
    String awsAccessKey = variableContext.getParameter("aws_access_key");
    if (awsAccessKey != null) {
      parameters.setParameter(AmazonS3Config.AWS_ACCESS_KEY, awsAccessKey);
    }
    String awsSecretKey = variableContext.getParameter("aws_secret_key");
    if (awsSecretKey != null) {
      // set as obfuscated parameter
      parameters.setObfuscatedParameter(AmazonS3Config.AWS_SECRET_KEY, awsSecretKey);
    }

    // proxy tab
    String amazons3ProxyHost = variableContext.getParameter("amazons3_proxy_host");
    if (amazons3ProxyHost != null) {
      parameters.setParameter(AmazonS3Config.AMAZONS3_PROXY_HOST, amazons3ProxyHost);
    }
    String amazons3ProxyPort = variableContext.getParameter("amazons3_proxy_port");
    if (amazons3ProxyPort != null) {
      parameters.setParameter(AmazonS3Config.AMAZONS3_PROXY_PORT, amazons3ProxyPort);
    }
    String amazons3ProxyDomain = variableContext.getParameter("amazons3_proxy_domain");
    if (amazons3ProxyDomain != null) {
      parameters.setParameter(AmazonS3Config.AMAZONS3_PROXY_DOMAIN, amazons3ProxyDomain);
    }
    String amazons3ProxyUserName = variableContext.getParameter("amazons3_proxy_username");
    if (amazons3ProxyUserName != null) {
      parameters.setParameter(AmazonS3Config.AMAZONS3_PROXY_USERNAME, amazons3ProxyUserName);
    }
    String amazons3ProxyPassword = variableContext.getParameter("amazons3_proxy_pwd");
    if (amazons3ProxyPassword != null) {
      // set as obfuscated parameter
      parameters.setObfuscatedParameter(AmazonS3Config.AMAZONS3_PROXY_PASSWORD, amazons3ProxyPassword);
    }

    return null;
  }

  @Override
  public void viewSpecification(IHTTPOutput out, Locale locale, Specification ds, int connectionSequenceNumber)
      throws ManifoldCFException, IOException {
    Map<String, Object> paramMap = new HashMap<String, Object>();
    paramMap.put(SEQ_NUM, Integer.toString(connectionSequenceNumber));

    fillInBucketsSpecificationMap(paramMap, ds);
    Messages.outputResourceWithVelocity(out, locale, AmazonS3Config.VIEW_SPEC_FORWARD, paramMap);
  }

  /**
   * Process a specification post. This method is called at the start of job's
   * edit or view page, whenever there is a possibility that form data for a
   * connection has been posted. Its purpose is to gather form information and
   * modify the document specification accordingly. The name of the posted
   * form is always "editjob". The connector will be connected before this
   * method can be called.
   *
   * @param variableContext contains the post data, including binary
   * file-upload information.
   * @param locale is the locale the output is preferred to be in.
   * @param ds is the current document specification for this job.
   * @param connectionSequenceNumber is the unique number of this connection
   * within the job.
   * @return null if all is well, or a string error message if there is an
   * error that should prevent saving of the job (and cause a redirection to
   * an error page).
   */
  @Override
  public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification ds,
      int connectionSequenceNumber) throws ManifoldCFException {
    String seqPrefix = "s" + connectionSequenceNumber + "_";

    String s3Buckets = variableContext.getParameter(seqPrefix + AmazonS3Config.JOB_BUCKETS_ATTRIBUTE);
    // strip whitespace off the bucket list
    if (StringUtils.isNotEmpty(s3Buckets)) {
      s3Buckets = s3Buckets.replaceAll("\\s+", "");
      buckets = s3Buckets.split(BUCKET_SPLITTER);
      if (buckets != null) {
        int i = 0;
        while (i < ds.getChildCount()) {
          SpecificationNode oldNode = ds.getChild(i);
          if (oldNode.getType().equals(AmazonS3Config.JOB_STARTPOINT_NODE_TYPE)) {
            ds.removeChild(i);
            break;
          }
          i++;
        }
        SpecificationNode node = new SpecificationNode(AmazonS3Config.JOB_STARTPOINT_NODE_TYPE);
        node.setAttribute(AmazonS3Config.JOB_BUCKETS_ATTRIBUTE, s3Buckets);
        ds.addChild(ds.getChildCount(), node);
      }
    }

    String xc = variableContext.getParameter(seqPrefix + "tokencount");
    if (xc != null) {
      // Delete all tokens first
      int i = 0;
      while (i < ds.getChildCount()) {
        SpecificationNode sn = ds.getChild(i);
        if (sn.getType().equals(AmazonS3Config.JOB_ACCESS_NODE_TYPE))
          ds.removeChild(i);
        else
          i++;
      }

      int accessCount = Integer.parseInt(xc);
      i = 0;
      while (i < accessCount) {
        String accessDescription = "_" + Integer.toString(i);
        String accessOpName = seqPrefix + "accessop" + accessDescription;
        xc = variableContext.getParameter(accessOpName);
        if (xc != null && xc.equals("Delete")) {
          // Next row
          i++;
          continue;
        }
        // Get the stuff we need
        String accessSpec = variableContext.getParameter(seqPrefix + "spectoken" + accessDescription);
        SpecificationNode node = new SpecificationNode(AmazonS3Config.JOB_ACCESS_NODE_TYPE);
        node.setAttribute(AmazonS3Config.JOB_TOKEN_ATTRIBUTE, accessSpec);
        ds.addChild(ds.getChildCount(), node);
        i++;
      }

      String op = variableContext.getParameter(seqPrefix + "accessop");
      if (op != null && op.equals("Add")) {
        String accessspec = variableContext.getParameter(seqPrefix + "spectoken");
        SpecificationNode node = new SpecificationNode(AmazonS3Config.JOB_ACCESS_NODE_TYPE);
        node.setAttribute(AmazonS3Config.JOB_TOKEN_ATTRIBUTE, accessspec);
        ds.addChild(ds.getChildCount(), node);
      }
    }

    return null;
  }

  @Override
  public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification ds,
      int connectionSequenceNumber, int actualSequenceNumber, String tabName)
      throws ManifoldCFException, IOException {
    Map<String, Object> paramMap = new HashMap<String, Object>();

    paramMap.put(TAB_NAME, tabName);
    paramMap.put(SEQ_NUM, Integer.toString(connectionSequenceNumber));
    paramMap.put(SELECTED_NUM, Integer.toString(actualSequenceNumber));

    fillInBucketsSpecificationMap(paramMap, ds);
    Messages.outputResourceWithVelocity(out, locale, AmazonS3Config.EDIT_SPEC_FORWARD_BUCKETS, paramMap);
  }

  @Override
  public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification ds,
      int connectionSequenceNumber, List<String> tabsArray) throws ManifoldCFException, IOException {
    tabsArray.add(Messages.getString(locale, AmazonS3Config.AMAZONS3_BUCKETS_TAB_PROPERTY));

    Map<String, Object> paramMap = new HashMap<String, Object>();
    paramMap.put(SEQ_NUM, Integer.toString(connectionSequenceNumber));

    fillInBucketsSpecificationMap(paramMap, ds);
    Messages.outputResourceWithVelocity(out, locale, AmazonS3Config.EDIT_SPEC_HEADER_FORWARD, paramMap);
  }

  @Override
  public String addSeedDocuments(ISeedingActivity activities, Specification spec, String lastSeedVersion,
      long seedTime, int jobMode) throws ManifoldCFException, ServiceInterruption {
    long startTime;
    if (lastSeedVersion == null)
      startTime = 0L;
    else {
      // Unpack seed time from seed version string
      startTime = new Long(lastSeedVersion).longValue();
    }

    String unparsedBuckets = getBuckets(spec);
    String[] buckets = unparsedBuckets.split(BUCKET_SPLITTER);

    // get seeds
    getSeeds(activities, buckets);

    return new Long(seedTime).toString();
  }

  private void getSeeds(ISeedingActivity activities, String[] buckets) throws ManifoldCFException,
      ServiceInterruption {
    GetSeedsThread t = new GetSeedsThread(getClient(), buckets);
    try {
      t.start();
      boolean wasInterrupted = false;
      try {
        XThreadBuffer<S3Artifact> seedBuffer = t.getBuffer();
        // Pick up the paths, and add them to the activities, before we
        // join with the child thread.
        while (true) {
          // The only kind of exceptions this can throw are going to
          // shut the process down.
          S3Artifact s3Artifact = seedBuffer.fetch();
          if (s3Artifact == null) {
            Logging.connectors.info("No artifact returned");
            break;
          }
          String issueKey = s3Artifact.getBucketName() + STD_SEPARATOR_BUCKET_AND_KEY + s3Artifact.getKey();
          Logging.connectors.info("Issue key is : " + issueKey);
          activities.addSeedDocument(issueKey);
        }
      }
      catch (InterruptedException e) {
        Logging.connectors.error(e);
        wasInterrupted = true;
        throw e;
      }
      catch (ManifoldCFException e) {
        Logging.connectors.error(e);
        if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
          wasInterrupted = true;
        throw e;
      }
      finally {
        if (!wasInterrupted)
          t.finishUp();
      }
    }
    catch (InterruptedException e) {
      Logging.connectors.error(e);
      t.interrupt();
      throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
    }
    catch (java.net.SocketTimeoutException e) {
      Logging.connectors.error(e);
      handleIOException(e);
    }
    catch (InterruptedIOException e) {
      Logging.connectors.error(e);
      t.interrupt();
      handleIOException(e);
    }
    catch (IOException e) {
      Logging.connectors.error(e);
      handleIOException(e);
    }
    catch (ResponseException e) {
      Logging.connectors.error(e);
      handleResponseException(e);
    }
  }

  private static void handleIOException(IOException e) throws ManifoldCFException, ServiceInterruption {
    if (!(e instanceof java.net.SocketTimeoutException) && (e instanceof InterruptedIOException)) {
      throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
    }
    Logging.connectors.warn("Amazons3: IO exception: " + e.getMessage(), e);
    long currentTime = System.currentTimeMillis();
    throw new ServiceInterruption("IO exception: " + e.getMessage(), e, currentTime + 300000L,
        currentTime + 3 * 60 * 60000L, -1, false);
  }

  private static void handleResponseException(ResponseException e) throws ManifoldCFException,
      ServiceInterruption {
    throw new ManifoldCFException("Unexpected response: " + e.getMessage(), e);
  }

  @Override
  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
      IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
      throws ManifoldCFException, ServiceInterruption {
    AmazonS3 amazons3Client = getClient();
    if (amazons3Client == null)
      throw new ManifoldCFException("Amazon client can not connect at the moment");
    String[] acls = null;

    // loop documents and process
    for (String documentIdentifier : documentIdentifiers) {
      try {
        if (documentIdentifier != null && StringUtils.isNotEmpty(documentIdentifier)) {
          String versionString;
          String[] aclsToUse;

          if (documentIdentifier.split(STD_SEPARATOR_BUCKET_AND_KEY) == null
              && documentIdentifier.length() < 1) {
            continue;
          }

          S3Artifact s3Artifact = getS3Artifact(documentIdentifier);
          S3Object s3Obj = amazons3Client
              .getObject(new GetObjectRequest(s3Artifact.getBucketName(), s3Artifact.getKey()));

          if (s3Obj == null) {
            // no such document in the bucket now
            // delete document
            activities.deleteDocument(documentIdentifier);
            continue;
          }

          Logging.connectors.info("Content-Type: " + s3Obj.getObjectMetadata().getContentType());
          ObjectMetadata objectMetadata = s3Obj.getObjectMetadata();
          Date lastModified = objectMetadata.getLastModified();
          StringBuilder sb = new StringBuilder();
          if (lastModified == null) {
            // remove the content
            activities.deleteDocument(documentIdentifier);
            continue;
          }

          aclsToUse = new String[0];

          AccessControlList objectAcl = amazons3Client.getObjectAcl(s3Artifact.getBucketName(),
              s3Artifact.getKey());
          Set<Grant> grants = objectAcl.getGrants();
          String[] users = getUsers(grants);

          // sort
          aclsToUse = users;
          Arrays.sort(aclsToUse);
          packList(sb, aclsToUse, '+');
          if (aclsToUse.length > 0) {
            sb.append('+');
            pack(sb, AmazonS3Config.defaultAuthorityDenyToken, '+');
          }
          else
            sb.append('-');
          // sb.append(lastModified.toString());

          versionString = sb.toString();
          Logging.connectors.debug("version string : " + versionString);

          if (versionString.length() > 0
              && !activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
            Logging.connectors.info("Document does not need to be reindexed : " + documentIdentifier);
            continue;
          }

          Logging.connectors.debug("Processing document identifier '" + documentIdentifier + "'");

          long startTime = System.currentTimeMillis();
          String errorCode = null;
          String errorDesc = null;
          Long fileSize = null;

          try {
            String mimeType = "text/plain";// default

            // tika work starts
            InputStream in = null;
            String document = null;
            try {
              in = s3Obj.getObjectContent();
              parser.parse(in, handler, metadata, context);
              mimeType = tika.detect(in);
              document = handler.toString();
              if (document == null)
                continue;
              metadata.set(Metadata.CONTENT_TYPE, mimeType);
            }
            catch (Exception e) {
              Logging.connectors.error("Error while parsing tika contents", e);
            }
            finally {
              if (in != null)
                IOUtils.closeQuietly(in);
            }

            String documentURI = getDocumentURI(s3Artifact);
            Logging.connectors.debug("document : " + documentURI);

            // need some investigation
            if (!activities.checkURLIndexable(documentURI)) {
              errorCode = activities.EXCLUDED_URL;
              errorDesc = "Excluded because of URL ('" + documentURI + "')";
              activities.noDocument(documentIdentifier, versionString);
              continue;
            }
            if (!activities.checkMimeTypeIndexable(mimeType)) {
              errorCode = activities.EXCLUDED_MIMETYPE;
              errorDesc = "Excluded because of mime type ('" + mimeType + "')";
              activities.noDocument(documentIdentifier, versionString);
              continue;
            }
            if (!activities.checkDateIndexable(lastModified)) {
              errorCode = activities.EXCLUDED_DATE;
              errorDesc = "Excluded because of date (" + lastModified + ")";
              activities.noDocument(documentIdentifier, versionString);
              continue;
            }

            // otherwise process
            RepositoryDocument rd = new RepositoryDocument();

            // Turn into acls and add into description
            String[] denyAclsToUse;
            if (aclsToUse.length > 0)
              denyAclsToUse = new String[] { AmazonS3Config.defaultAuthorityDenyToken };
            else
              denyAclsToUse = new String[0];
            rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, aclsToUse, denyAclsToUse);

            rd.setMimeType(mimeType);
            if (lastModified != null)
              rd.setModifiedDate(lastModified);

            // set all meta-data fields
            addAllMetaData(rd, metadata);

            // get document
            try {
              byte[] documentBytes = document.getBytes(StandardCharsets.UTF_8);
              long fileLength = documentBytes.length;

              if (!activities.checkLengthIndexable(fileLength)) {
                errorCode = activities.EXCLUDED_LENGTH;
                errorDesc = "Excluded because of document length (" + fileLength + ")";
                activities.noDocument(documentIdentifier, versionString);
                continue;
              }

              InputStream is = new ByteArrayInputStream(documentBytes);
              try {
                rd.setBinary(is, fileLength);
                activities.ingestDocumentWithException(documentIdentifier, versionString, documentURI, rd);

                errorCode = "OK";
                fileSize = new Long(fileLength);
              }
              finally {
                if (is != null)
                  IOUtils.closeQuietly(is);
              }
            }
            catch (Exception e) {
              Logging.connectors.error(e);
            }
          }
          catch (Exception e) {
            Logging.connectors.error(e);
          }
        }
      }
      catch (AmazonServiceException e) {
        Logging.connectors.error(e);
      }
      catch (AmazonClientException e) {
        Logging.connectors.error(e);
      }
    }
  }

  /**
   * Adds available meta data to the repository document.
   * @param rd repository document
   * @param metadata2 Tika metadata extracted from the document
   * @throws ManifoldCFException
   */
  private void addAllMetaData(RepositoryDocument rd, Metadata metadata2) throws ManifoldCFException {
    for (String field : metadata2.names()) {
      rd.addField(field, metadata2.get(field));
    }
  }

  /**
   * Constructs the document URI for an s3 artifact.
   * @param s3Artifact
   * @return the document URI
   */
  private String getDocumentURI(S3Artifact s3Artifact) {
    return String.format(AmazonS3Config.DOCUMENT_URI_FORMAT, s3Artifact.getBucketName(), s3Artifact.getKey());
  }

  /**
   * Get the users that have access to the artifact.
   * @param grants grants available for the artifact
   * @return the user names or identifiers
   */
  private String[] getUsers(Set<Grant> grants) {
    Set<String> users = new HashSet<String>();// no duplicates
    for (Grant grant : grants) {
      if (grant != null && grant.getGrantee() != null) {
        Grantee grantee = grant.getGrantee();
        if (grantee instanceof CanonicalGrantee) {
          users.add(((CanonicalGrantee) grantee).getDisplayName());
        }
        else {
          users.add(grantee.getIdentifier());
        }
      }
    }
    return users.toArray(new String[users.size()]);
  }

  /**
   * Get the s3 artifact (document) using the document identifier (bucket,key).
   * @param documentIdentifier
   * @return the parsed artifact
   * @throws ManifoldCFException
   */
  private S3Artifact getS3Artifact(String documentIdentifier) throws ManifoldCFException {
    String key;
    String bucketName = documentIdentifier.split(STD_SEPARATOR_BUCKET_AND_KEY)[0];
    key = documentIdentifier.split(STD_SEPARATOR_BUCKET_AND_KEY)[1];

    if (StringUtils.isEmpty(bucketName) || StringUtils.isEmpty(key))
      throw new ManifoldCFException("bucket or key name is empty");

    return new S3Artifact(bucketName, key);
  }

  protected static class GetSeedsThread extends Thread {

    protected Throwable exception = null;

    protected String[] bucketsToBeRemoved;

    protected AmazonS3 s3 = null;

    protected XThreadBuffer<S3Artifact> seedBuffer;

    public XThreadBuffer<S3Artifact> getBuffer() {
      return seedBuffer;
    }

    public void setBuffer(XThreadBuffer<S3Artifact> buffer) {
      this.seedBuffer = buffer;
    }

    public GetSeedsThread(AmazonS3 s3, String[] buckets) {
      super();
      this.bucketsToBeRemoved = buckets;
      this.s3 = s3;
      seedBuffer = new XThreadBuffer<S3Artifact>();
      setDaemon(true);
    }

    @Override
    public void run() {
      try {
        // push the keys for all documents
        processSeeds();
      }
      catch (Exception e) {
        Logging.connectors.error(e);
        this.exception = e;
      }
      finally {
        seedBuffer.signalDone();
      }
    }

    private void processSeeds() {
      if (s3 != null) {
        List<Bucket> listBuckets = s3.listBuckets();
        List<String> refinedBuckets = new ArrayList<String>();
        if (bucketsToBeRemoved != null && bucketsToBeRemoved.length > 0) {
          for (Bucket bucket : listBuckets) {
            if (!Arrays.asList(bucketsToBeRemoved).contains(bucket.getName())) {
              refinedBuckets.add(bucket.getName());
            }
          }
        }

        for (String bucket : refinedBuckets) {
          String bucketName = bucket;
          try {
            pushSeeds(bucketName);
          }
          catch (Exception e) {
            Logging.connectors.error(e);
          }
        }
      }
      else {
        Logging.connectors.info("Could not connect to Amazon");
      }
    }

    private void pushSeeds(String bucketName) {
      try {
        ObjectListing objectListing = s3.listObjects(new ListObjectsRequest().withBucketName(bucketName));
        for (S3ObjectSummary objectSummary : objectListing.getObjectSummaries()) {
          try {
            addSeed(bucketName, objectSummary);
          }
          catch (Exception e) {
            Logging.connectors.error(e);
          }
        }
      }
      catch (Exception e) {
        Logging.connectors.error(e);
      }
    }

    private void addSeed(String bucketName, S3ObjectSummary objectSummary) throws InterruptedException {
      String objectKey = objectSummary.getKey();
      String combinedKey = bucketName + STD_SEPARATOR_BUCKET_AND_KEY + objectKey;
      // push the key
      seedBuffer.add(new S3Artifact(bucketName, objectKey));
      Logging.connectors.info("Pushed a new key (combined) into the seed buffer : " + combinedKey);
    }

    public void finishUp() throws InterruptedException, IOException, ResponseException {
      seedBuffer.abandon();
      join();
      Throwable thr = exception;
      if (thr != null) {
        if (thr instanceof IOException)
          throw (IOException) thr;
        else if (thr instanceof ResponseException)
          throw (ResponseException) thr;
        else if (thr instanceof RuntimeException)
          throw (RuntimeException) thr;
        else if (thr instanceof Error)
          throw (Error) thr;
        else
          throw new RuntimeException("Unhandled exception of type: " + thr.getClass().getName(), thr);
      }
    }
  }

  protected static class CheckThread extends Thread {

    protected String result = "Unknown";

    protected AmazonS3 s3 = null;

    protected Throwable exception = null;

    public CheckThread(AmazonS3 s3) {
      this.s3 = s3;
    }

    public String getResult() {
      return result;
    }

    public Throwable getException() {
      return exception;
    }

    @Override
    public void run() {
      try {
        if (s3 != null) {
          Owner s3AccountOwner = s3.getS3AccountOwner();
          if (s3AccountOwner != null) {
            result = StringUtils.isNotEmpty(s3AccountOwner.getDisplayName()) ? "Connection OK"
                : "Connection Failed";
          }
        }
      }
      catch (AmazonServiceException e) {
        result = "Connection Failed : " + e.getMessage();
        exception = e;
        Logging.connectors.error(e);
      }
      catch (AmazonClientException e) {
        result = "Connection Failed : " + e.getMessage();
        exception = e;
        Logging.connectors.error(e);
      }
    }
  }
}
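For reference, the standalone sketch below illustrates the composite "bucket,key" document identifier convention used above: addSeed pushes bucketName + "," + objectKey into the seed buffer, and getS3Artifact later splits that string back into a bucket name and an object key. It is a minimal illustration using only JDK classes; the class name and the bucket/key values are made up for the example and are not part of the connector or the AWS SDK.

import java.util.AbstractMap.SimpleEntry;
import java.util.Map;

/** Standalone demo of the "bucket,key" document identifier convention (illustrative only). */
public class DocumentIdentifierDemo {

  // Same separator the connector uses between bucket name and object key.
  private static final String SEPARATOR = ",";

  /** Build the composite identifier, as the seeding thread does before pushing it into the buffer. */
  static String buildIdentifier(String bucketName, String objectKey) {
    return bucketName + SEPARATOR + objectKey;
  }

  /** Split the identifier back into (bucket, key), mirroring what getS3Artifact does. */
  static Map.Entry<String, String> parseIdentifier(String documentIdentifier) {
    String[] parts = documentIdentifier.split(SEPARATOR);
    if (parts.length < 2 || parts[0].isEmpty() || parts[1].isEmpty())
      throw new IllegalArgumentException("bucket or key name is empty");
    return new SimpleEntry<String, String>(parts[0], parts[1]);
  }

  public static void main(String[] args) {
    // Hypothetical bucket and key, for illustration only.
    String id = buildIdentifier("my-crawl-bucket", "reports/2015/q1.pdf");
    Map.Entry<String, String> artifact = parseIdentifier(id);
    System.out.println("identifier = " + id);
    System.out.println("bucket     = " + artifact.getKey());
    System.out.println("key        = " + artifact.getValue());
  }
}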