Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.gobblin.salesforce; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.net.URI; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.ListIterator; import java.util.Map; import org.apache.http.HttpEntity; import org.apache.http.NameValuePair; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.utils.URIBuilder; import org.apache.http.message.BasicNameValuePair; import com.google.common.base.Strings; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Lists; import com.google.gson.Gson; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.sforce.async.AsyncApiException; import com.sforce.async.BatchInfo; import com.sforce.async.BatchInfoList; import com.sforce.async.BatchStateEnum; import com.sforce.async.BulkConnection; import com.sforce.async.ConcurrencyMode; import com.sforce.async.ContentType; import com.sforce.async.JobInfo; import com.sforce.async.OperationEnum; import com.sforce.async.QueryResultList; import com.sforce.soap.partner.PartnerConnection; import com.sforce.ws.ConnectorConfig; import org.apache.gobblin.configuration.ConfigurationKeys; import org.apache.gobblin.configuration.WorkUnitState; import org.apache.gobblin.password.PasswordManager; import org.apache.gobblin.source.extractor.DataRecordException; import org.apache.gobblin.source.extractor.exception.HighWatermarkException; import org.apache.gobblin.source.extractor.exception.RecordCountException; import org.apache.gobblin.source.extractor.exception.RestApiClientException; import org.apache.gobblin.source.extractor.exception.RestApiConnectionException; import org.apache.gobblin.source.extractor.exception.SchemaException; import org.apache.gobblin.source.extractor.extract.Command; import org.apache.gobblin.source.extractor.extract.CommandOutput; import org.apache.gobblin.source.extractor.partition.Partitioner; import org.apache.gobblin.source.jdbc.SqlQueryUtils; import org.apache.gobblin.source.extractor.extract.restapi.RestApiCommand; import org.apache.gobblin.source.extractor.extract.restapi.RestApiCommand.RestApiCommandType; import org.apache.gobblin.source.extractor.extract.restapi.RestApiConnector; import org.apache.gobblin.source.extractor.extract.restapi.RestApiExtractor; import org.apache.gobblin.source.extractor.resultset.RecordSet; import org.apache.gobblin.source.extractor.resultset.RecordSetList; import org.apache.gobblin.source.extractor.schema.Schema; import org.apache.gobblin.source.extractor.utils.InputStreamCSVReader; import org.apache.gobblin.source.extractor.utils.Utils; import org.apache.gobblin.source.extractor.watermark.Predicate; import org.apache.gobblin.source.extractor.watermark.WatermarkType; import org.apache.gobblin.source.workunit.WorkUnit; import lombok.Data; import lombok.extern.slf4j.Slf4j; /** * An implementation of salesforce extractor for extracting data from SFDC */ @Slf4j public class SalesforceExtractor extends RestApiExtractor { private static final String SOQL_RESOURCE = "/queryAll"; public static final String SALESFORCE_TIMESTAMP_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'.000Z'"; private static final String SALESFORCE_DATE_FORMAT = "yyyy-MM-dd"; private static final String SALESFORCE_HOUR_FORMAT = "HH"; private static final String SALESFORCE_SOAP_AUTH_SERVICE = "/services/Soap/u"; private static final Gson GSON = new Gson(); private static final int MAX_PK_CHUNKING_SIZE = 250000; private static final int MIN_PK_CHUNKING_SIZE = 100000; private static final int DEFAULT_PK_CHUNKING_SIZE = 200000; private static final String ENABLE_PK_CHUNKING_KEY = "salesforce.enablePkChunking"; private static final String PK_CHUNKING_SIZE_KEY = "salesforce.pkChunkingSize"; private static final int MAX_RETRY_INTERVAL_SECS = 600; // avoid using too many bulk API calls by only allowing PK chunking only if max partitions is configured <= this private static final int PK_CHUNKING_MAX_PARTITIONS_LIMIT = 3; private static final String FETCH_RETRY_LIMIT_KEY = "salesforce.fetchRetryLimit"; private static final int DEFAULT_FETCH_RETRY_LIMIT = 5; private boolean pullStatus = true; private String nextUrl; private BulkConnection bulkConnection = null; private boolean bulkApiInitialRun = true; private JobInfo bulkJob = new JobInfo(); private BufferedReader bulkBufferedReader = null; private List<BatchIdAndResultId> bulkResultIdList = Lists.newArrayList(); private int bulkResultIdCount = 0; private boolean bulkJobFinished = true; private List<String> bulkRecordHeader; private int bulkResultColumCount; private boolean newBulkResultSet = true; private int bulkRecordCount = 0; private int prevBulkRecordCount = 0; private List<String> csvRecord; private final boolean pkChunking; private final int pkChunkingSize; private final SalesforceConnector sfConnector; private final int fetchRetryLimit; private final int batchSize; public SalesforceExtractor(WorkUnitState state) { super(state); this.sfConnector = (SalesforceConnector) this.connector; // don't allow pk chunking if max partitions too high or have user specified partitions if (state.getPropAsBoolean(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, false) || state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS) > PK_CHUNKING_MAX_PARTITIONS_LIMIT) { if (state.getPropAsBoolean(ENABLE_PK_CHUNKING_KEY, false)) { log.warn("Max partitions too high, so PK chunking is not enabled"); } this.pkChunking = false; } else { this.pkChunking = state.getPropAsBoolean(ENABLE_PK_CHUNKING_KEY, false); } this.pkChunkingSize = Math.max(MIN_PK_CHUNKING_SIZE, Math.min(MAX_PK_CHUNKING_SIZE, state.getPropAsInt(PK_CHUNKING_SIZE_KEY, DEFAULT_PK_CHUNKING_SIZE))); // Get batch size from .pull file int tmpBatchSize = state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_FETCH_SIZE, ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE); this.batchSize = tmpBatchSize == 0 ? ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE : tmpBatchSize; this.fetchRetryLimit = state.getPropAsInt(FETCH_RETRY_LIMIT_KEY, DEFAULT_FETCH_RETRY_LIMIT); } @Override protected RestApiConnector getConnector(WorkUnitState state) { return new SalesforceConnector(state); } /** * true is further pull required else false */ public void setPullStatus(boolean pullStatus) { this.pullStatus = pullStatus; } /** * url for the next pull from salesforce */ public void setNextUrl(String nextUrl) { this.nextUrl = nextUrl; } private boolean isBulkJobFinished() { return this.bulkJobFinished; } private void setBulkJobFinished(boolean bulkJobFinished) { this.bulkJobFinished = bulkJobFinished; } public boolean isNewBulkResultSet() { return this.newBulkResultSet; } public void setNewBulkResultSet(boolean newBulkResultSet) { this.newBulkResultSet = newBulkResultSet; } @Override public HttpEntity getAuthentication() throws RestApiConnectionException { log.debug("Authenticating salesforce"); return this.connector.getAuthentication(); } @Override public List<Command> getSchemaMetadata(String schema, String entity) throws SchemaException { log.debug("Build url to retrieve schema"); return constructGetCommand(this.sfConnector.getFullUri("/sobjects/" + entity.trim() + "/describe")); } @Override public JsonArray getSchema(CommandOutput<?, ?> response) throws SchemaException { log.info("Get schema from salesforce"); String output; Iterator<String> itr = (Iterator<String>) response.getResults().values().iterator(); if (itr.hasNext()) { output = itr.next(); } else { throw new SchemaException("Failed to get schema from salesforce; REST response has no output"); } JsonArray fieldJsonArray = new JsonArray(); JsonElement element = GSON.fromJson(output, JsonObject.class); JsonObject jsonObject = element.getAsJsonObject(); try { JsonArray array = jsonObject.getAsJsonArray("fields"); for (JsonElement columnElement : array) { JsonObject field = columnElement.getAsJsonObject(); Schema schema = new Schema(); schema.setColumnName(field.get("name").getAsString()); String dataType = field.get("type").getAsString(); String elementDataType = "string"; List<String> mapSymbols = null; JsonObject newDataType = this.convertDataType(field.get("name").getAsString(), dataType, elementDataType, mapSymbols); log.debug("ColumnName:" + field.get("name").getAsString() + "; old datatype:" + dataType + "; new datatype:" + newDataType); schema.setDataType(newDataType); schema.setLength(field.get("length").getAsLong()); schema.setPrecision(field.get("precision").getAsInt()); schema.setScale(field.get("scale").getAsInt()); schema.setNullable(field.get("nillable").getAsBoolean()); schema.setFormat(null); schema.setComment((field.get("label").isJsonNull() ? null : field.get("label").getAsString())); schema.setDefaultValue( (field.get("defaultValue").isJsonNull() ? null : field.get("defaultValue").getAsString())); schema.setUnique(field.get("unique").getAsBoolean()); String jsonStr = GSON.toJson(schema); JsonObject obj = GSON.fromJson(jsonStr, JsonObject.class).getAsJsonObject(); fieldJsonArray.add(obj); } } catch (Exception e) { throw new SchemaException("Failed to get schema from salesforce; error - " + e.getMessage(), e); } return fieldJsonArray; } @Override public List<Command> getHighWatermarkMetadata(String schema, String entity, String watermarkColumn, List<Predicate> predicateList) throws HighWatermarkException { log.debug("Build url to retrieve high watermark"); String query = "SELECT " + watermarkColumn + " FROM " + entity; String defaultPredicate = " " + watermarkColumn + " != null"; String defaultSortOrder = " ORDER BY " + watermarkColumn + " desc LIMIT 1"; String existingPredicate = ""; if (this.updatedQuery != null) { String queryLowerCase = this.updatedQuery.toLowerCase(); int startIndex = queryLowerCase.indexOf(" where "); if (startIndex > 0) { existingPredicate = this.updatedQuery.substring(startIndex); } } query = query + existingPredicate; String limitString = getLimitFromInputQuery(query); query = query.replace(limitString, ""); Iterator<Predicate> i = predicateList.listIterator(); while (i.hasNext()) { Predicate predicate = i.next(); query = SqlQueryUtils.addPredicate(query, predicate.getCondition()); } query = SqlQueryUtils.addPredicate(query, defaultPredicate); query = query + defaultSortOrder; log.info("QUERY: " + query); try { return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(query))); } catch (Exception e) { throw new HighWatermarkException( "Failed to get salesforce url for high watermark; error - " + e.getMessage(), e); } } @Override public long getHighWatermark(CommandOutput<?, ?> response, String watermarkColumn, String format) throws HighWatermarkException { log.info("Get high watermark from salesforce"); String output; Iterator<String> itr = (Iterator<String>) response.getResults().values().iterator(); if (itr.hasNext()) { output = itr.next(); } else { throw new HighWatermarkException( "Failed to get high watermark from salesforce; REST response has no output"); } JsonElement element = GSON.fromJson(output, JsonObject.class); long high_ts; try { JsonObject jsonObject = element.getAsJsonObject(); JsonArray jsonArray = jsonObject.getAsJsonArray("records"); if (jsonArray.size() == 0) { return -1; } String value = jsonObject.getAsJsonArray("records").get(0).getAsJsonObject().get(watermarkColumn) .getAsString(); if (format != null) { SimpleDateFormat inFormat = new SimpleDateFormat(format); Date date = null; try { date = inFormat.parse(value); } catch (ParseException e) { log.error("ParseException: " + e.getMessage(), e); } SimpleDateFormat outFormat = new SimpleDateFormat("yyyyMMddHHmmss"); high_ts = Long.parseLong(outFormat.format(date)); } else { high_ts = Long.parseLong(value); } } catch (Exception e) { throw new HighWatermarkException( "Failed to get high watermark from salesforce; error - " + e.getMessage(), e); } return high_ts; } @Override public List<Command> getCountMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws RecordCountException { log.debug("Build url to retrieve source record count"); String existingPredicate = ""; if (this.updatedQuery != null) { String queryLowerCase = this.updatedQuery.toLowerCase(); int startIndex = queryLowerCase.indexOf(" where "); if (startIndex > 0) { existingPredicate = this.updatedQuery.substring(startIndex); } } String query = "SELECT COUNT() FROM " + entity + existingPredicate; String limitString = getLimitFromInputQuery(query); query = query.replace(limitString, ""); try { if (isNullPredicate(predicateList)) { log.info("QUERY with null predicate: " + query); return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(query))); } Iterator<Predicate> i = predicateList.listIterator(); while (i.hasNext()) { Predicate predicate = i.next(); query = SqlQueryUtils.addPredicate(query, predicate.getCondition()); } query = query + getLimitFromInputQuery(this.updatedQuery); log.info("QUERY: " + query); return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(query))); } catch (Exception e) { throw new RecordCountException( "Failed to get salesforce url for record count; error - " + e.getMessage(), e); } } @Override public long getCount(CommandOutput<?, ?> response) throws RecordCountException { log.info("Get source record count from salesforce"); String output; Iterator<String> itr = (Iterator<String>) response.getResults().values().iterator(); if (itr.hasNext()) { output = itr.next(); } else { throw new RecordCountException("Failed to get count from salesforce; REST response has no output"); } JsonElement element = GSON.fromJson(output, JsonObject.class); long count; try { JsonObject jsonObject = element.getAsJsonObject(); count = jsonObject.get("totalSize").getAsLong(); } catch (Exception e) { throw new RecordCountException("Failed to get record count from salesforce; error - " + e.getMessage(), e); } return count; } @Override public List<Command> getDataMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws DataRecordException { log.debug("Build url to retrieve data records"); String query = this.updatedQuery; String url = null; try { if (this.getNextUrl() != null && this.pullStatus == true) { url = this.getNextUrl(); } else { if (isNullPredicate(predicateList)) { log.info("QUERY:" + query); return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(query))); } String limitString = getLimitFromInputQuery(query); query = query.replace(limitString, ""); Iterator<Predicate> i = predicateList.listIterator(); while (i.hasNext()) { Predicate predicate = i.next(); query = SqlQueryUtils.addPredicate(query, predicate.getCondition()); } if (Boolean.valueOf( this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE))) { query = SqlQueryUtils.addPredicate(query, "IsDeleted = true"); } query = query + limitString; log.info("QUERY: " + query); url = this.sfConnector.getFullUri(getSoqlUrl(query)); } return constructGetCommand(url); } catch (Exception e) { throw new DataRecordException( "Failed to get salesforce url for data records; error - " + e.getMessage(), e); } } private static String getLimitFromInputQuery(String query) { String inputQuery = query.toLowerCase(); int limitIndex = inputQuery.indexOf(" limit"); if (limitIndex > 0) { return query.substring(limitIndex); } return ""; } @Override public Iterator<JsonElement> getData(CommandOutput<?, ?> response) throws DataRecordException { log.debug("Get data records from response"); String output; Iterator<String> itr = (Iterator<String>) response.getResults().values().iterator(); if (itr.hasNext()) { output = itr.next(); } else { throw new DataRecordException("Failed to get data from salesforce; REST response has no output"); } List<JsonElement> rs = Lists.newArrayList(); JsonElement element = GSON.fromJson(output, JsonObject.class); JsonArray partRecords; try { JsonObject jsonObject = element.getAsJsonObject(); partRecords = jsonObject.getAsJsonArray("records"); if (jsonObject.get("done").getAsBoolean()) { setPullStatus(false); } else { setNextUrl(this.sfConnector.getFullUri(jsonObject.get("nextRecordsUrl").getAsString() .replaceAll(this.sfConnector.getServicesDataEnvPath(), ""))); } JsonArray array = Utils.removeElementFromJsonArray(partRecords, "attributes"); Iterator<JsonElement> li = array.iterator(); while (li.hasNext()) { JsonElement recordElement = li.next(); rs.add(recordElement); } return rs.iterator(); } catch (Exception e) { throw new DataRecordException("Failed to get records from salesforce; error - " + e.getMessage(), e); } } @Override public boolean getPullStatus() { return this.pullStatus; } @Override public String getNextUrl() { return this.nextUrl; } public static String getSoqlUrl(String soqlQuery) throws RestApiClientException { String path = SOQL_RESOURCE + "/"; NameValuePair pair = new BasicNameValuePair("q", soqlQuery); List<NameValuePair> qparams = new ArrayList<>(); qparams.add(pair); return buildUrl(path, qparams); } private static String buildUrl(String path, List<NameValuePair> qparams) throws RestApiClientException { URIBuilder builder = new URIBuilder(); builder.setPath(path); ListIterator<NameValuePair> i = qparams.listIterator(); while (i.hasNext()) { NameValuePair keyValue = i.next(); builder.setParameter(keyValue.getName(), keyValue.getValue()); } URI uri; try { uri = builder.build(); } catch (Exception e) { throw new RestApiClientException("Failed to build url; error - " + e.getMessage(), e); } return new HttpGet(uri).getURI().toString(); } private static boolean isNullPredicate(List<Predicate> predicateList) { if (predicateList == null || predicateList.size() == 0) { return true; } return false; } @Override public String getWatermarkSourceFormat(WatermarkType watermarkType) { switch (watermarkType) { case TIMESTAMP: return "yyyy-MM-dd'T'HH:mm:ss"; case DATE: return "yyyy-MM-dd"; default: return null; } } @Override public String getHourPredicateCondition(String column, long value, String valueFormat, String operator) { log.info("Getting hour predicate from salesforce"); String Formattedvalue = Utils.toDateTimeFormat(Long.toString(value), valueFormat, SALESFORCE_HOUR_FORMAT); return column + " " + operator + " " + Formattedvalue; } @Override public String getDatePredicateCondition(String column, long value, String valueFormat, String operator) { log.info("Getting date predicate from salesforce"); String Formattedvalue = Utils.toDateTimeFormat(Long.toString(value), valueFormat, SALESFORCE_DATE_FORMAT); return column + " " + operator + " " + Formattedvalue; } @Override public String getTimestampPredicateCondition(String column, long value, String valueFormat, String operator) { log.info("Getting timestamp predicate from salesforce"); String Formattedvalue = Utils.toDateTimeFormat(Long.toString(value), valueFormat, SALESFORCE_TIMESTAMP_FORMAT); return column + " " + operator + " " + Formattedvalue; } @Override public Map<String, String> getDataTypeMap() { Map<String, String> dataTypeMap = ImmutableMap.<String, String>builder().put("url", "string") .put("textarea", "string").put("reference", "string").put("phone", "string") .put("masterrecord", "string").put("location", "string").put("id", "string") .put("encryptedstring", "string").put("email", "string").put("DataCategoryGroupReference", "string") .put("calculated", "string").put("anyType", "string").put("address", "string").put("blob", "string") .put("date", "date").put("datetime", "timestamp").put("time", "time").put("object", "string") .put("string", "string").put("int", "int").put("long", "long").put("double", "double") .put("percent", "double").put("currency", "double").put("decimal", "double") .put("boolean", "boolean").put("picklist", "string").put("multipicklist", "string") .put("combobox", "string").put("list", "string").put("set", "string").put("map", "string") .put("enum", "string").build(); return dataTypeMap; } @Override public Iterator<JsonElement> getRecordSetFromSourceApi(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws IOException { log.debug("Getting salesforce data using bulk api"); RecordSet<JsonElement> rs = null; try { //Get query result ids in the first run //result id is used to construct url while fetching data if (this.bulkApiInitialRun == true) { // set finish status to false before starting the bulk job this.setBulkJobFinished(false); this.bulkResultIdList = getQueryResultIds(entity, predicateList); log.info("Number of bulk api resultSet Ids:" + this.bulkResultIdList.size()); } // Get data from input stream // If bulk load is not finished, get data from the stream // Skip empty result sets since they will cause the extractor to terminate early while (!this.isBulkJobFinished() && (rs == null || rs.isEmpty())) { rs = getBulkData(); } // Set bulkApiInitialRun to false after the completion of first run this.bulkApiInitialRun = false; // If bulk job is finished, get soft deleted records using Rest API boolean isSoftDeletesPullDisabled = Boolean.valueOf(this.workUnit.getProp( SalesforceConfigurationKeys.SOURCE_QUERYBASED_SALESFORCE_IS_SOFT_DELETES_PULL_DISABLED)); if (rs == null || rs.isEmpty()) { // Get soft delete records only if IsDeleted column exists and soft deletes pull is not disabled if (this.columnList.contains("IsDeleted") && !isSoftDeletesPullDisabled) { return this.getSoftDeletedRecords(schema, entity, workUnit, predicateList); } log.info("Ignoring soft delete records"); } return rs.iterator(); } catch (Exception e) { throw new IOException("Failed to get records using bulk api; error - " + e.getMessage(), e); } } /** * Get soft deleted records using Rest Api * @return iterator with deleted records */ private Iterator<JsonElement> getSoftDeletedRecords(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws DataRecordException { return this.getRecordSet(schema, entity, workUnit, predicateList); } /** * Login to salesforce * @return login status */ public boolean bulkApiLogin() throws Exception { log.info("Authenticating salesforce bulk api"); boolean success = false; String hostName = this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_HOST_NAME); String apiVersion = this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_VERSION); if (Strings.isNullOrEmpty(apiVersion)) { apiVersion = "29.0"; } String soapAuthEndPoint = hostName + SALESFORCE_SOAP_AUTH_SERVICE + "/" + apiVersion; try { ConnectorConfig partnerConfig = new ConnectorConfig(); if (super.workUnitState.contains(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL) && !super.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL).isEmpty()) { partnerConfig.setProxy(super.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL), super.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_CONN_USE_PROXY_PORT)); } String securityToken = this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_SECURITY_TOKEN); String password = PasswordManager.getInstance(this.workUnitState) .readPassword(this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_PASSWORD)); partnerConfig.setUsername(this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USERNAME)); partnerConfig.setPassword(password + securityToken); partnerConfig.setAuthEndpoint(soapAuthEndPoint); new PartnerConnection(partnerConfig); String soapEndpoint = partnerConfig.getServiceEndpoint(); String restEndpoint = soapEndpoint.substring(0, soapEndpoint.indexOf("Soap/")) + "async/" + apiVersion; ConnectorConfig config = new ConnectorConfig(); config.setSessionId(partnerConfig.getSessionId()); config.setRestEndpoint(restEndpoint); config.setCompression(true); config.setTraceFile("traceLogs.txt"); config.setTraceMessage(false); config.setPrettyPrintXml(true); if (super.workUnitState.contains(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL) && !super.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL).isEmpty()) { config.setProxy(super.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL), super.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_CONN_USE_PROXY_PORT)); } this.bulkConnection = new BulkConnection(config); success = true; } catch (RuntimeException e) { throw new RuntimeException("Failed to connect to salesforce bulk api; error - " + e, e); } return success; } /** * Get Record set using salesforce specific API(Bulk API) * @param entity/tablename * @param predicateList of all predicate conditions * @return iterator with batch of records */ private List<BatchIdAndResultId> getQueryResultIds(String entity, List<Predicate> predicateList) throws Exception { if (!bulkApiLogin()) { throw new IllegalArgumentException("Invalid Login"); } try { boolean usingPkChunking = false; // Set bulk job attributes this.bulkJob.setObject(entity); this.bulkJob.setOperation(OperationEnum.query); this.bulkJob.setConcurrencyMode(ConcurrencyMode.Parallel); // use pk chunking if pk chunking is configured and the expected record count is larger than the pk chunking size if (this.pkChunking && getExpectedRecordCount() > this.pkChunkingSize) { log.info("Enabling pk chunking with size {}", this.pkChunkingSize); this.bulkConnection.addHeader("Sforce-Enable-PKChunking", "chunkSize=" + this.pkChunkingSize); usingPkChunking = true; } // Result type as CSV this.bulkJob.setContentType(ContentType.CSV); this.bulkJob = this.bulkConnection.createJob(this.bulkJob); this.bulkJob = this.bulkConnection.getJobStatus(this.bulkJob.getId()); // Construct query with the predicates String query = this.updatedQuery; if (!isNullPredicate(predicateList)) { String limitString = getLimitFromInputQuery(query); query = query.replace(limitString, ""); Iterator<Predicate> i = predicateList.listIterator(); while (i.hasNext()) { Predicate predicate = i.next(); query = SqlQueryUtils.addPredicate(query, predicate.getCondition()); } query = query + limitString; } log.info("QUERY:" + query); ByteArrayInputStream bout = new ByteArrayInputStream( query.getBytes(ConfigurationKeys.DEFAULT_CHARSET_ENCODING)); BatchInfo bulkBatchInfo = this.bulkConnection.createBatchFromStream(this.bulkJob, bout); long expectedSizePerBatch = usingPkChunking ? this.pkChunkingSize : this.getExpectedRecordCount(); int retryInterval = Math.min(MAX_RETRY_INTERVAL_SECS, 30 + (int) Math.ceil((float) expectedSizePerBatch / 10000) * 2); log.info("Salesforce bulk api retry interval in seconds:" + retryInterval); // Get batch info with complete resultset (info id - refers to the resultset id corresponding to entire resultset) bulkBatchInfo = this.bulkConnection.getBatchInfo(this.bulkJob.getId(), bulkBatchInfo.getId()); // wait for completion, failure, or formation of PK chunking batches while ((bulkBatchInfo.getState() != BatchStateEnum.Completed) && (bulkBatchInfo.getState() != BatchStateEnum.Failed) && (!usingPkChunking || bulkBatchInfo.getState() != BatchStateEnum.NotProcessed)) { Thread.sleep(retryInterval * 1000); bulkBatchInfo = this.bulkConnection.getBatchInfo(this.bulkJob.getId(), bulkBatchInfo.getId()); log.debug("Bulk Api Batch Info:" + bulkBatchInfo); log.info("Waiting for bulk resultSetIds"); } // Wait for pk chunking batches BatchInfoList batchInfoList = this.bulkConnection.getBatchInfoList(this.bulkJob.getId()); if (usingPkChunking && bulkBatchInfo.getState() == BatchStateEnum.NotProcessed) { bulkBatchInfo = waitForPkBatches(batchInfoList, retryInterval); } if (bulkBatchInfo.getState() == BatchStateEnum.Failed) { log.error("Bulk batch failed: " + bulkBatchInfo.toString()); throw new RuntimeException("Failed to get bulk batch info for jobId " + bulkBatchInfo.getJobId() + " error - " + bulkBatchInfo.getStateMessage()); } // Get resultset ids of all the batches from the batch info list List<BatchIdAndResultId> batchIdAndResultIdList = Lists.newArrayList(); for (BatchInfo bi : batchInfoList.getBatchInfo()) { QueryResultList list = this.bulkConnection.getQueryResultList(this.bulkJob.getId(), bi.getId()); for (String result : list.getResult()) { batchIdAndResultIdList.add(new BatchIdAndResultId(bi.getId(), result)); } } log.info("QueryResultList: " + batchIdAndResultIdList); return batchIdAndResultIdList; } catch (RuntimeException | AsyncApiException | InterruptedException e) { throw new RuntimeException( "Failed to get query result ids from salesforce using bulk api; error - " + e.getMessage(), e); } } /** * Get a buffered reader wrapping the query result stream for the result with the specified index * @param index index the {@link #bulkResultIdList} * @return a {@link BufferedReader} * @throws AsyncApiException */ private BufferedReader getBulkBufferedReader(int index) throws AsyncApiException { return new BufferedReader(new InputStreamReader( this.bulkConnection.getQueryResultStream(this.bulkJob.getId(), this.bulkResultIdList.get(index).getBatchId(), this.bulkResultIdList.get(index).getResultId()), ConfigurationKeys.DEFAULT_CHARSET_ENCODING)); } /** * Fetch records into a {@link RecordSetList} up to the configured batch size {@link #batchSize}. This batch is not * the entire Salesforce result batch. It is an internal batch in the extractor for buffering a subset of the result * stream that comes from a Salesforce batch for more efficient processing. * @param rs the record set to fetch into * @param initialRecordCount Initial record count to use. This should correspond to the number of records already in rs. * This is used to limit the number of records returned in rs to {@link #batchSize}. * @throws DataRecordException * @throws IOException */ private void fetchResultBatch(RecordSetList<JsonElement> rs, int initialRecordCount) throws DataRecordException, IOException { int recordCount = initialRecordCount; // Stream the resultset through CSV reader to identify columns in each record InputStreamCSVReader reader = new InputStreamCSVReader(this.bulkBufferedReader); // Get header if it is first run of a new resultset if (this.isNewBulkResultSet()) { this.bulkRecordHeader = reader.nextRecord(); this.bulkResultColumCount = this.bulkRecordHeader.size(); this.setNewBulkResultSet(false); } // Get record from CSV reader stream while ((this.csvRecord = reader.nextRecord()) != null) { // Convert CSV record to JsonObject JsonObject jsonObject = Utils.csvToJsonObject(this.bulkRecordHeader, this.csvRecord, this.bulkResultColumCount); rs.add(jsonObject); recordCount++; this.bulkRecordCount++; // Insert records in record set until it reaches the batch size if (recordCount >= batchSize) { log.info("Total number of records processed so far: " + this.bulkRecordCount); break; } } } /** * Reinitialize the state of {@link #bulkBufferedReader} to handle network disconnects * @throws IOException * @throws AsyncApiException */ private void reinitializeBufferedReader() throws IOException, AsyncApiException { // close reader and get a new input stream to reconnect to resolve intermittent network errors this.bulkBufferedReader.close(); this.bulkBufferedReader = getBulkBufferedReader(this.bulkResultIdCount - 1); // if the result set is partially processed then we need to skip over processed records if (!isNewBulkResultSet()) { List<String> lastCsvRecord = null; InputStreamCSVReader reader = new InputStreamCSVReader(this.bulkBufferedReader); // skip header reader.nextRecord(); int recordsToSkip = this.bulkRecordCount - this.prevBulkRecordCount; log.info("Skipping {} records on retry: ", recordsToSkip); for (int i = 0; i < recordsToSkip; i++) { lastCsvRecord = reader.nextRecord(); } // make sure the last record processed before the error was the last record skipped so that the next // unprocessed record is processed in the next call to fetchResultBatch() if (recordsToSkip > 0) { if (!this.csvRecord.equals(lastCsvRecord)) { throw new RuntimeException( "Repositioning after reconnecting did not point to the expected record"); } } } } /** * Fetch a result batch with retry for network errors * @param rs the {@link RecordSetList} to fetch into */ private void fetchResultBatchWithRetry(RecordSetList<JsonElement> rs) throws AsyncApiException, DataRecordException, IOException { boolean success = false; int retryCount = 0; int recordCountBeforeFetch = this.bulkRecordCount; do { try { // reinitialize the reader to establish a new connection to handle transient network errors if (retryCount > 0) { reinitializeBufferedReader(); } // on retries there may already be records in rs, so pass the number of records as the initial count fetchResultBatch(rs, this.bulkRecordCount - recordCountBeforeFetch); success = true; } catch (IOException e) { if (retryCount < this.fetchRetryLimit) { log.info("Exception while fetching data, retrying: " + e.getMessage(), e); retryCount++; } else { log.error("Exception while fetching data: " + e.getMessage(), e); throw e; } } } while (!success); } /** * Get data from the bulk api input stream * @return record set with each record as a JsonObject */ private RecordSet<JsonElement> getBulkData() throws DataRecordException { log.debug("Processing bulk api batch..."); RecordSetList<JsonElement> rs = new RecordSetList<>(); try { // if Buffer is empty then get stream for the new resultset id if (this.bulkBufferedReader == null || !this.bulkBufferedReader.ready()) { // log the number of records from each result set after it is processed (bulkResultIdCount > 0) if (this.bulkResultIdCount > 0) { log.info("Result set {} had {} records", this.bulkResultIdCount, this.bulkRecordCount - this.prevBulkRecordCount); } // if there is unprocessed resultset id then get result stream for that id if (this.bulkResultIdCount < this.bulkResultIdList.size()) { log.info("Stream resultset for resultId:" + this.bulkResultIdList.get(this.bulkResultIdCount)); this.setNewBulkResultSet(true); if (this.bulkBufferedReader != null) { this.bulkBufferedReader.close(); } this.bulkBufferedReader = getBulkBufferedReader(this.bulkResultIdCount); this.bulkResultIdCount++; this.prevBulkRecordCount = bulkRecordCount; } else { // if result stream processed for all resultset ids then finish the bulk job log.info("Bulk job is finished"); this.setBulkJobFinished(true); return rs; } } // fetch a batch of results with retry for network errors fetchResultBatchWithRetry(rs); } catch (Exception e) { throw new DataRecordException("Failed to get records from salesforce; error - " + e.getMessage(), e); } return rs; } @Override public void closeConnection() throws Exception { if (this.bulkConnection != null && !this.bulkConnection.getJobStatus(this.bulkJob.getId()).getState().toString().equals("Closed")) { log.info("Closing salesforce bulk job connection"); this.bulkConnection.closeJob(this.bulkJob.getId()); } } public static List<Command> constructGetCommand(String restQuery) { return Arrays.asList(new RestApiCommand().build(Arrays.asList(restQuery), RestApiCommandType.GET)); } /** * Waits for the PK batches to complete. The wait will stop after all batches are complete or on the first failed batch * @param batchInfoList list of batch info * @param retryInterval the polling interval * @return the last {@link BatchInfo} processed * @throws InterruptedException * @throws AsyncApiException */ private BatchInfo waitForPkBatches(BatchInfoList batchInfoList, int retryInterval) throws InterruptedException, AsyncApiException { BatchInfo batchInfo = null; BatchInfo[] batchInfos = batchInfoList.getBatchInfo(); // Wait for all batches other than the first one. The first one is not processed in PK chunking mode for (int i = 1; i < batchInfos.length; i++) { BatchInfo bi = batchInfos[i]; // get refreshed job status bi = this.bulkConnection.getBatchInfo(this.bulkJob.getId(), bi.getId()); while ((bi.getState() != BatchStateEnum.Completed) && (bi.getState() != BatchStateEnum.Failed)) { Thread.sleep(retryInterval * 1000); bi = this.bulkConnection.getBatchInfo(this.bulkJob.getId(), bi.getId()); log.debug("Bulk Api Batch Info:" + bi); log.info("Waiting for bulk resultSetIds"); } batchInfo = bi; // exit if there was a failure if (batchInfo.getState() == BatchStateEnum.Failed) { break; } } return batchInfo; } @Data private static class BatchIdAndResultId { private final String batchId; private final String resultId; } }