gobblin.source.extractor.extract.QueryBasedExtractor.java Source code

Introduction

Here is the source code for gobblin.source.extractor.extract.QueryBasedExtractor.java
Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.source.extractor.extract;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.MDC;

import com.google.common.annotations.VisibleForTesting;
import com.google.gson.Gson;
import com.google.gson.JsonObject;

import lombok.extern.slf4j.Slf4j;

import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.DataRecordException;
import gobblin.source.extractor.Extractor;
import gobblin.source.extractor.exception.ExtractPrepareException;
import gobblin.source.extractor.exception.HighWatermarkException;
import gobblin.source.extractor.exception.RecordCountException;
import gobblin.source.extractor.exception.SchemaException;
import gobblin.source.extractor.partition.Partition;
import gobblin.source.extractor.schema.ArrayDataType;
import gobblin.source.extractor.schema.DataType;
import gobblin.source.extractor.schema.EnumDataType;
import gobblin.source.extractor.schema.MapDataType;
import gobblin.source.extractor.utils.Utils;
import gobblin.source.extractor.watermark.Predicate;
import gobblin.source.extractor.watermark.WatermarkPredicate;
import gobblin.source.extractor.watermark.WatermarkType;
import gobblin.source.workunit.WorkUnit;

/**
 * An implementation of common extractor for query based sources.
 *
 * <p>This class drives the generic extraction flow: {@link #build()} prepares metadata, watermark
 * predicates and the expected record count, {@link #readRecord(Object)} hands out records one at a
 * time, and {@link #close()} records the final high watermark. The source/protocol specific
 * operations it calls (for example {@code getRecordSet}, {@code getSourceCount},
 * {@code getMaxWatermark}, {@code closeConnection}) are not defined in this file and are expected
 * to be supplied by subclasses.
 *
 * @param <S> type of schema
 * @param <D> type of data record
 */
@Slf4j
public abstract class QueryBasedExtractor<S, D> implements Extractor<S, D>, ProtocolSpecificLayer<S, D> {
    private static final Gson GSON = new Gson();
    /** Pattern used to convert a numeric watermark to a date and back. */
    private static final String WATERMARK_DATE_FORMAT = "yyyyMMddHHmmss";

    protected final WorkUnitState workUnitState;
    protected final WorkUnit workUnit;
    private final String entity;
    private final String schema;
    private final Partition partition;

    /** {@code false} once there is nothing (more) to read; checked at the top of readRecord. */
    private boolean fetchStatus = true;
    private S outputSchema;
    private long sourceRecordCount = 0;
    private long highWatermark;

    /** Current batch of records; {@code null} until the first pull has been made. */
    private Iterator<D> iterator;
    protected final List<String> columnList = new ArrayList<>();
    @VisibleForTesting
    protected final List<Predicate> predicateList = new ArrayList<>();

    private S getOutputSchema() {
        return this.outputSchema;
    }

    protected void setOutputSchema(S outputSchema) {
        this.outputSchema = outputSchema;
    }

    private long getSourceRecordCount() {
        return this.sourceRecordCount;
    }

    public boolean getFetchStatus() {
        return this.fetchStatus;
    }

    public void setFetchStatus(boolean fetchStatus) {
        this.fetchStatus = fetchStatus;
    }

    public void setHighWatermark(long highWatermark) {
        this.highWatermark = highWatermark;
    }

    private boolean isPullRequired() {
        return getFetchStatus();
    }

    /**
     * @return true while no record set has been fetched yet (the iterator is still unset)
     */
    protected boolean isInitialPull() {
        return this.iterator == null;
    }

    /**
     * Creates the extractor from the given work unit state: reads the schema and entity
     * properties, deserializes the partition from the work unit, and publishes the work unit
     * name in the logging MDC under the key {@code tableName}.
     *
     * @param workUnitState state of the work unit to extract
     */
    public QueryBasedExtractor(WorkUnitState workUnitState) {
        this.workUnitState = workUnitState;
        this.workUnit = this.workUnitState.getWorkunit();
        this.schema = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);
        this.entity = this.workUnitState.getProp(ConfigurationKeys.SOURCE_ENTITY);
        this.partition = Partition.deserialize(this.workUnit);
        MDC.put("tableName", getWorkUnitName());
    }

    /**
     * Builds a display name of the form {@code [schema_entity_suffix]}, where the suffix is the
     * part of the work unit id that follows its second-to-last underscore. The suffix is left
     * empty when the id has fewer than two usable underscores or is not available.
     *
     * @return work unit name used for logging
     */
    private String getWorkUnitName() {
        StringBuilder sb = new StringBuilder();
        sb.append("[");
        sb.append(StringUtils.stripToEmpty(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA)));
        sb.append("_");
        sb.append(StringUtils.stripToEmpty(this.workUnitState.getProp(ConfigurationKeys.SOURCE_ENTITY)));
        sb.append("_");
        String id = this.workUnitState.getId();
        // Guard against a missing id so that constructing the extractor does not fail on logging setup.
        if (id != null) {
            int seqIndex = id.lastIndexOf('_');
            if (seqIndex > 0) {
                int timeIndex = id.substring(0, seqIndex).lastIndexOf('_');
                if (timeIndex > 0) {
                    sb.append(id.substring(timeIndex + 1));
                }
            }
        }
        sb.append("]");
        return sb.toString();
    }

    /**
     * Returns the next record, fetching record sets from the source as needed.
     *
     * <p>On the first call the upper-bound predicates may be removed (see
     * {@link #shouldRemoveDataPullUpperBounds()}) before the first record set is requested. When
     * the last element of the current record set has been consumed, the next record set is
     * requested eagerly; a {@code null} record set marks the end of the data.
     *
     * @param reuse unused
     * @return the next record, or {@code null} when there is nothing (more) to read
     * @throws DataRecordException if fetching or iterating the records fails
     * @throws IOException declared by the interface; failures are reported as DataRecordException
     */
    @Override
    public D readRecord(@Deprecated D reuse) throws DataRecordException, IOException {
        if (!this.isPullRequired()) {
            log.info("No more records to read");
            return null;
        }

        D nextElement = null;

        try {
            if (isInitialPull()) {
                log.info("Initial pull");

                if (shouldRemoveDataPullUpperBounds()) {
                    this.removeDataPullUpperBounds();
                }
                this.iterator = this.getIterator();
                if (this.iterator == null) {
                    // Treat a missing first record set like a missing subsequent one: end of data,
                    // instead of failing with a NullPointerException below.
                    log.info("Initial pull returned no record set");
                    this.setFetchStatus(false);
                    return null;
                }
            }

            if (this.iterator.hasNext()) {
                nextElement = this.iterator.next();

                if (!this.iterator.hasNext()) {
                    log.debug("Getting next pull");
                    this.iterator = this.getIterator();
                    if (this.iterator == null) {
                        this.setFetchStatus(false);
                    }
                }
            }
        } catch (Exception e) {
            throw new DataRecordException("Failed to get records using rest api; error - " + e.getMessage(), e);
        }
        return nextElement;
    }

    /**
     * Check if it's appropriate to remove data pull upper bounds in the last work unit, fetching as much data as
     * possible from the source. Between the time the data query was created and the time it is executed, new data
     * may have been generated in the source; removing the upper bounds helps to grab that new data.
     *
     * Note: It's expected that there might be some duplicate data between runs because of removing the upper bounds
     *
     * @return should remove or not
     */
    private boolean shouldRemoveDataPullUpperBounds() {
        if (!this.workUnitState.getPropAsBoolean(ConfigurationKeys.SOURCE_QUERYBASED_ALLOW_REMOVE_UPPER_BOUNDS,
                true)) {
            return false;
        }

        // Only consider the last work unit
        if (!partition.isLastPartition()) {
            return false;
        }

        // Don't remove if user specifies one or is recorded in previous run
        if (partition.getHasUserSpecifiedHighWatermark() || this.workUnitState
                .getProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY) != null) {
            return false;
        }

        return true;
    }

    /**
     * Remove all upper bounds (predicates of type HWM) in the predicateList used for pulling data
     */
    private void removeDataPullUpperBounds() {
        log.info("Removing data pull upper bound for last work unit");
        Iterator<Predicate> it = predicateList.iterator();
        while (it.hasNext()) {
            Predicate predicate = it.next();
            if (predicate.getType() == Predicate.PredicateType.HWM) {
                log.info("Remove predicate: {}", predicate.condition);
                it.remove();
            }
        }
    }

    /**
     * Get iterator from the source specific api if the
     * {@code SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE} property is true, otherwise from the
     * protocol specific api.
     *
     * @return iterator over the next record set; may be {@code null} (handled by readRecord)
     */
    private Iterator<D> getIterator() throws DataRecordException, IOException {
        if (Boolean.parseBoolean(
                this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE))) {
            return this.getRecordSetFromSourceApi(this.schema, this.entity, this.workUnit, this.predicateList);
        }
        return this.getRecordSet(this.schema, this.entity, this.workUnit, this.predicateList);
    }

    /**
     * get source record count as determined by {@link #build()}
     * @return record count; -1 when the count calculation was skipped
     */
    @Override
    public long getExpectedRecordCount() {
        return this.getSourceRecordCount();
    }

    /**
     * get schema(Metadata) corresponding to the data records
     * @return schema
     */
    @Override
    public S getSchema() {
        return this.getOutputSchema();
    }

    /**
     * get high watermark of the current pull
     * @return high watermark
     */
    @Override
    public long getHighWatermark() {
        return this.highWatermark;
    }

    /**
     * Stores the current high watermark in the work unit state and closes the connection.
     * A failure to close the connection is logged and not propagated.
     */
    @Override
    public void close() {
        log.info("Updating the current state high water mark with {}", this.highWatermark);
        this.workUnitState.setActualHighWatermark(new LongWatermark(this.highWatermark));
        try {
            this.closeConnection();
        } catch (Exception e) {
            log.error("Failed to close the extractor", e);
        }
    }

    /**
     * @return full dump or not
     */
    public boolean isFullDump() {
        return Boolean.parseBoolean(this.workUnitState.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY));
    }

    /**
     * build schema, record count and high water mark
     *
     * <p>Steps: set the connection timeout, extract metadata, and - if a watermark column is
     * configured - determine the high watermark and register the range predicates. Then obtain
     * the source record count (or -1 if the count calculation is skipped). A record count of 0
     * turns the fetch status off so that {@link #readRecord(Object)} returns immediately.
     *
     * @return this extractor
     * @throws ExtractPrepareException if any of the preparation steps fails
     */
    public Extractor<S, D> build() throws ExtractPrepareException {
        String watermarkColumn = this.workUnitState.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
        long lwm = partition.getLowWatermark();
        long hwm = partition.getHighWatermark();
        log.info("Low water mark: {}; and High water mark: {}", lwm, hwm);

        String watermarkTypeName = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE);
        WatermarkType watermarkType;
        if (StringUtils.isBlank(watermarkTypeName)) {
            watermarkType = null;
        } else {
            // Locale.ROOT keeps the enum lookup independent of the JVM default locale.
            watermarkType = WatermarkType.valueOf(watermarkTypeName.toUpperCase(Locale.ROOT));
        }

        log.info("Source Entity is {}", this.entity);
        try {
            this.setTimeOut(this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_CONN_TIMEOUT,
                    ConfigurationKeys.DEFAULT_CONN_TIMEOUT));
            this.extractMetadata(this.schema, this.entity, this.workUnit);

            if (StringUtils.isNotBlank(watermarkColumn)) {
                if (partition.isLastPartition()) {
                    // Get a more accurate high watermark from the source
                    long adjustedHighWatermark = this.getLatestWatermark(watermarkColumn, watermarkType, lwm, hwm);
                    log.info("High water mark from source: {}", adjustedHighWatermark);
                    // If the source reports a finer high watermark, then consider the same as runtime high watermark.
                    // Else, consider the low watermark as high water mark(with no delta).i.e, don't move the pointer
                    if (adjustedHighWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
                        adjustedHighWatermark = getLowWatermarkWithNoDelta(lwm);
                    }
                    this.highWatermark = adjustedHighWatermark;
                } else {
                    this.highWatermark = hwm;
                }

                log.info("High water mark for the current run: {}", this.highWatermark);
                this.setRangePredicates(watermarkColumn, watermarkType, lwm, this.highWatermark);
            }

            // if it is set to true, skip count calculation and set source count to -1
            if (!Boolean.parseBoolean(
                    this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SKIP_COUNT_CALC))) {
                this.sourceRecordCount = this.getSourceCount(this.schema, this.entity, this.workUnit,
                        this.predicateList);
            } else {
                log.info("Skip count calculation");
                this.sourceRecordCount = -1;
            }

            if (this.sourceRecordCount == 0) {
                log.info("Record count is 0; Setting fetch status to false to skip readRecord()");
                this.setFetchStatus(false);
            }
        } catch (SchemaException e) {
            throw new ExtractPrepareException("Failed to get schema for this object; error - " + e.getMessage(), e);
        } catch (HighWatermarkException e) {
            throw new ExtractPrepareException("Failed to get high watermark; error - " + e.getMessage(), e);
        } catch (RecordCountException e) {
            throw new ExtractPrepareException("Failed to get record count; error - " + e.getMessage(), e);
        } catch (Exception e) {
            throw new ExtractPrepareException("Failed to prepare the extract build; error - " + e.getMessage(), e);
        }
        return this;
    }

    /**
     * Moves the low watermark back by the delta that {@link WatermarkPredicate} reports for the
     * configured watermark type (defaulting to TIMESTAMP). For SIMPLE watermarks the delta is
     * subtracted numerically; for every other type the watermark is interpreted as a
     * {@code yyyyMMddHHmmss} date and the delta is subtracted in seconds.
     *
     * @param lwm low watermark
     * @return the adjusted low watermark, or the default watermark value if {@code lwm} is the default
     */
    private long getLowWatermarkWithNoDelta(long lwm) {
        if (lwm == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
            return ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
        }

        String watermarkType = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE,
                "TIMESTAMP");
        // Locale.ROOT keeps the enum lookup independent of the JVM default locale.
        WatermarkType wmType = WatermarkType.valueOf(watermarkType.toUpperCase(Locale.ROOT));
        int deltaNum = new WatermarkPredicate(wmType).getDeltaNumForNextWatermark();

        if (wmType == WatermarkType.SIMPLE) {
            return lwm - deltaNum;
        }

        Date lowWaterMarkDate = Utils.toDate(lwm, WATERMARK_DATE_FORMAT);
        return Long.parseLong(
                Utils.dateToString(Utils.addSecondsToDate(lowWaterMarkDate, -deltaNum), WATERMARK_DATE_FORMAT));
    }

    /**
     * Get the latest watermark from the source within the given range, unless the
     * {@code SOURCE_QUERYBASED_SKIP_HIGH_WATERMARK_CALC} property is true, in which case the
     * given high watermark is returned unchanged.
     *
     * @param watermarkColumn name of the column used as watermark
     * @param watermarkType watermark type
     * @param lwmValue low watermark value
     * @param hwmValue high watermark value
     * @return latest watermark
     * @throws HighWatermarkException if the watermark cannot be obtained from the source
     * @throws IOException if communication with the source fails
     */
    private long getLatestWatermark(String watermarkColumn, WatermarkType watermarkType, long lwmValue,
            long hwmValue) throws HighWatermarkException, IOException {

        if (Boolean.parseBoolean(
                this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SKIP_HIGH_WATERMARK_CALC))) {
            return hwmValue;
        }

        log.info("Getting high watermark");
        List<Predicate> list = new ArrayList<>();
        WatermarkPredicate watermark = new WatermarkPredicate(watermarkColumn, watermarkType);
        String lwmOperator = partition.isLowWatermarkInclusive() ? ">=" : ">";
        String hwmOperator = (partition.isLastPartition() || partition.isHighWatermarkInclusive()) ? "<=" : "<";

        Predicate lwmPredicate = watermark.getPredicate(this, lwmValue, lwmOperator,
                Predicate.PredicateType.LWM);
        Predicate hwmPredicate = watermark.getPredicate(this, hwmValue, hwmOperator,
                Predicate.PredicateType.HWM);
        if (lwmPredicate != null) {
            list.add(lwmPredicate);
        }
        if (hwmPredicate != null) {
            list.add(hwmPredicate);
        }

        return this.getMaxWatermark(this.schema, this.entity, watermarkColumn, list,
                watermark.getWatermarkSourceFormat(this));
    }

    /**
     * range predicates for watermark column and, for hourly extracts, the hour column.
     *
     * @param watermarkColumn name of the column used as watermark
     * @param watermarkType watermark type
     * @param lwmValue estimated low watermark value
     * @param hwmValue estimated high watermark value
     */
    private void setRangePredicates(String watermarkColumn, WatermarkType watermarkType, long lwmValue,
            long hwmValue) {
        log.debug("Getting range predicates");
        String lwmOperator = partition.isLowWatermarkInclusive() ? ">=" : ">";
        String hwmOperator = (partition.isLastPartition() || partition.isHighWatermarkInclusive()) ? "<=" : "<";

        WatermarkPredicate watermark = new WatermarkPredicate(watermarkColumn, watermarkType);
        this.addPredicates(watermark.getPredicate(this, lwmValue, lwmOperator, Predicate.PredicateType.LWM));
        this.addPredicates(watermark.getPredicate(this, hwmValue, hwmOperator, Predicate.PredicateType.HWM));

        if (Boolean.parseBoolean(
                this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_HOURLY_EXTRACT))) {
            String hourColumn = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_HOUR_COLUMN);
            if (StringUtils.isNotBlank(hourColumn)) {
                WatermarkPredicate hourlyWatermark = new WatermarkPredicate(hourColumn, WatermarkType.HOUR);
                this.addPredicates(
                        hourlyWatermark.getPredicate(this, lwmValue, lwmOperator, Predicate.PredicateType.LWM));
                this.addPredicates(
                        hourlyWatermark.getPredicate(this, hwmValue, hwmOperator, Predicate.PredicateType.HWM));
            }
        }
    }

    /**
     * add predicate to the predicate list; {@code null} predicates are ignored
     *
     * @param predicate predicate to add, may be {@code null}
     */
    private void addPredicates(Predicate predicate) {
        if (predicate != null) {
            this.predicateList.add(predicate);
        }
    }

    /**
     * Case-insensitive check whether a column is one of the watermark columns. Entries of the
     * comma separated list are compared as-is (they are not trimmed).
     *
     * @param watermarkColumn comma separated list of watermark columns
     * @param columnName column name to search for
     * @return true, if column name is part of water mark columns. otherwise, return false
     */
    protected boolean isWatermarkColumn(String watermarkColumn, String columnName) {
        if (columnName == null || StringUtils.isBlank(watermarkColumn)) {
            return false;
        }

        List<String> waterMarkColumnList = Arrays.asList(watermarkColumn.toLowerCase(Locale.ROOT).split(","));
        return waterMarkColumnList.contains(columnName.toLowerCase(Locale.ROOT));
    }

    /**
     * @param watermarkColumn comma separated list of watermark columns
     * @return true, if there are multiple water mark columns. otherwise, return false
     */
    protected boolean hasMultipleWatermarkColumns(String watermarkColumn) {
        if (StringUtils.isBlank(watermarkColumn)) {
            return false;
        }

        return watermarkColumn.split(",").length > 1;
    }

    /**
     * Case-insensitive lookup of a column in the primary key columns. Entries of the comma
     * separated list are compared as-is (they are not trimmed).
     *
     * @param primarykeyColumn comma separated list of primary key columns
     * @param columnName column name to search for
     * @return 1-based index of the column if it exists in the given list of primary key columns.
     *         otherwise, return 0
     */
    protected int getPrimarykeyIndex(String primarykeyColumn, String columnName) {
        if (columnName == null || StringUtils.isBlank(primarykeyColumn)) {
            return 0;
        }

        List<String> primarykeyColumnList = Arrays.asList(primarykeyColumn.toLowerCase(Locale.ROOT).split(","));
        // indexOf yields -1 when absent, which maps to the documented 0.
        return primarykeyColumnList.indexOf(columnName.toLowerCase(Locale.ROOT)) + 1;
    }

    /**
     * Checks whether a column is part of the metadata columns. When the metadata column check is
     * disabled by configuration every column is accepted. The column name is trimmed and
     * lower-cased before the lookup; the list entries are used as given.
     *
     * @param columnName column name to search for
     * @param columnList list of metadata columns
     * @return true if column is part of metadata columns (or the check is disabled). otherwise, return false.
     */
    protected boolean isMetadataColumn(String columnName, List<String> columnList) {
        boolean isColumnCheckEnabled = Boolean.parseBoolean(
                this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_METADATA_COLUMN_CHECK_ENABLED,
                        ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_IS_METADATA_COLUMN_CHECK_ENABLED));

        if (!isColumnCheckEnabled) {
            return true;
        }

        // A missing name or list cannot match; report "not a metadata column" instead of failing.
        if (columnName == null || columnList == null) {
            return false;
        }

        // The default locale is kept here because the list is populated outside this method.
        return columnList.contains(columnName.trim().toLowerCase());
    }

    /**
     * Maps a source data type to the corresponding target data type (via the data type map,
     * falling back to {@code string} for unknown types) and returns it as JSON.
     *
     * @param columnName column name
     * @param type source data type
     * @param elementType data type of elements (used for map and array types)
     * @param enumSymbols symbols (used for enum types)
     * @return converted data type
     */
    protected JsonObject convertDataType(String columnName, String type, String elementType,
            List<String> enumSymbols) {
        String dataType = this.getDataTypeMap().get(type);
        if (dataType == null) {
            dataType = "string";
        }

        DataType convertedDataType;
        switch (dataType) {
        case "map":
            convertedDataType = new MapDataType(dataType, elementType);
            break;
        case "array":
            convertedDataType = new ArrayDataType(dataType, elementType);
            break;
        case "enum":
            convertedDataType = new EnumDataType(dataType, columnName, enumSymbols);
            break;
        default:
            convertedDataType = new DataType(dataType);
            break;
        }

        return GSON.fromJson(GSON.toJson(convertedDataType), JsonObject.class);
    }

    /**
     * @param predicateList predicate list
     * @return true, if there are any predicates. otherwise, return false.
     */
    protected boolean isPredicateExists(List<Predicate> predicateList) {
        return predicateList != null && !predicateList.isEmpty();
    }
}