Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.hadoop.api;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

import org.apache.carbondata.common.logging.LogServiceFactory;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.constants.CarbonCommonConstantsInternal;
import org.apache.carbondata.core.datamap.DataMapChooser;
import org.apache.carbondata.core.datamap.DataMapFilter;
import org.apache.carbondata.core.datamap.DataMapJob;
import org.apache.carbondata.core.datamap.DataMapStoreManager;
import org.apache.carbondata.core.datamap.DataMapUtil;
import org.apache.carbondata.core.datamap.Segment;
import org.apache.carbondata.core.datamap.TableDataMap;
import org.apache.carbondata.core.datamap.dev.expr.DataMapExprWrapper;
import org.apache.carbondata.core.datamap.dev.expr.DataMapWrapperSimpleInfo;
import org.apache.carbondata.core.exception.InvalidConfigurationException;
import org.apache.carbondata.core.indexstore.ExtendedBlocklet;
import org.apache.carbondata.core.indexstore.PartitionSpec;
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
import org.apache.carbondata.core.metadata.schema.PartitionInfo;
import org.apache.carbondata.core.metadata.schema.partition.PartitionType;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.core.metadata.schema.table.TableInfo;
import org.apache.carbondata.core.metadata.schema.table.column.ColumnSchema;
import org.apache.carbondata.core.profiler.ExplainCollector;
import org.apache.carbondata.core.readcommitter.ReadCommittedScope;
import org.apache.carbondata.core.scan.expression.Expression;
import org.apache.carbondata.core.scan.filter.FilterUtil;
import org.apache.carbondata.core.scan.model.QueryModel;
import org.apache.carbondata.core.scan.model.QueryModelBuilder;
import org.apache.carbondata.core.stats.QueryStatistic;
import org.apache.carbondata.core.stats.QueryStatisticsConstants;
import org.apache.carbondata.core.stats.QueryStatisticsRecorder;
import org.apache.carbondata.core.util.BlockletDataMapUtil;
import org.apache.carbondata.core.util.CarbonProperties;
import org.apache.carbondata.core.util.CarbonTimeStatisticsFactory;
import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.core.util.DataTypeConverter;
import org.apache.carbondata.core.util.DataTypeConverterImpl;
import org.apache.carbondata.core.util.ObjectSerializationUtil;
import org.apache.carbondata.core.util.path.CarbonTablePath;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.hadoop.CarbonMultiBlockSplit;
import org.apache.carbondata.hadoop.CarbonProjection;
import org.apache.carbondata.hadoop.CarbonRecordReader;
import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport;
import org.apache.carbondata.hadoop.readsupport.impl.DictionaryDecodeReadSupport;

import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.log4j.Logger;

/**
 * Base class for carbondata input format, there are two input format implementations:
 * 1. CarbonFileInputFormat: for reading carbondata files without table level metadata support.
 *
 * 2. CarbonTableInputFormat: for reading carbondata files with table level metadata support,
 * such as segment and explicit schema metadata.
 *
 * @param <T>
 */
public abstract class CarbonInputFormat<T> extends FileInputFormat<Void, T> {

  // comma separated list of input segment numbers
  public static final String INPUT_SEGMENT_NUMBERS =
      "mapreduce.input.carboninputformat.segmentnumbers";
  private static final String VALIDATE_INPUT_SEGMENT_IDs =
      "mapreduce.input.carboninputformat.validsegments";
  // comma separated list of input files
  private static final String ALTER_PARTITION_ID =
      "mapreduce.input.carboninputformat.partitionid";
  private static final Logger LOG =
      LogServiceFactory.getLogService(CarbonInputFormat.class.getName());
  private static final String FILTER_PREDICATE =
      "mapreduce.input.carboninputformat.filter.predicate";
  private static final String COLUMN_PROJECTION = "mapreduce.input.carboninputformat.projection";
  private static final String TABLE_INFO = "mapreduce.input.carboninputformat.tableinfo";
  private static final String CARBON_TRANSACTIONAL_TABLE =
      "mapreduce.input.carboninputformat.transactional";
  private static final String CARBON_READ_SUPPORT =
      "mapreduce.input.carboninputformat.readsupport";
  private static final String CARBON_CONVERTER = "mapreduce.input.carboninputformat.converter";
  public static final String DATABASE_NAME = "mapreduce.input.carboninputformat.databaseName";
  public static final String TABLE_NAME = "mapreduce.input.carboninputformat.tableName";
  private static final String PARTITIONS_TO_PRUNE =
      "mapreduce.input.carboninputformat.partitions.to.prune";
  private static final String FGDATAMAP_PRUNING = "mapreduce.input.carboninputformat.fgdatamap";
  private static final String READ_COMMITTED_SCOPE =
      "mapreduce.input.carboninputformat.read.committed.scope";

  // record segment number and hit blocks
  protected int numSegments = 0;
  protected int numStreamSegments = 0;
  protected int numStreamFiles = 0;
  protected int hitedStreamFiles = 0;
  protected int numBlocks = 0;

  public int getNumSegments() {
    return numSegments;
  }

  public int getNumStreamSegments() {
    return numStreamSegments;
  }

  public int getNumStreamFiles() {
    return numStreamFiles;
  }

  public int getHitedStreamFiles() {
    return hitedStreamFiles;
  }

  public int getNumBlocks() {
    return numBlocks;
  }

  /**
   * Set the `tableInfo` in `configuration`
   */
  public static void setTableInfo(Configuration configuration, TableInfo tableInfo)
      throws IOException {
    if (null != tableInfo) {
      configuration.set(TABLE_INFO, CarbonUtil.encodeToString(tableInfo.serialize()));
    }
  }

  /**
   * Get TableInfo object from `configuration`
   */
  protected static TableInfo getTableInfo(Configuration configuration) throws IOException {
    String tableInfoStr = configuration.get(TABLE_INFO);
    if (tableInfoStr == null) {
      return null;
    } else {
      TableInfo output = new TableInfo();
      output.readFields(new DataInputStream(
          new ByteArrayInputStream(CarbonUtil.decodeStringToBytes(tableInfoStr))));
      return output;
    }
  }

  /**
   * Get the cached CarbonTable or create it by TableInfo in `configuration`
   */
  public abstract CarbonTable getOrCreateCarbonTable(Configuration configuration)
      throws IOException;

  public static void setTablePath(Configuration configuration, String tablePath) {
    configuration.set(FileInputFormat.INPUT_DIR, tablePath);
  }

  public static void setTransactionalTable(Configuration configuration,
      boolean isTransactionalTable) {
    configuration.set(CARBON_TRANSACTIONAL_TABLE, String.valueOf(isTransactionalTable));
  }

  public static void setPartitionIdList(Configuration configuration, List<String> partitionIds) {
    configuration.set(ALTER_PARTITION_ID, partitionIds.toString());
  }

  /**
   * It sets unresolved filter expression.
   *
   * @param configuration
   * @param filterExpression
   */
  public static void setFilterPredicates(Configuration configuration,
      Expression filterExpression) {
    if (filterExpression == null) {
      return;
    }
    try {
      String filterString = ObjectSerializationUtil.convertObjectToString(filterExpression);
      configuration.set(FILTER_PREDICATE, filterString);
    } catch (Exception e) {
      throw new RuntimeException("Error while setting filter expression to Job", e);
    }
  }

  /**
   * Set the column projection column names
   *
   * @param configuration     Configuration info
   * @param projectionColumns projection columns name
   */
  public static void setColumnProjection(Configuration configuration, String[] projectionColumns) {
    Objects.requireNonNull(projectionColumns);
    if (projectionColumns.length < 1) {
      throw new RuntimeException("Projection can't be empty");
    }
    StringBuilder builder = new StringBuilder();
    for (String column : projectionColumns) {
      builder.append(column).append(",");
    }
    String columnString = builder.toString();
    columnString = columnString.substring(0, columnString.length() - 1);
    configuration.set(COLUMN_PROJECTION, columnString);
  }

  /**
   * Set the column projection column names from CarbonProjection
   *
   * @param configuration Configuration info
   * @param projection    CarbonProjection object that includes unique projection column name
   */
  public static void setColumnProjection(Configuration configuration,
      CarbonProjection projection) {
    if (projection == null || projection.isEmpty()) {
      return;
    }
    String[] allColumns = projection.getAllColumns();
    StringBuilder builder = new StringBuilder();
    for (String column : allColumns) {
      builder.append(column).append(",");
    }
    String columnString = builder.toString();
    columnString = columnString.substring(0, columnString.length() - 1);
    configuration.set(COLUMN_PROJECTION, columnString);
  }

  public static String getColumnProjection(Configuration configuration) {
    return configuration.get(COLUMN_PROJECTION);
  }

  public static void setFgDataMapPruning(Configuration configuration, boolean enable) {
    configuration.set(FGDATAMAP_PRUNING, String.valueOf(enable));
  }
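  // Illustrative usage sketch (added commentary, not part of the original class): how a client
  // job might use the static setters above to describe what it wants to read. The path, column
  // names and filter variable below are hypothetical examples, not values taken from this file.
  //
  //   Configuration conf = job.getConfiguration();
  //   CarbonInputFormat.setTablePath(conf, "/data/warehouse/sales");              // hypothetical path
  //   CarbonInputFormat.setTransactionalTable(conf, true);
  //   CarbonInputFormat.setColumnProjection(conf, new String[] {"id", "amount"}); // hypothetical columns
  //   CarbonInputFormat.setFilterPredicates(conf, filterExpression);              // an Expression built by the caller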
  public static boolean isFgDataMapPruningEnable(Configuration configuration) {
    String enable = configuration.get(FGDATAMAP_PRUNING);
    // if FGDATAMAP_PRUNING is not set, by default we will use FGDataMap
    return (enable == null) || enable.equalsIgnoreCase("true");
  }

  /**
   * Set list of segments to access
   */
  public static void setSegmentsToAccess(Configuration configuration,
      List<Segment> validSegments) {
    configuration.set(INPUT_SEGMENT_NUMBERS, CarbonUtil.convertToString(validSegments));
  }

  /**
   * Set `CARBON_INPUT_SEGMENTS` from property to configuration
   */
  public static void setQuerySegment(Configuration conf, AbsoluteTableIdentifier identifier) {
    String dbName = identifier.getCarbonTableIdentifier().getDatabaseName().toLowerCase();
    String tbName = identifier.getCarbonTableIdentifier().getTableName().toLowerCase();
    getQuerySegmentToAccess(conf, dbName, tbName);
  }

  /**
   * Set `CARBON_INPUT_SEGMENTS` from property to configuration
   */
  public static void setQuerySegment(Configuration conf, String segmentList) {
    if (!segmentList.trim().equals("*")) {
      CarbonInputFormat
          .setSegmentsToAccess(conf, Segment.toSegmentList(segmentList.split(","), null));
    }
  }

  /**
   * set list of segment to access
   */
  public static void setValidateSegmentsToAccess(Configuration configuration, Boolean validate) {
    configuration.set(CarbonInputFormat.VALIDATE_INPUT_SEGMENT_IDs, validate.toString());
  }

  /**
   * get list of segment to access
   */
  public static boolean getValidateSegmentsToAccess(Configuration configuration) {
    return configuration.get(CarbonInputFormat.VALIDATE_INPUT_SEGMENT_IDs, "true")
        .equalsIgnoreCase("true");
  }

  /**
   * set list of partitions to prune
   */
  public static void setPartitionsToPrune(Configuration configuration,
      List<PartitionSpec> partitions) {
    if (partitions == null) {
      return;
    }
    try {
      String partitionString =
          ObjectSerializationUtil.convertObjectToString(new ArrayList<>(partitions));
      configuration.set(PARTITIONS_TO_PRUNE, partitionString);
    } catch (Exception e) {
      throw new RuntimeException(
          "Error while setting partition information to Job" + partitions, e);
    }
  }

  /**
   * get list of partitions to prune
   */
  public static List<PartitionSpec> getPartitionsToPrune(Configuration configuration)
      throws IOException {
    String partitionString = configuration.get(PARTITIONS_TO_PRUNE);
    if (partitionString != null) {
      return (List<PartitionSpec>) ObjectSerializationUtil.convertStringToObject(partitionString);
    }
    return null;
  }

  public AbsoluteTableIdentifier getAbsoluteTableIdentifier(Configuration configuration)
      throws IOException {
    String tablePath = configuration.get(INPUT_DIR, "");
    try {
      return AbsoluteTableIdentifier
          .from(tablePath, getDatabaseName(configuration), getTableName(configuration));
    } catch (InvalidConfigurationException e) {
      throw new IOException(e);
    }
  }

  public static void setReadCommittedScope(Configuration configuration,
      ReadCommittedScope committedScope) {
    if (committedScope == null) {
      return;
    }
    try {
      String subFoldersString = ObjectSerializationUtil.convertObjectToString(committedScope);
      configuration.set(READ_COMMITTED_SCOPE, subFoldersString);
    } catch (Exception e) {
      throw new RuntimeException(
          "Error while setting committedScope information to Job" + committedScope, e);
    }
  }

  public static ReadCommittedScope getReadCommittedScope(Configuration configuration)
      throws IOException {
    String subFoldersString = configuration.get(READ_COMMITTED_SCOPE);
    if (subFoldersString != null) {
      return (ReadCommittedScope) ObjectSerializationUtil.convertStringToObject(subFoldersString);
    }
    return null;
  }
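  // Illustrative usage sketch (added commentary, not part of the original class): restricting a
  // read to specific segments via the setters above. The segment ids are hypothetical.
  //
  //   CarbonInputFormat.setQuerySegment(conf, "1,2,5");           // read only segments 1, 2 and 5
  //   CarbonInputFormat.setValidateSegmentsToAccess(conf, false); // skip validation of those ids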
  /**
   * {@inheritDoc}
   * Configurations FileInputFormat.INPUT_DIR
   * are used to get table path to read.
   *
   * @param job
   * @return List<InputSplit> list of CarbonInputSplit
   * @throws IOException
   */
  @Override
  public abstract List<InputSplit> getSplits(JobContext job) throws IOException;

  protected Expression getFilterPredicates(Configuration configuration) {
    try {
      String filterExprString = configuration.get(FILTER_PREDICATE);
      if (filterExprString == null) {
        return null;
      }
      Object filter = ObjectSerializationUtil.convertStringToObject(filterExprString);
      return (Expression) filter;
    } catch (IOException e) {
      throw new RuntimeException("Error while reading filter expression", e);
    }
  }

  /**
   * get data blocks of given segment
   */
  protected List<CarbonInputSplit> getDataBlocksOfSegment(JobContext job, CarbonTable carbonTable,
      Expression expression, BitSet matchedPartitions, List<Segment> segmentIds,
      PartitionInfo partitionInfo, List<Integer> oldPartitionIdList) throws IOException {

    QueryStatisticsRecorder recorder = CarbonTimeStatisticsFactory.createDriverRecorder();
    QueryStatistic statistic = new QueryStatistic();

    // get tokens for all the required FileSystem for table path
    TokenCache.obtainTokensForNamenodes(job.getCredentials(),
        new Path[] { new Path(carbonTable.getTablePath()) }, job.getConfiguration());
    List<ExtendedBlocklet> prunedBlocklets =
        getPrunedBlocklets(job, carbonTable, expression, segmentIds);

    List<CarbonInputSplit> resultFilteredBlocks = new ArrayList<>();
    int partitionIndex = 0;
    List<Integer> partitionIdList = new ArrayList<>();
    if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
      partitionIdList = partitionInfo.getPartitionIds();
    }
    for (ExtendedBlocklet blocklet : prunedBlocklets) {
      // oldPartitionIdList is only used in the alter table partition command, because it changes
      // the partition info first and then reads data.
      // Other normal queries should use the newest partitionIdList.
      if (partitionInfo != null && partitionInfo.getPartitionType() != PartitionType.NATIVE_HIVE) {
        long partitionId = CarbonTablePath.DataFileUtil
            .getTaskIdFromTaskNo(CarbonTablePath.DataFileUtil.getTaskNo(blocklet.getPath()));
        if (oldPartitionIdList != null) {
          partitionIndex = oldPartitionIdList.indexOf((int) partitionId);
        } else {
          partitionIndex = partitionIdList.indexOf((int) partitionId);
        }
      }
      if (partitionIndex != -1) {
        // matchedPartitions variable will be null in two cases as follows
        // 1. the table is not a partition table
        // 2. the table is a partition table, and all partitions are matched by query
        // for a partition table, the task id of the carbondata file name is the partition id.
        // if this partition is not required, it will be skipped here.
        if (matchedPartitions == null || matchedPartitions.get(partitionIndex)) {
          resultFilteredBlocks.add(blocklet.getInputSplit());
        }
      }
    }
    statistic
        .addStatistics(QueryStatisticsConstants.LOAD_BLOCKS_DRIVER, System.currentTimeMillis());
    recorder.recordStatisticsForDriver(statistic, job.getConfiguration().get("query.id"));
    return resultFilteredBlocks;
  }

  /**
   * for explain command
   * get number of block by counting distinct file path of blocklets
   */
  private int getBlockCount(List<ExtendedBlocklet> blocklets) {
    Set<String> filepaths = new HashSet<>();
    for (ExtendedBlocklet blocklet : blocklets) {
      filepaths.add(blocklet.getPath());
    }
    return filepaths.size();
  }
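  // Illustrative sketch (hypothetical subclass code, not taken from this file): a concrete
  // getSplits(JobContext) implementation typically resolves the valid segments for the table and
  // then delegates block-level pruning to getDataBlocksOfSegment(...) above, roughly like:
  //
  //   List<CarbonInputSplit> blocks = getDataBlocksOfSegment(job, carbonTable, filterExpression,
  //       null /* matchedPartitions */, validSegments, null /* partitionInfo */,
  //       null /* oldPartitionIdList */);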
  /**
   * Prune the blocklets using the filter expression with available datamaps.
   * First pruned with default blocklet datamap, then pruned with CG and FG datamaps
   */
  private List<ExtendedBlocklet> getPrunedBlocklets(JobContext job, CarbonTable carbonTable,
      Expression expression, List<Segment> segmentIds) throws IOException {
    ExplainCollector.addPruningInfo(carbonTable.getTableName());
    final DataMapFilter filter = new DataMapFilter(carbonTable, expression);
    ExplainCollector.setFilterStatement(expression == null ? "none" : expression.getStatement());

    boolean distributedCG = Boolean.parseBoolean(
        CarbonProperties.getInstance().getProperty(CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP,
            CarbonCommonConstants.USE_DISTRIBUTED_DATAMAP_DEFAULT));
    DataMapJob dataMapJob = DataMapUtil.getDataMapJob(job.getConfiguration());
    List<PartitionSpec> partitionsToPrune = getPartitionsToPrune(job.getConfiguration());

    // First prune using default datamap on driver side.
    TableDataMap defaultDataMap = DataMapStoreManager.getInstance().getDefaultDataMap(carbonTable);
    List<ExtendedBlocklet> prunedBlocklets = null;
    // This is to log the event, so user will know what is happening by seeing logs.
    LOG.info("Started block pruning ...");
    prunedBlocklets = defaultDataMap.prune(segmentIds, filter, partitionsToPrune);

    if (ExplainCollector.enabled()) {
      ExplainCollector.setDefaultDataMapPruningBlockHit(getBlockCount(prunedBlocklets));
    }
    if (prunedBlocklets.size() == 0) {
      return prunedBlocklets;
    }

    DataMapChooser chooser = new DataMapChooser(getOrCreateCarbonTable(job.getConfiguration()));

    // Get the available CG datamaps and prune further.
    DataMapExprWrapper cgDataMapExprWrapper = chooser.chooseCGDataMap(filter.getResolver());
    if (cgDataMapExprWrapper != null) {
      // Prune segments from already pruned blocklets
      pruneSegments(segmentIds, prunedBlocklets);
      List<ExtendedBlocklet> cgPrunedBlocklets;
      // Again prune with CG datamap.
      if (distributedCG && dataMapJob != null) {
        cgPrunedBlocklets = DataMapUtil.executeDataMapJob(carbonTable, filter.getResolver(),
            segmentIds, cgDataMapExprWrapper, dataMapJob, partitionsToPrune);
      } else {
        cgPrunedBlocklets = cgDataMapExprWrapper.prune(segmentIds, partitionsToPrune);
      }
      // since the index datamap prunes at segment scope,
      // the result needs to be intersected with the previously pruned result
      prunedBlocklets = intersectFilteredBlocklets(carbonTable, prunedBlocklets, cgPrunedBlocklets);
      ExplainCollector.recordCGDataMapPruning(
          DataMapWrapperSimpleInfo.fromDataMapWrapper(cgDataMapExprWrapper),
          prunedBlocklets.size(), getBlockCount(prunedBlocklets));
    }

    if (prunedBlocklets.size() == 0) {
      return prunedBlocklets;
    }

    // Now try to prune with FG DataMap.
    if (isFgDataMapPruningEnable(job.getConfiguration()) && dataMapJob != null) {
      DataMapExprWrapper fgDataMapExprWrapper = chooser.chooseFGDataMap(filter.getResolver());
      if (fgDataMapExprWrapper != null) {
        // Prune segments from already pruned blocklets
        pruneSegments(segmentIds, prunedBlocklets);
        List<ExtendedBlocklet> fgPrunedBlocklets = DataMapUtil.executeDataMapJob(carbonTable,
            filter.getResolver(), segmentIds, fgDataMapExprWrapper, dataMapJob, partitionsToPrune);
        // note that the 'fgPrunedBlocklets' has extra datamap related info compared with
        // 'prunedBlocklets', so the intersection should keep the elements in 'fgPrunedBlocklets'
        prunedBlocklets =
            intersectFilteredBlocklets(carbonTable, prunedBlocklets, fgPrunedBlocklets);
        ExplainCollector.recordFGDataMapPruning(
            DataMapWrapperSimpleInfo.fromDataMapWrapper(fgDataMapExprWrapper),
            prunedBlocklets.size(), getBlockCount(prunedBlocklets));
      }
    }
    LOG.info("Finished block pruning ...");
    return prunedBlocklets;
  }

  private List<ExtendedBlocklet> intersectFilteredBlocklets(CarbonTable carbonTable,
      List<ExtendedBlocklet> previousDataMapPrunedBlocklets,
      List<ExtendedBlocklet> otherDataMapPrunedBlocklets) {
    List<ExtendedBlocklet> prunedBlocklets = null;
    if (BlockletDataMapUtil.isCacheLevelBlock(carbonTable)) {
      prunedBlocklets = new ArrayList<>();
      for (ExtendedBlocklet otherBlocklet : otherDataMapPrunedBlocklets) {
        if (previousDataMapPrunedBlocklets.contains(otherBlocklet)) {
          prunedBlocklets.add(otherBlocklet);
        }
      }
    } else {
      prunedBlocklets = (List) CollectionUtils
          .intersection(otherDataMapPrunedBlocklets, previousDataMapPrunedBlocklets);
    }
    return prunedBlocklets;
  }

  /**
   * Prune the segments from the already pruned blocklets.
   * @param segments
   * @param prunedBlocklets
   */
  private void pruneSegments(List<Segment> segments, List<ExtendedBlocklet> prunedBlocklets) {
    List<Segment> toBeRemovedSegments = new ArrayList<>();
    for (Segment segment : segments) {
      boolean found = false;
      // Clear the old pruned index files if any present
      segment.getFilteredIndexShardNames().clear();
      // Check the segment exist in any of the pruned blocklets.
      for (ExtendedBlocklet blocklet : prunedBlocklets) {
        if (blocklet.getSegment().toString().equals(segment.toString())) {
          found = true;
          // Set the pruned index file to the segment for further pruning.
          String shardName = CarbonTablePath.getShardName(blocklet.getFilePath());
          segment.setFilteredIndexShardName(shardName);
        }
      }
      // Add to remove segments list if not present in pruned blocklets.
      if (!found) {
        toBeRemovedSegments.add(segment);
      }
    }
    // Remove all segments which are already pruned from pruned blocklets
    segments.removeAll(toBeRemovedSegments);
  }
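  // Illustrative note (added commentary, not from the original source): the pruning methods above
  // are applied in three stages inside getPrunedBlocklets():
  //   1. the default blocklet datamap prunes on the driver,
  //   2. any coarse-grained (CG) datamap prunes the remaining segments,
  //   3. any fine-grained (FG) datamap prunes last, when a DataMapJob is available,
  // with pruneSegments() and intersectFilteredBlocklets() narrowing the segment and blocklet
  // lists between stages.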
  @Override
  public RecordReader<Void, T> createRecordReader(InputSplit inputSplit,
      TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
    Configuration configuration = taskAttemptContext.getConfiguration();
    QueryModel queryModel = createQueryModel(inputSplit, taskAttemptContext,
        getFilterPredicates(taskAttemptContext.getConfiguration()));
    CarbonReadSupport<T> readSupport = getReadSupportClass(configuration);
    return new CarbonRecordReader<T>(queryModel, readSupport,
        taskAttemptContext.getConfiguration());
  }

  public QueryModel createQueryModel(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
      throws IOException {
    return createQueryModel(inputSplit, taskAttemptContext,
        getFilterPredicates(taskAttemptContext.getConfiguration()));
  }

  public QueryModel createQueryModel(InputSplit inputSplit, TaskAttemptContext taskAttemptContext,
      Expression filterExpression) throws IOException {
    Configuration configuration = taskAttemptContext.getConfiguration();
    CarbonTable carbonTable = getOrCreateCarbonTable(configuration);

    // set projection column in the query model
    String projectionString = getColumnProjection(configuration);
    String[] projectColumns;
    if (projectionString != null) {
      projectColumns = projectionString.split(",");
    } else {
      projectColumns = new String[] {};
    }
    checkAndAddImplicitExpression(filterExpression, inputSplit);
    QueryModel queryModel = new QueryModelBuilder(carbonTable)
        .projectColumns(projectColumns)
        .filterExpression(filterExpression)
        .dataConverter(getDataTypeConverter(configuration))
        .build();
    return queryModel;
  }

  /**
   * This method will create an Implicit Expression and set it as right child in the given
   * expression
   *
   * @param expression
   * @param inputSplit
   */
  private void checkAndAddImplicitExpression(Expression expression, InputSplit inputSplit) {
    if (inputSplit instanceof CarbonMultiBlockSplit) {
      CarbonMultiBlockSplit split = (CarbonMultiBlockSplit) inputSplit;
      List<CarbonInputSplit> splits = split.getAllSplits();
      // iterate over all the splits and create block to blocklet mapping
      Map<String, Set<Integer>> blockIdToBlockletIdMapping = new HashMap<>();
      for (CarbonInputSplit carbonInputSplit : splits) {
        Set<Integer> validBlockletIds = carbonInputSplit.getValidBlockletIds();
        if (null != validBlockletIds && !validBlockletIds.isEmpty()) {
          String uniqueBlockPath = carbonInputSplit.getFilePath();
          String shortBlockPath = CarbonTablePath
              .getShortBlockId(uniqueBlockPath.substring(uniqueBlockPath.lastIndexOf("/Part") + 1));
          blockIdToBlockletIdMapping.put(shortBlockPath, validBlockletIds);
        }
      }
      if (!blockIdToBlockletIdMapping.isEmpty()) {
        // create implicit expression and set as right child
        FilterUtil
            .createImplicitExpressionAndSetAsRightChild(expression, blockIdToBlockletIdMapping);
      }
    }
  }

  public CarbonReadSupport<T> getReadSupportClass(Configuration configuration) {
    String readSupportClass = configuration.get(CARBON_READ_SUPPORT);
    // By default it uses dictionary decoder read class
    CarbonReadSupport<T> readSupport = null;
    if (readSupportClass != null) {
      try {
        Class<?> myClass = Class.forName(readSupportClass);
        Constructor<?> constructor = myClass.getConstructors()[0];
        Object object = constructor.newInstance();
        if (object instanceof CarbonReadSupport) {
          readSupport = (CarbonReadSupport) object;
        }
      } catch (ClassNotFoundException ex) {
        LOG.error("Class " + readSupportClass + " not found", ex);
      } catch (Exception ex) {
        LOG.error("Error while creating " + readSupportClass, ex);
      }
    } else {
      readSupport = new DictionaryDecodeReadSupport<>();
    }
    return readSupport;
  }
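  // Illustrative usage sketch (added commentary, not part of the original class): plugging in a
  // custom read support implementation by class name, which getReadSupportClass(...) above
  // instantiates reflectively. MyReadSupport / com.example.MyReadSupport are hypothetical names
  // for a class that would implement CarbonReadSupport.
  //
  //   CarbonInputFormat.setCarbonReadSupport(conf, MyReadSupport.class);
  //   // or, equivalently, via the raw configuration key used by this class:
  //   // conf.set("mapreduce.input.carboninputformat.readsupport", "com.example.MyReadSupport");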
  @Override
  protected boolean isSplitable(JobContext context, Path filename) {
    try {
      // Don't split the file if it is local file system
      FileSystem fileSystem = filename.getFileSystem(context.getConfiguration());
      if (fileSystem instanceof LocalFileSystem) {
        return false;
      }
    } catch (Exception e) {
      return true;
    }
    return true;
  }

  public static void setCarbonReadSupport(Configuration configuration,
      Class<? extends CarbonReadSupport> readSupportClass) {
    if (readSupportClass != null) {
      configuration.set(CARBON_READ_SUPPORT, readSupportClass.getName());
    }
  }

  /**
   * It is optional, if user does not set then it reads from store
   *
   * @param configuration
   * @param converterClass is the Data type converter for different computing engine
   */
  public static void setDataTypeConverter(Configuration configuration,
      Class<? extends DataTypeConverter> converterClass) {
    if (null != converterClass) {
      configuration.set(CARBON_CONVERTER, converterClass.getCanonicalName());
    }
  }

  public static DataTypeConverter getDataTypeConverter(Configuration configuration)
      throws IOException {
    String converterClass = configuration.get(CARBON_CONVERTER);
    if (converterClass == null) {
      return new DataTypeConverterImpl();
    }
    try {
      return (DataTypeConverter) Class.forName(converterClass).newInstance();
    } catch (Exception e) {
      throw new IOException(e);
    }
  }

  public static void setDatabaseName(Configuration configuration, String databaseName) {
    if (null != databaseName) {
      configuration.set(DATABASE_NAME, databaseName);
    }
  }

  public static String getDatabaseName(Configuration configuration)
      throws InvalidConfigurationException {
    String databaseName = configuration.get(DATABASE_NAME);
    if (null == databaseName) {
      throw new InvalidConfigurationException("Database name is not set.");
    }
    return databaseName;
  }

  public static void setTableName(Configuration configuration, String tableName) {
    if (null != tableName) {
      configuration.set(TABLE_NAME, tableName);
    }
  }

  public static String getTableName(Configuration configuration)
      throws InvalidConfigurationException {
    String tableName = configuration.get(TABLE_NAME);
    if (tableName == null) {
      throw new InvalidConfigurationException("Table name is not set");
    }
    return tableName;
  }

  public static void setAccessStreamingSegments(Configuration configuration, Boolean validate)
      throws InvalidConfigurationException {
    configuration.set(CarbonCommonConstantsInternal.QUERY_ON_PRE_AGG_STREAMING + "."
        + getDatabaseName(configuration) + "." + getTableName(configuration), validate.toString());
  }

  public static boolean getAccessStreamingSegments(Configuration configuration) {
    try {
      return configuration
          .get(CarbonCommonConstantsInternal.QUERY_ON_PRE_AGG_STREAMING + "."
              + getDatabaseName(configuration) + "." + getTableName(configuration), "false")
          .equalsIgnoreCase("true");
    } catch (InvalidConfigurationException e) {
      return false;
    }
  }

  /**
   * Project all Columns for carbon reader
   *
   * @param carbonTable
   * @return String array of columnNames
   */
  public String[] projectAllColumns(CarbonTable carbonTable) {
    List<ColumnSchema> colList = carbonTable.getTableInfo().getFactTable().getListOfColumns();
    List<String> projectColumns = new ArrayList<>();
    // for complex type columns, add just the parent column name while skipping the child columns
    for (ColumnSchema col : colList) {
      if (!col.getColumnName().contains(".")) {
        projectColumns.add(col.getColumnName());
      }
    }
    return projectColumns.toArray(new String[projectColumns.size()]);
  }
  private static void getQuerySegmentToAccess(Configuration conf, String dbName,
      String tableName) {
    String segmentNumbersFromProperty = CarbonProperties.getInstance()
        .getProperty(CarbonCommonConstants.CARBON_INPUT_SEGMENTS + dbName + "." + tableName, "*");
    if (!segmentNumbersFromProperty.trim().equals("*")) {
      CarbonInputFormat.setSegmentsToAccess(conf,
          Segment.toSegmentList(segmentNumbersFromProperty.split(","), null));
    }
  }

  /**
   * Set `CARBON_INPUT_SEGMENTS` from property to configuration
   */
  public static void setQuerySegment(Configuration conf, CarbonTable carbonTable) {
    String tableName = carbonTable.getTableName();
    getQuerySegmentToAccess(conf, carbonTable.getDatabaseName(), tableName);
  }
}
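// Illustrative end-to-end sketch (added commentary, not from the original source): wiring a
// concrete subclass such as CarbonTableInputFormat into a MapReduce job. The database, table and
// path names below are hypothetical.
//
//   Job job = Job.getInstance(new Configuration());
//   Configuration conf = job.getConfiguration();
//   CarbonInputFormat.setDatabaseName(conf, "default");
//   CarbonInputFormat.setTableName(conf, "sales");
//   CarbonInputFormat.setTablePath(conf, "/data/warehouse/default/sales");
//   CarbonInputFormat.setColumnProjection(conf, new String[] {"id", "amount"});
//   job.setInputFormatClass(CarbonTableInputFormat.class);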