/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.presto.impl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import org.apache.carbondata.common.logging.LogServiceFactory;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.datamap.DataMapStoreManager;
import org.apache.carbondata.core.datastore.filesystem.CarbonFile;
import org.apache.carbondata.core.datastore.impl.FileFactory;
import org.apache.carbondata.core.indexstore.PartitionSpec;
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
import org.apache.carbondata.core.metadata.CarbonMetadata;
import org.apache.carbondata.core.metadata.SegmentFileStore;
import org.apache.carbondata.core.metadata.converter.SchemaConverter;
import org.apache.carbondata.core.metadata.converter.ThriftWrapperSchemaConverterImpl;
import org.apache.carbondata.core.metadata.schema.PartitionInfo;
import org.apache.carbondata.core.metadata.schema.partition.PartitionType;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.core.metadata.schema.table.TableInfo;
import org.apache.carbondata.core.reader.ThriftReader;
import org.apache.carbondata.core.scan.expression.Expression;
import org.apache.carbondata.core.statusmanager.FileFormat;
import org.apache.carbondata.core.statusmanager.LoadMetadataDetails;
import org.apache.carbondata.core.statusmanager.SegmentStatusManager;
import org.apache.carbondata.core.util.CarbonProperties;
import org.apache.carbondata.core.util.CarbonUtil;
import org.apache.carbondata.core.util.path.CarbonTablePath;
import org.apache.carbondata.hadoop.CarbonInputSplit;
import org.apache.carbondata.hadoop.api.CarbonInputFormat;
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat;
import org.apache.carbondata.presto.PrestoFilterUtil;

import com.facebook.presto.hadoop.$internal.com.google.gson.Gson;
import com.facebook.presto.hadoop.$internal.org.apache.commons.collections.CollectionUtils;
import com.facebook.presto.hive.HiveColumnHandle;
import com.facebook.presto.spi.SchemaTableName;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.google.inject.Inject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.log4j.Logger;
import org.apache.thrift.TBase;

import static org.apache.hadoop.fs.s3a.Constants.ACCESS_KEY;
import static org.apache.hadoop.fs.s3a.Constants.ENDPOINT;
import static org.apache.hadoop.fs.s3a.Constants.SECRET_KEY;
/**
 * CarbonTableReader is a facade over these utilities:
 * 1. CarbonMetadata (logical table)
 * 2. FileFactory (physical table files)
 * 3. CarbonCommonFactory (common services)
 * 4. DictionaryFactory (dictionary parsing utilities)
 * Currently it is mainly used to parse the metadata of tables under
 * the configured carbondata-store path and to filter the relevant
 * input splits with the given query predicates.
 */
public class CarbonTableReader {

  // Default PathFilter: accepts files in carbondata format (with the .carbondata extension).
  private static final PathFilter DefaultFilter = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return CarbonTablePath.isCarbonDataFile(path.getName());
    }
  };

  public CarbonTableConfig config;

  /**
   * A cache for the Carbon reader. With this cache,
   * the metadata of a table is read from the file system only once.
   */
  private AtomicReference<Map<SchemaTableName, CarbonTableCacheModel>> carbonCache;

  private String queryId;

  /**
   * Logger instance
   */
  private static final Logger LOGGER =
      LogServiceFactory.getLogService(CarbonTableReader.class.getName());

  /**
   * List of schemas
   */
  private List<String> schemaNames = new ArrayList<>();

  @Inject
  public CarbonTableReader(CarbonTableConfig config) {
    this.config = Objects.requireNonNull(config, "CarbonTableConfig is null");
    this.carbonCache = new AtomicReference<>(new ConcurrentHashMap<>());
    populateCarbonProperties();
  }

  /**
   * For a Presto worker node to initialize the metadata cache of a table.
   *
   * @param table the schema and name of the table.
   * @param location the physical location (table path) of the table.
   * @param config the hadoop configuration used to access the file system.
   * @return the cache model holding the table metadata.
   */
  public CarbonTableCacheModel getCarbonCache(SchemaTableName table, String location,
      Configuration config) {
    updateSchemaTables(table, config);
    CarbonTableCacheModel carbonTableCacheModel = carbonCache.get().get(table);
    if (carbonTableCacheModel == null || !carbonTableCacheModel.isValid()) {
      return parseCarbonMetadata(table, location, config);
    }
    return carbonTableCacheModel;
  }

  /**
   * Refresh the cached schema modification time of the given table and,
   * if the cache entry has become stale, invalidate its datamaps so that
   * the metadata is re-read on the next access.
   */
  private void updateSchemaTables(SchemaTableName schemaTableName, Configuration config) {
    CarbonTableCacheModel carbonTableCacheModel = carbonCache.get().get(schemaTableName);
    if (carbonTableCacheModel != null
        && carbonTableCacheModel.getCarbonTable().isTransactionalTable()) {
      CarbonTable carbonTable = carbonTableCacheModel.getCarbonTable();
      long latestTime = FileFactory
          .getCarbonFile(CarbonTablePath.getSchemaFilePath(carbonTable.getTablePath()), config)
          .getLastModifiedTime();
      carbonTableCacheModel.setCurrentSchemaTime(latestTime);
      if (!carbonTableCacheModel.isValid()) {
        // Invalidate datamaps
        DataMapStoreManager.getInstance()
            .clearDataMaps(carbonTableCacheModel.getCarbonTable().getAbsoluteTableIdentifier());
      }
    }
  }
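
  /*
   * Minimal usage sketch for the cache flow above (illustrative only; in
   * practice the Presto connector drives these calls). The names hdfsConf,
   * name and tablePath are hypothetical placeholders, not part of this class:
   *
   *   CarbonTableReader reader = new CarbonTableReader(new CarbonTableConfig());
   *   Configuration hdfsConf = new Configuration();
   *   SchemaTableName name = new SchemaTableName("default", "sales");
   *   String tablePath = "hdfs://namenode/user/hive/warehouse/sales";
   *   // The first call parses the metadata from the file system and caches it;
   *   // later calls return the cached model while it is still valid.
   *   CarbonTableCacheModel model = reader.getCarbonCache(name, tablePath, hdfsConf);
   *   CarbonTable carbonTable = model.getCarbonTable();
   */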
  /**
   * Read the metadata of the given table
   * and cache it in this.carbonCache (the CarbonTableReader cache).
   *
   * @param table name of the given table.
   * @param tablePath physical location of the given table.
   * @param config hadoop configuration used to access the file system.
   * @return the CarbonTableCacheModel instance which contains all the needed metadata of the table.
   */
  private CarbonTableCacheModel parseCarbonMetadata(SchemaTableName table, String tablePath,
      Configuration config) {
    try {
      CarbonTableCacheModel cache = getValidCacheBySchemaTableName(table);
      if (cache != null) {
        return cache;
      }
      // Multiple tasks can be launched in a worker concurrently, so this needs synchronization.
      synchronized (this) {
        // The cache might have been filled by another thread; if so, use it.
        CarbonTableCacheModel cacheModel = getValidCacheBySchemaTableName(table);
        if (cacheModel != null) {
          return cacheModel;
        }
        // Step 1: get the store path of the table.
        String schemaFilePath = CarbonTablePath.getSchemaFilePath(tablePath, config);
        // If the metadata folder exists, it is a transactional table.
        CarbonFile schemaFile = FileFactory.getCarbonFile(schemaFilePath, config);
        boolean isTransactionalTable = schemaFile.exists();
        org.apache.carbondata.format.TableInfo tableInfo;
        long modifiedTime = System.currentTimeMillis();
        if (isTransactionalTable) {
          // Step 2: read the metadata (tableInfo) of the table.
          ThriftReader.TBaseCreator createTBase = new ThriftReader.TBaseCreator() {
            // TBase is used to read and write thrift objects.
            // TableInfo is a kind of TBase used to read and write table information.
            // TableInfo is generated by thrift;
            // see schema.thrift under format/src/main/thrift for details.
            public TBase create() {
              return new org.apache.carbondata.format.TableInfo();
            }
          };
          ThriftReader thriftReader = new ThriftReader(schemaFilePath, createTBase, config);
          thriftReader.open();
          tableInfo = (org.apache.carbondata.format.TableInfo) thriftReader.read();
          thriftReader.close();
          modifiedTime = schemaFile.getLastModifiedTime();
        } else {
          tableInfo = CarbonUtil.inferSchema(tablePath, table.getTableName(), false, config);
        }
        // Step 3: convert the format-level TableInfo to the code-level TableInfo.
        SchemaConverter schemaConverter = new ThriftWrapperSchemaConverterImpl();
        // wrapperTableInfo is the code-level representation of a table in carbondata core,
        // different from the thrift TableInfo.
        TableInfo wrapperTableInfo = schemaConverter
            .fromExternalToWrapperTableInfo(tableInfo, table.getSchemaName(), table.getTableName(),
                tablePath);
        wrapperTableInfo.setTransactionalTable(isTransactionalTable);
        CarbonMetadata.getInstance().removeTable(wrapperTableInfo.getTableUniqueName());
        // Step 4: load the metadata info into CarbonMetadata.
        CarbonMetadata.getInstance().loadTableMetadata(wrapperTableInfo);
        CarbonTable carbonTable = Objects.requireNonNull(
            CarbonMetadata.getInstance().getCarbonTable(table.getSchemaName(),
                table.getTableName()), "carbontable is null");
        // Cache the table.
        cache = new CarbonTableCacheModel(modifiedTime, carbonTable);
        carbonCache.get().put(table, cache);
      }
      return cache;
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }
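
  /*
   * parseCarbonMetadata follows a check / lock / re-check pattern so that only
   * one thread pays the cost of parsing the schema while concurrent threads
   * reuse the result. A generic sketch of the same idea, with illustrative
   * names only (Key, Value, cache and loadValue are not part of this class):
   *
   *   Value get(Key key) {
   *     Value v = cache.get(key);           // fast path, no lock
   *     if (v != null) return v;
   *     synchronized (this) {
   *       v = cache.get(key);               // re-check under the lock
   *       if (v == null) {
   *         v = loadValue(key);             // expensive work done exactly once
   *         cache.put(key, v);
   *       }
   *     }
   *     return v;
   *   }
   */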
  private CarbonTableCacheModel getValidCacheBySchemaTableName(SchemaTableName schemaTableName) {
    CarbonTableCacheModel cache = carbonCache.get().get(schemaTableName);
    if (cache != null && cache.isValid()) {
      return cache;
    }
    return null;
  }

  public List<CarbonLocalMultiBlockSplit> getInputSplits2(CarbonTableCacheModel tableCacheModel,
      Expression filters, TupleDomain<HiveColumnHandle> constraints, Configuration config)
      throws IOException {
    List<CarbonLocalInputSplit> result = new ArrayList<>();
    List<CarbonLocalMultiBlockSplit> multiBlockSplitList = new ArrayList<>();
    CarbonTable carbonTable = tableCacheModel.getCarbonTable();
    TableInfo tableInfo = tableCacheModel.getCarbonTable().getTableInfo();
    config.set(CarbonTableInputFormat.INPUT_SEGMENT_NUMBERS, "");
    String carbonTablePath = carbonTable.getAbsoluteTableIdentifier().getTablePath();
    config.set(CarbonTableInputFormat.INPUT_DIR, carbonTablePath);
    config.set(CarbonTableInputFormat.DATABASE_NAME, carbonTable.getDatabaseName());
    config.set(CarbonTableInputFormat.TABLE_NAME, carbonTable.getTableName());
    config.set("query.id", queryId);
    CarbonInputFormat.setTransactionalTable(config, carbonTable.isTransactionalTable());
    CarbonInputFormat.setTableInfo(config, carbonTable.getTableInfo());

    JobConf jobConf = new JobConf(config);
    List<PartitionSpec> filteredPartitions = new ArrayList<>();
    PartitionInfo partitionInfo = carbonTable.getPartitionInfo(carbonTable.getTableName());
    LoadMetadataDetails[] loadMetadataDetails = null;
    if (partitionInfo != null && partitionInfo.getPartitionType() == PartitionType.NATIVE_HIVE) {
      try {
        loadMetadataDetails = SegmentStatusManager.readTableStatusFile(
            CarbonTablePath.getTableStatusFilePath(carbonTable.getTablePath()));
      } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw e;
      }
      filteredPartitions = findRequiredPartitions(constraints, carbonTable, loadMetadataDetails);
    }
    try {
      CarbonTableInputFormat.setTableInfo(config, tableInfo);
      CarbonTableInputFormat<Object> carbonTableInputFormat =
          createInputFormat(jobConf, carbonTable.getAbsoluteTableIdentifier(), filters,
              filteredPartitions);
      Job job = Job.getInstance(jobConf);
      List<InputSplit> splits = carbonTableInputFormat.getSplits(job);
      Gson gson = new Gson();
      if (splits != null && splits.size() > 0) {
        for (InputSplit inputSplit : splits) {
          CarbonInputSplit carbonInputSplit = (CarbonInputSplit) inputSplit;
          result.add(new CarbonLocalInputSplit(carbonInputSplit.getSegmentId(),
              carbonInputSplit.getPath().toString(), carbonInputSplit.getStart(),
              carbonInputSplit.getLength(), Arrays.asList(carbonInputSplit.getLocations()),
              carbonInputSplit.getNumberOfBlocklets(), carbonInputSplit.getVersion().number(),
              carbonInputSplit.getDeleteDeltaFiles(), carbonInputSplit.getBlockletId(),
              gson.toJson(carbonInputSplit.getDetailInfo()),
              carbonInputSplit.getFileFormat().ordinal()));
        }
        // Use block distribution: group splits of the same segment and file
        // (plus the start offset for streaming ROW_V1 files) into one
        // multi-block split.
        List<List<CarbonLocalInputSplit>> inputSplits =
            new ArrayList<>(result.stream().collect(Collectors.groupingBy(carbonInput -> {
              if (FileFormat.ROW_V1.equals(carbonInput.getFileFormat())) {
                return carbonInput.getSegmentId().concat(carbonInput.getPath())
                    .concat(carbonInput.getStart() + "");
              }
              return carbonInput.getSegmentId().concat(carbonInput.getPath());
            })).values());
        for (List<CarbonLocalInputSplit> localSplits : inputSplits) {
          multiBlockSplitList.add(new CarbonLocalMultiBlockSplit(localSplits,
              localSplits.stream().flatMap(f -> Arrays.stream(getLocations(f))).distinct()
                  .toArray(String[]::new)));
        }
        LOGGER.debug("Size of the MultiBlockList: " + multiBlockSplitList.size());
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return multiBlockSplitList;
  }
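
  /*
   * The block distribution above relies on Collectors.groupingBy keyed on a
   * composite string. A self-contained sketch of that technique, with plain
   * string pairs standing in for the split objects (all names illustrative):
   *
   *   List<String[]> splits = Arrays.asList(
   *       new String[] {"seg0", "/t/part0.carbondata"},
   *       new String[] {"seg0", "/t/part0.carbondata"},
   *       new String[] {"seg1", "/t/part1.carbondata"});
   *   // One group per (segmentId + path) key -> one multi-block split each.
   *   Collection<List<String[]>> grouped = splits.stream()
   *       .collect(Collectors.groupingBy(s -> s[0].concat(s[1])))
   *       .values();   // here: two groups, of sizes 2 and 1
   */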
  /**
   * Returns the list of partition specs to query, based on the domain constraints.
   *
   * @param constraints presto domain constraints on the partition columns.
   * @param carbonTable the table being queried.
   * @param loadMetadataDetails load details of the table's segments.
   * @throws IOException if a segment file cannot be read.
   */
  private List<PartitionSpec> findRequiredPartitions(TupleDomain<HiveColumnHandle> constraints,
      CarbonTable carbonTable, LoadMetadataDetails[] loadMetadataDetails) throws IOException {
    Set<PartitionSpec> partitionSpecs = new HashSet<>();
    for (LoadMetadataDetails loadMetadataDetail : loadMetadataDetails) {
      try {
        SegmentFileStore segmentFileStore = new SegmentFileStore(carbonTable.getTablePath(),
            loadMetadataDetail.getSegmentFile());
        partitionSpecs.addAll(segmentFileStore.getPartitionSpecs());
      } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw e;
      }
    }
    List<String> partitionValuesFromExpression =
        PrestoFilterUtil.getPartitionFilters(carbonTable, constraints);
    // Keep only the partitions whose value list contains all the filter values.
    return partitionSpecs.stream().filter(partitionSpec -> CollectionUtils
        .isSubCollection(partitionValuesFromExpression, partitionSpec.getPartitions()))
        .collect(Collectors.toList());
  }

  private CarbonTableInputFormat<Object> createInputFormat(Configuration conf,
      AbsoluteTableIdentifier identifier, Expression filterExpression,
      List<PartitionSpec> filteredPartitions) throws IOException {
    CarbonTableInputFormat<Object> format = new CarbonTableInputFormat<>();
    CarbonTableInputFormat.setTablePath(conf,
        identifier.appendWithLocalPrefix(identifier.getTablePath()));
    CarbonTableInputFormat.setFilterPredicates(conf, filterExpression);
    if (!filteredPartitions.isEmpty()) {
      CarbonTableInputFormat.setPartitionsToPrune(conf, filteredPartitions);
    }
    return format;
  }

  private void populateCarbonProperties() {
    addProperty(CarbonCommonConstants.UNSAFE_WORKING_MEMORY_IN_MB, config.getUnsafeMemoryInMb());
    addProperty(CarbonCommonConstants.ENABLE_UNSAFE_IN_QUERY_EXECUTION,
        config.getEnableUnsafeInQueryExecution());
    addProperty(CarbonCommonConstants.ENABLE_UNSAFE_COLUMN_PAGE,
        config.getEnableUnsafeColumnPage());
    addProperty(CarbonCommonConstants.ENABLE_UNSAFE_SORT, config.getEnableUnsafeSort());
    addProperty(CarbonCommonConstants.ENABLE_QUERY_STATISTICS, config.getEnableQueryStatistics());
    // TODO: make this configurable.
    addProperty(CarbonCommonConstants.CARBON_WRITTEN_BY_APPNAME, "Presto_Server");
  }
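
  /*
   * findRequiredPartitions keeps a partition only when every filter value is
   * contained in that partition's value list (commons-collections
   * isSubCollection). An illustrative example with plain lists:
   *
   *   List<String> filterValues = Arrays.asList("country=US");
   *   List<String> partitionValues = Arrays.asList("country=US", "year=2019");
   *   // true: all filter values occur in the partition, so it is queried.
   *   boolean keep = CollectionUtils.isSubCollection(filterValues, partitionValues);
   */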
  public Configuration updateS3Properties(Configuration configuration) {
    // Default every credential to "" rather than the string "null"
    // (Objects.toString(null) would yield "null").
    configuration.set(ACCESS_KEY, Objects.toString(config.getS3A_AcesssKey(), ""));
    configuration.set(SECRET_KEY, Objects.toString(config.getS3A_SecretKey(), ""));
    configuration.set(CarbonCommonConstants.S3_ACCESS_KEY,
        Objects.toString(config.getS3_AcesssKey(), ""));
    configuration.set(CarbonCommonConstants.S3_SECRET_KEY,
        Objects.toString(config.getS3_SecretKey(), ""));
    configuration.set(CarbonCommonConstants.S3N_ACCESS_KEY,
        Objects.toString(config.getS3N_AcesssKey(), ""));
    configuration.set(CarbonCommonConstants.S3N_SECRET_KEY,
        Objects.toString(config.getS3N_SecretKey(), ""));
    configuration.set(ENDPOINT, Objects.toString(config.getS3EndPoint(), ""));
    return configuration;
  }

  private void addProperty(String propertyName, String propertyValue) {
    if (propertyValue != null) {
      CarbonProperties.getInstance().addProperty(propertyName, propertyValue);
    }
  }

  /**
   * @param cis the split whose locations are needed.
   * @return the preferred host locations of the split.
   */
  private String[] getLocations(CarbonLocalInputSplit cis) {
    return cis.getLocations().toArray(new String[cis.getLocations().size()]);
  }

  public void setQueryId(String queryId) {
    this.queryId = queryId;
  }
}
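
/*
 * Wiring sketch (illustrative only, not part of the class): how a
 * connector-side caller might combine the pieces above for an S3-backed
 * store. All names besides the CarbonTableReader API are hypothetical.
 *
 *   Configuration conf = reader.updateS3Properties(new Configuration());
 *   CarbonTableCacheModel model =
 *       reader.getCarbonCache(name, "s3a://bucket/warehouse/sales", conf);
 *   reader.setQueryId("query-1");
 *   List<CarbonLocalMultiBlockSplit> splits =
 *       reader.getInputSplits2(model, filterExpression, constraints, conf);
 */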