org.apache.carbondata.hadoop.api.CarbonTableOutputFormat.java Source code

Introduction

Here is the full source code for org.apache.carbondata.hadoop.api.CarbonTableOutputFormat.java, the table-level Hadoop output format that CarbonData uses to write load data into a new segment.
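
To put the listing in context, here is a minimal driver sketch showing how the static setters below are typically wired up in a MapReduce job. The class name CarbonWriteDriver, the job name, and the database/table names are illustrative, not part of the CarbonData API; a CarbonTable and an input StructType are assumed to have been built elsewhere.

import java.io.IOException;

import org.apache.carbondata.core.metadata.datatype.StructType;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.hadoop.api.CarbonTableOutputFormat;
import org.apache.carbondata.hadoop.internal.ObjectArrayWritable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CarbonWriteDriver {

    // carbonTable and inputSchema are assumed to have been built elsewhere.
    public static Job configureJob(Configuration conf, CarbonTable carbonTable,
            StructType inputSchema, String tablePath) throws IOException {
        CarbonTableOutputFormat.setDatabaseName(conf, "default");   // hypothetical database
        CarbonTableOutputFormat.setTableName(conf, "sales");        // hypothetical table
        CarbonTableOutputFormat.setTablePath(conf, tablePath);
        CarbonTableOutputFormat.setCarbonTable(conf, carbonTable);
        CarbonTableOutputFormat.setInputSchema(conf, inputSchema);
        CarbonTableOutputFormat.setOverwrite(conf, false);

        Job job = Job.getInstance(conf, "carbon-load");
        job.setOutputFormatClass(CarbonTableOutputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(ObjectArrayWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(tablePath));
        return job;
    }
}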

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.hadoop.api;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.carbondata.common.logging.LogServiceFactory;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.constants.CarbonLoadOptionConstants;
import org.apache.carbondata.core.datastore.compression.CompressorFactory;
import org.apache.carbondata.core.metadata.datatype.StructField;
import org.apache.carbondata.core.metadata.datatype.StructType;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.core.metadata.schema.table.TableInfo;
import org.apache.carbondata.core.util.CarbonProperties;
import org.apache.carbondata.core.util.CarbonThreadFactory;
import org.apache.carbondata.core.util.DataTypeUtil;
import org.apache.carbondata.core.util.ObjectSerializationUtil;
import org.apache.carbondata.core.util.ThreadLocalSessionInfo;
import org.apache.carbondata.hadoop.internal.ObjectArrayWritable;
import org.apache.carbondata.processing.loading.ComplexDelimitersEnum;
import org.apache.carbondata.processing.loading.DataLoadExecutor;
import org.apache.carbondata.processing.loading.TableProcessingOperations;
import org.apache.carbondata.processing.loading.iterator.CarbonOutputIteratorWrapper;
import org.apache.carbondata.processing.loading.model.CarbonDataLoadSchema;
import org.apache.carbondata.processing.loading.model.CarbonLoadModel;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;

/**
 * This is a table-level output format which writes the data to the store in a new segment. Each
 * load creates a new segment folder and manages the folder through the tablestatus file.
 * It also generates and writes dictionary data during load, but only if a dictionary server is
 * configured.
 */
// TODO: Move the dictionary generator, which is coded in Spark, to the MR framework.
public class CarbonTableOutputFormat extends FileOutputFormat<NullWritable, ObjectArrayWritable> {

    private static final String LOAD_MODEL = "mapreduce.carbontable.load.model";
    private static final String DATABASE_NAME = "mapreduce.carbontable.databaseName";
    private static final String TABLE_NAME = "mapreduce.carbontable.tableName";
    private static final String TABLE = "mapreduce.carbontable.table";
    private static final String TABLE_PATH = "mapreduce.carbontable.tablepath";
    private static final String INPUT_SCHEMA = "mapreduce.carbontable.inputschema";
    private static final String TEMP_STORE_LOCATIONS = "mapreduce.carbontable.tempstore.locations";
    private static final String OVERWRITE_SET = "mapreduce.carbontable.set.overwrite";
    public static final String COMPLEX_DELIMITERS = "mapreduce.carbontable.complex_delimiters";
    private static final String CARBON_TRANSACTIONAL_TABLE = "mapreduce.input.carboninputformat.transactional";
    public static final String SERIALIZATION_NULL_FORMAT = "mapreduce.carbontable.serialization.null.format";
    public static final String BAD_RECORDS_LOGGER_ENABLE = "mapreduce.carbontable.bad.records.logger.enable";
    public static final String BAD_RECORDS_LOGGER_ACTION = "mapreduce.carbontable.bad.records.logger.action";
    public static final String IS_EMPTY_DATA_BAD_RECORD = "mapreduce.carbontable.empty.data.bad.record";
    public static final String SKIP_EMPTY_LINE = "mapreduce.carbontable.skip.empty.line";
    public static final String SORT_SCOPE = "mapreduce.carbontable.load.sort.scope";
    public static final String BATCH_SORT_SIZE_INMB = "mapreduce.carbontable.batch.sort.size.inmb";
    public static final String GLOBAL_SORT_PARTITIONS = "mapreduce.carbontable.global.sort.partitions";
    public static final String BAD_RECORD_PATH = "mapreduce.carbontable.bad.record.path";
    public static final String DATE_FORMAT = "mapreduce.carbontable.date.format";
    public static final String TIMESTAMP_FORMAT = "mapreduce.carbontable.timestamp.format";
    public static final String IS_ONE_PASS_LOAD = "mapreduce.carbontable.one.pass.load";
    public static final String DICTIONARY_SERVER_HOST = "mapreduce.carbontable.dict.server.host";
    public static final String DICTIONARY_SERVER_PORT = "mapreduce.carbontable.dict.server.port";
    /**
     * The update timestamp, if the user sets one for an update query. It must be applied
     * when the load status update time is recorded.
     */
    public static final String UPADTE_TIMESTAMP = "mapreduce.carbontable.update.timestamp";

    /**
     * During an update query we first delete the old data and then add the updated data to a
     * new segment, so there is a chance that complete segments need to be removed during
     * deletion. Those segments should be marked for delete during the table status update.
     */
    public static final String SEGMENTS_TO_BE_DELETED = "mapreduce.carbontable.segments.to.be.removed";

    /**
     * Used only to fire events when there are child tables to be loaded.
     */
    public static final String OPERATION_CONTEXT = "mapreduce.carbontable.operation.context";

    private static final Logger LOG = LogServiceFactory.getLogService(CarbonTableOutputFormat.class.getName());

    private CarbonOutputCommitter committer;

    public static void setDatabaseName(Configuration configuration, String databaseName) {
        if (null != databaseName) {
            configuration.set(DATABASE_NAME, databaseName);
        }
    }

    public static String getDatabaseName(Configuration configuration) {
        return configuration.get(DATABASE_NAME);
    }

    public static void setTableName(Configuration configuration, String tableName) {
        if (null != tableName) {
            configuration.set(TABLE_NAME, tableName);
        }
    }

    public static String getTableName(Configuration configuration) {
        return configuration.get(TABLE_NAME);
    }

    public static void setTablePath(Configuration configuration, String tablePath) {
        if (null != tablePath) {
            configuration.set(TABLE_PATH, tablePath);
        }
    }

    public static String getTablePath(Configuration configuration) {
        return configuration.get(TABLE_PATH);
    }

    public static void setCarbonTable(Configuration configuration, CarbonTable carbonTable) throws IOException {
        if (carbonTable != null) {
            configuration.set(TABLE,
                    ObjectSerializationUtil.convertObjectToString(carbonTable.getTableInfo().serialize()));
        }
    }

    public static CarbonTable getCarbonTable(Configuration configuration) throws IOException {
        CarbonTable carbonTable = null;
        String encodedString = configuration.get(TABLE);
        if (encodedString != null) {
            byte[] bytes = (byte[]) ObjectSerializationUtil.convertStringToObject(encodedString);
            TableInfo tableInfo = TableInfo.deserialize(bytes);
            carbonTable = CarbonTable.buildFromTableInfo(tableInfo);
        }
        return carbonTable;
    }

    public static void setLoadModel(Configuration configuration, CarbonLoadModel loadModel) throws IOException {
        if (loadModel != null) {
            configuration.set(LOAD_MODEL, ObjectSerializationUtil.convertObjectToString(loadModel));
        }
    }

    public static void setInputSchema(Configuration configuration, StructType inputSchema) throws IOException {
        if (inputSchema != null && inputSchema.getFields().size() > 0) {
            configuration.set(INPUT_SCHEMA, ObjectSerializationUtil.convertObjectToString(inputSchema));
        } else {
            throw new UnsupportedOperationException("Input schema must be set");
        }
    }

    private static StructType getInputSchema(Configuration configuration) throws IOException {
        String encodedString = configuration.get(INPUT_SCHEMA);
        if (encodedString != null) {
            return (StructType) ObjectSerializationUtil.convertStringToObject(encodedString);
        }
        return null;
    }

    public static boolean isOverwriteSet(Configuration configuration) {
        String overwrite = configuration.get(OVERWRITE_SET);
        if (overwrite != null) {
            return Boolean.parseBoolean(overwrite);
        }
        return false;
    }

    public static void setOverwrite(Configuration configuration, boolean overwrite) {
        configuration.set(OVERWRITE_SET, String.valueOf(overwrite));
    }

    public static void setTempStoreLocations(Configuration configuration, String[] tempLocations)
            throws IOException {
        if (tempLocations != null && tempLocations.length > 0) {
            configuration.set(TEMP_STORE_LOCATIONS, ObjectSerializationUtil.convertObjectToString(tempLocations));
        }
    }

    private static String[] getTempStoreLocations(TaskAttemptContext taskAttemptContext) throws IOException {
        String encodedString = taskAttemptContext.getConfiguration().get(TEMP_STORE_LOCATIONS);
        if (encodedString != null) {
            return (String[]) ObjectSerializationUtil.convertStringToObject(encodedString);
        }
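        // No temp locations configured: fall back to a unique directory under java.io.tmpdir.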
        return new String[] { System.getProperty("java.io.tmpdir") + "/" + System.nanoTime() + "_"
                + taskAttemptContext.getTaskAttemptID().toString() };
    }

    @Override
    public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
        if (this.committer == null) {
            Path output = getOutputPath(context);
            this.committer = new CarbonOutputCommitter(output, context);
        }
        return this.committer;
    }

    @Override
    public RecordWriter<NullWritable, ObjectArrayWritable> getRecordWriter(
            final TaskAttemptContext taskAttemptContext) throws IOException {
        final CarbonLoadModel loadModel = getLoadModel(taskAttemptContext.getConfiguration());
        String appName = taskAttemptContext.getConfiguration().get(CarbonCommonConstants.CARBON_WRITTEN_BY_APPNAME);
        if (null != appName) {
            CarbonProperties.getInstance().addProperty(CarbonCommonConstants.CARBON_WRITTEN_BY_APPNAME, appName);
        }
        // If the load model already has a task number (as in the SDK path), do not overwrite it.
        short sdkWriterCores = loadModel.getSdkWriterCores();
        int itrSize = (sdkWriterCores > 0) ? sdkWriterCores : 1;
        final CarbonOutputIteratorWrapper[] iterators = new CarbonOutputIteratorWrapper[itrSize];
        for (int i = 0; i < itrSize; i++) {
            iterators[i] = new CarbonOutputIteratorWrapper();
        }
        if (null == loadModel.getTaskNo() || loadModel.getTaskNo().isEmpty()) {
            loadModel.setTaskNo(taskAttemptContext.getConfiguration().get("carbon.outputformat.taskno",
                    String.valueOf(System.nanoTime())));
        }
        loadModel.setDataWritePath(taskAttemptContext.getConfiguration().get("carbon.outputformat.writepath"));
        final String[] tempStoreLocations = getTempStoreLocations(taskAttemptContext);
        DataTypeUtil.clearFormatter();
        final DataLoadExecutor dataLoadExecutor = new DataLoadExecutor();
        final ExecutorService executorService = Executors.newFixedThreadPool(1,
                new CarbonThreadFactory("CarbonRecordWriter:" + loadModel.getTableName(), true));
        // It should be started in a new thread, as the underlying iterator uses a blocking queue.
        Future future = executorService.submit(new Thread() {
            @Override
            public void run() {
                ThreadLocalSessionInfo.setConfigurationToCurrentThread(taskAttemptContext.getConfiguration());
                try {
                    dataLoadExecutor.execute(loadModel, tempStoreLocations, iterators);
                } catch (Exception e) {
                    executorService.shutdownNow();
                    for (CarbonOutputIteratorWrapper iterator : iterators) {
                        iterator.closeWriter(true);
                    }
                    try {
                        dataLoadExecutor.close();
                    } catch (Exception ex) {
                        // An exception already happened before close(), so propagate that one.
                        throw new RuntimeException(e);
                    }
                    throw new RuntimeException(e);
                } finally {
                    ThreadLocalSessionInfo.unsetAll();
                }
            }
        });

        if (sdkWriterCores > 0) {
            // CarbonMultiRecordWriter load-balances the written rows across iterators in round robin.
            return new CarbonMultiRecordWriter(iterators, dataLoadExecutor, loadModel, future, executorService);
        } else {
            return new CarbonRecordWriter(iterators[0], dataLoadExecutor, loadModel, future, executorService);
        }
    }

    public static CarbonLoadModel getLoadModel(Configuration conf) throws IOException {
        CarbonLoadModel model;
        String encodedString = conf.get(LOAD_MODEL);
        if (encodedString != null) {
            model = (CarbonLoadModel) ObjectSerializationUtil.convertStringToObject(encodedString);
            return model;
        }
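        // No serialized load model in the configuration: build one from the individual settings.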
        model = new CarbonLoadModel();
        CarbonProperties carbonProperty = CarbonProperties.getInstance();
        model.setDatabaseName(CarbonTableOutputFormat.getDatabaseName(conf));
        model.setTableName(CarbonTableOutputFormat.getTableName(conf));
        model.setCarbonTransactionalTable(true);
        CarbonTable carbonTable = getCarbonTable(conf);
        String columnCompressor = carbonTable.getTableInfo().getFactTable().getTableProperties()
                .get(CarbonCommonConstants.COMPRESSOR);
        if (null == columnCompressor) {
            columnCompressor = CompressorFactory.getInstance().getCompressor().getName();
        }
        model.setColumnCompressor(columnCompressor);
        model.setCarbonDataLoadSchema(new CarbonDataLoadSchema(carbonTable));
        model.setTablePath(getTablePath(conf));
        setFileHeader(conf, model);
        model.setSerializationNullFormat(conf.get(SERIALIZATION_NULL_FORMAT, "\\N"));
        model.setBadRecordsLoggerEnable(conf.get(BAD_RECORDS_LOGGER_ENABLE,
                carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE,
                        CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORDS_LOGGER_ENABLE_DEFAULT)));
        model.setBadRecordsAction(conf.get(BAD_RECORDS_LOGGER_ACTION,
                carbonProperty.getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION,
                        CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION_DEFAULT)));

        model.setIsEmptyDataBadRecord(conf.get(IS_EMPTY_DATA_BAD_RECORD,
                carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD,
                        CarbonLoadOptionConstants.CARBON_OPTIONS_IS_EMPTY_DATA_BAD_RECORD_DEFAULT)));

        model.setSkipEmptyLine(conf.get(SKIP_EMPTY_LINE,
                carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_SKIP_EMPTY_LINE)));

        String complexDelim = conf.get(COMPLEX_DELIMITERS);
        if (null == complexDelim) {
            complexDelim = ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_1.value() + ","
                    + ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_2.value() + ","
                    + ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_3.value();
        }
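        // setComplexDelimiter appends one delimiter level per call, so levels 1 to 3 are
        // registered in order.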
        String[] split = complexDelim.split(",");
        model.setComplexDelimiter(split[0]);
        if (split.length > 2) {
            model.setComplexDelimiter(split[1]);
            model.setComplexDelimiter(split[2]);
        } else if (split.length > 1) {
            model.setComplexDelimiter(split[1]);
        }
        model.setDateFormat(conf.get(DATE_FORMAT,
                carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_DATEFORMAT,
                        CarbonLoadOptionConstants.CARBON_OPTIONS_DATEFORMAT_DEFAULT)));

        model.setTimestampformat(conf.get(TIMESTAMP_FORMAT,
                carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_TIMESTAMPFORMAT,
                        CarbonLoadOptionConstants.CARBON_OPTIONS_TIMESTAMPFORMAT_DEFAULT)));

        model.setGlobalSortPartitions(conf.get(GLOBAL_SORT_PARTITIONS,
                carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_GLOBAL_SORT_PARTITIONS, null)));

        model.setBatchSortSizeInMb(conf.get(BATCH_SORT_SIZE_INMB,
                carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BATCH_SORT_SIZE_INMB,
                        carbonProperty.getProperty(CarbonCommonConstants.LOAD_BATCH_SORT_SIZE_INMB,
                                CarbonCommonConstants.LOAD_BATCH_SORT_SIZE_INMB_DEFAULT))));

        String badRecordsPath = conf.get(BAD_RECORD_PATH);
        if (StringUtils.isEmpty(badRecordsPath)) {
            badRecordsPath = carbonTable.getTableInfo().getFactTable().getTableProperties().get("bad_record_path");
            if (StringUtils.isEmpty(badRecordsPath)) {
                badRecordsPath = carbonProperty.getProperty(
                        CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORD_PATH,
                        carbonProperty.getProperty(CarbonCommonConstants.CARBON_BADRECORDS_LOC,
                                CarbonCommonConstants.CARBON_BADRECORDS_LOC_DEFAULT_VAL));
            }
        }
        model.setBadRecordsLocation(badRecordsPath);
        model.setUseOnePass(conf.getBoolean(IS_ONE_PASS_LOAD,
                Boolean.parseBoolean(
                        carbonProperty.getProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_SINGLE_PASS,
                                CarbonLoadOptionConstants.CARBON_OPTIONS_SINGLE_PASS_DEFAULT))));
        return model;
    }

    private static void setFileHeader(Configuration configuration, CarbonLoadModel model) throws IOException {
        StructType inputSchema = getInputSchema(configuration);
        if (inputSchema == null || inputSchema.getFields().size() == 0) {
            throw new UnsupportedOperationException("Input schema must be set");
        }
        List<StructField> fields = inputSchema.getFields();
        StringBuilder builder = new StringBuilder();
        String[] columns = new String[fields.size()];
        int i = 0;
        for (StructField field : fields) {
            builder.append(field.getFieldName());
            builder.append(",");
            columns[i++] = field.getFieldName();
        }
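        // The builder ends with a trailing comma; strip it when setting the CSV header.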
        String header = builder.toString();
        model.setCsvHeader(header.substring(0, header.length() - 1));
        model.setCsvHeaderColumns(columns);
    }

    public static class CarbonRecordWriter extends RecordWriter<NullWritable, ObjectArrayWritable> {

        private CarbonOutputIteratorWrapper iteratorWrapper;

        private DataLoadExecutor dataLoadExecutor;

        private CarbonLoadModel loadModel;

        private ExecutorService executorService;

        private Future future;

        private boolean isClosed;

        public CarbonRecordWriter(CarbonOutputIteratorWrapper iteratorWrapper, DataLoadExecutor dataLoadExecutor,
                CarbonLoadModel loadModel, Future future, ExecutorService executorService) {
            this.iteratorWrapper = iteratorWrapper;
            this.dataLoadExecutor = dataLoadExecutor;
            this.loadModel = loadModel;
            this.executorService = executorService;
            this.future = future;
        }

        @Override
        public void write(NullWritable aVoid, ObjectArrayWritable objects) throws InterruptedException {
            if (iteratorWrapper != null) {
                iteratorWrapper.write(objects.get());
            }
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws InterruptedException {
            if (!isClosed) {
                isClosed = true;
                if (iteratorWrapper != null) {
                    iteratorWrapper.closeWriter(false);
                }
                try {
                    future.get();
                } catch (ExecutionException e) {
                    LOG.error("Error while loading data", e);
                    throw new InterruptedException(e.getMessage());
                } finally {
                    executorService.shutdownNow();
                    dataLoadExecutor.close();
                    ThreadLocalSessionInfo.unsetAll();
                    // clean up the folders and files created locally for data load operation
                    TableProcessingOperations.deleteLocalDataLoadFolderLocation(loadModel, false, false);
                }
                LOG.info("Closed writer task " + taskAttemptContext.getTaskAttemptID());
            }
        }

        public CarbonLoadModel getLoadModel() {
            return loadModel;
        }
    }

    /**
     * CarbonMultiRecordWriter takes multiple iterators and load-balances the written rows
     * across them in round robin.
     */
    public static class CarbonMultiRecordWriter extends CarbonRecordWriter {

        private CarbonOutputIteratorWrapper[] iterators;

        // Counts the number of write calls; used to load-balance each write call to one iterator.
        private AtomicLong counter;

        CarbonMultiRecordWriter(CarbonOutputIteratorWrapper[] iterators, DataLoadExecutor dataLoadExecutor,
                CarbonLoadModel loadModel, Future future, ExecutorService executorService) {
            super(null, dataLoadExecutor, loadModel, future, executorService);
            this.iterators = iterators;
            counter = new AtomicLong(0);
        }

        @Override
        public void write(NullWritable aVoid, ObjectArrayWritable objects) throws InterruptedException {
            int iteratorNum = (int) (counter.incrementAndGet() % iterators.length);
            synchronized (iterators[iteratorNum]) {
                iterators[iteratorNum].write(objects.get());
            }
        }

        @Override
        public void close(TaskAttemptContext taskAttemptContext) throws InterruptedException {
            for (int i = 0; i < iterators.length; i++) {
                synchronized (iterators[i]) {
                    iterators[i].closeWriter(false);
                }
            }
            super.close(taskAttemptContext);
        }
    }
}
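
For completeness, here is a task-side sketch of feeding rows to the record writer directly, roughly as the SDK path does. The method writeOneRow and the sample row values are hypothetical; ObjectArrayWritable.set(Object[]) is assumed from how get() is consumed in the listing above, and each row must line up with the input schema registered on the configuration.

import java.io.IOException;

import org.apache.carbondata.hadoop.api.CarbonTableOutputFormat;
import org.apache.carbondata.hadoop.internal.ObjectArrayWritable;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class CarbonWriteTask {

    // Writes a single row and closes the writer; taskAttemptContext comes from the MR framework.
    static void writeOneRow(TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        RecordWriter<NullWritable, ObjectArrayWritable> writer =
                new CarbonTableOutputFormat().getRecordWriter(taskAttemptContext);
        ObjectArrayWritable value = new ObjectArrayWritable();
        // The Object[] must match the input schema set via setInputSchema (hypothetical values).
        value.set(new Object[] { "k1", 42, "2024-01-01" });
        writer.write(NullWritable.get(), value);
        writer.close(taskAttemptContext);
    }
}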