/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.processing.loading;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.carbondata.common.CarbonIterator;
import org.apache.carbondata.common.logging.LogServiceFactory;
import org.apache.carbondata.core.constants.CarbonCommonConstants;
import org.apache.carbondata.core.constants.CarbonLoadOptionConstants;
import org.apache.carbondata.core.constants.SortScopeOptions;
import org.apache.carbondata.core.datastore.TableSpec;
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier;
import org.apache.carbondata.core.metadata.datatype.DataTypes;
import org.apache.carbondata.core.metadata.schema.SortColumnRangeInfo;
import org.apache.carbondata.core.metadata.schema.table.CarbonTable;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension;
import org.apache.carbondata.core.metadata.schema.table.column.CarbonMeasure;
import org.apache.carbondata.core.util.CarbonProperties;
import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants;
import org.apache.carbondata.processing.loading.exception.CarbonDataLoadingException;
import org.apache.carbondata.processing.loading.model.CarbonLoadModel;
import org.apache.carbondata.processing.loading.steps.CarbonRowDataWriterProcessorStepImpl;
import org.apache.carbondata.processing.loading.steps.DataConverterProcessorStepImpl;
import org.apache.carbondata.processing.loading.steps.DataWriterBatchProcessorStepImpl;
import org.apache.carbondata.processing.loading.steps.DataWriterProcessorStepImpl;
import org.apache.carbondata.processing.loading.steps.InputProcessorStepImpl;
import org.apache.carbondata.processing.loading.steps.InputProcessorStepWithNoConverterImpl;
import org.apache.carbondata.processing.loading.steps.JsonInputProcessorStepImpl;
import org.apache.carbondata.processing.loading.steps.SortProcessorStepImpl;
import org.apache.carbondata.processing.util.CarbonDataProcessorUtil;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;

/**
 * It builds the pipeline of steps for loading data into CarbonData.
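 *
 * <p>A minimal usage sketch. The way the load model, temp store locations and input iterators
 * are obtained below is an illustrative assumption about the caller, not something this class
 * provides:
 * <pre>
 *   // caller-supplied inputs (illustrative placeholders)
 *   CarbonLoadModel loadModel = ...;          // fully populated load model
 *   String[] storeLocation = ...;             // local temp store locations for this task
 *   CarbonIterator[] inputIterators = ...;    // iterators over the input rows
 *
 *   // build() picks the step chain (no-sort / batch-sort / local-sort / bucketing / no-converter)
 *   // based on the load model and table configuration; the returned step is the tail of the chain
 *   AbstractDataLoadProcessorStep lastStep =
 *       new DataLoadProcessBuilder().build(loadModel, storeLocation, inputIterators);
 * </pre>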
 */
public final class DataLoadProcessBuilder {

  private static final Logger LOGGER =
      LogServiceFactory.getLogService(DataLoadProcessBuilder.class.getName());

  public AbstractDataLoadProcessorStep build(CarbonLoadModel loadModel, String[] storeLocation,
      CarbonIterator[] inputIterators) throws Exception {
    CarbonDataLoadConfiguration configuration = createConfiguration(loadModel, storeLocation);
    SortScopeOptions.SortScope sortScope = CarbonDataProcessorUtil.getSortScope(configuration);
    if (loadModel.isLoadWithoutConverterStep()) {
      return buildInternalWithNoConverter(inputIterators, configuration, sortScope);
    } else if (loadModel.isJsonFileLoad()) {
      return buildInternalWithJsonInputProcessor(inputIterators, configuration, sortScope);
    } else if (!configuration.isSortTable()
        || sortScope.equals(SortScopeOptions.SortScope.NO_SORT)) {
      return buildInternalForNoSort(inputIterators, configuration);
    } else if (configuration.getBucketingInfo() != null) {
      return buildInternalForBucketing(inputIterators, configuration);
    } else if (sortScope.equals(SortScopeOptions.SortScope.BATCH_SORT)) {
      return buildInternalForBatchSort(inputIterators, configuration);
    } else {
      return buildInternal(inputIterators, configuration);
    }
  }

  private AbstractDataLoadProcessorStep buildInternal(CarbonIterator[] inputIterators,
      CarbonDataLoadConfiguration configuration) {
    // 1. Reads the data from the input iterators and parses it.
    AbstractDataLoadProcessorStep inputProcessorStep =
        new InputProcessorStepImpl(configuration, inputIterators);
    // 2. Converts the data (dictionary, non-dictionary or complex objects) depending on
    // data types and configuration.
    AbstractDataLoadProcessorStep converterProcessorStep =
        new DataConverterProcessorStepImpl(configuration, inputProcessorStep);
    // 3. Sorts the data by the sort columns.
    AbstractDataLoadProcessorStep sortProcessorStep =
        new SortProcessorStepImpl(configuration, converterProcessorStep);
    // 4. Writes the sorted data in the CarbonData format.
    return new DataWriterProcessorStepImpl(configuration, sortProcessorStep);
  }

  private AbstractDataLoadProcessorStep buildInternalForNoSort(CarbonIterator[] inputIterators,
      CarbonDataLoadConfiguration configuration) {
    // 1. Reads the data from the input iterators and parses it.
    AbstractDataLoadProcessorStep inputProcessorStep =
        new InputProcessorStepImpl(configuration, inputIterators);
    // 2. Converts the data (dictionary, non-dictionary or complex objects) depending on
    // data types and configuration.
    AbstractDataLoadProcessorStep converterProcessorStep =
        new DataConverterProcessorStepImpl(configuration, inputProcessorStep);
    // 3. Writes the data in the CarbonData format.
    return new CarbonRowDataWriterProcessorStepImpl(configuration, converterProcessorStep);
  }

  /**
   * Builds the pipeline for a load without the conversion step.
   */
  private AbstractDataLoadProcessorStep buildInternalWithNoConverter(
      CarbonIterator[] inputIterators, CarbonDataLoadConfiguration configuration,
      SortScopeOptions.SortScope sortScope) {
    // Wraps the input iterators with a dummy processor.
    AbstractDataLoadProcessorStep inputProcessorStep =
        new InputProcessorStepWithNoConverterImpl(configuration, inputIterators);
    if (sortScope.equals(SortScopeOptions.SortScope.LOCAL_SORT)) {
      AbstractDataLoadProcessorStep sortProcessorStep =
          new SortProcessorStepImpl(configuration, inputProcessorStep);
      // Writes the sorted data in the CarbonData format.
      return new DataWriterProcessorStepImpl(configuration, sortProcessorStep);
    } else if (sortScope.equals(SortScopeOptions.SortScope.BATCH_SORT)) {
      // Sorts the data by the sort columns.
      AbstractDataLoadProcessorStep sortProcessorStep =
          new SortProcessorStepImpl(configuration, inputProcessorStep);
      // Writes the sorted data in the CarbonData format.
      return new DataWriterBatchProcessorStepImpl(configuration, sortProcessorStep);
    } else {
      // All other cases, like global sort and no sort, use this step.
      return new CarbonRowDataWriterProcessorStepImpl(configuration, inputProcessorStep);
    }
  }

  /**
   * Builds the pipeline for a load with the JSON input processor.
   */
  private AbstractDataLoadProcessorStep buildInternalWithJsonInputProcessor(
      CarbonIterator[] inputIterators, CarbonDataLoadConfiguration configuration,
      SortScopeOptions.SortScope sortScope) {
    // Currently only row-by-row conversion of JSON strings to carbon rows is supported.
    AbstractDataLoadProcessorStep inputProcessorStep =
        new JsonInputProcessorStepImpl(configuration, inputIterators);
    // 2. Converts the data (dictionary, non-dictionary or complex objects) depending on
    // data types and configuration.
    AbstractDataLoadProcessorStep converterProcessorStep =
        new DataConverterProcessorStepImpl(configuration, inputProcessorStep);
    if (sortScope.equals(SortScopeOptions.SortScope.LOCAL_SORT)) {
      AbstractDataLoadProcessorStep sortProcessorStep =
          new SortProcessorStepImpl(configuration, converterProcessorStep);
      // Writes the sorted data in the CarbonData format.
      return new DataWriterProcessorStepImpl(configuration, sortProcessorStep);
    } else if (sortScope.equals(SortScopeOptions.SortScope.BATCH_SORT)) {
      // Sorts the data by the sort columns.
      AbstractDataLoadProcessorStep sortProcessorStep =
          new SortProcessorStepImpl(configuration, converterProcessorStep);
      // Writes the sorted data in the CarbonData format.
      return new DataWriterBatchProcessorStepImpl(configuration, sortProcessorStep);
    } else {
      // All other cases, like global sort and no sort, use this step.
      return new CarbonRowDataWriterProcessorStepImpl(configuration, converterProcessorStep);
    }
  }

  private AbstractDataLoadProcessorStep buildInternalForBatchSort(CarbonIterator[] inputIterators,
      CarbonDataLoadConfiguration configuration) {
    // 1. Reads the data from the input iterators and parses it.
    AbstractDataLoadProcessorStep inputProcessorStep =
        new InputProcessorStepImpl(configuration, inputIterators);
    // 2. Converts the data (dictionary, non-dictionary or complex objects) depending on
    // data types and configuration.
    AbstractDataLoadProcessorStep converterProcessorStep =
        new DataConverterProcessorStepImpl(configuration, inputProcessorStep);
    // 3. Sorts the data by the sort columns.
    AbstractDataLoadProcessorStep sortProcessorStep =
        new SortProcessorStepImpl(configuration, converterProcessorStep);
    // 4. Writes the sorted data in the CarbonData format.
    return new DataWriterBatchProcessorStepImpl(configuration, sortProcessorStep);
  }

  private AbstractDataLoadProcessorStep buildInternalForBucketing(CarbonIterator[] inputIterators,
      CarbonDataLoadConfiguration configuration) throws Exception {
    // 1. Reads the data from the input iterators and parses it.
    AbstractDataLoadProcessorStep inputProcessorStep =
        new InputProcessorStepImpl(configuration, inputIterators);
    // 2. Converts the data (dictionary, non-dictionary or complex objects) depending on
    // data types and configuration.
    AbstractDataLoadProcessorStep converterProcessorStep =
        new DataConverterProcessorStepImpl(configuration, inputProcessorStep);
    // 3. Sorts the data by the sort columns.
    AbstractDataLoadProcessorStep sortProcessorStep =
        new SortProcessorStepImpl(configuration, converterProcessorStep);
    // 4. Writes the sorted data in the CarbonData format.
    return new DataWriterProcessorStepImpl(configuration, sortProcessorStep);
  }

  public static CarbonDataLoadConfiguration createConfiguration(CarbonLoadModel loadModel,
      String[] storeLocation) {
    CarbonDataProcessorUtil.createLocations(storeLocation);
    String databaseName = loadModel.getDatabaseName();
    String tableName = loadModel.getTableName();
    String tempLocationKey = CarbonDataProcessorUtil.getTempStoreLocationKey(databaseName,
        tableName, loadModel.getSegmentId(), loadModel.getTaskNo(), false, false);
    CarbonProperties.getInstance().addProperty(tempLocationKey,
        StringUtils.join(storeLocation, File.pathSeparator));
    return createConfiguration(loadModel);
  }

  public static CarbonDataLoadConfiguration createConfiguration(CarbonLoadModel loadModel) {
    CarbonDataLoadConfiguration configuration = new CarbonDataLoadConfiguration();
    CarbonTable carbonTable = loadModel.getCarbonDataLoadSchema().getCarbonTable();
    AbsoluteTableIdentifier identifier = carbonTable.getAbsoluteTableIdentifier();
    configuration.setParentTablePath(loadModel.getParentTablePath());
    configuration.setTableIdentifier(identifier);
    configuration.setCarbonTransactionalTable(loadModel.isCarbonTransactionalTable());
    configuration.setSchemaUpdatedTimeStamp(carbonTable.getTableLastUpdatedTime());
    configuration.setHeader(loadModel.getCsvHeaderColumns());
    configuration.setSegmentId(loadModel.getSegmentId());
    configuration.setTaskNo(loadModel.getTaskNo());
    String[] complexDelimiters = new String[loadModel.getComplexDelimiters().size()];
    loadModel.getComplexDelimiters().toArray(complexDelimiters);
    configuration.setDataLoadProperty(DataLoadProcessorConstants.COMPLEX_DELIMITERS,
        complexDelimiters);
    configuration.setDataLoadProperty(DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT,
        loadModel.getSerializationNullFormat().split(",")[1]);
    configuration.setDataLoadProperty(DataLoadProcessorConstants.FACT_TIME_STAMP,
        loadModel.getFactTimeStamp());
    configuration.setDataLoadProperty(DataLoadProcessorConstants.BAD_RECORDS_LOGGER_ENABLE,
        loadModel.getBadRecordsLoggerEnable().split(",")[1]);
    configuration.setDataLoadProperty(DataLoadProcessorConstants.BAD_RECORDS_LOGGER_ACTION,
        loadModel.getBadRecordsAction().split(",")[1]);
    configuration.setDataLoadProperty(DataLoadProcessorConstants.IS_EMPTY_DATA_BAD_RECORD,
        loadModel.getIsEmptyDataBadRecord().split(",")[1]);
    configuration.setDataLoadProperty(DataLoadProcessorConstants.SKIP_EMPTY_LINE,
        loadModel.getSkipEmptyLine());
    configuration.setDataLoadProperty(DataLoadProcessorConstants.FACT_FILE_PATH,
        loadModel.getFactFilePath());
    configuration.setParentTablePath(loadModel.getParentTablePath());
    configuration.setDataLoadProperty(CarbonCommonConstants.LOAD_SORT_SCOPE,
        loadModel.getSortScope());
    configuration.setDataLoadProperty(CarbonCommonConstants.LOAD_BATCH_SORT_SIZE_INMB,
        loadModel.getBatchSortSizeInMb());
    configuration.setDataLoadProperty(CarbonCommonConstants.LOAD_GLOBAL_SORT_PARTITIONS,
        loadModel.getGlobalSortPartitions());
    configuration.setDataLoadProperty(CarbonLoadOptionConstants.CARBON_OPTIONS_BAD_RECORD_PATH,
        loadModel.getBadRecordsLocation());
    List<CarbonDimension> dimensions =
        carbonTable.getDimensionByTableName(carbonTable.getTableName());
    List<CarbonMeasure> measures =
        carbonTable.getMeasureByTableName(carbonTable.getTableName());
    List<DataField> dataFields = new ArrayList<>();
    List<DataField> complexDataFields = new ArrayList<>();
    // First add dictionary and non-dictionary dimensions because these are part of the MDK key,
    // then add complex data types and measures.
    for (CarbonColumn column : dimensions) {
      DataField dataField = new DataField(column);
      if (column.getDataType() == DataTypes.DATE) {
        dataField.setDateFormat(loadModel.getDateFormat());
        column.setDateFormat(loadModel.getDateFormat());
      } else if (column.getDataType() == DataTypes.TIMESTAMP) {
        dataField.setTimestampFormat(loadModel.getTimestampformat());
        column.setTimestampFormat(loadModel.getTimestampformat());
      }
      if (column.isComplex()) {
        complexDataFields.add(dataField);
        List<CarbonDimension> childDimensions = ((CarbonDimension) dataField.getColumn())
            .getListOfChildDimensions();
        for (CarbonDimension childDimension : childDimensions) {
          if (childDimension.getDataType() == DataTypes.DATE) {
            childDimension.setDateFormat(loadModel.getDateFormat());
          } else if (childDimension.getDataType() == DataTypes.TIMESTAMP) {
            childDimension.setTimestampFormat(loadModel.getTimestampformat());
          }
        }
      } else {
        dataFields.add(dataField);
      }
    }
    dataFields.addAll(complexDataFields);
    for (CarbonColumn column : measures) {
      // The dummy measure is added only when no measure is present; there is no need to load it.
      if (!(column.getColName().equals("default_dummy_measure"))) {
        dataFields.add(new DataField(column));
      }
    }
    configuration.setDataFields(dataFields.toArray(new DataField[dataFields.size()]));
    configuration.setBucketingInfo(carbonTable.getBucketingInfo(carbonTable.getTableName()));
    // Configuration for one-pass load: dictionary server info.
    configuration.setUseOnePass(loadModel.getUseOnePass());
    configuration.setDictionaryServerHost(loadModel.getDictionaryServerHost());
    configuration.setDictionaryServerPort(loadModel.getDictionaryServerPort());
    configuration.setDictionaryServerSecretKey(loadModel.getDictionaryServerSecretKey());
    configuration.setDictionaryEncryptServerSecure(loadModel.getDictionaryEncryptServerSecure());
    configuration.setDictionaryServiceProvider(loadModel.getDictionaryServiceProvider());
    configuration.setPreFetch(loadModel.isPreFetch());
    configuration.setNumberOfSortColumns(carbonTable.getNumberOfSortColumns());
    configuration.setNumberOfNoDictSortColumns(carbonTable.getNumberOfNoDictSortColumns());
    configuration.setDataWritePath(loadModel.getDataWritePath());
    setSortColumnInfo(carbonTable, loadModel, configuration);
    // For partition loading always use a single core, as it already runs in multiple
    // threads per partition.
    if (carbonTable.isHivePartitionTable()) {
      configuration.setWritingCoresCount((short) 1);
    }
    TableSpec tableSpec = new TableSpec(carbonTable);
    configuration.setTableSpec(tableSpec);
    if (loadModel.getSdkWriterCores() > 0) {
      configuration.setWritingCoresCount(loadModel.getSdkWriterCores());
    }
    configuration.setNumberOfLoadingCores(
        CarbonProperties.getInstance().getNumberOfLoadingCores());
    configuration.setColumnCompressor(loadModel.getColumnCompressor());
    return configuration;
  }

  /**
   * Sets the sort column info in the configuration.
   * @param carbonTable carbon table
   * @param loadModel load model
   * @param configuration data load configuration
   */
  private static void setSortColumnInfo(CarbonTable carbonTable, CarbonLoadModel loadModel,
      CarbonDataLoadConfiguration configuration) {
    List<String> sortCols = carbonTable.getSortColumns(carbonTable.getTableName());
    SortScopeOptions.SortScope sortScope = SortScopeOptions.getSortScope(loadModel.getSortScope());
    if (!SortScopeOptions.SortScope.LOCAL_SORT.equals(sortScope) || sortCols.size() == 0
        || StringUtils.isBlank(loadModel.getSortColumnsBoundsStr())) {
      if (!StringUtils.isBlank(loadModel.getSortColumnsBoundsStr())) {
        LOGGER.warn("sort column bounds will be ignored");
      }
      configuration.setSortColumnRangeInfo(null);
      return;
    }
    // column index for sort columns
    int[] sortColIndex = new int[sortCols.size()];
    boolean[] isSortColNoDict = new boolean[sortCols.size()];

    DataField[] outFields = configuration.getDataFields();
    int j = 0;
    boolean columnExist;
    for (String sortCol : sortCols) {
      columnExist = false;
      for (int i = 0; !columnExist && i < outFields.length; i++) {
        if (outFields[i].getColumn().getColName().equalsIgnoreCase(sortCol)) {
          columnExist = true;
          sortColIndex[j] = i;
          isSortColNoDict[j] = !outFields[i].hasDictionaryEncoding();
          j++;
        }
      }
      if (!columnExist) {
        throw new CarbonDataLoadingException("Field " + sortCol + " does not exist.");
      }
    }

    String[] sortColumnBounds = StringUtils.splitPreserveAllTokens(
        loadModel.getSortColumnsBoundsStr(),
        CarbonLoadOptionConstants.SORT_COLUMN_BOUNDS_ROW_DELIMITER, -1);
    for (String bound : sortColumnBounds) {
      String[] fieldInBounds = StringUtils.splitPreserveAllTokens(bound,
          CarbonLoadOptionConstants.SORT_COLUMN_BOUNDS_FIELD_DELIMITER, -1);
      if (fieldInBounds.length != sortCols.size()) {
        String msg = new StringBuilder(
            "The number of field in bounds should be equal to that in sort columns.")
            .append(" Expected ").append(sortCols.size()).append(", actual ")
            .append(String.valueOf(fieldInBounds.length)).append(".")
            .append(" The illegal bound is '").append(bound).append("'.").toString();
        throw new CarbonDataLoadingException(msg);
      }
    }

    SortColumnRangeInfo sortColumnRangeInfo = new SortColumnRangeInfo(sortColIndex,
        isSortColNoDict, sortColumnBounds,
        CarbonLoadOptionConstants.SORT_COLUMN_BOUNDS_FIELD_DELIMITER);
    configuration.setSortColumnRangeInfo(sortColumnRangeInfo);
  }
}
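// A hedged note on setSortColumnInfo (an illustration inferred from this builder, not verified
// against the rest of the load path): a SortColumnRangeInfo is only built when the sort scope is
// LOCAL_SORT, the table has sort columns, and a non-blank sort-column-bounds option is supplied.
// The bounds string is split into bounds by SORT_COLUMN_BOUNDS_ROW_DELIMITER and each bound into
// fields by SORT_COLUMN_BOUNDS_FIELD_DELIMITER; every bound must carry exactly one field per sort
// column, and the resulting ranges appear to be used to partition rows before the local sort.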