org.apache.hadoop.hive.ql.parse.LoadSemanticAnalyzer.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.parse.LoadSemanticAnalyzer, the semantic analyzer that Apache Hive uses to compile LOAD DATA statements into the move task (plus any stats-gathering and index-update tasks) that carry out the load.
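
For reference, the statement form this class analyzes is Hive's LOAD DATA command:

    LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename
    [PARTITION (partcol1=val1, partcol2=val2, ...)]

The optional LOCAL and OVERWRITE keywords are what analyzeInternal() below detects by inspecting the child count of the parsed AST.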

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.parse;

import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.antlr.runtime.tree.Tree;
import org.apache.commons.httpclient.util.URIUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.conf.HiveConf.StrictChecks;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.plan.LoadTableDesc;
import org.apache.hadoop.hive.ql.plan.MoveWork;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.orc.impl.OrcAcidUtils;

import com.google.common.collect.Lists;

/**
 * LoadSemanticAnalyzer: semantic analysis for LOAD DATA statements.
 */
public class LoadSemanticAnalyzer extends BaseSemanticAnalyzer {

    public LoadSemanticAnalyzer(QueryState queryState) throws SemanticException {
        super(queryState);
    }

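    /**
     * Expands the given path glob against the file system, skipping hidden
     * entries (names starting with "_" or "."), except for the export
     * metadata file. If the glob resolves to a single directory, that
     * directory's non-hidden children are returned instead.
     */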
    public static FileStatus[] matchFilesOrDir(FileSystem fs, Path path) throws IOException {
        FileStatus[] srcs = fs.globStatus(path, new PathFilter() {
            @Override
            public boolean accept(Path p) {
                String name = p.getName();
                return name.equals(EximUtil.METADATA_NAME)
                        || (!name.startsWith("_") && !name.startsWith("."));
            }
        });
        // if the glob resolved to a single directory, list its non-hidden contents
        if (srcs != null && srcs.length == 1 && srcs[0].isDir()) {
            srcs = fs.listStatus(srcs[0].getPath(), new PathFilter() {
                @Override
                public boolean accept(Path p) {
                    String name = p.getName();
                    return !name.startsWith("_") && !name.startsWith(".");
                }
            });
        }
        return srcs;
    }

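    /**
     * Normalizes the user-supplied load path into a fully qualified URI:
     * a relative path is resolved against the current working directory
     * (LOCAL) or the user's HDFS home directory, and a missing scheme or
     * authority is filled in from the default file system.
     */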
    private URI initializeFromURI(String fromPath, boolean isLocal) throws IOException, URISyntaxException {
        URI fromURI = new Path(fromPath).toUri();

        String fromScheme = fromURI.getScheme();
        String fromAuthority = fromURI.getAuthority();
        String path = fromURI.getPath();

        // generate absolute path relative to current directory or hdfs home
        // directory
        if (!path.startsWith("/")) {
            if (isLocal) {
                path = URIUtil.decode(new Path(System.getProperty("user.dir"), fromPath).toUri().toString());
            } else {
                path = new Path(new Path("/user/" + System.getProperty("user.name")), path).toString();
            }
        }

        // set correct scheme and authority
        if (StringUtils.isEmpty(fromScheme)) {
            if (isLocal) {
                // file for local
                fromScheme = "file";
            } else {
                // use default values from fs.default.name
                URI defaultURI = FileSystem.get(conf).getUri();
                fromScheme = defaultURI.getScheme();
                fromAuthority = defaultURI.getAuthority();
            }
        }

        // if scheme is specified but not authority then use the default authority
        if ((!fromScheme.equals("file")) && StringUtils.isEmpty(fromAuthority)) {
            URI defaultURI = FileSystem.get(conf).getUri();
            fromAuthority = defaultURI.getAuthority();
        }

        LOG.debug(fromScheme + "@" + fromAuthority + "@" + path);
        return new URI(fromScheme, fromAuthority, path, null, null);
    }

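    /**
     * Validates the source location (the scheme must be "file" when LOCAL
     * is specified) and returns the matched files, rejecting both empty
     * matches and matches that contain directories.
     */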
    private List<FileStatus> applyConstraintsAndGetFiles(URI fromURI, Tree ast, boolean isLocal)
            throws SemanticException {

        FileStatus[] srcs = null;

        // local mode implies that scheme should be "file"
        // we can change this going forward
        if (isLocal && !fromURI.getScheme().equals("file")) {
            throw new SemanticException(ErrorMsg.ILLEGAL_PATH.getMsg(ast,
                    "Source file system should be \"file\" if \"local\" is specified"));
        }

        try {
            srcs = matchFilesOrDir(FileSystem.get(fromURI, conf), new Path(fromURI));
            if (srcs == null || srcs.length == 0) {
                throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast, "No files matching path " + fromURI));
            }

            for (FileStatus oneSrc : srcs) {
                if (oneSrc.isDir()) {
                    throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast,
                            "source contains directory: " + oneSrc.getPath().toString()));
                }
            }
        } catch (IOException e) {
            // Has to use full name to make sure it does not conflict with
            // org.apache.commons.lang.StringUtils
            throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(ast), e);
        }

        return Lists.newArrayList(srcs);
    }

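    /**
     * Analyzes a LOAD DATA statement: resolves and validates the source
     * path, checks that the destination table or partition can accept a
     * load, and generates the move task plus any stats-gathering and
     * index-update tasks.
     */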
    @Override
    public void analyzeInternal(ASTNode ast) throws SemanticException {
        boolean isLocal = false;
        boolean isOverWrite = false;
        Tree fromTree = ast.getChild(0);
        Tree tableTree = ast.getChild(1);

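        // LOAD DATA takes two optional keywords, LOCAL and OVERWRITE; the
        // number of AST children tells us which of them are present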
        if (ast.getChildCount() == 4) {
            isLocal = true;
            isOverWrite = true;
        }

        if (ast.getChildCount() == 3) {
            if (ast.getChild(2).getText().equalsIgnoreCase("local")) {
                isLocal = true;
            } else {
                isOverWrite = true;
            }
        }

        // initialize load path
        URI fromURI;
        try {
            String fromPath = stripQuotes(fromTree.getText());
            fromURI = initializeFromURI(fromPath, isLocal);
        } catch (IOException e) {
            throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
        } catch (URISyntaxException e) {
            throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(fromTree, e.getMessage()), e);
        }

        // initialize destination table/partition
        TableSpec ts = new TableSpec(db, conf, (ASTNode) tableTree);

        if (ts.tableHandle.isView() || ts.tableHandle.isMaterializedView()) {
            throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
        }
        if (ts.tableHandle.isNonNative()) {
            throw new SemanticException(ErrorMsg.LOAD_INTO_NON_NATIVE.getMsg());
        }

        if (ts.tableHandle.isStoredAsSubDirectories()) {
            throw new SemanticException(ErrorMsg.LOAD_INTO_STORED_AS_DIR.getMsg());
        }

        List<FieldSchema> parts = ts.tableHandle.getPartitionKeys();
        if ((parts != null && parts.size() > 0) && (ts.partSpec == null || ts.partSpec.size() == 0)) {
            throw new SemanticException(ErrorMsg.NEED_PARTITION_ERROR.getMsg());
        }
        List<String> bucketCols = ts.tableHandle.getBucketCols();
        if (bucketCols != null && !bucketCols.isEmpty()) {
            String error = StrictChecks.checkBucketing(conf);
            if (error != null) {
                throw new SemanticException("Please load into an intermediate table"
                        + " and use 'insert... select' to allow Hive to enforce bucketing. " + error);
            }
        }

        if (AcidUtils.isAcidTable(ts.tableHandle)) {
            throw new SemanticException(ErrorMsg.LOAD_DATA_ON_ACID_TABLE, ts.tableHandle.getCompleteName());
        }
        // make sure the arguments make sense
        List<FileStatus> files = applyConstraintsAndGetFiles(fromURI, fromTree, isLocal);

        // for managed tables, make sure the file formats match
        if (TableType.MANAGED_TABLE.equals(ts.tableHandle.getTableType())
                && conf.getBoolVar(HiveConf.ConfVars.HIVECHECKFILEFORMAT)) {
            ensureFileFormatsMatch(ts, files, fromURI);
        }
        inputs.add(toReadEntity(new Path(fromURI)));
        Task<? extends Serializable> rTask = null;

        // create final load/move work

        boolean preservePartitionSpecs = false;

        Map<String, String> partSpec = ts.getPartSpec();
        if (partSpec == null) {
            partSpec = new LinkedHashMap<String, String>();
            outputs.add(new WriteEntity(ts.tableHandle,
                    (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
        } else {
            try {
                Partition part = Hive.get().getPartition(ts.tableHandle, partSpec, false);
                if (part != null) {
                    if (isOverWrite) {
                        outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT_OVERWRITE));
                    } else {
                        outputs.add(new WriteEntity(part, WriteEntity.WriteType.INSERT));
                        // If partition already exists and we aren't overwriting it, then respect
                        // its current location info rather than picking it from the parent TableDesc
                        preservePartitionSpecs = true;
                    }
                } else {
                    outputs.add(new WriteEntity(ts.tableHandle,
                            (isOverWrite ? WriteEntity.WriteType.INSERT_OVERWRITE : WriteEntity.WriteType.INSERT)));
                }
            } catch (HiveException e) {
                throw new SemanticException(e);
            }
        }

        LoadTableDesc loadTableWork = new LoadTableDesc(new Path(fromURI),
                Utilities.getTableDesc(ts.tableHandle), partSpec, isOverWrite);
        if (preservePartitionSpecs) {
            // Note: preservePartitionSpecs=true implies inheritTableSpecs=false, but
            // preservePartitionSpecs=false (the default) is not sufficient information
            // to set inheritTableSpecs=true
            loadTableWork.setInheritTableSpecs(false);
        }

        Task<? extends Serializable> childTask = TaskFactory
                .get(new MoveWork(getInputs(), getOutputs(), loadTableWork, null, true, isLocal), conf);
        if (rTask != null) {
            rTask.addDependentTask(childTask);
        } else {
            rTask = childTask;
        }

        rootTasks.add(rTask);

        // The user asked for stats to be collected.
        // Some stats like number of rows require a scan of the data
        // However, some other stats, like number of files, do not require a complete scan
        // Update the stats which do not require a complete scan.
        Task<? extends Serializable> statTask = null;
        if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
            StatsWork statDesc = new StatsWork(loadTableWork);
            statDesc.setNoStatsAggregator(true);
            statDesc.setClearAggregatorStats(true);
            statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            statTask = TaskFactory.get(statDesc, conf);
        }

        // HIVE-3334 has been filed for load file with index auto update
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVEINDEXAUTOUPDATE)) {
            IndexUpdater indexUpdater = new IndexUpdater(loadTableWork, getInputs(), conf);
            try {
                List<Task<? extends Serializable>> indexUpdateTasks = indexUpdater.generateUpdateTasks();

                for (Task<? extends Serializable> updateTask : indexUpdateTasks) {
                    // LOAD DATA will either have a copy & move or just a move;
                    // we always want the update to be dependent on the move
                    childTask.addDependentTask(updateTask);
                    if (statTask != null) {
                        updateTask.addDependentTask(statTask);
                    }
                }
            } catch (HiveException e) {
                console.printInfo("WARNING: could not auto-update stale indexes, indexes are not out of sync");
            }
        } else if (statTask != null) {
            childTask.addDependentTask(statTask);
        }
    }

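    /**
     * Verifies that the files being loaded are readable by the input format
     * of the destination table or partition, throwing a SemanticException
     * if they are not.
     */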
    private void ensureFileFormatsMatch(TableSpec ts, List<FileStatus> fileStatuses, final URI fromURI)
            throws SemanticException {
        final Class<? extends InputFormat> destInputFormat;
        try {
            if (ts.getPartSpec() == null || ts.getPartSpec().isEmpty()) {
                destInputFormat = ts.tableHandle.getInputFormatClass();
            } else {
                destInputFormat = ts.partHandle.getInputFormatClass();
            }
        } catch (HiveException e) {
            throw new SemanticException(e);
        }

        try {
            FileSystem fs = FileSystem.get(fromURI, conf);
            boolean validFormat = HiveFileFormatUtils.checkInputFormat(fs, conf, destInputFormat, fileStatuses);
            if (!validFormat) {
                throw new SemanticException(ErrorMsg.INVALID_FILE_FORMAT_IN_LOAD.getMsg());
            }
        } catch (Exception e) {
            throw new SemanticException("Unable to load data to destination table." + " Error: " + e.getMessage());
        }
    }
}
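
As a usage illustration, here is a minimal sketch showing how the public matchFilesOrDir() helper expands a glob while skipping hidden files and directories. The MatchFilesExample class name and staging path are illustrative placeholders, not part of Hive:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.parse.LoadSemanticAnalyzer;

public class MatchFilesExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical staging directory; adjust to your environment.
        Path glob = new Path("/tmp/staging/*");
        FileSystem fs = glob.getFileSystem(conf);
        // Expands the glob, skipping names starting with "_" or ".";
        // a single matched directory is replaced by its children.
        FileStatus[] matches = LoadSemanticAnalyzer.matchFilesOrDir(fs, glob);
        if (matches != null) {
            for (FileStatus status : matches) {
                System.out.println(status.getPath());
            }
        }
    }
}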