org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator.java Source code

Introduction

Here is the source code for org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator.java. This operator supports dynamic partition pruning for Hive on Spark: it collects partition-column values from its upstream operators and writes them to HDFS, where the Spark driver later reads them to prune partitions on the big-table side of a join.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.parse.spark;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.Collection;
import java.util.concurrent.Future;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * This operator gets partition info from its upstream operators and writes it
 * to HDFS. The data is later read at the driver and used to prune the
 * partitions on the big-table side of the join.
 */
public class SparkPartitionPruningSinkOperator extends Operator<SparkPartitionPruningSinkDesc> {

    @SuppressWarnings("deprecation")
    protected transient Serializer serializer;
    protected transient DataOutputBuffer buffer;
    protected static final Log LOG = LogFactory.getLog(SparkPartitionPruningSinkOperator.class);

    @SuppressWarnings("deprecation")
    public Collection<Future<?>> initializeOp(Configuration hconf) throws HiveException {
        Collection<Future<?>> result = super.initializeOp(hconf);
        serializer = (Serializer) ReflectionUtils.newInstance(conf.getTable().getDeserializerClass(), null);
        buffer = new DataOutputBuffer();
        return result;
    }

    @Override
    public void process(Object row, int tag) throws HiveException {
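        // Serialize each incoming row and append its bytes to the in-memory
        // buffer; everything is flushed to a single HDFS file in closeOp().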
        ObjectInspector rowInspector = inputObjInspectors[0];
        try {
            Writable writableRow = serializer.serialize(row, rowInspector);
            writableRow.write(buffer);
        } catch (Exception e) {
            throw new HiveException(e);
        }
    }

    @Override
    public void closeOp(boolean abort) throws HiveException {
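        // Only flush the accumulated rows on a successful run; on abort the
        // buffered data is simply discarded.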
        if (!abort) {
            try {
                flushToFile();
            } catch (Exception e) {
                throw new HiveException(e);
            }
        }
    }

    private void flushToFile() throws IOException {
        // write an intermediate file to the specified path
        // the format of the path is: tmpPath/targetWorkId/sourceWorkId/randInt
        Path path = conf.getPath();
        FileSystem fs = path.getFileSystem(this.getConfiguration());
        fs.mkdirs(path);

        // choose a random, non-existing file name directly under the directory;
        // on a name collision, retry at the same level instead of nesting deeper
        Path dir = path;
        do {
            path = new Path(dir, String.valueOf(Utilities.randGen.nextInt()));
        } while (fs.exists(path));

        short numOfRepl = fs.getDefaultReplication(path);

        ObjectOutputStream out = null;
        FSDataOutputStream fsout = null;

        try {
            fsout = fs.create(path, numOfRepl);
            out = new ObjectOutputStream(new BufferedOutputStream(fsout, 4096));
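            // file layout: target column name (writeUTF) followed by the raw
            // serialized row bytes accumulated in 'buffer'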
            out.writeUTF(conf.getTargetColumnName());
            buffer.writeTo(out);
        } catch (Exception e) {
            try {
                fs.delete(path, false);
            } catch (Exception ex) {
                LOG.warn("Exception happened while trying to clean partial file.");
            }
            throw e;
        } finally {
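            // closing 'out' also closes the wrapped streams, including fsout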
            if (out != null) {
                LOG.info("Flushed to file: " + path);
                out.close();
            } else if (fsout != null) {
                fsout.close();
            }
        }
    }

    @Override
    public OperatorType getType() {
        return OperatorType.SPARKPRUNINGSINK;
    }

    @Override
    public String getName() {
        return getOperatorName();
    }

    public static String getOperatorName() {
        return "SPARKPRUNINGSINK";
    }

}
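
For context, each file that flushToFile() produces starts with the target column name (written with writeUTF) followed by the raw bytes of the serialized rows. Below is a minimal, hypothetical sketch of how such a file could be read back on the driver side. The real driver-side logic lives in Hive's SparkDynamicPartitionPruner; the class and method names here (PruningFileReaderSketch, readAll) are illustrative only.

import java.io.IOException;
import java.io.ObjectInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical reader sketch for the files written by flushToFile() above.
public class PruningFileReaderSketch {

    public static void readAll(Path sourceDir, Configuration conf) throws IOException {
        FileSystem fs = sourceDir.getFileSystem(conf);
        // one file per sink task, each named with a random int
        for (FileStatus status : fs.listStatus(sourceDir)) {
            try (ObjectInputStream in = new ObjectInputStream(fs.open(status.getPath()))) {
                // mirrors out.writeUTF(conf.getTargetColumnName()) in flushToFile()
                String targetColumn = in.readUTF();
                // The remaining bytes are the rows serialized by the table's SerDe;
                // a real reader would deserialize them and collect the distinct
                // values used to prune partitions of the big table.
                System.out.println("target column: " + targetColumn);
            }
        }
    }
}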