co.cask.cdap.hive.datasets.DatasetSerDe.java Source code

Introduction

Here is the source code for co.cask.cdap.hive.datasets.DatasetSerDe.java
Source

/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.hive.datasets;

import co.cask.cdap.api.data.batch.RecordScannable;
import co.cask.cdap.api.data.batch.RecordWritable;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.data.schema.UnsupportedTypeException;
import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.dataset.DatasetManagementException;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.common.ServiceUnavailableException;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.data.dataset.SystemDatasetInstantiator;
import co.cask.cdap.hive.context.ContextManager;
import co.cask.cdap.hive.serde.ObjectDeserializer;
import co.cask.cdap.hive.serde.ObjectSerializer;
import co.cask.cdap.internal.io.ReflectionSchemaGenerator;
import co.cask.cdap.internal.io.SchemaGenerator;
import co.cask.cdap.proto.Id;
import com.google.common.collect.Lists;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.Properties;

/**
 * SerDe to serialize Dataset Objects. It MUST implement the deprecated SerDe interface instead of extending the
 * abstract SerDe class, otherwise we get ClassNotFound exceptions on cdh4.x.
 *
 * <p>{@link #initialize(Configuration, Properties)} may return after only setting up the object inspector
 * (when the serde properties carry no namespace). In that state {@link #serialize(Object, ObjectInspector)} and
 * {@link #deserialize(Writable)} fail with a {@link SerDeException} rather than a {@link NullPointerException}.
 */
public class DatasetSerDe implements SerDe {
    private static final Logger LOG = LoggerFactory.getLogger(DatasetSerDe.class);
    private static final SchemaGenerator SCHEMA_GENERATOR = new ReflectionSchemaGenerator();

    private ObjectInspector objectInspector;
    private ObjectDeserializer deserializer;
    private ObjectSerializer serializer;
    // Cached across initialize() calls so the dataset is not looked up again once its schema is known.
    private Schema schema;

    /**
     * Prepares this SerDe from the table's serde properties.
     *
     * @param conf Hadoop configuration handed in by Hive; used to obtain the Hive context and a parent classloader
     * @param properties serde properties; the dataset name, dataset namespace and "columns" entries are read
     * @throws SerDeException if the dataset name or the "columns" property is missing, or if the dataset schema
     *                        cannot be determined
     */
    @Override
    public void initialize(Configuration conf, Properties properties) throws SerDeException {
        String datasetName = properties.getProperty(Constants.Explore.DATASET_NAME);
        String namespace = properties.getProperty(Constants.Explore.DATASET_NAMESPACE);

        // no namespace SHOULD be an exception but... Hive calls initialize in several places, one of which is
        // when you try and drop a table.
        // When updating from CDAP 2.6, old tables will not have namespace as a serde property. So in order
        // to avoid a null pointer exception that prevents dropping a table, we handle the null namespace case here.
        if (namespace == null) {
            // we also still need an ObjectInspector as Hive uses it to check what columns the table has.
            this.objectInspector = new ObjectDeserializer(properties, null).getInspector();
            return;
        }

        if (datasetName == null || datasetName.isEmpty()) {
            throw new SerDeException("Dataset name not found in serde properties.");
        }

        // Hive may call initialize a bunch of times... so remember the schema so we don't instantiate the dataset
        // a bunch of times.
        if (schema == null) {
            getDatasetSchema(conf, Id.DatasetInstance.from(namespace, datasetName));
        }

        // The column names are saved because the inspector given to #serialize does not preserve them
        // - maybe because it's an external table.
        // The columns property comes from the Hive metastore, which has it from the create table statement.
        // It is then important that this schema be accurate and in the right order - the same order as
        // object inspectors will reflect them.
        String columns = properties.getProperty("columns");
        if (columns == null) {
            // StringUtils.split(null, ...) returns null, which would otherwise surface as a bare NPE below.
            throw new SerDeException("Columns not found in serde properties.");
        }

        this.deserializer = new ObjectDeserializer(properties, schema);
        ArrayList<String> columnNames = Lists.newArrayList(StringUtils.split(columns, ","));
        this.serializer = new ObjectSerializer(columnNames);
        this.objectInspector = deserializer.getInspector();
    }

    /**
     * Determines the schema of the given dataset and stores it in {@link #schema}.
     *
     * <p>The "schema" property of the dataset specification is used when present; otherwise the dataset is
     * instantiated and the schema is generated from its record type. If no Hive context is available the method
     * logs and returns, leaving {@link #schema} untouched.
     *
     * @param conf Hadoop configuration used to obtain the Hive context; may be null
     * @param datasetId id of the dataset whose schema is wanted
     * @throws SerDeException if the dataset cannot be found, is not explorable, has an unsupported schema, or any
     *                        lookup/instantiation step fails
     */
    private void getDatasetSchema(Configuration conf, Id.DatasetInstance datasetId) throws SerDeException {

        try (ContextManager.Context hiveContext = ContextManager.getContext(conf)) {
            // apparently the conf can be null in some versions of Hive?
            // Because it calls initialize just to get the object inspector
            if (hiveContext == null) {
                LOG.info("Hive provided a null conf, will not be able to get dataset schema.");
                return;
            }

            // some datasets like Table and ObjectMappedTable have schema in the dataset properties
            try {
                DatasetSpecification datasetSpec = hiveContext.getDatasetSpec(datasetId);
                // NOTE(review): whether getDatasetSpec can return null is not visible from this file; the guard
                // makes a missing spec surface as a SerDeException instead of a NullPointerException.
                if (datasetSpec == null) {
                    throw new SerDeException("Could not find dataset " + datasetId);
                }
                String schemaStr = datasetSpec.getProperty("schema");
                if (schemaStr != null) {
                    schema = Schema.parseJson(schemaStr);
                    return;
                }
            } catch (DatasetManagementException | ServiceUnavailableException e) {
                throw new SerDeException("Could not instantiate dataset " + datasetId, e);
            } catch (IOException e) {
                throw new SerDeException("Exception getting schema for dataset " + datasetId, e);
            }

            // other datasets must be instantiated to get their schema
            // conf is null if this is a query that writes to a dataset
            ClassLoader parentClassLoader = conf == null ? null : conf.getClassLoader();
            try (SystemDatasetInstantiator datasetInstantiator = hiveContext
                    .createDatasetInstantiator(parentClassLoader)) {
                Dataset dataset = datasetInstantiator.getDataset(datasetId);
                if (dataset == null) {
                    throw new SerDeException("Could not find dataset " + datasetId);
                }
                Type recordType;
                if (dataset instanceof RecordScannable) {
                    recordType = ((RecordScannable) dataset).getRecordType();
                } else if (dataset instanceof RecordWritable) {
                    recordType = ((RecordWritable) dataset).getRecordType();
                } else {
                    throw new SerDeException("Dataset " + datasetId + " is not explorable.");
                }
                schema = SCHEMA_GENERATOR.generate(recordType);
            } catch (UnsupportedTypeException e) {
                throw new SerDeException("Dataset " + datasetId + " has an unsupported schema.", e);
            } catch (IOException e) {
                throw new SerDeException("Exception while trying to instantiate dataset " + datasetId, e);
            }
        } catch (IOException e) {
            throw new SerDeException("Could not get hive context from configuration.", e);
        }
    }

    /**
     * @return the Writable class reported to Hive for serialized rows ({@link Text})
     */
    @Override
    public Class<? extends Writable> getSerializedClass() {
        return Text.class;
    }

    /**
     * Serializes one row of a query result so it can be written into a dataset.
     *
     * @param o the row object to serialize
     * @param objectInspector inspector describing {@code o}; must be a {@link StructObjectInspector}
     * @return the serialized row
     * @throws SerDeException if the inspector is null or not a struct inspector, or if this SerDe was initialized
     *                        without a namespace and therefore has no serializer
     */
    @Override
    public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
        // NOTE: the object inspector here is not one that we build. It's a default one that Hive built,
        // that contains generic names for columns. The object is a list of objects, each element
        // representing one attribute of the Record type.
        // The object and the objectInspector represent one row of a query result to write into a dataset.
        // Therefore, it is not guaranteed that the object exactly matches the schema of the dataset
        // we want to write into.

        if (!(objectInspector instanceof StructObjectInspector)) {
            // instanceof is false for null, so guard the getClass() call used to build the message.
            String actualType = objectInspector == null ? "null" : objectInspector.getClass().getName();
            throw new SerDeException("Trying to serialize with unknown object inspector type "
                    + actualType + ". Expected StructObjectInspector.");
        }

        // initialize() leaves the serializer unset when the serde properties have no namespace.
        if (serializer == null) {
            throw new SerDeException("Unable to serialize: serde was initialized without a dataset namespace.");
        }

        return serializer.serialize(o, objectInspector);
    }

    /**
     * @return a fresh, empty {@link SerDeStats}; real statistics are not collected yet
     */
    @Override
    public SerDeStats getSerDeStats() {
        // TODO: add real Dataset stats - CDAP-12
        return new SerDeStats();
    }

    /**
     * Deserializes the object wrapped in the given {@link ObjectWritable}.
     *
     * @param writable the wrapped record; must be an {@link ObjectWritable}
     * @return the result of running the wrapped object through the {@link ObjectDeserializer}
     * @throws SerDeException if {@code writable} is null or not an {@link ObjectWritable}, if this SerDe was
     *                        initialized without a namespace and therefore has no deserializer, or if
     *                        deserialization fails for any reason
     */
    @Override
    public Object deserialize(Writable writable) throws SerDeException {
        if (!(writable instanceof ObjectWritable)) {
            String actualType = writable == null ? "null" : writable.getClass().getName();
            throw new SerDeException("Unable to deserialize writable of type " + actualType
                    + ". Expected ObjectWritable.");
        }

        // initialize() leaves the deserializer unset when the serde properties have no namespace.
        if (deserializer == null) {
            throw new SerDeException("Unable to deserialize: serde was initialized without a dataset namespace.");
        }

        Object obj = ((ObjectWritable) writable).get();
        try {
            return deserializer.deserialize(obj);
        } catch (Throwable t) {
            // Every failure, including errors, is logged with the offending object and rethrown as SerDeException.
            LOG.error("Unable to deserialize object {}.", obj, t);
            throw new SerDeException("Unable to deserialize an object.", t);
        }
    }

    /**
     * @return the object inspector set by the last call to {@link #initialize(Configuration, Properties)},
     *         or null if initialize has not been called
     */
    @Override
    public ObjectInspector getObjectInspector() throws SerDeException {
        return objectInspector;
    }

}