Source code

Java tutorial


Here is the source code for co.cask.cdap.hive.datasets.DatasetAccessor.


/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.hive.datasets;

import co.cask.cdap.api.dataset.Dataset;
import co.cask.cdap.api.dataset.DatasetManagementException;
import co.cask.cdap.api.dataset.DatasetSpecification;
import co.cask.cdap.common.DatasetNotFoundException;
import co.cask.cdap.common.conf.ConfigurationUtil;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.hive.context.ContextManager;
import co.cask.cdap.hive.context.TxnCodec;
import co.cask.cdap.proto.Id;
import co.cask.tephra.Transaction;
import co.cask.tephra.TransactionAware;
import org.apache.hadoop.conf.Configuration;


/**
 * Instantiates the dataset used during runtime of a Hive query. This means it's used in mappers and reducers,
 * and must use the Hadoop configuration to look up what dataset to instantiate. Should not be closed until the
 * dataset has been closed. Assumes the dataset name and namespace are settings in the configuration.
 * It may seem like this would not work if multiple datasets are used in a single query, but that is not the case.
 * It is not obvious, but dataset name and namespace are added as job properties in DatasetStorageHandler. This tells
 * Hive to add those properties to the Configuration object before passing it in to the methods of an InputFormat
 * or OutputFormat. So even if multiple datasets are used in the same query (a join query for example), dataset name
 * will not get clobbered.
 */
public class DatasetAccessor implements Closeable {
    // Identifier of the dataset, built in the constructor from the namespace and dataset name found in the config.
    private final Id.DatasetInstance datasetId;
    // Context obtained from ContextManager for the config; used to create the instantiator and to look up specs.
    private final ContextManager.Context context;
    // Transaction read from the config under Constants.Explore.TX_QUERY_KEY and decoded with TxnCodec.
    private final Transaction transaction;
    // Created by the context in the constructor using the config's class loader; produces the dataset instance.
    private final SystemDatasetInstantiator datasetInstantiator;
    // Not set by the constructor; assigned in initialize(), so it is null until initialize() has been called.
    private Dataset dataset;

    /**
     * Builds an accessor from the settings carried in the given Hadoop configuration.
     *
     * <p>The dataset name and namespace are read from {@code Constants.Explore.DATASET_NAME} and
     * {@code Constants.Explore.DATASET_NAMESPACE}, and the transaction is decoded from
     * {@code Constants.Explore.TX_QUERY_KEY}. The dataset itself is not instantiated here; that happens
     * in {@link #initialize()}.
     *
     * @param conf configuration holding the dataset name, the namespace and the encoded transaction
     * @throws IOException declared by the signature; NOTE(review): presumably raised while obtaining the
     *     context or decoding the transaction - confirm against ContextManager / ConfigurationUtil
     * @throws IllegalArgumentException if the dataset name or the namespace is null or empty
     */
    public DatasetAccessor(Configuration conf) throws IOException {
        String datasetName = conf.get(Constants.Explore.DATASET_NAME);
        String namespace = conf.get(Constants.Explore.DATASET_NAMESPACE);
        // Fail fast: without both values there is no way to tell which dataset to instantiate.
        Preconditions.checkArgument(!Strings.isNullOrEmpty(datasetName), "dataset name not present in config");
        Preconditions.checkArgument(!Strings.isNullOrEmpty(namespace), "namespace not present in config");

        this.datasetId = Id.DatasetInstance.from(namespace, datasetName);
        this.context = ContextManager.getContext(conf);
        this.transaction = ConfigurationUtil.get(conf, Constants.Explore.TX_QUERY_KEY, TxnCodec.INSTANCE);
        this.datasetInstantiator = context.createDatasetInstantiator(conf.getClassLoader());
    }

    public void initialize()
            throws IOException, DatasetManagementException, DatasetNotFoundException, ClassNotFoundException {
        dataset = datasetInstantiator.getDataset(datasetId);
        if (dataset instanceof TransactionAware) {
            ((TransactionAware) dataset).startTx(transaction);

    public Id.DatasetInstance getDatasetId() {
        return datasetId;

    public <T extends Dataset> T getDataset() {
        return (T) dataset;

    public DatasetSpecification getDatasetSpec() throws DatasetManagementException {
        return context.getDatasetSpec(datasetId);

    public void close() throws IOException {