Java tutorial
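The class below is taken from a Hive storage handler for Windows Azure Table storage (package com.redgate.hadoop.hive.azuretables). It implements Hadoop's old-API RecordReader so that each InputSplit, which covers a single Azure Table partition key, can be scanned as a stream of Hive rows: the entity's row key becomes the record key, and its properties are exposed as a MapWritable keyed by lower-cased column names.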
/*******************************************************************************
 * Copyright 2013 Simon Elliston Ball
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.redgate.hadoop.hive.azuretables;

import java.io.IOException;
import java.net.URISyntaxException;
import java.security.InvalidKeyException;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.UUID;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.RecordReader;

import com.microsoft.windowsazure.services.core.storage.CloudStorageAccount;
import com.microsoft.windowsazure.services.table.client.CloudTableClient;
import com.microsoft.windowsazure.services.table.client.DynamicTableEntity;
import com.microsoft.windowsazure.services.table.client.EntityProperty;
import com.microsoft.windowsazure.services.table.client.TableConstants;
import com.microsoft.windowsazure.services.table.client.TableQuery;
import com.microsoft.windowsazure.services.table.client.TableQuery.QueryComparisons;

/**
 * A RecordReader that reads Azure Table entities as records in a Hive table.
 *
 * @author Simon Elliston Ball <simon@simonellistonball.com>
 */
public class AzureTablesRecordReader implements RecordReader<Text, MapWritable> {
    public static final Log LOG = LogFactory.getLog(AzureTablesRecordReader.class);

    // The string form of a serialized null; property values matching this
    // sentinel are mapped back to NullWritable in next()
    private static final String SERIALIZED_NULL = NullWritable.get().toString();

    private long pos = 0;
    private Iterator<DynamicTableEntity> results;

    /**
     * Create a new Azure Table reader.
     *
     * @param storageConnectionString
     *            An Azure Table connection string, usually built by the
     *            InputFormat
     * @param table
     *            The name of the Azure table, specified in the Hive table
     *            definition
     * @param split
     *            The split (a single Azure Table partition key) this reader
     *            is responsible for
     */
    public AzureTablesRecordReader(String storageConnectionString, String table, InputSplit split) {
        AzureTablesSplit partitionSplit = (AzureTablesSplit) split;
        CloudStorageAccount storageAccount;
        try {
            storageAccount = CloudStorageAccount.parse(storageConnectionString);
            CloudTableClient tableClient = storageAccount.createCloudTableClient();
            LOG.info(String.format("Connecting to Windows Azure Account: %s", storageAccount));
            // Each split covers exactly one partition, so filter on its
            // partition key and scan every entity within it
            String partitionFilter = TableQuery.generateFilterCondition(TableConstants.PARTITION_KEY,
                    QueryComparisons.EQUAL, partitionSplit.getPartitionKey());
            TableQuery<DynamicTableEntity> partitionQuery = TableQuery.from(table, DynamicTableEntity.class)
                    .where(partitionFilter);
            results = tableClient.execute(partitionQuery).iterator();
        } catch (InvalidKeyException e) {
            // Rethrow rather than swallow: leaving results null here would
            // cause a NullPointerException on the first call to next()
            LOG.error(e.getMessage());
            throw new IllegalStateException("Invalid Azure storage credentials", e);
        } catch (URISyntaxException e) {
            LOG.error(e.getMessage());
            throw new IllegalStateException("Malformed Azure storage URI", e);
        }
    }

    /**
     * There is nothing to close, since the MS SDK handles that, but it is a
     * good idea to release the result set.
     */
    public void close() throws IOException {
        results = null;
    }

    /**
     * Grabs the next result and processes the DynamicTableEntity into a Hive
     * friendly MapWritable.
     *
     * @param key
     *            The row key for the entity. Note that this is not really an
     *            Azure key, since the partition is implicit in the key
     * @param value
     *            A MapWritable which will be populated with values from the
     *            DynamicTableEntity returned by the Azure query
     */
    public boolean next(Text key, MapWritable value) throws IOException {
        if (!results.hasNext())
            return false;
        DynamicTableEntity entity = results.next();
        key.set(entity.getRowKey());
        for (Entry<String, EntityProperty> entry : entity.getProperties().entrySet()) {
            final EntityProperty property = entry.getValue();
            // Note that Azure table entity keys are forced to lower case for
            // matching with Hive column names
            final String propertyKey = entry.getKey().toLowerCase();
            final String propertyValue = property.getValueAsString();
            final Writable writableValue = SERIALIZED_NULL.equals(propertyValue) ? NullWritable.get()
                    : new Text(propertyValue);
            value.put(new Text(propertyKey), writableValue);
        }
        pos++;
        return true;
    }

    /**
     * Generates a GUID that can be used as a key and converts it to a Text
     * object.
     */
    public Text createKey() {
        UUID uid = UUID.randomUUID();
        return new Text(uid.toString());
    }

    /**
     * Creates an empty map to be used as the value object.
     */
    public MapWritable createValue() {
        return new MapWritable();
    }

    /**
     * Returns the number of records read so far (zero indexed).
     */
    public long getPos() throws IOException {
        return this.pos;
    }

    /**
     * Progress is hard to calculate for this reader since we do not know the
     * record count ahead of time, and don't want to run an expensive count
     * query to find it, so this just returns 0.
     */
    public float getProgress() throws IOException {
        return 0;
    }
}
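To see how the class fits together, here is a minimal sketch of the standard old-API (org.apache.hadoop.mapred) call sequence a caller performs against the reader. The reader instance would normally be handed out by the package's InputFormat via getRecordReader; the example class, the dumpPartition helper, and the printed output are illustrative assumptions, not part of the project.

package com.redgate.hadoop.hive.azuretables;

import java.io.IOException;

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;

/**
 * Hypothetical driver for AzureTablesRecordReader: drains one partition
 * and prints each row. For illustration only.
 */
public class AzureTablesReaderExample {
    public static long dumpPartition(AzureTablesRecordReader reader) throws IOException {
        // createKey()/createValue() allocate the holders once; next()
        // then overwrites them in place on every call
        Text key = reader.createKey();
        MapWritable value = reader.createValue();
        long rows = 0;
        try {
            // next() returns false once the partition's entities are exhausted
            while (reader.next(key, value)) {
                System.out.printf("row %s: %d columns%n", key, value.size());
                rows++;
            }
        } finally {
            // releases the Azure result iterator
            reader.close();
        }
        return rows;
    }
}

Because next() reuses the key and value objects rather than allocating new ones per record, a caller that needs to retain a row must copy it before advancing; this is the usual contract for Hadoop's old RecordReader API, and the reason the loop above creates the holders once up front.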