org.kitesdk.data.hbase.tool.SchemaTool.java Source code

Introduction

Here is the source code for org.kitesdk.data.hbase.tool.SchemaTool.java, a utility class that creates or migrates HBase Common managed schemas from Avro schema files.
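
Before the listing itself, here is a minimal usage sketch. Only SchemaTool's constructor and createOrMigrateSchemaDirectory come from the file below; DefaultSchemaManager and its HTablePool-based constructor are assumptions drawn from the Kite SDK HBase module, so adjust them to whatever SchemaManager implementation you actually use.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTablePool;
import org.kitesdk.data.hbase.impl.SchemaManager;
import org.kitesdk.data.hbase.manager.DefaultSchemaManager;
import org.kitesdk.data.hbase.tool.SchemaTool;

public class SchemaToolExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // HBaseAdmin talks to the cluster; SchemaTool uses it to create and
        // modify tables.
        HBaseAdmin admin = new HBaseAdmin(conf);
        // Assumption: DefaultSchemaManager(HTablePool) from the Kite SDK HBase
        // module; any SchemaManager implementation can be used here.
        SchemaManager manager = new DefaultSchemaManager(new HTablePool(conf, 10));
        SchemaTool tool = new SchemaTool(admin, manager);
        // Recursively scan the "schemas" directory on the classpath for *.avsc
        // files, create or migrate each managed schema, and (second argument =
        // true) create any missing tables and column families.
        tool.createOrMigrateSchemaDirectory("classpath:schemas", true);
        admin.close();
    }
}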

Source

/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.hbase.tool;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import org.apache.hadoop.hbase.util.Bytes;
import org.kitesdk.data.DatasetException;
import org.kitesdk.data.ValidationException;
import org.kitesdk.data.hbase.avro.AvroEntitySchema;
import org.kitesdk.data.hbase.avro.AvroKeyEntitySchemaParser;
import org.kitesdk.data.hbase.avro.AvroKeySchema;
import org.kitesdk.data.hbase.avro.AvroUtils;
import org.kitesdk.data.hbase.impl.Constants;
import org.kitesdk.data.hbase.impl.KeySchema;
import org.kitesdk.data.hbase.impl.SchemaManager;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Utility class for managing Managed Schemas in HBase Common.
 */
public class SchemaTool {
    // Wait for 600 seconds (10 minutes) for all the tables to be available
    private static final int MAX_SECOND_WAIT_FOR_TABLE_CREATION = 600;

    private static final Logger LOG = LoggerFactory.getLogger(SchemaTool.class);

    private static final String CLASSPATH_PREFIX = "classpath:";

    private static final AvroKeyEntitySchemaParser parser = new AvroKeyEntitySchemaParser();

    private static final ObjectMapper mapper = new ObjectMapper();

    private static final JsonFactory factory = mapper.getJsonFactory();

    private final SchemaManager schemaManager;

    private final HBaseAdmin hbaseAdmin;

    public SchemaTool(HBaseAdmin hbaseAdmin, SchemaManager entityManager) {
        this.hbaseAdmin = hbaseAdmin;
        this.schemaManager = entityManager;
    }

    /**
     * Scans the schemaDirectory for Avro schemas, and creates or migrates the
     * HBase Common managed schemas handled by this instance's schema manager.
     * 
     * @param schemaDirectory
     *          The directory to recursively scan for Avro schema files. This
     *          directory can be a directory on the classpath, including a
     *          directory that is embedded in a jar on the classpath. In both of
     *          those cases, the schemaDirectory should be prefixed with
     *          classpath:
     * @param createTableAndFamilies
     *          If true, will create the table for each schema if it doesn't
     *          exist, and will create families if they don't exist.
     */
    public void createOrMigrateSchemaDirectory(String schemaDirectory, boolean createTableAndFamilies)
            throws InterruptedException {
        List<String> schemaStrings;
        if (schemaDirectory.startsWith(CLASSPATH_PREFIX)) {
            URL dirURL = getClass().getClassLoader()
                    .getResource(schemaDirectory.substring(CLASSPATH_PREFIX.length()));
            if (dirURL != null && dirURL.getProtocol().equals("file")) {
                try {
                    schemaStrings = getSchemaStringsFromDir(new File(dirURL.toURI()));
                } catch (URISyntaxException e) {
                    throw new DatasetException(e);
                }
            } else if (dirURL != null && dirURL.getProtocol().equals("jar")) {
                String jarPath = dirURL.getPath().substring(5, dirURL.getPath().indexOf("!"));
                schemaStrings = getSchemaStringsFromJar(jarPath,
                        schemaDirectory.substring(CLASSPATH_PREFIX.length()));
            } else {
                String msg = "Could not find classpath resource: " + schemaDirectory;
                LOG.error(msg);
                throw new DatasetException(msg);
            }
        } else {
            schemaStrings = getSchemaStringsFromDir(new File(schemaDirectory));
        }

        Map<String, List<String>> tableEntitySchemaMap = new HashMap<String, List<String>>();
        for (String schemaString : schemaStrings) {
            List<String> tables = getTablesFromSchemaString(schemaString);
            for (String table : tables) {
                if (tableEntitySchemaMap.containsKey(table)) {
                    tableEntitySchemaMap.get(table).add(schemaString);
                } else {
                    List<String> entityList = new ArrayList<String>();
                    entityList.add(schemaString);
                    tableEntitySchemaMap.put(table, entityList);
                }
            }

        }

        // Validate that every requested table has at least one entity schema
        for (Entry<String, List<String>> entry : tableEntitySchemaMap.entrySet()) {
            String table = entry.getKey();
            List<String> entitySchemas = entry.getValue();
            if (entitySchemas.size() == 0) {
                String msg = "Table requested, but no entity schemas for Table: " + table;
                LOG.error(msg);
                throw new ValidationException(msg);
            }
        }

        // Migrate the schemas in a batch, collect all the table descriptors
        // that require a schema migration
        Collection<HTableDescriptor> tableDescriptors = Lists.newArrayList();
        for (Entry<String, List<String>> entry : tableEntitySchemaMap.entrySet()) {
            String table = entry.getKey();
            for (String entitySchemaString : entry.getValue()) {
                boolean migrationRequired = prepareManagedSchema(table, entitySchemaString);
                // Optimization: if no migration is required, the table does not change
                if (migrationRequired) {
                    tableDescriptors.add(prepareTableDescriptor(table, entitySchemaString));
                }
            }
        }

        if (createTableAndFamilies) {
            createTables(tableDescriptors);
        }
    }

    /**
     * Creates a new managed schema, or migrates an existing one if one exists for
     * the table name, entity name pair.
     * 
     * @param tableName
     *          The name of the table we'll be creating or migrating a schema for.
     * @param entitySchemaFilePath
     *          The absolute file path to the entity schema file.
     * @param createTableAndFamilies
     *          If true, will create the table for this schema if it doesn't
     *          exist, and will create families if they don't exist.
     */
    public void createOrMigrateSchemaFile(String tableName, String entitySchemaFilePath,
            boolean createTableAndFamilies) throws InterruptedException {
        createOrMigrateSchemaFile(tableName, new File(entitySchemaFilePath), createTableAndFamilies);
    }

    /**
     * Creates a new managed schema, or migrates an existing one if one exists for
     * the table name, entity name pair.
     * 
     * @param tableName
     *          The name of the table we'll be creating or migrating a schema for.
     * @param entitySchemaFile
     *          The entity schema file.
     * @param createTableAndFamilies
     *          If true, will create the table for this schema if it doesn't
     *          exist, and will create families if they don't exist.
     */
    public void createOrMigrateSchemaFile(String tableName, File entitySchemaFile, boolean createTableAndFamilies)
            throws InterruptedException {
        createOrMigrateSchema(tableName, getSchemaStringFromFile(entitySchemaFile), createTableAndFamilies);
    }

    /**
     * Creates a new managed schema, or migrates an existing one if one exists for
     * the table name, entity name pair.
     * 
     * @param tableName
     *          The name of the table we'll be creating or migrating a schema for.
     * @param entitySchemaString
     *          The entity schema
     * @param createTableAndFamilies
     *          If true, will create the table for this schema if it doesn't
     *          exist, and will create families if they don't exist.
     */
    public void createOrMigrateSchema(String tableName, String entitySchemaString, boolean createTableAndFamilies)
            throws InterruptedException {
        boolean migrationRequired = prepareManagedSchema(tableName, entitySchemaString);
        if (migrationRequired && createTableAndFamilies) {
            try {
                HTableDescriptor descriptor = prepareTableDescriptor(tableName, entitySchemaString);
                if (hbaseAdmin.isTableAvailable(tableName)) {
                    modifyTable(tableName, descriptor);
                } else {
                    createTable(descriptor);
                }
            } catch (IOException e) {
                throw new DatasetException(e);
            }
        }
    }

    /**
     * Creates or migrates the managed schema for the given entity schema string.
     * Returns true if the HBase table may need to be created or modified.
     */
    private boolean prepareManagedSchema(String tableName, String entitySchemaString) {
        String entityName = getEntityNameFromSchemaString(entitySchemaString);
        AvroEntitySchema entitySchema = parser.parseEntitySchema(entitySchemaString);
        AvroKeySchema keySchema = parser.parseKeySchema(entitySchemaString);
        // Verify there are no ambiguities with the managed schemas
        if (schemaManager.hasManagedSchema(tableName, entityName)) {
            KeySchema currentKeySchema = schemaManager.getKeySchema(tableName, entityName);
            if (!keySchema.equals(currentKeySchema)) {
                String msg = "Migrating schema with different keys. Current: " + currentKeySchema.getRawSchema()
                        + " New: " + keySchema.getRawSchema();
                LOG.error(msg);
                throw new ValidationException(msg);
            }
            if (!schemaManager.hasSchemaVersion(tableName, entityName, entitySchema)) {
                LOG.info("Migrating Schema: (" + tableName + ", " + entityName + ")");
                schemaManager.migrateSchema(tableName, entityName, entitySchemaString);
            } else {
                LOG.info("Schema hasn't changed, not migrating: (" + tableName + ", " + entityName + ")");
                return false;
            }
        } else {
            LOG.info("Creating Schema: (" + tableName + ", " + entityName + ")");
            parser.parseEntitySchema(entitySchemaString).getColumnMappingDescriptor().getRequiredColumnFamilies();
            schemaManager.createSchema(tableName, entityName, entitySchemaString,
                    "org.kitesdk.data.hbase.avro.AvroKeyEntitySchemaParser",
                    "org.kitesdk.data.hbase.avro.AvroKeySerDe", "org.kitesdk.data.hbase.avro.AvroEntitySerDe");
        }
        return true;
    }

    /**
     * Prepares the table descriptor for the given entity schema.
     */
    private HTableDescriptor prepareTableDescriptor(String tableName, String entitySchemaString) {
        HTableDescriptor descriptor = new HTableDescriptor(Bytes.toBytes(tableName));
        AvroEntitySchema entitySchema = parser.parseEntitySchema(entitySchemaString);
        Set<String> familiesToAdd = entitySchema.getColumnMappingDescriptor().getRequiredColumnFamilies();
        familiesToAdd.add(new String(Constants.SYS_COL_FAMILY));
        familiesToAdd.add(new String(Constants.OBSERVABLE_COL_FAMILY));
        for (String familyToAdd : familiesToAdd) {
            if (!descriptor.hasFamily(familyToAdd.getBytes())) {
                descriptor.addFamily(new HColumnDescriptor(familyToAdd));
            }
        }
        return descriptor;
    }

    /**
     * Creates the tables asynchronously through the HBase admin, then waits for
     * them to come online.
     */
    private void createTables(Collection<HTableDescriptor> tableDescriptors) throws InterruptedException {
        try {
            Set<String> tablesCreated = Sets.newHashSet();
            Multimap<String, HTableDescriptor> pendingTableUpdates = ArrayListMultimap.create();
            for (HTableDescriptor tableDescriptor : tableDescriptors) {
                String tableName = Bytes.toString(tableDescriptor.getName());
                if (tablesCreated.contains(tableName)) {
                    // The table is already being created asynchronously; queue this
                    // descriptor so its column families can be added once the table
                    // is available
                    pendingTableUpdates.put(tableName, tableDescriptor);
                } else {
                    LOG.info("Creating table " + tableName);
                    hbaseAdmin.createTableAsync(tableDescriptor, new byte[][] {});
                    tablesCreated.add(tableName);
                }
            }

            // Wait for the tables to be online
            for (int waitCount = 0; waitCount < MAX_SECOND_WAIT_FOR_TABLE_CREATION; waitCount++) {
                Iterator<String> iterator = tablesCreated.iterator();
                while (iterator.hasNext()) {
                    String table = iterator.next();
                    if (hbaseAdmin.isTableAvailable(table)) {
                        // Perform any updates scheduled on the table
                        if (pendingTableUpdates.containsKey(table)) {
                            for (HTableDescriptor tableDescriptor : pendingTableUpdates.get(table)) {
                                // Add the new columns - synchronous calls
                                modifyTable(table, tableDescriptor);
                            }
                        }
                        iterator.remove();
                    }
                }
                // If all tables are available, then break
                if (tablesCreated.isEmpty()) {
                    break;
                }
                // Sleep for a second before checking again
                Thread.sleep(1000);
            }
        } catch (IOException e) {
            throw new DatasetException(e);
        }
    }

    /**
     * Adds any column families that are not already present to the given table.
     */
    private void modifyTable(String tableName, HTableDescriptor newDescriptor) {
        LOG.info("Modifying table " + tableName);
        HColumnDescriptor[] newFamilies = newDescriptor.getColumnFamilies();
        try {
            List<HColumnDescriptor> columnsToAdd = Lists.newArrayList();
            HTableDescriptor currentFamilies = hbaseAdmin.getTableDescriptor(Bytes.toBytes(tableName));
            for (HColumnDescriptor newFamily : newFamilies) {
                if (!currentFamilies.hasFamily(newFamily.getName())) {
                    columnsToAdd.add(new HColumnDescriptor(newFamily.getName()));
                }
            }
            // Add all the necessary column families
            if (!columnsToAdd.isEmpty()) {
                hbaseAdmin.disableTable(tableName);
                try {
                    for (HColumnDescriptor columnToAdd : columnsToAdd) {
                        hbaseAdmin.addColumn(tableName, columnToAdd);
                    }
                } finally {
                    hbaseAdmin.enableTable(tableName);
                }
            }
        } catch (IOException e) {
            throw new DatasetException(e);
        }
    }

    /**
     * Creates a single table asynchronously.
     */
    private void createTable(HTableDescriptor tableDescriptor) throws InterruptedException {
        createTables(ImmutableList.of(tableDescriptor));
    }

    /**
     * Returns the contents of schemaFile as a string.
     * 
     * @param schemaFile
     *          The file whose contents should be returned.
     * @return The contents of schemaFile
     */
    private String getSchemaStringFromFile(File schemaFile) {
        String schemaString;
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(schemaFile);
            schemaString = AvroUtils.inputStreamToString(fis);
        } catch (IOException e) {
            throw new DatasetException(e);
        } finally {
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                }
            }
        }
        return schemaString;
    }

    private List<String> getTablesFromSchemaString(String schema) {
        JsonNode node;
        try {
            JsonParser jp = factory.createJsonParser(schema);
            node = mapper.readTree(jp);
            if (node.get("tables") == null) {
                return new ArrayList<String>();
            }
            List<String> result = new ArrayList<String>(node.get("tables").size());
            for (Iterator<JsonNode> it = node.get("tables").elements(); it.hasNext();) {
                result.add(it.next().textValue());
            }
            return result;
        } catch (JsonParseException e) {
            throw new ValidationException(e);
        } catch (IOException e) {
            throw new ValidationException(e);
        }
    }

    private String getEntityNameFromSchemaString(String schema) {
        JsonNode node;
        try {
            JsonParser jp = factory.createJsonParser(schema);
            node = mapper.readTree(jp);
            if (node.get("name") == null) {
                return null;
            }
            return node.get("name").textValue();
        } catch (JsonParseException e) {
            throw new ValidationException(e);
        } catch (IOException e) {
            throw new ValidationException(e);
        }
    }

    /**
     * Gets the list of HBase Common Avro schema strings from dir. It recursively
     * searches dir to find files that end in .avsc to locate those strings.
     * 
     * @param dir
     *          The dir to recursively search for schema strings
     * @return The list of schema strings
     */
    private List<String> getSchemaStringsFromDir(File dir) {
        List<String> schemaStrings = new ArrayList<String>();
        Collection<File> schemaFiles = FileUtils.listFiles(dir, new SuffixFileFilter(".avsc"),
                TrueFileFilter.INSTANCE);
        for (File schemaFile : schemaFiles) {
            schemaStrings.add(getSchemaStringFromFile(schemaFile));
        }
        return schemaStrings;
    }

    /**
     * Gets the list of HBase Common Avro schema strings from a directory in the
     * jar. It recursively searches the directory in the jar to find files that
     * end in .avsc to locate those strings.
     * 
     * @param jarPath
     *          The path to the jar to search
     * @param directoryPath
     *          The directory in the jar to find avro schema strings
     * @return The list of schema strings.
     */
    private List<String> getSchemaStringsFromJar(String jarPath, String directoryPath) {
        LOG.info("Getting schema strings in: " + directoryPath + ", from jar: " + jarPath);
        JarFile jar;
        try {
            jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new DatasetException(e);
        } catch (IOException e) {
            throw new DatasetException(e);
        }
        Enumeration<JarEntry> entries = jar.entries();
        List<String> schemaStrings = new ArrayList<String>();
        while (entries.hasMoreElements()) {
            JarEntry jarEntry = entries.nextElement();
            if (jarEntry.getName().startsWith(directoryPath) && jarEntry.getName().endsWith(".avsc")) {
                LOG.info("Found schema: " + jarEntry.getName());
                InputStream inputStream;
                try {
                    inputStream = jar.getInputStream(jarEntry);
                } catch (IOException e) {
                    throw new DatasetException(e);
                }
                String schemaString = AvroUtils.inputStreamToString(inputStream);
                schemaStrings.add(schemaString);
            }
        }
        return schemaStrings;
    }
}
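
As the parsing helpers above show, each schema string must be an Avro record whose top-level "tables" array names the HBase tables it maps to and whose "name" becomes the entity name. Continuing the usage sketch from the introduction, the snippet below passes such a schema directly to createOrMigrateSchema. The per-field "mapping" attributes follow the Kite SDK HBase column-mapping convention as an illustrative assumption; their exact format is defined by AvroKeyEntitySchemaParser, not by this file.

String userSchema =
    "{\n" +
    "  \"type\": \"record\",\n" +
    "  \"name\": \"User\",\n" +
    "  \"tables\": [\"users\"],\n" +
    "  \"fields\": [\n" +
    "    {\"name\": \"username\", \"type\": \"string\",\n" +
    "     \"mapping\": {\"type\": \"key\", \"value\": \"0\"}},\n" +
    "    {\"name\": \"email\", \"type\": \"string\",\n" +
    "     \"mapping\": {\"type\": \"column\", \"value\": \"meta:email\"}}\n" +
    "  ]\n" +
    "}";
// Creates the "users" table (and its column families, including the system
// families added by prepareTableDescriptor) if missing, or migrates the
// existing managed schema if it has changed.
tool.createOrMigrateSchema("users", userSchema, true);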