com.cloudera.cdk.data.hbase.tool.SchemaTool.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.cdk.data.hbase.tool.SchemaTool.java

Source

/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.cdk.data.hbase.tool;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.SuffixFileFilter;
import org.apache.commons.io.filefilter.TrueFileFilter;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cloudera.cdk.data.dao.Constants;
import com.cloudera.cdk.data.dao.HBaseCommonException;
import com.cloudera.cdk.data.dao.KeySchema;
import com.cloudera.cdk.data.dao.SchemaManager;
import com.cloudera.cdk.data.dao.SchemaValidationException;
import com.cloudera.cdk.data.hbase.avro.impl.AvroEntitySchema;
import com.cloudera.cdk.data.hbase.avro.impl.AvroKeyEntitySchemaParser;
import com.cloudera.cdk.data.hbase.avro.impl.AvroKeySchema;
import com.cloudera.cdk.data.hbase.avro.impl.AvroUtils;

/**
 * Utility class for managing Managed Schemas in HBase Common.
 */
public class SchemaTool {

    private static final Logger LOG = LoggerFactory.getLogger(SchemaTool.class);

    private static final String CLASSPATH_PREFIX = "classpath:";

    private static final AvroKeyEntitySchemaParser parser = new AvroKeyEntitySchemaParser();

    private static final ObjectMapper mapper = new ObjectMapper();

    private static final JsonFactory factory = mapper.getJsonFactory();

    private final SchemaManager schemaManager;

    private final HBaseAdmin hbaseAdmin;

    public SchemaTool(HBaseAdmin hbaseAdmin, SchemaManager entityManager) {
        this.hbaseAdmin = hbaseAdmin;
        this.schemaManager = entityManager;
    }

    /**
     * Scans the schemaDirectory for avro schemas, and creates or migrates HBase
     * Common managed schemas managed by this instances entity manager.
     * 
     * @param schemaDirectory
     *          The directory to recursively scan for avro schema files. This
     *          directory can be a directory on the classpath, including a
     *          directory that is embeddded in a jar on the classpath. In both of
     *          those cases, the schemaDirectory should be prefixed with
     *          classpath:
     * @param createTableAndFamilies
     *          If true, will create the table for each schema if it doesn't
     *          exist, and will create families if they don't exist.
     */
    public void createOrMigrateSchemaDirectory(String schemaDirectory, boolean createTableAndFamilies) {
        List<String> schemaStrings;
        if (schemaDirectory.startsWith(CLASSPATH_PREFIX)) {
            URL dirURL = getClass().getClassLoader()
                    .getResource(schemaDirectory.substring(CLASSPATH_PREFIX.length()));
            if (dirURL != null && dirURL.getProtocol().equals("file")) {
                try {
                    schemaStrings = getSchemaStringsFromDir(new File(dirURL.toURI()));
                } catch (URISyntaxException e) {
                    throw new HBaseCommonException(e);
                }
            } else if (dirURL != null && dirURL.getProtocol().equals("jar")) {
                String jarPath = dirURL.getPath().substring(5, dirURL.getPath().indexOf("!"));
                schemaStrings = getSchemaStringsFromJar(jarPath,
                        schemaDirectory.substring(CLASSPATH_PREFIX.length()));
            } else {
                String msg = "Could not find classpath resource: " + schemaDirectory;
                LOG.error(msg);
                throw new HBaseCommonException(msg);
            }
        } else {
            schemaStrings = getSchemaStringsFromDir(new File(schemaDirectory));
        }

        Map<String, String> tableKeySchemaMap = new HashMap<String, String>();
        Map<String, List<String>> tableEntitySchemaMap = new HashMap<String, List<String>>();
        for (String schemaString : schemaStrings) {
            String name = getEntityNameFromSchemaString(schemaString);
            List<String> tables = getTablesFromSchemaString(schemaString);
            if (name.endsWith("Key")) {
                for (String table : tables) {
                    if (tableKeySchemaMap.containsKey(table)) {
                        String msg = "Multiple keys for table: " + table;
                        LOG.error(msg);
                        throw new SchemaValidationException(msg);
                    }
                    LOG.debug("Adding key to tableKeySchemaMap for table: " + table + ". " + schemaString);
                    tableKeySchemaMap.put(table, schemaString);
                }
            } else {
                for (String table : tables) {
                    if (tableEntitySchemaMap.containsKey(table)) {
                        tableEntitySchemaMap.get(table).add(schemaString);
                    } else {
                        List<String> entityList = new ArrayList<String>();
                        entityList.add(schemaString);
                        tableEntitySchemaMap.put(table, entityList);
                    }
                }
            }
        }

        for (Entry<String, List<String>> entry : tableEntitySchemaMap.entrySet()) {
            String table = entry.getKey();
            List<String> entitySchemas = entry.getValue();
            if (!tableKeySchemaMap.containsKey(table)) {
                String msg = "No Key Schema For Table: " + table;
                LOG.error(msg);
                throw new HBaseCommonException(msg);
            }
            if (entitySchemas.size() == 0) {
                String msg = "Key, but no entity schemas for Table: " + table;
                LOG.error(msg);
                throw new SchemaValidationException(msg);
            }
            for (String entitySchema : entry.getValue()) {
                createOrMigrateSchema(table, entitySchema, createTableAndFamilies);
            }
        }
    }

    /**
     * Creates a new managed schema, or migrates an existing one if one exists for
     * the table name, entity name pair.
     * 
     * @param tableName
     *          The name of the table we'll be creating or migrating a schema for.
     * @param keySchemaFilePath
     *          The absolute file path to the key schema file.
     * @param entitySchemaFilePath
     *          The absolute file path to the entity schema file.
     * @param createTableAndFamilies
     *          If true, will create the table for this schema if it doesn't
     *          exist, and will create families if they don't exist.
     */
    public void createOrMigrateSchemaFile(String tableName, String entitySchemaFilePath,
            boolean createTableAndFamilies) {
        createOrMigrateSchemaFile(tableName, new File(entitySchemaFilePath), createTableAndFamilies);
    }

    /**
     * Creates a new managed schema, or migrates an existing one if one exists for
     * the table name, entity name pair.
     * 
     * @param tableName
     *          The name of the table we'll be creating or migrating a schema for.
     * @param keySchemaFile
     *          The key schema file.
     * @param entitySchemaFile
     *          The entity schema file.
     * @param createTableAndFamilies
     *          If true, will create the table for this schema if it doesn't
     *          exist, and will create families if they don't exist.
     */
    public void createOrMigrateSchemaFile(String tableName, File entitySchemaFile, boolean createTableAndFamilies) {
        createOrMigrateSchema(tableName, getSchemaStringFromFile(entitySchemaFile), createTableAndFamilies);
    }

    /**
     * Creates a new managed schema, or migrates an existing one if one exists for
     * the table name, entity name pair.
     * 
     * @param tableName
     *          The name of the table we'll be creating or migrating a schema for.
     * @param keySchemaString
     *          The key schema
     * @param entitySchemaString
     *          The entity schema
     * @param createTableAndFamilies
     *          If true, will create the table for this schema if it doesn't
     *          exist, and will create families if they don't exist.
     */
    public void createOrMigrateSchema(String tableName, String entitySchemaString, boolean createTableAndFamilies) {
        String entityName = getEntityNameFromSchemaString(entitySchemaString);
        AvroEntitySchema entitySchema = parser.parseEntitySchema(entitySchemaString);
        AvroKeySchema keySchema = parser.parseKeySchema(entitySchemaString);
        if (schemaManager.hasManagedSchema(tableName, entityName)) {
            KeySchema currentKeySchema = schemaManager.getKeySchema(tableName, entityName);
            if (!keySchema.equals(currentKeySchema)) {
                String msg = "Migrating schema with different keys. Current: " + currentKeySchema.getRawSchema()
                        + " New: " + keySchema.getRawSchema();
                LOG.error(msg);
                throw new SchemaValidationException(msg);
            }
            if (schemaManager.getEntityVersion(tableName, entityName, entitySchema) == -1) {
                LOG.info("Migrating Schema: (" + tableName + ", " + entityName + ")");
                schemaManager.migrateSchema(tableName, entityName, entitySchemaString);
            } else {
                // don't set createTableAndFamilies to false, becasue we may still need
                // to update the table to support what exists in the meta store.
                LOG.info("Schema hasn't changed, not migrating: (" + tableName + ", " + entityName + ")");
            }
        } else {
            LOG.info("Creating Schema: (" + tableName + ", " + entityName + ")");
            parser.parseEntitySchema(entitySchemaString).getRequiredColumnFamilies();
            schemaManager.createSchema(tableName, entityName, entitySchemaString,
                    "com.cloudera.cdk.data.hbase.avro.impl.AvroKeyEntitySchemaParser",
                    "com.cloudera.cdk.data.hbase.avro.impl.AvroKeySerDe",
                    "com.cloudera.cdk.data.hbase.avro.impl.AvroEntitySerDe");
        }

        if (createTableAndFamilies) {
            try {
                if (!hbaseAdmin.tableExists(tableName)) {
                    HTableDescriptor desc = new HTableDescriptor(tableName);
                    desc.addFamily(new HColumnDescriptor(Constants.SYS_COL_FAMILY));
                    desc.addFamily(new HColumnDescriptor(Constants.OBSERVABLE_COL_FAMILY));
                    for (String columnFamily : entitySchema.getRequiredColumnFamilies()) {
                        desc.addFamily(new HColumnDescriptor(columnFamily));
                    }
                    hbaseAdmin.createTable(desc);
                } else {
                    Set<String> familiesToAdd = entitySchema.getRequiredColumnFamilies();
                    familiesToAdd.add(new String(Constants.SYS_COL_FAMILY));
                    familiesToAdd.add(new String(Constants.OBSERVABLE_COL_FAMILY));
                    HTableDescriptor desc = hbaseAdmin.getTableDescriptor(tableName.getBytes());
                    for (HColumnDescriptor columnDesc : desc.getColumnFamilies()) {
                        String familyName = columnDesc.getNameAsString();
                        if (familiesToAdd.contains(familyName)) {
                            familiesToAdd.remove(familyName);
                        }
                    }
                    if (familiesToAdd.size() > 0) {
                        hbaseAdmin.disableTable(tableName);
                        try {
                            for (String family : familiesToAdd) {
                                hbaseAdmin.addColumn(tableName, new HColumnDescriptor(family));
                            }
                        } finally {
                            hbaseAdmin.enableTable(tableName);
                        }
                    }
                }
            } catch (IOException e) {
                throw new HBaseCommonException(e);
            }
        }
    }

    /**
     * Will return the contents of schemaFile as a string
     * 
     * @param schemaFile
     *          The file who's contents should be returned.
     * @return The contents of schemaFile
     */
    private String getSchemaStringFromFile(File schemaFile) {
        String schemaString;
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(schemaFile);
            schemaString = AvroUtils.inputStreamToString(fis);
        } catch (IOException e) {
            throw new HBaseCommonException(e);
        } finally {
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                }
            }
        }
        return schemaString;
    }

    private List<String> getTablesFromSchemaString(String schema) {
        JsonNode node;
        try {
            JsonParser jp = factory.createJsonParser(schema);
            node = mapper.readTree(jp);
            if (node.get("tables") == null) {
                return new ArrayList<String>();
            }
            List<String> result = new ArrayList<String>(node.get("tables").size());
            for (Iterator<JsonNode> it = node.get("tables").getElements(); it.hasNext();) {
                result.add(it.next().getTextValue());
            }
            return result;
        } catch (JsonParseException e) {
            throw new SchemaValidationException(e);
        } catch (IOException e) {
            throw new SchemaValidationException(e);
        }
    }

    private String getEntityNameFromSchemaString(String schema) {
        JsonNode node;
        try {
            JsonParser jp = factory.createJsonParser(schema);
            node = mapper.readTree(jp);
            if (node.get("name") == null) {
                return null;
            }
            return node.get("name").getTextValue();
        } catch (JsonParseException e) {
            throw new SchemaValidationException(e);
        } catch (IOException e) {
            throw new SchemaValidationException(e);
        }
    }

    /**
     * Gets the list of HBase Common Avro schema strings from dir. It recursively
     * searches dir to find files that end in .avsc to locate those strings.
     * 
     * @param dir
     *          The dir to recursively search for schema strings
     * @return The list of schema strings
     */
    private List<String> getSchemaStringsFromDir(File dir) {
        List<String> schemaStrings = new ArrayList<String>();
        Collection<File> schemaFiles = FileUtils.listFiles(dir, new SuffixFileFilter(".avsc"),
                TrueFileFilter.INSTANCE);
        for (File schemaFile : schemaFiles) {
            schemaStrings.add(getSchemaStringFromFile(schemaFile));
        }
        return schemaStrings;
    }

    /**
     * Gets the list of HBase Common Avro schema strings from a directory in the
     * Jar. It recursively searches the directory in the jar to find files that
     * end in .avsc to locate thos strings.
     * 
     * @param jarPath
     *          The path to the jar to search
     * @param directoryPath
     *          The directory in the jar to find avro schema strings
     * @return The list of schema strings.
     */
    private List<String> getSchemaStringsFromJar(String jarPath, String directoryPath) {
        LOG.info("Getting schema strings in: " + directoryPath + ", from jar: " + jarPath);
        JarFile jar;
        try {
            jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            throw new HBaseCommonException(e);
        } catch (IOException e) {
            throw new HBaseCommonException(e);
        }
        Enumeration<JarEntry> entries = jar.entries();
        List<String> schemaStrings = new ArrayList<String>();
        while (entries.hasMoreElements()) {
            JarEntry jarEntry = entries.nextElement();
            if (jarEntry.getName().startsWith(directoryPath) && jarEntry.getName().endsWith(".avsc")) {
                LOG.info("Found schema: " + jarEntry.getName());
                InputStream inputStream;
                try {
                    inputStream = jar.getInputStream(jarEntry);
                } catch (IOException e) {
                    throw new HBaseCommonException(e);
                }
                String schemaString = AvroUtils.inputStreamToString(inputStream);
                schemaStrings.add(schemaString);
            }
        }
        return schemaStrings;
    }
}