Java tutorial: HBaseImport, a tool for importing an RDBMS-based Hive metastore into HBase
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hadoop.hive.metastore.hbase;

import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.metastore.api.SQLForeignKey;
import org.apache.hadoop.hive.metastore.api.SQLPrimaryKey;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Deadline;
import org.apache.hadoop.hive.metastore.ObjectStore;
import org.apache.hadoop.hive.metastore.RawStore;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.Function;
import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Role;
import org.apache.hadoop.hive.metastore.api.Table;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

/**
 * A tool to take the contents of an RDBMS based Hive metastore and import it into an HBase based
 * one.  To use this, the config files for Hive configured to work with the RDBMS (that is,
 * including the JDBC string, etc.) as well as HBase configuration files must be in the path.
 * There should not be a hive-site.xml that specifies HBaseStore in the path.  This tool will then
 * handle connecting to the RDBMS via the {@link org.apache.hadoop.hive.metastore.ObjectStore}
 * and HBase via {@link org.apache.hadoop.hive.metastore.hbase.HBaseStore} and transferring the
 * data.
 *
 * This tool can import an entire metastore or only selected objects.  When selecting objects it
 * is necessary to fully specify the object's name.  For example, if you want to import the table
 * T in the default database it needs to be identified as default.T.  The same is true for
 * functions.  When an object is specified, everything under that object will be imported (e.g.
 * if you select database D, then all tables and functions in that database will be
 * imported as well).
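 *
 * The selection flags defined below are -a (everything), -r (roles), -d (databases), -t (tables),
 * -f (functions), and -k (kerberos objects such as master keys and delegation tokens); -p and -b
 * tune the parallelism and partition batch size.  For instance, an invocation carrying the
 * arguments -d db1 -t db2.t1 -p 4 -b 500 would import database db1 and table t1 of db2 with four
 * copier threads and partition batches of 500 (the names are illustrative, and how the tool
 * itself is launched is installation specific and not covered here).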
 *
 * At this point only tables and partitions are handled in parallel as it is assumed there are
 * relatively few of everything else.
 *
 * Note that HBaseSchemaTool must have already been used to create the appropriate tables in
 * HBase.
 */
public class HBaseImport {

  static final private Logger LOG = LoggerFactory.getLogger(HBaseImport.class.getName());

  public static int main(String[] args) {
    try {
      HBaseImport tool = new HBaseImport();
      int rv = tool.init(args);
      if (rv != 0) return rv;
      tool.run();
    } catch (Exception e) {
      System.err.println("Caught exception " + e.getClass().getName() + " with message <" +
          e.getMessage() + ">");
      return 1;
    }
    return 0;
  }

  // Each worker thread gets its own RawStore connection to the RDBMS and to HBase, created
  // lazily from the configurations that init() sets up.
  private ThreadLocal<RawStore> rdbmsStore = new ThreadLocal<RawStore>() {
    @Override
    protected RawStore initialValue() {
      if (rdbmsConf == null) {
        throw new RuntimeException("order violation, need to set rdbms conf first");
      }
      RawStore os = new ObjectStore();
      os.setConf(rdbmsConf);
      return os;
    }
  };

  private ThreadLocal<RawStore> hbaseStore = new ThreadLocal<RawStore>() {
    @Override
    protected RawStore initialValue() {
      if (hbaseConf == null) {
        throw new RuntimeException("order violation, need to set hbase conf first");
      }
      RawStore hs = new HBaseStore();
      hs.setConf(hbaseConf);
      return hs;
    }
  };

  private Configuration rdbmsConf;
  private Configuration hbaseConf;
  private List<Database> dbs;
  private BlockingQueue<Table> partitionedTables;
  private BlockingQueue<String[]> tableNameQueue;
  private BlockingQueue<String[]> indexNameQueue;
  private BlockingQueue<PartQueueEntry> partQueue;
  private boolean writingToQueue, readersFinished;
  private boolean doKerberos, doAll;
  private List<String> rolesToImport, dbsToImport, tablesToImport, functionsToImport;
  private int parallel;
  private int batchSize;

  private HBaseImport() {
  }

  @VisibleForTesting
  public HBaseImport(String... args) throws ParseException {
    init(args);
  }
  private int init(String... args) throws ParseException {
    Options options = new Options();

    doAll = doKerberos = false;
    parallel = 1;
    batchSize = 1000;

    options.addOption(OptionBuilder
        .withLongOpt("all")
        .withDescription("Import the full metastore")
        .create('a'));

    options.addOption(OptionBuilder
        .withLongOpt("batchsize")
        .withDescription("Number of partitions to read and write in a batch, defaults to 1000")
        .hasArg()
        .create('b'));

    options.addOption(OptionBuilder
        .withLongOpt("database")
        .withDescription("Import a single database")
        .hasArgs()
        .create('d'));

    options.addOption(OptionBuilder
        .withLongOpt("help")
        .withDescription("You're looking at it")
        .create('h'));

    options.addOption(OptionBuilder
        .withLongOpt("function")
        .withDescription("Import a single function")
        .hasArgs()
        .create('f'));

    options.addOption(OptionBuilder
        .withLongOpt("kerberos")
        .withDescription("Import all kerberos related objects (master key, tokens)")
        .create('k'));

    options.addOption(OptionBuilder
        .withLongOpt("parallel")
        .withDescription("Parallel factor for loading (only applied to tables and partitions), " +
            "defaults to 1")
        .hasArg()
        .create('p'));

    options.addOption(OptionBuilder
        .withLongOpt("role")
        .withDescription("Import a single role")
        .hasArgs()
        .create('r'));

    options.addOption(OptionBuilder
        .withLongOpt("tables")
        .withDescription("Import a single table")
        .hasArgs()
        .create('t'));

    CommandLine cli = new GnuParser().parse(options, args);

    // Process help, if it was asked for, this must be done first
    if (cli.hasOption('h')) {
      printHelp(options);
      return 1;
    }

    boolean hasCmd = false;
    // Now process the other command line args
    if (cli.hasOption('a')) {
      hasCmd = true;
      doAll = true;
    }
    if (cli.hasOption('b')) {
      batchSize = Integer.parseInt(cli.getOptionValue('b'));
    }
    if (cli.hasOption('d')) {
      hasCmd = true;
      dbsToImport = Arrays.asList(cli.getOptionValues('d'));
    }
    if (cli.hasOption('f')) {
      hasCmd = true;
      functionsToImport = Arrays.asList(cli.getOptionValues('f'));
    }
    if (cli.hasOption('p')) {
      parallel = Integer.parseInt(cli.getOptionValue('p'));
    }
    if (cli.hasOption('r')) {
      hasCmd = true;
      rolesToImport = Arrays.asList(cli.getOptionValues('r'));
    }
    if (cli.hasOption('k')) {
      doKerberos = true;
    }
    if (cli.hasOption('t')) {
      hasCmd = true;
      tablesToImport = Arrays.asList(cli.getOptionValues('t'));
    }
    if (!hasCmd) {
      printHelp(options);
      return 1;
    }

    dbs = new ArrayList<>();
    // We don't want to bound the size of the table queue because we keep it all in memory
    partitionedTables = new LinkedBlockingQueue<>();
    tableNameQueue = new LinkedBlockingQueue<>();
    indexNameQueue = new LinkedBlockingQueue<>();
    // Bound the size of this queue so we don't get too much in memory.
    partQueue = new ArrayBlockingQueue<>(parallel * 2);
    return 0;
  }

  private void printHelp(Options options) {
    (new HelpFormatter()).printHelp("hbaseschematool", options);
  }

  @VisibleForTesting
  void run() throws MetaException, InstantiationException, IllegalAccessException,
      NoSuchObjectException, InvalidObjectException, InterruptedException {
    // Order here is crucial, as you can't add tables until you've added databases, etc.
    init();
    if (doAll || rolesToImport != null) {
      copyRoles();
    }
    if (doAll || dbsToImport != null) {
      copyDbs();
    }
    if (doAll || dbsToImport != null || tablesToImport != null) {
      copyTables();
      copyPartitions();
      copyIndexes();
    }
    if (doAll || dbsToImport != null || functionsToImport != null) {
      copyFunctions();
    }
    if (doAll || doKerberos) {
      copyKerberos();
    }
  }

  private void init() throws MetaException, IllegalAccessException, InstantiationException {
    if (rdbmsConf != null) {
      // We've been configured for testing, so don't do anything here.
      return;
    }
    // We're depending on having everything properly in the path
    rdbmsConf = new HiveConf();
    hbaseConf = new HiveConf();
    // HiveConf.setVar(hbaseConf, HiveConf.ConfVars.METASTORE_RAW_STORE_IMPL,
    //     HBaseStore.class.getName());
    HiveConf.setBoolVar(hbaseConf, HiveConf.ConfVars.METASTORE_FASTPATH, true);

    // First get a connection to the RDBMS based store
    rdbmsStore.get().setConf(rdbmsConf);

    // Get a connection to the HBase based store
    hbaseStore.get().setConf(hbaseConf);
  }

  private void copyRoles() throws NoSuchObjectException, InvalidObjectException, MetaException {
    screen("Copying roles");
    List<String> toCopy = doAll ? rdbmsStore.get().listRoleNames() : rolesToImport;
    for (String roleName : toCopy) {
      Role role = rdbmsStore.get().getRole(roleName);
      screen("Copying role " + roleName);
      hbaseStore.get().addRole(roleName, role.getOwnerName());
    }
  }

  private void copyDbs() throws MetaException, NoSuchObjectException, InvalidObjectException {
    screen("Copying databases");
    List<String> toCopy = doAll ? rdbmsStore.get().getAllDatabases() : dbsToImport;
    for (String dbName : toCopy) {
      Database db = rdbmsStore.get().getDatabase(dbName);
      dbs.add(db);
      screen("Copying database " + dbName);
      hbaseStore.get().createDatabase(db);
    }
  }

  private void copyTables() throws MetaException, InvalidObjectException, InterruptedException {
    screen("Copying tables");

    // Start the parallel threads that will copy the tables
    Thread[] copiers = new Thread[parallel];
    writingToQueue = true;
    for (int i = 0; i < parallel; i++) {
      copiers[i] = new TableCopier();
      copiers[i].start();
    }

    // Put tables from the databases we copied into the queue
    for (Database db : dbs) {
      screen("Copying tables in database " + db.getName());
      for (String tableName : rdbmsStore.get().getAllTables(db.getName())) {
        tableNameQueue.put(new String[] { db.getName(), tableName });
      }
    }

    // Now put any specifically requested tables into the queue
    if (tablesToImport != null) {
      for (String compoundTableName : tablesToImport) {
        String[] tn = compoundTableName.split("\\.");
        if (tn.length != 2) {
          error(compoundTableName + " not in proper form.  Must be in form dbname.tablename.  " +
              "Ignoring this table and continuing.");
        } else {
          tableNameQueue.put(new String[] { tn[0], tn[1] });
        }
      }
    }
    writingToQueue = false;

    // Wait until we've finished adding all the tables
    for (Thread copier : copiers) copier.join();
  }

  private class TableCopier extends Thread {
    @Override
    public void run() {
      while (writingToQueue || tableNameQueue.size() > 0) {
        try {
          String[] name = tableNameQueue.poll(1, TimeUnit.SECONDS);
          if (name != null) {
            Table table = rdbmsStore.get().getTable(name[0], name[1]);
            // If this has partitions, put it in the list to fetch partitions for
            if (table.getPartitionKeys() != null && table.getPartitionKeys().size() > 0) {
              partitionedTables.put(table);
            }
            screen("Copying table " + name[0] + "." + name[1]);
            hbaseStore.get().createTable(table);
            // See if the table has any constraints, and if so copy those as well
            List<SQLPrimaryKey> pk =
                rdbmsStore.get().getPrimaryKeys(table.getDbName(), table.getTableName());
            if (pk != null && pk.size() > 0) {
              LOG.debug("Found primary keys, adding them");
              hbaseStore.get().addPrimaryKeys(pk);
            }
            // Passing null as the target table name results in all of the foreign keys being
            // retrieved.
            List<SQLForeignKey> fks = rdbmsStore.get().getForeignKeys(null, null,
                table.getDbName(), table.getTableName());
            if (fks != null && fks.size() > 0) {
              LOG.debug("Found foreign keys, adding them");
              hbaseStore.get().addForeignKeys(fks);
            }
          }
        } catch (InterruptedException | MetaException | InvalidObjectException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  private void copyIndexes() throws MetaException, InvalidObjectException, InterruptedException {
    screen("Copying indexes");

    // Start the parallel threads that will copy the indexes
    Thread[] copiers = new Thread[parallel];
    writingToQueue = true;
    for (int i = 0; i < parallel; i++) {
      copiers[i] = new IndexCopier();
      copiers[i].start();
    }

    // Put indexes from the databases we copied into the queue
    for (Database db : dbs) {
      screen("Copying indexes in database " + db.getName());
      for (String tableName : rdbmsStore.get().getAllTables(db.getName())) {
        for (Index index : rdbmsStore.get().getIndexes(db.getName(), tableName, -1)) {
          indexNameQueue.put(new String[] { db.getName(), tableName, index.getIndexName() });
        }
      }
    }

    // Now put any specifically requested tables into the queue
    if (tablesToImport != null) {
      for (String compoundTableName : tablesToImport) {
        String[] tn = compoundTableName.split("\\.");
        if (tn.length != 2) {
          error(compoundTableName + " not in proper form.  Must be in form dbname.tablename.  " +
              "Ignoring this table and continuing.");
        } else {
          for (Index index : rdbmsStore.get().getIndexes(tn[0], tn[1], -1)) {
            indexNameQueue.put(new String[] { tn[0], tn[1], index.getIndexName() });
          }
        }
      }
    }
    writingToQueue = false;

    // Wait until we've finished adding all the indexes
    for (Thread copier : copiers) copier.join();
  }

  private class IndexCopier extends Thread {
    @Override
    public void run() {
      while (writingToQueue || indexNameQueue.size() > 0) {
        try {
          String[] name = indexNameQueue.poll(1, TimeUnit.SECONDS);
          if (name != null) {
            Index index = rdbmsStore.get().getIndex(name[0], name[1], name[2]);
            screen("Copying index " + name[0] + "." + name[1] + "." + name[2]);
            hbaseStore.get().addIndex(index);
          }
        } catch (InterruptedException | MetaException | InvalidObjectException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  /* Partition copying is a little complex.  As we went through and copied the tables we put each
   * partitioned table into a queue.  We will now go through that queue and add partitions for the
   * tables.  We do the finding of partitions and writing of them separately and in parallel.
   * This way, if one table has many more partitions than all of the others, that skew won't
   * hurt us.  To avoid pulling all of the partitions for a table into memory, we batch up
   * partitions (by default in batches of 1000) and copy them over in batches.
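   *
   * As a concrete illustration of the batching logic in PartitionReader below: with the default
   * batch size of 1000, a table with 2500 partitions is queued as three batches of partition
   * names (1000, 1000, and 500), and each PartitionWriter fetches and writes one batch of full
   * Partition objects at a time.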
   */
  private void copyPartitions() throws MetaException, NoSuchObjectException,
      InvalidObjectException, InterruptedException {
    screen("Copying partitions");
    readersFinished = false;
    Thread[] readers = new Thread[parallel];
    Thread[] writers = new Thread[parallel];
    for (int i = 0; i < parallel; i++) {
      readers[i] = new PartitionReader();
      readers[i].start();
      writers[i] = new PartitionWriter();
      writers[i].start();
    }
    for (Thread reader : readers) reader.join();
    readersFinished = true;

    // Wait until we've finished adding all the partitions
    for (Thread writer : writers) writer.join();
  }

  private class PartitionReader extends Thread {
    @Override
    public void run() {
      while (partitionedTables.size() > 0) {
        try {
          Table table = partitionedTables.poll(1, TimeUnit.SECONDS);
          if (table != null) {
            screen("Fetching partitions for table " + table.getDbName() + "." +
                table.getTableName());
            List<String> partNames = rdbmsStore.get().listPartitionNames(table.getDbName(),
                table.getTableName(), (short) -1);
            if (partNames.size() <= batchSize) {
              LOG.debug("Adding all partition names to queue for " + table.getDbName() + "." +
                  table.getTableName());
              partQueue.put(new PartQueueEntry(table.getDbName(), table.getTableName(),
                  partNames));
            } else {
              // Figure out how many batches are needed to cover all of the partition names
              int goUntil = partNames.size() % batchSize == 0 ? partNames.size() / batchSize :
                  partNames.size() / batchSize + 1;
              for (int i = 0; i < goUntil; i++) {
                int start = i * batchSize;
                int end = Math.min((i + 1) * batchSize, partNames.size());
                LOG.debug("Adding partitions " + start + " to " + end + " for " +
                    table.getDbName() + "." + table.getTableName());
                partQueue.put(new PartQueueEntry(table.getDbName(), table.getTableName(),
                    partNames.subList(start, end)));
              }
            }
          }
        } catch (InterruptedException | MetaException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  private class PartitionWriter extends Thread {
    @Override
    public void run() {
      // This keeps us from throwing exceptions in our raw store calls
      Deadline.registerIfNot(1000000);
      while (!readersFinished || partQueue.size() > 0) {
        try {
          PartQueueEntry entry = partQueue.poll(1, TimeUnit.SECONDS);
          if (entry != null) {
            LOG.info("Writing partitions " + entry.dbName + "." + entry.tableName + "." +
                StringUtils.join(entry.partNames, ':'));
            // Fetch these partitions and write them to HBase
            Deadline.startTimer("hbaseimport");
            List<Partition> parts = rdbmsStore.get().getPartitionsByNames(entry.dbName,
                entry.tableName, entry.partNames);
            hbaseStore.get().addPartitions(entry.dbName, entry.tableName, parts);
            Deadline.stopTimer();
          }
        } catch (InterruptedException | MetaException | InvalidObjectException |
            NoSuchObjectException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  private void copyFunctions() throws MetaException, NoSuchObjectException,
      InvalidObjectException {
    screen("Copying functions");

    // Copy any functions from databases we copied.
    for (Database db : dbs) {
      screen("Copying functions in database " + db.getName());
      for (String funcName : rdbmsStore.get().getFunctions(db.getName(), "*")) {
        copyOneFunction(db.getName(), funcName);
      }
    }

    // Now do any specifically requested functions
    if (functionsToImport != null) {
      for (String compoundFuncName : functionsToImport) {
        String[] fn = compoundFuncName.split("\\.");
        if (fn.length != 2) {
          error(compoundFuncName + " not in proper form.  Must be in form dbname.funcname.  " +
              "Ignoring this function and continuing.");
" + "Ignoring this function and continuing."); } else { copyOneFunction(fn[0], fn[1]); } } } } private void copyOneFunction(String dbName, String funcName) throws MetaException, InvalidObjectException { Function func = rdbmsStore.get().getFunction(dbName, funcName); screen("Copying function " + dbName + "." + funcName); hbaseStore.get().createFunction(func); } private void copyKerberos() throws MetaException { screen("Copying kerberos related items"); for (String tokenId : rdbmsStore.get().getAllTokenIdentifiers()) { String token = rdbmsStore.get().getToken(tokenId); hbaseStore.get().addToken(tokenId, token); } for (String masterKey : rdbmsStore.get().getMasterKeys()) { hbaseStore.get().addMasterKey(masterKey); } } private void screen(String msg) { LOG.info(msg); System.out.println(msg); } private void error(String msg) { LOG.error(msg); System.err.println("ERROR: " + msg); } @VisibleForTesting void setConnections(RawStore rdbms, RawStore hbase) { rdbmsStore.set(rdbms); hbaseStore.set(hbase); rdbmsConf = rdbms.getConf(); hbaseConf = hbase.getConf(); } private static class PartQueueEntry { final String dbName; final String tableName; final List<String> partNames; PartQueueEntry(String d, String t, List<String> p) { dbName = d; tableName = t; partNames = p; } } }