Java tutorial
/*
 * Copyright (c) 2012 OCLC, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.oclc.firefly.hadoop.backup;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsShell;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * HBase backup tool
 */
public class Backup {
    /** Class Logger */
    private static final Logger LOG = LoggerFactory.getLogger(Backup.class);

    /** name of hbase user config setting */
    public static final String JOBCONF_USER = "backup.hbase.user";

    /** name of hbase dest hdfs setting */
    public static final String JOBCONF_DEST_HDFS = "backup.copy.destination.hdfs";

    /** name of hbase dest path setting */
    public static final String JOBCONF_DEST_PATH = "backup.copy.destination.path";

    /** the replication factor for the copied files */
    public static final String JOBCONF_REPLICATION = "backup.copy.replication";

    /** The attempt id setting */
    public static final String JOBCONF_ATTEMPT_ID = "backup.copy.attempt.id";

    /** The backup directory */
    public static final String BACKUP_STORE_DIR = "/backup";

    /** mapper output directory */
    public static final String BACKUP_MAP_OUT_DIR = "/tmp/backup/output";

    /** mapper input directory */
    public static final String BACKUP_MAP_IN_DIR = "/tmp/backup/input";

    /** default buffer size 8KB */
    private static final int BUFFER_SIZE = 8192;

    /** the internal buffer */
    private byte[] buffer = new byte[BUFFER_SIZE];

    /** the source configuration */
    private Configuration srcConf = null;

    /** the destination configuration */
    private Configuration dstConf = null;

    /** the source file system */
    private FileSystem srcFs = null;

    /** the destination file system */
    private FileSystem dstFs = null;

    /** the user name to run tasks as */
    private String username = System.getProperty("user.name");

    /** the initial replication value */
    private int initialReplication = 1;

    /** the final replication factor */
    private int finalReplication = 0;

    /** the number of map tasks to run */
    private int numMapTasks = 1;

    /** A list of all the regions that have been copied */
    private List<HRegionInfo> copiedRegions = new ArrayList<HRegionInfo>();

    /** The backup directory Path */
    private Path backupDirectoryPath = null;

    /** the directory to store backups */
    private String storeDirectory = null;

    /**
     * Constructor
     * @param srcConf The source configuration
     * @param dstConf The destination configuration
     * @throws IOException If failed to get the file systems
     */
    public Backup(Configuration srcConf, Configuration dstConf) throws IOException {
        this.srcConf = srcConf;
        this.dstConf = dstConf;
        this.srcFs = FileSystem.get(srcConf);
        this.dstFs = FileSystem.get(dstConf);
        finalReplication = dstFs.getDefaultReplication();
    }

    /**
     * Entry point
     * @param args Command line arguments
     * @throws Exception exception
     */
    public static void main(String[] args) throws Exception {
        int initialReplication = 1;
        int finalReplication = 0;
        int numMaps = 2;
        int tries = 0;
        String tbl = null;
        String dest = null;
        String user = System.getProperty("user.name");
        Path destPath = null;
        CommandLineParser parser = new PosixParser();
        CommandLine cmdline = null;

        // Parse command line options
        try {
            cmdline = parser.parse(getOptions(), args);
        } catch (org.apache.commons.cli.ParseException e) {
            System.out.println(e.getMessage());
            printOptions();
            System.exit(-1);
        }

        // Get command line options
        for (Option option : cmdline.getOptions()) {
            switch (option.getId()) {
            case 'd':
                dest = option.getValue();
                destPath = new Path(dest);
                if (!destPath.isAbsolute()) {
                    throw new IllegalArgumentException("Destination path must be an absolute path");
                }
                break;
            case 'm':
                numMaps = Integer.parseInt(option.getValue());
                if (numMaps <= 0) {
                    throw new IllegalArgumentException("Number of map tasks must be greater than zero.");
                }
                break;
            case 'n':
                tries = Integer.parseInt(option.getValue());
                if (tries < 0) {
                    throw new IllegalArgumentException(
                        "Maximum number of tries must be greater than or equal to zero.");
                }
                break;
            case 'f':
                finalReplication = Integer.parseInt(option.getValue());
                if (finalReplication <= 0) {
                    throw new IllegalArgumentException("Final replication must be greater than zero.");
                }
                break;
            case 'r':
                initialReplication = Integer.parseInt(option.getValue());
                if (initialReplication <= 0) {
                    throw new IllegalArgumentException("Initial replication must be greater than zero.");
                }
                break;
            case 't':
                tbl = option.getValue();
                break;
            case 'u':
                user = option.getValue();
                break;
            default:
                throw new IllegalArgumentException("unexpected option " + option);
            }
        }

        String[] tables = null;
        if (tbl != null) {
            tables = tbl.split(",");
        }

        Configuration srcConf = HBaseConfiguration.create();
        Configuration dstConf = HBaseConfiguration.create();

        // This allows us to copy to a separate HDFS instance
        String destDir = null;
        if (dest != null) {
            destDir = destPath.toUri().getPath();
            String fsName = null;

            if (destDir != null && destDir.length() > 0) {
                LOG.debug("destination dfs: " + dest.substring(0, dest.length() - destDir.length()));
                fsName = dest.substring(0, dest.length() - destDir.length());
            } else {
                fsName = dest;
                destDir = null;
            }

            if (fsName != null && fsName.length() > 0) {
                dstConf.set("fs.default.name", fsName);
            }
        }

        Backup backup = new Backup(srcConf, dstConf);
        backup.setInitialReplication(initialReplication);
        backup.setFinalReplication(finalReplication);
        backup.setUsername(user);
        backup.setNumMapTasks(numMaps);

        if (destDir != null) {
            backup.setBackupStoreDirectory(destDir);
        }

        LOG.info("HBase backup tool");
        LOG.info("--------------------------------------------------");
        //LOG.info("Destination fs : " + dstConf.get("fs.default.name"));
        LOG.info("Initial replication: " + backup.getInitialReplication());
        LOG.info("Final replication : " + backup.getFinalReplication());
        LOG.info("Number of attempts : " + ((tries == 0) ? "Until nothing left to copy" : tries));
        LOG.info("Username : " + backup.getUsername());
        LOG.info("Number map tasks : " + backup.getNumMapTasks());
        LOG.info("Backup store path : " + backup.getBackupStoreDirectory());
        LOG.info("--------------------------------------------------");

        boolean success = backup.doMajorCopy(tables, tries);

        LOG.info("--------------------------------------------------");
        if (success) {
            LOG.info("Backup located at: " + backup.getBackupDirectoryPath());
            LOG.info("Backup complete");
        } else {
            LOG.info("Files located at: " + backup.getBackupDirectoryPath());
            LOG.info("Backup failed");
        }

        System.exit(success ? 0 : -1);
    }

    /**
     * Performs a complete copy of the source hbase to the given destination
     * @param tables The names of the tables to backup
     * @param maxTries The maximum number of times to try to copy regions.
     * @return True if successful, false otherwise
     * @throws IOException If failed to interact with Hadoop
     * @throws ClassNotFoundException
     * @throws InterruptedException
     */
    public boolean doMajorCopy(String[] tables, int maxTries)
        throws IOException, InterruptedException, ClassNotFoundException {
        boolean ret = false;
        String username = getUsername();
        short replication = (short) getInitialReplication();

        // Get a list of regions from HBase
        // Then filter out the regions we are not extracting, and group them by table
        List<CatalogRow> regions = getHBaseRegions(srcConf);
        Map<String, List<CatalogRow>> filtered = groupAndFilter(regions, tables);
        List<Pair<String, HRegionInfo>> mapperInput = new ArrayList<Pair<String, HRegionInfo>>();

        // Prepare the input for the mappers to use
        // This creates a list of region server and region pairs
        LOG.info("Exporting the following tables:");
        for (Entry<String, List<CatalogRow>> entry : filtered.entrySet()) {
            String tablename = entry.getKey();
            List<CatalogRow> rows = entry.getValue();

            LOG.info(". " + tablename);
            for (CatalogRow r : rows) {
                String regionServer = r.getHost() + ":" + r.getPort();
                HRegionInfo region = r.getHRegionInfo();
                mapperInput.add(Pair.newPair(regionServer, region));
            }
        }

        // Make sure we write to a directory that does not exist
        backupDirectoryPath = createBackupDirectory(getCurrentDateString());
        LOG.info("Starting backup path: " + backupDirectoryPath);

        // Copy the .tableinfo files for the tables we are extracting
        // These files are not copied by the MR job as it only focuses on regions
        List<FileStatus> tableInfoFiles = getTableInfoFiles(srcFs, filtered);
        for (FileStatus file : tableInfoFiles) {
            Path srcFilePath = file.getPath();
            Path relPath = new Path(BackupUtils.getFsRelativePath(srcFs, srcFilePath));
            Path dstFilePath = new Path(backupDirectoryPath.toString() + relPath.toString());
            BackupUtils.copy(srcFs, srcFilePath, dstFs, dstFilePath, buffer, username, replication);
        }

        // Dispatch MR job and monitor
        // Retry regions if necessary
        if (mapperInput.size() > 0) {
            int tries = 0;

            while (!ret && (maxTries == 0 || tries < maxTries)) {
                if (getNumMapTasks() > mapperInput.size()) {
                    setNumMapTasks(mapperInput.size());
                    LOG.info("Not enough regions. Reducing number of map tasks");
                }

                // Generate a list of mapper input files and create job
                List<Path> sourceFiles = createMapperInputSequenceFiles(mapperInput, getNumMapTasks(), srcFs, tries);
                Job job = createMRJob(srcConf, dstConf, sourceFiles, backupDirectoryPath, tries);

                LOG.info(job.getJobName());
                LOG.info("--------------------------------------------------");
                LOG.info("Number of regions : " + mapperInput.size());
                LOG.info("Number of map tasks: " + getNumMapTasks());
                LOG.info("Mapper input path : " + getMapInputDirectory(tries));
                LOG.info("Mapper output path : " + FileOutputFormat.getOutputPath(job));
                LOG.info("--------------------------------------------------");

                job.waitForCompletion(true);

                if (job.isSuccessful()) {
                    // Check if any regions failed
                    Counters counters = job.getCounters();
                    Counter failedCounter = counters.findCounter("Backup", "FailedRegions");
                    long failed = failedCounter.getValue();

                    if (failed > 0) {
                        LOG.info("Number of failed regions: " + failed + ".");

                        // get a fresh list of regions to copy
                        List<Pair<String, HRegionInfo>> failedRegions = getFailedRegions(srcFs, srcConf, tries);
                        addCopiedRegions(mapperInput, failedRegions);
                        mapperInput = getRemainingRegions(mapperInput, tables);

                        for (Pair<String, HRegionInfo> pair : mapperInput) {
                            LOG.info("Retry: " + pair.getSecond());
                        }

                        if (mapperInput.size() == 0) {
                            ret = true;
                            backupDirectoryPath = appendEndTime(backupDirectoryPath);
                            LOG.warn("No regions left to copy, but expected to copy more. "
                                + "Please inspect logs/files manually for errors");
                        }
                    } else {
                        ret = true;
                        addCopiedRegions(mapperInput, null);
                        backupDirectoryPath = appendEndTime(backupDirectoryPath);
                        LOG.info("MR job finished successfully");
                    }
                } else {
                    LOG.error("An unexpected error occurred during the MR job. Please see MR logs.");
                    break;
                }

                tries++;
            }

            if (ret) {
                if (verifyCopiedRegions()) {
                    LOG.info("Verification passed successfully");
                } else {
                    ret = false;
                    LOG.info("Verification failed. Please inspect errors manually");
                }
            } else {
                LOG.info("No attempts left. Try setting -n to a higher value, or setting it to 0");
            }
        }

        if (ret) {
            // Set replication factor of backup directory to default.
            // This may not be the best solution, but let built-in shell take care of it
            // because it can do it recursively without us having to rediscover all the files
            short finalReplication = (short) getFinalReplication();

            if (replication != finalReplication) {
                FsShell shell = new FsShell(dstConf);
                String[] repArgs = { "-setrep", "-R", "-w", "" + finalReplication, backupDirectoryPath.toString() };

                try {
                    LOG.info("Setting final replication factor of backup files to " + finalReplication);
                    shell.run(repArgs);
                } catch (Exception e) {
                    LOG.warn("Could not set replication factor of backup files to " + finalReplication);
                }
            }
        }

        return ret;
    }

    /**
     * Verify the copied regions
     * @return True if verification succeeds. False otherwise
     */
    private boolean verifyCopiedRegions() {
        boolean ret = true;
        Map<String, List<HRegionInfo>> tableRegions = groupRegionsByTableName(copiedRegions);

        for (Map.Entry<String, List<HRegionInfo>> entry : tableRegions.entrySet()) {
            HRegionInfo prevRegion = null;
            String tableName = entry.getKey();
            List<HRegionInfo> regions = entry.getValue();

            // sort regions from start to end region
            Collections.sort(regions);

            LOG.info("Checking table: " + tableName);
            for (int i = 0; i < regions.size(); i++) {
                HRegionInfo currRegion = regions.get(i);
                LOG.info(" " + i + ": " + currRegion);

                if (regions.size() == 1) {
                    // Single region. Start and end key should be empty
                    if (currRegion.getStartKey().length != 0 || currRegion.getEndKey().length != 0) {
                        ret = false;
                        LOG.error(tableName + ": Single region, expecting start and end keys to be empty");
                        LOG.error(" " + currRegion);
                    }
                } else {
                    if (i == 0) {
                        // First region. Current start key should be empty
                        if (currRegion.getStartKey().length != 0) {
                            ret = false;
                            LOG.error(tableName + ": First region. Expecting start key to be empty");
                            LOG.error(" " + currRegion);
                        }
                    } else {
                        // Last region or middle region. Current start key should equal previous end key
                        if (Bytes.compareTo(currRegion.getStartKey(), prevRegion.getEndKey()) != 0) {
                            ret = false;
                            LOG.error(tableName + ": Missing region. "
                                + "End key and start key of adjacent regions don't match");
                            LOG.error(" left: " + prevRegion);
                            LOG.error(" right: " + currRegion);
                        }

                        if (i == regions.size() - 1) {
                            // Last region. The current end key should be empty
                            if (currRegion.getEndKey().length != 0) {
                                ret = false;
                                LOG.error(tableName + ": Last region. Expecting end key to be empty");
                                LOG.error(" " + currRegion);
                            }
                        }
                    }
                }

                prevRegion = currRegion;
            }
        }

        return ret;
    }

    /**
     * Group the given region list by table names
     * @param regions The list of regions to group
     * @return A map where the key is the table name and value is a list of all regions for that table
     */
    private Map<String, List<HRegionInfo>> groupRegionsByTableName(List<HRegionInfo> regions) {
        Map<String, List<HRegionInfo>> ret = new TreeMap<String, List<HRegionInfo>>();

        for (HRegionInfo region : regions) {
            String tableName = region.getTableNameAsString();
            List<HRegionInfo> value = ret.get(tableName);

            if (value == null) {
                value = new ArrayList<HRegionInfo>();
            }

            value.add(region);
            ret.put(tableName, value);
        }

        return ret;
    }

    /**
     * Rename backup directory to contain start and end date of backup
     * @param dstPath The current path to backup directory
     * @return The final name of the backup directory
     * @throws IOException Thrown if failed to rename backup directory
     */
    private Path appendEndTime(Path dstPath) throws IOException {
        Path finalBackupPath = new Path(dstPath.toString() + "-" + getCurrentDateString());
        dstFs.rename(dstPath, finalBackupPath);
        return finalBackupPath;
    }

    /**
     * Add copied regions to global list of copied regions making sure not to add failed regions
     * @param inputRegions The list of regions that were sent to mappers to copy
     * @param failedRegions The list of regions that failed to be copied. Can be null to indicate no failed regions
     */
    private void addCopiedRegions(List<Pair<String, HRegionInfo>> inputRegions,
        List<Pair<String, HRegionInfo>> failedRegions) {
        for (Pair<String, HRegionInfo> pair : inputRegions) {
            boolean hasFailed = false;
            HRegionInfo region = pair.getSecond();

            if (failedRegions != null) {
                // search for this region among failed regions
                for (int i = 0; i < failedRegions.size() && !hasFailed; i++) {
                    HRegionInfo failedRegion = failedRegions.get(i).getSecond();
                    if (region.equals(failedRegion)) {
                        hasFailed = true;
                    }
                }
            }

            // Add to list of copied regions only if copy didn't fail
            if (!hasFailed) {
                copiedRegions.add(region);
            }
        }
    }

    /**
     * Get a list of regions to copy for next attempt
     * @param oldRegions The list of all regions that were tried in the last MR run
     * @param tables The tables we are backing up
     * @return The list of remaining regions
     * @throws IOException Thrown if failed to read from HBase
     */
    private List<Pair<String, HRegionInfo>> getRemainingRegions(List<Pair<String, HRegionInfo>> oldRegions,
        String[] tables) throws IOException {
        List<Pair<String, HRegionInfo>> ret = new ArrayList<Pair<String, HRegionInfo>>();

        // Get the most current list of regions in .META.
        List<CatalogRow> regionsInMeta = getHBaseRegions(srcConf);

        LOG.info("Calculating remaining regions");

        // Remove those regions from regionsInMeta which are already in copiedRegions (should be most of them)
        // This step retains only the regions which we have yet to copy
        for (int i = 0; i < regionsInMeta.size(); i++) {
            HRegionInfo region = regionsInMeta.get(i).getHRegionInfo();
            if (copiedRegions.contains(region)) {
                regionsInMeta.remove(i);
                --i;
            }
        }

        // Remove regions from regionsInMeta that are daughters of any region in copiedRegions
        for (int i = 0; i < regionsInMeta.size(); i++) {
            CatalogRow r = regionsInMeta.get(i);

            for (HRegionInfo copiedRegion : copiedRegions) {
                if (BackupUtils.regionContains(copiedRegion, r.getHRegionInfo())) {
                    LOG.info("Daughter region : " + r.getHRegionInfo());
                    LOG.info(" Copied parent : " + copiedRegion);
                    regionsInMeta.remove(i);
                    --i;
                    break;
                }
            }
        }

        // Get only the regions for the tables we are extracting
        Map<String, List<CatalogRow>> filtered = groupAndFilter(regionsInMeta, tables);
        for (Entry<String, List<CatalogRow>> entry : filtered.entrySet()) {
            List<CatalogRow> rows = entry.getValue();
            for (CatalogRow r : rows) {
                String regionServer = r.getHost() + ":" + r.getPort();
                HRegionInfo region = r.getHRegionInfo();
                ret.add(Pair.newPair(regionServer, region));
            }
        }

        return ret;
    }

    /**
     * Reads files from retry directory and collects input for the
     * next set of mappers to run
     * @param fs The filesystem to read from
     * @param conf The configuration object
     * @param id The mapper id
     * @return The list of mapper inputs
     * @throws IOException thrown if failed to read from filesystem
     */
    private List<Pair<String, HRegionInfo>> getFailedRegions(FileSystem fs, Configuration conf, int id)
        throws IOException {
        List<Pair<String, HRegionInfo>> ret = new ArrayList<Pair<String, HRegionInfo>>();
        Text rserver = new Text();
        Path retryPath = new Path(getMapOutputDirectory(id));
        FileStatus[] list = fs.listStatus(retryPath);

        if (list != null) {
            LOG.info("Getting failed regions");

            for (FileStatus file : list) {
                Path filePath = file.getPath();

                if (filePath.getName().startsWith("part-")) {
                    LOG.debug("Retry file: " + filePath);
                    SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath, conf);

                    while (reader.next(rserver)) {
                        HRegionInfo rinfo = new HRegionInfo();
                        reader.getCurrentValue(rinfo);
                        LOG.info(rinfo.toString());
                        ret.add(Pair.newPair(rserver.toString(), rinfo));
                    }

                    try {
                        reader.close();
                    } catch (Exception e) {
                        // Ignore error
                    }
                }
            }
        }

        return ret;
    }

    /**
     * Get the MR job configuration object already configured
     * @param srcConf The hdfs source configuration
     * @param dstConf The hdfs destination configuration
     * @param sourceFiles The mapper input files
     * @param dstPath The backup path
     * @param id The internal id of this job
     * @return The configured object
     * @throws IOException If fails to read from file system
     */
    private Job createMRJob(Configuration srcConf, Configuration dstConf, List<Path> sourceFiles, Path dstPath, int id)
        throws IOException {
        srcConf.set(JOBCONF_DEST_HDFS, dstConf.get("fs.default.name"));
        srcConf.set(JOBCONF_DEST_PATH, dstPath.toString());
        srcConf.set(JOBCONF_USER, getUsername());
        srcConf.setInt(JOBCONF_ATTEMPT_ID, id);
        srcConf.setInt(JOBCONF_REPLICATION, getInitialReplication());
        srcConf.setInt("mapred.map.tasks", getNumMapTasks());

        // Don't want multiple mappers copying the same file to the same location
        srcConf.setBoolean("mapred.map.tasks.speculative.execution", false);

        Job job = new Job(srcConf);
        job.setJobName("Backup " + dstPath.getName() + " (Attempt " + (id + 1) + ")");
        job.setJarByClass(Backup.class);
        job.setMapperClass(CopyRegionMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(HRegionInfo.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(BackupInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        for (Path file : sourceFiles) {
            LOG.debug("Map input: " + file.toString());
            FileInputFormat.addInputPath(job, file);
        }

        // Set output path, delete first if it exists
        Path outputPath = new Path(getMapOutputDirectory(id));
        srcFs.delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);

        return job;
    }

    /**
     * Create mapper input files containing their paths to copy
     * @param mapperInput The list of files that the copy mappers should copy
     * @param numMapTasks The number of map tasks
     * @param fs The file system to write to
     * @param id The mapper id
     * @return The list of input files for a mapper
     * @throws IOException If we fail to create input files
     */
    private List<Path> createMapperInputSequenceFiles(List<Pair<String, HRegionInfo>> mapperInput,
        int numMapTasks, FileSystem fs, int id) throws IOException {
        int idx = 0;
        List<Path> paths = new ArrayList<Path>();
        List<SequenceFile.Writer> writers = new ArrayList<SequenceFile.Writer>();
        String inputDir = getMapInputDirectory(id);

        // delete this directory if already exists
        fs.delete(new Path(inputDir), true);

        // each mapper gets an input file
        for (int i = 0; i < numMapTasks; i++) {
            // open the input file for writing
            Path mapInputFile = new Path(inputDir + "/mapper-input-" + i + ".txt");
            fs.delete(mapInputFile, false);
            SequenceFile.Writer writer = SequenceFile.createWriter(fs, fs.getConf(), mapInputFile,
                Text.class, HRegionInfo.class, SequenceFile.CompressionType.NONE);

            LOG.debug("Mapper input: " + mapInputFile);
            paths.add(mapInputFile);
            writers.add(writer);
        }

        // Assign copy paths to mappers
        for (Pair<String, HRegionInfo> pair : mapperInput) {
            Text key = new Text(pair.getFirst());
            HRegionInfo value = new HRegionInfo(pair.getSecond());

            LOG.debug("Appending " + key + ", " + value.getEncodedName());
            writers.get(idx).append(key, value);

            idx++;
            if (idx >= writers.size()) {
                idx = 0;
            }
        }

        // close writers
        for (SequenceFile.Writer writer : writers) {
            try {
                writer.sync();
                writer.close();
            } catch (Exception e) {
                // nothing to do here
            }
        }

        return paths;
    }

    /**
     * Create a new temporary directory and return its path.
     * Guarantees that it does not exist at the moment that this function is called.
     * @param startTime The date time string
     * @return The Path to the temporary directory
     * @throws IOException When failed to read file system
     */
    private Path createBackupDirectory(String startTime) throws IOException {
        Path ret;
        String parent = getBackupStoreDirectory();

        ret = new Path(parent, "bak-" + startTime);
        dstFs.mkdirs(ret);

        return ret;
    }

    /**
     * Returns a string of the current date time in the format yyyyMMdd.kkmmss.SSS
     * @return The date time string
     */
    private String getCurrentDateString() {
        return BackupUtils.BACKUP_DATE_FORMAT.format(new Date());
    }

    /**
     * Group a list of CatalogRow by table name
     * @param regions The list of CatalogRow
     * @param tables The names of the tables to keep.
     *        Anything not listed is filtered out
     * @return A map where the key is the table name and value is a list of CatalogRow objects
     */
    public Map<String, List<CatalogRow>> groupAndFilter(List<CatalogRow> regions, String[] tables) {
        Map<String, List<CatalogRow>> ret = new HashMap<String, List<CatalogRow>>();
        List<HRegionInfo> daughterRegions = new ArrayList<HRegionInfo>();

        // When regions split, the parent region still contains the actual files with the data
        // The daughter regions only contain references, which we will not copy
        // We have to be careful with this case as the parent region could be deleted at any point during the backup
        for (CatalogRow r : regions) {
            HRegionInfo region = r.getHRegionInfo();

            if (region.isSplit()) {
                LOG.warn("Region is split: " + region);
                if (!region.isOffline()) {
                    LOG.warn(" But region is not offline");
                }

                HRegionInfo splitA = r.getSplitA();
                HRegionInfo splitB = r.getSplitB();

                if (splitA != null) {
                    LOG.warn(" Split A: " + splitA);
                    daughterRegions.add(splitA);
                }

                if (splitB != null) {
                    LOG.warn(" Split B: " + splitB);
                    daughterRegions.add(splitB);
                }
            }
        }

        for (CatalogRow r : regions) {
            HRegionInfo region = r.getHRegionInfo();

            // Leave out daughter regions
            if (!daughterRegions.contains(region)) {
                String tName = region.getTableNameAsString();

                if (tables == null || ArrayUtils.contains(tables, tName)) {
                    List<CatalogRow> value = ret.get(tName);

                    if (value == null) {
                        value = new ArrayList<CatalogRow>();
                    }

                    value.add(r);
                    ret.put(tName, value);
                }
            } else {
                LOG.warn("Filtered daughter region: " + region);
            }
        }

        return ret;
    }

    /**
     * Get the list of .tableinfo files to copy.
     * @param fs The file system to get file from
     * @param tableRegions the table regions for which to look for files
     * @return A list of file names
     * @throws IOException When failed to communicate with filesystem
     */
    public List<FileStatus> getTableInfoFiles(FileSystem fs, Map<String, List<CatalogRow>> tableRegions)
        throws IOException {
        List<FileStatus> ret = new ArrayList<FileStatus>();
        String rootDir = fs.getConf().get(HConstants.HBASE_DIR);

        // Get list of files to copy one table at a time
        for (Map.Entry<String, List<CatalogRow>> entry : tableRegions.entrySet()) {
            String tableName = entry.getKey();
            Path tableDirPath = new Path(rootDir, tableName);

            // Add .tableinfo to list of files to copy
            try {
                FileStatus tableInfoFile = BackupUtils.getTableInfoPath(fs, tableDirPath);
                ret.add(tableInfoFile);
            } catch (FileNotFoundException e) {
                // Not sure what to do if we can't find this file
                LOG.warn("No .tableinfo file found for table " + tableName);
            }
        }

        return ret;
    }

    /**
     * Get the list of files to copy.
     * @param fs The file system to get file from
     * @param tableRegions the table regions for which to look for files
     * @return A list of file names
     * @throws IOException When failed to communicate with filesystem
     */
    public List<FileStatus> getListOfFiles(FileSystem fs, Map<String, List<CatalogRow>> tableRegions)
        throws IOException {
        List<FileStatus> ret = new ArrayList<FileStatus>();
        String rootDir = fs.getConf().get(HConstants.HBASE_DIR);

        // Get list of files to copy one table at a time
        for (Map.Entry<String, List<CatalogRow>> entry : tableRegions.entrySet()) {
            String tableName = entry.getKey();
            Path tableDirPath = new Path(rootDir, tableName);

            // Add .tableinfo to list of files to copy
            try {
                FileStatus tableInfoFile = BackupUtils.getTableInfoPath(fs, tableDirPath);
                ret.add(tableInfoFile);
            } catch (FileNotFoundException e) {
                // Not sure what to do if we can't find this file
                LOG.warn("No .tableinfo file found for table " + tableName);
            }

            // Get table descriptor so we may get information about the table we are extracting
            HTableDescriptor tDesc = FSTableDescriptors.getTableDescriptor(fs, tableDirPath);
            if (tDesc == null) {
                throw new TableNotFoundException("Could not get HTableDescriptor for table " + tableName);
            }

            // Need to find out what column families this table has
            // so that we may generate paths to the files we are copying
            HColumnDescriptor[] columnFamilies = tDesc.getColumnFamilies();
            List<CatalogRow> regions = entry.getValue();

            for (CatalogRow r : regions) {
                HRegionInfo info = r.getHRegionInfo();
                String regionName = info.getEncodedName();

                // Add .regioninfo to list of files to copy
                Path regionDirPath = new Path(tableDirPath, regionName);
                Path regionInfoFilePath = new Path(regionDirPath, HRegion.REGIONINFO_FILE);

                try {
                    FileStatus regionInfoFile = fs.getFileStatus(regionInfoFilePath);
                    ret.add(regionInfoFile);
                } catch (FileNotFoundException e) {
                    // Not sure what to do if we can't find this file
                    LOG.warn("No .regioninfo file found for region " + tableName + "/" + regionName);
                }

                for (HColumnDescriptor col : columnFamilies) {
                    String family = col.getNameAsString();
                    Path regionFamilyDirPath = new Path(regionDirPath, family);

                    try {
                        // Add column family directories to make sure
                        // they get copied should they be empty
                        FileStatus dirStatus = fs.getFileStatus(regionFamilyDirPath);
                        ret.add(dirStatus);

                        // Finally, get all the files under this column family
                        FileStatus[] statusList = fs.listStatus(regionFamilyDirPath);
                        if (statusList != null) {
                            for (FileStatus status : statusList) {
                                ret.add(status);
                            }
                        }
                    } catch (FileNotFoundException e) {
                        LOG.warn("Expecting region family directory '" + regionFamilyDirPath + "' but not found");
                    }
                }
            }
        }

        return ret;
    }

    /**
     * Get all regions listed in .META.
     * @param conf the hbase configuration
     * @return List of regions
     * @throws IOException
     */
    public static List<CatalogRow> getHBaseRegions(Configuration conf) throws IOException {
        List<CatalogRow> ret = null;
        HTable meta = getMetaTable(conf);

        Scan metaScanner = new Scan();
        metaScanner.addFamily(Bytes.toBytes("info"));

        ResultScanner metaResults = meta.getScanner(metaScanner);
        try {
            ret = new ArrayList<CatalogRow>();
            for (Result r : metaResults) {
                CatalogRow row = new CatalogRow(r);
                ret.add(row);
            }
        } finally {
            metaResults.close();
        }

        return ret;
    }

    /**
     * Get root table HTable object
     * @param config the hbase config
     * @return The root HTable object
     */
    public static HTable getRootTable(Configuration config) {
        HTable table = null;
        try {
            table = new HTable(config, HConstants.ROOT_TABLE_NAME);
        } catch (IOException e) {
            LOG.error("Could not instantiate -ROOT- HTable");
        }
        return table;
    }

    /**
     * Get meta table HTable object
     * @param config the hbase config
     * @return The meta HTable object
     */
    public static HTable getMetaTable(Configuration config) {
        HTable table = null;
        try {
            table = new HTable(config, HConstants.META_TABLE_NAME);
        } catch (IOException e) {
            LOG.error("Could not instantiate .META. HTable");
        }
        return table;
    }

    /**
     * Get the backup directory path
     * @return the backup directory path
     */
    public Path getBackupDirectoryPath() {
        return this.backupDirectoryPath;
    }

    /**
     * Set the name of user to run as
     * @param username the username to set
     * @throws IllegalArgumentException If username is null or empty
     */
    public void setUsername(String username) throws IllegalArgumentException {
        if (username == null || username.length() == 0) {
            throw new IllegalArgumentException("Username cannot be null or empty");
        }
        this.username = username;
    }

    /**
     * Set the initial replication factor
     * @param replication the replication to set
     * @throws IllegalArgumentException Thrown if initial replication is less than or equal to zero
     */
    public void setInitialReplication(int replication) throws IllegalArgumentException {
        if (replication <= 0) {
            throw new IllegalArgumentException("Initial replication must be greater than zero");
        }
        this.initialReplication = replication;
    }

    /**
     * Set the final replication factor. A value of 0 sets it to configuration default.
     * @param finalReplication The final replication value.
     * @throws IllegalArgumentException Thrown if value is less than 0
     */
    public void setFinalReplication(int finalReplication) throws IllegalArgumentException {
        if (finalReplication < 0) {
            throw new IllegalArgumentException("Final replication must be non-negative");
        } else if (finalReplication > 0) {
            this.finalReplication = finalReplication;
        } else {
            this.finalReplication = dstFs.getDefaultReplication();
        }
    }

    /**
     * Get the final replication factor
     * @return The final replication factor
     */
    public int getFinalReplication() {
        return this.finalReplication;
    }

    /**
     * Set the number of map tasks to run
     * @param numMapTasks the numMapTasks to set
     * @throws IllegalArgumentException If value is not greater than zero
     */
    public void setNumMapTasks(int numMapTasks) throws IllegalArgumentException {
        if (numMapTasks <= 0) {
            throw new IllegalArgumentException("Number of map tasks must be greater than zero");
        }
        this.numMapTasks = numMapTasks;
    }

    /**
     * Get the name of user to run as
     * @return the username
     */
    public String getUsername() {
        return username;
    }

    /**
     * Get the backup storage directory
     * @return the backup storage directory
     */
    public String getBackupStoreDirectory() {
        String ret = this.storeDirectory;

        if (ret == null) {
            ret = BACKUP_STORE_DIR;
            String user = getUsername();
            if (user != null) {
                ret = "/user/" + user + ret;
            }
        }

        return dstFs.getUri().toString() + ret;
    }

    /**
     * Set the backup storage directory for backups
     * @param storeDir Path to store directory
     */
    public void setBackupStoreDirectory(String storeDir) {
        if (storeDir == null) {
            throw new NullPointerException("Backup store directory is null");
        } else if (storeDir.trim().length() == 0) {
            throw new IllegalArgumentException("Invalid DFS path");
        }
        this.storeDirectory = storeDir;
    }

    /**
     * Get the map output directory
     * @param id The mapper id
     * @return The map output directory
     */
    public String getMapOutputDirectory(int id) {
        String ret = BACKUP_MAP_OUT_DIR + "/output-" + id;
        String user = getUsername();
        if (user != null) {
            ret = "/user/" + user + ret;
        }
        return ret;
    }

    /**
     * Get the map input directory
     * @param id The mapper id
     * @return The map input directory
     */
    public String getMapInputDirectory(int id) {
        String ret = BACKUP_MAP_IN_DIR + "/input-" + id;
        String user = getUsername();
        if (user != null) {
            ret = "/user/" + user + ret;
        }
        return ret;
    }

    /**
     * Get the initial replication factor
     * @return the initialReplication
     */
    public int getInitialReplication() {
        return initialReplication;
    }

    /**
     * Get the number of map tasks to run
     * @return the numMapTasks
     */
    public int getNumMapTasks() {
        return numMapTasks;
    }

    /**
     * Returns the command-line options supported.
     *
     * @return the command-line options
     */
    private static Options getOptions() {
        Options options = new Options();

        Option initialreplication = new Option("r", "initialReplication", true,
            "The initial replication factor of copied files. "
            + "Default is 1 because it cuts down the copy time window which helps reduce errors. "
            + "Note: this can cause problems if a block is lost while backup is running.");
        Option finalReplication = new Option("f", "finalReplication", true,
            "The final replication factor of copied files. Default is destination config default. "
            + "This is desirable if initial replication is set low.");
        Option tries = new Option("n", "tries", true,
            "The maximum number of times to attempt to copy regions. Default: 0 (Try until nothing left to copy)");
        Option numMaps = new Option("m", "mappers", true,
            "The number of mappers to run (The number of parallel copiers)");
        Option dst = new Option("d", "destUri", true,
            "Destination URI. Must be an absolute path. Default is /user/<username>/backup. Example "
            + "hdfs://example.com:2020/foo/bar or /foo/bar");
        Option usr = new Option("u", "user", true,
            "The hbase user to use. Default is current user logged in");
        Option tbl = new Option("t", "tables", true,
            "Comma delimited list of tables to backup. Default is to create a backup of all tables");

        initialreplication.setRequired(false);
        finalReplication.setRequired(false);
        tries.setRequired(false);
        numMaps.setRequired(false);
        dst.setRequired(false);
        usr.setRequired(false);
        tbl.setRequired(false);

        options.addOption(initialreplication);
        options.addOption(finalReplication);
        options.addOption(tries);
        options.addOption(numMaps);
        options.addOption(dst);
        options.addOption(usr);
        options.addOption(tbl);

        return options;
    }

    /**
     * Print the available options to the display.
     */
    private static void printOptions() {
        HelpFormatter formatter = new HelpFormatter();
        String header = "Tool to backup an HBase database";
        formatter.printHelp("Backup", header, getOptions(), "", true);
    }
}
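
The tool is normally driven from the command line through main(), using the options defined in getOptions() (-d destination URI, -t tables, -m mappers, -r/-f initial and final replication, -n tries, -u user). It can also be embedded in another Java program. The sketch below is a minimal, hypothetical driver: the class name BackupDriver and the table names are made up for illustration, and it assumes the source and destination HBase/HDFS settings are available on the classpath. It only calls the public API shown above (the constructor, the setters, doMajorCopy, and getBackupDirectoryPath).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.oclc.firefly.hadoop.backup.Backup;

public class BackupDriver {
    public static void main(String[] args) throws Exception {
        // Source and destination configurations; here both come from the local classpath
        Configuration srcConf = HBaseConfiguration.create();
        Configuration dstConf = HBaseConfiguration.create();

        Backup backup = new Backup(srcConf, dstConf);
        backup.setInitialReplication(1); // low replication shortens the copy window
        backup.setFinalReplication(0);   // 0 falls back to the destination default
        backup.setNumMapTasks(4);        // number of parallel copiers

        // Back up two (hypothetical) tables, retrying failed regions until nothing is left (0 = unlimited tries)
        boolean success = backup.doMajorCopy(new String[] {"usertable", "eventlog"}, 0);

        System.out.println((success ? "Backup complete: " : "Backup failed: ")
            + backup.getBackupDirectoryPath());
    }
}

Passing null for the tables argument backs up every table, matching the behavior of main() when -t is not supplied.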