Java tutorial
/* This file is part of VoltDB. * Copyright (C) 2008-2013 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <>. */ package org.voltdb.sysprocs; import; import; import; import; import; import; import; import; import; import; import java.nio.ByteBuffer; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import org.apache.zookeeper_voltpatches.CreateMode; import org.apache.zookeeper_voltpatches.KeeperException; import org.apache.zookeeper_voltpatches.ZooDefs.Ids; import org.apache.zookeeper_voltpatches.ZooKeeper; import org.json_voltpatches.JSONArray; import org.json_voltpatches.JSONException; import org.json_voltpatches.JSONObject; import org.voltcore.logging.VoltLogger; import org.voltcore.messaging.BinaryPayloadMessage; import org.voltcore.messaging.Mailbox; import org.voltcore.messaging.VoltMessage; import org.voltcore.utils.CoreUtils; import org.voltcore.utils.DBBPool.BBContainer; import org.voltcore.zk.ZKUtil.StringCallback; import org.voltdb.ClientResponseImpl; import org.voltdb.DependencyPair; import org.voltdb.ParameterSet; 
import org.voltdb.PrivateVoltTableFactory; import org.voltdb.ProcInfo; import org.voltdb.StartAction; import org.voltdb.StoredProcedureInvocation; import org.voltdb.SystemProcedureExecutionContext; import org.voltdb.TheHashinator; import org.voltdb.VoltDB; import org.voltdb.VoltSystemProcedure; import org.voltdb.VoltTable; import org.voltdb.VoltTable.ColumnInfo; import org.voltdb.VoltType; import org.voltdb.VoltTypeException; import org.voltdb.VoltZK; import org.voltdb.catalog.Database; import org.voltdb.catalog.Table; import org.voltdb.dtxn.DtxnConstants; import org.voltdb.dtxn.SiteTracker; import org.voltdb.export.ExportManager; import org.voltdb.messaging.FragmentResponseMessage; import org.voltdb.messaging.FragmentTaskMessage; import org.voltdb.sysprocs.saverestore.ClusterSaveFileState; import org.voltdb.sysprocs.saverestore.DuplicateRowHandler; import org.voltdb.sysprocs.saverestore.SavedTableConverter; import org.voltdb.sysprocs.saverestore.SnapshotUtil; import org.voltdb.sysprocs.saverestore.TableSaveFile; import org.voltdb.sysprocs.saverestore.TableSaveFileState; import org.voltdb.utils.CatalogUtil; import org.voltdb.utils.CompressionService; import org.voltdb.utils.VoltFile; import org.voltdb.utils.VoltTableUtil; import; import; @ProcInfo(singlePartition = false) public class SnapshotRestore extends VoltSystemProcedure { private static final VoltLogger TRACE_LOG = new VoltLogger(SnapshotRestore.class.getName()); private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT"); private static final VoltLogger CONSOLE_LOG = new VoltLogger("CONSOLE"); private static final int DEP_restoreScan = (int) SysProcFragmentId.PF_restoreScan | DtxnConstants.MULTIPARTITION_DEPENDENCY; private static final int DEP_restoreScanResults = (int) SysProcFragmentId.PF_restoreScanResults; /* * Plan fragments for retrieving the digests * for the snapshot visible at every node. 
Can't be combined * with the other scan because only one result table can be returned * by a plan fragment. */ private static final int DEP_restoreDigestScan = (int) SysProcFragmentId.PF_restoreDigestScan | DtxnConstants.MULTIPARTITION_DEPENDENCY; private static final int DEP_restoreDigestScanResults = (int) SysProcFragmentId.PF_restoreDigestScanResults; /* * Plan fragments for distributing the full set of export sequence numbers * to every partition where the relevant ones can be selected * and forwarded to the EE. Also distributes the txnId of the snapshot * which is used to truncate export data on disk from after the snapshot */ private static final int DEP_restoreDistributeExportAndPartitionSequenceNumbers = (int) SysProcFragmentId.PF_restoreDistributeExportAndPartitionSequenceNumbers | DtxnConstants.MULTIPARTITION_DEPENDENCY; private static final int DEP_restoreDistributeExportAndPartitionSequenceNumbersResults = (int) SysProcFragmentId.PF_restoreDistributeExportAndPartitionSequenceNumbersResults; /* * Plan fragment for entering an asynchronous run loop that generates a mailbox * and sends the generated mailbox id to the MP coordinator which then propagates the info. * The MP coordinator then sends plan fragments through this async mailbox, * bypassing the master/slave replication system that doesn't understand plan fragments * directed at individual executions sites. 
*/
    private static final int DEP_restoreAsyncRunLoop = (int) SysProcFragmentId.PF_restoreAsyncRunLoop
            | DtxnConstants.MULTIPARTITION_DEPENDENCY;
    private static final int DEP_restoreAsyncRunLoopResults = (int) SysProcFragmentId.PF_restoreAsyncRunLoopResults;

    // Names of the tables whose save files have already been opened by initializeTableSaveFiles().
    private static HashSet<String> m_initializedTableSaveFileNames = new HashSet<String>();
    // Opened save files; hasMoreChunks() and getNextChunk() consume them from the head of the queue.
    private static ArrayDeque<TableSaveFile> m_saveFiles = new ArrayDeque<TableSaveFile>();
    // Duplicate row handler for the current restore; null when none is configured.
    private static volatile DuplicateRowHandler m_duplicateRowHandler = null;

    /**
     * Opens the save files needed to distribute one partitioned table and queues them in
     * {@code m_saveFiles}. Only the first caller for a given table name does any work.
     *
     * @param filePath directory passed through to getSaveFileForPartitionedTable
     * @param fileNonce snapshot nonce passed through to getSaveFileForPartitionedTable
     * @param tableName table whose save files should be opened
     * @param originalHostIds host ids whose save files are tried, in order
     * @param relevantPartitionIds partitions that must be covered by the opened files
     * @param st site tracker; the local site count scales the second argument of
     *           getTableSaveFile (presumably a read-ahead depth -- TODO confirm)
     * @throws IOException if a save file cannot be opened
     */
    private static synchronized void initializeTableSaveFiles(String filePath, String fileNonce, String tableName,
            int originalHostIds[], int relevantPartitionIds[], SiteTracker st) throws IOException {
        // This check ensures that only one site per host attempts to
        // distribute this table. @SnapshotRestore sends plan fragments
        // to every site on this host with the tables and partition ID that
        // this host is going to distribute to the cluster. The first
        // execution site to get into this synchronized method is going to
        // 'win', add the table it's doing to this set, and then do the rest
        // of the work. Subsequent sites will just return here.
        if (!m_initializedTableSaveFileNames.add(tableName)) {
            return;
        }

        // To avoid pulling duplicate rows when we have multiple files
        // that contain the data for a partition, we're going to assign
        // all of the partition IDs that were passed in to one and only one
        // TableSaveFile. We'll pull them out of this set as we find
        // files for them, and then once the set is empty we can bail out of
        // this loop. The restore planner called in @SnapshotRestore should
        // ensure that we can, in fact, find files for all these partitions.
        HashSet<Integer> relevantPartitionSet = new HashSet<Integer>();
        for (int part_id : relevantPartitionIds) {
            relevantPartitionSet.add(part_id);
        }

        for (int originalHostId : originalHostIds) {
            final File f = getSaveFileForPartitionedTable(filePath, fileNonce, tableName, originalHostId);
            TableSaveFile savefile = getTableSaveFile(f, st.getLocalSites().length * 4,
                    relevantPartitionSet.toArray(new Integer[relevantPartitionSet.size()]));

            m_saveFiles.offer(savefile);
            // Partitions served by this file must not be requested from any later file.
            for (int part_id : savefile.getPartitionIds()) {
                relevantPartitionSet.remove(part_id);
            }
            if (relevantPartitionSet.isEmpty()) {
                break;
            }
            assert (m_saveFiles.peekLast().getCompleted());
        }
    }

    /**
     * Reports whether any queued save file still has chunks to read. Exhausted files at the
     * head of the queue are closed and removed along the way.
     *
     * @return true if the file now at the head of the queue has more chunks
     * @throws IOException if querying a save file fails
     */
    private static synchronized boolean hasMoreChunks() throws IOException {
        boolean hasMoreChunks = false;
        while (!hasMoreChunks && m_saveFiles.peek() != null) {
            TableSaveFile f = m_saveFiles.peek();
            hasMoreChunks = f.hasMoreChunks();
            if (!hasMoreChunks) {
                try {
                    f.close();
                } catch (IOException e) {
                    // Closing an exhausted file is best effort, but leave a trace of the failure
                    // instead of dropping it silently.
                    SNAP_LOG.warn("Failed to close exhausted snapshot save file during restore: " + e);
                }
                m_saveFiles.poll();
            }
        }
        return hasMoreChunks;
    }

    /**
     * Returns the next chunk from the queued save files, closing and removing files that
     * yield no further chunk.
     *
     * @return the next chunk container, or null once every queued file is exhausted
     * @throws IOException if reading or closing a save file fails
     */
    private static synchronized BBContainer getNextChunk() throws IOException {
        BBContainer c = null;
        while (c == null && m_saveFiles.peek() != null) {
            TableSaveFile f = m_saveFiles.peek();
            c = f.getNextChunk();
            if (c == null) {
                f.close();
                m_saveFiles.poll();
            }
        }
        return c;
    }

    /**
     * Registers every plan fragment handled by executePlanFragment and caches the site id,
     * host id and database for this procedure instance.
     */
    @Override
    public void init() {
        registerPlanFragment(SysProcFragmentId.PF_restoreScan);
        registerPlanFragment(SysProcFragmentId.PF_restoreScanResults);
        registerPlanFragment(SysProcFragmentId.PF_restoreDigestScan);
        registerPlanFragment(SysProcFragmentId.PF_restoreDigestScanResults);
        registerPlanFragment(SysProcFragmentId.PF_restoreDistributeExportAndPartitionSequenceNumbers);
        registerPlanFragment(SysProcFragmentId.PF_restoreDistributeExportAndPartitionSequenceNumbersResults);
        registerPlanFragment(SysProcFragmentId.PF_restoreAsyncRunLoop);
        registerPlanFragment(SysProcFragmentId.PF_restoreAsyncRunLoopResults);
        registerPlanFragment(SysProcFragmentId.PF_restoreLoadTable);
registerPlanFragment(SysProcFragmentId.PF_restoreReceiveResultTables); registerPlanFragment(SysProcFragmentId.PF_restoreLoadReplicatedTable); registerPlanFragment(SysProcFragmentId.PF_restoreDistributeReplicatedTableAsReplicated); registerPlanFragment(SysProcFragmentId.PF_restoreDistributePartitionedTableAsPartitioned); registerPlanFragment(SysProcFragmentId.PF_restoreDistributePartitionedTableAsReplicated); registerPlanFragment(SysProcFragmentId.PF_restoreDistributeReplicatedTableAsPartitioned); m_siteId = CoreUtils.getSiteIdFromHSId(m_site.getCorrespondingSiteId()); m_hostId = m_site.getCorrespondingHostId(); // XXX HACK GIANT HACK given the current assumption that there is // only one database per cluster, I'm asserting this and then // skirting around the need to have the database name in order to get // to the set of tables. --izzy assert (m_cluster.getDatabases().size() == 1); m_database = m_cluster.getDatabases().get("database"); } @Override public DependencyPair executePlanFragment(Map<Integer, List<VoltTable>> dependencies, long fragmentId, ParameterSet params, SystemProcedureExecutionContext context) { if (fragmentId == SysProcFragmentId.PF_restoreDistributeExportAndPartitionSequenceNumbers) { assert (params.toArray()[0] != null); assert (params.toArray().length == 3); assert (params.toArray()[0] instanceof byte[]); assert (params.toArray()[2] instanceof long[]); VoltTable result = new VoltTable(new VoltTable.ColumnInfo("RESULT", VoltType.STRING)); long snapshotTxnId = ((Long) params.toArray()[1]).longValue(); long perPartitionTxnIds[] = (long[]) params.toArray()[2]; /* * Use the per partition txn ids to set the initial txnid value from the snapshot * All the values are sent in, but only the one for the appropriate partition * will be used */ context.getSiteProcedureConnection().setPerPartitionTxnIds(perPartitionTxnIds); // Choose the lowest site ID on this host to truncate export data if (context.isLowestSiteId()) { 
ExportManager.instance().truncateExportToTxnId(snapshotTxnId, perPartitionTxnIds); } try { ByteArrayInputStream bais = new ByteArrayInputStream((byte[]) params.toArray()[0]); ObjectInputStream ois = new ObjectInputStream(bais); //Sequence numbers for every table and partition @SuppressWarnings("unchecked") Map<String, Map<Integer, Long>> exportSequenceNumbers = (Map<String, Map<Integer, Long>>) ois .readObject(); Database db = context.getDatabase(); Integer myPartitionId = context.getPartitionId(); //Iterate the export tables for (Table t : db.getTables()) { if (!CatalogUtil.isTableExportOnly(db, t)) continue; String signature = t.getSignature(); String name = t.getTypeName(); //Sequence numbers for this table for every partition Map<Integer, Long> sequenceNumberPerPartition = exportSequenceNumbers.get(name); if (sequenceNumberPerPartition == null) { SNAP_LOG.warn("Could not find export sequence number for table " + name + ". This warning is safe to ignore if you are loading a pre 1.3 snapshot" + " which would not contain these sequence numbers (added in 1.3)." + " If this is a post 1.3 snapshot then the restore has failed and export sequence " + " are reset to 0"); continue; } Long sequenceNumber = sequenceNumberPerPartition.get(myPartitionId); if (sequenceNumber == null) { SNAP_LOG.warn("Could not find an export sequence number for table " + name + " partition " + myPartitionId + ". This warning is safe to ignore if you are loading a pre 1.3 snapshot " + " which would not contain these sequence numbers (added in 1.3)." 
+ " If this is a post 1.3 snapshot then the restore has failed and export sequence " + " are reset to 0"); continue; } //Forward the sequence number to the EE context.getSiteProcedureConnection().exportAction(true, 0, sequenceNumber, myPartitionId, signature); } } catch (Exception e) { e.printStackTrace();//l4j doesn't print the stack trace SNAP_LOG.error(e); result.addRow("FAILURE"); } return new DependencyPair(DEP_restoreDistributeExportAndPartitionSequenceNumbers, result); } else if (fragmentId == SysProcFragmentId.PF_restoreDistributeExportAndPartitionSequenceNumbersResults) { TRACE_LOG.trace("Aggregating digest scan state"); assert (dependencies.size() > 0); VoltTable result = VoltTableUtil .unionTables(dependencies.get(DEP_restoreDistributeExportAndPartitionSequenceNumbers)); return new DependencyPair(DEP_restoreDistributeExportAndPartitionSequenceNumbersResults, result); } else if (fragmentId == SysProcFragmentId.PF_restoreDigestScan) { VoltTable result = new VoltTable(new VoltTable.ColumnInfo("DIGEST", VoltType.STRING), new VoltTable.ColumnInfo("RESULT", VoltType.STRING), new VoltTable.ColumnInfo("ERR_MSG", VoltType.STRING)); // Choose the lowest site ID on this host to do the file scan // All other sites should just return empty results tables. if (context.isLowestSiteId()) { try { // implicitly synchronized by the way restore operates. 
// this scan must complete on every site and return results // to the coordinator for aggregation before it will send out // distribution fragments, so two sites on the same node // can't be attempting to set and clear this HashSet simultaneously TRACE_LOG.trace( "Checking saved table digest state for restore of: " + m_filePath + ", " + m_fileNonce); List<JSONObject> digests = SnapshotUtil.retrieveDigests(m_filePath, m_fileNonce, SNAP_LOG); for (JSONObject obj : digests) { result.addRow(obj.toString(), "SUCCESS", null); } } catch (Exception e) { StringWriter sw = new StringWriter(); PrintWriter pw = new PrintWriter(sw); e.printStackTrace(pw); pw.flush(); e.printStackTrace();//l4j doesn't print stack traces SNAP_LOG.error(e); result.addRow(null, "FAILURE", sw.toString()); return new DependencyPair(DEP_restoreDigestScan, result); } } return new DependencyPair(DEP_restoreDigestScan, result); } else if (fragmentId == SysProcFragmentId.PF_restoreDigestScanResults) { TRACE_LOG.trace("Aggregating digest scan state"); assert (dependencies.size() > 0); VoltTable result = VoltTableUtil.unionTables(dependencies.get(DEP_restoreDigestScan)); return new DependencyPair(DEP_restoreDigestScanResults, result); } else if (fragmentId == SysProcFragmentId.PF_restoreScan) { assert (params.toArray()[0] != null); assert (params.toArray()[1] != null); String hostname = CoreUtils.getHostnameOrAddress(); VoltTable result = ClusterSaveFileState.constructEmptySaveFileStateVoltTable(); // Choose the lowest site ID on this host to do the file scan // All other sites should just return empty results tables. if (context.isLowestSiteId()) { /* * Initialize a duplicate row handling policy for this restore */ m_duplicateRowHandler = null; if (params.toArray()[2] != null) { m_duplicateRowHandler = new DuplicateRowHandler((String) params.toArray()[2], getTransactionTime()); } // implicitly synchronized by the way restore operates. 
// this scan must complete on every site and return results // to the coordinator for aggregation before it will send out // distribution fragments, so two sites on the same node // can't be attempting to set and clear this HashSet simultaneously m_initializedTableSaveFileNames.clear(); m_saveFiles.clear();//Tests will reused a VoltDB process that fails a restore m_filePath = (String) params.toArray()[0]; m_fileNonce = (String) params.toArray()[1]; TRACE_LOG.trace("Checking saved table state for restore of: " + m_filePath + ", " + m_fileNonce); File[] savefiles = SnapshotUtil.retrieveRelevantFiles(m_filePath, m_fileNonce); if (savefiles == null) { return new DependencyPair(DEP_restoreScan, result); } for (File file : savefiles) { TableSaveFile savefile = null; try { savefile = getTableSaveFile(file, 1, null); try { if (!savefile.getCompleted()) { continue; } String is_replicated = "FALSE"; if (savefile.isReplicated()) { is_replicated = "TRUE"; } int partitionIds[] = savefile.getPartitionIds(); for (int pid : partitionIds) { result.addRow(m_hostId, hostname, savefile.getHostId(), savefile.getHostname(), savefile.getClusterName(), savefile.getDatabaseName(), savefile.getTableName(), savefile.getTxnId(), is_replicated, pid, savefile.getTotalPartitions()); } } finally { savefile.close(); } } catch (FileNotFoundException e) { // retrieveRelevantFiles should always generate a list // of valid present files in m_filePath, so if we end up // getting here, something has gone very weird. e.printStackTrace(); } catch (IOException e) { // For the time being I'm content to treat this as a // missing file and let the coordinator complain if // it discovers that it can't build a consistent // database out of the files it sees available. // // Maybe just a log message? Later. 
e.printStackTrace(); } } } return new DependencyPair(DEP_restoreScan, result); } else if (fragmentId == SysProcFragmentId.PF_restoreScanResults) { TRACE_LOG.trace("Aggregating saved table state"); assert (dependencies.size() > 0); VoltTable result = VoltTableUtil.unionTables(dependencies.get(DEP_restoreScan)); return new DependencyPair(DEP_restoreScanResults, result); } else if (fragmentId == SysProcFragmentId.PF_restoreAsyncRunLoop) { Object paramsArray[] = params.toArray(); assert (paramsArray.length == 1); assert (paramsArray[0] instanceof Long); long coordinatorHSId = (Long) paramsArray[0]; Mailbox m = VoltDB.instance().getHostMessenger().createMailbox(); m_mbox = m; TRACE_LOG.trace("Entering async run loop at " + CoreUtils.hsIdToString(context.getSiteId()) + " listening on mbox " + CoreUtils.hsIdToString(m.getHSId())); /* * Send the generated mailbox id to the coordinator mapping * from the actual execution site id to the mailbox that will * be used for restore */ ByteBuffer responseBuffer = ByteBuffer.allocate(16); responseBuffer.putLong(m_site.getCorrespondingSiteId()); responseBuffer.putLong(m.getHSId()); BinaryPayloadMessage bpm = new BinaryPayloadMessage(new byte[0], responseBuffer.array()); m.send(coordinatorHSId, bpm); bpm = null; /* * Retrieve the mapping from actual site ids * to the site ids generated for mailboxes used for restore * The coordinator will generate this once it has heard from all sites */ while (true) { bpm = (BinaryPayloadMessage) m.recvBlocking(); if (bpm == null) continue; ByteBuffer wrappedMap = ByteBuffer.wrap(bpm.m_payload); while (wrappedMap.hasRemaining()) { long actualHSId = wrappedMap.getLong(); long generatedHSId = wrappedMap.getLong(); m_actualToGenerated.put(actualHSId, generatedHSId); } break; } /* * Loop until the termination signal is received. 
* Execute any plan fragments that
             * are received
             */
            while (true) {
                // Poll with a timeout argument of 1000; a null result means nothing arrived yet.
                VoltMessage vm = m.recvBlocking(1000);
                if (vm == null) {
                    continue;
                }

                if (vm instanceof FragmentTaskMessage) {
                    FragmentTaskMessage ftm = (FragmentTaskMessage) vm;
                    TRACE_LOG.trace(CoreUtils.hsIdToString(context.getSiteId()) + " received fragment id "
                            + VoltSystemProcedure.hashToFragId(ftm.getPlanHash(0)));
                    DependencyPair dp = m_runner.executeSysProcPlanFragment(m_runner.getTxnState(), null,
                            VoltSystemProcedure.hashToFragId(ftm.getPlanHash(0)),
                            ftm.getParameterSetForFragment(0));
                    // Reply from the restore mailbox so the response carries this mailbox's HSId.
                    FragmentResponseMessage frm = new FragmentResponseMessage(ftm, m.getHSId());
                    frm.addDependency(dp.depId, dp.dependency);
                    m.send(ftm.getCoordinatorHSId(), frm);
                } else if (vm instanceof BinaryPayloadMessage) {
                    // Any binary payload received in this loop ends it.
                    if (context.isLowestSiteId() && m_duplicateRowHandler != null) {
                        try {
                            m_duplicateRowHandler.close();
                        } catch (Exception e) {
                            VoltDB.crashLocalVoltDB("Error closing duplicate row handler during snapshot restore",
                                    true, e);
                        }
                    }
                    //Null result table is intentional
                    //The results of the process are propagated through a future in performTableRestoreWork
                    // NOTE(review): the table returned here is constructResultsTable(), not null;
                    // the comment above looks stale -- confirm.
                    return new DependencyPair(DEP_restoreAsyncRunLoop, constructResultsTable());
                }
            }
        } else if (fragmentId == SysProcFragmentId.PF_restoreAsyncRunLoopResults) {
            return new DependencyPair(DEP_restoreAsyncRunLoopResults, constructResultsTable());
        }

        // called by: performDistributeReplicatedTable() and performDistributePartitionedTable
        // handle all 4 LOADING tasks:
        // 1. load a replicated table as replicated table
        // 2. load a partitioned table as replicated table
        // 3. load a partitioned table as partitioned table (need to check unique violation)
        // 4.
load a partitioned table as replicated table (need to check unique violation) else if (fragmentId == SysProcFragmentId.PF_restoreLoadTable) { // the last parameter could be null for the replicatedToReplicated case // and this parameter is used for log only for both load as replicated cases assert (params.toArray()[0] != null); assert (params.toArray()[1] != null); assert (params.toArray()[2] != null); assert (params.toArray()[3] != null); String table_name = (String) params.toArray()[0]; int dependency_id = (Integer) params.toArray()[1]; byte compressedTable[] = (byte[]) params.toArray()[2]; int checkUniqueViolations = (Integer) params.toArray()[3]; int[] partition_ids = (int[]) params.toArray()[4]; if (checkUniqueViolations > 0) { assert (partition_ids != null && partition_ids.length == 1); } TRACE_LOG.trace("Received table: " + table_name + (partition_ids == null ? "[REPLICATED]" : "of partition [" + partition_ids.toString()) + "]"); String result_str = "SUCCESS"; String error_msg = ""; try { VoltTable table = PrivateVoltTableFactory.createVoltTableFromBuffer( ByteBuffer.wrap(CompressionService.decompressBytes(compressedTable)), true); if (checkUniqueViolations > 0) { byte uniqueViolations[] = voltLoadTable(context.getCluster().getTypeName(), context.getDatabase().getTypeName(), table_name, table, m_duplicateRowHandler != null); if (uniqueViolations != null && m_duplicateRowHandler != null) { m_duplicateRowHandler.handleDuplicates(table_name, uniqueViolations); } } else { voltLoadTable(context.getCluster().getTypeName(), context.getDatabase().getTypeName(), table_name, table, false); } } catch (Exception e) { result_str = "FAILURE"; error_msg = e.getMessage(); } VoltTable result = constructResultsTable(); result.addRow(m_hostId, CoreUtils.getHostnameOrAddress(), CoreUtils.getSiteIdFromHSId(m_siteId), table_name, ((checkUniqueViolations > 0) ? 
partition_ids[0] : -1), result_str, error_msg); return new DependencyPair(dependency_id, result); } else if (fragmentId == SysProcFragmentId.PF_restoreReceiveResultTables) { assert (params.toArray()[0] != null); assert (params.toArray()[1] != null); int dependency_id = (Integer) params.toArray()[0]; String tracingLogMsg = (String) params.toArray()[1]; TRACE_LOG.trace(tracingLogMsg); List<VoltTable> table_list = new ArrayList<VoltTable>(); for (int dep_id : dependencies.keySet()) { table_list.addAll(dependencies.get(dep_id)); } assert (table_list.size() == dependencies.size()); VoltTable result = VoltTableUtil.unionTables(table_list); return new DependencyPair(dependency_id, result); } else if (fragmentId == SysProcFragmentId.PF_restoreLoadReplicatedTable) { assert (params.toArray()[0] != null); assert (params.toArray()[1] != null); String table_name = (String) params.toArray()[0]; int dependency_id = (Integer) params.toArray()[1]; TRACE_LOG.trace("Loading replicated table: " + table_name); String result_str = "SUCCESS"; String error_msg = ""; TableSaveFile savefile = null; /** * For replicated tables this will do the slow thing and read the file * once for each ExecutionSite. This could use optimization like * is done with the partitioned tables. 
*/ try { savefile = getTableSaveFile(getSaveFileForReplicatedTable(table_name), 3, null); assert (savefile.getCompleted()); } catch (IOException e) { String hostname = CoreUtils.getHostnameOrAddress(); VoltTable result = constructResultsTable(); result.addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), table_name, -1, "FAILURE", "Unable to load table: " + table_name + " error: " + e.getMessage()); return new DependencyPair(dependency_id, result); } try { final Table new_catalog_table = getCatalogTable(table_name); Boolean needsConversion = null; while (savefile.hasMoreChunks()) { VoltTable table = null; final org.voltcore.utils.DBBPool.BBContainer c = savefile.getNextChunk(); if (c == null) { continue;//Should be equivalent to break } if (needsConversion == null) { VoltTable old_table = PrivateVoltTableFactory.createVoltTableFromBuffer(c.b.duplicate(), true); needsConversion = SavedTableConverter.needsConversion(old_table, new_catalog_table); } if (needsConversion.booleanValue()) { VoltTable old_table = PrivateVoltTableFactory.createVoltTableFromBuffer(c.b, true); table = SavedTableConverter.convertTable(old_table, new_catalog_table); } else { ByteBuffer copy = ByteBuffer.allocate(c.b.remaining()); copy.put(c.b); copy.flip(); table = PrivateVoltTableFactory.createVoltTableFromBuffer(copy, true); } c.discard(); try { voltLoadTable(context.getCluster().getTypeName(), context.getDatabase().getTypeName(), table_name, table, false); } catch (VoltAbortException e) { result_str = "FAILURE"; error_msg = e.getMessage(); break; } } } catch (IOException e) { String hostname = CoreUtils.getHostnameOrAddress(); VoltTable result = constructResultsTable(); result.addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), table_name, -1, "FAILURE", "Unable to load table: " + table_name + " error: " + e.getMessage()); return new DependencyPair(dependency_id, result); } catch (VoltTypeException e) { String hostname = CoreUtils.getHostnameOrAddress(); VoltTable 
result = constructResultsTable(); result.addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), table_name, -1, "FAILURE", "Unable to load table: " + table_name + " error: " + e.getMessage()); return new DependencyPair(dependency_id, result); } String hostname = CoreUtils.getHostnameOrAddress(); VoltTable result = constructResultsTable(); result.addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), table_name, -1, result_str, error_msg); try { savefile.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return new DependencyPair(dependency_id, result); } else if (fragmentId == SysProcFragmentId.PF_restoreDistributeReplicatedTableAsReplicated) { // XXX I tested this with a hack that cannot be replicated // in a unit test since it requires hacks to this sysproc that // effectively break it assert (params.toArray()[0] != null); assert (params.toArray()[1] != null); assert (params.toArray()[2] != null); String table_name = (String) params.toArray()[0]; long site_id = (Long) params.toArray()[1]; int dependency_id = (Integer) params.toArray()[2]; TRACE_LOG.trace(CoreUtils.hsIdToString(context.getSiteId()) + " distributing replicated table: " + table_name + " to: " + CoreUtils.hsIdToString(site_id)); VoltTable result = performDistributeReplicatedTable(table_name, context, site_id, false); return new DependencyPair(dependency_id, result); } else if (fragmentId == SysProcFragmentId.PF_restoreDistributePartitionedTableAsPartitioned) { Object paramsA[] = params.toArray(); assert (paramsA[0] != null); assert (paramsA[1] != null); assert (paramsA[2] != null); assert (paramsA[3] != null); String table_name = (String) paramsA[0]; int originalHosts[] = (int[]) paramsA[1]; int relevantPartitions[] = (int[]) paramsA[2]; int dependency_id = (Integer) paramsA[3]; for (int partition_id : relevantPartitions) { TRACE_LOG.trace("Distributing partitioned table: " + table_name + " partition id: " + partition_id); } VoltTable 
result = performDistributePartitionedTable(table_name, originalHosts, relevantPartitions, context, false); return new DependencyPair(dependency_id, result); } else if (fragmentId == SysProcFragmentId.PF_restoreDistributePartitionedTableAsReplicated) { Object paramsA[] = params.toArray(); assert (paramsA[0] != null); assert (paramsA[1] != null); assert (paramsA[2] != null); assert (paramsA[3] != null); String table_name = (String) paramsA[0]; int originalHosts[] = (int[]) paramsA[1]; int relevantPartitions[] = (int[]) paramsA[2]; int dependency_id = (Integer) paramsA[3]; for (int partition_id : relevantPartitions) { TRACE_LOG.trace("Loading partitioned-to-replicated table: " + table_name + " partition id: " + partition_id); } VoltTable result = performDistributePartitionedTable(table_name, originalHosts, relevantPartitions, context, true); return new DependencyPair(dependency_id, result); } else if (fragmentId == SysProcFragmentId.PF_restoreDistributeReplicatedTableAsPartitioned) { assert (params.toArray()[0] != null); assert (params.toArray()[1] != null); String table_name = (String) params.toArray()[0]; int dependency_id = (Integer) params.toArray()[1]; TRACE_LOG.trace("Loading replicated-to-partitioned table: " + table_name); VoltTable result = performDistributeReplicatedTable(table_name, context, -1, true); return new DependencyPair(dependency_id, result); } assert (false); return null; } public static final String JSON_PATH = "path"; public static final String JSON_NONCE = "nonce"; public static final String JSON_DUPLICATES_PATH = "duplicatesPath"; public VoltTable[] run(SystemProcedureExecutionContext ctx, String json) throws Exception { JSONObject jsObj = new JSONObject(json); final String path = jsObj.getString(JSON_PATH); final String nonce = jsObj.getString(JSON_NONCE); final String dupsPath = jsObj.optString(JSON_DUPLICATES_PATH, null); final long startTime = System.currentTimeMillis(); if (dupsPath != null) {"Restoring from path: " + path + " with nonce: 
" + nonce + " and duplicate rows will be output to " + dupsPath); } else {"Restoring from path: " + path + " with nonce: " + nonce); } // Fetch all the savefile metadata from the cluster VoltTable[] savefile_data; savefile_data = performRestoreScanWork(path, nonce, dupsPath); List<JSONObject> digests; Map<String, Map<Integer, Long>> exportSequenceNumbers; long perPartitionTxnIds[]; try { DigestScanResult digestScanResult = performRestoreDigestScanWork(); digests = digestScanResult.digests; exportSequenceNumbers = digestScanResult.exportSequenceNumbers; perPartitionTxnIds = digestScanResult.perPartitionTxnIds; if (perPartitionTxnIds.length == 0) { perPartitionTxnIds = new long[] { ctx.getCurrentTxnId() }; } } catch (VoltAbortException e) { ColumnInfo[] result_columns = new ColumnInfo[2]; int ii = 0; result_columns[ii++] = new ColumnInfo("RESULT", VoltType.STRING); result_columns[ii++] = new ColumnInfo("ERR_MSG", VoltType.STRING); VoltTable results[] = new VoltTable[] { new VoltTable(result_columns) }; results[0].addRow("FAILURE", e.toString()); noteOperationalFailure("Restore failed to complete. 
See response table for additional info."); return results; } ClusterSaveFileState savefile_state = null; try { savefile_state = new ClusterSaveFileState(savefile_data[0]); } catch (IOException e) { throw new VoltAbortException(e.getMessage()); } HashSet<String> relevantTableNames = new HashSet<String>(); try { if (digests.isEmpty()) { throw new Exception("No digests found"); } for (JSONObject obj : digests) { JSONArray tables = obj.getJSONArray("tables"); for (int ii = 0; ii < tables.length(); ii++) { relevantTableNames.add(tables.getString(ii)); } } } catch (Exception e) { ColumnInfo[] result_columns = new ColumnInfo[2]; int ii = 0; result_columns[ii++] = new ColumnInfo("RESULT", VoltType.STRING); result_columns[ii++] = new ColumnInfo("ERR_MSG", VoltType.STRING); VoltTable results[] = new VoltTable[] { new VoltTable(result_columns) }; results[0].addRow("FAILURE", e.toString()); noteOperationalFailure("Restore failed to complete. See response table for additional info."); return results; } assert (relevantTableNames != null); assert (relevantTableNames.size() > 0); // ENG-1078: I think this giant for/if block is only good for // checking if there are no files for a table listed in the digest. // There appear to be redundant checks for that, and then the per-table // consistency check is preempted by the ClusterSaveFileState constructor // called above. 
VoltTable[] results = null; for (String tableName : relevantTableNames) { if (!savefile_state.getSavedTableNames().contains(tableName)) { if (results == null) { ColumnInfo[] result_columns = new ColumnInfo[2]; int ii = 0; result_columns[ii++] = new ColumnInfo("RESULT", VoltType.STRING); result_columns[ii++] = new ColumnInfo("ERR_MSG", VoltType.STRING); results = new VoltTable[] { new VoltTable(result_columns) }; } results[0].addRow("FAILURE", "Save data contains no information for table " + tableName); break; } final TableSaveFileState saveFileState = savefile_state.getTableState(tableName); if (saveFileState == null) { // Pretty sure this is unreachable // See ENG-1078 if (results == null) { ColumnInfo[] result_columns = new ColumnInfo[2]; int ii = 0; result_columns[ii++] = new ColumnInfo("RESULT", VoltType.STRING); result_columns[ii++] = new ColumnInfo("ERR_MSG", VoltType.STRING); results = new VoltTable[] { new VoltTable(result_columns) }; } results[0].addRow("FAILURE", "Save data contains no information for table " + tableName); } else if (!saveFileState.isConsistent()) { // Also pretty sure this is unreachable // See ENG-1078 if (results == null) { ColumnInfo[] result_columns = new ColumnInfo[2]; int ii = 0; result_columns[ii++] = new ColumnInfo("RESULT", VoltType.STRING); result_columns[ii++] = new ColumnInfo("ERR_MSG", VoltType.STRING); results = new VoltTable[] { new VoltTable(result_columns) }; } results[0].addRow("FAILURE", saveFileState.getConsistencyResult()); } } if (results != null) { noteOperationalFailure("Restore failed to complete. See response table for additional info."); return results; } // Post a notice that a restore has started OR notice a prior post that one has started // and exit rather than attempt to start another. // Ideally this happens only just before any serious restore work begins. 
// If it comes too soon, a first attempt at restore that fails early sanity checks // wastes the one shot for a successful restore until the next cluster restart. // If it comes too late, a second attempt to restore could needlessly spin its wheels or even // start significant work before being called off by the detection of the prior notice. // A possible alternative would be to do this earlier, but then "undo" the post in cases where the // restore failed but left the database in a state that could reasonable be restored by a later attempt. try { VoltDB.instance().getHostMessenger().getZK().create(VoltZK.restoreMarker, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } catch (KeeperException.NodeExistsException e) { throw new VoltAbortException("Cluster has already been restored or has failed a restore." + " Restart the cluster before doing another restore."); } /* * This list stores all the partition transaction ids ever seen even if the partition * is no longer present. The values from here are added to snapshot digests to propagate * partitions that were remove/add several time by SnapshotSave. * * Only the partitions that are no longer part of the cluster will have their ids retrieved, * those that are active will populate their current values manually because they change after startup * * This is necessary to make sure that sequence numbers never go backwards as a result of a partition * being removed and then added back by save restore sequences. 
* * They will be retrieved from ZK by the snapshot daemon * and passed to @SnapshotSave which will use it to fill in transaction ids for * partitions that are no longer present */ ByteBuffer buf = ByteBuffer.allocate(perPartitionTxnIds.length * 8 + 4); buf.putInt(perPartitionTxnIds.length); for (long txnid : perPartitionTxnIds) { buf.putLong(txnid); } VoltDB.instance().getHostMessenger().getZK().create(VoltZK.perPartitionTxnIds, buf.array(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); /* * Serialize all the export sequence numbers and then distribute them in a * plan fragment and each receiver will pull the relevant information for * itself */ try { ByteArrayOutputStream baos = new ByteArrayOutputStream(); ObjectOutputStream oos = new ObjectOutputStream(baos); oos.writeObject(exportSequenceNumbers); oos.flush(); byte exportSequenceNumberBytes[] = baos.toByteArray(); oos.close(); /* * Also set the perPartitionTxnIds locally at the multi-part coordinator. * The coord will have to forward this value to all the idle coordinators. 
*/ ctx.getSiteProcedureConnection().setPerPartitionTxnIds(perPartitionTxnIds); results = performDistributeExportSequenceNumbers(exportSequenceNumberBytes, digests.get(0).getLong("txnId"), perPartitionTxnIds); } catch (IOException e) { throw new VoltAbortException(e); } catch (JSONException e) { throw new VoltAbortException(e); } while (results[0].advanceRow()) { if (results[0].getString("RESULT").equals("FAILURE")) { throw new VoltAbortException("Error distributing export sequence numbers"); } } results = performTableRestoreWork(savefile_state, ctx.getSiteTrackerForSnapshot()); final long endTime = System.currentTimeMillis(); final double duration = (endTime - startTime) / 1000.0; final StringWriter sw = new StringWriter(); final PrintWriter pw = new PrintWriter(sw); pw.toString(); pw.printf("%.2f", duration); "Finished restore of " + path + " with nonce: " + nonce + " in " + sw.toString() + " seconds"); // m_sampler.setShouldStop(); // try { // m_sampler.join(); // } catch (InterruptedException e) { // e.printStackTrace(); // } /* * ENG-1858, make data loaded by snapshot restore durable * immediately by starting a truncation snapshot if * the command logging is enabled and the database start action * was create */ final StartAction startAction = VoltDB.instance().getConfig().m_startAction; final org.voltdb.OperationMode mode = VoltDB.instance().getMode(); /* * Is this the start action and no recovery is being performed. The mode * will not be INITIALIZING, it will PAUSED or RUNNING. If that is the case, * we do want a truncation snapshot if CL is enabled. 
*/ final boolean isStartWithNoAutomatedRestore = startAction == StartAction.CREATE && mode != org.voltdb.OperationMode.INITIALIZING; final boolean isCLEnabled = VoltDB.instance().getCommandLog().getClass().getSimpleName() .equals("CommandLogImpl"); final boolean isStartedWithCreateAction = startAction == StartAction.CREATE; if (isCLEnabled && (isStartedWithCreateAction || isStartWithNoAutomatedRestore)) { final ZooKeeper zk = VoltDB.instance().getHostMessenger().getZK();"Requesting truncation snapshot to make data loaded by snapshot restore durable."); zk.create(VoltZK.request_truncation_snapshot, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, new StringCallback() { @Override public void processResult(int rc, String path, Object ctx, String name) { if (rc != 0) { KeeperException.Code code = KeeperException.Code.get(rc); if (code != KeeperException.Code.NODEEXISTS) { SNAP_LOG.warn( "Don't expect this ZK response when requesting a truncation snapshot " + code); } } } }, null); } return results; }

    /*
     * Runs a two-fragment plan: a multi-partition fragment that is handed the
     * serialized export sequence numbers, the snapshot txnId and the
     * per-partition txn ids, followed by a single aggregating fragment that
     * consumes the first fragment's dependency. Returns the tables produced
     * for the aggregating dependency.
     */
    private VoltTable[] performDistributeExportSequenceNumbers(byte[] exportSequenceNumberBytes, long txnId,
            long perPartitionTxnIds[]) {
        // Multi-partition fragment carrying the values to distribute
        final SynthesizedPlanFragment distribute = new SynthesizedPlanFragment();
        distribute.fragmentId = SysProcFragmentId.PF_restoreDistributeExportAndPartitionSequenceNumbers;
        distribute.outputDepId = DEP_restoreDistributeExportAndPartitionSequenceNumbers;
        distribute.inputDepIds = new int[] {};
        distribute.multipartition = true;
        distribute.parameters = ParameterSet.fromArrayNoCopy(exportSequenceNumberBytes, txnId,
                perPartitionTxnIds);

        // Aggregating fragment fed by the distribution fragment's output
        final SynthesizedPlanFragment aggregate = new SynthesizedPlanFragment();
        aggregate.fragmentId = SysProcFragmentId.PF_restoreDistributeExportAndPartitionSequenceNumbersResults;
        aggregate.outputDepId = DEP_restoreDistributeExportAndPartitionSequenceNumbersResults;
        aggregate.inputDepIds = new int[] { DEP_restoreDistributeExportAndPartitionSequenceNumbers };
        aggregate.multipartition = false;
        aggregate.parameters = ParameterSet.emptyParameterSet();

        final SynthesizedPlanFragment[] plan = new SynthesizedPlanFragment[] { distribute, aggregate };
        return executeSysProcPlanFragments(plan, DEP_restoreDistributeExportAndPartitionSequenceNumbersResults);
    }

    /*
     * Builds an empty result table with the seven columns used to report
     * per-site restore outcomes (host, hostname, site, table, partition,
     * result, error message).
     */
    private VoltTable constructResultsTable() {
        final ColumnInfo[] columns = new ColumnInfo[] {
            new ColumnInfo(CNAME_HOST_ID, CTYPE_ID),
            new ColumnInfo("HOSTNAME", VoltType.STRING),
            new ColumnInfo(CNAME_SITE_ID, CTYPE_ID),
            new ColumnInfo("TABLE", VoltType.STRING),
            new ColumnInfo(CNAME_PARTITION_ID, CTYPE_ID),
            new ColumnInfo("RESULT", VoltType.STRING),
            new ColumnInfo("ERR_MSG", VoltType.STRING)
        };
        return new VoltTable(columns);
    }

    /*
     * Save file of a replicated table: <nonce>-<table>.vpt under m_filePath.
     */
    private File getSaveFileForReplicatedTable(String tableName) {
        final String fileName = new StringBuilder(m_fileNonce)
                .append("-")
                .append(tableName)
                .append(".vpt")
                .toString();
        return new VoltFile(m_filePath, fileName);
    }

    /*
     * Save file of a partitioned table as written by one original host:
     * <nonce>-<table>-host_<originalHostId>.vpt under filePath.
     */
    private static File getSaveFileForPartitionedTable(String filePath, String fileNonce, String tableName,
            int originalHostId) {
        final String fileName = new StringBuilder(fileNonce)
                .append("-")
                .append(tableName)
                .append("-host_")
                .append(originalHostId)
                .append(".vpt")
                .toString();
        return new VoltFile(filePath, fileName);
    }

    /*
     * Opens saveFile and wraps its channel in a TableSaveFile.
     *
     * On success the stream is intentionally left open, since the returned
     * TableSaveFile was handed its channel (presumably it owns the channel
     * from then on -- verify against TableSaveFile). If constructing the
     * TableSaveFile fails, nobody else holds the stream, so it is closed here
     * before the failure propagates.
     *
     * Throws IOException if the file cannot be opened or the TableSaveFile
     * constructor reports one.
     */
    private static TableSaveFile getTableSaveFile(File saveFile, int readAheadChunks,
            Integer relevantPartitionIds[]) throws IOException {
        @SuppressWarnings("resource")
        FileInputStream savefile_input = new FileInputStream(saveFile);
        TableSaveFile savefile;
        boolean constructed = false;
        try {
            savefile = new TableSaveFile(savefile_input.getChannel(), readAheadChunks, relevantPartitionIds);
            constructed = true;
        } finally {
            if (!constructed) {
                try {
                    savefile_input.close();
                } catch (IOException ignored) {
                    // keep the original construction failure as the reported error
                }
            }
        }
        return
savefile; } /* * Block the execution site thread distributing the async mailbox fragment. * Has to be done from this thread because it uses the existing plumbing * that pops into the EE to do stats periodically and that relies on thread locals */ private final VoltTable[] distributeAsyncMailboxFragment(final long coordinatorHSId) { SynthesizedPlanFragment[] pfs = new SynthesizedPlanFragment[2]; //This fragment causes every ES to generate a mailbox and //enter an async run loop to do restore work out of that mailbox pfs[0] = new SynthesizedPlanFragment(); pfs[0].fragmentId = SysProcFragmentId.PF_restoreAsyncRunLoop; pfs[0].outputDepId = DEP_restoreAsyncRunLoop; pfs[0].inputDepIds = new int[] {}; pfs[0].multipartition = true; pfs[0].parameters = ParameterSet.fromArrayNoCopy(coordinatorHSId); // This fragment aggregates the save-to-disk sanity check results pfs[1] = new SynthesizedPlanFragment(); pfs[1].fragmentId = SysProcFragmentId.PF_restoreAsyncRunLoopResults; pfs[1].outputDepId = DEP_restoreAsyncRunLoopResults; pfs[1].inputDepIds = new int[] { DEP_restoreAsyncRunLoop }; pfs[1].multipartition = false; pfs[1].parameters = ParameterSet.emptyParameterSet(); return executeSysProcPlanFragments(pfs, DEP_restoreAsyncRunLoopResults); } private final VoltTable[] performRestoreScanWork(String filePath, String fileNonce, String dupsPath) { SynthesizedPlanFragment[] pfs = new SynthesizedPlanFragment[2]; // This fragment causes each execution site to confirm the likely // success of writing tables to disk pfs[0] = new SynthesizedPlanFragment(); pfs[0].fragmentId = SysProcFragmentId.PF_restoreScan; pfs[0].outputDepId = DEP_restoreScan; pfs[0].inputDepIds = new int[] {}; pfs[0].multipartition = true; pfs[0].parameters = ParameterSet.fromArrayNoCopy(filePath, fileNonce, dupsPath); // This fragment aggregates the save-to-disk sanity check results pfs[1] = new SynthesizedPlanFragment(); pfs[1].fragmentId = SysProcFragmentId.PF_restoreScanResults; pfs[1].outputDepId = 
DEP_restoreScanResults; pfs[1].inputDepIds = new int[] { DEP_restoreScan }; pfs[1].multipartition = false; pfs[1].parameters = ParameterSet.emptyParameterSet(); VoltTable[] results; results = executeSysProcPlanFragments(pfs, DEP_restoreScanResults); return results; } private static class DigestScanResult { List<JSONObject> digests; Map<String, Map<Integer, Long>> exportSequenceNumbers; long perPartitionTxnIds[]; } private final DigestScanResult performRestoreDigestScanWork() { SynthesizedPlanFragment[] pfs = new SynthesizedPlanFragment[2]; // This fragment causes each execution site to confirm the likely // success of writing tables to disk pfs[0] = new SynthesizedPlanFragment(); pfs[0].fragmentId = SysProcFragmentId.PF_restoreDigestScan; pfs[0].outputDepId = DEP_restoreDigestScan; pfs[0].inputDepIds = new int[] {}; pfs[0].multipartition = true; pfs[0].parameters = ParameterSet.emptyParameterSet(); // This fragment aggregates the save-to-disk sanity check results pfs[1] = new SynthesizedPlanFragment(); pfs[1].fragmentId = SysProcFragmentId.PF_restoreDigestScanResults; pfs[1].outputDepId = DEP_restoreDigestScanResults; pfs[1].inputDepIds = new int[] { DEP_restoreDigestScan }; pfs[1].multipartition = false; pfs[1].parameters = ParameterSet.emptyParameterSet(); VoltTable[] results; results = executeSysProcPlanFragments(pfs, DEP_restoreDigestScanResults); HashMap<String, Map<Integer, Long>> exportSequenceNumbers = new HashMap<String, Map<Integer, Long>>(); Long digestTxnId = null; ArrayList<JSONObject> digests = new ArrayList<JSONObject>(); Set<Long> perPartitionTxnIds = new HashSet<Long>(); /* * Retrieve and aggregate the per table per partition sequence numbers from * all the digest files retrieved across the cluster */ try { while (results[0].advanceRow()) { if (results[0].getString("RESULT").equals("FAILURE")) { throw new VoltAbortException(results[0].getString("ERR_MSG")); } JSONObject digest = new JSONObject(results[0].getString(0)); digests.add(digest); /* * 
Validate that the digests are all from the same snapshot */ if (digestTxnId == null) { digestTxnId = digest.getLong("txnId"); } else { if (digest.getLong("txnId") != digestTxnId) { throw new VoltAbortException("Retrieved a digest with txnId " + digest.getLong("txnId") + " that doesn't match the txnId seen previously " + digestTxnId + " inspect the digests" + " with the provided nonce and ensure that they are all really from the same snapshot"); } } /* * Snapshots from pre 1.3 VoltDB won't have sequence numbers * Doing nothing will default it to zero. */ if (digest.has("exportSequenceNumbers")) { /* * An array of entries for each table */ JSONArray sequenceNumbers = digest.getJSONArray("exportSequenceNumbers"); for (int ii = 0; ii < sequenceNumbers.length(); ii++) { /* * An object containing all the sequence numbers for its partitions * in this table. This will be a subset since it is from a single digest */ JSONObject tableSequenceNumbers = sequenceNumbers.getJSONObject(ii); String tableName = tableSequenceNumbers.getString("exportTableName"); Map<Integer, Long> partitionSequenceNumbers = exportSequenceNumbers.get(tableName); if (partitionSequenceNumbers == null) { partitionSequenceNumbers = new HashMap<Integer, Long>(); exportSequenceNumbers.put(tableName, partitionSequenceNumbers); } /* * Array of objects containing partition and sequence number pairs */ JSONArray sourcePartitionSequenceNumbers = tableSequenceNumbers .getJSONArray("sequenceNumberPerPartition"); for (int zz = 0; zz < sourcePartitionSequenceNumbers.length(); zz++) { int partition = sourcePartitionSequenceNumbers.getJSONObject(zz).getInt("partition"); long sequenceNumber = sourcePartitionSequenceNumbers.getJSONObject(zz) .getInt("exportSequenceNumber"); partitionSequenceNumbers.put(partition, sequenceNumber); } } } if (digest.has("partitionTransactionIds")) { JSONObject partitionTxnIds = digest.getJSONObject("partitionTransactionIds"); @SuppressWarnings("unchecked") Iterator<String> keys = 
partitionTxnIds.keys(); while (keys.hasNext()) { perPartitionTxnIds.add(partitionTxnIds.getLong(; } } } } catch (JSONException e) { throw new VoltAbortException(e); } DigestScanResult result = new DigestScanResult(); result.digests = digests; result.exportSequenceNumbers = exportSequenceNumbers; result.perPartitionTxnIds = Longs.toArray(perPartitionTxnIds); return result; } private Set<Table> getTablesToRestore(Set<String> savedTableNames) { Set<Table> tables_to_restore = new HashSet<Table>(); for (Table table : m_database.getTables()) { if (savedTableNames.contains(table.getTypeName())) { if (table.getMaterializer() == null) { tables_to_restore.add(table); } else { // LOG_TRIAGE reconsider info level here?"Table: " + table.getTypeName() + " was saved " + "but is now a materialized table and will " + "not be loaded from disk"); } } else { if (table.getMaterializer() == null && !CatalogUtil.isTableExportOnly(m_database, table)) {"Table: " + table.getTypeName() + " does not have " + "any savefile data and so will not be loaded " + "from disk"); } } } // XXX consider logging the list of tables that were saved but not // in the current catalog return tables_to_restore; } private VoltTable[] performTableRestoreWork(final ClusterSaveFileState savefileState, final SiteTracker st) throws Exception { /* * Create a mailbox to use to send fragment work to execution sites */ final Mailbox m = VoltDB.instance().getHostMessenger().createMailbox(); /* * Create a separate thread to do the work of coordinating the restore * while this execution sites's thread (or the MP coordinator in IV2) * is blocked in distributing the async mailbox plan fragment. It * has to be threaded this way because invoking the async mailbox plan fragment * enters the EE to service stats stuff which relies on thread locals. 
*/ ExecutorService es = Executors.newSingleThreadExecutor(CoreUtils.getThreadFactory("Snapshot Restore")); Future<VoltTable[]> ft = es.submit(new Callable<VoltTable[]>() { @Override public VoltTable[] call() throws Exception { int discoveredMailboxes = 0; int totalMailboxes = st.m_numberOfExecutionSites; /* * First two loops handle picking up the generated mailbox ids * and then distributing the entire map to all sites * so they can convert between actual site ids to mailbox ids * used for restore */ Map<Long, Long> actualToGenerated = new HashMap<Long, Long>(); while (discoveredMailboxes < totalMailboxes) { BinaryPayloadMessage bpm = (BinaryPayloadMessage) m.recvBlocking(); if (bpm == null) continue; discoveredMailboxes++; ByteBuffer payload = ByteBuffer.wrap(bpm.m_payload); long actualHSId = payload.getLong(); long asyncMailboxHSId = payload.getLong(); actualToGenerated.put(actualHSId, asyncMailboxHSId); } ByteBuffer generatedToActualBuf = ByteBuffer.allocate(actualToGenerated.size() * 16); for (Map.Entry<Long, Long> e : actualToGenerated.entrySet()) { generatedToActualBuf.putLong(e.getKey()); generatedToActualBuf.putLong(e.getValue()); } for (Long generatedHSId : actualToGenerated.values()) { BinaryPayloadMessage bpm = new BinaryPayloadMessage(new byte[0], Arrays.copyOf(generatedToActualBuf.array(), generatedToActualBuf.capacity())); m.send(generatedHSId, bpm); } /* * Do the usual restore planning to generate the plan fragments for execution at each * site */ Set<Table> tables_to_restore = getTablesToRestore(savefileState.getSavedTableNames()); VoltTable[] restore_results = new VoltTable[1]; restore_results[0] = constructResultsTable(); ArrayList<SynthesizedPlanFragment[]> restorePlans = new ArrayList<SynthesizedPlanFragment[]>(); for (Table t : tables_to_restore) { TableSaveFileState table_state = savefileState.getTableState(t.getTypeName()); SynthesizedPlanFragment[] restore_plan = table_state.generateRestorePlan(t, st); if (restore_plan == null) { 
SNAP_LOG.error( "Unable to generate restore plan for " + t.getTypeName() + " table not restored"); throw new VoltAbortException( "Unable to generate restore plan for " + t.getTypeName() + " table not restored"); } restorePlans.add(restore_plan); } /* * Now distribute the plan fragments for restoring each table. */ Iterator<Table> tableIterator = tables_to_restore.iterator(); for (SynthesizedPlanFragment[] restore_plan : restorePlans) { Table table =; TRACE_LOG.trace("Performing restore for table: " + table.getTypeName()); TRACE_LOG.trace("Plan has fragments: " + restore_plan.length); for (int ii = 0; ii < restore_plan.length - 1; ii++) { restore_plan[ii].siteId = actualToGenerated.get(restore_plan[ii].siteId); } /* * This isn't ye olden executeSysProcPlanFragments. It uses the provided mailbox * and has it's own tiny run loop to process incoming fragments. */ VoltTable[] results = executeSysProcPlanFragments(restore_plan, m); while (results[0].advanceRow()) { // this will actually add the active row of results[0] restore_results[0].add(results[0]); // if any table at any site fails... then the whole proc fails if (results[0].getString("RESULT").equalsIgnoreCase("FAILURE")) { noteOperationalFailure( "Restore failed to complete. See response table for additional info."); } } } /* * Send a termination message. This will cause the async mailbox plan fragment to stop * executing allowing the coordinator thread to get back to work. */ for (long hsid : actualToGenerated.values()) { BinaryPayloadMessage bpm = new BinaryPayloadMessage(new byte[0], new byte[0]); m.send(hsid, bpm); } return restore_results; } }); /* * Distribute the task of doing the async run loop * for restore. It will block on generating the response from the end of the run loop * the response doesn't contain any information */ distributeAsyncMailboxFragment(m.getHSId()); //Wait for the thread that was created to terminate to prevent concurrent access. 
//It should already have finished if distributeAsyncMailboxFragment returned //because that means that the term message was sent VoltTable restore_results[] = ft.get(); es.shutdown(); es.awaitTermination(365, TimeUnit.DAYS); return restore_results; } // XXX I hacked up a horrible one-off in my world to test this code. // I believe that it will work for at least one new node, but // there's not a good way to add a unit test for this at the moment, // so the emma coverage is weak. private VoltTable performDistributeReplicatedTable(String tableName, SystemProcedureExecutionContext ctx, // only used in replicated-to-partitioned case long siteId, // only used in replicated-to-replicated case boolean asPartitioned) { String hostname = CoreUtils.getHostnameOrAddress(); TableSaveFile savefile = null; try { savefile = getTableSaveFile(getSaveFileForReplicatedTable(tableName), 3, null); assert (savefile.getCompleted()); } catch (IOException e) { VoltTable result = constructResultsTable(); result.addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), tableName, -1, "FAILURE", "Unable to load table: " + tableName + " error: " + e.getMessage()); return result; } VoltTable[] results = new VoltTable[] { constructResultsTable() }; results[0].addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), tableName, -1, "SUCCESS", "NO DATA TO DISTRIBUTE"); final Table new_catalog_table = getCatalogTable(tableName); Boolean needsConversion = null; try { while (savefile.hasMoreChunks()) { VoltTable table = null; final org.voltcore.utils.DBBPool.BBContainer c = savefile.getNextChunk(); if (c == null) { continue; // Should be equivalent to break } if (needsConversion == null) { VoltTable old_table = PrivateVoltTableFactory.createVoltTableFromBuffer(c.b.duplicate(), true); needsConversion = SavedTableConverter.needsConversion(old_table, new_catalog_table); } final VoltTable old_table = PrivateVoltTableFactory.createVoltTableFromBuffer(c.b, true); if (needsConversion) { 
table = SavedTableConverter.convertTable(old_table, new_catalog_table); } else { table = old_table; } SynthesizedPlanFragment[] pfs = null; if (asPartitioned) { byte[][] partitioned_tables = createPartitionedTables(tableName, table, ctx.getNumberOfPartitions()); Map<Long, Integer> sites_to_partitions = new HashMap<Long, Integer>(); SiteTracker tracker = ctx.getSiteTrackerForSnapshot(); sites_to_partitions.putAll(tracker.getSitesToPartitions()); int[] dependencyIds = new int[sites_to_partitions.size()]; pfs = new SynthesizedPlanFragment[sites_to_partitions.size() + 1]; int pfs_index = 0; for (long site_id : sites_to_partitions.keySet()) { int partition_id = sites_to_partitions.get(site_id); dependencyIds[pfs_index] = TableSaveFileState.getNextDependencyId(); SynthesizedPlanFragment loadFragment = new SynthesizedPlanFragment(); loadFragment.fragmentId = SysProcFragmentId.PF_restoreLoadTable; loadFragment.siteId = m_actualToGenerated.get(site_id); loadFragment.multipartition = false; loadFragment.outputDepId = dependencyIds[pfs_index]; loadFragment.inputDepIds = new int[] {}; loadFragment.parameters = ParameterSet.fromArrayNoCopy(tableName, dependencyIds[pfs_index], partitioned_tables[partition_id], 1, new int[] { partition_id }); pfs[pfs_index++] = loadFragment; } int result_dependency_id = TableSaveFileState.getNextDependencyId(); SynthesizedPlanFragment aggregatorFragment = new SynthesizedPlanFragment(); aggregatorFragment.fragmentId = SysProcFragmentId.PF_restoreReceiveResultTables; aggregatorFragment.multipartition = false; aggregatorFragment.outputDepId = result_dependency_id; aggregatorFragment.inputDepIds = dependencyIds; aggregatorFragment.parameters = ParameterSet.fromArrayNoCopy(result_dependency_id, "Received confirmation of successful partitioned-to-replicated table load"); pfs[sites_to_partitions.size()] = aggregatorFragment; } else { byte compressedTable[] = table.getCompressedBytes(); pfs = new SynthesizedPlanFragment[2]; int result_dependency_id = 
TableSaveFileState.getNextDependencyId(); pfs[0] = new SynthesizedPlanFragment(); pfs[0].fragmentId = SysProcFragmentId.PF_restoreLoadTable; pfs[0].siteId = m_actualToGenerated.get(siteId); pfs[0].outputDepId = result_dependency_id; pfs[0].inputDepIds = new int[] {}; pfs[0].multipartition = false; pfs[0].parameters = ParameterSet.fromArrayNoCopy(tableName, result_dependency_id, compressedTable, 0, null); int final_dependency_id = TableSaveFileState.getNextDependencyId(); pfs[1] = new SynthesizedPlanFragment(); pfs[1].fragmentId = SysProcFragmentId.PF_restoreReceiveResultTables; pfs[1].outputDepId = final_dependency_id; pfs[1].inputDepIds = new int[] { result_dependency_id }; pfs[1].multipartition = false; pfs[1].parameters = ParameterSet.fromArrayNoCopy(final_dependency_id, "Received confirmation of successful replicated table load at " + siteId); TRACE_LOG.trace("Sending replicated table: " + tableName + " to site id:" + siteId); } c.discard(); results = executeSysProcPlanFragments(pfs, m_mbox); } } catch (Exception e) { VoltTable result = PrivateVoltTableFactory.createUninitializedVoltTable(); result = constructResultsTable(); result.addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), tableName, -1, "FAILURE", "Unable to load table: " + tableName + " error: " + e.getMessage()); return result; } return results[0]; } private VoltTable performDistributePartitionedTable(String tableName, int originalHostIds[], int relevantPartitionIds[], SystemProcedureExecutionContext ctx, boolean asReplicated) { String hostname = CoreUtils.getHostnameOrAddress(); // XXX This is all very similar to the splitting code in // LoadMultipartitionTable. 
// Consider ways to consolidate later
        Map<Long, Integer> sites_to_partitions = new HashMap<Long, Integer>();
        SiteTracker tracker = ctx.getSiteTrackerForSnapshot();
        sites_to_partitions.putAll(tracker.getSitesToPartitions());

        try {
            initializeTableSaveFiles(m_filePath, m_fileNonce, tableName, originalHostIds, relevantPartitionIds,
                    tracker);
        } catch (IOException e) {
            VoltTable result = constructResultsTable();
            result.addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), tableName,
                    relevantPartitionIds[0], "FAILURE",
                    "Unable to load table: " + tableName + " error: " + e.getMessage());
            return result;
        }

        VoltTable[] results = new VoltTable[] { constructResultsTable() };
        results[0].addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), tableName, 0, "SUCCESS",
                "NO DATA TO DISTRIBUTE");

        final Table new_catalog_table = getCatalogTable(tableName);
        Boolean needsConversion = null;
        org.voltcore.utils.DBBPool.BBContainer c = null;
        try {
            while (hasMoreChunks()) {
                VoltTable table = null;
                c = getNextChunk();
                if (c == null) {
                    continue;//Should be equivalent to break
                }

                // Decide once, from the first chunk, whether saved rows need converting
                if (needsConversion == null) {
                    VoltTable old_table = PrivateVoltTableFactory.createVoltTableFromBuffer(c.b.duplicate(), true);
                    needsConversion = SavedTableConverter.needsConversion(old_table, new_catalog_table);
                }

                final VoltTable old_table = PrivateVoltTableFactory.createVoltTableFromBuffer(c.b, true);
                if (needsConversion) {
                    table = SavedTableConverter.convertTable(old_table, new_catalog_table);
                } else {
                    table = old_table;
                }

                // use if will load as partitioned table
                byte[][] partitioned_tables = null;
                // use if will load as replicated table
                byte compressedTable[] = null;
                if (asReplicated) {
                    compressedTable = table.getCompressedBytes();
                } else {
                    partitioned_tables = createPartitionedTables(tableName, table, ctx.getNumberOfPartitions());
                }
                // c is known to be non-null here; its contents were copied above
                c.discard();

                // one load fragment per site plus one aggregator fragment
                int[] dependencyIds = new int[sites_to_partitions.size()];
                SynthesizedPlanFragment[] pfs = new SynthesizedPlanFragment[sites_to_partitions.size() + 1];
                int pfs_index = 0;
                for (long site_id : sites_to_partitions.keySet()) {
                    dependencyIds[pfs_index] = TableSaveFileState.getNextDependencyId();

                    SynthesizedPlanFragment loadFragment = new SynthesizedPlanFragment();
                    loadFragment.fragmentId = SysProcFragmentId.PF_restoreLoadTable;
                    loadFragment.siteId = m_actualToGenerated.get(site_id);
                    loadFragment.multipartition = false;
                    loadFragment.outputDepId = dependencyIds[pfs_index];
                    loadFragment.inputDepIds = new int[] {};
                    if (asReplicated) {
                        loadFragment.parameters = ParameterSet.fromArrayNoCopy(tableName, dependencyIds[pfs_index],
                                compressedTable, 0, relevantPartitionIds);
                    } else {
                        int partition_id = sites_to_partitions.get(site_id);
                        loadFragment.parameters = ParameterSet.fromArrayNoCopy(tableName, dependencyIds[pfs_index],
                                partitioned_tables[partition_id], 1, new int[] { partition_id });
                    }
                    pfs[pfs_index++] = loadFragment;
                }

                int result_dependency_id = TableSaveFileState.getNextDependencyId();
                SynthesizedPlanFragment aggregatorFragment = new SynthesizedPlanFragment();
                aggregatorFragment.fragmentId = SysProcFragmentId.PF_restoreReceiveResultTables;
                aggregatorFragment.multipartition = false;
                aggregatorFragment.outputDepId = result_dependency_id;
                aggregatorFragment.inputDepIds = dependencyIds;
                if (asReplicated) {
                    aggregatorFragment.parameters = ParameterSet.fromArrayNoCopy(result_dependency_id,
                            "Received confirmation of successful partitioned-to-replicated table load");
                } else {
                    aggregatorFragment.parameters = ParameterSet.fromArrayNoCopy(result_dependency_id,
                            "Received confirmation of successful partitioned-to-partitioned table load");
                }
                pfs[sites_to_partitions.size()] = aggregatorFragment;
                results = executeSysProcPlanFragments(pfs, m_mbox);
            }
        } catch (Exception e) {
            // Report the failure as a result row instead of propagating it
            VoltTable result = constructResultsTable();
            result.addRow(m_hostId, hostname, CoreUtils.getSiteIdFromHSId(m_siteId), tableName,
relevantPartitionIds[0], "FAILURE", "Unable to load table: " + tableName + " error: " + e.getMessage()); return result; } return results[0]; } private byte[][] createPartitionedTables(String tableName, VoltTable loadedTable, int number_of_partitions) throws Exception { Table catalog_table = m_database.getTables().getIgnoreCase(tableName); assert (!catalog_table.getIsreplicated()); // XXX blatantly stolen from LoadMultipartitionTable // find the index and type of the partitioning attribute int partition_col = catalog_table.getPartitioncolumn().getIndex(); VoltType partition_type = VoltType.get((byte) catalog_table.getPartitioncolumn().getType()); // create a table for each partition VoltTable[] partitioned_tables = new VoltTable[number_of_partitions]; for (int i = 0; i < partitioned_tables.length; i++) { partitioned_tables[i] = loadedTable.clone(loadedTable.getUnderlyingBufferSize() / number_of_partitions); } // split the input table into per-partition units while (loadedTable.advanceRow()) { int partition = 0; try { partition = TheHashinator.hashToPartition(loadedTable.get(partition_col, partition_type)); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException(e.getMessage()); } // this adds the active row of loadedTable partitioned_tables[partition].add(loadedTable); } byte compressedTables[][] = new byte[number_of_partitions][]; for (int ii = 0; ii < compressedTables.length; ii++) { compressedTables[ii] = partitioned_tables[ii].getCompressedBytes(); } return compressedTables; } private Table getCatalogTable(String tableName) { return m_database.getTables().get(tableName); } /* * Do parameter checking for the pre-JSON version of @SnapshotRestore old version */ public static ClientResponseImpl transformRestoreParamsToJSON(StoredProcedureInvocation task) { Object params[] = task.getParams().toArray(); if (params.length == 1) { return null; } else if (params.length == 2) { if (params[0] == null) { return new 
ClientResponseImpl(ClientResponseImpl.GRACEFUL_FAILURE, new VoltTable[0], "@SnapshotRestore parameter 0 was null", task.getClientHandle()); } if (params[1] == null) { return new ClientResponseImpl(ClientResponseImpl.GRACEFUL_FAILURE, new VoltTable[0], "@SnapshotRestore parameter 1 was null", task.getClientHandle()); } if (!(params[0] instanceof String)) { return new ClientResponseImpl(ClientResponseImpl.GRACEFUL_FAILURE, new VoltTable[0], "@SnapshotRestore param 0 (path) needs to be a string, but was type " + params[0].getClass().getSimpleName(), task.getClientHandle()); } if (!(params[1] instanceof String)) { return new ClientResponseImpl(ClientResponseImpl.GRACEFUL_FAILURE, new VoltTable[0], "@SnapshotRestore param 1 (nonce) needs to be a string, but was type " + params[1].getClass().getSimpleName(), task.getClientHandle()); } JSONObject jsObj = new JSONObject(); try { jsObj.put(SnapshotRestore.JSON_PATH, (String) params[0]); jsObj.put(SnapshotRestore.JSON_NONCE, (String) params[1]); } catch (JSONException e) { Throwables.propagate(e); } task.setParams(jsObj.toString()); return null; } else { return new ClientResponseImpl(ClientResponseImpl.GRACEFUL_FAILURE, new VoltTable[0], "@SnapshotRestore supports a single json document parameter or two parameters (path, nonce), " + params.length + " parameters provided", task.getClientHandle()); } } private Mailbox m_mbox; private final Map<Long, Long> m_actualToGenerated = new HashMap<Long, Long>(); private Database m_database; private long m_siteId; private int m_hostId; private static volatile String m_filePath; private static volatile String m_fileNonce; }