Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.accumulo.server.tabletserver; import static org.apache.accumulo.server.problems.ProblemType.TABLET_LOAD; import java.io.FileNotFoundException; import java.io.IOException; import java.lang.management.GarbageCollectorMXBean; import java.lang.management.ManagementFactory; import java.net.Socket; import java.net.UnknownHostException; import java.nio.ByteBuffer; import java.security.SecureRandom; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.EnumMap; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TimerTask; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingDeque; import java.util.concurrent.CancellationException; import java.util.concurrent.ExecutionException; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.RunnableFuture; import java.util.concurrent.ThreadPoolExecutor; import 
java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import javax.management.ObjectName; import javax.management.StandardMBean; import org.apache.accumulo.core.Constants; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; import org.apache.accumulo.core.client.Instance; import org.apache.accumulo.core.client.impl.CompressedIterators; import org.apache.accumulo.core.client.impl.CompressedIterators.IterConfig; import org.apache.accumulo.core.client.impl.ScannerImpl; import org.apache.accumulo.core.client.impl.TabletType; import org.apache.accumulo.core.client.impl.Translator; import org.apache.accumulo.core.client.impl.Translator.TKeyExtentTranslator; import org.apache.accumulo.core.client.impl.Translator.TRangeTranslator; import org.apache.accumulo.core.client.impl.thrift.SecurityErrorCode; import org.apache.accumulo.core.client.impl.thrift.ThriftSecurityException; import org.apache.accumulo.core.conf.AccumuloConfiguration; import org.apache.accumulo.core.conf.Property; import org.apache.accumulo.core.constraints.Constraint.Environment; import org.apache.accumulo.core.constraints.Violations; import org.apache.accumulo.core.data.Column; import org.apache.accumulo.core.data.ConstraintViolationSummary; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.KeyExtent; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.data.thrift.InitialMultiScan; import org.apache.accumulo.core.data.thrift.InitialScan; import org.apache.accumulo.core.data.thrift.IterInfo; import org.apache.accumulo.core.data.thrift.MapFileInfo; import 
org.apache.accumulo.core.data.thrift.MultiScanResult; import org.apache.accumulo.core.data.thrift.ScanResult; import org.apache.accumulo.core.data.thrift.TCMResult; import org.apache.accumulo.core.data.thrift.TCMStatus; import org.apache.accumulo.core.data.thrift.TColumn; import org.apache.accumulo.core.data.thrift.TCondition; import org.apache.accumulo.core.data.thrift.TConditionalMutation; import org.apache.accumulo.core.data.thrift.TConditionalSession; import org.apache.accumulo.core.data.thrift.TKey; import org.apache.accumulo.core.data.thrift.TKeyExtent; import org.apache.accumulo.core.data.thrift.TKeyValue; import org.apache.accumulo.core.data.thrift.TMutation; import org.apache.accumulo.core.data.thrift.TRange; import org.apache.accumulo.core.data.thrift.UpdateErrors; import org.apache.accumulo.core.iterators.IterationInterruptedException; import org.apache.accumulo.core.master.thrift.Compacting; import org.apache.accumulo.core.master.thrift.MasterClientService; import org.apache.accumulo.core.master.thrift.TableInfo; import org.apache.accumulo.core.master.thrift.TabletLoadState; import org.apache.accumulo.core.master.thrift.TabletServerStatus; import org.apache.accumulo.core.metadata.MetadataTable; import org.apache.accumulo.core.metadata.RootTable; import org.apache.accumulo.core.metadata.schema.MetadataSchema.TabletsSection; import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.security.SecurityUtil; import org.apache.accumulo.core.security.thrift.TCredentials; import org.apache.accumulo.core.tabletserver.thrift.ActiveCompaction; import org.apache.accumulo.core.tabletserver.thrift.ActiveScan; import org.apache.accumulo.core.tabletserver.thrift.ConstraintViolationException; import org.apache.accumulo.core.tabletserver.thrift.NoSuchScanIDException; import org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException; import org.apache.accumulo.core.tabletserver.thrift.ScanState; import 
org.apache.accumulo.core.tabletserver.thrift.ScanType; import org.apache.accumulo.core.tabletserver.thrift.TabletClientService; import org.apache.accumulo.core.tabletserver.thrift.TabletClientService.Iface; import org.apache.accumulo.core.tabletserver.thrift.TabletClientService.Processor; import org.apache.accumulo.core.tabletserver.thrift.TabletStats; import org.apache.accumulo.core.util.ByteBufferUtil; import org.apache.accumulo.core.util.ColumnFQ; import org.apache.accumulo.core.util.Daemon; import org.apache.accumulo.core.util.LoggingRunnable; import org.apache.accumulo.core.util.MapCounter; import org.apache.accumulo.core.util.Pair; import org.apache.accumulo.core.util.ServerServices; import org.apache.accumulo.core.util.ServerServices.Service; import org.apache.accumulo.core.util.SimpleThreadPool; import org.apache.accumulo.core.util.Stat; import org.apache.accumulo.core.util.ThriftUtil; import org.apache.accumulo.core.util.UtilWaitThread; import org.apache.accumulo.core.zookeeper.ZooUtil; import org.apache.accumulo.fate.zookeeper.IZooReaderWriter; import org.apache.accumulo.fate.zookeeper.ZooLock.LockLossReason; import org.apache.accumulo.fate.zookeeper.ZooLock.LockWatcher; import org.apache.accumulo.fate.zookeeper.ZooUtil.NodeExistsPolicy; import org.apache.accumulo.server.Accumulo; import org.apache.accumulo.server.ServerConstants; import org.apache.accumulo.server.ServerOpts; import org.apache.accumulo.server.client.ClientServiceHandler; import org.apache.accumulo.server.client.HdfsZooInstance; import org.apache.accumulo.server.conf.ServerConfiguration; import org.apache.accumulo.server.conf.TableConfiguration; import org.apache.accumulo.server.data.ServerConditionalMutation; import org.apache.accumulo.server.data.ServerMutation; import org.apache.accumulo.server.fs.FileRef; import org.apache.accumulo.server.fs.VolumeManager; import org.apache.accumulo.server.fs.VolumeManager.FileType; import org.apache.accumulo.server.fs.VolumeManagerImpl; import 
org.apache.accumulo.server.master.state.Assignment; import org.apache.accumulo.server.master.state.DistributedStoreException; import org.apache.accumulo.server.master.state.TServerInstance; import org.apache.accumulo.server.master.state.TabletLocationState; import org.apache.accumulo.server.master.state.TabletLocationState.BadLocationStateException; import org.apache.accumulo.server.master.state.TabletStateStore; import org.apache.accumulo.server.master.state.ZooTabletStateStore; import org.apache.accumulo.server.metrics.AbstractMetricsImpl; import org.apache.accumulo.server.problems.ProblemReport; import org.apache.accumulo.server.problems.ProblemReports; import org.apache.accumulo.server.security.AuditedSecurityOperation; import org.apache.accumulo.server.security.SecurityOperation; import org.apache.accumulo.server.security.SystemCredentials; import org.apache.accumulo.server.tabletserver.Compactor.CompactionInfo; import org.apache.accumulo.server.tabletserver.RowLocks.RowLock; import org.apache.accumulo.server.tabletserver.Tablet.CommitSession; import org.apache.accumulo.server.tabletserver.Tablet.KVEntry; import org.apache.accumulo.server.tabletserver.Tablet.LookupResult; import org.apache.accumulo.server.tabletserver.Tablet.MajorCompactionReason; import org.apache.accumulo.server.tabletserver.Tablet.MinorCompactionReason; import org.apache.accumulo.server.tabletserver.Tablet.ScanBatch; import org.apache.accumulo.server.tabletserver.Tablet.Scanner; import org.apache.accumulo.server.tabletserver.Tablet.SplitInfo; import org.apache.accumulo.server.tabletserver.Tablet.TConstraintViolationException; import org.apache.accumulo.server.tabletserver.Tablet.TabletClosedException; import org.apache.accumulo.server.tabletserver.TabletServerResourceManager.TabletResourceManager; import org.apache.accumulo.server.tabletserver.TabletStatsKeeper.Operation; import org.apache.accumulo.server.tabletserver.log.DfsLogger; import 
org.apache.accumulo.server.tabletserver.log.LogSorter; import org.apache.accumulo.server.tabletserver.log.MutationReceiver; import org.apache.accumulo.server.tabletserver.log.TabletServerLogger; import org.apache.accumulo.server.tabletserver.mastermessage.MasterMessage; import org.apache.accumulo.server.tabletserver.mastermessage.SplitReportMessage; import org.apache.accumulo.server.tabletserver.mastermessage.TabletStatusMessage; import org.apache.accumulo.server.tabletserver.metrics.TabletServerMBean; import org.apache.accumulo.server.tabletserver.metrics.TabletServerMinCMetrics; import org.apache.accumulo.server.tabletserver.metrics.TabletServerScanMetrics; import org.apache.accumulo.server.tabletserver.metrics.TabletServerUpdateMetrics; import org.apache.accumulo.server.util.FileSystemMonitor; import org.apache.accumulo.server.util.Halt; import org.apache.accumulo.server.util.MetadataTableUtil; import org.apache.accumulo.server.util.MetadataTableUtil.LogEntry; import org.apache.accumulo.server.util.TServerUtils; import org.apache.accumulo.server.util.TServerUtils.ServerAddress; import org.apache.accumulo.server.util.time.RelativeTime; import org.apache.accumulo.server.util.time.SimpleTimer; import org.apache.accumulo.server.zookeeper.DistributedWorkQueue; import org.apache.accumulo.server.zookeeper.TransactionWatcher; import org.apache.accumulo.server.zookeeper.ZooCache; import org.apache.accumulo.server.zookeeper.ZooLock; import org.apache.accumulo.server.zookeeper.ZooReaderWriter; import org.apache.accumulo.start.classloader.vfs.AccumuloVFSClassLoader; import org.apache.accumulo.start.classloader.vfs.ContextManager; import org.apache.accumulo.trace.instrument.Span; import org.apache.accumulo.trace.instrument.Trace; import org.apache.accumulo.trace.instrument.thrift.TraceWrap; import org.apache.accumulo.trace.thrift.TInfo; import org.apache.commons.collections.map.LRUMap; import org.apache.hadoop.fs.FSError; import org.apache.hadoop.fs.FileSystem; import 
org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; import org.apache.thrift.TException; import org.apache.thrift.TProcessor; import org.apache.thrift.TServiceClient; import org.apache.thrift.server.TServer; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException.NoNodeException; import com.google.common.net.HostAndPort; enum ScanRunState { QUEUED, RUNNING, FINISHED } public class TabletServer extends AbstractMetricsImpl implements org.apache.accumulo.server.tabletserver.metrics.TabletServerMBean { private static final Logger log = Logger.getLogger(TabletServer.class); private static HashMap<String, Long> prevGcTime = new HashMap<String, Long>(); private static long lastMemorySize = 0; private static long gcTimeIncreasedCount; private static final long MAX_TIME_TO_WAIT_FOR_SCAN_RESULT_MILLIS = 1000; private static final long RECENTLY_SPLIT_MILLIES = 60 * 1000; private TabletServerLogger logger; protected TabletServerMinCMetrics mincMetrics = new TabletServerMinCMetrics(); private ServerConfiguration serverConfig; private LogSorter logSorter = null; public TabletServer(ServerConfiguration conf, VolumeManager fs) { super(); this.serverConfig = conf; this.instance = conf.getInstance(); this.fs = fs; this.logSorter = new LogSorter(instance, fs, getSystemConfiguration()); SimpleTimer.getInstance().schedule(new Runnable() { @Override public void run() { synchronized (onlineTablets) { long now = System.currentTimeMillis(); for (Tablet tablet : onlineTablets.values()) try { tablet.updateRates(now); } catch (Exception ex) { log.error(ex, ex); } } } }, 5000, 5000); } private synchronized static void logGCInfo(AccumuloConfiguration conf) { List<GarbageCollectorMXBean> gcmBeans = ManagementFactory.getGarbageCollectorMXBeans(); Runtime rt = Runtime.getRuntime(); StringBuilder sb = new StringBuilder("gc"); boolean sawChange = false; long maxIncreaseInCollectionTime = 0; for (GarbageCollectorMXBean gcBean : 
gcmBeans) { Long prevTime = prevGcTime.get(gcBean.getName()); long pt = 0; if (prevTime != null) { pt = prevTime; } long time = gcBean.getCollectionTime(); if (time - pt != 0) { sawChange = true; } long increaseInCollectionTime = time - pt; sb.append(String.format(" %s=%,.2f(+%,.2f) secs", gcBean.getName(), time / 1000.0, increaseInCollectionTime / 1000.0)); maxIncreaseInCollectionTime = Math.max(increaseInCollectionTime, maxIncreaseInCollectionTime); prevGcTime.put(gcBean.getName(), time); } long mem = rt.freeMemory(); if (maxIncreaseInCollectionTime == 0) { gcTimeIncreasedCount = 0; } else { gcTimeIncreasedCount++; if (gcTimeIncreasedCount > 3 && mem < rt.maxMemory() * 0.05) { log.warn("Running low on memory"); gcTimeIncreasedCount = 0; } } if (mem > lastMemorySize) { sawChange = true; } String sign = "+"; if (mem - lastMemorySize <= 0) { sign = ""; } sb.append(String.format(" freemem=%,d(%s%,d) totalmem=%,d", mem, sign, (mem - lastMemorySize), rt.totalMemory())); if (sawChange) { log.debug(sb.toString()); } final long keepAliveTimeout = conf.getTimeInMillis(Property.INSTANCE_ZK_TIMEOUT); if (maxIncreaseInCollectionTime > keepAliveTimeout) { Halt.halt("Garbage collection may be interfering with lock keep-alive. 
Halting.", -1); } lastMemorySize = mem; } private TabletStatsKeeper statsKeeper; private static class Session { long lastAccessTime; long startTime; String user; String client = TServerUtils.clientAddress.get(); public boolean reserved; public void cleanup() { } } private static class SessionManager { SecureRandom random; Map<Long, Session> sessions; long maxIdle; SessionManager(AccumuloConfiguration conf) { random = new SecureRandom(); sessions = new HashMap<Long, Session>(); maxIdle = conf.getTimeInMillis(Property.TSERV_SESSION_MAXIDLE); Runnable r = new Runnable() { @Override public void run() { sweep(maxIdle); } }; SimpleTimer.getInstance().schedule(r, 0, Math.max(maxIdle / 2, 1000)); } synchronized long createSession(Session session, boolean reserve) { long sid = random.nextLong(); while (sessions.containsKey(sid)) { sid = random.nextLong(); } sessions.put(sid, session); session.reserved = reserve; session.startTime = session.lastAccessTime = System.currentTimeMillis(); return sid; } long getMaxIdleTime() { return maxIdle; } /** * while a session is reserved, it cannot be canceled or removed * * @param sessionId */ synchronized Session reserveSession(long sessionId) { Session session = sessions.get(sessionId); if (session != null) { if (session.reserved) throw new IllegalStateException(); session.reserved = true; } return session; } synchronized Session reserveSession(long sessionId, boolean wait) { Session session = sessions.get(sessionId); if (session != null) { while (wait && session.reserved) { try { wait(1000); } catch (InterruptedException e) { throw new RuntimeException(); } } if (session.reserved) throw new IllegalStateException(); session.reserved = true; } return session; } synchronized void unreserveSession(Session session) { if (!session.reserved) throw new IllegalStateException(); notifyAll(); session.reserved = false; session.lastAccessTime = System.currentTimeMillis(); } synchronized void unreserveSession(long sessionId) { Session session = 
getSession(sessionId); if (session != null) unreserveSession(session); } synchronized Session getSession(long sessionId) { Session session = sessions.get(sessionId); if (session != null) session.lastAccessTime = System.currentTimeMillis(); return session; } Session removeSession(long sessionId) { return removeSession(sessionId, false); } Session removeSession(long sessionId, boolean unreserve) { Session session = null; synchronized (this) { session = sessions.remove(sessionId); if (unreserve && session != null) unreserveSession(session); } // do clean up out side of lock.. if (session != null) session.cleanup(); return session; } private void sweep(long maxIdle) { ArrayList<Session> sessionsToCleanup = new ArrayList<Session>(); synchronized (this) { Iterator<Session> iter = sessions.values().iterator(); while (iter.hasNext()) { Session session = iter.next(); long idleTime = System.currentTimeMillis() - session.lastAccessTime; if (idleTime > maxIdle && !session.reserved) { iter.remove(); sessionsToCleanup.add(session); } } } // do clean up outside of lock for (Session session : sessionsToCleanup) { session.cleanup(); } } synchronized void removeIfNotAccessed(final long sessionId, long delay) { Session session = sessions.get(sessionId); if (session != null) { final long removeTime = session.lastAccessTime; TimerTask r = new TimerTask() { @Override public void run() { Session sessionToCleanup = null; synchronized (SessionManager.this) { Session session2 = sessions.get(sessionId); if (session2 != null && session2.lastAccessTime == removeTime && !session2.reserved) { sessions.remove(sessionId); sessionToCleanup = session2; } } // call clean up outside of lock if (sessionToCleanup != null) sessionToCleanup.cleanup(); } }; SimpleTimer.getInstance().schedule(r, delay); } } public synchronized Map<String, MapCounter<ScanRunState>> getActiveScansPerTable() { Map<String, MapCounter<ScanRunState>> counts = new HashMap<String, MapCounter<ScanRunState>>(); for (Entry<Long, 
Session> entry : sessions.entrySet()) { Session session = entry.getValue(); @SuppressWarnings("rawtypes") ScanTask nbt = null; String tableID = null; if (session instanceof ScanSession) { ScanSession ss = (ScanSession) session; nbt = ss.nextBatchTask; tableID = ss.extent.getTableId().toString(); } else if (session instanceof MultiScanSession) { MultiScanSession mss = (MultiScanSession) session; nbt = mss.lookupTask; tableID = mss.threadPoolExtent.getTableId().toString(); } if (nbt == null) continue; ScanRunState srs = nbt.getScanRunState(); if (nbt == null || srs == ScanRunState.FINISHED) continue; MapCounter<ScanRunState> stateCounts = counts.get(tableID); if (stateCounts == null) { stateCounts = new MapCounter<ScanRunState>(); counts.put(tableID, stateCounts); } stateCounts.increment(srs, 1); } return counts; } public synchronized List<ActiveScan> getActiveScans() { ArrayList<ActiveScan> activeScans = new ArrayList<ActiveScan>(); long ct = System.currentTimeMillis(); for (Entry<Long, Session> entry : sessions.entrySet()) { Session session = entry.getValue(); if (session instanceof ScanSession) { ScanSession ss = (ScanSession) session; ScanState state = ScanState.RUNNING; ScanTask<ScanBatch> nbt = ss.nextBatchTask; if (nbt == null) { state = ScanState.IDLE; } else { switch (nbt.getScanRunState()) { case QUEUED: state = ScanState.QUEUED; break; case FINISHED: state = ScanState.IDLE; break; case RUNNING: default: /* do nothing */ break; } } activeScans.add(new ActiveScan(ss.client, ss.user, ss.extent.getTableId().toString(), ct - ss.startTime, ct - ss.lastAccessTime, ScanType.SINGLE, state, ss.extent.toThrift(), Translator.translate(ss.columnSet, Translator.CT), ss.ssiList, ss.ssio, ss.auths.getAuthorizationsBB())); } else if (session instanceof MultiScanSession) { MultiScanSession mss = (MultiScanSession) session; ScanState state = ScanState.RUNNING; ScanTask<MultiScanResult> nbt = mss.lookupTask; if (nbt == null) { state = ScanState.IDLE; } else { switch 
(nbt.getScanRunState()) { case QUEUED: state = ScanState.QUEUED; break; case FINISHED: state = ScanState.IDLE; break; case RUNNING: default: /* do nothing */ break; } } activeScans.add(new ActiveScan(mss.client, mss.user, mss.threadPoolExtent.getTableId().toString(), ct - mss.startTime, ct - mss.lastAccessTime, ScanType.BATCH, state, mss.threadPoolExtent.toThrift(), Translator.translate(mss.columnSet, Translator.CT), mss.ssiList, mss.ssio, mss.auths.getAuthorizationsBB())); } } return activeScans; } } static class TservConstraintEnv implements Environment { private TCredentials credentials; private SecurityOperation security; private Authorizations auths; private KeyExtent ke; TservConstraintEnv(SecurityOperation secOp, TCredentials credentials) { this.security = secOp; this.credentials = credentials; } void setExtent(KeyExtent ke) { this.ke = ke; } @Override public KeyExtent getExtent() { return ke; } @Override public String getUser() { return credentials.getPrincipal(); } @Override public Authorizations getAuthorizations() { if (auths == null) try { this.auths = security.getUserAuthorizations(credentials); } catch (ThriftSecurityException e) { throw new RuntimeException(e); } return auths; } } private abstract class ScanTask<T> implements RunnableFuture<T> { protected AtomicBoolean interruptFlag; protected ArrayBlockingQueue<Object> resultQueue; protected AtomicInteger state; protected AtomicReference<ScanRunState> runState; private static final int INITIAL = 1; private static final int ADDED = 2; private static final int CANCELED = 3; ScanTask() { interruptFlag = new AtomicBoolean(false); runState = new AtomicReference<ScanRunState>(ScanRunState.QUEUED); state = new AtomicInteger(INITIAL); resultQueue = new ArrayBlockingQueue<Object>(1); } protected void addResult(Object o) { if (state.compareAndSet(INITIAL, ADDED)) resultQueue.add(o); else if (state.get() == ADDED) throw new IllegalStateException("Tried to add more than one result"); } @Override public boolean 
cancel(boolean mayInterruptIfRunning) { if (!mayInterruptIfRunning) throw new IllegalArgumentException( "Cancel will always attempt to interupt running next batch task"); if (state.get() == CANCELED) return true; if (state.compareAndSet(INITIAL, CANCELED)) { interruptFlag.set(true); resultQueue = null; return true; } return false; } @Override public T get() throws InterruptedException, ExecutionException { throw new UnsupportedOperationException(); } @SuppressWarnings("unchecked") @Override public T get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException { ArrayBlockingQueue<Object> localRQ = resultQueue; if (state.get() == CANCELED) throw new CancellationException(); if (localRQ == null && state.get() == ADDED) throw new IllegalStateException("Tried to get result twice"); Object r = localRQ.poll(timeout, unit); // could have been canceled while waiting if (state.get() == CANCELED) { if (r != null) throw new IllegalStateException("Nothing should have been added when in canceled state"); throw new CancellationException(); } if (r == null) throw new TimeoutException(); // make this method stop working now that something is being // returned resultQueue = null; if (r instanceof Throwable) throw new ExecutionException((Throwable) r); return (T) r; } @Override public boolean isCancelled() { return state.get() == CANCELED; } @Override public boolean isDone() { return runState.get().equals(ScanRunState.FINISHED); } public ScanRunState getScanRunState() { return runState.get(); } } private static class ConditionalSession extends Session { public TCredentials credentials; public Authorizations auths; public String tableId; public AtomicBoolean interruptFlag; @Override public void cleanup() { interruptFlag.set(true); } } private static class UpdateSession extends Session { public Tablet currentTablet; public MapCounter<Tablet> successfulCommits = new MapCounter<Tablet>(); Map<KeyExtent, Long> failures = new HashMap<KeyExtent, 
Long>(); HashMap<KeyExtent, SecurityErrorCode> authFailures = new HashMap<KeyExtent, SecurityErrorCode>(); public Violations violations; public TCredentials credentials; public long totalUpdates = 0; public long flushTime = 0; Stat prepareTimes = new Stat(); Stat walogTimes = new Stat(); Stat commitTimes = new Stat(); Stat authTimes = new Stat(); public Map<Tablet, List<Mutation>> queuedMutations = new HashMap<Tablet, List<Mutation>>(); public long queuedMutationSize = 0; TservConstraintEnv cenv = null; } private static class ScanSession extends Session { public KeyExtent extent; public HashSet<Column> columnSet; public List<IterInfo> ssiList; public Map<String, Map<String, String>> ssio; public Authorizations auths; public long entriesReturned = 0; public Stat nbTimes = new Stat(); public long batchCount = 0; public volatile ScanTask<ScanBatch> nextBatchTask; public AtomicBoolean interruptFlag; public Scanner scanner; public long readaheadThreshold = Constants.SCANNER_DEFAULT_READAHEAD_THRESHOLD; @Override public void cleanup() { try { if (nextBatchTask != null) nextBatchTask.cancel(true); } finally { if (scanner != null) scanner.close(); } } } private static class MultiScanSession extends Session { HashSet<Column> columnSet; Map<KeyExtent, List<Range>> queries; public List<IterInfo> ssiList; public Map<String, Map<String, String>> ssio; public Authorizations auths; // stats int numRanges; int numTablets; int numEntries; long totalLookupTime; public volatile ScanTask<MultiScanResult> lookupTask; public KeyExtent threadPoolExtent; @Override public void cleanup() { if (lookupTask != null) lookupTask.cancel(true); } } /** * This little class keeps track of writes in progress and allows readers to wait for writes that started before the read. It assumes that the operation ids * are monotonically increasing. 
* */
  static class WriteTracker {
    // shared by all WriteTracker instances; source of the operation ids
    private static AtomicLong operationCounter = new AtomicLong(1);
    // ids of the writes currently in progress, grouped by tablet type
    private Map<TabletType, TreeSet<Long>> inProgressWrites = new EnumMap<TabletType, TreeSet<Long>>( TabletType.class);

    WriteTracker() {
      for (TabletType ttype : TabletType.values()) {
        inProgressWrites.put(ttype, new TreeSet<Long>());
      }
    }

    /**
     * Records the start of a write against the given tablet type.
     *
     * @return the operation id to pass to {@link #finishWrite(long)}
     */
    synchronized long startWrite(TabletType ttype) {
      long operationId = operationCounter.getAndIncrement();
      inProgressWrites.get(ttype).add(operationId);
      return operationId;
    }

    /**
     * Records the end of a write and wakes all threads blocked in {@link #waitForWrites(TabletType)}.
     *
     * @param operationId
     *          id returned by a {@code startWrite} call; {@code -1} is ignored
     * @throws IllegalArgumentException
     *           when no write with that id is in progress
     */
    synchronized void finishWrite(long operationId) {
      // -1 is what startWrite(Set) returns for an empty set: nothing was recorded
      if (operationId == -1)
        return;

      boolean removed = false;

      // the tablet type is not passed in, so try each type's set until the id is found
      for (TabletType ttype : TabletType.values()) {
        removed = inProgressWrites.get(ttype).remove(operationId);
        if (removed)
          break;
      }

      if (!removed) {
        throw new IllegalArgumentException( "Attempted to finish write not in progress, operationId " + operationId);
      }

      this.notifyAll();
    }

    /** Blocks until every write of the given tablet type whose id is lower than an id drawn at call time has finished. */
    synchronized void waitForWrites(TabletType ttype) {
      long operationId = operationCounter.getAndIncrement();
      // floor() is non-null while some in-progress id is <= the id drawn above
      while (inProgressWrites.get(ttype).floor(operationId) != null) {
        try {
          this.wait();
        } catch (InterruptedException e) {
          // NOTE(review): the interrupt is logged and the wait resumes; the thread's interrupt status is not restored
          log.error(e, e);
        }
      }
    }

    /**
     * Records the start of a write against a set of tablets.
     *
     * @return the operation id, or {@code -1} when the set is empty
     */
    public long startWrite(Set<Tablet> keySet) {
      if (keySet.size() == 0)
        return -1;

      ArrayList<KeyExtent> extents = new ArrayList<KeyExtent>(keySet.size());

      for (Tablet tablet : keySet)
        extents.add(tablet.getExtent());

      return startWrite(TabletType.type(extents));
    }
  }

  /** @return the configuration obtained from the server configuration */
  public AccumuloConfiguration getSystemConfiguration() {
    return serverConfig.getConfiguration();
  }

  TransactionWatcher watcher = new TransactionWatcher();

  /** Thrift request handler of this tablet server; it owns the session manager, the write tracker and the update/scan metrics. */
  private class ThriftClientHandler extends ClientServiceHandler implements TabletClientService.Iface {

    SessionManager sessionManager;

    AccumuloConfiguration acuConf = getSystemConfiguration();

    TabletServerUpdateMetrics updateMetrics = new TabletServerUpdateMetrics();

    TabletServerScanMetrics scanMetrics = new TabletServerScanMetrics();

    WriteTracker writeTracker = new WriteTracker();

    private RowLocks rowLocks = new RowLocks();

    /** Creates the session manager and registers the metrics; a registration failure is logged and otherwise ignored. */
    ThriftClientHandler()
    {
      super(instance, watcher);
      log.debug(ThriftClientHandler.class.getName() + " created");
      sessionManager = new SessionManager(getSystemConfiguration());
      // Register the metrics MBean
      try {
        updateMetrics.register();
        scanMetrics.register();
      } catch (Exception e) {
        log.error("Exception registering MBean with MBean Server", e);
      }
    }

    /**
     * Imports the given files into the tablets hosted by this server.
     *
     * @param files
     *          for each tablet extent, the files (path to file info) to import
     * @return the extents that could not be imported, either because the tablet is not online here or because the import threw an
     *         {@link IOException}
     * @throws ThriftSecurityException
     *           when the credentials may not perform system actions
     */
    @Override
    public List<TKeyExtent> bulkImport(TInfo tinfo, TCredentials credentials, long tid, Map<TKeyExtent, Map<String, MapFileInfo>> files, boolean setTime)
        throws ThriftSecurityException {

      if (!security.canPerformSystemActions(credentials))
        throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.PERMISSION_DENIED);

      List<TKeyExtent> failures = new ArrayList<TKeyExtent>();

      for (Entry<TKeyExtent, Map<String, MapFileInfo>> entry : files.entrySet()) {
        TKeyExtent tke = entry.getKey();
        Map<String, MapFileInfo> fileMap = entry.getValue();
        Map<FileRef, MapFileInfo> fileRefMap = new HashMap<FileRef, MapFileInfo>();
        // qualify every path against the file system that owns it before handing it to the tablet
        for (Entry<String, MapFileInfo> mapping : fileMap.entrySet()) {
          Path path = new Path(mapping.getKey());
          FileSystem ns = fs.getFileSystemByPath(path);
          path = ns.makeQualified(path);
          fileRefMap.put(new FileRef(path.toString(), path), mapping.getValue());
        }

        Tablet importTablet = onlineTablets.get(new KeyExtent(tke));

        if (importTablet == null) {
          // tablet is not hosted here: report it back instead of failing the whole call
          failures.add(tke);
        } else {
          try {
            importTablet.importMapFiles(tid, fileRefMap, setTime);
          } catch (IOException ioe) {
            log.info("files " + fileMap.keySet() + " not imported to " + new KeyExtent(tke) + ": " + ioe.getMessage());
            failures.add(tke);
          }
        }
      }
      return failures;
    }

    /** Scan task that reads the next batch for one scan session and publishes the batch, or the failure, as its result. */
    private class NextBatchTask extends ScanTask<ScanBatch> {

      private long scanID;

      NextBatchTask(long scanID, AtomicBoolean interruptFlag) {
        this.scanID = scanID;
        this.interruptFlag = interruptFlag;

        // the shared flag may already be raised, in which case the task starts out canceled
        if (interruptFlag.get())
          cancel(true);
      }

      @Override
      public void run() {

        final ScanSession scanSession = (ScanSession) sessionManager.getSession(scanID);
        String oldThreadName = Thread.currentThread().getName();

        try {
          if
          (isCancelled() || scanSession == null)
            return;

          runState.set(ScanRunState.RUNNING);

          // rename the thread for the duration of the read; the finally block restores the old name
          Thread.currentThread().setName("User: " + scanSession.user + " Start: " + scanSession.startTime + " Client: " + scanSession.client + " Tablet: " + scanSession.extent);

          Tablet tablet = onlineTablets.get(scanSession.extent);

          if (tablet == null) {
            addResult(new org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException( scanSession.extent.toThrift()));
            return;
          }

          long t1 = System.currentTimeMillis();
          ScanBatch batch = scanSession.scanner.read();
          long t2 = System.currentTimeMillis();
          scanSession.nbTimes.addStat(t2 - t1);

          // there should only be one thing on the queue at a time, so
          // it should be ok to call add()
          // instead of put()... if add() fails because queue is at
          // capacity it means there is code
          // problem somewhere
          addResult(batch);
        } catch (TabletClosedException e) {
          addResult(new org.apache.accumulo.core.tabletserver.thrift.NotServingTabletException( scanSession.extent.toThrift()));
        } catch (IterationInterruptedException iie) {
          // an interruption caused by cancel() is expected and produces no result
          if (!isCancelled()) {
            log.warn("Iteration interrupted, when scan not cancelled", iie);
            addResult(iie);
          }
        } catch (TooManyFilesException tmfe) {
          // handed to the caller without logging
          addResult(tmfe);
        } catch (Throwable e) {
          log.warn("exception while scanning tablet " + (scanSession == null ?
              "(unknown)" : scanSession.extent), e);
          addResult(e);
        } finally {
          runState.set(ScanRunState.FINISHED);
          Thread.currentThread().setName(oldThreadName);
        }

      }
    }

    /** Scan task that serves a multi-scan session: it looks up the session's queued ranges tablet by tablet and publishes one MultiScanResult. */
    private class LookupTask extends ScanTask<MultiScanResult> {

      private long scanID;

      LookupTask(long scanID) {
        this.scanID = scanID;
      }

      @Override
      public void run() {
        MultiScanSession session = (MultiScanSession) sessionManager.getSession(scanID);
        String oldThreadName = Thread.currentThread().getName();

        try {
          if (isCancelled() || session == null)
            return;

          TableConfiguration acuTableConf = ServerConfiguration.getTableConfiguration(instance, session.threadPoolExtent.getTableId().toString());
          long maxResultsSize = acuTableConf.getMemoryInBytes(Property.TABLE_SCAN_MAXMEM);

          runState.set(ScanRunState.RUNNING);
          // NOTE(review): nothing is appended after " Table: " - presumably the table id was intended; confirm
          Thread.currentThread().setName("Client: " + session.client + " User: " + session.user + " Start: " + session.startTime + " Table: ");

          long bytesAdded = 0;
          // upper bound, in milliseconds of wall-clock time, for starting further lookups in this run
          long maxScanTime = 4000;

          long startTime = System.currentTimeMillis();

          ArrayList<KVEntry> results = new ArrayList<KVEntry>();
          Map<KeyExtent, List<Range>> failures = new HashMap<KeyExtent, List<Range>>();
          ArrayList<KeyExtent> fullScans = new ArrayList<KeyExtent>();
          KeyExtent partScan = null;
          Key partNextKey = null;
          boolean partNextKeyInclusive = false;

          Iterator<Entry<KeyExtent, List<Range>>> iter = session.queries.entrySet().iterator();

          // check the time so that the read ahead thread is not monopolized
          while (iter.hasNext() && bytesAdded < maxResultsSize && (System.currentTimeMillis() - startTime) < maxScanTime) {
            Entry<KeyExtent, List<Range>> entry = iter.next();

            // the entry is taken off the pending queries; unfinished ranges are put back further down
            iter.remove();

            // check that tablet server is serving requested tablet
            Tablet tablet = onlineTablets.get(entry.getKey());
            if (tablet == null) {
              failures.put(entry.getKey(), entry.getValue());
              continue;
            }
            Thread.currentThread().setName("Client: " + session.client + " User: " + session.user + " Start: " + session.startTime + " Tablet: " + entry.getKey().toString());

            LookupResult lookupResult;
            try {

              // do the
              // following check to avoid a race condition
              // between setting false below and the task being
              // canceled
              if (isCancelled())
                interruptFlag.set(true);

              lookupResult = tablet.lookup(entry.getValue(), session.columnSet, session.auths, results, maxResultsSize - bytesAdded, session.ssiList, session.ssio, interruptFlag);

              // if the tablet was closed it is possible that the
              // interrupt flag was set.... do not want it set for
              // the next
              // lookup
              interruptFlag.set(false);

            } catch (IOException e) {
              log.warn("lookup failed for tablet " + entry.getKey(), e);
              throw new RuntimeException(e);
            }

            bytesAdded += lookupResult.bytesAdded;

            if (lookupResult.unfinishedRanges.size() > 0) {
              if (lookupResult.closed) {
                // tablet closed during the lookup: report the remaining ranges as failures
                failures.put(entry.getKey(), lookupResult.unfinishedRanges);
              } else {
                // keep the remaining ranges for a later run and remember where this tablet's scan stopped
                session.queries.put(entry.getKey(), lookupResult.unfinishedRanges);
                partScan = entry.getKey();
                partNextKey = lookupResult.unfinishedRanges.get(0).getStartKey();
                partNextKeyInclusive = lookupResult.unfinishedRanges.get(0).isStartKeyInclusive();
              }
            } else {
              fullScans.add(entry.getKey());
            }
          }

          long finishTime = System.currentTimeMillis();
          session.totalLookupTime += (finishTime - startTime);
          session.numEntries += results.size();

          // convert everything to thrift before adding result
          List<TKeyValue> retResults = new ArrayList<TKeyValue>();
          for (KVEntry entry : results)
            retResults.add(new TKeyValue(entry.key.toThrift(), ByteBuffer.wrap(entry.value)));
          Map<TKeyExtent, List<TRange>> retFailures = Translator.translate(failures, Translator.KET, new Translator.ListTranslator<Range, TRange>(Translator.RT));
          List<TKeyExtent> retFullScans = Translator.translate(fullScans, Translator.KET);
          TKeyExtent retPartScan = null;
          TKey retPartNextKey = null;
          if (partScan != null) {
            retPartScan = partScan.toThrift();
            retPartNextKey = partNextKey.toThrift();
          }
          // add results to queue
          addResult(new MultiScanResult(retResults, retFailures, retFullScans, retPartScan, retPartNextKey, partNextKeyInclusive, session.queries.size() != 0));
        } catch
(IterationInterruptedException iie) { if (!isCancelled()) { log.warn("Iteration interrupted, when scan not cancelled", iie); addResult(iie); } } catch (Throwable e) { log.warn("exception while doing multi-scan ", e); addResult(e); } finally { Thread.currentThread().setName(oldThreadName); runState.set(ScanRunState.FINISHED); } } } @Override public InitialScan startScan(TInfo tinfo, TCredentials credentials, TKeyExtent textent, TRange range, List<TColumn> columns, int batchSize, List<IterInfo> ssiList, Map<String, Map<String, String>> ssio, List<ByteBuffer> authorizations, boolean waitForWrites, boolean isolated, long readaheadThreshold) throws NotServingTabletException, ThriftSecurityException, org.apache.accumulo.core.tabletserver.thrift.TooManyFilesException { Authorizations userauths = null; if (!security.canScan(credentials, new String(textent.getTable()), range, columns, ssiList, ssio, authorizations)) throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.PERMISSION_DENIED); userauths = security.getUserAuthorizations(credentials); for (ByteBuffer auth : authorizations) if (!userauths.contains(ByteBufferUtil.toBytes(auth))) throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.BAD_AUTHORIZATIONS); KeyExtent extent = new KeyExtent(textent); // wait for any writes that are in flight.. this done to ensure // consistency across client restarts... assume a client writes // to accumulo and dies while waiting for a confirmation from // accumulo... the client process restarts and tries to read // data from accumulo making the assumption that it will get // any writes previously made... however if the server side thread // processing the write from the dead client is still in progress, // the restarted client may not see the write unless we wait here. 
// this behavior is very important when the client is reading the // !METADATA table if (waitForWrites) writeTracker.waitForWrites(TabletType.type(extent)); Tablet tablet = onlineTablets.get(extent); if (tablet == null) throw new NotServingTabletException(textent); ScanSession scanSession = new ScanSession(); scanSession.user = credentials.getPrincipal(); scanSession.extent = new KeyExtent(extent); scanSession.columnSet = new HashSet<Column>(); scanSession.ssiList = ssiList; scanSession.ssio = ssio; scanSession.auths = new Authorizations(authorizations); scanSession.interruptFlag = new AtomicBoolean(); scanSession.readaheadThreshold = readaheadThreshold; for (TColumn tcolumn : columns) { scanSession.columnSet.add(new Column(tcolumn)); } scanSession.scanner = tablet.createScanner(new Range(range), batchSize, scanSession.columnSet, scanSession.auths, ssiList, ssio, isolated, scanSession.interruptFlag); long sid = sessionManager.createSession(scanSession, true); ScanResult scanResult; try { scanResult = continueScan(tinfo, sid, scanSession); } catch (NoSuchScanIDException e) { log.error("The impossible happened", e); throw new RuntimeException(); } finally { sessionManager.unreserveSession(sid); } return new InitialScan(sid, scanResult); } @Override public ScanResult continueScan(TInfo tinfo, long scanID) throws NoSuchScanIDException, NotServingTabletException, org.apache.accumulo.core.tabletserver.thrift.TooManyFilesException { ScanSession scanSession = (ScanSession) sessionManager.reserveSession(scanID); if (scanSession == null) { throw new NoSuchScanIDException(); } try { return continueScan(tinfo, scanID, scanSession); } finally { sessionManager.unreserveSession(scanSession); } } private ScanResult continueScan(TInfo tinfo, long scanID, ScanSession scanSession) throws NoSuchScanIDException, NotServingTabletException, org.apache.accumulo.core.tabletserver.thrift.TooManyFilesException { if (scanSession.nextBatchTask == null) { scanSession.nextBatchTask = new 
NextBatchTask(scanID, scanSession.interruptFlag); resourceManager.executeReadAhead(scanSession.extent, scanSession.nextBatchTask); } ScanBatch bresult; try { bresult = scanSession.nextBatchTask.get(MAX_TIME_TO_WAIT_FOR_SCAN_RESULT_MILLIS, TimeUnit.MILLISECONDS); scanSession.nextBatchTask = null; } catch (ExecutionException e) { sessionManager.removeSession(scanID); if (e.getCause() instanceof NotServingTabletException) throw (NotServingTabletException) e.getCause(); else if (e.getCause() instanceof TooManyFilesException) throw new org.apache.accumulo.core.tabletserver.thrift.TooManyFilesException( scanSession.extent.toThrift()); else throw new RuntimeException(e); } catch (CancellationException ce) { sessionManager.removeSession(scanID); Tablet tablet = onlineTablets.get(scanSession.extent); if (tablet == null || tablet.isClosed()) throw new NotServingTabletException(scanSession.extent.toThrift()); else throw new NoSuchScanIDException(); } catch (TimeoutException e) { List<TKeyValue> param = Collections.emptyList(); long timeout = acuConf.getTimeInMillis(Property.TSERV_CLIENT_TIMEOUT); sessionManager.removeIfNotAccessed(scanID, timeout); return new ScanResult(param, true); } catch (Throwable t) { sessionManager.removeSession(scanID); log.warn("Failed to get next batch", t); throw new RuntimeException(t); } ScanResult scanResult = new ScanResult(Key.compress(bresult.results), bresult.more); scanSession.entriesReturned += scanResult.results.size(); scanSession.batchCount++; if (scanResult.more && scanSession.batchCount > scanSession.readaheadThreshold) { // start reading next batch while current batch is transmitted // to client scanSession.nextBatchTask = new NextBatchTask(scanID, scanSession.interruptFlag); resourceManager.executeReadAhead(scanSession.extent, scanSession.nextBatchTask); } if (!scanResult.more) closeScan(tinfo, scanID); return scanResult; } @Override public void closeScan(TInfo tinfo, long scanID) { ScanSession ss = (ScanSession) 
sessionManager.removeSession(scanID); if (ss != null) { long t2 = System.currentTimeMillis(); log.debug(String.format("ScanSess tid %s %s %,d entries in %.2f secs, nbTimes = [%s] ", TServerUtils.clientAddress.get(), ss.extent.getTableId().toString(), ss.entriesReturned, (t2 - ss.startTime) / 1000.0, ss.nbTimes.toString())); if (scanMetrics.isEnabled()) { scanMetrics.add(TabletServerScanMetrics.scan, t2 - ss.startTime); scanMetrics.add(TabletServerScanMetrics.resultSize, ss.entriesReturned); } } } @Override public InitialMultiScan startMultiScan(TInfo tinfo, TCredentials credentials, Map<TKeyExtent, List<TRange>> tbatch, List<TColumn> tcolumns, List<IterInfo> ssiList, Map<String, Map<String, String>> ssio, List<ByteBuffer> authorizations, boolean waitForWrites) throws ThriftSecurityException { // find all of the tables that need to be scanned HashSet<String> tables = new HashSet<String>(); for (TKeyExtent keyExtent : tbatch.keySet()) { tables.add(new String(keyExtent.getTable())); } if (tables.size() != 1) throw new IllegalArgumentException("Cannot batch scan over multiple tables"); // check if user has permission to the tables Authorizations userauths = null; for (String table : tables) if (!security.canScan(credentials, table, tbatch, tcolumns, ssiList, ssio, authorizations)) throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.PERMISSION_DENIED); userauths = security.getUserAuthorizations(credentials); for (ByteBuffer auth : authorizations) if (!userauths.contains(ByteBufferUtil.toBytes(auth))) throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.BAD_AUTHORIZATIONS); Map<KeyExtent, List<Range>> batch = Translator.translate(tbatch, new TKeyExtentTranslator(), new Translator.ListTranslator<TRange, Range>(new TRangeTranslator())); // This is used to determine which thread pool to use KeyExtent threadPoolExtent = batch.keySet().iterator().next(); if (waitForWrites) 
writeTracker.waitForWrites(TabletType.type(batch.keySet())); MultiScanSession mss = new MultiScanSession(); mss.user = credentials.getPrincipal(); mss.queries = batch; mss.columnSet = new HashSet<Column>(tcolumns.size()); mss.ssiList = ssiList; mss.ssio = ssio; mss.auths = new Authorizations(authorizations); mss.numTablets = batch.size(); for (List<Range> ranges : batch.values()) { mss.numRanges += ranges.size(); } for (TColumn tcolumn : tcolumns) mss.columnSet.add(new Column(tcolumn)); mss.threadPoolExtent = threadPoolExtent; long sid = sessionManager.createSession(mss, true); MultiScanResult result; try { result = continueMultiScan(tinfo, sid, mss); } catch (NoSuchScanIDException e) { log.error("the impossible happened", e); throw new RuntimeException("the impossible happened", e); } finally { sessionManager.unreserveSession(sid); } return new InitialMultiScan(sid, result); } @Override public MultiScanResult continueMultiScan(TInfo tinfo, long scanID) throws NoSuchScanIDException { MultiScanSession session = (MultiScanSession) sessionManager.reserveSession(scanID); if (session == null) { throw new NoSuchScanIDException(); } try { return continueMultiScan(tinfo, scanID, session); } finally { sessionManager.unreserveSession(session); } } private MultiScanResult continueMultiScan(TInfo tinfo, long scanID, MultiScanSession session) throws NoSuchScanIDException { if (session.lookupTask == null) { session.lookupTask = new LookupTask(scanID); resourceManager.executeReadAhead(session.threadPoolExtent, session.lookupTask); } try { MultiScanResult scanResult = session.lookupTask.get(MAX_TIME_TO_WAIT_FOR_SCAN_RESULT_MILLIS, TimeUnit.MILLISECONDS); session.lookupTask = null; return scanResult; } catch (TimeoutException e1) { long timeout = acuConf.getTimeInMillis(Property.TSERV_CLIENT_TIMEOUT); sessionManager.removeIfNotAccessed(scanID, timeout); List<TKeyValue> results = Collections.emptyList(); Map<TKeyExtent, List<TRange>> failures = Collections.emptyMap(); 
List<TKeyExtent> fullScans = Collections.emptyList(); return new MultiScanResult(results, failures, fullScans, null, null, false, true); } catch (Throwable t) { sessionManager.removeSession(scanID); log.warn("Failed to get multiscan result", t); throw new RuntimeException(t); } } @Override public void closeMultiScan(TInfo tinfo, long scanID) throws NoSuchScanIDException { MultiScanSession session = (MultiScanSession) sessionManager.removeSession(scanID); if (session == null) { throw new NoSuchScanIDException(); } long t2 = System.currentTimeMillis(); log.debug(String.format( "MultiScanSess %s %,d entries in %.2f secs (lookup_time:%.2f secs tablets:%,d ranges:%,d) ", TServerUtils.clientAddress.get(), session.numEntries, (t2 - session.startTime) / 1000.0, session.totalLookupTime / 1000.0, session.numTablets, session.numRanges)); } @Override public long startUpdate(TInfo tinfo, TCredentials credentials) throws ThriftSecurityException { // Make sure user is real security.authenticateUser(credentials, credentials); if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.permissionErrors, 0); UpdateSession us = new UpdateSession(); us.violations = new Violations(); us.credentials = credentials; us.cenv = new TservConstraintEnv(security, us.credentials); long sid = sessionManager.createSession(us, false); return sid; } private void setUpdateTablet(UpdateSession us, KeyExtent keyExtent) { long t1 = System.currentTimeMillis(); if (us.currentTablet != null && us.currentTablet.getExtent().equals(keyExtent)) return; if (us.currentTablet == null && (us.failures.containsKey(keyExtent) || us.authFailures.containsKey(keyExtent))) { // if there were previous failures, then do not accept additional writes return; } try { // if user has no permission to write to this table, add it to // the failures list boolean sameTable = us.currentTablet != null && (us.currentTablet.getExtent().getTableId().equals(keyExtent.getTableId())); if (sameTable || 
security.canWrite(us.credentials, keyExtent.getTableId().toString())) { long t2 = System.currentTimeMillis(); us.authTimes.addStat(t2 - t1); us.currentTablet = onlineTablets.get(keyExtent); if (us.currentTablet != null) { us.queuedMutations.put(us.currentTablet, new ArrayList<Mutation>()); } else { // not serving tablet, so report all mutations as // failures us.failures.put(keyExtent, 0l); if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.unknownTabletErrors, 0); } } else { log.warn("Denying access to table " + keyExtent.getTableId() + " for user " + us.credentials.getPrincipal()); long t2 = System.currentTimeMillis(); us.authTimes.addStat(t2 - t1); us.currentTablet = null; us.authFailures.put(keyExtent, SecurityErrorCode.PERMISSION_DENIED); if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.permissionErrors, 0); return; } } catch (ThriftSecurityException e) { log.error("Denying permission to check user " + us.credentials.getPrincipal() + " with user " + e.getUser(), e); long t2 = System.currentTimeMillis(); us.authTimes.addStat(t2 - t1); us.currentTablet = null; us.authFailures.put(keyExtent, e.getCode()); if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.permissionErrors, 0); return; } } @Override public void applyUpdates(TInfo tinfo, long updateID, TKeyExtent tkeyExtent, List<TMutation> tmutations) { UpdateSession us = (UpdateSession) sessionManager.reserveSession(updateID); if (us == null) { throw new RuntimeException("No Such SessionID"); } try { KeyExtent keyExtent = new KeyExtent(tkeyExtent); setUpdateTablet(us, keyExtent); if (us.currentTablet != null) { List<Mutation> mutations = us.queuedMutations.get(us.currentTablet); for (TMutation tmutation : tmutations) { Mutation mutation = new ServerMutation(tmutation); mutations.add(mutation); us.queuedMutationSize += mutation.numBytes(); } if (us.queuedMutationSize > getSystemConfiguration() 
.getMemoryInBytes(Property.TSERV_MUTATION_QUEUE_MAX)) flush(us); } } finally { sessionManager.unreserveSession(us); } } private void flush(UpdateSession us) { int mutationCount = 0; Map<CommitSession, List<Mutation>> sendables = new HashMap<CommitSession, List<Mutation>>(); Throwable error = null; long pt1 = System.currentTimeMillis(); boolean containsMetadataTablet = false; for (Tablet tablet : us.queuedMutations.keySet()) if (tablet.getExtent().isMeta()) containsMetadataTablet = true; if (!containsMetadataTablet && us.queuedMutations.size() > 0) TabletServer.this.resourceManager.waitUntilCommitsAreEnabled(); Span prep = Trace.start("prep"); try { for (Entry<Tablet, ? extends List<Mutation>> entry : us.queuedMutations.entrySet()) { Tablet tablet = entry.getKey(); List<Mutation> mutations = entry.getValue(); if (mutations.size() > 0) { try { if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.mutationArraySize, mutations.size()); CommitSession commitSession = tablet.prepareMutationsForCommit(us.cenv, mutations); if (commitSession == null) { if (us.currentTablet == tablet) { us.currentTablet = null; } us.failures.put(tablet.getExtent(), us.successfulCommits.get(tablet)); } else { sendables.put(commitSession, mutations); mutationCount += mutations.size(); } } catch (TConstraintViolationException e) { us.violations.add(e.getViolations()); if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.constraintViolations, 0); if (e.getNonViolators().size() > 0) { // only log and commit mutations if there were some // that did not // violate constraints... 
this is what // prepareMutationsForCommit() // expects sendables.put(e.getCommitSession(), e.getNonViolators()); } mutationCount += mutations.size(); } catch (HoldTimeoutException t) { error = t; log.debug("Giving up on mutations due to a long memory hold time"); break; } catch (Throwable t) { error = t; log.error("Unexpected error preparing for commit", error); break; } } } } finally { prep.stop(); } long pt2 = System.currentTimeMillis(); us.prepareTimes.addStat(pt2 - pt1); updateAvgPrepTime(pt2 - pt1, us.queuedMutations.size()); if (error != null) { for (Entry<CommitSession, List<Mutation>> e : sendables.entrySet()) { e.getKey().abortCommit(e.getValue()); } throw new RuntimeException(error); } try { Span wal = Trace.start("wal"); try { while (true) { try { long t1 = System.currentTimeMillis(); logger.logManyTablets(sendables); long t2 = System.currentTimeMillis(); us.walogTimes.addStat(t2 - t1); updateWalogWriteTime((t2 - t1)); break; } catch (IOException ex) { log.warn("logging mutations failed, retrying"); } catch (FSError ex) { // happens when DFS is localFS log.warn("logging mutations failed, retrying"); } catch (Throwable t) { log.error( "Unknown exception logging mutations, counts for mutations in flight not decremented!", t); throw new RuntimeException(t); } } } finally { wal.stop(); } Span commit = Trace.start("commit"); try { long t1 = System.currentTimeMillis(); for (Entry<CommitSession, ? 
extends List<Mutation>> entry : sendables.entrySet()) { CommitSession commitSession = entry.getKey(); List<Mutation> mutations = entry.getValue(); commitSession.commit(mutations); Tablet tablet = commitSession.getTablet(); if (tablet == us.currentTablet) { // because constraint violations may filter out some // mutations, for proper // accounting with the client code, need to increment // the count based // on the original number of mutations from the client // NOT the filtered number us.successfulCommits.increment(tablet, us.queuedMutations.get(tablet).size()); } } long t2 = System.currentTimeMillis(); us.flushTime += (t2 - pt1); us.commitTimes.addStat(t2 - t1); updateAvgCommitTime(t2 - t1, sendables.size()); } finally { commit.stop(); } } finally { us.queuedMutations.clear(); if (us.currentTablet != null) { us.queuedMutations.put(us.currentTablet, new ArrayList<Mutation>()); } us.queuedMutationSize = 0; } us.totalUpdates += mutationCount; } private void updateWalogWriteTime(long time) { if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.waLogWriteTime, time); } private void updateAvgCommitTime(long time, int size) { if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.commitTime, (long) ((time) / (double) size)); } private void updateAvgPrepTime(long time, int size) { if (updateMetrics.isEnabled()) updateMetrics.add(TabletServerUpdateMetrics.commitPrep, (long) ((time) / (double) size)); } @Override public UpdateErrors closeUpdate(TInfo tinfo, long updateID) throws NoSuchScanIDException { UpdateSession us = (UpdateSession) sessionManager.removeSession(updateID); if (us == null) { throw new NoSuchScanIDException(); } // clients may or may not see data from an update session while // it is in progress, however when the update session is closed // want to ensure that reads wait for the write to finish long opid = writeTracker.startWrite(us.queuedMutations.keySet()); try { flush(us); } finally { 
writeTracker.finishWrite(opid); } log.debug(String.format("UpSess %s %,d in %.3fs, at=[%s] ft=%.3fs(pt=%.3fs lt=%.3fs ct=%.3fs)", TServerUtils.clientAddress.get(), us.totalUpdates, (System.currentTimeMillis() - us.startTime) / 1000.0, us.authTimes.toString(), us.flushTime / 1000.0, us.prepareTimes.getSum() / 1000.0, us.walogTimes.getSum() / 1000.0, us.commitTimes.getSum() / 1000.0)); if (us.failures.size() > 0) { Entry<KeyExtent, Long> first = us.failures.entrySet().iterator().next(); log.debug(String.format("Failures: %d, first extent %s successful commits: %d", us.failures.size(), first.getKey().toString(), first.getValue())); } List<ConstraintViolationSummary> violations = us.violations.asList(); if (violations.size() > 0) { ConstraintViolationSummary first = us.violations.asList().iterator().next(); log.debug(String.format("Violations: %d, first %s occurs %d", violations.size(), first.violationDescription, first.numberOfViolatingMutations)); } if (us.authFailures.size() > 0) { KeyExtent first = us.authFailures.keySet().iterator().next(); log.debug(String.format("Authentication Failures: %d, first %s", us.authFailures.size(), first.toString())); } return new UpdateErrors(Translator.translate(us.failures, Translator.KET), Translator.translate(violations, Translator.CVST), Translator.translate(us.authFailures, Translator.KET)); } @Override public void update(TInfo tinfo, TCredentials credentials, TKeyExtent tkeyExtent, TMutation tmutation) throws NotServingTabletException, ConstraintViolationException, ThriftSecurityException { if (!security.canWrite(credentials, new String(tkeyExtent.getTable()))) throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.PERMISSION_DENIED); KeyExtent keyExtent = new KeyExtent(tkeyExtent); Tablet tablet = onlineTablets.get(new KeyExtent(keyExtent)); if (tablet == null) { throw new NotServingTabletException(tkeyExtent); } if (!keyExtent.isMeta()) TabletServer.this.resourceManager.waitUntilCommitsAreEnabled(); 
long opid = writeTracker.startWrite(TabletType.type(keyExtent)); try { Mutation mutation = new ServerMutation(tmutation); List<Mutation> mutations = Collections.singletonList(mutation); Span prep = Trace.start("prep"); CommitSession cs; try { cs = tablet.prepareMutationsForCommit(new TservConstraintEnv(security, credentials), mutations); } finally { prep.stop(); } if (cs == null) { throw new NotServingTabletException(tkeyExtent); } while (true) { try { Span wal = Trace.start("wal"); try { logger.log(cs, cs.getWALogSeq(), mutation); } finally { wal.stop(); } break; } catch (IOException ex) { log.warn(ex, ex); } } Span commit = Trace.start("commit"); try { cs.commit(mutations); } finally { commit.stop(); } } catch (TConstraintViolationException e) { throw new ConstraintViolationException( Translator.translate(e.getViolations().asList(), Translator.CVST)); } finally { writeTracker.finishWrite(opid); } } private void checkConditions(Map<KeyExtent, List<ServerConditionalMutation>> updates, ArrayList<TCMResult> results, ConditionalSession cs, List<String> symbols) throws IOException { Iterator<Entry<KeyExtent, List<ServerConditionalMutation>>> iter = updates.entrySet().iterator(); CompressedIterators compressedIters = new CompressedIterators(symbols); while (iter.hasNext()) { Entry<KeyExtent, List<ServerConditionalMutation>> entry = iter.next(); Tablet tablet = onlineTablets.get(entry.getKey()); if (tablet == null || tablet.isClosed()) { for (ServerConditionalMutation scm : entry.getValue()) results.add(new TCMResult(scm.getID(), TCMStatus.IGNORED)); iter.remove(); } else { List<ServerConditionalMutation> okMutations = new ArrayList<ServerConditionalMutation>( entry.getValue().size()); for (ServerConditionalMutation scm : entry.getValue()) { if (checkCondition(results, cs, compressedIters, tablet, scm)) okMutations.add(scm); } entry.setValue(okMutations); } } } boolean checkCondition(ArrayList<TCMResult> results, ConditionalSession cs, CompressedIterators 
compressedIters, Tablet tablet, ServerConditionalMutation scm) throws IOException { boolean add = true; Set<Column> emptyCols = Collections.emptySet(); for (TCondition tc : scm.getConditions()) { Range range; if (tc.hasTimestamp) range = Range.exact(new Text(scm.getRow()), new Text(tc.getCf()), new Text(tc.getCq()), new Text(tc.getCv()), tc.getTs()); else range = Range.exact(new Text(scm.getRow()), new Text(tc.getCf()), new Text(tc.getCq()), new Text(tc.getCv())); IterConfig ic = compressedIters.decompress(tc.iterators); Scanner scanner = tablet.createScanner(range, 1, emptyCols, cs.auths, ic.ssiList, ic.ssio, false, cs.interruptFlag); try { ScanBatch batch = scanner.read(); Value val = null; for (KVEntry entry2 : batch.results) { val = entry2.getValue(); break; } if ((val == null ^ tc.getVal() == null) || (val != null && !Arrays.equals(tc.getVal(), val.get()))) { results.add(new TCMResult(scm.getID(), TCMStatus.REJECTED)); add = false; break; } } catch (TabletClosedException e) { results.add(new TCMResult(scm.getID(), TCMStatus.IGNORED)); add = false; break; } catch (IterationInterruptedException iie) { results.add(new TCMResult(scm.getID(), TCMStatus.IGNORED)); add = false; break; } catch (TooManyFilesException tmfe) { results.add(new TCMResult(scm.getID(), TCMStatus.IGNORED)); add = false; break; } } return add; } private void writeConditionalMutations(Map<KeyExtent, List<ServerConditionalMutation>> updates, ArrayList<TCMResult> results, ConditionalSession sess) { Set<Entry<KeyExtent, List<ServerConditionalMutation>>> es = updates.entrySet(); Map<CommitSession, List<Mutation>> sendables = new HashMap<CommitSession, List<Mutation>>(); boolean sessionCanceled = sess.interruptFlag.get(); Span prepSpan = Trace.start("prep"); try { long t1 = System.currentTimeMillis(); for (Entry<KeyExtent, List<ServerConditionalMutation>> entry : es) { Tablet tablet = onlineTablets.get(entry.getKey()); if (tablet == null || tablet.isClosed() || sessionCanceled) { for 
(ServerConditionalMutation scm : entry.getValue()) results.add(new TCMResult(scm.getID(), TCMStatus.IGNORED)); } else { try { @SuppressWarnings("unchecked") List<Mutation> mutations = (List<Mutation>) (List<? extends Mutation>) entry.getValue(); if (mutations.size() > 0) { CommitSession cs = tablet.prepareMutationsForCommit( new TservConstraintEnv(security, sess.credentials), mutations); if (cs == null) { for (ServerConditionalMutation scm : entry.getValue()) results.add(new TCMResult(scm.getID(), TCMStatus.IGNORED)); } else { for (ServerConditionalMutation scm : entry.getValue()) results.add(new TCMResult(scm.getID(), TCMStatus.ACCEPTED)); sendables.put(cs, mutations); } } } catch (TConstraintViolationException e) { if (e.getNonViolators().size() > 0) { sendables.put(e.getCommitSession(), e.getNonViolators()); for (Mutation m : e.getNonViolators()) results.add(new TCMResult(((ServerConditionalMutation) m).getID(), TCMStatus.ACCEPTED)); } for (Mutation m : e.getViolators()) results.add( new TCMResult(((ServerConditionalMutation) m).getID(), TCMStatus.VIOLATED)); } } } long t2 = System.currentTimeMillis(); updateAvgPrepTime(t2 - t1, es.size()); } finally { prepSpan.stop(); } Span walSpan = Trace.start("wal"); try { while (true && sendables.size() > 0) { try { long t1 = System.currentTimeMillis(); logger.logManyTablets(sendables); long t2 = System.currentTimeMillis(); updateWalogWriteTime(t2 - t1); break; } catch (IOException ex) { log.warn("logging mutations failed, retrying"); } catch (FSError ex) { // happens when DFS is localFS log.warn("logging mutations failed, retrying"); } catch (Throwable t) { log.error( "Unknown exception logging mutations, counts for mutations in flight not decremented!", t); throw new RuntimeException(t); } } } finally { walSpan.stop(); } Span commitSpan = Trace.start("commit"); try { long t1 = System.currentTimeMillis(); for (Entry<CommitSession, ? 
extends List<Mutation>> entry : sendables.entrySet()) { CommitSession commitSession = entry.getKey(); List<Mutation> mutations = entry.getValue(); commitSession.commit(mutations); } long t2 = System.currentTimeMillis(); updateAvgCommitTime(t2 - t1, sendables.size()); } finally { commitSpan.stop(); } } private Map<KeyExtent, List<ServerConditionalMutation>> conditionalUpdate(ConditionalSession cs, Map<KeyExtent, List<ServerConditionalMutation>> updates, ArrayList<TCMResult> results, List<String> symbols) throws IOException { // sort each list of mutations, this is done to avoid deadlock and doing seeks in order is more efficient and detect duplicate rows. ConditionalMutationSet.sortConditionalMutations(updates); Map<KeyExtent, List<ServerConditionalMutation>> deferred = new HashMap<KeyExtent, List<ServerConditionalMutation>>(); // can not process two mutations for the same row, because one will not see what the other writes ConditionalMutationSet.deferDuplicatesRows(updates, deferred); // get as many locks as possible w/o blocking... 
defer any rows that are locked List<RowLock> locks = rowLocks.acquireRowlocks(updates, deferred); try { Span checkSpan = Trace.start("Check conditions"); try { checkConditions(updates, results, cs, symbols); } finally { checkSpan.stop(); } Span updateSpan = Trace.start("apply conditional mutations"); try { writeConditionalMutations(updates, results, cs); } finally { updateSpan.stop(); } } finally { rowLocks.releaseRowLocks(locks); } return deferred; } @Override public TConditionalSession startConditionalUpdate(TInfo tinfo, TCredentials credentials, List<ByteBuffer> authorizations, String tableID) throws ThriftSecurityException, TException { Authorizations userauths = null; if (!security.canConditionallyUpdate(credentials, tableID, authorizations)) throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.PERMISSION_DENIED); userauths = security.getUserAuthorizations(credentials); for (ByteBuffer auth : authorizations) if (!userauths.contains(ByteBufferUtil.toBytes(auth))) throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.BAD_AUTHORIZATIONS); ConditionalSession cs = new ConditionalSession(); cs.auths = new Authorizations(authorizations); cs.credentials = credentials; cs.tableId = tableID; cs.interruptFlag = new AtomicBoolean(); long sid = sessionManager.createSession(cs, false); return new TConditionalSession(sid, lockID, sessionManager.getMaxIdleTime()); } @Override public List<TCMResult> conditionalUpdate(TInfo tinfo, long sessID, Map<TKeyExtent, List<TConditionalMutation>> mutations, List<String> symbols) throws NoSuchScanIDException, TException { ConditionalSession cs = (ConditionalSession) sessionManager.reserveSession(sessID); if (cs == null || cs.interruptFlag.get()) throw new NoSuchScanIDException(); if (!cs.tableId.equals(MetadataTable.ID) && !cs.tableId.equals(RootTable.ID)) TabletServer.this.resourceManager.waitUntilCommitsAreEnabled(); Text tid = new Text(cs.tableId); long opid = 
writeTracker.startWrite(TabletType.type(new KeyExtent(tid, null, null))); try { Map<KeyExtent, List<ServerConditionalMutation>> updates = Translator.translate(mutations, Translator.TKET, new Translator.ListTranslator<TConditionalMutation, ServerConditionalMutation>( ServerConditionalMutation.TCMT)); for (KeyExtent ke : updates.keySet()) if (!ke.getTableId().equals(tid)) throw new IllegalArgumentException("Unexpected table id " + tid + " != " + ke.getTableId()); ArrayList<TCMResult> results = new ArrayList<TCMResult>(); Map<KeyExtent, List<ServerConditionalMutation>> deferred = conditionalUpdate(cs, updates, results, symbols); while (deferred.size() > 0) { deferred = conditionalUpdate(cs, deferred, results, symbols); } return results; } catch (IOException ioe) { throw new TException(ioe); } finally { writeTracker.finishWrite(opid); sessionManager.unreserveSession(sessID); } } @Override public void invalidateConditionalUpdate(TInfo tinfo, long sessID) throws TException { // this method should wait for any running conditional update to complete // after this method returns a conditional update should not be able to start ConditionalSession cs = (ConditionalSession) sessionManager.getSession(sessID); if (cs != null) cs.interruptFlag.set(true); cs = (ConditionalSession) sessionManager.reserveSession(sessID, true); if (cs != null) sessionManager.removeSession(sessID, true); } @Override public void closeConditionalUpdate(TInfo tinfo, long sessID) throws TException { sessionManager.removeSession(sessID, false); } @Override public void splitTablet(TInfo tinfo, TCredentials credentials, TKeyExtent tkeyExtent, ByteBuffer splitPoint) throws NotServingTabletException, ThriftSecurityException { String tableId = new String(ByteBufferUtil.toBytes(tkeyExtent.table)); if (!security.canSplitTablet(credentials, tableId)) throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.PERMISSION_DENIED); KeyExtent keyExtent = new KeyExtent(tkeyExtent); Tablet tablet = 
onlineTablets.get(keyExtent); if (tablet == null) { throw new NotServingTabletException(tkeyExtent); } if (keyExtent.getEndRow() == null || !keyExtent.getEndRow().equals(ByteBufferUtil.toText(splitPoint))) { try { if (TabletServer.this.splitTablet(tablet, ByteBufferUtil.toBytes(splitPoint)) == null) { throw new NotServingTabletException(tkeyExtent); } } catch (IOException e) { log.warn("Failed to split " + keyExtent, e); throw new RuntimeException(e); } } } @Override public TabletServerStatus getTabletServerStatus(TInfo tinfo, TCredentials credentials) throws ThriftSecurityException, TException { return getStats(sessionManager.getActiveScansPerTable()); } @Override public List<TabletStats> getTabletStats(TInfo tinfo, TCredentials credentials, String tableId) throws ThriftSecurityException, TException { TreeMap<KeyExtent, Tablet> onlineTabletsCopy; synchronized (onlineTablets) { onlineTabletsCopy = new TreeMap<KeyExtent, Tablet>(onlineTablets); } List<TabletStats> result = new ArrayList<TabletStats>(); Text text = new Text(tableId); KeyExtent start = new KeyExtent(text, new Text(), null); for (Entry<KeyExtent, Tablet> entry : onlineTabletsCopy.tailMap(start).entrySet()) { KeyExtent ke = entry.getKey(); if (ke.getTableId().compareTo(text) == 0) { Tablet tablet = entry.getValue(); TabletStats stats = tablet.timer.getTabletStats(); stats.extent = ke.toThrift(); stats.ingestRate = tablet.ingestRate(); stats.queryRate = tablet.queryRate(); stats.splitCreationTime = tablet.getSplitCreationTime(); stats.numEntries = tablet.getNumEntries(); result.add(stats); } } return result; } private ZooCache masterLockCache = new ZooCache(); private void checkPermission(TCredentials credentials, String lock, final String request) throws ThriftSecurityException { boolean fatal = false; try { log.debug("Got " + request + " message from user: " + credentials.getPrincipal()); if (!security.canPerformSystemActions(credentials)) { log.warn("Got " + request + " message from user: " + 
credentials.getPrincipal()); throw new ThriftSecurityException(credentials.getPrincipal(), SecurityErrorCode.PERMISSION_DENIED); } } catch (ThriftSecurityException e) { log.warn("Got " + request + " message from unauthenticatable user: " + e.getUser()); if (SystemCredentials.get().getToken().getClass().getName() .equals(credentials.getTokenClassName())) { log.fatal( "Got message from a service with a mismatched configuration. Please ensure a compatible configuration.", e); fatal = true; } throw e; } finally { if (fatal) { Halt.halt(1, new Runnable() { @Override public void run() { logGCInfo(getSystemConfiguration()); } }); } } if (tabletServerLock == null || !tabletServerLock.wasLockAcquired()) { log.warn("Got " + request + " message from master before lock acquired, ignoring..."); throw new RuntimeException("Lock not acquired"); } if (tabletServerLock != null && tabletServerLock.wasLockAcquired() && !tabletServerLock.isLocked()) { Halt.halt(1, new Runnable() { @Override public void run() { log.info("Tablet server no longer holds lock during checkPermission() : " + request + ", exiting"); logGCInfo(getSystemConfiguration()); } }); } if (lock != null) { ZooUtil.LockID lid = new ZooUtil.LockID(ZooUtil.getRoot(instance) + Constants.ZMASTER_LOCK, lock); try { if (!ZooLock.isLockHeld(masterLockCache, lid)) { // maybe the cache is out of date and a new master holds the // lock? 
masterLockCache.clear(); if (!ZooLock.isLockHeld(masterLockCache, lid)) { log.warn("Got " + request + " message from a master that does not hold the current lock " + lock); throw new RuntimeException("bad master lock"); } } } catch (Exception e) { throw new RuntimeException("bad master lock", e); } } } @Override public void loadTablet(TInfo tinfo, TCredentials credentials, String lock, final TKeyExtent textent) { try { checkPermission(credentials, lock, "loadTablet"); } catch (ThriftSecurityException e) { log.error(e, e); throw new RuntimeException(e); } final KeyExtent extent = new KeyExtent(textent); synchronized (unopenedTablets) { synchronized (openingTablets) { synchronized (onlineTablets) { // checking if this exact tablet is in any of the sets // below is not a strong enough check // when splits and fix splits occurring Set<KeyExtent> unopenedOverlapping = KeyExtent.findOverlapping(extent, unopenedTablets); Set<KeyExtent> openingOverlapping = KeyExtent.findOverlapping(extent, openingTablets); Set<KeyExtent> onlineOverlapping = KeyExtent.findOverlapping(extent, onlineTablets); Set<KeyExtent> all = new HashSet<KeyExtent>(); all.addAll(unopenedOverlapping); all.addAll(openingOverlapping); all.addAll(onlineOverlapping); if (!all.isEmpty()) { // ignore any tablets that have recently split, for error logging for (KeyExtent e2 : onlineOverlapping) { Tablet tablet = onlineTablets.get(e2); if (System.currentTimeMillis() - tablet.getSplitCreationTime() < RECENTLY_SPLIT_MILLIES) { all.remove(e2); } } // ignore self, for error logging all.remove(extent); if (all.size() > 0) { log.error( "Tablet " + extent + " overlaps previously assigned " + unopenedOverlapping + " " + openingOverlapping + " " + onlineOverlapping + " " + all); } return; } unopenedTablets.add(extent); } } } // add the assignment job to the appropriate queue log.info("Loading tablet " + extent); final Runnable ah = new LoggingRunnable(log, new AssignmentHandler(extent)); // Root tablet assignment must 
take place immediately if (extent.isRootTablet()) { new Daemon("Root Tablet Assignment") { @Override public void run() { ah.run(); if (onlineTablets.containsKey(extent)) { log.info("Root tablet loaded: " + extent); } else { log.info("Root tablet failed to load"); } } }.start(); } else { if (extent.isMeta()) { resourceManager.addMetaDataAssignment(ah); } else { resourceManager.addAssignment(ah); } } } @Override public void unloadTablet(TInfo tinfo, TCredentials credentials, String lock, TKeyExtent textent, boolean save) { try { checkPermission(credentials, lock, "unloadTablet"); } catch (ThriftSecurityException e) { log.error(e, e); throw new RuntimeException(e); } KeyExtent extent = new KeyExtent(textent); resourceManager.addMigration(extent, new LoggingRunnable(log, new UnloadTabletHandler(extent, save))); } @Override public void flush(TInfo tinfo, TCredentials credentials, String lock, String tableId, ByteBuffer startRow, ByteBuffer endRow) { try { checkPermission(credentials, lock, "flush"); } catch (ThriftSecurityException e) { log.error(e, e); throw new RuntimeException(e); } ArrayList<Tablet> tabletsToFlush = new ArrayList<Tablet>(); KeyExtent ke = new KeyExtent(new Text(tableId), ByteBufferUtil.toText(endRow), ByteBufferUtil.toText(startRow)); synchronized (onlineTablets) { for (Tablet tablet : onlineTablets.values()) if (ke.overlaps(tablet.getExtent())) tabletsToFlush.add(tablet); } Long flushID = null; for (Tablet tablet : tabletsToFlush) { if (flushID == null) { // read the flush id once from zookeeper instead of reading // it for each tablet try { flushID = tablet.getFlushID(); } catch (NoNodeException e) { // table was probably deleted log.info("Asked to flush table that has no flush id " + ke + " " + e.getMessage()); return; } } tablet.flush(flushID); } } @Override public void flushTablet(TInfo tinfo, TCredentials credentials, String lock, TKeyExtent textent) throws TException { try { checkPermission(credentials, lock, "flushTablet"); } catch 
(ThriftSecurityException e) { log.error(e, e); throw new RuntimeException(e); } Tablet tablet = onlineTablets.get(new KeyExtent(textent)); if (tablet != null) { log.info("Flushing " + tablet.getExtent()); try { tablet.flush(tablet.getFlushID()); } catch (NoNodeException nne) { log.info("Asked to flush tablet that has no flush id " + new KeyExtent(textent) + " " + nne.getMessage()); } } } @Override public void halt(TInfo tinfo, TCredentials credentials, String lock) throws ThriftSecurityException { checkPermission(credentials, lock, "halt"); Halt.halt(0, new Runnable() { @Override public void run() { log.info("Master requested tablet server halt"); logGCInfo(getSystemConfiguration()); serverStopRequested = true; try { tabletServerLock.unlock(); } catch (Exception e) { log.error(e, e); } } }); } @Override public void fastHalt(TInfo info, TCredentials credentials, String lock) { try { halt(info, credentials, lock); } catch (Exception e) { log.warn("Error halting", e); } } @Override public TabletStats getHistoricalStats(TInfo tinfo, TCredentials credentials) throws ThriftSecurityException, TException { return statsKeeper.getTabletStats(); } @Override public List<ActiveScan> getActiveScans(TInfo tinfo, TCredentials credentials) throws ThriftSecurityException, TException { try { checkPermission(credentials, null, "getScans"); } catch (ThriftSecurityException e) { log.error(e, e); throw new RuntimeException(e); } return sessionManager.getActiveScans(); } @Override public void chop(TInfo tinfo, TCredentials credentials, String lock, TKeyExtent textent) throws TException { try { checkPermission(credentials, lock, "chop"); } catch (ThriftSecurityException e) { log.error(e, e); throw new RuntimeException(e); } KeyExtent ke = new KeyExtent(textent); Tablet tablet = onlineTablets.get(ke); if (tablet != null) { tablet.chopFiles(); } } @Override public void compact(TInfo tinfo, TCredentials credentials, String lock, String tableId, ByteBuffer startRow, ByteBuffer endRow) throws 
TException { try { checkPermission(credentials, lock, "compact"); } catch (ThriftSecurityException e) { log.error(e, e); throw new RuntimeException(e); } KeyExtent ke = new KeyExtent(new Text(tableId), ByteBufferUtil.toText(endRow), ByteBufferUtil.toText(startRow)); ArrayList<Tablet> tabletsToCompact = new ArrayList<Tablet>(); synchronized (onlineTablets) { for (Tablet tablet : onlineTablets.values()) if (ke.overlaps(tablet.getExtent())) tabletsToCompact.add(tablet); } Long compactionId = null; for (Tablet tablet : tabletsToCompact) { // all for the same table id, so only need to read // compaction id once if (compactionId == null) try { compactionId = tablet.getCompactionID().getFirst(); } catch (NoNodeException e) { log.info("Asked to compact table with no compaction id " + ke + " " + e.getMessage()); return; } tablet.compactAll(compactionId); } } @Override public void removeLogs(TInfo tinfo, TCredentials credentials, List<String> filenames) throws TException { String myname = getClientAddressString(); myname = myname.replace(':', '+'); Set<String> loggers = new HashSet<String>(); logger.getLoggers(loggers); nextFile: for (String filename : filenames) { // skip any log we're currently using for (String logger : loggers) { if (logger.contains(filename)) continue nextFile; } List<Tablet> onlineTabletsCopy = new ArrayList<Tablet>(); synchronized (onlineTablets) { onlineTabletsCopy.addAll(onlineTablets.values()); } for (Tablet tablet : onlineTabletsCopy) { for (String current : tablet.getCurrentLogs()) { if (current.contains(filename)) { log.info("Attempted to delete " + filename + " from tablet " + tablet.getExtent()); continue nextFile; } } } try { Path source = new Path(filename); if (acuConf.getBoolean(Property.TSERV_ARCHIVE_WALOGS)) { Path walogArchive = fs.matchingFileSystem(source, ServerConstants.getWalogArchives()); fs.mkdirs(walogArchive); Path dest = new Path(walogArchive, source.getName()); log.info("Archiving walog " + source + " to " + dest); if 
(!fs.rename(source, dest)) log.error("rename is unsuccessful"); } else { log.info("Deleting walog " + filename); Path sourcePath = new Path(filename); if (!(!acuConf.getBoolean(Property.GC_TRASH_IGNORE) && fs.moveToTrash(sourcePath)) && !fs.deleteRecursively(sourcePath)) log.warn("Failed to delete walog " + source); for (String recovery : ServerConstants.getRecoveryDirs()) { Path recoveryPath = new Path(recovery, source.getName()); try { if (fs.moveToTrash(recoveryPath) || fs.deleteRecursively(recoveryPath)) log.info("Deleted any recovery log " + filename); } catch (FileNotFoundException ex) { // ignore } } } } catch (IOException e) { log.warn("Error attempting to delete write-ahead log " + filename + ": " + e); } } } @Override public List<ActiveCompaction> getActiveCompactions(TInfo tinfo, TCredentials credentials) throws ThriftSecurityException, TException { try { checkPermission(credentials, null, "getActiveCompactions"); } catch (ThriftSecurityException e) { log.error(e, e); throw new RuntimeException(e); } List<CompactionInfo> compactions = Compactor.getRunningCompactions(); List<ActiveCompaction> ret = new ArrayList<ActiveCompaction>(compactions.size()); for (CompactionInfo compactionInfo : compactions) { ret.add(compactionInfo.toThrift()); } return ret; } } private class SplitRunner implements Runnable { private Tablet tablet; public SplitRunner(Tablet tablet) { this.tablet = tablet; } @Override public void run() { if (majorCompactorDisabled) { // this will make split task that were queued when shutdown was // initiated exit return; } splitTablet(tablet); } } boolean isMajorCompactionDisabled() { return majorCompactorDisabled; } void executeSplit(Tablet tablet) { resourceManager.executeSplit(tablet.getExtent(), new LoggingRunnable(log, new SplitRunner(tablet))); } private class MajorCompactor implements Runnable { public MajorCompactor(AccumuloConfiguration config) { CompactionWatcher.startWatching(config); } @Override public void run() { while 
(!majorCompactorDisabled) { try { UtilWaitThread.sleep(getSystemConfiguration().getTimeInMillis(Property.TSERV_MAJC_DELAY)); TreeMap<KeyExtent, Tablet> copyOnlineTablets = new TreeMap<KeyExtent, Tablet>(); synchronized (onlineTablets) { copyOnlineTablets.putAll(onlineTablets); // avoid // concurrent // modification } int numMajorCompactionsInProgress = 0; Iterator<Entry<KeyExtent, Tablet>> iter = copyOnlineTablets.entrySet().iterator(); // bail early now if we're shutting down while (iter.hasNext() && !majorCompactorDisabled) { Entry<KeyExtent, Tablet> entry = iter.next(); Tablet tablet = entry.getValue(); // if we need to split AND compact, we need a good way // to decide what to do if (tablet.needsSplit()) { executeSplit(tablet); continue; } int maxLogEntriesPerTablet = getTableConfiguration(tablet.getExtent()) .getCount(Property.TABLE_MINC_LOGS_MAX); if (tablet.getLogCount() >= maxLogEntriesPerTablet) { log.debug("Initiating minor compaction for " + tablet.getExtent() + " because it has " + tablet.getLogCount() + " write ahead logs"); tablet.initiateMinorCompaction(MinorCompactionReason.SYSTEM); } synchronized (tablet) { if (tablet.initiateMajorCompaction(MajorCompactionReason.NORMAL) || tablet.majorCompactionQueued() || tablet.majorCompactionRunning()) { numMajorCompactionsInProgress++; continue; } } } int idleCompactionsToStart = Math.max(1, getSystemConfiguration().getCount(Property.TSERV_MAJC_MAXCONCURRENT) / 2); if (numMajorCompactionsInProgress < idleCompactionsToStart) { // system is not major compacting, can schedule some // idle compactions iter = copyOnlineTablets.entrySet().iterator(); while (iter.hasNext() && !majorCompactorDisabled && numMajorCompactionsInProgress < idleCompactionsToStart) { Entry<KeyExtent, Tablet> entry = iter.next(); Tablet tablet = entry.getValue(); if (tablet.initiateMajorCompaction(MajorCompactionReason.IDLE)) { numMajorCompactionsInProgress++; } } } } catch (Throwable t) { log.error("Unexpected exception in " + 
Thread.currentThread().getName(), t); UtilWaitThread.sleep(1000); } } } } private void splitTablet(Tablet tablet) { try { TreeMap<KeyExtent, SplitInfo> tabletInfo = splitTablet(tablet, null); if (tabletInfo == null) { // either split or compact not both // were not able to split... so see if a major compaction is // needed tablet.initiateMajorCompaction(MajorCompactionReason.NORMAL); } } catch (IOException e) { statsKeeper.updateTime(Operation.SPLIT, 0, 0, true); log.error("split failed: " + e.getMessage() + " for tablet " + tablet.getExtent(), e); } catch (Exception e) { statsKeeper.updateTime(Operation.SPLIT, 0, 0, true); log.error("Unknown error on split: " + e, e); } } private TreeMap<KeyExtent, SplitInfo> splitTablet(Tablet tablet, byte[] splitPoint) throws IOException { long t1 = System.currentTimeMillis(); TreeMap<KeyExtent, SplitInfo> tabletInfo = tablet.split(splitPoint); if (tabletInfo == null) { return null; } log.info("Starting split: " + tablet.getExtent()); statsKeeper.incrementStatusSplit(); long start = System.currentTimeMillis(); Tablet[] newTablets = new Tablet[2]; Entry<KeyExtent, SplitInfo> first = tabletInfo.firstEntry(); newTablets[0] = new Tablet(TabletServer.this, new Text(first.getValue().dir), first.getKey(), resourceManager.createTabletResourceManager(), first.getValue().datafiles, first.getValue().time, first.getValue().initFlushID, first.getValue().initCompactID); Entry<KeyExtent, SplitInfo> last = tabletInfo.lastEntry(); newTablets[1] = new Tablet(TabletServer.this, new Text(last.getValue().dir), last.getKey(), resourceManager.createTabletResourceManager(), last.getValue().datafiles, last.getValue().time, last.getValue().initFlushID, last.getValue().initCompactID); // roll tablet stats over into tablet server's statsKeeper object as // historical data statsKeeper.saveMinorTimes(tablet.timer); statsKeeper.saveMajorTimes(tablet.timer); // lose the reference to the old tablet and open two new ones synchronized (onlineTablets) { 
onlineTablets.remove(tablet.getExtent()); onlineTablets.put(newTablets[0].getExtent(), newTablets[0]); onlineTablets.put(newTablets[1].getExtent(), newTablets[1]); } // tell the master enqueueMasterMessage(new SplitReportMessage(tablet.getExtent(), newTablets[0].getExtent(), new Text("/" + newTablets[0].getLocation().getName()), newTablets[1].getExtent(), new Text("/" + newTablets[1].getLocation().getName()))); statsKeeper.updateTime(Operation.SPLIT, start, 0, false); long t2 = System.currentTimeMillis(); log.info("Tablet split: " + tablet.getExtent() + " size0 " + newTablets[0].estimateTabletSize() + " size1 " + newTablets[1].estimateTabletSize() + " time " + (t2 - t1) + "ms"); return tabletInfo; } public long lastPingTime = System.currentTimeMillis(); public Socket currentMaster; // a queue to hold messages that are to be sent back to the master private BlockingDeque<MasterMessage> masterMessages = new LinkedBlockingDeque<MasterMessage>(); // add a message for the main thread to send back to the master void enqueueMasterMessage(MasterMessage m) { masterMessages.addLast(m); } private class UnloadTabletHandler implements Runnable { private KeyExtent extent; private boolean saveState; public UnloadTabletHandler(KeyExtent extent, boolean saveState) { this.extent = extent; this.saveState = saveState; } @Override public void run() { Tablet t = null; synchronized (unopenedTablets) { if (unopenedTablets.contains(extent)) { unopenedTablets.remove(extent); // enqueueMasterMessage(new TabletUnloadedMessage(extent)); return; } } synchronized (openingTablets) { while (openingTablets.contains(extent)) { try { openingTablets.wait(); } catch (InterruptedException e) { } } } synchronized (onlineTablets) { if (onlineTablets.containsKey(extent)) { t = onlineTablets.get(extent); } } if (t == null) { // Tablet has probably been recently unloaded: repeated master // unload request is crossing the successful unloaded message if (!recentlyUnloadedCache.containsKey(extent)) { 
log.info("told to unload tablet that was not being served " + extent); enqueueMasterMessage( new TabletStatusMessage(TabletLoadState.UNLOAD_FAILURE_NOT_SERVING, extent)); } return; } try { t.close(saveState); } catch (Throwable e) { if ((t.isClosing() || t.isClosed()) && e instanceof IllegalStateException) { log.debug("Failed to unload tablet " + extent + "... it was alread closing or closed : " + e.getMessage()); } else { log.error("Failed to close tablet " + extent + "... Aborting migration", e); enqueueMasterMessage(new TabletStatusMessage(TabletLoadState.UNLOAD_ERROR, extent)); } return; } // stop serving tablet - client will get not serving tablet // exceptions recentlyUnloadedCache.put(extent, System.currentTimeMillis()); onlineTablets.remove(extent); try { TServerInstance instance = new TServerInstance(clientAddress, getLock().getSessionId()); TabletLocationState tls = null; try { tls = new TabletLocationState(extent, null, instance, null, null, false); } catch (BadLocationStateException e) { log.error("Unexpected error ", e); } log.debug("Unassigning " + tls); TabletStateStore.unassign(tls); } catch (DistributedStoreException ex) { log.warn("Unable to update storage", ex); } catch (KeeperException e) { log.warn("Unable determine our zookeeper session information", e); } catch (InterruptedException e) { log.warn("Interrupted while getting our zookeeper session information", e); } // tell the master how it went enqueueMasterMessage(new TabletStatusMessage(TabletLoadState.UNLOADED, extent)); // roll tablet stats over into tablet server's statsKeeper object as // historical data statsKeeper.saveMinorTimes(t.timer); statsKeeper.saveMajorTimes(t.timer); log.info("unloaded " + extent); } } private class AssignmentHandler implements Runnable { private KeyExtent extent; private int retryAttempt = 0; public AssignmentHandler(KeyExtent extent) { this.extent = extent; } public AssignmentHandler(KeyExtent extent, int retryAttempt) { this(extent); this.retryAttempt = 
retryAttempt; } @Override public void run() { log.info(clientAddress + ": got assignment from master: " + extent); synchronized (unopenedTablets) { synchronized (openingTablets) { synchronized (onlineTablets) { // nothing should be moving between sets, do a sanity // check Set<KeyExtent> unopenedOverlapping = KeyExtent.findOverlapping(extent, unopenedTablets); Set<KeyExtent> openingOverlapping = KeyExtent.findOverlapping(extent, openingTablets); Set<KeyExtent> onlineOverlapping = KeyExtent.findOverlapping(extent, onlineTablets); if (openingOverlapping.contains(extent) || onlineOverlapping.contains(extent)) return; if (!unopenedTablets.contains(extent) || unopenedOverlapping.size() != 1 || openingOverlapping.size() > 0 || onlineOverlapping.size() > 0) { throw new IllegalStateException("overlaps assigned " + extent + " " + !unopenedTablets.contains(extent) + " " + unopenedOverlapping + " " + openingOverlapping + " " + onlineOverlapping); } } unopenedTablets.remove(extent); openingTablets.add(extent); } } log.debug("Loading extent: " + extent); // check Metadata table before accepting assignment Text locationToOpen = null; SortedMap<Key, Value> tabletsKeyValues = new TreeMap<Key, Value>(); try { Pair<Text, KeyExtent> pair = verifyTabletInformation(extent, TabletServer.this.getTabletSession(), tabletsKeyValues, getClientAddressString(), getLock()); locationToOpen = pair.getFirst(); if (pair.getSecond() != null) { synchronized (openingTablets) { openingTablets.remove(extent); openingTablets.notifyAll(); // it expected that the new extent will overlap the old one... if it does not, it should not be added to unopenedTablets if (!KeyExtent .findOverlapping(extent, new TreeSet<KeyExtent>(Arrays.asList(pair.getSecond()))) .contains(pair.getSecond())) { throw new IllegalStateException( "Fixed split does not overlap " + extent + " " + pair.getSecond()); } unopenedTablets.add(pair.getSecond()); } // split was rolled back... 
try again new AssignmentHandler(pair.getSecond()).run(); return; } } catch (Exception e) { synchronized (openingTablets) { openingTablets.remove(extent); openingTablets.notifyAll(); } log.warn("Failed to verify tablet " + extent, e); enqueueMasterMessage(new TabletStatusMessage(TabletLoadState.LOAD_FAILURE, extent)); throw new RuntimeException(e); } if (locationToOpen == null) { log.debug( "Reporting tablet " + extent + " assignment failure: unable to verify Tablet Information"); synchronized (openingTablets) { openingTablets.remove(extent); openingTablets.notifyAll(); } enqueueMasterMessage(new TabletStatusMessage(TabletLoadState.LOAD_FAILURE, extent)); return; } Tablet tablet = null; boolean successful = false; try { TabletResourceManager trm = resourceManager.createTabletResourceManager(); // this opens the tablet file and fills in the endKey in the // extent tablet = new Tablet(TabletServer.this, locationToOpen, extent, trm, tabletsKeyValues); /* * If a minor compaction starts after a tablet opens, this indicates a log recovery occurred. This recovered data must be minor compacted. * * There are three reasons to wait for this minor compaction to finish before placing the tablet in online tablets. * * 1) The log recovery code does not handle data written to the tablet on multiple tablet servers. 2) The log recovery code does not block if memory is * full. Therefore recovering lots of tablets that use a lot of memory could run out of memory. 3) The minor compaction finish event did not make it to * the logs (the file will be in !METADATA, preventing replay of compacted data)... but do not want a majc to wipe the file out from !METADATA and then * have another process failure... 
this could cause duplicate data to replay */ if (tablet.getNumEntriesInMemory() > 0 && !tablet.minorCompactNow(MinorCompactionReason.SYSTEM)) { throw new RuntimeException("Minor compaction after recovery fails for " + extent); } Assignment assignment = new Assignment(extent, getTabletSession()); TabletStateStore.setLocation(assignment); synchronized (openingTablets) { synchronized (onlineTablets) { openingTablets.remove(extent); onlineTablets.put(extent, tablet); openingTablets.notifyAll(); recentlyUnloadedCache.remove(tablet); } } tablet = null; // release this reference successful = true; } catch (Throwable e) { log.warn("exception trying to assign tablet " + extent + " " + locationToOpen, e); if (e.getMessage() != null) log.warn(e.getMessage()); String table = extent.getTableId().toString(); ProblemReports.getInstance().report(new ProblemReport(table, TABLET_LOAD, extent.getUUID().toString(), getClientAddressString(), e)); } if (!successful) { synchronized (unopenedTablets) { synchronized (openingTablets) { openingTablets.remove(extent); unopenedTablets.add(extent); openingTablets.notifyAll(); } } log.warn("failed to open tablet " + extent + " reporting failure to master"); enqueueMasterMessage(new TabletStatusMessage(TabletLoadState.LOAD_FAILURE, extent)); long reschedule = Math.min((1l << Math.min(32, retryAttempt)) * 1000, 10 * 60 * 1000l); log.warn(String.format("rescheduling tablet load in %.2f seconds", reschedule / 1000.)); SimpleTimer.getInstance().schedule(new TimerTask() { @Override public void run() { log.info("adding tablet " + extent + " back to the assignment pool (retry " + retryAttempt + ")"); AssignmentHandler handler = new AssignmentHandler(extent, retryAttempt + 1); if (extent.isMeta()) { if (extent.isRootTablet()) { new Daemon(new LoggingRunnable(log, handler), "Root tablet assignment retry") .start(); } else { resourceManager.addMetaDataAssignment(handler); } } else { resourceManager.addAssignment(handler); } } }, reschedule); } else { 
enqueueMasterMessage(new TabletStatusMessage(TabletLoadState.LOADED, extent)); } } } private VolumeManager fs; private Instance instance; private final SortedMap<KeyExtent, Tablet> onlineTablets = Collections .synchronizedSortedMap(new TreeMap<KeyExtent, Tablet>()); private final SortedSet<KeyExtent> unopenedTablets = Collections .synchronizedSortedSet(new TreeSet<KeyExtent>()); private final SortedSet<KeyExtent> openingTablets = Collections.synchronizedSortedSet(new TreeSet<KeyExtent>()); @SuppressWarnings("unchecked") private final Map<KeyExtent, Long> recentlyUnloadedCache = Collections.synchronizedMap(new LRUMap(1000)); private Thread majorCompactorThread; // used for stopping the server and MasterListener thread private volatile boolean serverStopRequested = false; private HostAndPort clientAddress; private TabletServerResourceManager resourceManager; private SecurityOperation security; private volatile boolean majorCompactorDisabled = false; private volatile boolean shutdownComplete = false; private ZooLock tabletServerLock; private TServer server; private DistributedWorkQueue bulkFailedCopyQ; private String lockID; private static final String METRICS_PREFIX = "tserver"; private static ObjectName OBJECT_NAME = null; static AtomicLong seekCount = new AtomicLong(0); public TabletStatsKeeper getStatsKeeper() { return statsKeeper; } public void addLoggersToMetadata(List<DfsLogger> logs, KeyExtent extent, int id) { log.info("Adding " + logs.size() + " logs for extent " + extent + " as alias " + id); long now = RelativeTime.currentTimeMillis(); List<String> logSet = new ArrayList<String>(); for (DfsLogger log : logs) logSet.add(log.toString()); MetadataTableUtil.LogEntry entry = new MetadataTableUtil.LogEntry(); entry.extent = extent; entry.tabletId = id; entry.timestamp = now; entry.server = logs.get(0).getLogger(); entry.filename = logs.get(0).getFileName(); entry.logSet = logSet; MetadataTableUtil.addLogEntry(SystemCredentials.get(), entry, getLock()); } private 
HostAndPort startServer(AccumuloConfiguration conf, String address, Property portHint, TProcessor processor, String threadName) throws UnknownHostException { Property maxMessageSizeProperty = (conf.get(Property.TSERV_MAX_MESSAGE_SIZE) != null ? Property.TSERV_MAX_MESSAGE_SIZE : Property.GENERAL_MAX_MESSAGE_SIZE); ServerAddress sp = TServerUtils.startServer(conf, address, portHint, processor, this.getClass().getSimpleName(), threadName, Property.TSERV_PORTSEARCH, Property.TSERV_MINTHREADS, Property.TSERV_THREADCHECK, maxMessageSizeProperty); this.server = sp.server; return sp.address; } private String getMasterAddress() { try { List<String> locations = instance.getMasterLocations(); if (locations.size() == 0) return null; return locations.get(0); } catch (Exception e) { log.warn("Failed to obtain master host " + e); } return null; } // Connect to the master for posting asynchronous results private MasterClientService.Client masterConnection(String address) { try { if (address == null) { return null; } MasterClientService.Client client = ThriftUtil.getClient(new MasterClientService.Client.Factory(), address, Property.GENERAL_RPC_TIMEOUT, getSystemConfiguration()); // log.info("Listener API to master has been opened"); return client; } catch (Exception e) { log.warn("Issue with masterConnection (" + address + ") " + e, e); } return null; } private void returnMasterConnection(MasterClientService.Client client) { ThriftUtil.returnClient(client); } private HostAndPort startTabletClientService() throws UnknownHostException { // start listening for client connection last Iface tch = TraceWrap.service(new ThriftClientHandler()); Processor<Iface> processor = new Processor<Iface>(tch); HostAndPort address = startServer(getSystemConfiguration(), clientAddress.getHostText(), Property.TSERV_CLIENTPORT, processor, "Thrift Client Server"); log.info("address = " + address); return address; } ZooLock getLock() { return tabletServerLock; } private void announceExistence() { 
IZooReaderWriter zoo = ZooReaderWriter.getInstance(); try { String zPath = ZooUtil.getRoot(instance) + Constants.ZTSERVERS + "/" + getClientAddressString(); zoo.putPersistentData(zPath, new byte[] {}, NodeExistsPolicy.SKIP); tabletServerLock = new ZooLock(zPath); LockWatcher lw = new LockWatcher() { @Override public void lostLock(final LockLossReason reason) { Halt.halt(0, new Runnable() { @Override public void run() { if (!serverStopRequested) log.fatal("Lost tablet server lock (reason = " + reason + "), exiting."); logGCInfo(getSystemConfiguration()); } }); } @Override public void unableToMonitorLockNode(final Throwable e) { Halt.halt(0, new Runnable() { @Override public void run() { log.fatal("Lost ability to monitor tablet server lock, exiting.", e); } }); } }; byte[] lockContent = new ServerServices(getClientAddressString(), Service.TSERV_CLIENT).toString() .getBytes(); for (int i = 0; i < 120 / 5; i++) { zoo.putPersistentData(zPath, new byte[0], NodeExistsPolicy.SKIP); if (tabletServerLock.tryLock(lw, lockContent)) { log.debug("Obtained tablet server lock " + tabletServerLock.getLockPath()); lockID = tabletServerLock.getLockID() .serialize(ZooUtil.getRoot(instance) + Constants.ZTSERVERS + "/"); return; } log.info("Waiting for tablet server lock"); UtilWaitThread.sleep(5000); } String msg = "Too many retries, exiting."; log.info(msg); throw new RuntimeException(msg); } catch (Exception e) { log.info("Could not obtain tablet server lock, exiting.", e); throw new RuntimeException(e); } } // main loop listens for client requests public void run() { SecurityUtil.serverLogin(); try { clientAddress = startTabletClientService(); } catch (UnknownHostException e1) { throw new RuntimeException("Failed to start the tablet client service", e1); } announceExistence(); ThreadPoolExecutor distWorkQThreadPool = new SimpleThreadPool( getSystemConfiguration().getCount(Property.TSERV_WORKQ_THREADS), "distributed work queue"); bulkFailedCopyQ = new 
DistributedWorkQueue(ZooUtil.getRoot(instance) + Constants.ZBULK_FAILED_COPYQ); try { bulkFailedCopyQ.startProcessing(new BulkFailedCopyProcessor(), distWorkQThreadPool); } catch (Exception e1) { throw new RuntimeException("Failed to start distributed work queue for copying ", e1); } try { logSorter.startWatchingForRecoveryLogs(distWorkQThreadPool); } catch (Exception ex) { log.error("Error setting watches for recoveries"); throw new RuntimeException(ex); } try { OBJECT_NAME = new ObjectName( "accumulo.server.metrics:service=TServerInfo,name=TabletServerMBean,instance=" + Thread.currentThread().getName()); // Do this because interface not in same package. StandardMBean mbean = new StandardMBean(this, TabletServerMBean.class, false); this.register(mbean); mincMetrics.register(); } catch (Exception e) { log.error("Error registering with JMX", e); } String masterHost; while (!serverStopRequested) { // send all of the pending messages try { MasterMessage mm = null; MasterClientService.Client iface = null; try { // wait until a message is ready to send, or a sever stop // was requested while (mm == null && !serverStopRequested) { mm = masterMessages.poll(1000, TimeUnit.MILLISECONDS); } // have a message to send to the master, so grab a // connection masterHost = getMasterAddress(); iface = masterConnection(masterHost); TServiceClient client = iface; // if while loop does not execute at all and mm != null, // then // finally block should place mm back on queue while (!serverStopRequested && mm != null && client != null && client.getOutputProtocol() != null && client.getOutputProtocol().getTransport() != null && client.getOutputProtocol().getTransport().isOpen()) { try { mm.send(SystemCredentials.get().toThrift(instance), getClientAddressString(), iface); mm = null; } catch (TException ex) { log.warn("Error sending message: queuing message again"); masterMessages.putFirst(mm); mm = null; throw ex; } // if any messages are immediately available grab em and // send them mm 
= masterMessages.poll(); } } finally { if (mm != null) { masterMessages.putFirst(mm); } returnMasterConnection(iface); UtilWaitThread.sleep(1000); } } catch (InterruptedException e) { log.info("Interrupt Exception received, shutting down"); serverStopRequested = true; } catch (Exception e) { // may have lost connection with master // loop back to the beginning and wait for a new one // this way we survive master failures log.error(getClientAddressString() + ": TServerInfo: Exception. Master down?", e); } } // wait for shutdown // if the main thread exits oldServer the master listener, the JVM will // kill the // other threads and finalize objects. We want the shutdown that is // running // in the master listener thread to complete oldServer this happens. // consider making other threads daemon threads so that objects don't // get prematurely finalized synchronized (this) { while (shutdownComplete == false) { try { this.wait(1000); } catch (InterruptedException e) { log.error(e.toString()); } } } log.debug("Stopping Thrift Servers"); TServerUtils.stopTServer(server); try { log.debug("Closing filesystem"); fs.close(); } catch (IOException e) { log.warn("Failed to close filesystem : " + e.getMessage(), e); } logGCInfo(getSystemConfiguration()); log.info("TServerInfo: stop requested. exiting ... 
"); try { tabletServerLock.unlock(); } catch (Exception e) { log.warn("Failed to release tablet server lock", e); } } private long totalMinorCompactions; private static Pair<Text, KeyExtent> verifyRootTablet(KeyExtent extent, TServerInstance instance) throws DistributedStoreException, AccumuloException { ZooTabletStateStore store = new ZooTabletStateStore(); if (!store.iterator().hasNext()) { throw new AccumuloException("Illegal state: location is not set in zookeeper"); } TabletLocationState next = store.iterator().next(); if (!instance.equals(next.future)) { throw new AccumuloException("Future location is not to this server for the root tablet"); } if (next.current != null) { throw new AccumuloException("Root tablet already has a location set"); } return new Pair<Text, KeyExtent>(new Text(RootTable.ROOT_TABLET_LOCATION), null); } public static Pair<Text, KeyExtent> verifyTabletInformation(KeyExtent extent, TServerInstance instance, SortedMap<Key, Value> tabletsKeyValues, String clientAddress, ZooLock lock) throws AccumuloSecurityException, DistributedStoreException, AccumuloException { log.debug("verifying extent " + extent); if (extent.isRootTablet()) { return verifyRootTablet(extent, instance); } String tableToVerify = MetadataTable.ID; if (extent.isMeta()) tableToVerify = RootTable.ID; List<ColumnFQ> columnsToFetch = Arrays .asList(new ColumnFQ[] { TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN, TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN, TabletsSection.TabletColumnFamily.SPLIT_RATIO_COLUMN, TabletsSection.TabletColumnFamily.OLD_PREV_ROW_COLUMN, TabletsSection.ServerColumnFamily.TIME_COLUMN }); ScannerImpl scanner = new ScannerImpl(HdfsZooInstance.getInstance(), SystemCredentials.get(), tableToVerify, Authorizations.EMPTY); scanner.setRange(extent.toMetadataRange()); TreeMap<Key, Value> tkv = new TreeMap<Key, Value>(); for (Entry<Key, Value> entry : scanner) tkv.put(entry.getKey(), entry.getValue()); // only populate map after success if 
(tabletsKeyValues == null) { tabletsKeyValues = tkv; } else { tabletsKeyValues.clear(); tabletsKeyValues.putAll(tkv); } Text metadataEntry = extent.getMetadataEntry(); Value dir = checkTabletMetadata(extent, instance, tabletsKeyValues, metadataEntry); if (dir == null) return null; Value oldPrevEndRow = null; for (Entry<Key, Value> entry : tabletsKeyValues.entrySet()) { if (TabletsSection.TabletColumnFamily.OLD_PREV_ROW_COLUMN.hasColumns(entry.getKey())) { oldPrevEndRow = entry.getValue(); } } if (oldPrevEndRow != null) { SortedMap<Text, SortedMap<ColumnFQ, Value>> tabletEntries; tabletEntries = MetadataTableUtil.getTabletEntries(tabletsKeyValues, columnsToFetch); KeyExtent fke; try { fke = MetadataTableUtil.fixSplit(metadataEntry, tabletEntries.get(metadataEntry), instance, SystemCredentials.get(), lock); } catch (IOException e) { log.error("Error fixing split " + metadataEntry); throw new AccumuloException(e.toString()); } if (!fke.equals(extent)) { return new Pair<Text, KeyExtent>(null, fke); } // reread and reverify metadata entries now that metadata entries were fixed tabletsKeyValues.clear(); return verifyTabletInformation(fke, instance, tabletsKeyValues, clientAddress, lock); } return new Pair<Text, KeyExtent>(new Text(dir.get()), null); } static Value checkTabletMetadata(KeyExtent extent, TServerInstance instance, SortedMap<Key, Value> tabletsKeyValues, Text metadataEntry) throws AccumuloException { TServerInstance future = null; Value prevEndRow = null; Value dir = null; Value time = null; for (Entry<Key, Value> entry : tabletsKeyValues.entrySet()) { Key key = entry.getKey(); if (!metadataEntry.equals(key.getRow())) { log.info("Unexpected row in tablet metadata " + metadataEntry + " " + key.getRow()); return null; } Text cf = key.getColumnFamily(); if (cf.equals(TabletsSection.FutureLocationColumnFamily.NAME)) { if (future != null) { throw new AccumuloException("Tablet has multiple future locations " + extent); } future = new 
TServerInstance(entry.getValue(), key.getColumnQualifier()); } else if (cf.equals(TabletsSection.CurrentLocationColumnFamily.NAME)) { log.info("Tablet seems to be already assigned to " + new TServerInstance(entry.getValue(), key.getColumnQualifier())); return null; } else if (TabletsSection.TabletColumnFamily.PREV_ROW_COLUMN.hasColumns(key)) { prevEndRow = entry.getValue(); } else if (TabletsSection.ServerColumnFamily.DIRECTORY_COLUMN.hasColumns(key)) { dir = entry.getValue(); } else if (TabletsSection.ServerColumnFamily.TIME_COLUMN.hasColumns(key)) { time = entry.getValue(); } } if (prevEndRow == null) { throw new AccumuloException("Metadata entry does not have prev row (" + metadataEntry + ")"); } else { KeyExtent ke2 = new KeyExtent(metadataEntry, prevEndRow); if (!extent.equals(ke2)) { log.info("Tablet prev end row mismatch " + extent + " " + ke2.getPrevEndRow()); return null; } } if (dir == null) { throw new AccumuloException("Metadata entry does not have directory (" + metadataEntry + ")"); } if (time == null) { throw new AccumuloException("Metadata entry does not have time (" + metadataEntry + ")"); } if (future == null) { log.info("The master has not assigned " + extent + " to " + instance); return null; } if (!instance.equals(future)) { log.info("Table " + extent + " has been assigned to " + future + " which is not " + instance); return null; } return dir; } public String getClientAddressString() { if (clientAddress == null) return null; return clientAddress.getHostText() + ":" + clientAddress.getPort(); } TServerInstance getTabletSession() { String address = getClientAddressString(); if (address == null) return null; try { return new TServerInstance(address, tabletServerLock.getSessionId()); } catch (Exception ex) { log.warn("Unable to read session from tablet server lock" + ex); return null; } } public void config(String hostname) { log.info("Tablet server starting on " + hostname); security = AuditedSecurityOperation.getInstance(); clientAddress = 
HostAndPort.fromParts(hostname, 0); logger = new TabletServerLogger(this, getSystemConfiguration().getMemoryInBytes(Property.TSERV_WALOG_MAX_SIZE)); try { AccumuloVFSClassLoader.getContextManager().setContextConfig( new ContextManager.DefaultContextsConfig(new Iterable<Entry<String, String>>() { @Override public Iterator<Entry<String, String>> iterator() { return getSystemConfiguration().iterator(); } })); } catch (IOException e) { throw new RuntimeException(e); } // A task that cleans up unused classloader contexts Runnable contextCleaner = new Runnable() { @Override public void run() { ArrayList<KeyExtent> extents; synchronized (onlineTablets) { extents = new ArrayList<KeyExtent>(onlineTablets.keySet()); } Set<Text> tables = new HashSet<Text>(); for (KeyExtent keyExtent : extents) { tables.add(keyExtent.getTableId()); } HashSet<String> contexts = new HashSet<String>(); for (Text tableid : tables) { String context = getTableConfiguration(new KeyExtent(tableid, null, null)) .get(Property.TABLE_CLASSPATH); if (!context.equals("")) { contexts.add(context); } } try { AccumuloVFSClassLoader.getContextManager().removeUnusedContexts(contexts); } catch (IOException e) { log.warn(e.getMessage(), e); } } }; SimpleTimer.getInstance().schedule(contextCleaner, 60000, 60000); FileSystemMonitor.start(getSystemConfiguration(), Property.TSERV_MONITOR_FS); Runnable gcDebugTask = new Runnable() { @Override public void run() { logGCInfo(getSystemConfiguration()); } }; SimpleTimer.getInstance().schedule(gcDebugTask, 0, 1000); Runnable constraintTask = new Runnable() { @Override public void run() { ArrayList<Tablet> tablets; synchronized (onlineTablets) { tablets = new ArrayList<Tablet>(onlineTablets.values()); } for (Tablet tablet : tablets) { tablet.checkConstraints(); } } }; SimpleTimer.getInstance().schedule(constraintTask, 0, 1000); this.resourceManager = new TabletServerResourceManager(instance, fs); lastPingTime = System.currentTimeMillis(); currentMaster = null; statsKeeper = 
new TabletStatsKeeper(); // start major compactor majorCompactorThread = new Daemon(new LoggingRunnable(log, new MajorCompactor(getSystemConfiguration()))); majorCompactorThread.setName("Split/MajC initiator"); majorCompactorThread.start(); } public TabletServerStatus getStats(Map<String, MapCounter<ScanRunState>> scanCounts) { TabletServerStatus result = new TabletServerStatus(); Map<KeyExtent, Tablet> onlineTabletsCopy; synchronized (this.onlineTablets) { onlineTabletsCopy = new HashMap<KeyExtent, Tablet>(this.onlineTablets); } Map<String, TableInfo> tables = new HashMap<String, TableInfo>(); for (Entry<KeyExtent, Tablet> entry : onlineTabletsCopy.entrySet()) { String tableId = entry.getKey().getTableId().toString(); TableInfo table = tables.get(tableId); if (table == null) { table = new TableInfo(); table.minors = new Compacting(); table.majors = new Compacting(); tables.put(tableId, table); } Tablet tablet = entry.getValue(); long recs = tablet.getNumEntries(); table.tablets++; table.onlineTablets++; table.recs += recs; table.queryRate += tablet.queryRate(); table.queryByteRate += tablet.queryByteRate(); table.ingestRate += tablet.ingestRate(); table.ingestByteRate += tablet.ingestByteRate(); table.scanRate += tablet.scanRate(); long recsInMemory = tablet.getNumEntriesInMemory(); table.recsInMemory += recsInMemory; if (tablet.minorCompactionRunning()) table.minors.running++; if (tablet.minorCompactionQueued()) table.minors.queued++; if (tablet.majorCompactionRunning()) table.majors.running++; if (tablet.majorCompactionQueued()) table.majors.queued++; } for (Entry<String, MapCounter<ScanRunState>> entry : scanCounts.entrySet()) { TableInfo table = tables.get(entry.getKey()); if (table == null) { table = new TableInfo(); tables.put(entry.getKey(), table); } if (table.scans == null) table.scans = new Compacting(); table.scans.queued += entry.getValue().get(ScanRunState.QUEUED); table.scans.running += entry.getValue().get(ScanRunState.RUNNING); } 
ArrayList<KeyExtent> offlineTabletsCopy = new ArrayList<KeyExtent>(); synchronized (this.unopenedTablets) { synchronized (this.openingTablets) { offlineTabletsCopy.addAll(this.unopenedTablets); offlineTabletsCopy.addAll(this.openingTablets); } } for (KeyExtent extent : offlineTabletsCopy) { String tableId = extent.getTableId().toString(); TableInfo table = tables.get(tableId); if (table == null) { table = new TableInfo(); tables.put(tableId, table); } table.tablets++; } result.lastContact = RelativeTime.currentTimeMillis(); result.tableMap = tables; result.osLoad = ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage(); result.name = getClientAddressString(); result.holdTime = resourceManager.holdTime(); result.lookups = seekCount.get(); result.indexCacheHits = resourceManager.getIndexCache().getStats().getHitCount(); result.indexCacheRequest = resourceManager.getIndexCache().getStats().getRequestCount(); result.dataCacheHits = resourceManager.getDataCache().getStats().getHitCount(); result.dataCacheRequest = resourceManager.getDataCache().getStats().getRequestCount(); result.logSorts = logSorter.getLogSorts(); return result; } public static void main(String[] args) throws IOException { try { SecurityUtil.serverLogin(); VolumeManager fs = VolumeManagerImpl.get(); ServerOpts opts = new ServerOpts(); opts.parseArgs("tserver", args); String hostname = opts.getAddress(); Instance instance = HdfsZooInstance.getInstance(); ServerConfiguration conf = new ServerConfiguration(instance); Accumulo.init(fs, conf, "tserver"); TabletServer server = new TabletServer(conf, fs); server.config(hostname); Accumulo.enableTracing(hostname, "tserver"); server.run(); } catch (Exception ex) { log.error("Uncaught exception in TabletServer.main, exiting", ex); System.exit(1); } } public void minorCompactionFinished(CommitSession tablet, String newDatafile, int walogSeq) throws IOException { totalMinorCompactions++; logger.minorCompactionFinished(tablet, newDatafile, walogSeq); 
}

/** Forwards the start of a minor compaction to the logger. */
public void minorCompactionStarted(CommitSession tablet, int lastUpdateSequence, String newMapfileLocation)
    throws IOException {
  logger.minorCompactionStarted(tablet, lastUpdateSequence, newMapfileLocation);
}

/**
 * Locates the sorted recovery directory for each log entry (oldest timestamp first) and
 * replays them through the logger.
 *
 * @param logEntries log entries of the tablet; the list itself is not modified
 * @throws IOException if no "finished" marker exists for any log of an entry
 */
public void recover(VolumeManager fs, Tablet tablet, List<LogEntry> logEntries, Set<String> tabletFiles,
    MutationReceiver mutationReceiver) throws IOException {
  List<Path> recoveryLogs = new ArrayList<Path>();
  List<LogEntry> sorted = new ArrayList<LogEntry>(logEntries);
  Collections.sort(sorted, new Comparator<LogEntry>() {
    @Override
    public int compare(LogEntry e1, LogEntry e2) {
      // Compare explicitly: narrowing the difference to int can truncate or overflow and
      // report the wrong order for timestamps that are far apart.
      if (e1.timestamp < e2.timestamp) {
        return -1;
      }
      if (e1.timestamp > e2.timestamp) {
        return 1;
      }
      return 0;
    }
  });
  for (LogEntry entry : sorted) {
    Path recovery = null;
    for (String walog : entry.logSet) {
      String[] parts = walog.split("/", 2); // "host:port/filename"
      Path finished = new Path(fs.getFullPath(FileType.RECOVERY, parts[parts.length - 1]), "finished");
      log.info("Looking for " + finished);
      if (fs.exists(finished)) {
        recovery = finished.getParent();
        break;
      }
    }
    if (recovery == null)
      throw new IOException(
          "Unable to find recovery files for extent " + tablet.getExtent() + " logEntry: " + entry);
    recoveryLogs.add(recovery);
  }
  logger.recover(fs, tablet, recoveryLogs, tabletFiles, mutationReceiver);
}

private final AtomicInteger logIdGenerator = new AtomicInteger();

/**
 * Returns a new log id for the tablet, or -1 when write-ahead logging is disabled for its
 * table.
 */
public int createLogId(KeyExtent tablet) {
  AccumuloConfiguration acuTableConf = getTableConfiguration(tablet);
  if (acuTableConf.getBoolean(Property.TABLE_WALOG_ENABLED)) {
    return logIdGenerator.incrementAndGet();
  }
  return -1;
}

// JMX methods

/**
 * Copies the online tablets while holding the map's monitor. The map is a synchronized
 * wrapper, so iterating its views without that monitor can fail with a
 * ConcurrentModificationException; the metric getters iterate this copy instead.
 */
private List<Tablet> copyOnlineTabletsForMetrics() {
  synchronized (onlineTablets) {
    return new ArrayList<Tablet>(onlineTablets.values());
  }
}

/** Total number of entries across online tablets; 0 when metrics are disabled. */
@Override
public long getEntries() {
  if (this.isEnabled()) {
    long result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      result += tablet.getNumEntries();
    }
    return result;
  }
  return 0;
}

/** Total number of in-memory entries across online tablets; 0 when metrics are disabled. */
@Override
public long getEntriesInMemory() {
  if (this.isEnabled()) {
    long result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      result += tablet.getNumEntriesInMemory();
    }
    return result;
  }
  return 0;
}

/**
 * Ingest metric; 0 when metrics are disabled.
 * NOTE(review): this sums the in-memory entry counts, exactly like getEntriesInMemory() -
 * confirm whether a dedicated ingest counter was intended.
 */
@Override
public long getIngest() {
  if (this.isEnabled()) {
    long result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      result += tablet.getNumEntriesInMemory();
    }
    return result;
  }
  return 0;
}

/** Number of online tablets with a major compaction running; 0 when metrics are disabled. */
@Override
public int getMajorCompactions() {
  if (this.isEnabled()) {
    int result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      if (tablet.majorCompactionRunning())
        result++;
    }
    return result;
  }
  return 0;
}

/** Number of online tablets with a major compaction queued; 0 when metrics are disabled. */
@Override
public int getMajorCompactionsQueued() {
  if (this.isEnabled()) {
    int result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      if (tablet.majorCompactionQueued())
        result++;
    }
    return result;
  }
  return 0;
}

/** Number of online tablets with a minor compaction running; 0 when metrics are disabled. */
@Override
public int getMinorCompactions() {
  if (this.isEnabled()) {
    int result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      if (tablet.minorCompactionRunning())
        result++;
    }
    return result;
  }
  return 0;
}

/** Number of online tablets with a minor compaction queued; 0 when metrics are disabled. */
@Override
public int getMinorCompactionsQueued() {
  if (this.isEnabled()) {
    int result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      if (tablet.minorCompactionQueued())
        result++;
    }
    return result;
  }
  return 0;
}

@Override
public int getOnlineCount() {
  if (this.isEnabled())
    return onlineTablets.size();
  return 0;
}

@Override
public int getOpeningCount() {
  if (this.isEnabled())
    return openingTablets.size();
  return 0;
}

/** Sum of totalQueries() across online tablets; 0 when metrics are disabled. */
@Override
public long getQueries() {
  if (this.isEnabled()) {
    long result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      result += tablet.totalQueries();
    }
    return result;
  }
  return 0;
}

@Override
public int getUnopenedCount() {
  if (this.isEnabled())
    return unopenedTablets.size();
  return 0;
}

@Override
public String getName() {
  if (this.isEnabled())
    return getClientAddressString();
  return "";
}

@Override
public long getTotalMinorCompactions() {
  if (this.isEnabled())
    return totalMinorCompactions;
  return 0;
}

/** The resource manager's hold time divided by 1000; 0 when metrics are disabled. */
@Override
public double getHoldTime() {
  if (this.isEnabled())
    return this.resourceManager.holdTime() / 1000.;
  return 0;
}

/** Mean number of data files per online tablet; 0 when there are none or metrics are disabled. */
@Override
public double getAverageFilesPerTablet() {
  if (this.isEnabled()) {
    int count = 0;
    long result = 0;
    for (Tablet tablet : copyOnlineTabletsForMetrics()) {
      result += tablet.getDatafiles().size();
      count++;
    }
    if (count == 0)
      return 0;
    return result / (double) count;
  }
  return 0;
}

@Override
protected ObjectName getObjectName() {
  return OBJECT_NAME;
}

@Override
protected String getMetricsPrefix() {
  return METRICS_PREFIX;
}

/** Returns the table configuration for the table that owns the given extent. */
public TableConfiguration getTableConfiguration(KeyExtent extent) {
  return ServerConfiguration.getTableConfiguration(instance, extent.getTableId().toString());
}

/**
 * Returns a ServerResources view backed by this server's filesystem and system
 * configuration. Its getCurrentTServers() always returns null.
 */
public DfsLogger.ServerResources getServerConfig() {
  return new DfsLogger.ServerResources() {

    @Override
    public VolumeManager getFileSystem() {
      return fs;
    }

    @Override
    public Set<TServerInstance> getCurrentTServers() {
      return null;
    }

    @Override
    public AccumuloConfiguration getConfiguration() {
      return getSystemConfiguration();
    }
  };
}

public VolumeManager getFileSystem() {
  return fs;
}
}