org.apache.accumulo.gc.replication.CloseWriteAheadLogReferences.java Source code

Introduction

Here is the source code for org.apache.accumulo.gc.replication.CloseWriteAheadLogReferences.java, a Runnable used by the Accumulo garbage collector to mark replication status records as "closed" once their write-ahead logs are no longer in use.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.accumulo.gc.replication;

import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.accumulo.core.client.BatchScanner;
import org.apache.accumulo.core.client.BatchWriter;
import org.apache.accumulo.core.client.BatchWriterConfig;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.MutationsRejectedException;
import org.apache.accumulo.core.client.TableNotFoundException;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Range;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.file.rfile.RFile;
import org.apache.accumulo.core.master.thrift.MasterClientService;
import org.apache.accumulo.core.metadata.MetadataTable;
import org.apache.accumulo.core.metadata.schema.MetadataSchema;
import org.apache.accumulo.core.metadata.schema.MetadataSchema.ReplicationSection;
import org.apache.accumulo.core.replication.ReplicationTable;
import org.apache.accumulo.core.rpc.ThriftUtil;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.tabletserver.thrift.TabletClientService;
import org.apache.accumulo.core.trace.Span;
import org.apache.accumulo.core.trace.Trace;
import org.apache.accumulo.core.trace.thrift.TInfo;
import org.apache.accumulo.server.AccumuloServerContext;
import org.apache.accumulo.server.log.WalStateManager;
import org.apache.accumulo.server.log.WalStateManager.WalMarkerException;
import org.apache.accumulo.server.log.WalStateManager.WalState;
import org.apache.accumulo.server.replication.StatusUtil;
import org.apache.accumulo.server.replication.proto.Replication.Status;
import org.apache.accumulo.server.zookeeper.ZooReaderWriter;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Stopwatch;
import com.google.common.net.HostAndPort;
import com.google.protobuf.InvalidProtocolBufferException;

/**
 * It's impossible to know when all references to a WAL have been removed from the metadata table, as the references are potentially spread across the entire
 * tablet row-space.
 * <p>
 * This tool collects, from ZooKeeper, the set of WALs that are closed or no longer referenced. Each {@link Status} record in the metadata table's replication
 * section that points to one of those WALs can then be "closed" by writing a new Status, with the closed member set to true, to the same key.
 */
public class CloseWriteAheadLogReferences implements Runnable {
    private static final Logger log = LoggerFactory.getLogger(CloseWriteAheadLogReferences.class);

    private static final String RFILE_SUFFIX = "." + RFile.EXTENSION;

    private final AccumuloServerContext context;

    public CloseWriteAheadLogReferences(AccumuloServerContext context) {
        this.context = context;
    }

    @Override
    public void run() {
        // As long as we depend on a newer Guava than the version Hadoop bundles, we have to stay
        // compatible with the API of the bundled version, hence the old Stopwatch constructor.
        Stopwatch sw = new Stopwatch();

        Connector conn;
        try {
            conn = context.getConnector();
        } catch (Exception e) {
            log.error("Could not create connector", e);
            throw new RuntimeException(e);
        }

        if (!ReplicationTable.isOnline(conn)) {
            log.debug("Replication table isn't online, not attempting to clean up wals");
            return;
        }

        Span findWalsSpan = Trace.start("findReferencedWals");
        HashSet<String> closed = null;
        try {
            sw.start();
            closed = getClosedLogs(conn);
        } finally {
            sw.stop();
            findWalsSpan.stop();
        }

        log.info("Found " + closed.size() + " closed WALs in ZooKeeper in " + sw.toString());
        sw.reset();

        Span updateReplicationSpan = Trace.start("updateReplicationTable");
        long recordsClosed = 0;
        try {
            sw.start();
            recordsClosed = updateReplicationEntries(conn, closed);
        } finally {
            sw.stop();
            updateReplicationSpan.stop();
        }

        log.info(
                "Closed " + recordsClosed + " WAL replication references in replication table in " + sw.toString());
    }

    /**
     * Construct the set of closed or unreferenced WALs from zookeeper
     *
     * @param conn
     *          Connector
     * @return The set of WAL paths that are marked as closed or unreferenced in zookeeper
     */
    protected HashSet<String> getClosedLogs(Connector conn) {
        WalStateManager wals = new WalStateManager(conn.getInstance(), ZooReaderWriter.getInstance());

        HashSet<String> result = new HashSet<>();
        try {
            for (Entry<Path, WalState> entry : wals.getAllState().entrySet()) {
                if (entry.getValue() == WalState.UNREFERENCED || entry.getValue() == WalState.CLOSED) {
                    Path path = entry.getKey();
                    log.debug("Found closed WAL " + path.toString());
                    result.add(path.toString());
                }
            }
        } catch (WalMarkerException e) {
            throw new RuntimeException(e);
        }
        return result;
    }

    /**
     * Given the set of closed WALs, close any replication status records that reference one of those WALs.
     *
     * @param conn
     *          Connector
     * @param closedWals
     *          {@link Set} of paths to WALs that are marked as closed or unreferenced in zookeeper
     * @return The number of replication status records that were closed
     */
    protected long updateReplicationEntries(Connector conn, Set<String> closedWals) {
        BatchScanner bs = null;
        BatchWriter bw = null;
        long recordsClosed = 0;
        try {
            bw = conn.createBatchWriter(MetadataTable.NAME, new BatchWriterConfig());
            bs = conn.createBatchScanner(MetadataTable.NAME, Authorizations.EMPTY, 4);
            bs.setRanges(Collections.singleton(Range.prefix(ReplicationSection.getRowPrefix())));
            bs.fetchColumnFamily(ReplicationSection.COLF);

            Text replFileText = new Text();
            for (Entry<Key, Value> entry : bs) {
                Status status;
                try {
                    status = Status.parseFrom(entry.getValue().get());
                } catch (InvalidProtocolBufferException e) {
                    log.error("Could not parse Status protobuf for {}", entry.getKey(), e);
                    continue;
                }

                // Extract the file this replication Status record refers to
                MetadataSchema.ReplicationSection.getFile(entry.getKey(), replFileText);
                String replFile = replFileText.toString();
                boolean isClosed = closedWals.contains(replFile);

                // We only want to close WALs (which is everything but rfiles), and only those that
                // zookeeper reports as closed or unreferenced
                if (!status.getClosed() && !replFile.endsWith(RFILE_SUFFIX) && isClosed) {
                    try {
                        closeWal(bw, entry.getKey());
                        recordsClosed++;
                    } catch (MutationsRejectedException e) {
                        log.error("Failed to submit close mutation for " + entry.getKey(), e);
                        continue;
                    }
                }
            }
        } catch (TableNotFoundException e) {
            log.error("Metadata table doesn't exist", e);
        } finally {
            if (null != bs) {
                bs.close();
            }

            if (null != bw) {
                try {
                    bw.close();
                } catch (MutationsRejectedException e) {
                    log.error("Failed to flush close mutations to the metadata table", e);
                }
            }
        }

        return recordsClosed;
    }

    /**
     * Write a closed {@link Status} mutation for the given {@link Key} using the provided {@link BatchWriter}
     *
     * @param bw
     *          BatchWriter
     * @param k
     *          Key to create close mutation from
     */
    protected void closeWal(BatchWriter bw, Key k) throws MutationsRejectedException {
        log.debug("Closing unreferenced WAL ({}) in metadata table", k.toStringNoTruncate());
        Mutation m = new Mutation(k.getRow());
        m.put(k.getColumnFamily(), k.getColumnQualifier(), StatusUtil.fileClosedValue());
        bw.addMutation(m);
    }

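    /**
     * Look up the location of the active master for this instance, returning null if none is advertised or the lookup fails.
     */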
    private HostAndPort getMasterAddress() {
        try {
            List<String> locations = context.getInstance().getMasterLocations();
            if (locations.isEmpty()) {
                return null;
            }
            return HostAndPort.fromString(locations.get(0));
        } catch (Exception e) {
            log.warn("Failed to obtain master host", e);
        }

        return null;
    }

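    /**
     * Open a Thrift client to the active master, returning null if the master's address is unknown or the connection cannot be established.
     */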
    private MasterClientService.Client getMasterConnection() {
        final HostAndPort address = getMasterAddress();
        try {
            if (address == null) {
                log.warn("Could not fetch Master address");
                return null;
            }
            return ThriftUtil.getClient(new MasterClientService.Client.Factory(), address, context);
        } catch (Exception e) {
            log.warn("Issue with master connection (" + address + ")", e);
        }
        return null;
    }

    /**
     * Get the active tabletservers as seen by the master.
     *
     * @return The active tabletservers, null if they can't be computed.
     */
    protected List<String> getActiveTservers(TInfo tinfo) {
        MasterClientService.Client client = null;

        List<String> tservers = null;
        try {
            client = getMasterConnection();

            // Could do this through InstanceOperations, but that would set a bunch of new Watchers via ZK on every tserver
            // node. The master is already tracking all of this info, so hopefully this is less overall work.
            if (null != client) {
                tservers = client.getActiveTservers(tinfo, context.rpcCreds());
            }
        } catch (TException e) {
            // If we can't fetch the tabletservers, we can't fetch any active WALs
            log.warn("Failed to fetch active tabletservers from the master", e);
            return null;
        } finally {
            ThriftUtil.returnClient(client);
        }

        return tservers;
    }

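    /**
     * Ask the given tablet server for the write-ahead logs it is actively using, returning null if the RPC fails.
     */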
    protected List<String> getActiveWalsForServer(TInfo tinfo, HostAndPort server) {
        TabletClientService.Client tserverClient = null;
        try {
            tserverClient = ThriftUtil.getClient(new TabletClientService.Client.Factory(), server, context);
            return tserverClient.getActiveLogs(tinfo, context.rpcCreds());
        } catch (TException e) {
            log.warn("Failed to fetch active write-ahead logs from " + server, e);
            return null;
        } finally {
            ThriftUtil.returnClient(tserverClient);
        }
    }
}
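
Usage

CloseWriteAheadLogReferences is a plain Runnable, so driving it only requires an AccumuloServerContext. The sketch below shows one way the task could be invoked, modeled on how the Accumulo garbage collector process builds its context. The HdfsZooInstance/ServerConfigurationFactory wiring is an assumption, and the exact constructors vary between Accumulo releases, so treat this as illustrative rather than canonical.

import org.apache.accumulo.gc.replication.CloseWriteAheadLogReferences;
import org.apache.accumulo.server.AccumuloServerContext;
import org.apache.accumulo.server.client.HdfsZooInstance;
import org.apache.accumulo.server.conf.ServerConfigurationFactory;

public class CloseWalReferencesDemo {
    public static void main(String[] args) {
        // Build a server-side context (assumed wiring; requires running on a host with
        // the Accumulo site configuration available, as the GC process does).
        ServerConfigurationFactory confFactory =
                new ServerConfigurationFactory(HdfsZooInstance.getInstance());
        AccumuloServerContext context = new AccumuloServerContext(confFactory);

        // The task is a plain Runnable; the GC invokes it once per collection cycle.
        new CloseWriteAheadLogReferences(context).run();
    }
}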