* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* See the License for the specific language governing permissions and
* limitations under the License.
package org.apache.cassandra.repair.consistent;
import java.io.IOException;
import java.net.InetAddress;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.primitives.Ints;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.schema.Schema;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.cql3.QueryProcessor;
import org.apache.cassandra.cql3.UntypedResultSet;
import org.apache.cassandra.db.SystemKeyspace;
import org.apache.cassandra.db.marshal.BytesType;
import org.apache.cassandra.db.marshal.InetAddressType;
import org.apache.cassandra.db.marshal.UUIDType;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.gms.FailureDetector;
import org.apache.cassandra.io.util.DataInputBuffer;
import org.apache.cassandra.io.util.DataOutputBuffer;
import org.apache.cassandra.net.MessageOut;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.repair.messages.FailSession;
import org.apache.cassandra.repair.messages.FinalizeCommit;
import org.apache.cassandra.repair.messages.FinalizePromise;
import org.apache.cassandra.repair.messages.FinalizePropose;
import org.apache.cassandra.repair.messages.PrepareConsistentRequest;
import org.apache.cassandra.repair.messages.PrepareConsistentResponse;
import org.apache.cassandra.repair.messages.RepairMessage;
import org.apache.cassandra.repair.messages.StatusRequest;
import org.apache.cassandra.repair.messages.StatusResponse;
import org.apache.cassandra.schema.TableId;
import org.apache.cassandra.service.ActiveRepairService;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.FBUtilities;
import static org.apache.cassandra.repair.consistent.ConsistentSession.State.*;
* Manages all consistent repair sessions a node is participating in.
* <p/>
* Since sessions need to be loaded, and since we need to handle cases where sessions might not exist, most of the logic
* around local sessions is implemented in this class, with the LocalSession class being treated more like a simple struct,
* in contrast with {@link CoordinatorSession}
public class LocalSessions
private static final Logger logger = LoggerFactory.getLogger(LocalSessions.class);
* Amount of time a session can go without any activity before we start checking the status of other
* participants to see if we've missed a message
static final int CHECK_STATUS_TIMEOUT = Integer.getInteger("cassandra.repair_status_check_timeout_seconds",
* Amount of time a session can go without any activity before being automatically set to FAILED
static final int AUTO_FAIL_TIMEOUT = Integer.getInteger("cassandra.repair_fail_timeout_seconds",
* Amount of time a completed session is kept around after completion before being deleted. This gives
* compaction plenty of time to move sstables from successful sessions into the repaired bucket.
static final int AUTO_DELETE_TIMEOUT = Integer.getInteger("cassandra.repair_delete_timeout_seconds",
* How often LocalSessions.cleanup is run
public static final int CLEANUP_INTERVAL = Integer.getInteger("cassandra.repair_cleanup_interval_seconds",
private static Set<TableId> uuidToTableId(Set<UUID> src)
return ImmutableSet.copyOf(Iterables.transform(src, TableId::fromUUID));
private static Set<UUID> tableIdToUuid(Set<TableId> src)
return ImmutableSet.copyOf(Iterables.transform(src, TableId::asUUID));
private final String keyspace = SchemaConstants.SYSTEM_KEYSPACE_NAME;
private final String table = SystemKeyspace.REPAIRS;
private boolean started = false;
private volatile ImmutableMap<UUID, LocalSession> sessions = ImmutableMap.of();
int getNumSessions()
return sessions.size();
protected InetAddress getBroadcastAddress()
return FBUtilities.getBroadcastAddress();
protected boolean isAlive(InetAddress address)
return FailureDetector.instance.isAlive(address);
protected boolean isNodeInitialized()
return StorageService.instance.isInitialized();
public List<Map<String, String>> sessionInfo(boolean all)
Iterable<LocalSession> currentSessions = sessions.values();
if (!all)
currentSessions = Iterables.filter(currentSessions, s -> !s.isCompleted());
return Lists.newArrayList(Iterables.transform(currentSessions, LocalSessionInfo::sessionToMap));
* hook for operators to cancel sessions, cancelling from a non-coordinator is an error, unless
* force is set to true. Messages are sent out to other participants, but we don't wait for a response
public void cancelSession(UUID sessionID, boolean force)
logger.info("Cancelling local repair session {}", sessionID);
LocalSession session = getSession(sessionID);
Preconditions.checkArgument(session != null, "Session {} does not exist", sessionID);
Preconditions.checkArgument(force || session.coordinator.equals(getBroadcastAddress()),
"Cancel session %s from it's coordinator (%s) or use --force",
sessionID, session.coordinator);
setStateAndSave(session, FAILED);
for (InetAddress participant : session.participants)
if (!participant.equals(getBroadcastAddress()))
sendMessage(participant, new FailSession(sessionID));
* Loads sessions out of the repairs table and sets state to started
public synchronized void start()
Preconditions.checkArgument(!started, "LocalSessions.start can only be called once");
Preconditions.checkArgument(sessions.isEmpty(), "No sessions should be added before start");
UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(String.format("SELECT * FROM %s.%s", keyspace, table), 1000);
Map<UUID, LocalSession> loadedSessions = new HashMap<>();
for (UntypedResultSet.Row row : rows)
LocalSession session = load(row);
loadedSessions.put(session.sessionID, session);
catch (IllegalArgumentException | NullPointerException e)
logger.warn("Unable to load malformed repair session {}, ignoring", row.has("parent_id") ? row.getUUID("parent_id") : null);
sessions = ImmutableMap.copyOf(loadedSessions);
started = true;
public boolean isStarted()
return started;
private static boolean shouldCheckStatus(LocalSession session, int now)
return !session.isCompleted() && (now > session.getLastUpdate() + CHECK_STATUS_TIMEOUT);
private static boolean shouldFail(LocalSession session, int now)
return !session.isCompleted() && (now > session.getLastUpdate() + AUTO_FAIL_TIMEOUT);
private static boolean shouldDelete(LocalSession session, int now)
return session.isCompleted() && (now > session.getLastUpdate() + AUTO_DELETE_TIMEOUT);
* Auto fails and auto deletes timed out and old sessions
* Compaction will clean up the sstables still owned by a deleted session
public void cleanup()
logger.trace("Running LocalSessions.cleanup");
if (!isNodeInitialized())
logger.trace("node not initialized, aborting local session cleanup");
Set<LocalSession> currentSessions = new HashSet<>(sessions.values());
for (LocalSession session : currentSessions)
synchronized (session)
int now = FBUtilities.nowInSeconds();
if (shouldFail(session, now))
logger.warn("Auto failing timed out repair session {}", session);
failSession(session.sessionID, false);
else if (shouldDelete(session, now))
logger.debug("Auto deleting repair session {}", session);
else if (shouldCheckStatus(session, now))
private static ByteBuffer serializeRange(Range<Token> range)
int size = (int) Token.serializer.serializedSize(range.left, 0);
size += (int) Token.serializer.serializedSize(range.right, 0);
try (DataOutputBuffer buffer = new DataOutputBuffer(size))
Token.serializer.serialize(range.left, buffer, 0);
Token.serializer.serialize(range.right, buffer, 0);
return buffer.buffer();
catch (IOException e)
throw new RuntimeException(e);
private static Set<ByteBuffer> serializeRanges(Set<Range<Token>> ranges)
Set<ByteBuffer> buffers = new HashSet<>(ranges.size());
ranges.forEach(r -> buffers.add(serializeRange(r)));
return buffers;
private static Range<Token> deserializeRange(ByteBuffer bb)
try (DataInputBuffer in = new DataInputBuffer(bb, false))
IPartitioner partitioner = DatabaseDescriptor.getPartitioner();
Token left = Token.serializer.deserialize(in, partitioner, 0);
Token right = Token.serializer.deserialize(in, partitioner, 0);
return new Range<>(left, right);
catch (IOException e)
throw new RuntimeException(e);
private static Set<Range<Token>> deserializeRanges(Set<ByteBuffer> buffers)
Set<Range<Token>> ranges = new HashSet<>(buffers.size());
buffers.forEach(bb -> ranges.add(deserializeRange(bb)));
return ranges;
* Save session state to table
void save(LocalSession session)
String query = "INSERT INTO %s.%s " +
"(parent_id, " +
"started_at, " +
"last_update, " +
"repaired_at, " +
"state, " +
"coordinator, " +
"participants, " +
"ranges, " +
"cfids) " +
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)";
QueryProcessor.executeInternal(String.format(query, keyspace, table),
private static int dateToSeconds(Date d)
return Ints.checkedCast(TimeUnit.MILLISECONDS.toSeconds(d.getTime()));
private LocalSession load(UntypedResultSet.Row row)
LocalSession.Builder builder = LocalSession.builder();
builder.withTableIds(uuidToTableId(row.getSet("cfids", UUIDType.instance)));
builder.withRanges(deserializeRanges(row.getSet("ranges", BytesType.instance)));
builder.withParticipants(row.getSet("participants", InetAddressType.instance));
return buildSession(builder);
private void deleteRow(UUID sessionID)
String query = "DELETE FROM %s.%s WHERE parent_id=?";
QueryProcessor.executeInternal(String.format(query, keyspace, table), sessionID);
* Loads a session directly from the table. Should be used for testing only
LocalSession loadUnsafe(UUID sessionId)
String query = "SELECT * FROM %s.%s WHERE parent_id=?";
UntypedResultSet result = QueryProcessor.executeInternal(String.format(query, keyspace, table), sessionId);
if (result.isEmpty())
return null;
UntypedResultSet.Row row = result.one();
return load(row);
protected LocalSession buildSession(LocalSession.Builder builder)
return new LocalSession(builder);
protected LocalSession getSession(UUID sessionID)
return sessions.get(sessionID);
synchronized void putSessionUnsafe(LocalSession session)
/**
 * Adds a session to the in-memory session map by building a replacement immutable map.
 * Requires that this instance has been started.
 * <p/>
 * NOTE(review): the opening line of the first precondition call and the tail of the map builder chain are
 * not visible in this view of the file; confirm against the full source.
 */
private synchronized void putSession(LocalSession session)
// NOTE(review): if this template belongs to a Guava Preconditions call, "{}" will not be substituted
// (Preconditions only understands %s) -- confirm and switch to %s if so
"LocalSession {} already exists", session.sessionID);
Preconditions.checkArgument(started, "sessions cannot be added before LocalSessions is started");
// the map is immutable, so a new one is built and assigned to the volatile field
sessions = ImmutableMap.<UUID, LocalSession>builder()
.put(session.sessionID, session)
private synchronized void removeSession(UUID sessionID)
Preconditions.checkArgument(sessionID != null);
Map<UUID, LocalSession> temp = new HashMap<>(sessions);
sessions = ImmutableMap.copyOf(temp);
LocalSession createSessionUnsafe(UUID sessionId, ActiveRepairService.ParentRepairSession prs, Set<InetAddress> peers)
LocalSession.Builder builder = LocalSession.builder();
int now = FBUtilities.nowInSeconds();
return buildSession(builder);
protected ActiveRepairService.ParentRepairSession getParentRepairSession(UUID sessionID)
return ActiveRepairService.instance.getParentRepairSession(sessionID);
protected void sendMessage(InetAddress destination, RepairMessage message)
logger.trace("sending {} to {}", message, destination);
MessageOut<RepairMessage> messageOut = new MessageOut<RepairMessage>(MessagingService.Verb.REPAIR_MESSAGE, message, RepairMessage.serializer);
MessagingService.instance().sendOneWay(messageOut, destination);
/**
 * Intended to move {@code session} to {@code state} while holding the session's monitor.
 * <p/>
 * NOTE(review): several statements of this method are not visible in this view of the file -- the call that
 * owns the "Invalid state transition" message, and whatever changes the session between the two
 * {@code isCompleted()} reads below. Confirm against the full source before relying on this description.
 *
 * @param session the session being changed
 * @param state   the state to move the session to
 */
private void setStateAndSave(LocalSession session, ConsistentSession.State state)
synchronized (session)
// message template and arguments of a validation call whose opening line is not visible here
"Invalid state transition %s -> %s",
session.getState(), state);
logger.trace("Changing LocalSession state from {} -> {} for {}", session.getState(), state, session.sessionID);
// remember whether the session was already completed so that a fresh completion can be detected below
boolean wasCompleted = session.isCompleted();
// NOTE(review): as visible, nothing changes the session between the two reads, so this is never true
if (session.isCompleted() && !wasCompleted)
public void failSession(UUID sessionID)
failSession(sessionID, true);
public void failSession(UUID sessionID, boolean sendMessage)
logger.info("Failing local repair session {}", sessionID);
LocalSession session = getSession(sessionID);
if (session != null)
setStateAndSave(session, FAILED);
if (sendMessage)
sendMessage(session.coordinator, new FailSession(sessionID));
public synchronized void deleteSession(UUID sessionID)
logger.debug("Deleting local repair session {}", sessionID);
LocalSession session = getSession(sessionID);
Preconditions.checkArgument(session.isCompleted(), "Cannot delete incomplete sessions");
ListenableFuture submitPendingAntiCompaction(LocalSession session, ExecutorService executor)
PendingAntiCompaction pac = new PendingAntiCompaction(session.sessionID, session.ranges, executor);
return pac.run();
* The PrepareConsistentRequest effectively promotes the parent repair session to a consistent
* incremental session, and begins the 'pending anti compaction' which moves all sstable data
* that is to be repaired into it's own silo, preventing it from mixing with other data.
* No response is sent to the repair coordinator until the pending anti compaction has completed
* successfully. If the pending anti compaction fails, a failure message is sent to the coordinator,
* cancelling the session.
public void handlePrepareMessage(InetAddress from, PrepareConsistentRequest request)
logger.trace("received {} from {}", request, from);
UUID sessionID = request.parentSession;
InetAddress coordinator = request.coordinator;
Set<InetAddress> peers = request.participants;
ActiveRepairService.ParentRepairSession parentSession;
parentSession = getParentRepairSession(sessionID);
catch (Throwable e)
logger.trace("Error retrieving ParentRepairSession for session {}, responding with failure", sessionID);
sendMessage(coordinator, new FailSession(sessionID));
LocalSession session = createSessionUnsafe(sessionID, parentSession, peers);
logger.info("Beginning local incremental repair session {}", session);
ExecutorService executor = Executors.newFixedThreadPool(parentSession.getColumnFamilyStores().size());
ListenableFuture pendingAntiCompaction = submitPendingAntiCompaction(session, executor);
Futures.addCallback(pendingAntiCompaction, new FutureCallback()
public void onSuccess(@Nullable Object result)
logger.debug("Prepare phase for incremental repair session {} completed", sessionID);
setStateAndSave(session, PREPARED);
sendMessage(coordinator, new PrepareConsistentResponse(sessionID, getBroadcastAddress(), true));
public void onFailure(Throwable t)
logger.error(String.format("Prepare phase for incremental repair session %s failed", sessionID), t);
public void maybeSetRepairing(UUID sessionID)
LocalSession session = getSession(sessionID);
if (session != null && session.getState() != REPAIRING)
logger.debug("Setting local incremental repair session {} to REPAIRING", session);
setStateAndSave(session, REPAIRING);
public void handleFinalizeProposeMessage(InetAddress from, FinalizePropose propose)
logger.trace("received {} from {}", propose, from);
UUID sessionID = propose.sessionID;
LocalSession session = getSession(sessionID);
if (session == null)
logger.debug("Received FinalizePropose message for unknown repair session {}, responding with failure");
sendMessage(from, new FailSession(sessionID));
setStateAndSave(session, FINALIZE_PROMISED);
sendMessage(from, new FinalizePromise(sessionID, getBroadcastAddress(), true));
logger.debug("Received FinalizePropose message for incremental repair session {}, responded with FinalizePromise");
catch (IllegalArgumentException e)
logger.error(String.format("Error handling FinalizePropose message for %s", session), e);
/**
 * Hook invoked for a completed session. Walks the session's table ids and looks up the
 * {@link ColumnFamilyStore} for each one.
 * <p/>
 * NOTE(review): the action taken for a table whose store exists is not visible in this view of the file;
 * confirm against the full source.
 *
 * @param session the session that completed
 */
protected void sessionCompleted(LocalSession session)
for (TableId tid: session.tableIds)
ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(tid);
// the lookup can return null, in which case the table is skipped
if (cfs != null)
* Finalizes the repair session, completing it as successful.
* This only changes the state of the session, it doesn't promote the siloed sstables to repaired. That will happen
* as part of the compaction process, and avoids having to worry about in progress compactions interfering with the
* promotion.
public void handleFinalizeCommitMessage(InetAddress from, FinalizeCommit commit)
logger.trace("received {} from {}", commit, from);
UUID sessionID = commit.sessionID;
LocalSession session = getSession(sessionID);
if (session == null)
logger.warn("Ignoring FinalizeCommit message for unknown repair session {}", sessionID);
setStateAndSave(session, FINALIZED);
logger.info("Finalized local repair session {}", sessionID);
public void handleFailSessionMessage(InetAddress from, FailSession msg)
logger.trace("received {} from {}", msg, from);
failSession(msg.sessionID, false);
public void sendStatusRequest(LocalSession session)
logger.debug("Attempting to learn the outcome of unfinished local incremental repair session {}", session.sessionID);
StatusRequest request = new StatusRequest(session.sessionID);
for (InetAddress participant : session.participants)
if (!getBroadcastAddress().equals(participant) && isAlive(participant))
sendMessage(participant, request);
public void handleStatusRequest(InetAddress from, StatusRequest request)
logger.trace("received {} from {}", request, from);
UUID sessionID = request.sessionID;
LocalSession session = getSession(sessionID);
if (session == null)
logger.warn("Received status response message for unknown session {}", sessionID);
sendMessage(from, new StatusResponse(sessionID, FAILED));
sendMessage(from, new StatusResponse(sessionID, session.getState()));
logger.debug("Responding to status response message for incremental repair session {} with local state {}", sessionID, session.getState());
public void handleStatusResponse(InetAddress from, StatusResponse response)
logger.trace("received {} from {}", response, from);
UUID sessionID = response.sessionID;
LocalSession session = getSession(sessionID);
if (session == null)
logger.warn("Received StatusResponse message for unknown repair session {}", sessionID);
// only change local state if response state is FINALIZED or FAILED, since those are
// the only statuses that would indicate we've missed a message completing the session
if (response.state == FINALIZED || response.state == FAILED)
setStateAndSave(session, response.state);
logger.info("Unfinished local incremental repair session {} set to state {}", sessionID, response.state);
logger.debug("Received StatusResponse for repair session {} with state {}, which is not actionable. Doing nothing.", sessionID, response.state);
* determines if a local session exists, and if it's not finalized or failed
public boolean isSessionInProgress(UUID sessionID)
LocalSession session = getSession(sessionID);
return session != null && session.getState() != FINALIZED && session.getState() != FAILED;
* Returns the repairedAt time for a sessions which is unknown, failed, or finalized
* calling this for a session which is in progress throws an exception
public long getFinalSessionRepairedAt(UUID sessionID)
LocalSession session = getSession(sessionID);
if (session == null || session.getState() == FAILED)
return ActiveRepairService.UNREPAIRED_SSTABLE;
else if (session.getState() == FINALIZED)
return session.repairedAt;
throw new IllegalStateException("Cannot get final repaired at value for in progress session: " + session);