/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.service;
import java.io.File;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.concurrent.JMXConfigurableThreadPoolExecutor;
import org.apache.cassandra.concurrent.NamedThreadFactory;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.dht.Bounds;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.gms.FailureDetector;
import org.apache.cassandra.gms.Gossiper;
import org.apache.cassandra.io.sstable.Component;
import org.apache.cassandra.io.sstable.SSTableReader;
import org.apache.cassandra.locator.TokenMetadata;
import org.apache.cassandra.net.IAsyncCallbackWithFailure;
import org.apache.cassandra.net.MessageIn;
import org.apache.cassandra.net.MessageOut;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.repair.*;
import org.apache.cassandra.repair.messages.AnticompactionRequest;
import org.apache.cassandra.repair.messages.PrepareMessage;
import org.apache.cassandra.repair.messages.RepairMessage;
import org.apache.cassandra.repair.messages.SyncComplete;
import org.apache.cassandra.repair.messages.ValidationComplete;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.UUIDGen;
import org.apache.cassandra.utils.concurrent.Ref;
import org.apache.cassandra.utils.concurrent.RefCounted;
import org.apache.cassandra.utils.concurrent.Refs;
/**
* ActiveRepairService is the starting point for manual "active" repairs.
*
* Each user-triggered repair corresponds to one or more repair sessions,
* one for each token range to repair. A repair session might repair multiple
* column families. For each of those column families, the repair session will
* request merkle trees for each replica of the range being repaired, diff those
* trees upon receiving them, schedule the streaming of the parts to repair (based on
* the tree diffs) and wait for all those operations. See RepairSession for more
* details.
*
* Repair sessions are created through submitRepairSession, which returns a
* future that completes when the session does.
*/
public class ActiveRepairService
{
private static final Logger logger = LoggerFactory.getLogger(ActiveRepairService.class);
// singleton enforcement
public static final ActiveRepairService instance = new ActiveRepairService();
public static final long UNREPAIRED_SSTABLE = 0;
private static final ThreadPoolExecutor executor;
static
{
executor = new JMXConfigurableThreadPoolExecutor(4,
60,
TimeUnit.SECONDS,
new LinkedBlockingQueue<Runnable>(),
new NamedThreadFactory("AntiEntropySessions"),
"internal");
}
public static enum Status
{
STARTED, SESSION_SUCCESS, SESSION_FAILED, FINISHED
}
/**
* A map of active coordinator sessions, keyed by session id.
*/
private final ConcurrentMap<UUID, RepairSession> sessions;
private final ConcurrentMap<UUID, ParentRepairSession> parentRepairSessions;
/**
* Protected constructor. Use ActiveRepairService.instance.
*/
protected ActiveRepairService()
{
sessions = new ConcurrentHashMap<>();
parentRepairSessions = new ConcurrentHashMap<>();
}
/**
* Requests repairs for the given keyspace and column families.
*
* @return Future for the repair session, or null if there is nothing to repair (no column families or no other endpoints)
*/
public RepairFuture submitRepairSession(UUID parentRepairSession, Range<Token> range, String keyspace, RepairParallelism parallelismDegree, Set<InetAddress> endpoints, String... cfnames)
{
if (cfnames.length == 0)
return null;
RepairSession session = new RepairSession(parentRepairSession, range, keyspace, parallelismDegree, endpoints, cfnames);
if (session.endpoints.isEmpty())
return null;
RepairFuture futureTask = new RepairFuture(session);
executor.execute(futureTask);
return futureTask;
}
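/**
 * Registers a repair session as active: the session is tracked locally and
 * subscribed to gossip and failure detection events so it can react to
 * endpoint state changes.
 */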
public void addToActiveSessions(RepairSession session)
{
sessions.put(session.getId(), session);
Gossiper.instance.register(session);
FailureDetector.instance.registerFailureDetectionEventListener(session);
}
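/**
 * Unregisters a repair session from gossip and removes it from the set of
 * active sessions.
 */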
public void removeFromActiveSessions(RepairSession session)
{
Gossiper.instance.unregister(session);
sessions.remove(session.getId());
}
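/**
 * Force-shuts down every active repair session and clears all parent repair
 * sessions.
 */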
public synchronized void terminateSessions()
{
for (RepairSession session : sessions.values())
{
session.forceShutdown();
}
parentRepairSessions.clear();
}
// for testing only. Create a session corresponding to a fake request and
// add it to the sessions (avoid NPE in tests)
RepairFuture submitArtificialRepairSession(RepairJobDesc desc)
{
Set<InetAddress> neighbours = new HashSet<>();
neighbours.addAll(ActiveRepairService.getNeighbors(desc.keyspace, desc.range, null, null));
RepairSession session = new RepairSession(desc.parentSessionId, desc.sessionId, desc.range, desc.keyspace, RepairParallelism.PARALLEL, neighbours, new String[]{desc.columnFamily});
sessions.put(session.getId(), session);
RepairFuture futureTask = new RepairFuture(session);
executor.execute(futureTask);
return futureTask;
}
/**
* Return all of the neighbors with whom we share the provided range.
*
* @param keyspaceName keyspace to repair
* @param toRepair token range to repair
* @param dataCenters the data centers to involve in the repair
* @param hosts the specific hosts to involve in the repair
*
* @return neighbors with whom we share the provided range
*/
public static Set<InetAddress> getNeighbors(String keyspaceName, Range<Token> toRepair, Collection<String> dataCenters, Collection<String> hosts)
{
StorageService ss = StorageService.instance;
Map<Range<Token>, List<InetAddress>> replicaSets = ss.getRangeToAddressMap(keyspaceName);
Range<Token> rangeSuperSet = null;
for (Range<Token> range : ss.getLocalRanges(keyspaceName))
{
if (range.contains(toRepair))
{
rangeSuperSet = range;
break;
}
else if (range.intersects(toRepair))
{
throw new IllegalArgumentException("Requested range intersects a local range but is not fully contained in one; this would lead to imprecise repair");
}
}
if (rangeSuperSet == null || !replicaSets.containsKey(rangeSuperSet))
return Collections.emptySet();
Set<InetAddress> neighbors = new HashSet<>(replicaSets.get(rangeSuperSet));
neighbors.remove(FBUtilities.getBroadcastAddress());
if (dataCenters != null)
{
TokenMetadata.Topology topology = ss.getTokenMetadata().cloneOnlyTokenMap().getTopology();
Set<InetAddress> dcEndpoints = Sets.newHashSet();
Multimap<String,InetAddress> dcEndpointsMap = topology.getDatacenterEndpoints();
for (String dc : dataCenters)
{
Collection<InetAddress> c = dcEndpointsMap.get(dc);
if (c != null)
dcEndpoints.addAll(c);
}
return Sets.intersection(neighbors, dcEndpoints);
}
else if (hosts != null)
{
Set<InetAddress> specifiedHost = new HashSet<>();
for (final String host : hosts)
{
try
{
final InetAddress endpoint = InetAddress.getByName(host.trim());
if (endpoint.equals(FBUtilities.getBroadcastAddress()) || neighbors.contains(endpoint))
specifiedHost.add(endpoint);
}
catch (UnknownHostException e)
{
throw new IllegalArgumentException("Unknown host specified " + host, e);
}
}
if (!specifiedHost.contains(FBUtilities.getBroadcastAddress()))
throw new IllegalArgumentException("The current host must be part of the repair");
if (specifiedHost.size() <= 1)
{
String msg = "Repair requires at least two endpoints that are neighbours before it can continue, the endpoint used for this repair is %s, " +
"other available neighbours are %s but these neighbours were not part of the supplied list of hosts to use during the repair (%s).";
throw new IllegalArgumentException(String.format(msg, specifiedHost, neighbors, hosts));
}
specifiedHost.remove(FBUtilities.getBroadcastAddress());
return specifiedHost;
}
return neighbors;
}
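/**
 * Registers a new parent repair session and sends a PrepareMessage to every
 * participating endpoint, blocking (for up to one hour) until all of them
 * have replied. If any endpoint reports a failure, or the wait is
 * interrupted, the parent session is removed and a RuntimeException listing
 * the failed endpoints is thrown.
 *
 * @return the id of the newly registered parent repair session
 */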
public synchronized UUID prepareForRepair(Set<InetAddress> endpoints, Collection<Range<Token>> ranges, List<ColumnFamilyStore> columnFamilyStores)
{
UUID parentRepairSession = UUIDGen.getTimeUUID();
registerParentRepairSession(parentRepairSession, columnFamilyStores, ranges);
final CountDownLatch prepareLatch = new CountDownLatch(endpoints.size());
final AtomicBoolean status = new AtomicBoolean(true);
final Set<String> failedNodes = Collections.synchronizedSet(new HashSet<String>());
IAsyncCallbackWithFailure callback = new IAsyncCallbackWithFailure()
{
public void response(MessageIn msg)
{
prepareLatch.countDown();
}
public boolean isLatencyForSnitch()
{
return false;
}
public void onFailure(InetAddress from)
{
status.set(false);
failedNodes.add(from.getHostAddress());
prepareLatch.countDown();
}
};
List<UUID> cfIds = new ArrayList<>(columnFamilyStores.size());
for (ColumnFamilyStore cfs : columnFamilyStores)
cfIds.add(cfs.metadata.cfId);
for (InetAddress neighbour : endpoints)
{
PrepareMessage message = new PrepareMessage(parentRepairSession, cfIds, ranges);
MessageOut<RepairMessage> msg = message.createMessage();
MessagingService.instance().sendRRWithFailure(msg, neighbour, callback);
}
try
{
prepareLatch.await(1, TimeUnit.HOURS);
}
catch (InterruptedException e)
{
parentRepairSessions.remove(parentRepairSession);
throw new RuntimeException("Did not get replies from all endpoints. List of failed endpoint(s): " + failedNodes.toString(), e);
}
if (!status.get())
{
parentRepairSessions.remove(parentRepairSession);
throw new RuntimeException("Did not get positive replies from all endpoints. List of failed endpoint(s): " + failedNodes.toString());
}
return parentRepairSession;
}
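/**
 * Records a new parent repair session locally, keyed by its id, together
 * with the column families and token ranges it covers.
 */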
public synchronized void registerParentRepairSession(UUID parentRepairSession, List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges)
{
parentRepairSessions.put(parentRepairSession, new ParentRepairSession(columnFamilyStores, ranges, System.currentTimeMillis()));
}
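/**
 * Returns the sstables of the given column family that are already claimed
 * by other parent repair sessions (i.e. any session other than the one
 * supplied).
 */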
public Set<SSTableReader> currentlyRepairing(UUID cfId, UUID parentRepairSession)
{
Set<SSTableReader> repairing = new HashSet<>();
for (Map.Entry<UUID, ParentRepairSession> entry : parentRepairSessions.entrySet())
{
Collection<SSTableReader> sstables = entry.getValue().sstableMap.get(cfId);
if (sstables != null && !entry.getKey().equals(parentRepairSession))
repairing.addAll(sstables);
}
return repairing;
}
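/**
 * Completes a parent repair session. When doAntiCompaction is set, an
 * AnticompactionRequest is sent to every neighbor and local anti-compaction
 * is run and awaited; the parent session is removed in all cases.
 */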
public synchronized void finishParentSession(UUID parentSession, Set<InetAddress> neighbors, boolean doAntiCompaction) throws InterruptedException, ExecutionException, IOException
{
try
{
if (doAntiCompaction)
{
for (InetAddress neighbor : neighbors)
{
AnticompactionRequest acr = new AnticompactionRequest(parentSession);
MessageOut<RepairMessage> req = acr.createMessage();
MessagingService.instance().sendOneWay(req, neighbor);
}
List<Future<?>> futures = doAntiCompaction(parentSession);
FBUtilities.waitOnFutures(futures);
}
}
finally
{
parentRepairSessions.remove(parentSession);
}
}
public ParentRepairSession getParentRepairSession(UUID parentSessionId)
{
return parentRepairSessions.get(parentSessionId);
}
public synchronized ParentRepairSession removeParentRepairSession(UUID parentSessionId)
{
return parentRepairSessions.remove(parentSessionId);
}
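/**
 * Submits an anti-compaction task for every column family in the parent
 * repair session, using the session's ranges, its referenced sstables and
 * its repairedAt timestamp, and returns the resulting futures.
 */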
public List<Future<?>> doAntiCompaction(UUID parentRepairSession) throws InterruptedException, ExecutionException, IOException
{
assert parentRepairSession != null;
ParentRepairSession prs = getParentRepairSession(parentRepairSession);
List<Future<?>> futures = new ArrayList<>();
for (Map.Entry<UUID, ColumnFamilyStore> columnFamilyStoreEntry : prs.columnFamilyStores.entrySet())
{
Refs<SSTableReader> sstables = prs.getAndReferenceSSTables(columnFamilyStoreEntry.getKey());
ColumnFamilyStore cfs = columnFamilyStoreEntry.getValue();
futures.add(CompactionManager.instance.submitAntiCompaction(cfs, prs.ranges, sstables, prs.repairedAt));
}
return futures;
}
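/**
 * Dispatches VALIDATION_COMPLETE and SYNC_COMPLETE messages to the repair
 * session they belong to; messages for unknown sessions are dropped.
 */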
public void handleMessage(InetAddress endpoint, RepairMessage message)
{
RepairJobDesc desc = message.desc;
RepairSession session = sessions.get(desc.sessionId);
if (session == null)
return;
switch (message.messageType)
{
case VALIDATION_COMPLETE:
ValidationComplete validation = (ValidationComplete) message;
session.validationComplete(desc, endpoint, validation.tree);
break;
case SYNC_COMPLETE:
// one pair of replicas has finished syncing.
SyncComplete sync = (SyncComplete) message;
session.syncComplete(desc, sync.nodes, sync.success);
break;
default:
break;
}
}
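/**
 * State shared by all repair sessions belonging to one user-triggered
 * repair: the column family stores, the token ranges being repaired, the
 * sstables involved and the repairedAt timestamp used for anti-compaction.
 */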
public static class ParentRepairSession
{
public final Map<UUID, ColumnFamilyStore> columnFamilyStores = new HashMap<>();
public final Collection<Range<Token>> ranges;
public final Map<UUID, Set<SSTableReader>> sstableMap;
public final long repairedAt;
public ParentRepairSession(List<ColumnFamilyStore> columnFamilyStores, Collection<Range<Token>> ranges, long repairedAt)
{
for (ColumnFamilyStore cfs : columnFamilyStores)
this.columnFamilyStores.put(cfs.metadata.cfId, cfs);
this.ranges = ranges;
this.sstableMap = new HashMap<>();
this.repairedAt = repairedAt;
}
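/**
 * Acquires references to this session's sstables for the given column
 * family. SSTables whose data file no longer exists, or that can no longer
 * be referenced, are dropped from the session instead of being returned.
 */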
public synchronized Refs<SSTableReader> getAndReferenceSSTables(UUID cfId)
{
Set<SSTableReader> sstables = sstableMap.get(cfId);
Iterator<SSTableReader> sstableIterator = sstables.iterator();
ImmutableMap.Builder<SSTableReader, Ref<SSTableReader>> references = ImmutableMap.builder();
while (sstableIterator.hasNext())
{
SSTableReader sstable = sstableIterator.next();
if (!new File(sstable.descriptor.filenameFor(Component.DATA)).exists())
{
sstableIterator.remove();
}
else
{
Ref<SSTableReader> ref = sstable.tryRef();
if (ref == null)
sstableIterator.remove();
else
references.put(sstable, ref);
}
}
return new Refs<>(references.build());
}
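/**
 * Adds the given sstables to the set tracked for the column family,
 * creating the set if this is the first addition for that cfId.
 */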
public void addSSTables(UUID cfId, Collection<SSTableReader> sstables)
{
Set<SSTableReader> existingSSTables = this.sstableMap.get(cfId);
if (existingSSTables == null)
existingSSTables = new HashSet<>();
existingSSTables.addAll(sstables);
this.sstableMap.put(cfId, existingSSTables);
}
@Override
public String toString()
{
return "ParentRepairSession{" +
"columnFamilyStores=" + columnFamilyStores +
", ranges=" + ranges +
", sstableMap=" + sstableMap +
", repairedAt=" + repairedAt +
'}';
}
}
}