/*
 * Copyright 2007-2010 Sun Microsystems, Inc.
 *
 * This file is part of Project Darkstar Server.
 *
 * Project Darkstar Server is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation and
 * distributed hereunder to you.
 *
 * Project Darkstar Server is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * --
 */

package com.sun.sgs.impl.service.watchdog;

import com.sun.sgs.app.NameNotBoundException;
import com.sun.sgs.app.util.ManagedSerializable;
import com.sun.sgs.impl.kernel.StandardProperties;
import com.sun.sgs.impl.sharedutil.LoggerWrapper;
import com.sun.sgs.impl.sharedutil.Objects;
import com.sun.sgs.impl.sharedutil.PropertiesWrapper;
import com.sun.sgs.impl.util.AbstractKernelRunnable;
import com.sun.sgs.impl.util.AbstractService;
import com.sun.sgs.impl.util.AbstractService.Version;
import com.sun.sgs.impl.util.Exporter;
import com.sun.sgs.kernel.ComponentRegistry;
import com.sun.sgs.kernel.KernelRunnable;
import com.sun.sgs.kernel.RecurringTaskHandle;
import com.sun.sgs.management.NodeInfo;
import com.sun.sgs.management.NodesMXBean;
import com.sun.sgs.profile.ProfileCollector;
import com.sun.sgs.service.Node;
import com.sun.sgs.service.Node.Health;
import com.sun.sgs.service.TransactionProxy;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Properties;
import java.util.Queue;
import java.util.Random;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.management.JMException;
import javax.management.MBeanNotificationInfo;
import javax.management.Notification;
import javax.management.NotificationBroadcasterSupport;

/**
 * The {@link WatchdogServer} implementation. <p>
 *
 * The {@link #WatchdogServerImpl constructor} supports the following
 * properties: <p>
 *
 * <dl style="margin-left: 1em">
 *
 * <dt> <i>Property:</i> <code><b>
 *	com.sun.sgs.impl.service.watchdog.server.port
 *	</b></code><br>
 *	<i>Default:</i> {@code 44533}
 *
 * <dd style="padding-top: .5em">Specifies the network port for the server.
 *	This value must be greater than or equal to {@code 0} and no greater
 *	than {@code 65535}.  If the value specified is {@code 0}, then an
 *	anonymous port will be chosen.  The value chosen will be logged, and
 *	can also be accessed with the {@link #getPort getPort} method. <p>
 *
 * <dt> <i>Property:</i> <code><b>
 *	com.sun.sgs.impl.service.watchdog.server.renew.interval
 *	</b></code><br>
 *	<i>Default:</i> {@code 1000} (one second)<br>
 *
 * <dd style="padding-top: .5em">
 *	Specifies the renew interval which is returned by the
 *	{@link #renewNode renewNode} method.
 *	The interval must be greater than or equal to {@code 100}
 *	milliseconds; the upper bound is effectively unbounded
 *	({@code Integer.MAX_VALUE} milliseconds).<p>
 *
 * <dt> <i>Property:</i> <code><b>
 *	com.sun.sgs.impl.service.watchdog.server.timeflush.interval
 *	</b></code><br>
 *	<i>Default:</i> {@code 5000} (five seconds)
 *
 * <dd style="padding-top: .5em">Specifies the amount of time in milliseconds
 *	that the server will wait between updates to the global application
 *	time stored in the data store.  A larger value uses fewer system
 *	resources, but allows the global application clock to drift by up to
 *	the given value if the system crashes.  The interval must be greater
 *	than or equal to {@code 100} milliseconds and less than or equal to
 *	{@code 300000} milliseconds (five minutes).<p>
 *
 * </dl> <p>
 *
 * Note that this server caches NodeImpls outside the data service to
 * maintain state.
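 * <p>
 * As an illustrative sketch (the values shown are simply the defaults
 * listed above), a configuration might include:
 * <pre>
 * com.sun.sgs.impl.service.watchdog.server.port=44533
 * com.sun.sgs.impl.service.watchdog.server.renew.interval=1000
 * com.sun.sgs.impl.service.watchdog.server.timeflush.interval=5000
 * </pre>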
 */
public final class WatchdogServerImpl
    extends AbstractService
    implements WatchdogServer
{
    /** The name of this class. */
    private static final String CLASSNAME =
        WatchdogServerImpl.class.getName();

    /** The package name. */
    private static final String PKG_NAME =
        "com.sun.sgs.impl.service.watchdog";

    /** The prefix for server properties. */
    private static final String SERVER_PROPERTY_PREFIX =
        PKG_NAME + ".server";

    /** The logger for this class. */
    private static final LoggerWrapper logger =
        new LoggerWrapper(Logger.getLogger(SERVER_PROPERTY_PREFIX));

    /** The name of the version key. */
    private static final String VERSION_KEY =
        SERVER_PROPERTY_PREFIX + ".version";

    /** The major version. */
    private static final int MAJOR_VERSION = 1;

    /** The minor version. */
    private static final int MINOR_VERSION = 0;

    /** The server name in the registry. */
    static final String WATCHDOG_SERVER_NAME = "WatchdogServer";

    /** The property name for the server port. */
    static final String PORT_PROPERTY = SERVER_PROPERTY_PREFIX + ".port";

    /** The default value of the server port. */
    static final int DEFAULT_PORT = 44533;

    /** The property name for the renew interval. */
    private static final String RENEW_INTERVAL_PROPERTY =
        SERVER_PROPERTY_PREFIX + ".renew.interval";

    /** The default value of the renew interval. */
    private static final int DEFAULT_RENEW_INTERVAL = 1000;

    /** The lower bound for the renew interval. */
    private static final int RENEW_INTERVAL_LOWER_BOUND = 100;

    /** The upper bound for the renew interval. */
    private static final int RENEW_INTERVAL_UPPER_BOUND = Integer.MAX_VALUE;

    /** The property name for the timeflush interval. */
    private static final String TIMEFLUSH_INTERVAL_PROPERTY =
        SERVER_PROPERTY_PREFIX + ".timeflush.interval";

    /** The default time in milliseconds to wait between timeflushes. */
    private static final long DEFAULT_TIMEFLUSH_INTERVAL = 5000L;

    /**
     * The name binding used to store the current global time in the
     * data store.
     */
    private static final String APP_TIME_BINDING = PKG_NAME + ".appTime";

    /**
     * The name binding used to store the most recent timestamp interval
     * that was being used when the application time was updated in the
     * data store.
     */
    private static final String APP_TIME_DRIFT_BINDING =
        PKG_NAME + ".appTimeDrift";

    /** The server port. */
    private final int serverPort;

    /** The renew interval. */
    final long renewInterval;

    /** The timeflush interval. */
    final long timeflushInterval;

    /** The node ID for this server. */
    final long localNodeId;

    /**
     * If {@code true}, this stack is a full stack, so the local node
     * can be assigned as a backup node.  If {@code false}, this stack
     * is a server stack only, so the local node can not be assigned as
     * a backup node.
     */
    private final boolean isFullStack;

    /** The exporter for this server. */
    private final Exporter<WatchdogServer> exporter;

    /** The lock for notifying the {@code NotifyClientsThread}. */
    final Object notifyClientsLock = new Object();

    /** The thread to notify clients of node status changes. */
    private final Thread notifyClientsThread = new NotifyClientsThread();

    /** The queue of nodes whose status has changed. */
    final Queue<NodeImpl> statusChangedNodes =
        new ConcurrentLinkedQueue<NodeImpl>();

    /** The map of registered nodes that are alive, keyed by node ID. */
    private final ConcurrentMap<Long, NodeImpl> aliveNodes =
        new ConcurrentHashMap<Long, NodeImpl>();

    // TBD: use a ConcurrentSkipListSet?
    /** The set of alive nodes, sorted by renew expiration time. */
    final SortedSet<NodeImpl> expirationSet =
        Collections.synchronizedSortedSet(new TreeSet<NodeImpl>());

    /** The set of failed nodes that are currently recovering. */
    private final ConcurrentMap<Long, NodeImpl> recoveringNodes =
        new ConcurrentHashMap<Long, NodeImpl>();

    /** A random number generator, for choosing backup nodes. */
    private final Random backupChooser = new Random();

    /**
     * The thread for checking node expiration times and checking if
     * recovering nodes need backups assigned.
     */
    private final Thread checkExpirationThread = new CheckExpirationThread();

    /** The JMX MXBean to expose nodes in the system. */
    private final NodeManager nodeMgr;

    /** The offset to use when reporting the global application time. */
    private long timeOffset;

    /** A handle to the periodic global time flush task. */
    private RecurringTaskHandle timeflushTaskHandle = null;

    /**
     * Constructs an instance of this class with the specified properties.
     * See the {@link WatchdogServerImpl class documentation} for a list
     * of supported properties.
     *
     * @param properties server properties
     * @param systemRegistry the system registry
     * @param txnProxy the transaction proxy
     * @param host the local host name
     * @param client the local watchdog client
     * @param fullStack {@code true} if this server is running on a full
     *        stack
     *
     * @throws Exception if there is a problem starting the server
     */
    public WatchdogServerImpl(Properties properties,
                              ComponentRegistry systemRegistry,
                              TransactionProxy txnProxy,
                              String host,
                              WatchdogClient client,
                              boolean fullStack)
        throws Exception
    {
        super(properties, systemRegistry, txnProxy, logger);
        logger.log(Level.CONFIG, "Creating WatchdogServerImpl");

        PropertiesWrapper wrappedProps = new PropertiesWrapper(properties);
        isFullStack = fullStack;
        if (logger.isLoggable(Level.CONFIG)) {
            logger.log(Level.CONFIG,
                       "WatchdogServerImpl[" + host + "]: detected " +
                       (isFullStack ? "full stack" : "server stack"));
        }

        /*
         * Check service version.
         */
        transactionScheduler.runTask(
            new AbstractKernelRunnable("CheckServiceVersion") {
                public void run() {
                    checkServiceVersion(
                        VERSION_KEY, MAJOR_VERSION, MINOR_VERSION);
                }
            }, taskOwner);

        int requestedPort = wrappedProps.getIntProperty(
            PORT_PROPERTY, DEFAULT_PORT, 0, 65535);

        boolean noRenewIntervalProperty =
            wrappedProps.getProperty(RENEW_INTERVAL_PROPERTY) == null;
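        /*
         * Note: on a full (single-node) stack with no renew interval
         * explicitly configured, the upper bound is used below so that the
         * lone node is effectively never expired; otherwise the configured
         * or default interval, within the documented bounds, applies.
         */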
        renewInterval =
            isFullStack && noRenewIntervalProperty ?
            RENEW_INTERVAL_UPPER_BOUND :
            wrappedProps.getLongProperty(
                RENEW_INTERVAL_PROPERTY, DEFAULT_RENEW_INTERVAL,
                RENEW_INTERVAL_LOWER_BOUND, RENEW_INTERVAL_UPPER_BOUND);
        if (logger.isLoggable(Level.CONFIG)) {
            logger.log(Level.CONFIG,
                       "WatchdogServerImpl[" + host + "]: renewInterval:" +
                       renewInterval);
        }

        timeflushInterval = wrappedProps.getLongProperty(
            TIMEFLUSH_INTERVAL_PROPERTY, DEFAULT_TIMEFLUSH_INTERVAL,
            100, 300000);

        FailedNodesRunnable failedNodesRunnable = new FailedNodesRunnable();
        transactionScheduler.runTask(failedNodesRunnable, taskOwner);
        Collection<NodeImpl> failedNodes = failedNodesRunnable.nodes;
        statusChangedNodes.addAll(failedNodes);
        for (NodeImpl failedNode : failedNodes) {
            recoveringNodes.put(failedNode.getId(), failedNode);
        }

        // Create the node manager MBean and register it.  This must be
        // done before registerNode is called.
        ProfileCollector collector =
            systemRegistry.getComponent(ProfileCollector.class);
        nodeMgr = new NodeManager(this);
        try {
            collector.registerMBean(nodeMgr, NodeManager.MXBEAN_NAME);
        } catch (JMException e) {
            logger.logThrow(Level.CONFIG, e, "Could not register MBean");
        }

        // Register our local node ID.
        int jmxPort = wrappedProps.getIntProperty(
            StandardProperties.SYSTEM_JMX_REMOTE_PORT, -1);
        localNodeId = dataService.getLocalNodeId();
        registerNode(localNodeId, host, client, jmxPort);

        exporter = new Exporter<WatchdogServer>(WatchdogServer.class);
        serverPort = exporter.export(this, WATCHDOG_SERVER_NAME,
                                     requestedPort);
        if (requestedPort == 0) {
            logger.log(
                Level.INFO, "Server is using port {0,number,#}", serverPort);
        }

        checkExpirationThread.start();

        logger.log(Level.CONFIG,
                   "Created WatchdogServerImpl with properties:" +
                   "\n " + PORT_PROPERTY + "=" + requestedPort +
                   "\n " + RENEW_INTERVAL_PROPERTY + "=" + renewInterval +
                   "\n " + TIMEFLUSH_INTERVAL_PROPERTY + "=" +
                   timeflushInterval);
    }

    /** Calls NodeImpl.markAllNodesFailed. */
    private class FailedNodesRunnable extends AbstractKernelRunnable {
        Collection<NodeImpl> nodes = null;

        /** Constructs an instance. */
        FailedNodesRunnable() {
            super(null);
        }

        /** {@inheritDoc} */
        public void run() {
            nodes = NodeImpl.markAllNodesFailed(dataService);
        }
    }

    /* -- Implement AbstractService -- */

    /** {@inheritDoc} */
    protected void handleServiceVersionMismatch(
        Version oldVersion, Version currentVersion)
    {
        throw new IllegalStateException(
            "unable to convert version:" + oldVersion +
            " to current version:" + currentVersion);
    }

    /** {@inheritDoc} */
    protected void doReady() throws Exception {
        assert !notifyClientsThread.isAlive();
        // Don't notify clients until other services have had a chance
        // to register themselves with the watchdog.
        notifyClientsThread.start();

        // If this is the first time booting up, bind the current global
        // time and set now as time 0.  Also establish the global timeOffset
        // for the server.
        try {
            transactionScheduler.runTask(new TimestampBindingRunner(),
                                         taskOwner);
        } catch (Exception e) {
            throw new AssertionError("Failed to initialize global time");
        }

        // Kick off a periodic time flush task.
        timeflushTaskHandle = transactionScheduler.scheduleRecurringTask(
            new TimeflushRunner(timeflushInterval), taskOwner,
            System.currentTimeMillis(), timeflushInterval);
        timeflushTaskHandle.start();
    }
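    /*
     * Shutdown ordering note: the server is unexported first so that no
     * new remote calls arrive, the expiration and notification threads are
     * then woken and joined, the remaining alive nodes are marked failed
     * and clients notified, and the final time flush runs last so the
     * persisted global time is as current as possible.
     */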
    /** {@inheritDoc} */
    protected void doShutdown() {
        // Unexport server and stop threads.
        exporter.unexport();
        synchronized (checkExpirationThread) {
            checkExpirationThread.notifyAll();
        }
        synchronized (notifyClientsLock) {
            notifyClientsLock.notifyAll();
        }
        try {
            checkExpirationThread.join();
            notifyClientsThread.join();
        } catch (InterruptedException e) {
        }
        expirationSet.clear();
        statusChangedNodes.clear();

        // Mark all nodes failed and notify all clients (except local one)
        // of failure.
        final Collection<NodeImpl> failedNodes = aliveNodes.values();
        try {
            transactionScheduler.runTask(
                new AbstractKernelRunnable("MarkAllNodesFailed") {
                    public void run() {
                        for (NodeImpl node : failedNodes) {
                            node.setFailed(dataService, null);
                        }
                    }
                }, taskOwner);
        } catch (Exception e) {
            logger.logThrow(
                Level.WARNING, e,
                "Failed to update failed nodes during shutdown, throws");
        }

        Set<NodeImpl> failedNodesExceptMe =
            new HashSet<NodeImpl>(failedNodes);
        failedNodesExceptMe.remove(aliveNodes.get(localNodeId));
        notifyClients(failedNodesExceptMe, failedNodes);

        for (long nodeId : aliveNodes.keySet()) {
            nodeMgr.notifyNodeFailed(nodeId);
        }
        aliveNodes.clear();

        // Stop the time flush task and take the final timestamp.
        if (timeflushTaskHandle != null) {
            timeflushTaskHandle.cancel();
        }
        try {
            transactionScheduler.runTask(new TimeflushRunner(0), taskOwner);
        } catch (Exception e) {
            if (logger.isLoggable(Level.FINE)) {
                logger.logThrow(Level.FINE, e,
                                "Unable to store latest application time");
            }
        }
    }

    /* -- Implement WatchdogServer -- */

    /**
     * {@inheritDoc}
     */
    public long registerNode(long nodeId,
                             final String host,
                             WatchdogClient client,
                             int jmxPort)
        throws NodeRegistrationFailedException
    {
        callStarted();

        if (logger.isLoggable(Level.FINEST)) {
            logger.log(Level.FINEST,
                       "registering node {0} on host {1}", nodeId, host);
        }

        try {
            if (host == null) {
                throw new IllegalArgumentException("null host");
            } else if (client == null) {
                throw new IllegalArgumentException("null client");
            }

            // Put new node in transient map.
            final NodeImpl node =
                new NodeImpl(nodeId, host, jmxPort, client);
            if (aliveNodes.putIfAbsent(nodeId, node) != null) {
                logger.log(Level.SEVERE,
                           "Duplicate node ID generated for node on {0}",
                           host);
                throw new NodeRegistrationFailedException(
                    "Duplicate node ID generated");
            }

            // Persist node.
            try {
                transactionScheduler.runTask(
                    new AbstractKernelRunnable("StoreNewNode") {
                        public void run() {
                            node.putNode(dataService);
                        }
                    }, taskOwner);
            } catch (Exception e) {
                aliveNodes.remove(nodeId);
                throw new NodeRegistrationFailedException(
                    "registration failed: " + nodeId, e);
            }

            // Put node in set, sorted by expiration.
            node.setExpiration(calculateExpiration());
            nodeMgr.notifyNodeStarted(nodeId);
            expirationSet.add(node);

            // Notify clients of new node.
            statusChangedNodes.add(node);
            synchronized (notifyClientsLock) {
                notifyClientsLock.notifyAll();
            }
            logger.log(Level.INFO, "node:{0} registered", node);
            return renewInterval;
        } finally {
            callFinished();
        }
    }

    /**
     * {@inheritDoc}
     */
    public boolean renewNode(long nodeId) {
        callStarted();

        try {
            NodeImpl node = aliveNodes.get(nodeId);
            if (node == null || !node.isAlive() || node.isExpired()) {
                return false;
            }

            synchronized (expirationSet) {
                // Update expiration time in sorted set.
                expirationSet.remove(node);
                node.setExpiration(calculateExpiration());
                expirationSet.add(node);
                return true;
            }
        } finally {
            callFinished();
        }
    }
    /**
     * {@inheritDoc}
     */
    public void recoveredNode(final long nodeId, long backupId) {
        callStarted();

        try {
            try {
                // TBD: should the node be removed if the current
                // backup ID for the node with the given node ID
                // is not the given backup ID?
                transactionScheduler.runTask(
                    new AbstractKernelRunnable("RemoveRecoveredNode") {
                        public void run() {
                            NodeImpl.removeNode(dataService, nodeId);
                        }
                    }, taskOwner);
            } catch (Exception e) {
                logger.logThrow(
                    Level.WARNING, e,
                    "Removing recovered node {0} throws", nodeId);
            }
            recoveringNodes.remove(nodeId);
        } finally {
            callFinished();
        }
    }

    /**
     * {@inheritDoc}
     */
    public void setNodeHealth(long nodeId, boolean isLocal,
                              final Health health, String component,
                              int maxNumberOfAttempts)
    {
        final NodeImpl node = aliveNodes.get(nodeId);
        if (node == null) {
            if (logger.isLoggable(Level.FINEST)) {
                logger.log(Level.FINEST,
                           "Node with ID {0} is already reported as failed",
                           nodeId);
            }
            return;
        }

        if (!health.isAlive()) {
            setNodeAsFailed(node, isLocal, component, maxNumberOfAttempts);
        } else {
            // Persist the change.
            try {
                transactionScheduler.runTask(
                    new AbstractKernelRunnable("SetNodeHealth") {
                        public void run() {
                            node.setHealth(dataService, health);
                        }
                    }, taskOwner);
            } catch (Exception e) {
                logger.logThrow(Level.SEVERE, e,
                                "Setting node: {0} health throws", node);
            }

            // Notify clients of a status change.
            statusChangedNodes.add(node);
            synchronized (notifyClientsLock) {
                notifyClientsLock.notifyAll();
            }
        }
    }

    /**
     * Marks the given node as failed, optionally reporting the failure to
     * the remote node first, and notifies clients of the status change.
     */
    private void setNodeAsFailed(NodeImpl node, boolean isLocal,
                                 String component, int maxNumberOfAttempts)
    {
        if (!isLocal) {
            // Try to report the failure to the remote failed node so that
            // the node can shut itself down.  Try a few times if we run
            // into an IOException.
            int retries = maxNumberOfAttempts;
            while (retries-- > 0) {
                try {
                    node.getWatchdogClient().reportFailure(component);
                    break;
                } catch (IOException ioe) {
                    if (retries == 0) {
                        logger.logThrow(
                            Level.WARNING, ioe, "Reporting failure " +
                            "to node:{0} from node:{1}, className:{2} " +
                            "after {3} attempt(s), throws",
                            node.getId(), localNodeId, component,
                            maxNumberOfAttempts);
                    }
                }
            }
        }
        processNodeFailures(Arrays.asList(node));
        statusChangedNodes.add(node);
        synchronized (notifyClientsLock) {
            notifyClientsLock.notifyAll();
        }
    }

    /**
     * {@inheritDoc}
     */
    public long currentAppTimeMillis() {
        return System.currentTimeMillis() - timeOffset;
    }

    /* -- other methods -- */

    /**
     * Returns the offset being used by this server to report global
     * application time with the {@link #currentAppTimeMillis()} method.
     *
     * @return the time offset
     */
    long getTimeOffset() {
        return timeOffset;
    }
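    /*
     * A worked example of the time bookkeeping, for illustration: if the
     * stack first booted at wall-clock time W0, then timeOffset == W0, so
     * currentAppTimeMillis() reports 0 at boot and advances with the wall
     * clock.  After a crash and restart, TimestampBindingRunner restores
     * the persisted application time (plus half the recorded timeflush
     * interval), so the clock drifts by at most roughly one timeflush
     * interval per crash rather than resetting to zero.
     */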
    /**
     * Processes the nodes which have failed by calling the failure methods
     * for each node in the collection.  The processing is split into two
     * for-loops so that a failed node is not mistakenly chosen as a backup
     * while this operation is occurring.
     *
     * @param nodesToFail the collection of failed nodes
     * @return a subset of {@code nodesToFail} that were marked as failed
     *         by this method
     */
    Collection<NodeImpl> processNodeFailures(Collection<NodeImpl> nodesToFail)
    {
        Collection<NodeImpl> aliveNodesToFail = new ArrayList<NodeImpl>();

        // Declare a node as failed only if it has not already been
        // reported as failed, to prevent a failed node from being assigned
        // as a backup.  It should be noted that nodes removed from the map
        // of aliveNodes are never added back.
        for (NodeImpl node : nodesToFail) {
            if (aliveNodes.remove(node.getId()) != null) {
                aliveNodesToFail.add(node);
            }
        }

        // Perform the failure procedure on each node that was still alive
        // when this method was called.
        for (NodeImpl node : aliveNodesToFail) {
            /*
             * Mark the node as failed, assign it a backup, and update the
             * data store.  Add the node to the list of recovering nodes.
             * The node will be removed from the list and from the data
             * store in the 'recoveredNode' callback.
             */
            setFailed(node);
            recoveringNodes.put(node.getId(), node);
        }
        return aliveNodesToFail;
    }

    /**
     * Returns the port being used for this server.
     *
     * @return the server port
     */
    public int getPort() {
        return serverPort;
    }

    /**
     * Returns an expiration time based on the current time.
     */
    private long calculateExpiration() {
        return System.currentTimeMillis() + renewInterval;
    }

    /**
     * This thread checks the node map, sorted by expiration times, to
     * determine if any nodes have failed and updates their state
     * accordingly.  It also checks the recovering nodes to see if a given
     * recovering node has no backup assigned, and assigns a backup if one
     * is available.
     */
    private final class CheckExpirationThread extends Thread {

        /** Constructs an instance of this class as a daemon thread. */
        CheckExpirationThread() {
            super(CLASSNAME + "$CheckExpirationThread");
            setDaemon(true);
        }

        /**
         * Wakes up periodically to detect failed nodes and update
         * state accordingly.
         */
        public void run() {
            Collection<NodeImpl> expiredNodes = new ArrayList<NodeImpl>();

            while (!shuttingDown()) {
                /*
                 * Determine which nodes have failed because they
                 * haven't renewed before their expiration time.
                 */
                long now = System.currentTimeMillis();
                synchronized (expirationSet) {
                    while (!expirationSet.isEmpty()) {
                        NodeImpl node = expirationSet.first();
                        // We are done aggregating from the sorted
                        // set once the expiration exceeds "now".
                        if (node.getExpiration() > now) {
                            break;
                        }
                        // Only report the node as expired if it
                        // is still alive.  Otherwise we assume it
                        // is already being reported as failed.
                        if (aliveNodes.containsKey(node.getId())) {
                            expiredNodes.add(node);
                        }
                        expirationSet.remove(node);
                    }
                }

                /*
                 * Perform the node failure procedure.  processNodeFailures
                 * removes each expired node from the map of "alive" nodes
                 * (so that a failed node won't be assigned as a backup),
                 * marks it as failed, assigns it a backup, updates the
                 * data store, and adds it to the list of recovering nodes.
                 * Each node will be removed from that list and from the
                 * data store in the 'recoveredNode' callback.
                 */
                if (!expiredNodes.isEmpty()) {
                    processNodeFailures(expiredNodes);
                    for (NodeImpl node : expiredNodes) {
                        nodeMgr.notifyNodeFailed(node.getId());
                    }
                    statusChangedNodes.addAll(expiredNodes);
                    expiredNodes.clear();
                }

                /*
                 * Check each recovering node: if a given recovering
                 * node doesn't have a backup, assign it a backup if
                 * an "alive" node is available to serve as one.
                 */
                if (!recoveringNodes.isEmpty()) {
                    for (NodeImpl recoveringNode :
                             recoveringNodes.values())
                    {
                        if (!recoveringNode.hasBackup()) {
                            NodeImpl backup = chooseBackup(recoveringNode);
                            if (backup != null) {
                                assignBackup(recoveringNode, backup);
                                statusChangedNodes.add(recoveringNode);
                            }
                        }
                    }
                }

                // TBD: should reminder notifications be sent to
                // nodes that haven't recovered yet?

                /*
                 * Notify thread to send out node status change
                 * notifications.
                 */
                if (!statusChangedNodes.isEmpty()) {
                    synchronized (notifyClientsLock) {
                        notifyClientsLock.notifyAll();
                    }
                }

                /*
                 * Readjust time to sleep before checking for expired
                 * nodes.
                 */
                long sleepTime;
                synchronized (expirationSet) {
                    // sleepTime is positive: any node whose expiration had
                    // already passed was removed from the set above.
                    sleepTime = expirationSet.isEmpty() ?
                        renewInterval :
                        expirationSet.first().getExpiration() - now;
                }
                synchronized (this) {
                    if (shuttingDown()) {
                        return;
                    }
                    try {
                        wait(sleepTime);
                    } catch (InterruptedException e) {
                        return;
                    }
                }
            }
        }
    }

    /**
     * Chooses a backup for the failed {@code node}, updates the
     * node's status to failed, assigns the chosen backup for the
     * node, and persists the node state changes in the data
     * service.
     */
    private void setFailed(final NodeImpl node) {

        final long nodeId = node.getId();
        logger.log(Level.FINE, "Node failed: {0}", nodeId);

        /*
         * First, reassign a backup to each primary for which the
         * failed node is a backup but hadn't yet completed
         * recovery.  If a backup is reassigned, add to the
         * statusChangedNodes queue so that the change can be
         * propagated to clients.
         */
        for (Long primaryId : node.getPrimaries()) {
            final NodeImpl primary = recoveringNodes.get(primaryId);
            if (primary != null) {
                assignBackup(primary, chooseBackup(primary));
                statusChangedNodes.add(primary);
            }
        }

        /*
         * Choose a backup for the failed node, update the node's
         * status to failed, and update the backup's state to include
         * the failed node as one that is being recovered.
         */
        assignBackup(node, chooseBackup(node));
    }

    /**
     * Chooses a backup for the specified {@code node} from the
     * map of "alive" nodes.  The backup is picked randomly.  Before this
     * method is invoked, the specified {@code node} as well as other
     * currently detected failed nodes should not be present in the
     * "alive" nodes map.
     */
    private NodeImpl chooseBackup(NodeImpl node) {
        NodeImpl choice = null;
        // Copy of the alive nodes.
        NodeImpl[] values;
        values = aliveNodes.values().toArray(new NodeImpl[0]);
        final int numAliveNodes = values.length;
        int random = numAliveNodes > 0 ?
            backupChooser.nextInt(numAliveNodes) : 0;
        for (int i = 0; i < numAliveNodes; i++) {
            // Choose one of the values[] elements randomly.  If we
            // chose the localNodeId, loop again, choosing the next
            // array element.
            int tryNode = (random + i) % numAliveNodes;
            NodeImpl backupCandidate = values[tryNode];
            /*
             * The local node can only be assigned as a backup
             * if this stack is a full stack (meaning that
             * this stack is running a single-node application).
             * If this node is the only "alive" node in a server
             * stack, then a backup for the failed node will remain
             * unassigned until a new node is registered and
             * recovery for the failed node will be delayed.
             */
            // assert backupCandidate.getId() != node.getId()
            if (isFullStack || backupCandidate.getId() != localNodeId) {
                choice = backupCandidate;
                break;
            }
        }
        if (logger.isLoggable(Level.FINE)) {
            logger.log(Level.FINE, "backup:{0} chosen for node:{1}",
                       choice, node);
        }
        return choice;
    }

    /**
     * Persists node and backup status updates in data service.
     */
    private void assignBackup(final NodeImpl node, final NodeImpl backup) {
        try {
            transactionScheduler.runTask(
                new AbstractKernelRunnable("SetNodeFailed") {
                    public void run() {
                        node.setFailed(dataService, backup);
                        if (backup != null) {
                            backup.addPrimary(dataService, node.getId());
                        }
                    }
                }, taskOwner);
        } catch (Exception e) {
            logger.logThrow(
                Level.SEVERE, e,
                "Marking node:{0} failed and assigning backup throws",
                node);
        }
    }
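    /*
     * Failure/recovery summary (derived from the methods above): a node
     * that misses its renew window is marked failed, a randomly chosen
     * alive node (never this server's own node on a server-only stack) is
     * assigned as its backup, and the failed node remains in
     * recoveringNodes until the backup completes recovery and calls
     * recoveredNode, which removes the node from the data store.
     */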
    /**
     * This thread informs all currently known clients of node status
     * changes (either nodes started or failed) as they occur.  This
     * thread is notified by {@link #registerNode registerNode} when
     * nodes are registered, or by the {@code CheckExpirationThread}
     * when nodes fail to renew before their expiration time has
     * lapsed.
     */
    private final class NotifyClientsThread extends Thread {

        /** Constructs an instance of this class as a daemon thread. */
        NotifyClientsThread() {
            super(CLASSNAME + "$NotifyClientsThread");
            setDaemon(true);
        }

        /** {@inheritDoc} */
        public void run() {
            while (true) {
                synchronized (notifyClientsLock) {
                    while (statusChangedNodes.isEmpty()) {
                        if (shuttingDown()) {
                            return;
                        }
                        try {
                            notifyClientsLock.wait();
                        } catch (InterruptedException e) {
                            return;
                        }
                    }
                }
                if (shuttingDown()) {
                    break;
                }

                // TBD: possibly wait for more updates to batch?

                Iterator<NodeImpl> iter = statusChangedNodes.iterator();
                Collection<NodeImpl> changedNodes =
                    new ArrayList<NodeImpl>();
                while (iter.hasNext()) {
                    changedNodes.add(iter.next());
                    iter.remove();
                }

                notifyClients(aliveNodes.values(), changedNodes);
            }
        }
    }

    /**
     * Notifies the {@code WatchdogClient} of each node in the
     * collection of {@code notifyNodes} of the node status changes in
     * {@code changedNodes}.
     *
     * @param notifyNodes nodes whose clients should be notified
     * @param changedNodes nodes with status changes
     */
    private void notifyClients(Collection<NodeImpl> notifyNodes,
                               Collection<NodeImpl> changedNodes)
    {
        // Assemble node information into arrays.
        int size = changedNodes.size();
        long[] ids = new long[size];
        String[] hosts = new String[size];
        Health[] health = new Health[size];
        long[] backups = new long[size];

        int i = 0;
        for (NodeImpl changedNode : changedNodes) {
            logger.log(Level.FINEST, "changed node:{0}", changedNode);
            ids[i] = changedNode.getId();
            hosts[i] = changedNode.getHostName();
            health[i] = changedNode.getHealth();
            backups[i] = changedNode.getBackupId();
            i++;
        }

        // Notify clients of status changes.
        for (NodeImpl notifyNode : notifyNodes) {
            WatchdogClient client = notifyNode.getWatchdogClient();
            try {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.log(
                        Level.FINEST,
                        "notifying client:{0} of status change",
                        notifyNode);
                }
                client.nodeStatusChanges(ids, hosts, health, backups);
            } catch (Exception e) {
                // TBD: Should it try harder to notify the client in
                // the non-restart case?  In the restart case, the
                // client may have failed too.
                if (!shuttingDown()) {
                    logger.logThrow(
                        Level.WARNING, e,
                        "Notifying {0} of node status changes failed:",
                        notifyNode.getId());
                }
            }
        }
    }

    // Management support

    private NodeInfo[] getAllNodeInfo() {
        final Set<NodeInfo> nodes = new HashSet<NodeInfo>();
        try {
            transactionScheduler.runTask(
                new AbstractKernelRunnable("GetNodeInfo") {
                    public void run() {
                        Iterator<Node> iter =
                            NodeImpl.getNodes(dataService);
                        while (iter.hasNext()) {
                            NodeImpl node = (NodeImpl) iter.next();
                            nodes.add(node.getNodeInfo());
                        }
                    }
                }, taskOwner);
        } catch (Exception e) {
            logger.logThrow(Level.INFO, e,
                            "Could not retrieve node information");
            return new NodeInfo[0];
        }
        return nodes.toArray(new NodeInfo[nodes.size()]);
    }
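    /*
     * A minimal monitoring sketch (hypothetical client code, shown only
     * for illustration; JMX.newMXBeanProxy and ObjectName are standard
     * javax.management APIs, and MXBEAN_NAME is the constant inherited
     * from NodesMXBean):
     *
     *   MBeanServerConnection conn = ...;  // obtained elsewhere
     *   NodesMXBean nodes = JMX.newMXBeanProxy(
     *       conn, new ObjectName(NodesMXBean.MXBEAN_NAME),
     *       NodesMXBean.class);
     *   for (NodeInfo info : nodes.getNodes()) {
     *       System.out.println(info);
     *   }
     */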
    /**
     * Private class for JMX information.
     */
    private static class NodeManager extends NotificationBroadcasterSupport
        implements NodesMXBean
    {
        /** The watchdog server we'll use to get the node info. */
        private final WatchdogServerImpl watchdog;

        /** The sequence number for JMX notifications. */
        private final AtomicLong seqNumber = new AtomicLong();

        /** Description of the notifications. */
        private static final MBeanNotificationInfo[] notificationInfo =
            new MBeanNotificationInfo[] {
                new MBeanNotificationInfo(
                    new String[] {NODE_STARTED_NOTIFICATION,
                                  NODE_FAILED_NOTIFICATION },
                    Notification.class.getName(),
                    "A node has started or failed")
            };

        /**
         * Creates an instance of the manager.
         *
         * @param watchdog the watchdog server
         */
        NodeManager(WatchdogServerImpl watchdog) {
            super(notificationInfo);
            this.watchdog = watchdog;
        }

        /** {@inheritDoc} */
        public NodeInfo[] getNodes() {
            return watchdog.getAllNodeInfo();
        }

        /*
         * Package private methods.
         */

        /**
         * Sends JMX notification that a node started.
         *
         * @param nodeId the identifier of the newly started node
         */
        void notifyNodeStarted(long nodeId) {
            sendNotification(
                new Notification(NODE_STARTED_NOTIFICATION,
                                 MXBEAN_NAME,
                                 seqNumber.incrementAndGet(),
                                 System.currentTimeMillis(),
                                 "Node started: " + nodeId));
        }

        /**
         * Sends JMX notification that a node failed.
         *
         * @param nodeId the identifier of the failed node
         */
        void notifyNodeFailed(long nodeId) {
            sendNotification(
                new Notification(NODE_FAILED_NOTIFICATION,
                                 MXBEAN_NAME,
                                 seqNumber.incrementAndGet(),
                                 System.currentTimeMillis(),
                                 "Node failed: " + nodeId));
        }
    }

    /**
     * Private runnable that is used to set up the initial binding of the
     * current global time in the data store.  This task also establishes
     * the server's global time offset value.
     */
    private final class TimestampBindingRunner implements KernelRunnable {

        /** {@inheritDoc} */
        public String getBaseTaskType() {
            return TimestampBindingRunner.class.getName();
        }

        /** {@inheritDoc} */
        public void run() throws Exception {
            ManagedSerializable<Long> time = null;
            ManagedSerializable<Long> drift = null;
            try {
                time = Objects.uncheckedCast(
                    dataService.getServiceBinding(APP_TIME_BINDING));
                drift = Objects.uncheckedCast(
                    dataService.getServiceBinding(APP_TIME_DRIFT_BINDING));
                // Add a small amount of time when recovering to keep the
                // global time from gradually drifting behind due to
                // system crashes.
                time.set(time.get() + drift.get() / 2);
            } catch (NameNotBoundException nnbe) {
                time = new ManagedSerializable<Long>(Long.valueOf(0));
                drift = new ManagedSerializable<Long>(timeflushInterval);
                dataService.setServiceBinding(APP_TIME_BINDING, time);
                dataService.setServiceBinding(
                    APP_TIME_DRIFT_BINDING, drift);
            }
            timeOffset = System.currentTimeMillis() - time.get();
        }
    }

    /**
     * Private runnable that periodically records the current global time
     * in the data store.
     */
    private final class TimeflushRunner implements KernelRunnable {

        /** The timeflush interval to record with the application time. */
        private final long drift;

        /** Creates an instance that records the given drift value. */
        public TimeflushRunner(long drift) {
            this.drift = drift;
        }

        /** {@inheritDoc} */
        public String getBaseTaskType() {
            return TimeflushRunner.class.getName();
        }

        /** {@inheritDoc} */
        public void run() throws Exception {
            ManagedSerializable<Long> time = Objects.uncheckedCast(
                dataService.getServiceBinding(APP_TIME_BINDING));
            // Renamed from "drift" to avoid shadowing the field above.
            ManagedSerializable<Long> driftBinding = Objects.uncheckedCast(
                dataService.getServiceBinding(APP_TIME_DRIFT_BINDING));
            time.set(currentAppTimeMillis());
            driftBinding.set(this.drift);
        }
    }
}