/* * Copyright 2007-2010 Sun Microsystems, Inc. * * This file is part of Project Darkstar Server. * * Project Darkstar Server is free software: you can redistribute it * and/or modify it under the terms of the GNU General Public License * version 2 as published by the Free Software Foundation and * distributed hereunder to you. * * Project Darkstar Server is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * * -- */ package com.sun.sgs.impl.service.watchdog; import com.sun.sgs.impl.kernel.ConfigManager; import com.sun.sgs.impl.kernel.KernelShutdownController; import com.sun.sgs.impl.kernel.StandardProperties; import com.sun.sgs.impl.sharedutil.LoggerWrapper; import static com.sun.sgs.impl.sharedutil.Objects.checkNull; import com.sun.sgs.impl.sharedutil.PropertiesWrapper; import com.sun.sgs.impl.util.AbstractKernelRunnable; import com.sun.sgs.impl.util.AbstractService; import com.sun.sgs.impl.util.Exporter; import com.sun.sgs.kernel.ComponentRegistry; import com.sun.sgs.kernel.NodeType; import com.sun.sgs.kernel.KernelRunnable; import com.sun.sgs.kernel.RecurringTaskHandle; import com.sun.sgs.management.NodeInfo; import com.sun.sgs.profile.ProfileCollector; import com.sun.sgs.service.Node; import com.sun.sgs.service.Node.Health; import com.sun.sgs.service.NodeListener; import com.sun.sgs.service.RecoveryListener; import com.sun.sgs.service.SimpleCompletionHandler; import com.sun.sgs.service.TransactionProxy; import com.sun.sgs.service.WatchdogService; import java.io.IOException; import java.net.InetAddress; import java.rmi.registry.LocateRegistry; import java.rmi.registry.Registry; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Properties; import java.util.Queue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentMap; import java.util.logging.Level; import java.util.logging.Logger; /* * TBD: Modify implementation to not accept calls before service is ready. * The server should not service incoming remote calls (registerNode, etc.) * until it receives the 'ready' invocation (or finishes construction * successfully). Some of the fields used in registerNode aren't initialized * until after the server is exported, so it can cause problems if the server * receives an incoming request before it has completed initializing. In * practice, this flaw is not a problem so long as the server is started first * before starting other nodes. */ import javax.management.JMException; /** * The {@link WatchdogService} implementation. <p> * * The {@link #WatchdogServiceImpl constructor} supports the following * properties: <p> * * <dl style="margin-left: 1em"> * * <dt> <i>Property:</i> <code><b> * com.sun.sgs.impl.service.watchdog.server.host * </b></code><br> * <i>Default:</i> the value of the {@code com.sun.sgs.server.host} * property, if present, or {@code localhost} if this node is starting the * server <br> <br> * * <dd style="padding-top: .5em"> * Specifies the host name for the watchdog server that this service * contacts. If the {@code * com.sun.sgs.node.type} property is not {@code appNode}, then this * property's default is used (since the watchdog server to contact will * be the one started on the local host). * * <dt> <i>Property:</i> <code><b> * com.sun.sgs.impl.service.watchdog.server.port * </b></code><br> * <i>Default:</i> {@code 44533} <br> * * <dd style="padding-top: .5em"> * Specifies the network port for the watchdog server that this service * contacts (and, optionally, starts). If the {@code * com.sun.sgs.node.type} property is not {@code singleNode}, then the * value must be greater than or equal to {@code 0} and no greater than * {@code 65535}, otherwise the value must be greater than {@code 0}, * and no greater than {@code 65535}.<p> * * <dt> <i>Property:</i> <code><b> * com.sun.sgs.impl.service.watchdog.client.host * </b></code><br> * <i>Default:</i> the local host name <br> * * <dd style="padding-top: .5em"> * Specifies the host name for the watchdog client used when * registering the node with the watchdog service. * * <dt> <i>Property:</i> <code><b> * com.sun.sgs.impl.service.watchdog.client.port * </b></code><br> * <i>Default:</i> {@code 0} (anonymous port) <br> * * <dd style="padding-top: .5em"> * Specifies the network port for this watchdog service for receiving * node status change notifications from the watchdog server. The value * must be greater than or equal to {@code 0} and no greater than * {@code 65535}.<p> * * <dt> <i>Property:</i> <code><b> * com.sun.sgs.impl.service.watchdog.timesync.interval * </b></code><br> * <i>Default:</i> {@code 300000} (five minutes) <br> * * <dd style="padding-top: .5em"> * Specifies the amount of time in milliseconds that this service will * wait between synchronizing its local time with the global time * of the {@code WatchdogServer}. The value must be greater than or * equal to {@code 1000} and no greater than {@link Long#MAX_VALUE}. * * <dt> <i>Property:</i> <code><b> * com.sun.management.jmxremote.port * </b></code><br> * <i>Default:</i> None <br> * * <dd style="padding-top: .5em"> * Enables remote JMX monitoring through the specified port. By default, * remote monitoring is not enabled. Not that this is a system property, * and must be set on the command line when starting the node.<p> * * </dl> <p> */ public final class WatchdogServiceImpl extends AbstractService implements WatchdogService { /** The name of this class. */ private static final String CLASSNAME = WatchdogServiceImpl.class.getName(); /** The package name. */ private static final String PKG_NAME = "com.sun.sgs.impl.service.watchdog"; /** The logger for this class. */ private static final LoggerWrapper logger = new LoggerWrapper( Logger.getLogger(PKG_NAME + ".service")); /** The name of the version key. */ private static final String VERSION_KEY = PKG_NAME + ".service.version"; /** The major version. */ private static final int MAJOR_VERSION = 1; /** The minor version. */ private static final int MINOR_VERSION = 0; /** The prefix for server properties. */ private static final String SERVER_PROPERTY_PREFIX = PKG_NAME + ".server"; /** The prefix for client properties. */ private static final String CLIENT_PROPERTY_PREFIX = PKG_NAME + ".client"; /** The property name for the watchdog server host. */ private static final String HOST_PROPERTY = SERVER_PROPERTY_PREFIX + ".host"; /** The property name for the watchdog server port. */ private static final String SERVER_PORT_PROPERTY = WatchdogServerImpl.PORT_PROPERTY; /** The default value of the server port. */ private static final int DEFAULT_SERVER_PORT = WatchdogServerImpl.DEFAULT_PORT; /** The property name for the watchdog client host. */ private static final String CLIENT_HOST_PROPERTY = CLIENT_PROPERTY_PREFIX + ".host"; /** The property name for the watchdog client port. */ private static final String CLIENT_PORT_PROPERTY = CLIENT_PROPERTY_PREFIX + ".port"; /** The default value of the client port. */ private static final int DEFAULT_CLIENT_PORT = 0; /** The property name for the timesync interval. */ private static final String TIMESYNC_INTERVAL_PROPERTY = PKG_NAME + ".timesync.interval"; /** The default time in milliseconds to wait between timesync. */ private static final long DEFAULT_TIMESYNC_INTERVAL = 300000L; /** The minimum renew interval. */ private static final long MIN_RENEW_INTERVAL = 25; /** The exporter for this server or {@code null}. */ private Exporter<WatchdogClient> exporter = null; /** The watchdog server impl, or {@code null}. */ private WatchdogServerImpl serverImpl = null; /** The watchdog server proxy, or {@code null}. */ final WatchdogServer serverProxy; /** The watchdog client impl. */ private final WatchdogClientImpl clientImpl; /** The watchdog client proxy. */ final WatchdogClient clientProxy; /** The name of the local host. */ final String localHost; /** The controller which enables node shutdown */ private final KernelShutdownController shutdownController; /** The thread that renews the node with the watchdog server. */ final Thread renewThread = new RenewThread(); /** The local nodeId. */ final long localNodeId; /** The interval for renewals with the watchdog server. */ private final long renewInterval; /** The set of node listeners for all nodes. */ private final ConcurrentMap<NodeListener, NodeListener> nodeListeners = new ConcurrentHashMap<NodeListener, NodeListener>(); /** The set of recovery listeners for this node. */ private final ConcurrentMap<RecoveryListener, RecoveryListener> recoveryListeners = new ConcurrentHashMap<RecoveryListener, RecoveryListener>(); /** The queues of SimpleCompletionHandlers, keyed by node being * recovered. */ private final ConcurrentMap<Node, Queue<SimpleCompletionHandler>> recoveryQueues = new ConcurrentHashMap<Node, Queue<SimpleCompletionHandler>>(); /** * The set of health reports for this node by component. Should only contain * non-GREEN health reports. Accesses to this map and the {@code health} * field must be synchronized. */ private final Map<String, Health> healthReports = new HashMap<String, Health>(); /** * Overall health of this node, initially, the field is {@code * GREEN}. The health of this node is the most severe condition reported, * or {@code Health.GREEN} if no reports exits. Accesses to this field * and the {@code healthReports} map must be synchronized. */ private Health health = Health.GREEN; /** Our profiled data */ private final WatchdogServiceStats serviceStats; /** The interval between synchronizations of global time with the server. */ private final long timesyncInterval; /** The local offset to use when reporting the application time. */ private volatile long timeOffset; /** a handle to the periodic time sync task */ private RecurringTaskHandle timesyncTaskHandle = null; /** * Constructs an instance of this class with the specified properties. * See the {@link WatchdogServiceImpl class documentation} for a list * of supported properties. The Watchdog service is given the ability to * shutdown a node with the {@link KernelShutdownController}. * * @param properties service (and server) properties * @param systemRegistry system registry * @param txnProxy transaction proxy * @param ctrl shutdown controller * @throws Exception if a problem occurs constructing the service/server */ public WatchdogServiceImpl(Properties properties, ComponentRegistry systemRegistry, TransactionProxy txnProxy, KernelShutdownController ctrl) throws Exception { super(properties, systemRegistry, txnProxy, logger); logger.log(Level.CONFIG, "Creating WatchdogServiceImpl"); PropertiesWrapper wrappedProps = new PropertiesWrapper(properties); // Setup the KernelShutdownController object if (ctrl == null) { throw new NullPointerException("null shutdown controller"); } shutdownController = ctrl; try { localHost = InetAddress.getLocalHost().getHostName(); NodeType nodeType = wrappedProps.getEnumProperty(StandardProperties.NODE_TYPE, NodeType.class, NodeType.singleNode); boolean startServer = nodeType != NodeType.appNode; boolean isFullStack = nodeType != NodeType.coreServerNode; int clientPort = wrappedProps.getIntProperty( CLIENT_PORT_PROPERTY, DEFAULT_CLIENT_PORT, 0, 65535); String clientHost = wrappedProps.getProperty( CLIENT_HOST_PROPERTY, localHost); /* * Check service version. */ transactionScheduler.runTask( new AbstractKernelRunnable("CheckServiceVersion") { public void run() { checkServiceVersion( VERSION_KEY, MAJOR_VERSION, MINOR_VERSION); } }, taskOwner); clientImpl = new WatchdogClientImpl(); exporter = new Exporter<WatchdogClient>(WatchdogClient.class); exporter.export(clientImpl, clientPort); clientProxy = exporter.getProxy(); String host; int serverPort; if (startServer) { serverImpl = new WatchdogServerImpl( properties, systemRegistry, txnProxy, clientHost, clientProxy, isFullStack); host = localHost; serverPort = serverImpl.getPort(); } else { host = wrappedProps.getProperty( HOST_PROPERTY, wrappedProps.getProperty( StandardProperties.SERVER_HOST)); if (host == null) { throw new IllegalArgumentException( "A server host must be specified"); } serverPort = wrappedProps.getIntProperty( SERVER_PORT_PROPERTY, DEFAULT_SERVER_PORT, 1, 65535); } Registry rmiRegistry = LocateRegistry.getRegistry(host, serverPort); serverProxy = (WatchdogServer) rmiRegistry.lookup(WatchdogServerImpl.WATCHDOG_SERVER_NAME); int jmxPort = wrappedProps.getIntProperty( StandardProperties.SYSTEM_JMX_REMOTE_PORT, -1); localNodeId = dataService.getLocalNodeId(); if (startServer) { renewInterval = serverImpl.renewInterval; } else { renewInterval = serverProxy.registerNode( localNodeId, clientHost, clientProxy, jmxPort); } renewThread.start(); timesyncInterval = wrappedProps.getLongProperty( TIMESYNC_INTERVAL_PROPERTY, DEFAULT_TIMESYNC_INTERVAL, 1000, Long.MAX_VALUE); // create our profiling info and register our MBean ProfileCollector collector = systemRegistry.getComponent(ProfileCollector.class); serviceStats = new WatchdogServiceStats(collector, this); try { collector.registerMBean(serviceStats, WatchdogServiceStats.MXBEAN_NAME); } catch (JMException e) { logger.logThrow(Level.CONFIG, e, "Could not register MBean"); } // set our data in the ConfigMXBean ConfigManager config = (ConfigManager) collector.getRegisteredMBean(ConfigManager.MXBEAN_NAME); if (config == null) { logger.log(Level.CONFIG, "Could not find ConfigMXBean"); } else { config.setJmxPort(jmxPort); } if (logger.isLoggable(Level.CONFIG)) { logger.log(Level.CONFIG, "node registered, host:{0}, localNodeId:{1}", clientHost, localNodeId); } logger.log(Level.CONFIG, "Created WatchdogServiceImpl with properties:" + "\n " + CLIENT_HOST_PROPERTY + "=" + clientHost + "\n " + CLIENT_PORT_PROPERTY + "=" + clientPort + "\n " + HOST_PROPERTY + "=" + host + "\n " + SERVER_PORT_PROPERTY + "=" + serverPort + "\n " + TIMESYNC_INTERVAL_PROPERTY + "=" + timesyncInterval); } catch (Exception e) { logger.logThrow( Level.CONFIG, e, "Failed to create WatchdogServiceImpl"); doShutdown(); throw e; } } /* -- Implement AbstractService -- */ /** {@inheritDoc} */ protected void handleServiceVersionMismatch( Version oldVersion, Version currentVersion) { throw new IllegalStateException( "unable to convert version:" + oldVersion + " to current version:" + currentVersion); } /** * {@inheritDoc} * * A health update will be sent to listeners. */ protected void doReady() throws Exception { // TBD: the client shouldn't accept incoming calls until this // service is ready which would give all RecoveryListeners a // chance to register. if (serverImpl != null) { serverImpl.ready(); this.timeOffset = serverImpl.getTimeOffset(); } else { TimeSyncRunner timeSyncRunner = new TimeSyncRunner(); timeSyncRunner.run(); timesyncTaskHandle = taskScheduler.scheduleRecurringTask( timeSyncRunner, taskOwner, System.currentTimeMillis() + timesyncInterval, timesyncInterval); timesyncTaskHandle.start(); } // Report this component is healthy and ready for work reportHealth(localNodeId, Health.GREEN, CLASSNAME); } /** {@inheritDoc} */ protected void doShutdown() { synchronized (renewThread) { renewThread.notifyAll(); } if (timesyncTaskHandle != null) { timesyncTaskHandle.cancel(); } try { // The following 'join' call relies on an undocumented feature: // 'join' can also be invoked on a thread that isn't started. // If the server can't be exported, the renewThread won't be // started when 'doShutdown' is invoked. renewThread.join(); } catch (InterruptedException e) { } if (exporter != null) { exporter.unexport(); } if (serverImpl != null) { serverImpl.shutdown(); } } /* -- Implement WatchdogService -- */ /** {@inheritDoc} */ public Health getLocalNodeHealth() { checkState(); serviceStats.getLocalNodeHealthOp.report(); return getNodeHealthTransactional(); } private Health getNodeHealthTransactional() { if (!isLocalAlive()) { return Health.RED; } else { Node node = NodeImpl.getNode(dataService, localNodeId); if (node == null || !node.isAlive()) { reportFailure(localNodeId, CLASSNAME); return Health.RED; } else { return node.getHealth(); } } } /** {@inheritDoc} */ public boolean isLocalNodeAlive() { checkState(); serviceStats.isLocalNodeAliveOp.report(); return getNodeHealthTransactional().isAlive(); } /** {@inheritDoc} */ public synchronized Health getLocalNodeHealthNonTransactional() { checkState(); serviceStats.getLocalNodeHealthNonTransOp.report(); return health; } /** {@inheritDoc} */ public boolean isLocalNodeAliveNonTransactional() { checkState(); serviceStats.isLocalNodeAliveNonTransOp.report(); return isLocalAlive(); } /** {@inheritDoc} */ public Iterator<Node> getNodes() { checkState(); serviceStats.getNodesOp.report(); txnProxy.getCurrentTransaction(); return NodeImpl.getNodes(dataService); } /** {@inheritDoc} */ public Node getNode(long nodeId) { checkState(); if (nodeId < 0) { throw new IllegalArgumentException("invalid nodeId: " + nodeId); } serviceStats.getNodeOp.report(); return NodeImpl.getNode(dataService, nodeId); } /** {@inheritDoc} */ public void addNodeListener(NodeListener listener) { checkState(); checkNonTransactionalContext(); checkNull("listener", listener); serviceStats.addNodeListenerOp.report(); nodeListeners.putIfAbsent(listener, listener); } /** {@inheritDoc} */ public Node getBackup(long nodeId) { checkState(); serviceStats.getBackupOp.report(); NodeImpl node = (NodeImpl) getNode(nodeId); return (node != null && node.hasBackup()) ? getNode(node.getBackupId()) : null; } /** {@inheritDoc} */ public void addRecoveryListener(RecoveryListener listener) { checkState(); checkNonTransactionalContext(); checkNull("listener", listener); serviceStats.addRecoveryListenerOp.report(); recoveryListeners.putIfAbsent(listener, listener); } /** {@inheritDoc} */ public void reportFailure(long nodeId, String component) { reportHealth(nodeId, Health.RED, component); } /** {@inheritDoc} */ public synchronized void reportHealth(long nodeId, Health nodeHealth, String component) { checkNull("nodeHealth", nodeHealth); checkNull("component", component); checkNonTransactionalContext(); if (shuttingDown() || !isLocalAlive()) { return; } boolean isLocal = (nodeId == localNodeId); if (logger.isLoggable(Level.FINER) || !nodeHealth.isAlive()) { logger.log((nodeHealth.isAlive() ? Level.WARNING : Level.FINER), "{1} reported {2} health in {3} node with id: {0}", nodeId, component, nodeHealth, isLocal ? "local" : "remote"); } // If the report is for this node, determine the actual (overall) health // which is the more severe health reported to date if (isLocal) { // If reported health is GREEN then just remove the entry (since // empty reports == GREEN) otherwise add this report, possibly // replacing a previous report from the component if (nodeHealth == Health.GREEN) { healthReports.remove(component); } else { healthReports.put(component, nodeHealth); } // If the report is an improvement over the current // health, see if it improves the overall health if (health.worseThan(nodeHealth)) { // Look at all the reports for this node, recording the most // severe for (Map.Entry<String, Health> report : healthReports.entrySet()) { if (report.getValue().worseThan(nodeHealth)) { nodeHealth = report.getValue(); component = report.getKey(); } } } } // Try to report the health to the watchdog server. If we cannot // contact the Watchdog server while reporting, then set a local // failure. int retries = maxIoAttempts; while (retries-- > 0) { try { serverProxy.setNodeHealth(nodeId, isLocal, nodeHealth, component, maxIoAttempts); break; } catch (IOException ioe) { if (retries == 0) { logger.logThrow( Level.SEVERE, ioe, "node:{0} cannot report failure of node:{1} to " + "Watchdog server", localNodeId, nodeId); setFailedThenNotify(); return; } } } if (isLocal) { setHealthThenNotify(nodeHealth, component); } } /** * Sets the local health of this node to {@code RED} and notifies * appropriate registered node listeners of this node's failure. This method * should only be called by this service when this node is no longer * considered alive by the Watchdog server or the server can no longer * be contacted. If the Watchdog server needs to be made aware of this * node's failure, use {@code reportFailure()}. <p> * * If this node's local health status was already set to {@code RED}, * then this method does nothing. */ private void setFailedThenNotify() { setHealthThenNotify(Health.RED, CLASSNAME); } /** * Sets the local health of this node and notifies appropriate * registered node listeners of the possible change. If this node's local * health status was already set to {@code RED}, then this method does * nothing. It is assumed that the Watchdog server has been informed of the * possible health change. If the Watchdog server needs to be notified * use {@code reportHealth()}, which will eventually invoke this method. * * @param newHealth the new health for the local node * @param component the component reporting the health */ private synchronized void setHealthThenNotify(Health newHealth, String component) { if (logger.isLoggable(Level.FINER)) { logger.log(Level.FINER, "Set local health to {0}, reported by {1}, " + "previous health was: {2}", newHealth, component, health); } if (!health.isAlive()) { return; } health = newHealth; notifyNodeListeners(new NodeImpl(localNodeId, localHost, health)); if (!health.isAlive()) { logger.log(Level.SEVERE, "Node:{0} forced to shutdown due to service failure", localNodeId); shutdownController.shutdownNode(this); } } /** * {@inheritDoc} */ public long currentAppTimeMillis() { return System.currentTimeMillis() - timeOffset; } /** * {@inheritDoc} */ public long getAppTimeMillis(long systemTimeMillis) { if (systemTimeMillis < timeOffset) { throw new IllegalArgumentException( "System time : " + systemTimeMillis + " is before the start time of this application."); } return systemTimeMillis - timeOffset; } /** * {@inheritDoc} */ public long getSystemTimeMillis(long appTimeMillis) { return appTimeMillis + timeOffset; } /** * This thread continuously renews this node with the watchdog server * before the renew interval (returned when registering the node) expires. */ private final class RenewThread extends Thread { /** Constructs an instance of this class as a daemon thread. */ RenewThread() { super(CLASSNAME + "$RenewThread"); setDaemon(true); } /** * Registers the node with the watchdog server, and sends * periodic renew requests. This thread terminates if the * node is no longer considered alive or if the watchdog * service is shutdown. */ public void run() { long startRenewInterval = renewInterval / 2; long nextRenewInterval = startRenewInterval; long lastRenewTime = System.currentTimeMillis(); while (isLocalAlive()) { synchronized (this) { if (shuttingDown()) { return; } try { wait(nextRenewInterval); } catch (InterruptedException e) { return; } } if (shuttingDown()) { return; } boolean renewed = false; try { if (!serverProxy.renewNode(localNodeId)) { // server has already marked node as failed, so we can // go directly to removing this node setFailedThenNotify(); return; } renewed = true; nextRenewInterval = startRenewInterval; } catch (IOException e) { /* * Adjust renew interval in order to renew with * server again before the renew interval expires. */ logger.logThrow( Level.INFO, e, "renewing with watchdog server throws"); nextRenewInterval = Math.max(nextRenewInterval / 2, MIN_RENEW_INTERVAL); } long now = System.currentTimeMillis(); if (now - lastRenewTime > renewInterval) { // server has already marked node as failed, so we can // go directly to removing this node setFailedThenNotify(); return; } if (renewed) { lastRenewTime = now; } } } } /* -- other methods -- */ /** * Returns the server. This method is used for testing. * * @return the server */ public WatchdogServerImpl getServer() { return serverImpl; } /** * Throws {@code IllegalStateException} if this service is shutting down. */ private void checkState() { if (shuttingDown()) { throw new IllegalStateException("service shutting down"); } } /** * Returns {@code true} if this node is considered alive. */ private synchronized boolean isLocalAlive() { return health.isAlive(); } /** * Notifies the appropriate registered node listeners of the * status change of the specified {@code node}. * * @param node a node * @throws IllegalStateException if this service is shutting down */ private void notifyNodeListeners(final Node node) { for (NodeListener listener : nodeListeners.keySet()) { final NodeListener nodeListener = listener; taskScheduler.scheduleTask( new AbstractKernelRunnable("NotifyNodeListeners") { public void run() { if (!shuttingDown() && isLocalNodeAliveNonTransactional()) { nodeListener.nodeHealthUpdate(node); } } }, taskOwner); } } /** * Notifies the registered recovery listeners that the specified * {@code node} needs to be recovered. * * @param node a node */ private void notifyRecoveryListeners(final Node node) { if (logger.isLoggable(Level.INFO)) { logger.log(Level.INFO, "Node:{0} recovering for node:{1}", localNodeId, node.getId()); } Queue<SimpleCompletionHandler> handlers = new ConcurrentLinkedQueue<SimpleCompletionHandler>(); if (recoveryQueues.putIfAbsent(node, handlers) != null) { // recovery for node already being handled return; } for (RecoveryListener listener : recoveryListeners.keySet()) { final RecoveryListener recoveryListener = listener; final SimpleCompletionHandler handler = new RecoveryCompletionHandler(node, listener); handlers.add(handler); taskScheduler.scheduleTask( new AbstractKernelRunnable("NotifyRecoveryListeners") { public void run() { try { if (!shuttingDown() && isLocalNodeAliveNonTransactional()) { recoveryListener.recover(node, handler); } } catch (Exception e) { logger.logThrow( Level.WARNING, e, "Notifying recovery listener on node:{0} " + "with node:{1}, handler:{2} throws", localNodeId, node, handler); } } }, taskOwner); } } // Management methods /** * Retrieves information about the current node. * @return information about the current node */ NodeInfo getNodeStatusInfo() { GetNodeStatusTask task = new GetNodeStatusTask(); try { transactionScheduler.runTask(task, taskOwner); } catch (Exception e) { logger.logThrow(Level.INFO, e, "Could not retrive node info"); } return task.info; } private final class GetNodeStatusTask extends AbstractKernelRunnable { NodeInfo info; GetNodeStatusTask() { super(null); } public void run() { NodeImpl node = NodeImpl.getNode(dataService, localNodeId); info = node.getNodeInfo(); } } /** * Implements the WatchdogClient that receives callbacks from the * WatchdogServer. */ private final class WatchdogClientImpl implements WatchdogClient { /** {@inheritDoc} */ @Override public void nodeStatusChanges(long[] ids, String[] hosts, Health[] health, long[] backups) { if (ids.length != hosts.length || hosts.length != health.length || health.length != backups.length) { throw new IllegalArgumentException("array lengths don't match"); } for (int i = 0; i < ids.length; i++) { if (ids[i] == localNodeId && health[i].isAlive()) { /* Don't notify the local node that it is alive. */ continue; } Node node = new NodeImpl(ids[i], hosts[i], health[i], backups[i]); notifyNodeListeners(node); if (!health[i].isAlive() && backups[i] == localNodeId) { notifyRecoveryListeners(node); } } } /** * {@inheritDoc} */ public void reportFailure(String className) { setFailedThenNotify(); } } /** * The {@code SimpleCompletionHandler} implementation for recovery. * When {@code completed} is invoked, the handler instance is removed * from the recovery completion handler queue for the associated node. * If a given handler is the last one to be removed from a node's * queue, then recovery is complete for that node, and the data store * is updated to clean up recovery information for that node. */ private final class RecoveryCompletionHandler implements SimpleCompletionHandler { /** The failed node. */ private final Node node; /** The recovery listener for this handler (currently unused). */ private final RecoveryListener listener; /** Indicates whether recovery is done. */ private boolean isDone = false; /** * Constructs an instance with the specified {@code node} and * recovery {@code listener}. */ RecoveryCompletionHandler(Node node, RecoveryListener listener) { this.node = node; this.listener = listener; } /** {@inheritDoc} */ public void completed() { synchronized (this) { if (isDone) { return; } isDone = true; } Queue<SimpleCompletionHandler> handlers = recoveryQueues.get(node); assert handlers != null; handlers.remove(this); if (handlers.isEmpty()) { // recovery for the node is complete, so remove node // from table of recovery queues. if (recoveryQueues.remove(node) != null) { try { if (isLocalNodeAliveNonTransactional()) { serverProxy.recoveredNode( node.getId(), localNodeId); } } catch (Exception e) { logger.logThrow( Level.WARNING, e, "Problem invoking WatchdogServer.recoveredNode " + "for node:{0} backup:{1}", node, localNodeId); } } } } } /** * Private runnable that is used to periodically synchronize the local * time offset with that maintained in the global {@code WatchdogServer}. */ private final class TimeSyncRunner implements KernelRunnable { /** {@inheritDoc} */ public String getBaseTaskType() { return TimeSyncRunner.class.getName(); } /** {@inheritDoc} */ public void run() throws Exception { long before = System.currentTimeMillis(); long appTime = serverProxy.currentAppTimeMillis(); long after = System.currentTimeMillis(); // calculate local offset value based on round trip time timeOffset = after - (appTime + (after - before) / 2L); } } }