/*
 *
 *  Copyright 2016 Netflix, Inc.
 *
 *     Licensed under the Apache License, Version 2.0 (the "License");
 *     you may not use this file except in compliance with the License.
 *     You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 *     Unless required by applicable law or agreed to in writing, software
 *     distributed under the License is distributed on an "AS IS" BASIS,
 *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *     See the License for the specific language governing permissions and
 *     limitations under the License.
 *
 */
package com.netflix.genie.web.tasks.leader;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.type.TypeFactory;
import com.google.common.base.Splitter;
import com.netflix.genie.common.dto.Job;
import com.netflix.genie.common.dto.JobExecution;
import com.netflix.genie.common.dto.JobStatus;
import com.netflix.genie.common.exceptions.GenieException;
import com.netflix.genie.core.services.JobPersistenceService;
import com.netflix.genie.core.services.JobSearchService;
import com.netflix.genie.web.properties.ClusterCheckerProperties;
import com.netflix.genie.web.tasks.GenieTaskScheduleType;
import com.netflix.spectator.api.Counter;
import com.netflix.spectator.api.Registry;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.actuate.autoconfigure.ManagementServerProperties;
import org.springframework.boot.actuate.health.Status;
import org.springframework.stereotype.Component;
import org.springframework.web.client.HttpStatusCodeException;
import org.springframework.web.client.RestTemplate;

import javax.validation.constraints.NotNull;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * A task which checks to see if this leader node can communicate with all other nodes in the cluster. If it can't,
 * it will keep track of which nodes it can't communicate with and perform various actions based on the number of
 * times it can't communicate with that node. Currently (as of 3.0) this task will mark jobs as lost if they miss a
 * certain number of checks.
 *
 * @author tgianos
 * @since 3.0.0
 */
@Component
@Slf4j
public class ClusterCheckerTask extends LeadershipTask {
    private static final String PROPERTY_STATUS = "status";

    private final String hostName;
    private final ClusterCheckerProperties properties;
    private final JobSearchService jobSearchService;
    private final JobPersistenceService jobPersistenceService;
    private final RestTemplate restTemplate;
    private final String scheme;
    private final String healthEndpoint;
    private final ObjectMapper mapper = new ObjectMapper();
    private final List<String> healthIndicatorsToIgnore;

    private final Map<String, Integer> errorCounts = new HashMap<>();

    // TODO: Add metrics
    private final Counter lostJobsCounter;
    private final Counter unableToUpdateJobCounter;

    /**
     * Constructor.
     *
     * @param hostName                   The host name of this node
     * @param properties                 The properties to use to configure the task
     * @param jobSearchService           The job search service to use
     * @param jobPersistenceService      The job persistence service to use
     * @param restTemplate               The rest template for http calls
     * @param managementServerProperties The properties where Spring actuator is running
     * @param registry                   The spectator registry for getting metrics
     */
    @Autowired
    public ClusterCheckerTask(
        @NotNull final String hostName,
        @NotNull final ClusterCheckerProperties properties,
        @NotNull final JobSearchService jobSearchService,
        @NotNull final JobPersistenceService jobPersistenceService,
        @Qualifier("genieRestTemplate") @NotNull final RestTemplate restTemplate,
        @NotNull final ManagementServerProperties managementServerProperties,
        @NotNull final Registry registry
    ) {
        this.hostName = hostName;
        this.properties = properties;
        this.jobSearchService = jobSearchService;
        this.jobPersistenceService = jobPersistenceService;
        this.restTemplate = restTemplate;
        this.scheme = this.properties.getScheme() + "://";
        this.healthEndpoint =
            ":" + this.properties.getPort() + managementServerProperties.getContextPath() + "/health";
        this.healthIndicatorsToIgnore = Splitter.on(",").omitEmptyStrings()
            .trimResults().splitToList(properties.getHealthIndicatorsToIgnore());

        // Keep track of the number of nodes currently unreachable from the leader
        registry.mapSize("genie.tasks.clusterChecker.errorCounts.gauge", this.errorCounts);
        this.lostJobsCounter = registry.counter("genie.tasks.clusterChecker.lostJobs.rate");
        this.unableToUpdateJobCounter = registry.counter("genie.tasks.clusterChecker.unableToUpdateJob.rate");
    }

    /**
     * Ping the health check endpoint of all other nodes which have running jobs. Track results.
     */
    @Override
    public void run() {
        log.info("Checking for cluster node health...");
        this.jobSearchService.getAllHostsWithActiveJobs()
            .stream()
            .filter(host -> !this.hostName.equals(host))
            .forEach(this::validateHostAndUpdateErrorCount);

        // Remove a host from the error map only once it has crossed the lost threshold and its jobs
        // were successfully marked as lost; otherwise keep it around for the next check.
        this.errorCounts.entrySet().removeIf(entry -> {
            final String host = entry.getKey();
            boolean result = true;
            if (entry.getValue() >= properties.getLostThreshold()) {
                try {
                    updateJobsToFailedOnHost(host);
                } catch (Exception e) {
                    log.error("Unable to update jobs on host {} due to exception", host, e);
                    unableToUpdateJobCounter.increment();
                    result = false;
                }
            } else {
                result = false;
            }
            return result;
        });
        log.info("Finished checking for cluster node health.");
    }

    private void updateJobsToFailedOnHost(final String host) {
        // Mark every job still active on the unreachable host as failed with the lost exit code
        final Set<Job> jobs = jobSearchService.getAllActiveJobsOnHost(host);
        jobs.forEach(
            job -> {
                try {
                    jobPersistenceService.setJobCompletionInformation(
                        job.getId().orElseThrow(IllegalArgumentException::new),
                        JobExecution.LOST_EXIT_CODE,
                        JobStatus.FAILED,
                        "Genie leader can't reach node running job. Assuming node and job are lost.",
                        null,
                        null
                    );
                    lostJobsCounter.increment();
                } catch (final GenieException ge) {
                    log.error("Unable to update job {} to failed due to exception", job.getId(), ge);
                    unableToUpdateJobCounter.increment();
                }
            }
        );
    }

    private void validateHostAndUpdateErrorCount(final String host) {
        //
        // If the node is healthy, remove its entry from errorCounts.
        // If the node is not healthy, increment its entry in errorCounts.
        //
        if (isNodeHealthy(host)) {
            if (errorCounts.containsKey(host)) {
                errorCounts.remove(host);
            }
        } else {
            if (this.errorCounts.containsKey(host)) {
                this.errorCounts.put(host, this.errorCounts.get(host) + 1);
            } else {
                this.errorCounts.put(host, 1);
            }
        }
    }

    private boolean isNodeHealthy(final String host) {
        //
        // A node is valid and healthy if all health indicators, excluding the ones mentioned in
        // healthIndicatorsToIgnore, are UP.
        //
        boolean result = true;
        try {
            restTemplate.getForObject(this.scheme + host + this.healthEndpoint, String.class);
        } catch (final HttpStatusCodeException e) {
            // A non-2xx response still carries the health details; parse them and ignore configured indicators
            log.error("Failed validating host {}", host, e);
            try {
                final Map<String, Object> responseMap = mapper.readValue(
                    e.getResponseBodyAsByteArray(),
                    TypeFactory.defaultInstance().constructMapType(Map.class, String.class, Object.class)
                );
                for (Map.Entry<String, Object> responseEntry : responseMap.entrySet()) {
                    if (responseEntry.getValue() instanceof Map
                        && !healthIndicatorsToIgnore.contains(responseEntry.getKey())
                        && !Status.UP.getCode().equals(((Map) responseEntry.getValue()).get(PROPERTY_STATUS))) {
                        result = false;
                        break;
                    }
                }
            } catch (Exception ex) {
                log.error("Failed reading the error response when validating host {}", host, ex);
                result = false;
            }
        } catch (final Exception e) {
            log.error("Unable to reach {}", host, e);
            result = false;
        }
        return result;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public GenieTaskScheduleType getScheduleType() {
        return GenieTaskScheduleType.FIXED_RATE;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public long getFixedRate() {
        return this.properties.getRate();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void cleanup() {
        this.errorCounts.clear();
    }

    /**
     * Get the current size of error counts. Mainly used for testing.
     *
     * @return Number of nodes currently in an error state
     */
    int getErrorCountsSize() {
        return this.errorCounts.size();
    }
}