/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.yarn;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.AbstractMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import org.apache.hadoop.yarn.util.Records;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Function;
import com.google.common.base.Optional;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Queues;
import com.google.common.eventbus.EventBus;
import com.google.common.eventbus.Subscribe;
import com.google.common.io.Closer;
import com.google.common.util.concurrent.AbstractIdleService;
import com.typesafe.config.Config;
import gobblin.configuration.ConfigurationKeys;
import gobblin.cluster.GobblinClusterConfigurationKeys;
import gobblin.cluster.GobblinClusterMetricTagNames;
import gobblin.cluster.GobblinClusterUtils;
import gobblin.cluster.HelixUtils;
import gobblin.metrics.GobblinMetrics;
import gobblin.metrics.Tag;
import gobblin.metrics.event.EventSubmitter;
import gobblin.util.ConfigUtils;
import gobblin.util.ExecutorsUtils;
import gobblin.util.JvmUtils;
import gobblin.cluster.event.ClusterManagerShutdownRequest;
import gobblin.yarn.event.ContainerShutdownRequest;
import gobblin.yarn.event.NewContainerRequest;
/**
* This class is responsible for all Yarn-related tasks, including ApplicationMaster registration,
* ApplicationMaster un-registration, Yarn container management, etc.
*
* @author Yinan Li
*/
public class YarnService extends AbstractIdleService {
// Logger shared by this service and its nested callback handlers
private static final Logger LOGGER = LoggerFactory.getLogger(YarnService.class);
// Splits comma-separated lists, trimming each entry and dropping empty ones
private static final Splitter SPLITTER = Splitter.on(',').omitEmptyStrings().trimResults();
// Name and ID of the Yarn application this service belongs to
private final String applicationName;
private final String applicationId;
// Application configuration the settings below are read from
private final Config config;
// Event bus this service registers itself with (in startUp) and posts container/shutdown requests to
private final EventBus eventBus;
private final Configuration yarnConfiguration;
private final FileSystem fs;
// Both are present only if metrics are enabled in the configuration (see the constructor)
private final Optional<GobblinMetrics> gobblinMetrics;
private final Optional<EventSubmitter> eventSubmitter;
// Asynchronous clients for talking to the ResourceManager and the NodeManagers, respectively
private final AMRMClientAsync<AMRMClient.ContainerRequest> amrmClientAsync;
private final NMClientAsync nmClientAsync;
// Executor on which container launch contexts are built and container starts are issued
private final ExecutorService containerLaunchExecutor;
// Number of containers requested in startUp
private final int initialContainers;
// Requested per-container resources; capped by maxResourceCapacity when a container is requested
private final int requestedContainerMemoryMbs;
private final int requestedContainerCores;
// Whether a replacement container should preferably be requested on the node of the container it replaces
private final boolean containerHostAffinityEnabled;
// Maximum number of retries per Helix instance; only enforced if greater than zero
private final int helixInstanceMaxRetries;
// Optional extra JVM arguments added to the container launch command
private final Optional<String> containerJvmArgs;
// Maximum resource capability reported in the ApplicationMaster registration response; absent until startUp sets it
private volatile Optional<Resource> maxResourceCapacity = Optional.absent();
// Security tokens for accessing HDFS
private final ByteBuffer tokens;
// Closes the registered Yarn clients when the service shuts down
private final Closer closer = Closer.create();
// Monitor that shutDown waits on and that onContainerStopped notifies once containerMap is empty
private final Object allContainersStopped = new Object();
// A map from container IDs to pairs of Container instances and Helix participant IDs of the containers
private final ConcurrentMap<ContainerId, Map.Entry<Container, String>> containerMap = Maps.newConcurrentMap();
// A generator for an integer ID of a Helix instance (participant)
private final AtomicInteger helixInstanceIdGenerator = new AtomicInteger(0);
// A map from Helix instance names to the number of times the instances have been retried
private final ConcurrentMap<String, AtomicInteger> helixInstanceRetryCount = Maps.newConcurrentMap();
// A queue of unused Helix instance names. An unused Helix instance name gets put
// into the queue if the container running the instance completes. Unused Helix
// instance names get picked up when replacement containers get allocated.
private final ConcurrentLinkedQueue<String> unusedHelixInstanceNames = Queues.newConcurrentLinkedQueue();
// Set to true at the beginning of shutDown; completed containers are not replaced once it is set
private volatile boolean shutdownInProgress = false;
public YarnService(Config config, String applicationName, String applicationId, YarnConfiguration yarnConfiguration,
FileSystem fs, EventBus eventBus) throws Exception {
this.applicationName = applicationName;
this.applicationId = applicationId;
this.config = config;
this.eventBus = eventBus;
this.gobblinMetrics = config.getBoolean(ConfigurationKeys.METRICS_ENABLED_KEY) ?
Optional.of(buildGobblinMetrics()) : Optional.<GobblinMetrics>absent();
this.eventSubmitter = config.getBoolean(ConfigurationKeys.METRICS_ENABLED_KEY) ?
Optional.of(buildEventSubmitter()) : Optional.<EventSubmitter>absent();
this.yarnConfiguration = yarnConfiguration;
this.fs = fs;
this.amrmClientAsync = closer.register(
AMRMClientAsync.createAMRMClientAsync(1000, new AMRMClientCallbackHandler()));
this.amrmClientAsync.init(this.yarnConfiguration);
this.nmClientAsync = closer.register(NMClientAsync.createNMClientAsync(new NMClientCallbackHandler()));
this.nmClientAsync.init(this.yarnConfiguration);
this.initialContainers = config.getInt(GobblinYarnConfigurationKeys.INITIAL_CONTAINERS_KEY);
this.requestedContainerMemoryMbs = config.getInt(GobblinYarnConfigurationKeys.CONTAINER_MEMORY_MBS_KEY);
this.requestedContainerCores = config.getInt(GobblinYarnConfigurationKeys.CONTAINER_CORES_KEY);
this.containerHostAffinityEnabled = config.getBoolean(GobblinYarnConfigurationKeys.CONTAINER_HOST_AFFINITY_ENABLED);
this.helixInstanceMaxRetries = config.getInt(GobblinYarnConfigurationKeys.HELIX_INSTANCE_MAX_RETRIES);
this.containerJvmArgs = config.hasPath(GobblinYarnConfigurationKeys.CONTAINER_JVM_ARGS_KEY) ?
Optional.of(config.getString(GobblinYarnConfigurationKeys.CONTAINER_JVM_ARGS_KEY)) :
Optional.<String>absent();
this.containerLaunchExecutor = Executors.newFixedThreadPool(10,
ExecutorsUtils.newThreadFactory(Optional.of(LOGGER), Optional.of("ContainerLaunchExecutor")));
this.tokens = getSecurityTokens();
}
/**
 * Event bus subscriber that requests one new container. If the request names a container being
 * replaced, the host of that container is passed along as the preferred node.
 *
 * The request is dropped (with an error log) if the maximum resource capacity is not yet known.
 *
 * @param newContainerRequest the request for a new container
 */
@SuppressWarnings("unused")
@Subscribe
public void handleNewContainerRequest(NewContainerRequest newContainerRequest) {
  if (this.maxResourceCapacity.isPresent()) {
    Function<Container, String> toHostName = new Function<Container, String>() {
      @Override
      public String apply(Container container) {
        return container.getNodeId().getHost();
      }
    };
    Optional<String> preferredHost = newContainerRequest.getReplacedContainer().transform(toHostName);
    requestContainer(preferredHost);
    return;
  }

  LOGGER.error(String.format(
      "Unable to handle new container request as maximum resource capacity is not available: "
          + "[memory (MBs) requested = %d, vcores requested = %d]", this.requestedContainerMemoryMbs,
      this.requestedContainerCores));
}
/**
 * Event bus subscriber that asynchronously asks the NodeManager client to stop every container
 * listed in the given request.
 *
 * @param containerShutdownRequest the request carrying the containers to stop
 */
@SuppressWarnings("unused")
@Subscribe
public void handleContainerShutdownRequest(ContainerShutdownRequest containerShutdownRequest) {
  for (Container toStop : containerShutdownRequest.getContainers()) {
    ContainerId containerId = toStop.getId();
    String message = String.format("Stopping container %s running on %s", containerId, toStop.getNodeId());
    LOGGER.info(message);
    this.nmClientAsync.stopContainerAsync(containerId, toStop.getNodeId());
  }
}
/**
 * Starts the service: registers with the event bus, starts both Yarn clients, registers the
 * ApplicationMaster with the ResourceManager, records the maximum resource capability from the
 * registration response, and requests the initial containers.
 *
 * @throws Exception if starting the clients or registering the ApplicationMaster fails
 */
@Override
protected void startUp() throws Exception {
  LOGGER.info("Starting the YarnService");

  // Subscribe to the container-related requests posted on the EventBus
  this.eventBus.register(this);

  this.amrmClientAsync.start();
  this.nmClientAsync.start();

  // The registration response tells us the maximum resource capability a container may ask for
  String applicationMasterHost = GobblinClusterUtils.getHostname();
  RegisterApplicationMasterResponse registration =
      this.amrmClientAsync.registerApplicationMaster(applicationMasterHost, -1, "");
  LOGGER.info("ApplicationMaster registration response: " + registration);
  Resource maximumCapability = registration.getMaximumResourceCapability();
  this.maxResourceCapacity = Optional.of(maximumCapability);

  LOGGER.info("Requesting initial containers");
  requestInitialContainers(this.initialContainers);
}
/**
 * Stops the service: shuts down the container launch executor, asks the NodeManagers to stop all
 * tracked containers, waits up to 5 minutes for them to stop, un-registers the ApplicationMaster,
 * and finally closes the Yarn clients and stops metrics reporting.
 *
 * @throws IOException if closing the Yarn clients fails
 */
@Override
protected void shutDown() throws IOException {
  LOGGER.info("Stopping the YarnService");

  this.shutdownInProgress = true;

  try {
    ExecutorsUtils.shutdownExecutorService(this.containerLaunchExecutor, Optional.of(LOGGER));

    // Stop the running containers
    for (Map.Entry<Container, String> entry : this.containerMap.values()) {
      LOGGER.info(String.format("Stopping container %s running participant %s", entry.getKey().getId(),
          entry.getValue()));
      this.nmClientAsync.stopContainerAsync(entry.getKey().getId(), entry.getKey().getNodeId());
    }

    if (!this.containerMap.isEmpty()) {
      // Wait up to 5 minutes for the containers to stop
      final long timeoutMillis = 5 * 60 * 1000L;
      final long deadlineNanos = System.nanoTime() + timeoutMillis * 1000000L;

      synchronized (this.allContainersStopped) {
        try {
          long remainingMillis = timeoutMillis;
          // Re-check the condition in a loop while holding the monitor: this guards against spurious
          // wakeups and against a notification sent before the wait started. Waking up at least once
          // per second also covers entries that get removed without a notification.
          while (!this.containerMap.isEmpty() && remainingMillis > 0) {
            this.allContainersStopped.wait(Math.min(remainingMillis, 1000L));
            remainingMillis = (deadlineNanos - System.nanoTime()) / 1000000L;
          }
        } catch (InterruptedException ie) {
          LOGGER.warn("Interrupted while waiting for the containers to stop");
          Thread.currentThread().interrupt();
        }
      }

      if (this.containerMap.isEmpty()) {
        LOGGER.info("All of the containers have been stopped");
      } else {
        LOGGER.warn(String.format("%d container(s) have not been reported as stopped",
            this.containerMap.size()));
      }
    }

    this.amrmClientAsync.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, null, null);
  } catch (IOException | YarnException e) {
    LOGGER.error("Failed to unregister the ApplicationMaster", e);
  } finally {
    try {
      this.closer.close();
    } finally {
      if (this.gobblinMetrics.isPresent()) {
        this.gobblinMetrics.get().stopMetricsReporting();
      }
    }
  }
}
/**
 * Obtains the {@link GobblinMetrics} instance for this application, tagged with the application
 * ID and name, and starts metric reporting using the application configuration.
 *
 * @return the {@link GobblinMetrics} instance with reporting started
 */
private GobblinMetrics buildGobblinMetrics() {
  // Tags identifying the application the metrics belong to
  Tag<String> applicationIdTag = new Tag<>(GobblinClusterMetricTagNames.APPLICATION_ID, this.applicationId);
  Tag<String> applicationNameTag = new Tag<>(GobblinClusterMetricTagNames.APPLICATION_NAME, this.applicationName);
  ImmutableList<Tag<?>> applicationTags =
      new ImmutableList.Builder<Tag<?>>().add(applicationIdTag).add(applicationNameTag).build();

  // Look up the metrics instance and start its reporters
  GobblinMetrics metrics = GobblinMetrics.get(this.applicationId, null, applicationTags);
  metrics.startMetricReporting(ConfigUtils.configToProperties(config));
  return metrics;
}
/**
 * Builds an {@link EventSubmitter} on top of the metric context of {@code gobblinMetrics}, using
 * the Gobblin Yarn event namespace. Must only be called when {@code gobblinMetrics} is present.
 *
 * @return the newly built {@link EventSubmitter}
 */
private EventSubmitter buildEventSubmitter() {
  EventSubmitter.Builder submitterBuilder = new EventSubmitter.Builder(
      this.gobblinMetrics.get().getMetricContext(), GobblinYarnEventConstants.EVENT_NAMESPACE);
  return submitterBuilder.build();
}
/**
 * Requests the given number of containers, none of them with a preferred node.
 *
 * @param containersRequested number of containers to request; nothing is requested if not positive
 */
private void requestInitialContainers(int containersRequested) {
  int remaining = containersRequested;
  while (remaining > 0) {
    requestContainer(Optional.<String>absent());
    remaining--;
  }
}
/**
 * Adds a request for one container to the ResourceManager client. The requested memory and
 * virtual cores are capped at the maximum resource capacity, which must be present when this
 * method is called.
 *
 * @param preferredNode host the container should preferably be allocated on, if any
 */
private void requestContainer(Optional<String> preferredNode) {
  Priority requestPriority = Records.newRecord(Priority.class);
  requestPriority.setPriority(0);

  // Never ask for more than the maximum capability of a single container
  Resource maximum = this.maxResourceCapacity.get();
  Resource requested = Records.newRecord(Resource.class);
  requested.setMemory(Math.min(this.requestedContainerMemoryMbs, maximum.getMemory()));
  requested.setVirtualCores(Math.min(this.requestedContainerCores, maximum.getVirtualCores()));

  String[] nodes = null;
  if (preferredNode.isPresent()) {
    nodes = new String[] {preferredNode.get()};
  }

  AMRMClient.ContainerRequest containerRequest =
      new AMRMClient.ContainerRequest(requested, nodes, null, requestPriority);
  this.amrmClientAsync.addContainerRequest(containerRequest);
}
/**
 * Builds the launch context of a container: the local resources (lib jars, application jars,
 * application files and, if configured, remote files), the environment variables, the launch
 * command and, if security is enabled, the security tokens.
 *
 * @param container the allocated container to launch
 * @param helixInstanceName name of the Helix instance the container is going to run
 * @return the launch context for the container
 * @throws IOException if the local resources cannot be collected from the file system
 */
private ContainerLaunchContext newContainerLaunchContext(Container container, String helixInstanceName)
    throws IOException {
  Path applicationDir = GobblinClusterUtils.getAppWorkDirPath(this.fs, this.applicationName, this.applicationId);
  Path containerDir = new Path(applicationDir, GobblinYarnConfigurationKeys.CONTAINER_WORK_DIR_NAME);

  Path libJarsDir = new Path(applicationDir, GobblinYarnConfigurationKeys.LIB_JARS_DIR_NAME);
  Path appJarsDir = new Path(containerDir, GobblinYarnConfigurationKeys.APP_JARS_DIR_NAME);
  Path appFilesDir = new Path(containerDir, GobblinYarnConfigurationKeys.APP_FILES_DIR_NAME);

  // Collected in this order: lib jars, application jars, application files, remote files
  Map<String, LocalResource> localResources = Maps.newHashMap();
  addContainerLocalResources(libJarsDir, localResources);
  addContainerLocalResources(appJarsDir, localResources);
  addContainerLocalResources(appFilesDir, localResources);
  if (this.config.hasPath(GobblinYarnConfigurationKeys.CONTAINER_FILES_REMOTE_KEY)) {
    String remoteFiles = this.config.getString(GobblinYarnConfigurationKeys.CONTAINER_FILES_REMOTE_KEY);
    addRemoteAppFiles(remoteFiles, localResources);
  }

  ContainerLaunchContext launchContext = Records.newRecord(ContainerLaunchContext.class);
  launchContext.setLocalResources(localResources);
  launchContext.setEnvironment(YarnHelixUtils.getEnvironmentVariables(this.yarnConfiguration));
  String launchCommand = buildContainerCommand(container, helixInstanceName);
  launchContext.setCommands(Lists.newArrayList(launchCommand));

  if (UserGroupInformation.isSecurityEnabled()) {
    // Each launch context gets its own view of the shared token buffer
    launchContext.setTokens(this.tokens.duplicate());
  }

  return launchContext;
}
/**
 * Adds every entry listed directly under the given directory to the map of container local
 * resources. Logs a warning and adds nothing if the directory does not exist.
 *
 * @param destDir directory whose entries are to be added
 * @param resourceMap map of local resources to add to
 * @throws IOException if the file system cannot be accessed
 */
private void addContainerLocalResources(Path destDir, Map<String, LocalResource> resourceMap) throws IOException {
  if (this.fs.exists(destDir)) {
    FileStatus[] entries = this.fs.listStatus(destDir);
    if (entries == null) {
      return;
    }
    for (FileStatus entry : entries) {
      YarnHelixUtils.addFileAsLocalResource(this.fs, entry.getPath(), LocalResourceType.FILE, resourceMap);
    }
    return;
  }

  LOGGER.warn(String.format("Path %s does not exist so no container LocalResource to add", destDir));
}
/**
 * Adds each file of a comma-separated list of file paths to the map of container local resources.
 * The file system of each file is resolved from its own path using the Yarn configuration.
 *
 * @param hdfsFileList comma-separated list of file paths
 * @param resourceMap map of local resources to add to
 * @throws IOException if a file system cannot be resolved or accessed
 */
private void addRemoteAppFiles(String hdfsFileList, Map<String, LocalResource> resourceMap) throws IOException {
  Iterator<String> remotePaths = SPLITTER.split(hdfsFileList).iterator();
  while (remotePaths.hasNext()) {
    Path remoteFile = new Path(remotePaths.next());
    FileSystem remoteFileSystem = remoteFile.getFileSystem(this.yarnConfiguration);
    YarnHelixUtils.addFileAsLocalResource(remoteFileSystem, remoteFile, LocalResourceType.FILE, resourceMap);
  }
}
/**
 * Serializes the credentials of the current user, without the AM->RM token, into a buffer that
 * is handed to launched containers.
 *
 * @return a buffer wrapping the serialized credentials
 * @throws IOException if the current user cannot be determined or the credentials cannot be serialized
 */
private ByteBuffer getSecurityTokens() throws IOException {
  Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();

  // Remove the AM->RM token so that containers cannot access it. This has to happen BEFORE the
  // credentials are serialized, otherwise the token still ends up in the returned buffer.
  Iterator<Token<?>> tokenIterator = credentials.getAllTokens().iterator();
  while (tokenIterator.hasNext()) {
    Token<?> token = tokenIterator.next();
    if (token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) {
      tokenIterator.remove();
    }
  }

  Closer closer = Closer.create();
  try {
    DataOutputBuffer dataOutputBuffer = closer.register(new DataOutputBuffer());
    credentials.writeTokenStorageToStream(dataOutputBuffer);
    return ByteBuffer.wrap(dataOutputBuffer.getData(), 0, dataOutputBuffer.getLength());
  } catch (Throwable t) {
    throw closer.rethrow(t);
  } finally {
    closer.close();
  }
}
/**
 * Builds the shell command that launches a {@link GobblinYarnTaskRunner} in the given container.
 * The maximum heap size is set to the memory of the container's resource, and standard output
 * and standard error are redirected to files in the container log directory.
 *
 * @param container the container the command is built for
 * @param helixInstanceName name of the Helix instance passed to the task runner
 * @return the launch command
 */
private String buildContainerCommand(Container container, String helixInstanceName) {
  String processName = GobblinYarnTaskRunner.class.getSimpleName();
  // Common prefix of the stdout and stderr files: <log dir>/<process name>.
  String logFilePrefix = ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + processName + ".";

  StringBuilder command = new StringBuilder();
  command.append(ApplicationConstants.Environment.JAVA_HOME.$()).append("/bin/java");
  command.append(" -Xmx").append(container.getResource().getMemory()).append("M");
  command.append(" ").append(JvmUtils.formatJvmArguments(this.containerJvmArgs));
  command.append(" ").append(GobblinYarnTaskRunner.class.getName());
  command.append(" --").append(GobblinClusterConfigurationKeys.APPLICATION_NAME_OPTION_NAME);
  command.append(" ").append(this.applicationName);
  command.append(" --").append(GobblinClusterConfigurationKeys.HELIX_INSTANCE_NAME_OPTION_NAME);
  command.append(" ").append(helixInstanceName);
  command.append(" 1>").append(logFilePrefix).append(ApplicationConstants.STDOUT);
  command.append(" 2>").append(logFilePrefix).append(ApplicationConstants.STDERR);
  return command.toString();
}
/**
 * Decides from the exit status of a completed container whether its replacement should try to
 * be started on the same node. Exit statuses that hint at a disk or node failure always lead to
 * a different node being allowed; otherwise the host affinity setting decides.
 *
 * @param containerExitStatus exit status of the completed container
 * @return {@code true} if the replacement container should prefer the same node
 */
private boolean shouldStickToTheSameNode(int containerExitStatus) {
  // ABORTED is most likely caused by a node failure because the application itself
  // does not release containers.
  boolean likelyNodeProblem = containerExitStatus == ContainerExitStatus.DISKS_FAILED
      || containerExitStatus == ContainerExitStatus.ABORTED;
  if (likelyNodeProblem) {
    return false;
  }
  // Stick to the same node in all other cases if host affinity is enabled.
  return this.containerHostAffinityEnabled;
}
/**
 * Handle the completion of a container. A new container will be requested to replace the one
 * that just exited. Depending on the exit status and if container host affinity is enabled,
 * the new container may or may not try to be started on the same node.
 *
 * A container completes in either of the following conditions: 1) some error happens in the
 * container and caused the container to exit, 2) the container gets killed due to some reason,
 * for example, if it runs over the allowed amount of virtual or physical memory, 3) the container
 * gets preempted by the ResourceManager, or 4) the container gets stopped by the ApplicationMaster.
 * A replacement container is needed in all but the last case.
 *
 * No replacement is requested if the container is no longer tracked, if the service is shutting
 * down, or if the Helix instance has exceeded the maximum number of retries (when that maximum
 * is greater than zero).
 *
 * @param containerStatus status of the completed container
 */
private void handleContainerCompletion(ContainerStatus containerStatus) {
  Map.Entry<Container, String> completedContainerEntry = this.containerMap.remove(containerStatus.getContainerId());
  if (completedContainerEntry == null) {
    // The entry may already have been removed by the NodeManager client callbacks
    // (onContainerStopped / onStartContainerError), in which case there is nothing to replace.
    LOGGER.warn(String.format("Container %s has completed with exit status %d but is not tracked, ignoring it",
        containerStatus.getContainerId(), containerStatus.getExitStatus()));
    return;
  }

  String completedInstanceName = completedContainerEntry.getValue();

  LOGGER.info(String.format("Container %s running Helix instance %s has completed with exit status %d",
      containerStatus.getContainerId(), completedInstanceName, containerStatus.getExitStatus()));

  if (!Strings.isNullOrEmpty(containerStatus.getDiagnostics())) {
    LOGGER.info(String.format("Received the following diagnostics information for container %s: %s",
        containerStatus.getContainerId(), containerStatus.getDiagnostics()));
  }

  if (this.shutdownInProgress) {
    return;
  }

  // putIfAbsent returns null when there was no previous mapping, so its return value must not be
  // dereferenced; look the counter up again after making sure it exists.
  this.helixInstanceRetryCount.putIfAbsent(completedInstanceName, new AtomicInteger(0));
  int retryCount = this.helixInstanceRetryCount.get(completedInstanceName).incrementAndGet();

  // The completion event is submitted whether or not a replacement container is requested
  if (this.eventSubmitter.isPresent()) {
    ImmutableMap.Builder<String, String> eventMetadataBuilder = buildContainerStatusEventMetadata(containerStatus);
    eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.HELIX_INSTANCE_ID, completedInstanceName);
    eventMetadataBuilder.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_RETRY_ATTEMPT,
        retryCount + "");
    this.eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.HELIX_INSTANCE_COMPLETION,
        eventMetadataBuilder.build());
  }

  if (this.helixInstanceMaxRetries > 0 && retryCount > this.helixInstanceMaxRetries) {
    LOGGER.warn("Maximum number of retries has been achieved for Helix instance " + completedInstanceName);
    return;
  }

  // Add the Helix instance name of the completed container to the queue of unused
  // instance names so they can be reused by a replacement container.
  this.unusedHelixInstanceNames.offer(completedInstanceName);

  LOGGER.info(String.format("Requesting a new container to replace %s to run Helix instance %s",
      containerStatus.getContainerId(), completedInstanceName));
  this.eventBus.post(new NewContainerRequest(
      shouldStickToTheSameNode(containerStatus.getExitStatus()) ?
          Optional.of(completedContainerEntry.getKey()) : Optional.<Container>absent()));
}
/**
 * Collects event metadata from a container status: the container ID, the container state, the
 * exit status (unless it is {@link ContainerExitStatus#INVALID}) and the diagnostics (unless
 * they are null or empty).
 *
 * @param containerStatus the container status to describe
 * @return a builder holding the metadata, which callers may extend before building the map
 */
private ImmutableMap.Builder<String, String> buildContainerStatusEventMetadata(ContainerStatus containerStatus) {
  ImmutableMap.Builder<String, String> metadata = ImmutableMap.builder();
  metadata
      .put(GobblinYarnMetricTagNames.CONTAINER_ID, containerStatus.getContainerId().toString())
      .put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_CONTAINER_STATE,
          containerStatus.getState().toString());

  int exitStatus = containerStatus.getExitStatus();
  if (exitStatus != ContainerExitStatus.INVALID) {
    metadata.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_STATUS,
        Integer.toString(exitStatus));
  }

  String diagnostics = containerStatus.getDiagnostics();
  if (!Strings.isNullOrEmpty(diagnostics)) {
    metadata.put(GobblinYarnEventConstants.EventMetadata.CONTAINER_STATUS_EXIT_DIAGNOSTICS, diagnostics);
  }

  return metadata;
}
/**
* A custom implementation of {@link AMRMClientAsync.CallbackHandler}.
*/
private class AMRMClientCallbackHandler implements AMRMClientAsync.CallbackHandler {
private volatile boolean done = false;
@Override
public void onContainersCompleted(List<ContainerStatus> statuses) {
for (ContainerStatus containerStatus : statuses) {
handleContainerCompletion(containerStatus);
}
}
@Override
public void onContainersAllocated(List<Container> containers) {
for (final Container container : containers) {
if (eventSubmitter.isPresent()) {
eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_ALLOCATION,
GobblinYarnMetricTagNames.CONTAINER_ID, container.getId().toString());
}
LOGGER.info(String.format("Container %s has been allocated", container.getId()));
String instanceName = unusedHelixInstanceNames.poll();
if (Strings.isNullOrEmpty(instanceName)) {
// No unused instance name, so generating a new one.
instanceName = HelixUtils.getHelixInstanceName(GobblinYarnTaskRunner.class.getSimpleName(),
helixInstanceIdGenerator.incrementAndGet());
}
final String finalInstanceName = instanceName;
containerMap.put(container.getId(), new AbstractMap.SimpleImmutableEntry<>(container, finalInstanceName));
containerLaunchExecutor.submit(new Runnable() {
@Override
public void run() {
try {
LOGGER.info("Starting container " + container.getId());
nmClientAsync.startContainerAsync(container, newContainerLaunchContext(container, finalInstanceName));
} catch (IOException ioe) {
LOGGER.error("Failed to start container " + container.getId(), ioe);
}
}
});
}
}
@Override
public void onShutdownRequest() {
if (eventSubmitter.isPresent()) {
eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.SHUTDOWN_REQUEST);
}
LOGGER.info("Received shutdown request from the ResourceManager");
this.done = true;
eventBus.post(new ClusterManagerShutdownRequest());
}
@Override
public void onNodesUpdated(List<NodeReport> updatedNodes) {
for (NodeReport nodeReport : updatedNodes) {
LOGGER.info("Received node update report: " + nodeReport);
}
}
@Override
public float getProgress() {
return this.done ? 1.0f : 0.0f;
}
@Override
public void onError(Throwable t) {
if (eventSubmitter.isPresent()) {
eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.ERROR,
GobblinYarnEventConstants.EventMetadata.ERROR_EXCEPTION, Throwables.getStackTraceAsString(t));
}
LOGGER.error("Received error: " + t, t);
this.done = true;
eventBus.post(new ClusterManagerShutdownRequest());
}
}
/**
 * A custom implementation of {@link NMClientAsync.CallbackHandler} that submits an event (if
 * an event submitter is present) and writes a log entry for each callback of the NodeManager
 * client, and keeps the map of tracked containers up to date when containers stop or fail to start.
 */
private class NMClientCallbackHandler implements NMClientAsync.CallbackHandler {

  @Override
  public void onContainerStarted(ContainerId containerId, Map<String, ByteBuffer> allServiceResponse) {
    if (eventSubmitter.isPresent()) {
      eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_STARTED,
          GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString());
    }

    LOGGER.info(String.format("Container %s has been started", containerId));
  }

  @Override
  public void onContainerStatusReceived(ContainerId containerId, ContainerStatus containerStatus) {
    if (eventSubmitter.isPresent()) {
      eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_STATUS_RECEIVED,
          buildContainerStatusEventMetadata(containerStatus).build());
    }

    LOGGER.info(String.format("Received container status for container %s: %s", containerId, containerStatus));
  }

  /**
   * Stops tracking the container and, once no tracked container is left, wakes up the thread
   * waiting on {@code allContainersStopped} (see {@code shutDown}).
   */
  @Override
  public void onContainerStopped(ContainerId containerId) {
    if (eventSubmitter.isPresent()) {
      eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_STOPPED,
          GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString());
    }

    LOGGER.info(String.format("Container %s has been stopped", containerId));
    containerMap.remove(containerId);
    if (containerMap.isEmpty()) {
      synchronized (allContainersStopped) {
        // notifyAll so that every waiter gets to re-check the condition
        allContainersStopped.notifyAll();
      }
    }
  }

  /**
   * Logs the failure and stops tracking the container that could not be started.
   */
  @Override
  public void onStartContainerError(ContainerId containerId, Throwable t) {
    if (eventSubmitter.isPresent()) {
      eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_START_ERROR,
          GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString(),
          GobblinYarnEventConstants.EventMetadata.ERROR_EXCEPTION, Throwables.getStackTraceAsString(t));
    }

    // Pass the throwable to the logger as well so that the stack trace is not lost
    LOGGER.error(String.format("Failed to start container %s due to error %s", containerId, t), t);
    containerMap.remove(containerId);
  }

  @Override
  public void onGetContainerStatusError(ContainerId containerId, Throwable t) {
    if (eventSubmitter.isPresent()) {
      eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_GET_STATUS_ERROR,
          GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString(),
          GobblinYarnEventConstants.EventMetadata.ERROR_EXCEPTION, Throwables.getStackTraceAsString(t));
    }

    // Pass the throwable to the logger as well so that the stack trace is not lost
    LOGGER.error(String.format("Failed to get status for container %s due to error %s", containerId, t), t);
  }

  @Override
  public void onStopContainerError(ContainerId containerId, Throwable t) {
    if (eventSubmitter.isPresent()) {
      eventSubmitter.get().submit(GobblinYarnEventConstants.EventNames.CONTAINER_STOP_ERROR,
          GobblinYarnMetricTagNames.CONTAINER_ID, containerId.toString(),
          GobblinYarnEventConstants.EventMetadata.ERROR_EXCEPTION, Throwables.getStackTraceAsString(t));
    }

    // Pass the throwable to the logger as well so that the stack trace is not lost
    LOGGER.error(String.format("Failed to stop container %s due to error %s", containerId, t), t);
  }
}
}