/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.sls;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.text.MessageFormat;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;
import java.util.Arrays;
import io.hops.util.DBUtility;
import io.hops.util.RMStorageFactory;
import io.hops.util.YarnAPIStorageFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.rumen.JobTraceReader;
import org.apache.hadoop.tools.rumen.LoggedJob;
import org.apache.hadoop.tools.rumen.LoggedTask;
import org.apache.hadoop.tools.rumen.LoggedTaskAttempt;
import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
import org.apache.hadoop.yarn.sls.appmaster.AMSimulator;
import org.apache.hadoop.yarn.sls.conf.SLSConfiguration;
import org.apache.hadoop.yarn.sls.nodemanager.NMSimulator;
import org.apache.hadoop.yarn.sls.scheduler.ContainerSimulator;
import org.apache.hadoop.yarn.sls.scheduler.ResourceSchedulerWrapper;
import org.apache.hadoop.yarn.sls.scheduler.TaskRunner;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.sls.utils.SLSUtils;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.map.ObjectMapper;

@Private
@Unstable
public class SLSRunner {
// RM, Runner
private ResourceManager rm;
private static TaskRunner runner = new TaskRunner();
private String[] inputTraces;
private Configuration conf;
private Map<String, Integer> queueAppNumMap;
// NM simulator
private HashMap<NodeId, NMSimulator> nmMap;
private int nmMemoryMB, nmVCores;
private String nodeFile;
// AM simulator
private int AM_ID;
private Map<String, AMSimulator> amMap;
private Set<String> trackedApps;
private Map<String, Class> amClassMap;
private static int remainingApps = 0;
// metrics
private String metricsOutputDir;
private boolean printSimulation;
// other simulation information
private int numNMs, numRacks, numAMs, numTasks;
private long maxRuntime;
public final static Map<String, Object> simulateInfoMap =
new HashMap<String, Object>();
// logger
public final static Logger LOG = Logger.getLogger(SLSRunner.class);
  // input trace type: SLS (input-sls) if true, Rumen (input-rumen) if false
  private boolean isSLS;

  public SLSRunner(boolean isSLS, String[] inputTraces, String nodeFile,
String outputDir, Set<String> trackedApps,
boolean printsimulation)
throws IOException, ClassNotFoundException {
this.isSLS = isSLS;
this.inputTraces = inputTraces.clone();
this.nodeFile = nodeFile;
this.trackedApps = trackedApps;
this.printSimulation = printsimulation;
metricsOutputDir = outputDir;
nmMap = new HashMap<NodeId, NMSimulator>();
queueAppNumMap = new HashMap<String, Integer>();
amMap = new HashMap<String, AMSimulator>();
amClassMap = new HashMap<String, Class>();
// runner configuration
conf = new Configuration(false);
conf.addResource("sls-runner.xml");
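    // The runner reads its settings from sls-runner.xml on the classpath.
    // An illustrative snippet (property names follow the SLSConfiguration
    // constants used below; the values are examples only):
    //   <property>
    //     <name>yarn.sls.runner.pool.size</name>
    //     <value>10</value>
    //   </property>
    //   <property>
    //     <name>yarn.sls.am.type.mapreduce</name>
    //     <value>org.apache.hadoop.yarn.sls.appmaster.MRAMSimulator</value>
    //   </property>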
// runner
int poolSize = conf.getInt(SLSConfiguration.RUNNER_POOL_SIZE,
SLSConfiguration.RUNNER_POOL_SIZE_DEFAULT);
SLSRunner.runner.setQueueSize(poolSize);
// <AMType, Class> map
    for (Map.Entry<String, String> e : conf) {
      String key = e.getKey();
if (key.startsWith(SLSConfiguration.AM_TYPE)) {
String amType = key.substring(SLSConfiguration.AM_TYPE.length());
amClassMap.put(amType, Class.forName(conf.get(key)));
}
}
}

  public void start() throws Exception {
// start resource manager
startRM();
// start node managers
startNM();
// start application masters
startAM();
// set queue & tracked apps information
((ResourceSchedulerWrapper) rm.getResourceScheduler())
.setQueueSet(this.queueAppNumMap.keySet());
((ResourceSchedulerWrapper) rm.getResourceScheduler())
.setTrackedAppSet(this.trackedApps);
// print out simulation info
printSimulationInfo();
    // block until all nodes are RUNNING
    waitForNodesRunning();
    // start the runner once everything is ready to go
    runner.start();
}

  private void startRM() throws IOException, ClassNotFoundException {
Configuration rmConf = new YarnConfiguration();
String schedulerClass = rmConf.get(YarnConfiguration.RM_SCHEDULER);
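    // Save the real scheduler class under the SLS key and install
    // ResourceSchedulerWrapper in its place; the wrapper delegates to the
    // real scheduler while collecting simulation metrics.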
rmConf.set(SLSConfiguration.RM_SCHEDULER, schedulerClass);
rmConf.set(YarnConfiguration.RM_SCHEDULER,
ResourceSchedulerWrapper.class.getName());
rmConf.set(SLSConfiguration.METRICS_OUTPUT_DIR, metricsOutputDir);
RMStorageFactory.setConfiguration(rmConf);
YarnAPIStorageFactory.setConfiguration(rmConf);
DBUtility.InitializeDB();
rm = new ResourceManager();
rm.init(rmConf);
rm.start();
}

  private void startNM() throws YarnException, IOException {
// nm configuration
nmMemoryMB = conf.getInt(SLSConfiguration.NM_MEMORY_MB,
SLSConfiguration.NM_MEMORY_MB_DEFAULT);
nmVCores = conf.getInt(SLSConfiguration.NM_VCORES,
SLSConfiguration.NM_VCORES_DEFAULT);
int heartbeatInterval = conf.getInt(
SLSConfiguration.NM_HEARTBEAT_INTERVAL_MS,
SLSConfiguration.NM_HEARTBEAT_INTERVAL_MS_DEFAULT);
// nm information (fetch from topology file, or from sls/rumen json file)
Set<String> nodeSet = new HashSet<String>();
if (nodeFile.isEmpty()) {
if (isSLS) {
for (String inputTrace : inputTraces) {
nodeSet.addAll(SLSUtils.parseNodesFromSLSTrace(inputTrace));
}
} else {
for (String inputTrace : inputTraces) {
nodeSet.addAll(SLSUtils.parseNodesFromRumenTrace(inputTrace));
}
}
} else {
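      // The topology file is a JSON file of rack records. An illustrative
      // (not authoritative) example:
      //   { "rack": "rack1",
      //     "nodes": [ { "node": "node1" }, { "node": "node2" } ] }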
nodeSet.addAll(SLSUtils.parseNodesFromNodeFile(nodeFile));
}
// create NM simulators
Random random = new Random();
Set<String> rackSet = new HashSet<String>();
for (String hostName : nodeSet) {
      // randomize each NM's heartbeat start offset within one interval
NMSimulator nm = new NMSimulator();
nm.init(hostName, nmMemoryMB, nmVCores,
random.nextInt(heartbeatInterval), heartbeatInterval, rm);
nmMap.put(nm.getNode().getNodeID(), nm);
runner.schedule(nm);
rackSet.add(nm.getNode().getRackName());
}
numRacks = rackSet.size();
numNMs = nmMap.size();
}

  private void waitForNodesRunning() throws InterruptedException {
long startTimeMS = System.currentTimeMillis();
while (true) {
int numRunningNodes = 0;
for (RMNode node : rm.getRMContext().getRMNodes().values()) {
if (node.getState() == NodeState.RUNNING) {
          numRunningNodes++;
}
}
if (numRunningNodes == numNMs) {
break;
}
LOG.info(MessageFormat.format("SLSRunner is waiting for all " +
"nodes RUNNING. {0} of {1} NMs initialized.",
numRunningNodes, numNMs));
Thread.sleep(1000);
}
LOG.info(MessageFormat.format("SLSRunner takes {0} ms to launch all nodes.",
(System.currentTimeMillis() - startTimeMS)));
}

  @SuppressWarnings("unchecked")
private void startAM() throws YarnException, IOException {
// application/container configuration
int heartbeatInterval = conf.getInt(
SLSConfiguration.AM_HEARTBEAT_INTERVAL_MS,
SLSConfiguration.AM_HEARTBEAT_INTERVAL_MS_DEFAULT);
int containerMemoryMB = conf.getInt(SLSConfiguration.CONTAINER_MEMORY_MB,
SLSConfiguration.CONTAINER_MEMORY_MB_DEFAULT);
int containerVCores = conf.getInt(SLSConfiguration.CONTAINER_VCORES,
SLSConfiguration.CONTAINER_VCORES_DEFAULT);
Resource containerResource =
BuilderUtils.newResource(containerMemoryMB, containerVCores);
// application workload
if (isSLS) {
startAMFromSLSTraces(containerResource, heartbeatInterval);
} else {
startAMFromRumenTraces(containerResource, heartbeatInterval);
}
numAMs = amMap.size();
remainingApps = numAMs;
}

  /**
   * Parse workload information from SLS trace files and create one AM
   * simulator per job.
   */
@SuppressWarnings("unchecked")
private void startAMFromSLSTraces(Resource containerResource,
int heartbeatInterval) throws IOException {
// parse from sls traces
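    // Each trace file holds a sequence of JSON job records. An abridged,
    // illustrative record (the field names match the keys read below):
    //   {
    //     "am.type": "mapreduce",
    //     "job.start.ms": 0, "job.end.ms": 95375,
    //     "job.queue.name": "sls_queue_1", "job.id": "job_1",
    //     "job.tasks": [ {
    //       "container.host": "/default-rack/node1",
    //       "container.start.ms": 6664, "container.end.ms": 23707,
    //       "container.priority": 20, "container.type": "map"
    //     } ]
    //   }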
JsonFactory jsonF = new JsonFactory();
ObjectMapper mapper = new ObjectMapper();
for (String inputTrace : inputTraces) {
Reader input = new FileReader(inputTrace);
try {
Iterator<Map> i = mapper.readValues(jsonF.createJsonParser(input),
Map.class);
while (i.hasNext()) {
Map jsonJob = i.next();
// load job information
long jobStartTime = Long.parseLong(
jsonJob.get("job.start.ms").toString());
long jobFinishTime = Long.parseLong(
jsonJob.get("job.end.ms").toString());
String user = (String) jsonJob.get("job.user");
          if (user == null) {
            user = "default";
          }
String queue = jsonJob.get("job.queue.name").toString();
String oldAppId = jsonJob.get("job.id").toString();
boolean isTracked = trackedApps.contains(oldAppId);
int queueSize = queueAppNumMap.containsKey(queue) ?
queueAppNumMap.get(queue) : 0;
          queueSize++;
queueAppNumMap.put(queue, queueSize);
// tasks
List tasks = (List) jsonJob.get("job.tasks");
if (tasks == null || tasks.size() == 0) {
continue;
}
List<ContainerSimulator> containerList =
new ArrayList<ContainerSimulator>();
for (Object o : tasks) {
Map jsonTask = (Map) o;
String hostname = jsonTask.get("container.host").toString();
long taskStart = Long.parseLong(
jsonTask.get("container.start.ms").toString());
long taskFinish = Long.parseLong(
jsonTask.get("container.end.ms").toString());
long lifeTime = taskFinish - taskStart;
int priority = Integer.parseInt(
jsonTask.get("container.priority").toString());
String type = jsonTask.get("container.type").toString();
containerList.add(new ContainerSimulator(containerResource,
lifeTime, hostname, priority, type));
}
// create a new AM
String amType = jsonJob.get("am.type").toString();
AMSimulator amSim = (AMSimulator) ReflectionUtils.newInstance(
amClassMap.get(amType), new Configuration());
if (amSim != null) {
amSim.init(AM_ID++, heartbeatInterval, containerList, rm,
this, jobStartTime, jobFinishTime, user, queue,
isTracked, oldAppId);
runner.schedule(amSim);
maxRuntime = Math.max(maxRuntime, jobFinishTime);
numTasks += containerList.size();
amMap.put(oldAppId, amSim);
}
}
} finally {
input.close();
}
}
}

  /**
   * Parse workload information from Rumen trace files and create one AM
   * simulator per job.
   */
@SuppressWarnings("unchecked")
private void startAMFromRumenTraces(Resource containerResource,
int heartbeatInterval)
throws IOException {
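    // Rumen traces are typically generated from job history files with
    // Rumen's TraceBuilder, e.g. (illustrative paths, assumed invocation):
    //   hadoop org.apache.hadoop.tools.rumen.TraceBuilder \
    //       file:///tmp/job-trace.json file:///tmp/job-topology.json \
    //       hdfs:///mapred/history/done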
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "file:///");
long baselineTimeMS = 0;
for (String inputTrace : inputTraces) {
File fin = new File(inputTrace);
JobTraceReader reader = new JobTraceReader(
new Path(fin.getAbsolutePath()), conf);
try {
LoggedJob job = null;
while ((job = reader.getNext()) != null) {
        // only MapReduce jobs are supported currently
String jobType = "mapreduce";
String user = job.getUser() == null ?
"default" : job.getUser().getValue();
String jobQueue = job.getQueue().getValue();
String oldJobId = job.getJobID().toString();
long jobStartTimeMS = job.getSubmitTime();
long jobFinishTimeMS = job.getFinishTime();
if (baselineTimeMS == 0) {
baselineTimeMS = jobStartTimeMS;
}
jobStartTimeMS -= baselineTimeMS;
jobFinishTimeMS -= baselineTimeMS;
if (jobStartTimeMS < 0) {
LOG.warn("Warning: reset job " + oldJobId + " start time to 0.");
jobFinishTimeMS = jobFinishTimeMS - jobStartTimeMS;
jobStartTimeMS = 0;
}
boolean isTracked = trackedApps.contains(oldJobId);
int queueSize = queueAppNumMap.containsKey(jobQueue) ?
queueAppNumMap.get(jobQueue) : 0;
        queueSize++;
queueAppNumMap.put(jobQueue, queueSize);
List<ContainerSimulator> containerList =
new ArrayList<ContainerSimulator>();
// map tasks
for(LoggedTask mapTask : job.getMapTasks()) {
LoggedTaskAttempt taskAttempt = mapTask.getAttempts()
.get(mapTask.getAttempts().size() - 1);
String hostname = taskAttempt.getHostName().getValue();
long containerLifeTime = taskAttempt.getFinishTime()
- taskAttempt.getStartTime();
containerList.add(new ContainerSimulator(containerResource,
containerLifeTime, hostname, 10, "map"));
}
// reduce tasks
for(LoggedTask reduceTask : job.getReduceTasks()) {
LoggedTaskAttempt taskAttempt = reduceTask.getAttempts()
.get(reduceTask.getAttempts().size() - 1);
String hostname = taskAttempt.getHostName().getValue();
long containerLifeTime = taskAttempt.getFinishTime()
- taskAttempt.getStartTime();
containerList.add(new ContainerSimulator(containerResource,
containerLifeTime, hostname, 20, "reduce"));
}
// create a new AM
AMSimulator amSim = (AMSimulator) ReflectionUtils.newInstance(
amClassMap.get(jobType), conf);
if (amSim != null) {
          amSim.init(AM_ID++, heartbeatInterval, containerList,
rm, this, jobStartTimeMS, jobFinishTimeMS, user, jobQueue,
isTracked, oldJobId);
runner.schedule(amSim);
maxRuntime = Math.max(maxRuntime, jobFinishTimeMS);
numTasks += containerList.size();
amMap.put(oldJobId, amSim);
}
}
} finally {
reader.close();
}
}
}

  private void printSimulationInfo() {
if (printSimulation) {
// node
LOG.info("------------------------------------");
LOG.info(MessageFormat.format("# nodes = {0}, # racks = {1}, capacity " +
"of each node {2} MB memory and {3} vcores.",
numNMs, numRacks, nmMemoryMB, nmVCores));
LOG.info("------------------------------------");
// job
LOG.info(MessageFormat.format("# applications = {0}, # total " +
"tasks = {1}, average # tasks per application = {2}",
numAMs, numTasks, (int)(Math.ceil((numTasks + 0.0) / numAMs))));
LOG.info("JobId\tQueue\tAMType\tDuration\t#Tasks");
for (Map.Entry<String, AMSimulator> entry : amMap.entrySet()) {
AMSimulator am = entry.getValue();
LOG.info(entry.getKey() + "\t" + am.getQueue() + "\t" + am.getAMType()
+ "\t" + am.getDuration() + "\t" + am.getNumTasks());
}
LOG.info("------------------------------------");
// queue
LOG.info(MessageFormat.format("number of queues = {0} average " +
"number of apps = {1}", queueAppNumMap.size(),
(int)(Math.ceil((numAMs + 0.0) / queueAppNumMap.size()))));
LOG.info("------------------------------------");
// runtime
LOG.info(MessageFormat.format("estimated simulation time is {0}" +
" seconds", (long)(Math.ceil(maxRuntime / 1000.0))));
LOG.info("------------------------------------");
}
    // package this information into the simulateInfoMap for use elsewhere
simulateInfoMap.put("Number of racks", numRacks);
simulateInfoMap.put("Number of nodes", numNMs);
simulateInfoMap.put("Node memory (MB)", nmMemoryMB);
simulateInfoMap.put("Node VCores", nmVCores);
simulateInfoMap.put("Number of applications", numAMs);
simulateInfoMap.put("Number of tasks", numTasks);
simulateInfoMap.put("Average tasks per applicaion",
(int)(Math.ceil((numTasks + 0.0) / numAMs)));
simulateInfoMap.put("Number of queues", queueAppNumMap.size());
simulateInfoMap.put("Average applications per queue",
(int)(Math.ceil((numAMs + 0.0) / queueAppNumMap.size())));
simulateInfoMap.put("Estimated simulate time (s)",
(long)(Math.ceil(maxRuntime / 1000.0)));
}

  public HashMap<NodeId, NMSimulator> getNmMap() {
return nmMap;
}

  public static TaskRunner getRunner() {
return runner;
}
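
  /**
   * Called by each AM simulator when its application completes; once every
   * simulated application has finished, the simulator process exits.
   */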
public static void decreaseRemainingApps() {
    remainingApps--;
    if (remainingApps == 0) {
      LOG.info("All applications finished; SLSRunner tearing down.");
System.exit(0);
}
}
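
  /**
   * Command-line entry point. An illustrative invocation (file names are
   * examples, not fixed defaults):
   * <pre>
   * java org.apache.hadoop.yarn.sls.SLSRunner -inputsls sls-jobs.json \
   *     -nodes sls-nodes.json -output sls-output \
   *     -trackjobs job_1,job_2 -printsimulation
   * </pre>
   */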
  public static void main(String[] args) throws Exception {
Options options = new Options();
options.addOption("inputrumen", true, "input rumen files");
options.addOption("inputsls", true, "input sls files");
options.addOption("nodes", true, "input topology");
options.addOption("output", true, "output directory");
options.addOption("trackjobs", true,
"jobs to be tracked during simulating");
options.addOption("printsimulation", false,
"print out simulation information");
CommandLineParser parser = new GnuParser();
CommandLine cmd = parser.parse(options, args);
String inputRumen = cmd.getOptionValue("inputrumen");
String inputSLS = cmd.getOptionValue("inputsls");
String output = cmd.getOptionValue("output");
if ((inputRumen == null && inputSLS == null) || output == null) {
System.err.println();
System.err.println("ERROR: Missing input or output file");
System.err.println();
System.err.println("Options: -inputrumen|-inputsls FILE,FILE... " +
"-output FILE [-nodes FILE] [-trackjobs JobId,JobId...] " +
"[-printsimulation]");
System.err.println();
System.exit(1);
}
File outputFile = new File(output);
    if (!outputFile.exists() && !outputFile.mkdirs()) {
System.err.println("ERROR: Cannot create output directory "
+ outputFile.getAbsolutePath());
System.exit(1);
}
Set<String> trackedJobSet = new HashSet<String>();
if (cmd.hasOption("trackjobs")) {
String trackjobs = cmd.getOptionValue("trackjobs");
      String[] jobIds = trackjobs.split(",");
trackedJobSet.addAll(Arrays.asList(jobIds));
}
String nodeFile = cmd.hasOption("nodes") ? cmd.getOptionValue("nodes") : "";
boolean isSLS = inputSLS != null;
    String[] inputFiles = isSLS ? inputSLS.split(",") : inputRumen.split(",");
SLSRunner sls = new SLSRunner(isSLS, inputFiles, nodeFile, output,
trackedJobSet, cmd.hasOption("printsimulation"));
sls.start();
}
}