/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.chukwa.inputtools.mdl;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.lang.Exception;
import java.lang.InterruptedException;
import java.lang.Process;
import java.lang.ProcessBuilder;
import java.lang.StringBuffer;
import java.lang.System;
import java.lang.Thread;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
import java.util.Timer;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.chukwa.inputtools.mdl.DataConfig;
import org.apache.hadoop.chukwa.inputtools.mdl.TorqueTimerTask;
import org.apache.hadoop.chukwa.inputtools.mdl.ErStreamHandler;
import org.apache.hadoop.chukwa.util.DatabaseWriter;
/**
 * Polls a Torque server over ssh and writes job information to the log.
 *
 * <p>Each cycle runs {@code qstat -a} to refresh the set of tracked jobs
 * (called "HOD jobs" throughout this class), then runs {@code qstat -f -1}
 * or {@code tracejob} for individual jobs depending on their state.
 *
 * <p>Per-job state is kept in {@link #currentHodJobs} as a map of string
 * attributes. The {@code process} attribute tracks how far a job has been
 * handled: {@code "0"} when first seen, {@code "1"} once its qstat details
 * have been logged, {@code "2"} once its tracejob lookup is finished.
 *
 * <p>Nothing in this class synchronizes access to its state; it is written
 * to be driven from a single thread via {@link #run_forever()}.
 */
public class TorqueInfoProcessor {

  private static final Log log = LogFactory.getLog(TorqueInfoProcessor.class);

  /** Attribute keys of the per-job map stored in {@link #currentHodJobs}. */
  private static final String KEY_USER_ID = "userId";
  private static final String KEY_NUM_OF_MACHINE = "numOfMachine";
  private static final String KEY_TRACE_CHECK_COUNT = "traceCheckCount";
  private static final String KEY_PROCESS = "process";
  private static final String KEY_STATUS = "status";

  /** Values of the {@code process} attribute. */
  private static final String PROCESS_NEW = "0";
  private static final String PROCESS_QSTAT_LOADED = "1";
  private static final String PROCESS_DONE = "2";

  /** Date layout parsed from {@code qstat -f}, e.g. "Tue Sep 9 23:44:29 2008". */
  private static final String QSTAT_DATE_PATTERN = "EEE MMM d HH:mm:ss yyyy";

  /** Number of unsuccessful tracejob lookups after which a job is given up on. */
  private static final int MAX_TRACE_CHECKS = 2;

  /** Minimum length of an exec_host entry accepted from qstat output. */
  private static final int QSTAT_MIN_HOST_ENTRY_LENGTH = 4;

  /** Minimum length of an exec_host entry accepted from tracejob output. */
  private static final int TRACEJOB_MIN_HOST_ENTRY_LENGTH = 3;

  private int intervalValue = 60;
  private String torqueServer = null;
  private String torqueBinDir = null;
  private String domain = null;
  private TreeMap<String, TreeMap<String, String>> currentHodJobs;

  /**
   * Creates a processor configured from the system properties
   * {@code TORQUE_SERVER}, {@code TORQUE_HOME} and {@code DOMAIN}.
   *
   * @param mdlConfig not used by this class; kept for caller compatibility
   * @param interval number of seconds to sleep between polling cycles
   */
  public TorqueInfoProcessor(DataConfig mdlConfig, int interval) {
    this.intervalValue = interval;
    torqueServer = System.getProperty("TORQUE_SERVER");
    torqueBinDir = System.getProperty("TORQUE_HOME") + File.separator + "bin";
    domain = System.getProperty("DOMAIN");
    currentHodJobs = new TreeMap<String, TreeMap<String, String>>();
  }

  /**
   * Does nothing; present for callers that invoke it.
   *
   * @param recover ignored
   * @throws Exception never thrown by this implementation
   */
  public void setup(boolean recover) throws Exception {
  }

  /** Builds the ssh argument vector that runs {@code remoteCommand} on the Torque server. */
  private String[] sshCommand(String remoteCommand) {
    return new String[] { "ssh", torqueServer, remoteCommand };
  }

  /** Joins the three elements of an ssh argument vector with spaces, for log messages. */
  private static String describe(String[] argv) {
    return argv[0] + " " + argv[1] + " " + argv[2];
  }

  /**
   * Waits for the stderr handler to finish. If the wait is interrupted the
   * interrupt flag is restored so that the polling loop can observe it.
   */
  private void joinErrorHandler(ErStreamHandler errorHandler) {
    try {
      errorHandler.join();
    } catch (InterruptedException ie) {
      log.error(ie.getMessage());
      Thread.currentThread().interrupt();
    }
  }

  /**
   * Releases everything tied to one remote command: cancels the watchdog
   * timer (otherwise its thread stays alive), closes the stdout reader and
   * destroys the child process in case it is still running.
   */
  private void release(Timer timeout, BufferedReader reader, Process process) {
    timeout.cancel();
    if (reader != null) {
      try {
        reader.close();
      } catch (IOException e) {
        log.warn("failed to close command output stream", e);
      }
    }
    process.destroy();
  }

  /**
   * Returns the trimmed text between the first and second {@code '='} of
   * {@code line}, or {@code null} when the line has no value after an
   * {@code '='}.
   */
  private static String valueAfterEquals(String line) {
    String[] parts = line.split("=");
    return parts.length >= 2 ? parts[1].trim() : null;
  }

  /**
   * Parses the date value of a {@code key = value} line from qstat output.
   *
   * @return the parsed time in milliseconds, or {@code fallback} when the
   *         line has no value or the value cannot be parsed
   */
  private long parseQstatTime(SimpleDateFormat format, String line,
      long fallback) {
    String text = valueAfterEquals(line);
    if (text == null) {
      log.warn("no value found in qstat line: " + line);
      return fallback;
    }
    try {
      return format.parse(text).getTime();
    } catch (ParseException e) {
      log.error("unparseable date in qstat line: " + line, e);
      return fallback;
    }
  }

  /**
   * Removes the trailing part of one exec_host entry to obtain the host name.
   * Entries are presumably of the form {@code host/index}; the text is cut at
   * the last {@code '/'}. When there is no {@code '/'} the last two
   * characters are dropped, which is what this class has always done.
   */
  private static String stripCpuSuffix(String entry) {
    int slash = entry.lastIndexOf('/');
    if (slash >= 0) {
      return entry.substring(0, slash);
    }
    return entry.substring(0, entry.length() - 2);
  }

  /**
   * Logs one "HodId=..., Machine=..." line per entry of a '+'-separated host
   * list, skipping entries shorter than {@code minEntryLength}.
   *
   * @return the number of entries logged
   */
  private int logMachines(String hodId, String hosts, int minEntryLength) {
    int num = 0;
    String[] entries = hosts.split("[+]");
    for (int i = 0; i < entries.length; i++) {
      String entry = entries[i];
      if (entry.length() < minEntryLength) {
        continue;
      }
      StringBuilder data = new StringBuilder();
      data.append("HodId=").append(hodId);
      data.append(", Machine=").append(stripCpuSuffix(entry));
      if (domain != null) {
        data.append(".").append(domain);
      }
      log.info(data.toString());
      num++;
    }
    return num;
  }

  /**
   * Runs {@code qstat -a} on the Torque server and reconciles
   * {@link #currentHodJobs} with its output.
   *
   * <p>Rows after the first line starting with "---" that have at least ten
   * whitespace-separated columns are treated as jobs: new ones are added
   * with {@code process="0"}, known ones get their status refreshed. Tracked
   * jobs that no longer appear are marked with status "C" while their
   * {@code process} is "0" or "1", and dropped otherwise.
   *
   * @throws IOException if the command cannot be started or its output
   *         cannot be read; in that case no job is marked or dropped
   */
  private void getHodJobInfo() throws IOException {
    String[] getQueueInfoCommand = sshCommand(torqueBinDir + "/qstat -a");
    String command = describe(getQueueInfoCommand);
    TreeSet<String> jobsInTorque = new TreeSet<String>();

    Process p = new ProcessBuilder(getQueueInfoCommand).start();
    Timer timeout = new Timer();
    BufferedReader result = null;
    try {
      timeout.schedule(new TorqueTimerTask(p, command),
          TorqueTimerTask.timeoutInterval * 1000);
      result = new BufferedReader(new InputStreamReader(p.getInputStream()));
      ErStreamHandler errorHandler = new ErStreamHandler(p.getErrorStream(),
          command, true);
      errorHandler.start();

      boolean start = false;
      String line;
      while ((line = result.readLine()) != null) {
        if (line.startsWith("---")) {
          start = true;
          continue;
        }
        if (!start) {
          continue;
        }
        String[] items = line.split("\\s+");
        if (items.length < 10) {
          continue;
        }
        // The job id is the first column up to its first '.'.
        String hodIdLong = items[0];
        int dot = hodIdLong.indexOf('.');
        String hodId = dot < 0 ? hodIdLong : hodIdLong.substring(0, dot);
        String status = items[9];
        jobsInTorque.add(hodId);

        TreeMap<String, String> aJobData = currentHodJobs.get(hodId);
        if (aJobData == null) {
          aJobData = new TreeMap<String, String>();
          aJobData.put(KEY_USER_ID, items[1]);
          aJobData.put(KEY_NUM_OF_MACHINE, items[5]);
          aJobData.put(KEY_TRACE_CHECK_COUNT, "0");
          aJobData.put(KEY_PROCESS, PROCESS_NEW);
          currentHodJobs.put(hodId, aJobData);
        }
        aJobData.put(KEY_STATUS, status);
      }
      joinErrorHandler(errorHandler);
    } finally {
      release(timeout, result, p);
    }

    // Only reached after a complete read, so a failed command never causes
    // jobs to be marked as finished.
    Iterator<String> trackedIds = currentHodJobs.keySet().iterator();
    while (trackedIds.hasNext()) {
      String hodId = trackedIds.next();
      if (jobsInTorque.contains(hodId)) {
        continue;
      }
      TreeMap<String, String> aJobData = currentHodJobs.get(hodId);
      String process = aJobData.get(KEY_PROCESS);
      if (PROCESS_NEW.equals(process) || PROCESS_QSTAT_LOADED.equals(process)) {
        aJobData.put(KEY_STATUS, "C");
      } else {
        trackedIds.remove();
      }
    }
  }

  /**
   * Runs {@code qstat -f -1 <hodId>} and logs the job's hosts and timing.
   *
   * <p>Lines containing "ctime", "mtime", "etime" and "exec_host" supply the
   * start time, end time, execute time and host list. End and execute time
   * default to the current time when absent or unparseable.
   *
   * @param hodId id of a job present in {@link #currentHodJobs}
   * @return {@code true} if both a host list and a start time were found and
   *         the job was logged, {@code false} otherwise
   * @throws IOException if the command cannot be started or read
   * @throws SQLException never thrown by this implementation; declared for
   *         caller compatibility
   */
  private boolean loadQstatData(String hodId) throws IOException, SQLException {
    TreeMap<String, String> aJobData = currentHodJobs.get(hodId);
    String userId = aJobData.get(KEY_USER_ID);
    String[] qstatCommand = sshCommand(torqueBinDir + "/qstat -f -1 " + hodId);
    String command = describe(qstatCommand);
    boolean qstatfinished = false;

    Process p = new ProcessBuilder(qstatCommand).start();
    Timer timeout = new Timer();
    BufferedReader result = null;
    try {
      timeout.schedule(new TorqueTimerTask(p, command),
          TorqueTimerTask.timeoutInterval * 1000);
      result = new BufferedReader(new InputStreamReader(p.getInputStream()));
      ErStreamHandler errorHandler = new ErStreamHandler(p.getErrorStream(),
          command, false);
      errorHandler.start();

      // Day and month names in the expected layout are English, so the
      // parser must not depend on the JVM's default locale.
      SimpleDateFormat sdf = new SimpleDateFormat(QSTAT_DATE_PATTERN,
          Locale.ENGLISH);
      String hosts = null;
      long startTimeValue = -1;
      long now = System.currentTimeMillis();
      long endTimeValue = now;
      long executeTimeValue = now;

      String line;
      while ((line = result.readLine()) != null) {
        if (line.indexOf("ctime") >= 0) {
          startTimeValue = parseQstatTime(sdf, line, startTimeValue);
        }
        if (line.indexOf("mtime") >= 0) {
          endTimeValue = parseQstatTime(sdf, line, endTimeValue);
        }
        if (line.indexOf("etime") >= 0) {
          executeTimeValue = parseQstatTime(sdf, line, executeTimeValue);
        }
        if (line.indexOf("exec_host") >= 0) {
          String value = valueAfterEquals(line);
          if (value != null) {
            hosts = value;
          }
        }
      }

      if (hosts != null && startTimeValue >= 0) {
        int num = logMachines(hodId, hosts, QSTAT_MIN_HOST_ENTRY_LENGTH);
        StringBuilder data = new StringBuilder();
        data.append("HodID=").append(hodId);
        data.append(", UserId=").append(userId);
        data.append(", StartTime=").append(new Timestamp(startTimeValue));
        data.append(", TimeQueued=").append(executeTimeValue - startTimeValue);
        data.append(", NumOfMachines=").append(num);
        data.append(", EndTime=").append(new Timestamp(endTimeValue));
        log.info(data.toString());
        qstatfinished = true;
      }
      joinErrorHandler(errorHandler);
    } finally {
      release(timeout, result, p);
    }
    return qstatfinished;
  }

  /** Tells whether a tracejob output line looks like the job's end record. */
  private static boolean isJobEndRecord(String line) {
    return line.indexOf("end") >= 0 && line.indexOf("Exit_status") >= 0
        && line.indexOf("qtime") >= 0;
  }

  /**
   * Parses the {@code key=value} tokens of a tracejob end record and logs the
   * job's hosts and summary. The ctime and end values are rounded down to a
   * multiple of 60 and multiplied by 1000 before being turned into
   * timestamps.
   *
   * @return {@code true} if the record was logged; {@code false} if a needed
   *         field (ctime, qtime, start, end, exec_host) is missing or not a
   *         number, in which case nothing is logged except a warning
   */
  private boolean reportTraceRecord(String hodId, String userId, String line) {
    TreeMap<String, String> jobData = new TreeMap<String, String>();
    String[] tokens = line.split("\\s+");
    for (int i = 0; i < tokens.length; i++) {
      String[] keyValue = tokens[i].split("=");
      if (keyValue.length >= 2) {
        jobData.put(keyValue[0], keyValue[1]);
      }
    }

    String startTime = jobData.get("ctime");
    String queueTime = jobData.get("qtime");
    String sTime = jobData.get("start");
    String endTime = jobData.get("end");
    String hosts = jobData.get("exec_host");
    if (startTime == null || queueTime == null || sTime == null
        || endTime == null || hosts == null) {
      log.warn("incomplete tracejob record for job " + hodId + ": " + line);
      return false;
    }

    long startTimeValue;
    long endTimeValue;
    long timeQueued;
    try {
      startTimeValue = Long.parseLong(startTime);
      endTimeValue = Long.parseLong(endTime);
      timeQueued = Long.parseLong(sTime) - Long.parseLong(queueTime);
    } catch (NumberFormatException nfe) {
      log.warn("malformed tracejob record for job " + hodId + ": " + line, nfe);
      return false;
    }
    startTimeValue = startTimeValue - startTimeValue % 60;
    endTimeValue = endTimeValue - endTimeValue % 60;

    int num = logMachines(hodId, hosts, TRACEJOB_MIN_HOST_ENTRY_LENGTH);
    StringBuilder data = new StringBuilder();
    data.append("HodID=").append(hodId);
    data.append(", UserId=").append(userId);
    data.append(", Status=").append(jobData.get("Exit_status"));
    data.append(", TimeQueued=").append(timeQueued);
    data.append(", StartTime=").append(new Timestamp(startTimeValue * 1000));
    data.append(", EndTime=").append(new Timestamp(endTimeValue * 1000));
    data.append(", NumOfMachines=").append(num);
    log.info(data.toString());
    log.debug(" hod info for job " + hodId + " has been loaded ");
    return true;
  }

  /**
   * Runs {@code tracejob -n 10 -l -m -s <hodId>} and logs the first usable
   * end record found in its output.
   *
   * <p>When no record is found the job's {@code traceCheckCount} is
   * incremented; the lookup is considered finished once that count reaches
   * {@value #MAX_TRACE_CHECKS}.
   *
   * @param hodId id of a job present in {@link #currentHodJobs}
   * @return {@code true} if a record was logged or the retry limit has been
   *         reached, {@code false} if the lookup should be retried later
   * @throws IOException if the command cannot be started or read
   * @throws SQLException never thrown by this implementation; declared for
   *         caller compatibility
   */
  private boolean loadTraceJobData(String hodId) throws IOException,
      SQLException {
    TreeMap<String, String> aJobData = currentHodJobs.get(hodId);
    String userId = aJobData.get(KEY_USER_ID);
    String[] traceJobCommand = sshCommand(torqueBinDir
        + "/tracejob -n 10 -l -m -s " + hodId);
    String command = describe(traceJobCommand);
    boolean findResult = false;

    Process p = new ProcessBuilder(traceJobCommand).start();
    Timer timeout = new Timer();
    BufferedReader result = null;
    try {
      timeout.schedule(new TorqueTimerTask(p, command),
          TorqueTimerTask.timeoutInterval * 1000);
      result = new BufferedReader(new InputStreamReader(p.getInputStream()));
      ErStreamHandler errorHandler = new ErStreamHandler(p.getErrorStream(),
          command, false);
      errorHandler.start();

      String line;
      while ((line = result.readLine()) != null) {
        // Keep consuming stdout after the record has been found so the child
        // is not left blocked on a full pipe while we wait on its stderr.
        if (!findResult && isJobEndRecord(line)) {
          findResult = reportTraceRecord(hodId, userId, line);
        }
      }
      joinErrorHandler(errorHandler);
    } finally {
      release(timeout, result, p);
    }

    if (findResult) {
      return true;
    }
    int traceCheckCountValue =
        Integer.parseInt(aJobData.get(KEY_TRACE_CHECK_COUNT)) + 1;
    aJobData.put(KEY_TRACE_CHECK_COUNT, String.valueOf(traceCheckCountValue));
    log.debug("did not find tracejob info for job " + hodId + ", after "
        + traceCheckCountValue + " times checking");
    return traceCheckCountValue >= MAX_TRACE_CHECKS;
  }

  /**
   * Advances every tracked job by at most one step: jobs with
   * {@code process="0"} and status "R" or "E" get their qstat details loaded
   * (moving to "1" on success); jobs with status "C" whose {@code process} is
   * not yet "2" get a tracejob lookup (moving to "2" when finished).
   * An {@link IOException} for one job is logged and does not stop the pass.
   *
   * @throws SQLException if one of the loaders throws it
   */
  private void process_data() throws SQLException {
    for (String hodId : currentHodJobs.keySet()) {
      TreeMap<String, String> aJobData = currentHodJobs.get(hodId);
      // Both values are read once, before either loader can change them.
      String status = aJobData.get(KEY_STATUS);
      String process = aJobData.get(KEY_PROCESS);

      if (process.equals(PROCESS_NEW)
          && (status.equals("R") || status.equals("E"))) {
        try {
          if (loadQstatData(hodId)) {
            aJobData.put(KEY_PROCESS, PROCESS_QSTAT_LOADED);
          }
        } catch (IOException ioe) {
          log.error("load qsat data Error:" + ioe.getMessage(), ioe);
        }
      }

      if (!process.equals(PROCESS_DONE) && status.equals("C")) {
        try {
          if (loadTraceJobData(hodId)) {
            aJobData.put(KEY_PROCESS, PROCESS_DONE);
          }
        } catch (IOException ioe) {
          log.error("loadTraceJobData Error:" + ioe.getMessage(), ioe);
        }
      }
    }
  }

  /**
   * Runs one polling cycle: refreshes the job list, then processes the jobs.
   * If the job list cannot be refreshed the error is logged and the cycle is
   * skipped.
   *
   * @throws SQLException if processing the jobs throws it (logged first)
   */
  private void handle_jobData() throws SQLException {
    try {
      getHodJobInfo();
    } catch (IOException ex) {
      log.error("getQueueInfo Error:" + ex.getMessage(), ex);
      return;
    }
    try {
      process_data();
    } catch (SQLException ex) {
      log.error("process_data Error:" + ex.getMessage(), ex);
      throw ex;
    }
  }

  /**
   * Runs polling cycles separated by the configured interval until the
   * calling thread is interrupted or a cycle fails with an
   * {@link SQLException}. On interruption the thread's interrupt flag is left
   * set and the method returns.
   *
   * @throws SQLException if a polling cycle throws it
   */
  public void run_forever() throws SQLException {
    while (true) {
      handle_jobData();
      try {
        log.debug("sleeping ...");
        // long arithmetic: a large interval must not overflow int
        Thread.sleep(this.intervalValue * 1000L);
      } catch (InterruptedException e) {
        log.error(e.getMessage());
        Thread.currentThread().interrupt();
        return;
      }
    }
  }

  /** Does nothing; present for callers that invoke it. */
  public void shutdown() {
  }
}