/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.mapreduce.hadoop;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.TezYARNUtils;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.mapreduce.combine.MRCombiner;
import org.apache.tez.mapreduce.partition.MRPartitioner;
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration;
/**
* This class contains helper methods for frameworks which migrate from MapReduce to Tez, and need
* to continue to work with existing MapReduce configurations.
*/
@Public
@Evolving
public class MRHelpers {

  private static final Logger LOG = LoggerFactory.getLogger(MRHelpers.class);

  /** Flag selecting between the new (mapreduce) and old (mapred) mapper API. */
  private static final String NEW_API_MAPPER_KEY = "mapred.mapper.new-api";

  /** Old-API (mapred) key holding the mapper class. */
  private static final String OLD_API_MAPPER_CLASS_KEY = "mapred.mapper.class";

  /**
   * Translate MapReduce configuration keys to the equivalent Tez keys in the provided
   * configuration. The translation is done in place.
   * <p>
   * This method is meant to be used by frameworks which rely upon existing MapReduce
   * configuration instead of setting up their own.
   * <p>
   * Equivalent to calling {@link #translateMRConfToTez(Configuration, boolean)} with
   * {@code preferTez} set to {@code true}.
   *
   * @param conf mr based configuration to be translated to tez
   */
  public static void translateMRConfToTez(Configuration conf) {
    translateMRConfToTez(conf, true);
  }

  /**
   * Translate MapReduce configuration keys to the equivalent Tez keys in the provided
   * configuration. The translation is done in place.
   * <p>
   * This method is meant to be used by frameworks which rely upon existing MapReduce
   * configuration instead of setting up their own.
   *
   * @param conf mr based configuration to be translated to tez
   * @param preferTez If the tez setting already exists and is set, use the Tez setting. When
   *                  {@code false}, the MapReduce value overrides an existing Tez value.
   */
  public static void translateMRConfToTez(Configuration conf, boolean preferTez) {
    convertVertexConfToTez(conf, preferTez);
  }

  /**
   * Update the provided configuration to use the new API (mapreduce) or the old API (mapred).
   * <p>
   * If {@code mapred.mapper.new-api} is not already set, it is set to {@code true} when
   * {@code mapred.mapper.class} is absent, i.e. the new API is the default. The method then
   * verifies that the input format and mapper keys belonging to the API which is <em>not</em>
   * in use are not present.
   * <p>
   * This method should be invoked after completely setting up the configuration.
   *
   * @param conf configuration to update and validate
   * @throws TezUncheckedException if a key belonging to the other API is set
   */
  public static void configureMRApiUsage(Configuration conf) {
    conf.setBooleanIfUnset(NEW_API_MAPPER_KEY, conf.get(OLD_API_MAPPER_CLASS_KEY) == null);
    try {
      if (conf.getBoolean(NEW_API_MAPPER_KEY, false)) {
        String mode = "new map API";
        ensureNotSet(conf, "mapred.input.format.class", mode);
        ensureNotSet(conf, OLD_API_MAPPER_CLASS_KEY, mode);
      } else {
        String mode = "map compatability";
        ensureNotSet(conf, MRJobConfig.INPUT_FORMAT_CLASS_ATTR, mode);
        ensureNotSet(conf, MRJobConfig.MAP_CLASS_ATTR, mode);
      }
    } catch (IOException e) {
      // Surface the incompatibility as an unchecked exception, preserving the cause.
      throw new TezUncheckedException(e);
    }
  }

  /**
   * Runs the full MR-to-Tez translation on a single configuration: fills in the map output
   * key/value classes, converts directly mapped keys, then wires up the MR partitioner and
   * combiner.
   */
  private static void convertVertexConfToTez(Configuration vertexConf, boolean preferTez) {
    // The same configuration acts as both the stage and the base configuration here.
    setStageKeysFromBaseConf(vertexConf, vertexConf, "unknown");
    processDirectConversion(vertexConf, preferTez);
    setupMRComponents(vertexConf);
  }

  /**
   * Sets the Tez runtime partitioner and combiner classes to their MR-backed implementations,
   * unless the corresponding Tez keys are already set. The combiner is configured only if an MR
   * combiner is present under the key matching the API in use.
   */
  private static void setupMRComponents(Configuration conf) {
    if (conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS) == null) {
      conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS,
          MRPartitioner.class.getName());
    }

    if (conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINER_CLASS) == null) {
      boolean useNewApi = conf.getBoolean(NEW_API_MAPPER_KEY, false);
      String mrCombinerKey = useNewApi ? MRJobConfig.COMBINE_CLASS_ATTR : "mapred.combiner.class";
      if (conf.get(mrCombinerKey) != null) {
        conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINER_CLASS, MRCombiner.class.getName());
      }
    }
  }

  /**
   * Pulls in specific keys from the base configuration, if they are not set at
   * the stage level. An explicit list of keys is copied over (not all), which
   * require translation to tez keys.
   *
   * @param conf stage level configuration to update
   * @param baseConf configuration from which missing values are derived
   * @param stage stage name, used only in debug logging
   */
  private static void setStageKeysFromBaseConf(Configuration conf,
      Configuration baseConf, String stage) {
    // Created lazily, and at most once, since it is only needed when a value has to be derived.
    JobConf jobConf = null;

    // Don't clobber explicit tez config.
    if (conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS) == null
        && conf.get(MRJobConfig.MAP_OUTPUT_KEY_CLASS) == null) {
      // If this is set, but the comparator is not set, and their types differ -
      // the job will break.
      jobConf = new JobConf(baseConf);
      conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, jobConf.getMapOutputKeyClass().getName());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Setting {} for stage: {} based on job level configuration. Value: {}",
            MRJobConfig.MAP_OUTPUT_KEY_CLASS, stage, conf.get(MRJobConfig.MAP_OUTPUT_KEY_CLASS));
      }
    }

    if (conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS) == null
        && conf.get(MRJobConfig.MAP_OUTPUT_VALUE_CLASS) == null) {
      if (jobConf == null) {
        // Create jobConf if not already created
        jobConf = new JobConf(baseConf);
      }
      conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, jobConf.getMapOutputValueClass().getName());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Setting {} for stage: {} based on job level configuration. Value: {}",
            MRJobConfig.MAP_OUTPUT_VALUE_CLASS, stage,
            conf.get(MRJobConfig.MAP_OUTPUT_VALUE_CLASS));
      }
    }
  }

  /**
   * Moves every MR key listed in {@link DeprecatedKeys#getMRToTezRuntimeParamMap()} to its Tez
   * counterpart. The MR key is always unset. The Tez key receives the MR value when it was not
   * previously set, or when {@code preferTez} is {@code false}.
   */
  private static void processDirectConversion(Configuration conf, boolean preferTez) {
    for (Map.Entry<String, String> dep : DeprecatedKeys.getMRToTezRuntimeParamMap().entrySet()) {
      final String mrKey = dep.getKey();
      final String tezKey = dep.getValue();

      final String mrValue = conf.get(mrKey);
      if (mrValue == null) {
        continue;
      }

      // TODO Deprecation reason does not seem to reflect in the config ?
      // The ordering is important in case of keys which are also deprecated.
      // Unset will unset the deprecated keys and all of their variants, so both values are
      // read before the MR key is removed.
      final String tezValue = conf.get(tezKey);
      conf.unset(mrKey);

      if (tezValue == null) {
        conf.set(tezKey, mrValue, "TRANSLATED_TO_TEZ");
      } else if (!preferTez) {
        conf.set(tezKey, mrValue, "TRANSLATED_TO_TEZ_AND_MR_OVERRIDE");
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug(
            "Config: mr(unset):{}, mr initial value={}, tez(original):{}={}, tez(final):{}={}",
            mrKey, mrValue, tezKey, tezValue, tezKey, conf.get(tezKey));
      }
    }
  }

  /**
   * Returns the configured log level for map or reduce tasks, falling back to
   * {@link JobConf#DEFAULT_LOG_LEVEL}.
   */
  private static String getChildLogLevel(Configuration conf, boolean isMap) {
    String logLevelKey = isMap ? MRJobConfig.MAP_LOG_LEVEL : MRJobConfig.REDUCE_LOG_LEVEL;
    return conf.get(logLevelKey, JobConf.DEFAULT_LOG_LEVEL.toString());
  }

  /**
   * Verifies that the given attribute is absent from the configuration.
   *
   * @param conf configuration to check
   * @param attr key which must not be set
   * @param msg name of the mode the key conflicts with, used in the error message
   * @throws IOException if the attribute is set
   */
  private static void ensureNotSet(Configuration conf, String attr, String msg)
      throws IOException {
    if (conf.get(attr) != null) {
      throw new IOException(attr + " is incompatible with " + msg + " mode.");
    }
  }

  /**
   * Builds the log4j related JVM arguments for a map or reduce task as a single string in which
   * every argument is followed by a space.
   */
  private static String getLog4jCmdLineProperties(Configuration conf,
      boolean isMap) {
    Vector<String> logProps = new Vector<String>(4);
    TezUtils.addLog4jSystemProperties(getChildLogLevel(conf, isMap), logProps);
    StringBuilder sb = new StringBuilder();
    for (String str : logProps) {
      sb.append(str).append(" ");
    }
    return sb.toString();
  }

  /**
   * Shared implementation for the mapper and reducer JVM option builders. Concatenates the
   * trimmed admin options, the trimmed user options (falling back to the generic task java
   * options) and the log4j properties, separated by single spaces.
   *
   * @param conf configuration to read from
   * @param adminOptsKey key holding the admin supplied java options
   * @param userOptsKey key holding the user supplied java options
   * @param isMap whether the options are for a map task, which selects the log level key
   */
  @SuppressWarnings("deprecation")
  private static String getJavaOptsForMRTask(Configuration conf, String adminOptsKey,
      String userOptsKey, boolean isMap) {
    String adminOpts = conf.get(adminOptsKey, MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);
    String userOpts = conf.get(
        userOptsKey,
        conf.get(JobConf.MAPRED_TASK_JAVA_OPTS, JobConf.DEFAULT_MAPRED_TASK_JAVA_OPTS));
    return adminOpts.trim() + " " + userOpts.trim() + " "
        + getLog4jCmdLineProperties(conf, isMap);
  }

  /**
   * Generate JVM options based on MapReduce AM java options.
   * <p>
   * This is only meant to be used if frameworks are not setting up their own java options or
   * relying on the defaults specified by Tez, and instead want to use the options which may
   * already have been configured for an MR AppMaster.
   * <p>
   * The result is the trimmed admin options followed by the trimmed user options, separated by
   * a single space.
   *
   * @param conf Configuration to be used to extract JVM opts specific info
   * @return JAVA_OPTS string to be used in launching the JVM
   */
  public static String getJavaOptsForMRAM(Configuration conf) {
    // Admin opts
    String mrAppMasterAdminOptions = conf.get(
        MRJobConfig.MR_AM_ADMIN_COMMAND_OPTS,
        MRJobConfig.DEFAULT_MR_AM_ADMIN_COMMAND_OPTS);
    // Add AM user command opts
    String mrAppMasterUserOptions = conf.get(
        MRJobConfig.MR_AM_COMMAND_OPTS,
        MRJobConfig.DEFAULT_MR_AM_COMMAND_OPTS);

    return mrAppMasterAdminOptions.trim() + " " + mrAppMasterUserOptions.trim();
  }

  /**
   * Generate JVM options based on MapReduce mapper java options.
   * <p>
   * This is only meant to be used if frameworks are not setting up their own java options,
   * and would like to fallback to using java options which may already be configured for
   * Hadoop MapReduce mappers.
   * <p>
   * Uses mapreduce.admin.map.child.java.opts, mapreduce.map.java.opts and
   * mapreduce.map.log.level from config to generate the opts.
   *
   * @param conf Configuration to be used to extract JVM opts specific info
   * @return JAVA_OPTS string to be used in launching the JVM
   */
  @SuppressWarnings("deprecation")
  public static String getJavaOptsForMRMapper(Configuration conf) {
    return getJavaOptsForMRTask(conf, MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS,
        MRJobConfig.MAP_JAVA_OPTS, true);
  }

  /**
   * Generate JVM options based on MapReduce reducer java options.
   * <p>
   * This is only meant to be used if frameworks are not setting up their own java options,
   * and would like to fallback to using java options which may already be configured for
   * Hadoop MapReduce reducers.
   * <p>
   * Uses mapreduce.admin.reduce.child.java.opts, mapreduce.reduce.java.opts
   * and mapreduce.reduce.log.level from config to generate the opts.
   *
   * @param conf Configuration to be used to extract JVM opts specific info
   * @return JAVA_OPTS string to be used in launching the JVM
   */
  @SuppressWarnings("deprecation")
  public static String getJavaOptsForMRReducer(Configuration conf) {
    return getJavaOptsForMRTask(conf, MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS,
        MRJobConfig.REDUCE_JAVA_OPTS, false);
  }

  /**
   * Extract the container resource requirements from the provided configuration, which would
   * otherwise have been used when running a Hadoop MapReduce mapper.
   * <p>
   * This is only meant to be used if frameworks are not setting up their own {@link
   * org.apache.hadoop.yarn.api.records.Resource} and would like to fallback to using resources
   * which may already be configured for Hadoop MapReduce mappers.
   * <p>
   * Reads {@link MRJobConfig#MAP_MEMORY_MB} and {@link MRJobConfig#MAP_CPU_VCORES} from the
   * provided configuration, falling back to their MapReduce defaults.
   *
   * @param conf Configuration with MR specific settings used to extract
   *             information from
   * @return Resource object used to define requirements for containers
   *         running Map tasks
   */
  public static Resource getResourceForMRMapper(Configuration conf) {
    int memoryMb = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
    int vcores = conf.getInt(MRJobConfig.MAP_CPU_VCORES, MRJobConfig.DEFAULT_MAP_CPU_VCORES);
    return Resource.newInstance(memoryMb, vcores);
  }

  /**
   * Extract the container resource requirements from the provided configuration, which would
   * otherwise have been used when running a Hadoop MapReduce reducer.
   * <p>
   * This is only meant to be used if frameworks are not setting up their own {@link
   * org.apache.hadoop.yarn.api.records.Resource} and would like to fallback to using resources
   * which may already be configured for Hadoop MapReduce reducers.
   * <p>
   * Uses mapreduce.reduce.memory.mb and mapreduce.reduce.cpu.vcores from the
   * provided configuration.
   *
   * @param conf Configuration with MR specific settings used to extract
   *             information from
   * @return Resource object used to define requirements for containers
   *         running Reduce tasks
   */
  public static Resource getResourceForMRReducer(Configuration conf) {
    int memoryMb = conf.getInt(
        MRJobConfig.REDUCE_MEMORY_MB, MRJobConfig.DEFAULT_REDUCE_MEMORY_MB);
    int vcores = conf.getInt(
        MRJobConfig.REDUCE_CPU_VCORES, MRJobConfig.DEFAULT_REDUCE_CPU_VCORES);
    return Resource.newInstance(memoryMb, vcores);
  }

  /**
   * Setup environment variables based on the configured values for MR Mappers or Reducers.
   * <p>
   * Sets the shell, adds the working directory to {@code LD_LIBRARY_PATH}, appends the admin
   * supplied and then the user supplied environment settings, and finally sets
   * {@code HADOOP_ROOT_LOGGER} from the configured task log level.
   *
   * @param conf Configuration to retrieve settings from
   * @param environment Environment to update
   * @param isMap Whether task is a map or reduce task
   */
  public static void updateEnvBasedOnMRTaskEnv(Configuration conf,
      Map<String, String> environment, boolean isMap) {
    // Shell
    environment.put(Environment.SHELL.name(),
        conf.get(MRJobConfig.MAPRED_ADMIN_USER_SHELL, MRJobConfig.DEFAULT_SHELL));

    // Add pwd to LD_LIBRARY_PATH, add this before adding anything else
    TezYARNUtils.addToEnvironment(environment, Environment.LD_LIBRARY_PATH.name(),
        Environment.PWD.$(), File.pathSeparator);

    // Add the env variables passed by the admin
    String adminEnv = conf.get(
        MRJobConfig.MAPRED_ADMIN_USER_ENV, MRJobConfig.DEFAULT_MAPRED_ADMIN_USER_ENV);
    TezYARNUtils.appendToEnvFromInputString(environment, adminEnv, File.pathSeparator);

    // Add the env variables passed by the user; the task specific key wins over the
    // generic child env key.
    String taskEnvKey = isMap ? MRJobConfig.MAP_ENV : MRJobConfig.REDUCE_ENV;
    String mapredChildEnv = conf.get(taskEnvKey, conf.get("mapred.child.env"));
    TezYARNUtils.appendToEnvFromInputString(environment, mapredChildEnv, File.pathSeparator);

    // Set logging level in the environment.
    environment.put("HADOOP_ROOT_LOGGER", getChildLogLevel(conf, isMap) + ",CLA");
  }

  /**
   * Setup environment variables based on the configured values for the MR AM. The admin
   * supplied settings are appended first, followed by the user supplied settings.
   *
   * @param conf Configuration from which to extract information
   * @param environment Environment map to update
   */
  public static void updateEnvBasedOnMRAMEnv(Configuration conf,
      Map<String, String> environment) {
    TezYARNUtils.appendToEnvFromInputString(environment,
        conf.get(MRJobConfig.MR_AM_ADMIN_USER_ENV), File.pathSeparator);
    TezYARNUtils.appendToEnvFromInputString(environment,
        conf.get(MRJobConfig.MR_AM_ENV), File.pathSeparator);
  }
}