/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.hadoop;

import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Vector;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.TezYARNUtils;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.mapreduce.combine.MRCombiner;
import org.apache.tez.mapreduce.partition.MRPartitioner;
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration;

/**
 * Helper methods for frameworks which are migrating from MapReduce to Tez and
 * still need to work with existing MapReduce configurations.
 */
@Public
@Evolving
public class MRHelpers {

  private static final Logger LOG = LoggerFactory.getLogger(MRHelpers.class);

  /**
   * Translates MapReduce configuration keys into their Tez equivalents, in place,
   * keeping any Tez value that is already set.
   *
   * <p>Intended for frameworks which rely on an existing MapReduce configuration
   * instead of setting up their own. Equivalent to calling
   * {@link #translateMRConfToTez(Configuration, boolean)} with {@code preferTez = true}.
   *
   * @param conf mr based configuration to be translated to tez
   */
  public static void translateMRConfToTez(Configuration conf) {
    translateMRConfToTez(conf, true);
  }

  /**
   * Translates MapReduce configuration keys into their Tez equivalents, in place.
   *
   * <p>Intended for frameworks which rely on an existing MapReduce configuration
   * instead of setting up their own.
   *
   * @param conf mr based configuration to be translated to tez
   * @param preferTez if {@code true}, a Tez setting which already exists is kept;
   *                  if {@code false}, the MapReduce value overrides it
   */
  public static void translateMRConfToTez(Configuration conf, boolean preferTez) {
    convertVertexConfToTez(conf, preferTez);
  }

  /**
   * Marks the configuration as using either the new (mapreduce) or the old (mapred)
   * map API, and verifies that keys belonging to the other API are not present.
   *
   * <p>If {@code mapred.mapper.new-api} is not already set, it is set to {@code true}
   * exactly when {@code mapred.mapper.class} is absent, i.e. the new API is the default.
   *
   * <p>This method should be invoked after the configuration has been completely set up.
   *
   * @param conf configuration to update and validate
   * @throws TezUncheckedException if a key incompatible with the selected mode is set
   */
  public static void configureMRApiUsage(Configuration conf) {
    final String newApiFlag = "mapred.mapper.new-api";
    final String legacyMapperKey = "mapred.mapper.class";

    conf.setBooleanIfUnset(newApiFlag, conf.get(legacyMapperKey) == null);
    final boolean newApi = conf.getBoolean(newApiFlag, false);

    // Pick the description of the active mode and the keys that must NOT be present in it.
    final String mode = newApi ? "new map API" : "map compatability";
    final String forbiddenInputFormatKey =
        newApi ? "mapred.input.format.class" : MRJobConfig.INPUT_FORMAT_CLASS_ATTR;
    final String forbiddenMapperKey = newApi ? legacyMapperKey : MRJobConfig.MAP_CLASS_ATTR;

    try {
      ensureNotSet(conf, forbiddenInputFormatKey, mode);
      ensureNotSet(conf, forbiddenMapperKey, mode);
    } catch (IOException e) {
      throw new TezUncheckedException(e);
    }
  }

  /**
   * Runs the full MR-to-Tez translation on a single configuration: pulls in the
   * intermediate key/value classes, converts directly mapped keys, and wires up
   * the MR partitioner/combiner wrappers.
   */
  private static void convertVertexConfToTez(Configuration vertexConf, boolean preferTez) {
    setStageKeysFromBaseConf(vertexConf, vertexConf, "unknown");
    processDirectConversion(vertexConf, preferTez);
    setupMRComponents(vertexConf);
  }

  /**
   * Configures the Tez runtime partitioner and combiner to the MR wrappers, unless
   * they have been set explicitly. The combiner is only configured when the MR
   * combiner key matching the active API (new or old) is present.
   */
  private static void setupMRComponents(Configuration conf) {
    if (conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS) == null) {
      conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_PARTITIONER_CLASS,
          MRPartitioner.class.getName());
    }

    if (conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINER_CLASS) != null) {
      // An explicit Tez combiner wins; nothing more to do.
      return;
    }

    final boolean newApi = conf.getBoolean("mapred.mapper.new-api", false);
    final String mrCombinerKey = newApi ? MRJobConfig.COMBINE_CLASS_ATTR : "mapred.combiner.class";
    if (conf.get(mrCombinerKey) != null) {
      conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINER_CLASS, MRCombiner.class.getName());
    }
  }

  /**
   * Pulls in specific keys from the base configuration, if they are not set at
   * the stage level. Only an explicit list of keys which require translation to
   * tez keys is copied over (the map output key and value classes), not everything.
   */
  private static void setStageKeysFromBaseConf(Configuration conf,
      Configuration baseConf, String stage) {
    // Created lazily, and shared between the key and value handling below.
    JobConf baseJobConf = null;

    // Don't clobber explicit tez config.
    // If the tez key class is set, but the comparator is not set, and their types differ -
    // the job will break.
    final boolean keyClassMissing =
        conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS) == null
            && conf.get(MRJobConfig.MAP_OUTPUT_KEY_CLASS) == null;
    if (keyClassMissing) {
      // Pull this in from the baseConf.
      baseJobConf = new JobConf(baseConf);
      conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS,
          baseJobConf.getMapOutputKeyClass().getName());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Setting " + MRJobConfig.MAP_OUTPUT_KEY_CLASS
            + " for stage: " + stage
            + " based on job level configuration. Value: "
            + conf.get(MRJobConfig.MAP_OUTPUT_KEY_CLASS));
      }
    }

    final boolean valueClassMissing =
        conf.get(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS) == null
            && conf.get(MRJobConfig.MAP_OUTPUT_VALUE_CLASS) == null;
    if (valueClassMissing) {
      if (baseJobConf == null) {
        // Not created while handling the key class; create it now.
        baseJobConf = new JobConf(baseConf);
      }
      conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS,
          baseJobConf.getMapOutputValueClass().getName());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Setting " + MRJobConfig.MAP_OUTPUT_VALUE_CLASS
            + " for stage: " + stage
            + " based on job level configuration. Value: "
            + conf.get(MRJobConfig.MAP_OUTPUT_VALUE_CLASS));
      }
    }
  }

  /**
   * Moves every MR key listed in {@link DeprecatedKeys#getMRToTezRuntimeParamMap()}
   * that is present in the configuration over to its Tez counterpart. The MR key is
   * always unset; the Tez key receives the MR value when it had no value, or when
   * {@code preferTez} is {@code false}.
   */
  private static void processDirectConversion(Configuration conf, boolean preferTez) {
    for (Map.Entry<String, String> mapping
        : DeprecatedKeys.getMRToTezRuntimeParamMap().entrySet()) {
      final String mrKey = mapping.getKey();
      final String tezKey = mapping.getValue();

      final String mrValue = conf.get(mrKey);
      if (mrValue == null) {
        continue;
      }

      // TODO Deprecation reason does not seem to reflect in the config ?
      // The ordering is important in case of keys which are also deprecated.
      // Unset will unset the deprecated keys and all its variants.
      final String tezValue = conf.get(tezKey);
      conf.unset(mrKey);

      if (tezValue == null) {
        conf.set(tezKey, mrValue, "TRANSLATED_TO_TEZ");
      } else if (!preferTez) {
        conf.set(tezKey, mrValue, "TRANSLATED_TO_TEZ_AND_MR_OVERRIDE");
      }

      if (LOG.isDebugEnabled()) {
        LOG.debug("Config: mr(unset):" + mrKey + ", mr initial value="
            + mrValue
            + ", tez(original):" + tezKey + "=" + tezValue
            + ", tez(final):" + tezKey + "="
            + conf.get(tezKey));
      }
    }
  }

  /**
   * Returns the configured log level for map or reduce tasks, falling back to
   * {@link JobConf#DEFAULT_LOG_LEVEL} when the corresponding key is not set.
   */
  private static String getChildLogLevel(Configuration conf, boolean isMap) {
    final String levelKey = isMap ? MRJobConfig.MAP_LOG_LEVEL : MRJobConfig.REDUCE_LOG_LEVEL;
    return conf.get(levelKey, JobConf.DEFAULT_LOG_LEVEL.toString());
  }

  /**
   * Fails if {@code attr} has a value in the configuration.
   *
   * @param msg name of the mode the attribute conflicts with; used in the error message
   * @throws IOException if the attribute is set
   */
  private static void ensureNotSet(Configuration conf, String attr, String msg)
      throws IOException {
    if (conf.get(attr) == null) {
      return;
    }
    throw new IOException(attr + " is incompatible with " + msg + " mode.");
  }

  /**
   * Builds the log4j related JVM arguments for a map or reduce task as a single
   * string. Each argument, including the last one, is followed by one space.
   */
  private static String getLog4jCmdLineProperties(Configuration conf, boolean isMap) {
    final Vector<String> log4jArgs = new Vector<String>(4);
    TezUtils.addLog4jSystemProperties(getChildLogLevel(conf, isMap), log4jArgs);

    final StringBuilder joined = new StringBuilder();
    for (String arg : log4jArgs) {
      joined.append(arg);
      joined.append(' ');
    }
    return joined.toString();
  }

  /**
   * Generate JVM options based on MapReduce AM java options.
   *
   * <p>This is only meant to be used if frameworks are not setting up their own java
   * options or relying on the defaults specified by Tez, and instead want to use the
   * options which may already have been configured for an MR AppMaster.
   *
   * @param conf Configuration to be used to extract JVM opts specific info
   * @return JAVA_OPTS string to be used in launching the JVM: the trimmed admin options
   *         followed by the trimmed user options, separated by a single space
   */
  public static String getJavaOptsForMRAM(Configuration conf) {
    // Admin opts come first, followed by the AM user command opts.
    final String adminOpts = conf.get(
        MRJobConfig.MR_AM_ADMIN_COMMAND_OPTS,
        MRJobConfig.DEFAULT_MR_AM_ADMIN_COMMAND_OPTS);
    final String userOpts = conf.get(
        MRJobConfig.MR_AM_COMMAND_OPTS,
        MRJobConfig.DEFAULT_MR_AM_COMMAND_OPTS);

    return adminOpts.trim() + " " + userOpts.trim();
  }

  /**
   * Generate JVM options based on MapReduce mapper java options.
   *
   * <p>This is only meant to be used if frameworks are not setting up their own java
   * options, and would like to fallback to using java options which may already be
   * configured for Hadoop MapReduce mappers.
   *
   * <p>Uses mapreduce.admin.map.child.java.opts, mapreduce.map.java.opts and
   * mapreduce.map.log.level from config to generate the opts.
   *
   * @param conf Configuration to be used to extract JVM opts specific info
   * @return JAVA_OPTS string to be used in launching the JVM
   */
  public static String getJavaOptsForMRMapper(Configuration conf) {
    return assembleTaskJavaOpts(conf,
        MRJobConfig.MAPRED_MAP_ADMIN_JAVA_OPTS, MRJobConfig.MAP_JAVA_OPTS, true);
  }

  /**
   * Generate JVM options based on MapReduce reducer java options.
   *
   * <p>This is only meant to be used if frameworks are not setting up their own java
   * options, and would like to fallback to using java options which may already be
   * configured for Hadoop MapReduce reducers.
   *
   * <p>Uses mapreduce.admin.reduce.child.java.opts, mapreduce.reduce.java.opts
   * and mapreduce.reduce.log.level from config to generate the opts.
   *
   * @param conf Configuration to be used to extract JVM opts specific info
   * @return JAVA_OPTS string to be used in launching the JVM
   */
  public static String getJavaOptsForMRReducer(Configuration conf) {
    return assembleTaskJavaOpts(conf,
        MRJobConfig.MAPRED_REDUCE_ADMIN_JAVA_OPTS, MRJobConfig.REDUCE_JAVA_OPTS, false);
  }

  /**
   * Shared implementation for the mapper/reducer JVM option builders: trimmed admin
   * opts, trimmed user opts and the log4j properties, separated by single spaces.
   * The user opts fall back to the generic (deprecated) MR task java opts key.
   */
  @SuppressWarnings("deprecation")
  private static String assembleTaskJavaOpts(Configuration conf, String adminOptsKey,
      String userOptsKey, boolean isMap) {
    final String adminOpts = conf.get(adminOptsKey, MRJobConfig.DEFAULT_MAPRED_ADMIN_JAVA_OPTS);

    final String genericTaskOpts = conf.get(
        JobConf.MAPRED_TASK_JAVA_OPTS,
        JobConf.DEFAULT_MAPRED_TASK_JAVA_OPTS);
    final String userOpts = conf.get(userOptsKey, genericTaskOpts);

    return adminOpts.trim() + " " + userOpts.trim() + " "
        + getLog4jCmdLineProperties(conf, isMap);
  }

  /**
   * Extract the container resource requirements from the provided configuration, which
   * would otherwise have been used when running a Hadoop MapReduce mapper.
   *
   * <p>This is only meant to be used if frameworks are not setting up their own {@link
   * org.apache.hadoop.yarn.api.records.Resource} and would like to fallback to using
   * resources which may already be configured for Hadoop MapReduce mappers.
   *
   * <p>Reads {@link MRJobConfig#MAP_MEMORY_MB} and {@link MRJobConfig#MAP_CPU_VCORES},
   * using the corresponding MR defaults when they are not set.
   *
   * @param conf Configuration with MR specific settings used to extract
   *             information from
   * @return Resource object used to define requirements for containers
   *         running Map tasks
   */
  public static Resource getResourceForMRMapper(Configuration conf) {
    final int memoryMb =
        conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
    final int vcores =
        conf.getInt(MRJobConfig.MAP_CPU_VCORES, MRJobConfig.DEFAULT_MAP_CPU_VCORES);
    return Resource.newInstance(memoryMb, vcores);
  }

  /**
   * Extract the container resource requirements from the provided configuration, which
   * would otherwise have been used when running a Hadoop MapReduce reducer.
   *
   * <p>This is only meant to be used if frameworks are not setting up their own {@link
   * org.apache.hadoop.yarn.api.records.Resource} and would like to fallback to using
   * resources which may already be configured for Hadoop MapReduce reducers.
   *
   * <p>Uses mapreduce.reduce.memory.mb and mapreduce.reduce.cpu.vcores from the
   * provided configuration.
   *
   * @param conf Configuration with MR specific settings used to extract
   *             information from
   * @return Resource object used to define requirements for containers
   *         running Reduce tasks
   */
  public static Resource getResourceForMRReducer(Configuration conf) {
    final int memoryMb =
        conf.getInt(MRJobConfig.REDUCE_MEMORY_MB, MRJobConfig.DEFAULT_REDUCE_MEMORY_MB);
    final int vcores =
        conf.getInt(MRJobConfig.REDUCE_CPU_VCORES, MRJobConfig.DEFAULT_REDUCE_CPU_VCORES);
    return Resource.newInstance(memoryMb, vcores);
  }

  /**
   * Setup classpath and other environment variables based on the configured values for
   * MR Mappers or Reducers.
   *
   * @param conf Configuration to retrieve settings from
   * @param environment Environment to update
   * @param isMap Whether task is a map or reduce task
   */
  public static void updateEnvBasedOnMRTaskEnv(Configuration conf,
      Map<String, String> environment, boolean isMap) {
    // Shell
    final String shell =
        conf.get(MRJobConfig.MAPRED_ADMIN_USER_SHELL, MRJobConfig.DEFAULT_SHELL);
    environment.put(Environment.SHELL.name(), shell);

    // Add pwd to LD_LIBRARY_PATH, add this before adding anything else
    TezYARNUtils.addToEnvironment(environment, Environment.LD_LIBRARY_PATH.name(),
        Environment.PWD.$(), File.pathSeparator);

    // Add the env variables passed by the admin
    final String adminEnv = conf.get(
        MRJobConfig.MAPRED_ADMIN_USER_ENV,
        MRJobConfig.DEFAULT_MAPRED_ADMIN_USER_ENV);
    TezYARNUtils.appendToEnvFromInputString(environment, adminEnv, File.pathSeparator);

    // Add the env variables passed by the user; the task specific key falls back to
    // the generic child env key.
    final String taskEnvKey = isMap ? MRJobConfig.MAP_ENV : MRJobConfig.REDUCE_ENV;
    final String mapredChildEnv = conf.get(taskEnvKey, conf.get("mapred.child.env"));
    TezYARNUtils.appendToEnvFromInputString(environment, mapredChildEnv, File.pathSeparator);

    // Set logging level in the environment.
    environment.put("HADOOP_ROOT_LOGGER", getChildLogLevel(conf, isMap) + ",CLA");
  }

  /**
   * Setup environment variables based on the configured values for the MR AM.
   * The admin supplied environment is applied first, then the user supplied one.
   *
   * @param conf Configuration from which to extract information
   * @param environment Environment map to update
   */
  public static void updateEnvBasedOnMRAMEnv(Configuration conf,
      Map<String, String> environment) {
    final String[] envKeys = {MRJobConfig.MR_AM_ADMIN_USER_ENV, MRJobConfig.MR_AM_ENV};
    for (String envKey : envKeys) {
      TezYARNUtils.appendToEnvFromInputString(environment, conf.get(envKey),
          File.pathSeparator);
    }
  }
}