/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed under the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.hadoop;

import com.streamsets.pipeline.ClusterBinding;
import com.streamsets.pipeline.Utils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/**
 * {@link ClusterBinding} implementation that runs a StreamSets pipeline as a
 * map-only Hadoop MapReduce job. Expects exactly two program arguments (after
 * generic Hadoop options are stripped): the path to a pipeline properties file
 * and the JVM options string for map tasks.
 */
public class HadoopMapReduceBinding implements ClusterBinding {
  private final String[] args;
  private Properties properties;
  private Job job;
  // JVM heap options for map tasks (e.g. "-Xmx1g")
  private static final String MAPREDUCE_JAVA_OPTS = "mapreduce.map.java.opts";
  // Total physical memory in MB allotted to a map task by the framework
  static final String MAPREDUCE_MAP_MEMORY_MB = "mapreduce.map.memory.mb";

  public HadoopMapReduceBinding(String[] args) {
    this.args = args;
  }

  /**
   * Parses arguments, loads the pipeline properties into the Hadoop
   * {@link Configuration}, and builds a map-only {@link Job}.
   *
   * @throws IllegalArgumentException if the remaining arguments are not
   *     exactly {@code properties-file java-opts}
   * @throws Exception on I/O or job-construction failure
   */
  @Override
  public void init() throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] remainingArgs = parser.getRemainingArgs();
    properties = new Properties();
    if (remainingArgs.length != 2) {
      List<String> argsList = new ArrayList<>();
      for (String arg : remainingArgs) {
        argsList.add("'" + arg + "'");
      }
      throw new IllegalArgumentException("Error expected properties-file java-opts got: " + argsList);
    }
    String propertiesFile = remainingArgs[0];
    String javaOpts = remainingArgs[1];
    try (InputStream in = new FileInputStream(propertiesFile)) {
      properties.load(in);
      String dataFormat = Utils.getHdfsDataFormat(properties);
      String source = this.getClass().getSimpleName();
      // Copy every pipeline property into the Hadoop configuration, tagging
      // this class as the source for diagnostics.
      for (Object key : properties.keySet()) {
        String realKey = String.valueOf(key);
        String value = Utils.getPropertyNotNull(properties, realKey);
        conf.set(realKey, value, source);
      }
      // Derive the container memory ceiling from the requested heap so the
      // task is not killed by the NodeManager for exceeding physical memory.
      Integer mapMemoryMb = getMapMemoryMb(javaOpts, conf);
      if (mapMemoryMb != null) {
        conf.set(MAPREDUCE_MAP_MEMORY_MB, String.valueOf(mapMemoryMb));
      }
      conf.set(MAPREDUCE_JAVA_OPTS, javaOpts);
      // Speculative execution would run duplicate pipeline instances.
      conf.setBoolean("mapreduce.map.speculative", false);
      conf.setBoolean("mapreduce.reduce.speculative", false);
      if ("AVRO".equalsIgnoreCase(dataFormat)) {
        conf.set(Job.INPUT_FORMAT_CLASS_ATTR, "org.apache.avro.mapreduce.AvroKeyInputFormat");
        conf.set(Job.MAP_OUTPUT_KEY_CLASS, "org.apache.avro.mapred.AvroKey");
      }
      job = Job.getInstance(conf, "StreamSets Data Collector - Batch Execution Mode");
      job.setJarByClass(this.getClass());
      job.setNumReduceTasks(0);
      if (!"AVRO".equalsIgnoreCase(dataFormat)) {
        job.setOutputKeyClass(NullWritable.class);
      }
      job.setMapperClass(PipelineMapper.class);
      job.setOutputValueClass(NullWritable.class);
      job.setOutputFormatClass(NullOutputFormat.class);
    }
  }

  /**
   * Computes the map-task container memory (MB) implied by the {@code -Xmx}
   * setting in {@code javaOpts}: the heap size plus 25% headroom, since
   * {@link #MAPREDUCE_MAP_MEMORY_MB} bounds total physical memory, not just
   * heap. When multiple {@code -Xmx} options are present the last one wins
   * (matching JVM semantics). The result is never lower than any value
   * already configured for {@link #MAPREDUCE_MAP_MEMORY_MB}.
   *
   * <p>Supports suffixes k/K (KB), m/M (MB), g/G (GB) and bare byte counts.
   *
   * @param javaOpts space-separated JVM options string
   * @param conf configuration possibly carrying an existing memory setting
   * @return the memory ceiling in MB, or {@code null} if no usable
   *     {@code -Xmx} option was found
   */
  // visible for testing (can't annotate as can't depend on Guava)
  static Integer getMapMemoryMb(String javaOpts, Configuration conf) {
    Integer upperLimitMemory = null;
    for (String opt : javaOpts.split(" ")) {
      // Locate "-Xmx" wherever it appears in the token; the original code
      // used contains() but then sliced from index 4, mis-parsing tokens
      // where the option is not at the start.
      int idx = opt.indexOf("-Xmx");
      if (idx < 0) {
        continue;
      }
      String size = opt.substring(idx + 4);
      if (size.isEmpty()) {
        // Malformed "-Xmx" with no value; previously this threw
        // NumberFormatException out of init(). Skip it instead.
        continue;
      }
      int memoryMb;
      // Parse only after the unit suffix is known; the previous eager parse
      // of size minus its last char broke single-digit byte values ("-Xmx5").
      switch (size.charAt(size.length() - 1)) {
        case 'm':
        case 'M':
          memoryMb = Integer.parseInt(size.substring(0, size.length() - 1));
          break;
        case 'k':
        case 'K':
          memoryMb = Integer.parseInt(size.substring(0, size.length() - 1)) / 1024;
          break;
        case 'g':
        case 'G':
          memoryMb = Integer.parseInt(size.substring(0, size.length() - 1)) * 1024;
          break;
        default:
          // No suffix: the JVM treats the value as bytes.
          memoryMb = Integer.parseInt(size) / (1024 * 1024);
          break;
      }
      // Add 25% to Java heap as MAP_MEMORY_MB is the total physical memory for the map task
      upperLimitMemory = ((int) (memoryMb * 0.25)) + memoryMb;
      // dont break as there could be multiple -Xmx, we need to honor the last
    }
    if (upperLimitMemory != null) {
      String defaultMapMemoryString = conf.get(MAPREDUCE_MAP_MEMORY_MB);
      if (defaultMapMemoryString != null) {
        // Never shrink an explicitly configured container size.
        upperLimitMemory = Math.max(upperLimitMemory, Integer.parseInt(defaultMapMemoryString));
      }
    }
    return upperLimitMemory;
  }

  /** Blocks until the MapReduce job completes. */
  @Override
  public void awaitTermination() throws Exception {
    job.waitForCompletion(true); // killed by ClusterProviderImpl before returning
  }

  /** Kills the job if it was started; safe to call before {@link #init()}. */
  @Override
  public void close() throws Exception {
    if (job != null) {
      job.killJob();
    }
  }
}