/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.mapreduce.output;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.tez.common.TezExecutors;
import org.apache.tez.common.TezSharedExecutor;
import org.apache.tez.common.counters.TezCounters;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.hadoop.shim.DefaultHadoopShim;
import org.apache.tez.mapreduce.TestUmbilical;
import org.apache.tez.mapreduce.TezTestUtils;
import org.apache.tez.mapreduce.hadoop.MRConfig;
import org.apache.tez.runtime.LogicalIOProcessorRuntimeTask;
import org.apache.tez.runtime.api.OutputContext;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.api.impl.ExecutionContextImpl;
import org.apache.tez.runtime.api.impl.InputSpec;
import org.apache.tez.runtime.api.impl.OutputSpec;
import org.apache.tez.runtime.api.impl.TaskSpec;
import org.apache.tez.runtime.api.impl.TezUmbilical;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.processor.SimpleProcessor;
import org.junit.Ignore;
import org.junit.Test;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Lists;
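/**
 * Unit tests for {@link MROutput} initialization: verifies that the correct
 * output format, task attempt context, record writer, and committer are wired
 * up for both the old (mapred) and new (mapreduce) APIs, including output
 * formats that rely on the work output path.
 */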
public class TestMROutput {
@Test(timeout = 5000)
public void testNewAPI_TextOutputFormat() throws Exception {
String outputPath = "/tmp/output";
Configuration conf = new Configuration();
conf.setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
DataSinkDescriptor dataSink = MROutput
.createConfigBuilder(conf, TextOutputFormat.class, outputPath)
.build();
OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
MROutput output = new MROutput(outputContext, 2);
output.initialize();
    assertTrue(output.isMapperOutput);
    assertTrue(output.useNewApi);
assertEquals(TextOutputFormat.class, output.newOutputFormat.getClass());
assertNull(output.oldOutputFormat);
assertNotNull(output.newApiTaskAttemptContext);
assertNull(output.oldApiTaskAttemptContext);
assertNotNull(output.newRecordWriter);
assertNull(output.oldRecordWriter);
assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
@Test(timeout = 5000)
public void testOldAPI_TextOutputFormat() throws Exception {
String outputPath = "/tmp/output";
Configuration conf = new Configuration();
conf.setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
DataSinkDescriptor dataSink = MROutput
.createConfigBuilder(conf, org.apache.hadoop.mapred.TextOutputFormat.class, outputPath)
.build();
OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
MROutput output = new MROutput(outputContext, 2);
output.initialize();
    assertFalse(output.isMapperOutput);
    assertFalse(output.useNewApi);
assertEquals(org.apache.hadoop.mapred.TextOutputFormat.class, output.oldOutputFormat.getClass());
assertNull(output.newOutputFormat);
assertNotNull(output.oldApiTaskAttemptContext);
assertNull(output.newApiTaskAttemptContext);
assertNotNull(output.oldRecordWriter);
assertNull(output.newRecordWriter);
assertEquals(org.apache.hadoop.mapred.FileOutputCommitter.class, output.committer.getClass());
}
@Test(timeout = 5000)
public void testNewAPI_SequenceFileOutputFormat() throws Exception {
String outputPath = "/tmp/output";
JobConf conf = new JobConf();
conf.setOutputKeyClass(NullWritable.class);
conf.setOutputValueClass(Text.class);
DataSinkDescriptor dataSink = MROutput
.createConfigBuilder(conf, SequenceFileOutputFormat.class, outputPath)
.build();
OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
MROutput output = new MROutput(outputContext, 2);
output.initialize();
    assertTrue(output.useNewApi);
assertEquals(SequenceFileOutputFormat.class, output.newOutputFormat.getClass());
assertNull(output.oldOutputFormat);
assertEquals(NullWritable.class, output.newApiTaskAttemptContext.getOutputKeyClass());
assertEquals(Text.class, output.newApiTaskAttemptContext.getOutputValueClass());
assertNull(output.oldApiTaskAttemptContext);
assertNotNull(output.newRecordWriter);
assertNull(output.oldRecordWriter);
assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
@Test(timeout = 5000)
public void testOldAPI_SequenceFileOutputFormat() throws Exception {
String outputPath = "/tmp/output";
JobConf conf = new JobConf();
conf.setOutputKeyClass(NullWritable.class);
conf.setOutputValueClass(Text.class);
DataSinkDescriptor dataSink = MROutput
.createConfigBuilder(conf, org.apache.hadoop.mapred.SequenceFileOutputFormat.class, outputPath)
.build();
OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
MROutput output = new MROutput(outputContext, 2);
output.initialize();
    assertFalse(output.useNewApi);
assertEquals(org.apache.hadoop.mapred.SequenceFileOutputFormat.class, output.oldOutputFormat.getClass());
assertNull(output.newOutputFormat);
assertEquals(NullWritable.class, output.oldApiTaskAttemptContext.getOutputKeyClass());
assertEquals(Text.class, output.oldApiTaskAttemptContext.getOutputValueClass());
assertNull(output.newApiTaskAttemptContext);
assertNotNull(output.oldRecordWriter);
assertNull(output.newRecordWriter);
assertEquals(org.apache.hadoop.mapred.FileOutputCommitter.class, output.committer.getClass());
}
  // Uses an OutputFormat that calls getDefaultWorkFile, verifying that the
  // default work file is available while record writers are being created.
@Test(timeout = 5000)
public void testNewAPI_WorkOutputPathOutputFormat() throws Exception {
String outputPath = "/tmp/output";
Configuration conf = new Configuration();
conf.setBoolean(MRConfig.IS_MAP_PROCESSOR, true);
DataSinkDescriptor dataSink = MROutput
.createConfigBuilder(conf, NewAPI_WorkOutputPathReadingOutputFormat.class, outputPath)
.build();
OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
MROutput output = new MROutput(outputContext, 2);
output.initialize();
    assertTrue(output.isMapperOutput);
    assertTrue(output.useNewApi);
assertEquals(NewAPI_WorkOutputPathReadingOutputFormat.class, output.newOutputFormat.getClass());
assertNull(output.oldOutputFormat);
assertNotNull(output.newApiTaskAttemptContext);
assertNull(output.oldApiTaskAttemptContext);
assertNotNull(output.newRecordWriter);
assertNull(output.oldRecordWriter);
assertEquals(FileOutputCommitter.class, output.committer.getClass());
}
  // Uses an OutputFormat that reads the work output path, verifying that it
  // is set while record writers are being created.
@Test(timeout = 5000)
public void testOldAPI_WorkOutputPathOutputFormat() throws Exception {
String outputPath = "/tmp/output";
Configuration conf = new Configuration();
conf.setBoolean(MRConfig.IS_MAP_PROCESSOR, false);
DataSinkDescriptor dataSink = MROutput
.createConfigBuilder(conf, OldAPI_WorkOutputPathReadingOutputFormat.class, outputPath)
.build();
OutputContext outputContext = createMockOutputContext(dataSink.getOutputDescriptor().getUserPayload());
MROutput output = new MROutput(outputContext, 2);
output.initialize();
    assertFalse(output.isMapperOutput);
    assertFalse(output.useNewApi);
assertEquals(OldAPI_WorkOutputPathReadingOutputFormat.class, output.oldOutputFormat.getClass());
assertNull(output.newOutputFormat);
assertNotNull(output.oldApiTaskAttemptContext);
assertNull(output.newApiTaskAttemptContext);
assertNotNull(output.oldRecordWriter);
assertNull(output.newRecordWriter);
assertEquals(org.apache.hadoop.mapred.FileOutputCommitter.class, output.committer.getClass());
}
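  /**
   * Builds a mocked {@link OutputContext} that returns the given payload along
   * with fixed application/task identifiers and a fresh counters instance.
   */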
private OutputContext createMockOutputContext(UserPayload payload) {
OutputContext outputContext = mock(OutputContext.class);
ApplicationId appId = ApplicationId.newInstance(System.currentTimeMillis(), 1);
when(outputContext.getUserPayload()).thenReturn(payload);
when(outputContext.getApplicationId()).thenReturn(appId);
when(outputContext.getTaskVertexIndex()).thenReturn(1);
when(outputContext.getTaskAttemptNumber()).thenReturn(1);
when(outputContext.getCounters()).thenReturn(new TezCounters());
return outputContext;
}
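  /**
   * Creates a {@link LogicalIOProcessorRuntimeTask} that runs {@link TestProcessor}
   * with a single {@link MROutput} backed by {@link TestOutputFormat}, using a
   * local directory as its working dir.
   */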
public static LogicalIOProcessorRuntimeTask createLogicalTask(
Configuration conf,
TezUmbilical umbilical, String dagName,
String vertexName, TezExecutors sharedExecutor) throws Exception {
ProcessorDescriptor procDesc = ProcessorDescriptor.create(TestProcessor.class.getName());
List<InputSpec> inputSpecs = Lists.newLinkedList();
List<OutputSpec> outputSpecs = Lists.newLinkedList();
outputSpecs.add(new OutputSpec("Null",
MROutput.createConfigBuilder(conf, TestOutputFormat.class).build().getOutputDescriptor(), 1));
TaskSpec taskSpec = new TaskSpec(
TezTestUtils.getMockTaskAttemptId(0, 0, 0, 0),
dagName, vertexName, -1,
procDesc,
inputSpecs,
outputSpecs, null, null);
FileSystem fs = FileSystem.getLocal(conf);
Path workDir =
new Path(new Path(System.getProperty("test.build.data", "/tmp")),
"TestMapOutput").makeQualified(fs.getUri(), fs.getWorkingDirectory());
return new LogicalIOProcessorRuntimeTask(
taskSpec,
0,
conf,
new String[] {workDir.toString()},
umbilical,
null,
new HashMap<String, String>(),
HashMultimap.<String, String>create(), null, "", new ExecutionContextImpl("localhost"),
Runtime.getRuntime().maxMemory(), true, new DefaultHadoopShim(), sharedExecutor);
}
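  /** No-op committer returned by {@link TestOutputFormat}; never requires a task commit. */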
public static class TestOutputCommitter extends OutputCommitter {
@Override
public void setupJob(JobContext jobContext) throws IOException {
}
@Override
public void setupTask(TaskAttemptContext taskContext) throws IOException {
}
@Override
public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException {
return false;
}
@Override
public void commitTask(TaskAttemptContext taskContext) throws IOException {
}
@Override
public void abortTask(TaskAttemptContext taskContext) throws IOException {
}
}
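  /**
   * Minimal new-API OutputFormat whose record writer appends keys and values
   * to a temporary local file; used by the logical task in the perf test.
   */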
public static class TestOutputFormat extends OutputFormat<String, String> {
public static class TestRecordWriter extends RecordWriter<String, String> {
Writer writer;
boolean doWrite;
TestRecordWriter(boolean write) throws IOException {
this.doWrite = write;
if (doWrite) {
File f = File.createTempFile("test", null);
f.deleteOnExit();
writer = new BufferedWriter(new FileWriter(f));
}
}
@Override
public void write(String key, String value) throws IOException, InterruptedException {
if (doWrite) {
writer.write(key);
writer.write(value);
}
}
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
      // The writer only exists when doWrite is true; guard to avoid an NPE.
      if (doWrite) {
        writer.close();
      }
    }
}
@Override
public RecordWriter<String, String> getRecordWriter(TaskAttemptContext context)
throws IOException, InterruptedException {
return new TestRecordWriter(true);
}
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
}
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException, InterruptedException {
return new TestOutputCommitter();
}
}
  // Old-API OutputFormat that reads the work output path while creating record writers
public static class OldAPI_WorkOutputPathReadingOutputFormat extends org.apache.hadoop.mapred.FileOutputFormat<String, String> {
public static class NoOpRecordWriter implements org.apache.hadoop.mapred.RecordWriter<String, String> {
@Override
public void write(String key, String value) throws IOException {}
@Override
public void close(Reporter reporter) throws IOException {}
}
@Override
public org.apache.hadoop.mapred.RecordWriter<String, String> getRecordWriter(
FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException {
// check work output path is not null
Path workOutputPath = org.apache.hadoop.mapred.FileOutputFormat.getWorkOutputPath(job);
assertNotNull(workOutputPath);
return new NoOpRecordWriter();
}
}
  // New-API OutputFormat that reads the default work file while creating record writers
public static class NewAPI_WorkOutputPathReadingOutputFormat extends FileOutputFormat<String, String> {
public static class NoOpRecordWriter extends RecordWriter<String, String> {
@Override
public void write(String key, String value) throws IOException, InterruptedException {
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
}
}
@Override
public RecordWriter<String, String> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
// check default work file is not null
Path workOutputPath = getDefaultWorkFile(job, ".foo");
assertNotNull(workOutputPath);
return new NoOpRecordWriter();
}
}
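  /** Processor that writes one million key/value pairs to its single output. */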
public static class TestProcessor extends SimpleProcessor {
public TestProcessor(ProcessorContext context) {
super(context);
}
@Override
public void run() throws Exception {
KeyValueWriter writer = (KeyValueWriter) getOutputs().values().iterator().next().getWriter();
      for (int i = 0; i < 1000000; ++i) {
writer.write("key", "value");
}
}
}
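  // Rough MROutput write-throughput experiment; ignored by default since it
  // makes no functional assertions.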
@Ignore
@Test
public void testPerf() throws Exception {
Configuration conf = new Configuration();
TezSharedExecutor sharedExecutor = new TezSharedExecutor(conf);
LogicalIOProcessorRuntimeTask task = createLogicalTask(conf, new TestUmbilical(), "dag",
"vertex", sharedExecutor);
task.initialize();
task.run();
task.close();
sharedExecutor.shutdownNow();
}
}