/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.input;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.common.counters.TezCounters;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.mapreduce.hadoop.MRInputHelpers;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.InputContext;
import org.apache.tez.runtime.api.events.InputDataInformationEvent;
import org.apache.tez.runtime.library.api.KeyValueReader;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class TestMultiMRInput {

  private static final Logger LOG = LoggerFactory.getLogger(TestMultiMRInput.class);

  private static final JobConf defaultConf = new JobConf();
  private static final String testTmpDir;
  private static final Path TEST_ROOT_DIR;
  private static FileSystem localFs;

  static {
    defaultConf.set("fs.defaultFS", "file:///");
    try {
      localFs = FileSystem.getLocal(defaultConf);
      testTmpDir = System.getProperty("test.build.data", "target");
      TEST_ROOT_DIR = new Path(testTmpDir, TestMultiMRInput.class.getSimpleName() + "-tmpDir");
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Before
  public void setup() throws IOException {
    LOG.info("Setup. Using test dir: " + TEST_ROOT_DIR);
    localFs.delete(TEST_ROOT_DIR, true);
    localFs.mkdirs(TEST_ROOT_DIR);
  }

  // With zero physical inputs there should be no readers, and handleEvents must reject any event.
  @Test(timeout = 5000)
  public void test0PhysicalInputs() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    InputContext inputContext = createTezInputContext(jobConf);

    MultiMRInput mMrInput = new MultiMRInput(inputContext, 0);
    mMrInput.initialize();
    mMrInput.start();

    assertEquals(0, mMrInput.getKeyValueReaders().size());

    List<Event> events = new LinkedList<>();
    try {
      mMrInput.handleEvents(events);
      fail("HandleEvents should cause an input with 0 physical inputs to fail");
    } catch (Exception e) {
      assertTrue(e instanceof IllegalStateException);
    }
  }

  // One split delivered via a single InputDataInformationEvent should produce exactly one reader.
  @Test(timeout = 5000)
  public void testSingleSplit() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testSingleSplit");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    InputContext inputContext = createTezInputContext(jobConf);

    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();

    AtomicLong inputLength = new AtomicLong();
    LinkedHashMap<LongWritable, Text> data = createSplits(1, workDir, jobConf, inputLength);

    SequenceFileInputFormat<LongWritable, Text> format =
        new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());

    List<Event> eventList = new ArrayList<Event>();
    eventList.add(event);
    input.handleEvents(eventList);

    assertReaders(input, data, 1, inputLength.get());
  }

  // Same as testSingleSplit, but using the new (mapreduce) API input format.
  @Test
  public void testNewFormatSplits() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testNewFormatSplits");
    Job job = Job.getInstance(defaultConf);
    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class);
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(job, workDir);
    Configuration conf = job.getConfiguration();
    conf.setBoolean("mapred.mapper.new-api", true);

    // Create sequence file.
    AtomicLong inputLength = new AtomicLong();
    LinkedHashMap<LongWritable, Text> data = createSplits(1, workDir, conf, inputLength);

    // Get split information.
    org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat<LongWritable, Text> format =
        new org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat<>();
    List<org.apache.hadoop.mapreduce.InputSplit> splits = format.getSplits(job);
    assertEquals(1, splits.size());

    // Create the event.
    MRSplitProto splitProto =
        MRInputHelpers.createSplitProto(splits.get(0), new SerializationFactory(conf));
    InputDataInformationEvent event =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());

    // Create input context.
    InputContext inputContext = createTezInputContext(conf);

    // Create the MR input object and process the event.
    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();
    input.handleEvents(Collections.<Event>singletonList(event));

    assertReaders(input, data, 1, inputLength.get());
  }

  // Two splits delivered as two events should produce two readers covering all the data.
  @Test(timeout = 5000)
  public void testMultipleSplits() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testMultipleSplits");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    InputContext inputContext = createTezInputContext(jobConf);

    MultiMRInput input = new MultiMRInput(inputContext, 2);
    input.initialize();

    AtomicLong inputLength = new AtomicLong();
    LinkedHashMap<LongWritable, Text> data = createSplits(2, workDir, jobConf, inputLength);

    SequenceFileInputFormat<LongWritable, Text> format =
        new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 2);
    assertEquals(2, splits.length);

    MRSplitProto splitProto1 = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto1.toByteString().asReadOnlyByteBuffer());
    MRSplitProto splitProto2 = MRInputHelpers.createSplitProto(splits[1]);
    InputDataInformationEvent event2 =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto2.toByteString().asReadOnlyByteBuffer());

    List<Event> eventList = new ArrayList<Event>();
    eventList.add(event1);
    eventList.add(event2);
    input.handleEvents(eventList);

    assertReaders(input, data, 2, inputLength.get());
  }

  // Drains every reader, checking the data, the progress notifications and the
  // INPUT_SPLIT_LENGTH_BYTES counter.
  private void assertReaders(MultiMRInput input, LinkedHashMap<LongWritable, Text> data,
      int expectedReaderCounts, long inputBytes) throws Exception {
    int readerCount = 0;
    int recordCount = 0;
    for (KeyValueReader reader : input.getKeyValueReaders()) {
      readerCount++;
      while (reader.next()) {
        // notifyProgress() is expected once per record read, plus once for each additional reader.
        verify(input.getContext(), times(++recordCount + readerCount - 1)).notifyProgress();
        if (data.size() == 0) {
          fail("Found more records than expected");
        }
        Object key = reader.getCurrentKey();
        Object val = reader.getCurrentValue();
        assertEquals(val, data.remove(key));
      }

      try {
        reader.next(); // should throw exception
        fail();
      } catch (IOException e) {
        assertTrue(e.getMessage().contains("For usage, please refer to"));
      }
    }
    long counterValue = input.getContext().getCounters()
        .findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES).getValue();
    assertEquals(inputBytes, counterValue);
    assertEquals(expectedReaderCounts, readerCount);
  }

  // Sending more events than configured physical inputs must fail.
  @Test(timeout = 5000)
  public void testExtraEvents() throws Exception {
    Path workDir = new Path(TEST_ROOT_DIR, "testExtraEvents");
    JobConf jobConf = new JobConf(defaultConf);
    jobConf.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(jobConf, workDir);

    InputContext inputContext = createTezInputContext(jobConf);

    MultiMRInput input = new MultiMRInput(inputContext, 1);
    input.initialize();

    createSplits(1, workDir, jobConf, new AtomicLong());

    SequenceFileInputFormat<LongWritable, Text> format =
        new SequenceFileInputFormat<LongWritable, Text>();
    InputSplit[] splits = format.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    MRSplitProto splitProto = MRInputHelpers.createSplitProto(splits[0]);
    InputDataInformationEvent event1 =
        InputDataInformationEvent.createWithSerializedPayload(0,
            splitProto.toByteString().asReadOnlyByteBuffer());
    InputDataInformationEvent event2 =
        InputDataInformationEvent.createWithSerializedPayload(1,
            splitProto.toByteString().asReadOnlyByteBuffer());

    List<Event> eventList = new ArrayList<Event>();
    eventList.add(event1);
    eventList.add(event2);
    try {
      input.handleEvents(eventList);
      fail("Expecting Exception due to too many events");
    } catch (Exception e) {
      assertTrue(e.getMessage().contains(
          "Unexpected event. All physical sources already initialized"));
    }
  }

  private LinkedHashMap<LongWritable, Text> createSplits(int splitCount, Path workDir,
      Configuration conf, AtomicLong totalSize) throws Exception {
    LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();
    for (int i = 0; i < splitCount; ++i) {
      int start = i * 10;
      int end = start + 10;
      data.putAll(createInputData(localFs, workDir, conf, "file" + i, start, end, totalSize));
    }
    return data;
  }

  // Builds a mocked InputContext carrying the serialized MRInput payload for the given conf.
  private InputContext createTezInputContext(Configuration conf) throws Exception {
    MRInputUserPayloadProto.Builder builder = MRInputUserPayloadProto.newBuilder();
    builder.setGroupingEnabled(false);
    builder.setConfigurationBytes(TezUtils.createByteStringFromConf(conf));
    byte[] payload = builder.build().toByteArray();

    ApplicationId applicationId = ApplicationId.newInstance(10000, 1);
    TezCounters counters = new TezCounters();

    InputContext inputContext = mock(InputContext.class);
    doReturn(applicationId).when(inputContext).getApplicationId();
    doReturn(counters).when(inputContext).getCounters();
    doReturn(1).when(inputContext).getDAGAttemptNumber();
    doReturn("dagName").when(inputContext).getDAGName();
    doReturn(1).when(inputContext).getInputIndex();
    doReturn("srcVertexName").when(inputContext).getSourceVertexName();
    doReturn(1).when(inputContext).getTaskAttemptNumber();
    doReturn(1).when(inputContext).getTaskIndex();
    doReturn(1).when(inputContext).getTaskVertexIndex();
    doReturn(UUID.randomUUID().toString()).when(inputContext).getUniqueIdentifier();
    doReturn("taskVertexName").when(inputContext).getTaskVertexName();
    doReturn(UserPayload.create(ByteBuffer.wrap(payload))).when(inputContext).getUserPayload();
    return inputContext;
  }

  @AfterClass
  public static void cleanUp() throws IOException {
    localFs.delete(TEST_ROOT_DIR, true);
  }

  // Writes a SequenceFile with one <LongWritable, Text> entry per key in [startKey, endKey)
  // and returns the expected key/value data, accumulating the file length into fileLength.
  public static LinkedHashMap<LongWritable, Text> createInputData(FileSystem fs, Path workDir,
      Configuration job, String filename, long startKey, long endKey, AtomicLong fileLength)
      throws IOException {
    LinkedHashMap<LongWritable, Text> data = new LinkedHashMap<LongWritable, Text>();
    Path file = new Path(workDir, filename);
    LOG.info("Generating data at path: " + file);
    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, job, file, LongWritable.class, Text.class);
    try {
      Random r = new Random(System.currentTimeMillis());
      LongWritable key = new LongWritable();
      Text value = new Text();
      for (long i = startKey; i < endKey; i++) {
        key.set(i);
        value.set(Integer.toString(r.nextInt(10000)));
        data.put(new LongWritable(key.get()), new Text(value.toString()));
        writer.append(key, value);
        LOG.info("<k, v> : <" + key.get() + ", " + value + ">");
      }
      fileLength.addAndGet(writer.getLength());
    } finally {
      writer.close();
    }
    return data;
  }
}