/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tez.test; import static org.junit.Assert.assertEquals; import java.io.IOException; import java.util.ArrayList; import java.util.EnumSet; import java.util.List; import java.util.Random; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.tez.client.TezClient; import org.apache.tez.client.TezClientUtils; import org.apache.tez.common.TezCommonUtils; import org.apache.tez.common.TezUtils; import org.apache.tez.common.counters.DAGCounter; import org.apache.tez.common.counters.TezCounter; import org.apache.tez.common.counters.TezCounters; import org.apache.tez.dag.api.DAG; import org.apache.tez.dag.api.Edge; import org.apache.tez.dag.api.EdgeProperty; import org.apache.tez.dag.api.EdgeProperty.DataMovementType; import org.apache.tez.dag.api.EdgeProperty.DataSourceType; import org.apache.tez.dag.api.EdgeProperty.SchedulingType; import org.apache.tez.dag.api.InputDescriptor; import 
org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezConstants;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.VertexManagerPlugin;
import org.apache.tez.dag.api.VertexManagerPluginContext;
import org.apache.tez.dag.api.VertexManagerPluginContext.ScheduleTaskRequest;
import org.apache.tez.dag.api.VertexManagerPluginDescriptor;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.dag.api.client.StatusGetOpts;
import org.apache.tez.dag.app.RecoveryParser;
import org.apache.tez.dag.app.dag.impl.ImmediateStartVertexManager;
import org.apache.tez.dag.history.HistoryEvent;
import org.apache.tez.dag.history.HistoryEventType;
import org.apache.tez.dag.history.events.TaskAttemptFinishedEvent;
import org.apache.tez.dag.history.recovery.RecoveryService;
import org.apache.tez.dag.library.vertexmanager.InputReadyVertexManager;
import org.apache.tez.dag.library.vertexmanager.ShuffleVertexManager;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.api.TaskAttemptIdentifier;
import org.apache.tez.runtime.api.events.VertexManagerEvent;
import org.apache.tez.runtime.library.processor.SimpleProcessor;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import com.google.common.collect.Lists;

/**
 * Tests fine-grained (task-level) recovery of the Tez AM: each test runs a
 * two-vertex DAG whose custom VertexManagers kill the AM (System.exit) at a
 * controlled point, then verifies from the written recovery logs which task
 * attempts were (and were not) re-run by the recovered AM.
 */
public class TestAMRecovery {

  private static final Logger LOG = LoggerFactory.getLogger(TestAMRecovery.class);

  private static Configuration conf = new Configuration();
  private static TezConfiguration tezConf;
  // High AM-attempt limit so tests can kill the AM repeatedly and still recover.
  private static int MAX_AM_ATTEMPT = 10;
  private static MiniTezCluster miniTezCluster = null;
  private static String TEST_ROOT_DIR = "target" + Path.SEPARATOR
      + TestAMRecovery.class.getName() + "-tmpDir";
  private static
  MiniDFSCluster dfsCluster = null;
  private static TezClient tezSession = null;
  private static FileSystem remoteFs = null;
  // Keys written into tezConf and read back by the custom VertexManagers below
  // (the conf travels to them via the vertex manager user payload).
  private static String FAIL_ON_PARTIAL_FINISHED = "FAIL_ON_PARTIAL_COMPLETED";
  private static String FAIL_ON_ATTEMPT = "FAIL_ON_ATTEMPT";

  /** Starts a 3-datanode MiniDFSCluster and a 1-node MiniTezCluster backed by it. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    LOG.info("Starting mini clusters");
    try {
      conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, TEST_ROOT_DIR);
      dfsCluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).format(true)
          .racks(null).build();
      remoteFs = dfsCluster.getFileSystem();
    } catch (IOException io) {
      throw new RuntimeException("problem starting mini dfs cluster", io);
    }
    if (miniTezCluster == null) {
      miniTezCluster =
          new MiniTezCluster(TestAMRecovery.class.getName(), 1, 1, 1);
      Configuration miniTezconf = new Configuration(conf);
      miniTezconf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, MAX_AM_ATTEMPT);
      miniTezconf.set("fs.defaultFS", remoteFs.getUri().toString()); // use HDFS
      conf.setLong(TezConfiguration.TEZ_AM_SLEEP_TIME_BEFORE_EXIT_MILLIS, 500);
      miniTezCluster.init(miniTezconf);
      miniTezCluster.start();
    }
  }

  /** Stops session and clusters; each step is best-effort so later steps still run. */
  @AfterClass
  public static void afterClass() throws InterruptedException {
    if (tezSession != null) {
      try {
        LOG.info("Stopping Tez Session");
        tezSession.stop();
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    if (miniTezCluster != null) {
      try {
        LOG.info("Stopping MiniTezCluster");
        miniTezCluster.stop();
        miniTezCluster = null;
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    if (dfsCluster != null) {
      try {
        LOG.info("Stopping DFSCluster");
        dfsCluster.shutdown();
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  /** Creates a fresh randomly-named staging dir and a new Tez session per test. */
  @Before
  public void setup() throws Exception {
    LOG.info("Starting session");
    Path remoteStagingDir = remoteFs.makeQualified(new Path(TEST_ROOT_DIR, String
        .valueOf(new Random().nextInt(100000))));
    TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);
    tezConf = new TezConfiguration(miniTezCluster.getConfig());
    // 0 unflushed events — presumably forces every recovery event to be
    // flushed immediately so the log reflects all progress at AM death.
    tezConf.setInt(TezConfiguration.DAG_RECOVERY_MAX_UNFLUSHED_EVENTS, 0);
    tezConf.set(TezConfiguration.TEZ_AM_LOG_LEVEL, "INFO");
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());
    tezConf
        .setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    tezConf.setInt(TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS, MAX_AM_ATTEMPT);
    tezConf.setInt(TezConfiguration.TEZ_AM_RESOURCE_MEMORY_MB, 500);
    tezConf.set(TezConfiguration.TEZ_AM_LAUNCH_CMD_OPTS, " -Xmx256m");
    tezConf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
    // keep staging data so recovery logs can be read back after the tests
    tezConf.setBoolean(
        TezConfiguration.TEZ_AM_STAGING_SCRATCH_DATA_AUTO_DELETE, false);
    tezConf.setBoolean(
        RecoveryService.TEZ_TEST_RECOVERY_DRAIN_EVENTS_WHEN_STOPPED, true);
    tezSession = TezClient.create("TestDAGRecovery", tezConf);
    tezSession.start();
  }

  /** Stops the per-test Tez session (best-effort). */
  @After
  public void teardown() throws InterruptedException {
    if (tezSession != null) {
      try {
        LOG.info("Stopping Tez Session");
        tezSession.stop();
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    tezSession = null;
  }

  /**
   * Fine-grained recovery task-level, In a vertex (v1), task 0 is done task 1
   * is not started. History flush happens. AM dies. Once AM is recovered, task 0 is
   * not re-run. Task 1 is re-run.
(Broadcast) * * @throws Exception */ @Test(timeout = 120000) public void testVertexPartiallyFinished_Broadcast() throws Exception { DAG dag = createDAG("VertexPartiallyFinished_Broadcast", ControlledImmediateStartVertexManager.class, DataMovementType.BROADCAST, true); TezCounters counters = runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED); assertEquals(4, counters.findCounter(DAGCounter.NUM_SUCCEEDED_TASKS).getValue()); assertEquals(2, counters.findCounter(TestCounter.Counter_1).getValue()); List<HistoryEvent> historyEvents1 = readRecoveryLog(1); List<HistoryEvent> historyEvents2 = readRecoveryLog(2); printHistoryEvents(historyEvents1, 1); printHistoryEvents(historyEvents1, 2); // task_0 of v1 is finished in attempt 1, task_1 of v1 is not finished in // attempt 1 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 0).size()); assertEquals(0, findTaskAttemptFinishedEvent(historyEvents1, 0, 1).size()); // task_0 of v1 is finished in attempt 1 and not rerun, task_1 of v1 is // finished in attempt 2 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 1).size()); } /** * Fine-grained recovery task-level, In a vertex (v1), task 0 is done task 1 * is also done. History flush happens. AM dies. Once AM is recovered, task 0 * and Task 1 is not re-run. 
(Broadcast) * * @throws Exception */ @Test(timeout = 120000) public void testVertexCompletelyFinished_Broadcast() throws Exception { DAG dag = createDAG("VertexCompletelyFinished_Broadcast", ControlledImmediateStartVertexManager.class, DataMovementType.BROADCAST, false); TezCounters counters = runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED); assertEquals(4, counters.findCounter(DAGCounter.NUM_SUCCEEDED_TASKS).getValue()); assertEquals(2, counters.findCounter(TestCounter.Counter_1).getValue()); List<HistoryEvent> historyEvents1 = readRecoveryLog(1); List<HistoryEvent> historyEvents2 = readRecoveryLog(2); printHistoryEvents(historyEvents1, 1); printHistoryEvents(historyEvents1, 2); // task_0 of v1 is finished in attempt 1, task_1 of v1 is not finished in // attempt 1 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 1).size()); // task_0 of v1 is finished in attempt 1 and not rerun, task_1 of v1 is // finished in attempt 2 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 1).size()); } /** * Fine-grained recovery task-level, In a vertex (v1), task 0 is done task 1 * is not started. History flush happens. AM dies. Once AM is recovered, task 0 is * not re-run. Task 1 is re-run. 
(ONE_TO_ONE) * * @throws Exception */ @Test(timeout = 120000) public void testVertexPartialFinished_One2One() throws Exception { DAG dag = createDAG("VertexPartialFinished_One2One", ControlledInputReadyVertexManager.class, DataMovementType.ONE_TO_ONE, true); TezCounters counters = runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED); assertEquals(4, counters.findCounter(DAGCounter.NUM_SUCCEEDED_TASKS).getValue()); assertEquals(2, counters.findCounter(TestCounter.Counter_1).getValue()); List<HistoryEvent> historyEvents1 = readRecoveryLog(1); List<HistoryEvent> historyEvents2 = readRecoveryLog(2); printHistoryEvents(historyEvents1, 1); printHistoryEvents(historyEvents1, 2); // task_0 of v1 is finished in attempt 1, task_1 of v1 is not finished in // attempt 1 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 0).size()); assertEquals(0, findTaskAttemptFinishedEvent(historyEvents1, 0, 1).size()); // task_0 of v1 is finished in attempt 1 and not rerun, task_1 of v1 is // finished in attempt 2 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 1).size()); } /** * Fine-grained recovery task-level, In a vertex (v1), task 0 is done task 1 * is also done. History flush happens. AM dies. Once AM is recovered, task 0 * and Task 1 is not re-run. 
(ONE_TO_ONE) * * @throws Exception */ @Test(timeout = 120000) public void testVertexCompletelyFinished_One2One() throws Exception { DAG dag = createDAG("VertexCompletelyFinished_One2One", ControlledInputReadyVertexManager.class, DataMovementType.ONE_TO_ONE, false); TezCounters counters = runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED); assertEquals(4, counters.findCounter(DAGCounter.NUM_SUCCEEDED_TASKS).getValue()); assertEquals(2, counters.findCounter(TestCounter.Counter_1).getValue()); List<HistoryEvent> historyEvents1 = readRecoveryLog(1); List<HistoryEvent> historyEvents2 = readRecoveryLog(2); printHistoryEvents(historyEvents1, 1); printHistoryEvents(historyEvents1, 2); // task_0 of v1 is finished in attempt 1, task_1 of v1 is not finished in // attempt 1 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 1).size()); // task_0 of v1 is finished in attempt 1 and not rerun, task_1 of v1 is // finished in attempt 2 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 1).size()); } /** * Fine-grained recovery task-level, In a vertex (v1), task 0 is done task 1 * is not started. History flush happens. AM dies. Once AM is recovered, task 0 is * not re-run. Task 1 is re-run. 
(SCATTER_GATHER) * * @throws Exception */ @Test(timeout = 120000) public void testVertexPartiallyFinished_ScatterGather() throws Exception { DAG dag = createDAG("VertexPartiallyFinished_ScatterGather", ControlledShuffleVertexManager.class, DataMovementType.SCATTER_GATHER, true); TezCounters counters = runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED); assertEquals(4, counters.findCounter(DAGCounter.NUM_SUCCEEDED_TASKS).getValue()); assertEquals(2, counters.findCounter(TestCounter.Counter_1).getValue()); List<HistoryEvent> historyEvents1 = readRecoveryLog(1); List<HistoryEvent> historyEvents2 = readRecoveryLog(2); printHistoryEvents(historyEvents1, 1); printHistoryEvents(historyEvents1, 2); // task_0 of v1 is finished in attempt 1, task_1 of v1 is not finished in // attempt 1 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 0).size()); assertEquals(0, findTaskAttemptFinishedEvent(historyEvents1, 0, 1).size()); // task_0 of v1 is finished in attempt 1 and not rerun, task_1 of v1 is // finished in attempt 2 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 1).size()); } /** * Fine-grained recovery task-level, In a vertex (v1), task 0 is done task 1 * is also done. History flush happens. AM dies. Once AM is recovered, task 0 * and Task 1 is not re-run. 
(SCATTER_GATHER) * * @throws Exception */ @Test(timeout = 120000) public void testVertexCompletelyFinished_ScatterGather() throws Exception { DAG dag = createDAG("VertexCompletelyFinished_ScatterGather", ControlledShuffleVertexManager.class, DataMovementType.SCATTER_GATHER, false); TezCounters counters = runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED); assertEquals(4, counters.findCounter(DAGCounter.NUM_SUCCEEDED_TASKS).getValue()); assertEquals(2, counters.findCounter(TestCounter.Counter_1).getValue()); TezCounter outputCounter = counters.findCounter(TestOutput.COUNTER_NAME, TestOutput.COUNTER_NAME); TezCounter inputCounter = counters.findCounter(TestInput.COUNTER_NAME, TestInput.COUNTER_NAME); // verify that processor, input and output counters, are all being collected Assert.assertTrue(outputCounter.getValue() > 0); Assert.assertTrue(inputCounter.getValue() > 0); List<HistoryEvent> historyEvents1 = readRecoveryLog(1); List<HistoryEvent> historyEvents2 = readRecoveryLog(2); printHistoryEvents(historyEvents1, 1); printHistoryEvents(historyEvents1, 2); // task_0 of v1 is finished in attempt 1, task_1 of v1 is not finished in // attempt 1 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents1, 0, 1).size()); // task_0 of v1 is finished in attempt 1 and not rerun, task_1 of v1 is // finished in attempt 2 assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 0).size()); assertEquals(1, findTaskAttemptFinishedEvent(historyEvents2, 0, 1).size()); } /** * Set AM max attempt to high number. Kill many attempts. Last AM can still be * recovered with latest AM history data. 
* * @throws Exception */ @Test(timeout = 600000) public void testHighMaxAttempt() throws Exception { Random rand = new Random(); tezConf.set(FAIL_ON_ATTEMPT, rand.nextInt(MAX_AM_ATTEMPT) + ""); LOG.info("Set FAIL_ON_ATTEMPT=" + tezConf.get(FAIL_ON_ATTEMPT)); DAG dag = createDAG("HighMaxAttempt", FailOnAttemptVertexManager.class, DataMovementType.SCATTER_GATHER, false); runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED); } TezCounters runDAGAndVerify(DAG dag, DAGStatus.State finalState) throws Exception { tezSession.waitTillReady(); DAGClient dagClient = tezSession.submitDAG(dag); DAGStatus dagStatus = dagClient.waitForCompletionWithStatusUpdates(EnumSet .of(StatusGetOpts.GET_COUNTERS)); Assert.assertEquals(finalState, dagStatus.getState()); return dagStatus.getDAGCounters(); } /** * v1 --> v2 <br> * v1 has a customized VM to control whether to schedule only one second task when it is partiallyFinished test case. * v2 has a customized VM which could control when to kill AM * * @param vertexManagerClass * @param dmType * @param failOnParitialCompleted * @return * @throws IOException */ private DAG createDAG(String dagName, Class vertexManagerClass, DataMovementType dmType, boolean failOnParitialCompleted) throws IOException { if (failOnParitialCompleted) { tezConf.set(FAIL_ON_PARTIAL_FINISHED, "true"); } else { tezConf.set(FAIL_ON_PARTIAL_FINISHED, "false"); } DAG dag = DAG.create(dagName); UserPayload payload = UserPayload.create(null); Vertex v1 = Vertex.create("v1", MyProcessor.getProcDesc(), 2); v1.setVertexManagerPlugin(VertexManagerPluginDescriptor.create( ScheduleControlledVertexManager.class.getName()).setUserPayload( TezUtils.createUserPayloadFromConf(tezConf))); Vertex v2 = Vertex.create("v2", DoNothingProcessor.getProcDesc(), 2); v2.setVertexManagerPlugin(VertexManagerPluginDescriptor.create( vertexManagerClass.getName()).setUserPayload( TezUtils.createUserPayloadFromConf(tezConf))); dag.addVertex(v1).addVertex(v2); dag.addEdge(Edge.create(v1, v2, 
      EdgeProperty.create(dmType, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL, TestOutput.getOutputDesc(payload),
          TestInput.getInputDesc(payload))));
    return dag;
  }

  /**
   * Collects every TASK_ATTEMPT_FINISHED event for the given vertex/task from
   * a parsed recovery log.
   *
   * @param historyEvents parsed recovery events
   * @param vertexId numeric id of the vertex within the DAG
   * @param taskId numeric id of the task within the vertex
   * @return the matching finished events (possibly empty)
   */
  private List<TaskAttemptFinishedEvent> findTaskAttemptFinishedEvent(
      List<HistoryEvent> historyEvents, int vertexId, int taskId) {
    List<TaskAttemptFinishedEvent> resultEvents =
        new ArrayList<TaskAttemptFinishedEvent>();
    for (HistoryEvent historyEvent : historyEvents) {
      if (historyEvent.getEventType() == HistoryEventType.TASK_ATTEMPT_FINISHED) {
        TaskAttemptFinishedEvent taFinishedEvent =
            (TaskAttemptFinishedEvent) historyEvent;
        if (taFinishedEvent.getTaskAttemptID().getTaskID().getVertexID()
            .getId() == vertexId
            && taFinishedEvent.getTaskAttemptID().getTaskID().getId() == taskId) {
          resultEvents.add(taFinishedEvent);
        }
      }
    }
    return resultEvents;
  }

  /**
   * Reads and concatenates the recovery logs written by AM attempts
   * 1..attemptNum for the first DAG of the current session (dag id suffix
   * "_1"). Attempts whose recovery file does not exist are skipped.
   *
   * @param attemptNum highest AM attempt number to read (inclusive)
   * @return all parsed history events, in attempt order
   * @throws IOException on filesystem errors
   */
  private List<HistoryEvent> readRecoveryLog(int attemptNum) throws IOException {
    ApplicationId appId = tezSession.getAppMasterApplicationId();
    Path tezSystemStagingDir =
        TezCommonUtils.getTezSystemStagingPath(tezConf, appId.toString());
    Path recoveryDataDir =
        TezCommonUtils.getRecoveryPath(tezSystemStagingDir, tezConf);
    FileSystem fs = tezSystemStagingDir.getFileSystem(tezConf);
    List<HistoryEvent> historyEvents = new ArrayList<HistoryEvent>();
    for (int i=1; i <= attemptNum; ++i) {
      Path currentAttemptRecoveryDataDir =
          TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, i);
      // recovery file name: application id with "application" -> "dag", dag #1
      Path recoveryFilePath =
          new Path(currentAttemptRecoveryDataDir, appId.toString().replace(
              "application", "dag") + "_1"
              + TezConstants.DAG_RECOVERY_RECOVER_FILE_SUFFIX);
      if (fs.exists(recoveryFilePath)) {
        LOG.info("Read recovery file:" + recoveryFilePath);
        // NOTE(review): the stream from fs.open(...) is handed to
        // parseDAGRecoveryFile and never closed here — presumably the parser
        // closes it; confirm to rule out a file-handle leak.
        historyEvents.addAll(RecoveryParser.parseDAGRecoveryFile(fs.open(recoveryFilePath)));
      }
    }
    return historyEvents;
  }

  /**
   * Logs every event of a parsed recovery log, tagged with the AM attempt it
   * came from.
   */
  private void printHistoryEvents(List<HistoryEvent> historyEvents, int attemptId) {
    LOG.info("RecoveryLogs from attempt:" + attemptId);
    for(HistoryEvent historyEvent : historyEvents) {
      LOG.info("Parsed event from recovery stream" +
        ", eventType=" + historyEvent.getEventType() + ", event=" + historyEvent);
    }
    LOG.info("");
  }

  /**
   * InputReadyVertexManager that kills the AM (System.exit) during DAG
   * attempt 1: after the first source task completes when
   * FAIL_ON_PARTIAL_FINISHED is true, otherwise only after every task of the
   * source vertex has completed.
   */
  public static class ControlledInputReadyVertexManager extends
      InputReadyVertexManager {

    private Configuration conf;
    // number of source-task completions observed so far
    private int completedTaskNum = 0;

    public ControlledInputReadyVertexManager(VertexManagerPluginContext context) {
      super(context);
    }

    @Override
    public void initialize() {
      super.initialize();
      try {
        // conf carries FAIL_ON_PARTIAL_FINISHED from the test via the payload
        conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    @Override
    public void onSourceTaskCompleted(TaskAttemptIdentifier attempt) {
      super.onSourceTaskCompleted(attempt);
      completedTaskNum ++;
      // only kill the AM on the first DAG attempt so the recovered AM survives
      if (getContext().getDAGAttemptNumber() == 1) {
        if (conf.getBoolean(FAIL_ON_PARTIAL_FINISHED, true)) {
          // partially finished: die after the first source task completes
          if (completedTaskNum == 1) {
            System.exit(-1);
          }
        } else {
          // completely finished: die once all source tasks have completed
          if (completedTaskNum == getContext().
              getVertexNumTasks(attempt.getTaskIdentifier().getVertexIdentifier().getName())) {
            System.exit(-1);
          }
        }
      }
    }
  }

  /**
   * ShuffleVertexManager that kills the AM (System.exit) during DAG attempt 1:
   * after the first source task completes when FAIL_ON_PARTIAL_FINISHED is
   * true, otherwise only after every task of the source vertex has completed.
   */
  public static class ControlledShuffleVertexManager extends
      ShuffleVertexManager {

    private Configuration conf;
    // number of source-task completions observed so far
    private int completedTaskNum = 0;

    public ControlledShuffleVertexManager(VertexManagerPluginContext context) {
      super(context);
    }

    @Override
    public void initialize() {
      super.initialize();
      try {
        // conf carries FAIL_ON_PARTIAL_FINISHED from the test via the payload
        conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    @Override
    public void onSourceTaskCompleted(TaskAttemptIdentifier attempt) {
      super.onSourceTaskCompleted(attempt);
      completedTaskNum ++;
      // only kill the AM on the first DAG attempt so the recovered AM survives
      if (getContext().getDAGAttemptNumber() == 1) {
        if (conf.getBoolean(FAIL_ON_PARTIAL_FINISHED, true)) {
          // partially finished: die after the first source task completes
          if (completedTaskNum == 1) {
            System.exit(-1);
          }
        } else {
          // completely finished: die once all source tasks have completed
          if (completedTaskNum == getContext().
              getVertexNumTasks(attempt.getTaskIdentifier().getVertexIdentifier().getName())) {
            System.exit(-1);
          }
        }
      }
    }
  }

  /**
   * ImmediateStartVertexManager that kills the AM (System.exit) during DAG
   * attempt 1: after the first source task completes when
   * FAIL_ON_PARTIAL_FINISHED is true, otherwise only after every task of the
   * source vertex has completed.
   */
  public static class ControlledImmediateStartVertexManager extends
      ImmediateStartVertexManager {

    private Configuration conf;
    // number of source-task completions observed so far
    private int completedTaskNum = 0;

    public ControlledImmediateStartVertexManager(
        VertexManagerPluginContext context) {
      super(context);
    }

    @Override
    public void initialize() {
      super.initialize();
      try {
        // conf carries FAIL_ON_PARTIAL_FINISHED from the test via the payload
        conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    @Override
    public void onSourceTaskCompleted(TaskAttemptIdentifier attempt) {
      super.onSourceTaskCompleted(attempt);
      completedTaskNum ++;
      // only kill the AM on the first DAG attempt so the recovered AM survives
      if (getContext().getDAGAttemptNumber() == 1) {
        if (conf.getBoolean(FAIL_ON_PARTIAL_FINISHED, true)) {
          // partially finished: die after the first source task completes
          if (completedTaskNum == 1) {
            System.exit(-1);
          }
        } else {
          // completely finished: die once all source tasks have completed
          if (completedTaskNum == getContext().
              getVertexNumTasks(attempt.getTaskIdentifier().getVertexIdentifier().getName())) {
            System.exit(-1);
          }
        }
      }
    }
  }

  /**
   * VertexManager which schedules only one task when it is the
   * partially-finished test case.
   */
  public static class ScheduleControlledVertexManager extends VertexManagerPlugin {

    private Configuration conf;

    public ScheduleControlledVertexManager(VertexManagerPluginContext context) {
      super(context);
    }

    @Override
    public void initialize() {
      try {
        // conf carries FAIL_ON_PARTIAL_FINISHED from the test via the payload
        conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    @Override
    public void onVertexStarted(List<TaskAttemptIdentifier> completions)
        throws Exception {
      if (getContext().getDAGAttemptNumber() == 1) {
        // only schedule one task if it is partiallyFinished case
        if (conf.getBoolean(FAIL_ON_PARTIAL_FINISHED, true)) {
          getContext().scheduleTasks(Lists.newArrayList(ScheduleTaskRequest.create(0, null)));
          return ;
        }
      }
      // schedule all tasks when it is not partiallyFinished
      int taskNum = getContext().getVertexNumTasks(getContext().getVertexName());
      List<ScheduleTaskRequest> taskWithLocationHints =
          new ArrayList<ScheduleTaskRequest>();
      for (int i=0;i<taskNum;++i) {
        taskWithLocationHints.add(ScheduleTaskRequest.create(i, null));
      }
      getContext().scheduleTasks(taskWithLocationHints);
    }

    @Override
    public void onSourceTaskCompleted(TaskAttemptIdentifier attempt)
        throws Exception {
      // no-op: scheduling decisions are made entirely in onVertexStarted
    }

    @Override
    public void onVertexManagerEventReceived(VertexManagerEvent vmEvent)
        throws Exception {
      // no-op
    }

    @Override
    public void onRootVertexInitialized(String inputName,
        InputDescriptor inputDescriptor, List<Event> events) throws Exception {
      // no-op: this vertex has no root inputs in these tests
    }
  }

  /**
   * VM which keeps killing the AM while the current DAG attempt number is
   * below the configured FAIL_ON_ATTEMPT value.
   */
  public static class FailOnAttemptVertexManager extends ShuffleVertexManager {

    private Configuration conf;

    public FailOnAttemptVertexManager(VertexManagerPluginContext context) {
      super(context);
    }

    @Override
    public void initialize() {
      super.initialize();
      try {
        // conf carries FAIL_ON_ATTEMPT from the test via the payload
        conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    @Override
    public void onSourceTaskCompleted(TaskAttemptIdentifier attempt) {
      int curAttempt =
          getContext().getDAGAttemptNumber();
      super.onSourceTaskCompleted(attempt);
      int failOnAttempt = conf.getInt(FAIL_ON_ATTEMPT, 1);
      LOG.info("failOnAttempt:" + failOnAttempt);
      LOG.info("curAttempt:" + curAttempt);
      // keep killing the AM until the configured attempt number is reached
      if (curAttempt < failOnAttempt) {
        System.exit(-1);
      }
    }
  }

  // Counter incremented once per MyProcessor task run; the tests use it to
  // verify that already-finished tasks are not re-run after recovery.
  public static enum TestCounter {
    Counter_1,
  }

  /** Processor for v1: increments TestCounter.Counter_1 once per task run. */
  public static class MyProcessor extends SimpleProcessor {

    public MyProcessor(ProcessorContext context) {
      super(context);
    }

    @Override
    public void run() throws Exception {
      getContext().getCounters().findCounter(TestCounter.Counter_1).increment(1);
    }

    public static ProcessorDescriptor getProcDesc() {
      return ProcessorDescriptor.create(MyProcessor.class.getName());
    }
  }

  /** Processor for v2: does nothing but sleep. */
  public static class DoNothingProcessor extends SimpleProcessor {

    public DoNothingProcessor(ProcessorContext context) {
      super(context);
    }

    @Override
    public void run() throws Exception {
      // Sleep 3 seconds in vertex2 so that it does not complete before it
      // receives the SourceVertexTaskAttemptCompletedEvent. That event is
      // ignored if the vertex is already SUCCEEDED, in which case the AM would
      // never be killed in the VM of vertex2.
      Thread.sleep(3000);
    }

    public static ProcessorDescriptor getProcDesc() {
      return ProcessorDescriptor.create(DoNothingProcessor.class.getName());
    }
  }
}