/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tez.analyzer; import static org.junit.Assert.assertTrue; import java.io.File; import java.util.Collections; import java.util.List; import java.util.Random; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.server.namenode.EditLogFileOutputStream; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ApplicationId; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.tez.analyzer.plugins.CriticalPathAnalyzer; import org.apache.tez.analyzer.plugins.CriticalPathAnalyzer.CriticalPathDependency; import org.apache.tez.analyzer.plugins.CriticalPathAnalyzer.CriticalPathStep; import org.apache.tez.analyzer.plugins.CriticalPathAnalyzer.CriticalPathStep.EntityType; import org.apache.tez.client.TezClient; import org.apache.tez.dag.api.DAG; import org.apache.tez.dag.api.TezConfiguration; import org.apache.tez.dag.api.TezConstants; import org.apache.tez.dag.api.client.DAGClient; import org.apache.tez.dag.api.client.DAGStatus; import org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService; import org.apache.tez.dag.history.logging.impl.SimpleHistoryLoggingService; import org.apache.tez.dag.records.TaskAttemptTerminationCause; import org.apache.tez.dag.records.TezDAGID; import org.apache.tez.history.ATSImportTool; import org.apache.tez.history.parser.ATSFileParser; import org.apache.tez.history.parser.SimpleHistoryParser; import org.apache.tez.history.parser.datamodel.DagInfo; import org.apache.tez.test.SimpleTestDAG; import org.apache.tez.test.SimpleTestDAG3Vertices; import org.apache.tez.test.TestInput; import org.apache.tez.test.TestProcessor; import org.apache.tez.test.dag.SimpleReverseVTestDAG; import org.apache.tez.test.dag.SimpleVTestDAG; import org.apache.tez.tests.MiniTezClusterWithTimeline; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Joiner; import com.google.common.collect.Lists; public class TestAnalyzer { private static final Logger LOG = LoggerFactory.getLogger(TestAnalyzer.class); private static String TEST_ROOT_DIR = "target" + Path.SEPARATOR + TestAnalyzer.class.getName() + "-tmpDir"; private static String DOWNLOAD_DIR = TEST_ROOT_DIR + Path.SEPARATOR + "download"; private final static String SIMPLE_HISTORY_DIR = "/tmp/simplehistory/"; private final static String HISTORY_TXT = "history.txt"; private static MiniDFSCluster dfsCluster; private static MiniTezClusterWithTimeline miniTezCluster; private static Configuration conf = new Configuration(); private static FileSystem fs; private static TezClient tezSession = null; private boolean usingATS = true; private boolean downloadedSimpleHistoryFile = false; private static String yarnTimelineAddress; @BeforeClass public static void setupClass() throws Exception { conf = new Configuration(); conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_EDITS_NOEDITLOGCHANNELFLUSH, false); EditLogFileOutputStream.setShouldSkipFsyncForTesting(true); conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, TEST_ROOT_DIR); dfsCluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).format(true).build(); fs = dfsCluster.getFileSystem(); conf.set("fs.defaultFS", fs.getUri().toString()); setupTezCluster(); } @AfterClass public static void tearDownClass() throws Exception { LOG.info("Stopping mini clusters"); if (miniTezCluster != null) { miniTezCluster.stop(); miniTezCluster = null; } if (dfsCluster != null) { dfsCluster.shutdown(); dfsCluster = null; } } private CriticalPathAnalyzer setupCPAnalyzer() { Configuration analyzerConf = new Configuration(false); analyzerConf.setBoolean(CriticalPathAnalyzer.DRAW_SVG, false); CriticalPathAnalyzer cp = new CriticalPathAnalyzer(); cp.setConf(analyzerConf); return cp; } private static void setupTezCluster() throws Exception { // make the test run faster by speeding heartbeat frequency conf.setInt(YarnConfiguration.RM_NM_HEARTBEAT_INTERVAL_MS, 100); conf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true); conf.setBoolean(TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS, true); conf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, ATSHistoryLoggingService .class.getName()); miniTezCluster = new MiniTezClusterWithTimeline(TestAnalyzer.class.getName(), 1, 1, 1, true); miniTezCluster.init(conf); miniTezCluster.start(); yarnTimelineAddress = miniTezCluster.getConfig().get(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS); } private TezConfiguration createCommonTezLog() throws Exception { TezConfiguration tezConf = new TezConfiguration(miniTezCluster.getConfig()); tezConf.setInt(TezConfiguration.TEZ_AM_RM_HEARTBEAT_INTERVAL_MS_MAX, 100); Path remoteStagingDir = dfsCluster.getFileSystem().makeQualified(new Path(TEST_ROOT_DIR, String .valueOf(new Random().nextInt(100000)))); tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString()); tezConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false); return tezConf; } private void createTezSessionATS() throws Exception { TezConfiguration tezConf = createCommonTezLog(); tezConf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true); tezConf.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS, miniTezCluster.getConfig().get(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS)); tezConf.setBoolean(TezConfiguration.TEZ_AM_ALLOW_DISABLED_TIMELINE_DOMAINS, true); tezConf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, ATSHistoryLoggingService.class.getName()); Path remoteStagingDir = dfsCluster.getFileSystem().makeQualified(new Path(TEST_ROOT_DIR, String .valueOf(new Random().nextInt(100000)))); tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString()); tezConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false); tezSession = TezClient.create("TestAnalyzer", tezConf, true); tezSession.start(); } private void createTezSessionSimpleHistory() throws Exception { TezConfiguration tezConf = createCommonTezLog(); tezConf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, SimpleHistoryLoggingService.class.getName()); tezConf.set(TezConfiguration.TEZ_SIMPLE_HISTORY_LOGGING_DIR, SIMPLE_HISTORY_DIR); Path remoteStagingDir = dfsCluster.getFileSystem().makeQualified(new Path(TEST_ROOT_DIR, String .valueOf(new Random().nextInt(100000)))); tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString()); tezConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false); tezSession = TezClient.create("TestFaultTolerance", tezConf, true); tezSession.start(); } private StepCheck createStep(String attempt, CriticalPathDependency reason) { return createStep(attempt, reason, null, null); } private StepCheck createStep(String attempt, CriticalPathDependency reason, TaskAttemptTerminationCause errCause, List<String> notes) { return new StepCheck(attempt, reason, errCause, notes); } private class StepCheck { String attempt; // attempt is the TaskAttemptInfo short name with regex CriticalPathDependency reason; TaskAttemptTerminationCause errCause; List<String> notesStr; StepCheck(String attempt, CriticalPathDependency reason, TaskAttemptTerminationCause cause, List<String> notes) { this.attempt = attempt; this.reason = reason; this.errCause = cause; this.notesStr = notes; } String getAttemptDetail() { return attempt; } CriticalPathDependency getReason() { return reason; } TaskAttemptTerminationCause getErrCause() { return errCause; } List<String> getNotesStr() { return notesStr; } } private void runDAG(DAG dag, DAGStatus.State finalState) throws Exception { tezSession.waitTillReady(); LOG.info("ABC Running DAG name: " + dag.getName()); DAGClient dagClient = tezSession.submitDAG(dag); DAGStatus dagStatus = dagClient.getDAGStatus(null); while (!dagStatus.isCompleted()) { LOG.info("Waiting for dag to complete. Sleeping for 500ms." + " DAG name: " + dag.getName() + " DAG appContext: " + dagClient.getExecutionContext() + " Current state: " + dagStatus.getState()); Thread.sleep(100); dagStatus = dagClient.getDAGStatus(null); } Assert.assertEquals(finalState, dagStatus.getState()); } private void verify(ApplicationId appId, int dagNum, List<StepCheck[]> steps) throws Exception { String dagId = TezDAGID.getInstance(appId, dagNum).toString(); DagInfo dagInfo = getDagInfo(dagId); verifyCriticalPath(dagInfo, steps); } private DagInfo getDagInfo(String dagId) throws Exception { // sleep for a bit to let ATS events be sent from AM DagInfo dagInfo = null; if (usingATS) { //Export the data from ATS String[] args = { "--dagId=" + dagId, "--downloadDir=" + DOWNLOAD_DIR, "--yarnTimelineAddress=" + yarnTimelineAddress }; int result = ATSImportTool.process(args); assertTrue(result == 0); //Parse ATS data and verify results //Parse downloaded contents File downloadedFile = new File(DOWNLOAD_DIR + Path.SEPARATOR + dagId + ".zip"); ATSFileParser parser = new ATSFileParser(downloadedFile); dagInfo = parser.getDAGData(dagId); assertTrue(dagInfo.getDagId().equals(dagId)); } else { if (!downloadedSimpleHistoryFile) { downloadedSimpleHistoryFile = true; TezDAGID tezDAGID = TezDAGID.fromString(dagId); ApplicationAttemptId applicationAttemptId = ApplicationAttemptId.newInstance(tezDAGID .getApplicationId(), 1); Path historyPath = new Path(miniTezCluster.getConfig().get("fs.defaultFS") + SIMPLE_HISTORY_DIR + HISTORY_TXT + "." + applicationAttemptId); FileSystem fs = historyPath.getFileSystem(miniTezCluster.getConfig()); Path localPath = new Path(DOWNLOAD_DIR, HISTORY_TXT); fs.copyToLocalFile(historyPath, localPath); } //Now parse via SimpleHistory File localFile = new File(DOWNLOAD_DIR, HISTORY_TXT); SimpleHistoryParser parser = new SimpleHistoryParser(localFile); dagInfo = parser.getDAGData(dagId); assertTrue(dagInfo.getDagId().equals(dagId)); } return dagInfo; } private void verifyCriticalPath(DagInfo dagInfo, List<StepCheck[]> stepsOptions) throws Exception { CriticalPathAnalyzer cp = setupCPAnalyzer(); cp.analyze(dagInfo); List<CriticalPathStep> criticalPath = cp.getCriticalPath(); for (CriticalPathStep step : criticalPath) { LOG.info("ABC Step: " + step.getType()); if (step.getType() == EntityType.ATTEMPT) { LOG.info("ABC Attempt: " + step.getAttempt().getShortName() + " " + step.getAttempt().getDetailedStatus()); } LOG.info("ABC Reason: " + step.getReason()); String notes = Joiner.on(";").join(step.getNotes()); LOG.info("ABC Notes: " + notes); } boolean foundMatchingLength = false; for (StepCheck[] steps : stepsOptions) { if (steps.length + 2 == criticalPath.size()) { foundMatchingLength = true; Assert.assertEquals(CriticalPathStep.EntityType.VERTEX_INIT, criticalPath.get(0).getType()); Assert.assertEquals(criticalPath.get(1).getAttempt().getShortName(), criticalPath.get(0).getAttempt().getShortName()); for (int i=1; i<criticalPath.size() - 1; ++i) { StepCheck check = steps[i-1]; CriticalPathStep step = criticalPath.get(i); Assert.assertEquals(CriticalPathStep.EntityType.ATTEMPT, step.getType()); Assert.assertTrue(check.getAttemptDetail(), step.getAttempt().getShortName().matches(check.getAttemptDetail())); Assert.assertEquals(steps[i-1].getReason(), step.getReason()); if (check.getErrCause() != null) { Assert.assertEquals(check.getErrCause(), TaskAttemptTerminationCause.valueOf(step.getAttempt().getTerminationCause())); } if (check.getNotesStr() != null) { String notes = Joiner.on("#").join(step.getNotes()); for (String note : check.getNotesStr()) { Assert.assertTrue(note, notes.contains(notes)); } } } Assert.assertEquals(CriticalPathStep.EntityType.DAG_COMMIT, criticalPath.get(criticalPath.size() - 1).getType()); break; } } Assert.assertTrue(foundMatchingLength); } @Test (timeout=300000) public void testWithATS() throws Exception { usingATS = true; createTezSessionATS(); runTests(); } @Test (timeout=300000) public void testWithSimpleHistory() throws Exception { usingATS = false; createTezSessionSimpleHistory(); runTests(); } private void runTests() throws Exception { ApplicationId appId = tezSession.getAppMasterApplicationId(); List<List<StepCheck[]>> stepsOptions = Lists.newArrayList(); // run all test dags stepsOptions.add(testAttemptOfDownstreamVertexConnectedWithTwoUpstreamVerticesFailure()); stepsOptions.add(testInputFailureCausesRerunOfTwoVerticesWithoutExit()); stepsOptions.add(testMultiVersionInputFailureWithoutExit()); stepsOptions.add(testCascadingInputFailureWithoutExitSuccess()); stepsOptions.add(testTaskMultipleFailures()); stepsOptions.add(testBasicInputFailureWithoutExit()); stepsOptions.add(testBasicTaskFailure()); stepsOptions.add(testBasicSuccessScatterGather()); stepsOptions.add(testMultiVersionInputFailureWithExit()); stepsOptions.add(testBasicInputFailureWithExit()); stepsOptions.add(testInputFailureRerunCanSendOutputToTwoDownstreamVertices()); stepsOptions.add(testCascadingInputFailureWithExitSuccess()); stepsOptions.add(testInternalPreemption()); // close session to flush if (tezSession != null) { tezSession.stop(); } Thread.sleep((TezConstants.TEZ_DAG_SLEEP_TIME_BEFORE_EXIT*3)/2); // verify all dags for (int i=0; i<stepsOptions.size(); ++i) { verify(appId, i+1, stepsOptions.get(i)); } } private List<StepCheck[]> testBasicSuccessScatterGather() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY) }; DAG dag = SimpleTestDAG.createDAG("testBasicSuccessScatterGather", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } private List<StepCheck[]> testBasicTaskFailure() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1); testConf.setBoolean(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_DO_FAIL, "v1"), true); testConf.set(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_TASK_INDEX, "v1"), "0"); testConf.setInt(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_UPTO_TASK_ATTEMPT, "v1"), 0); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.RETRY_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleTestDAG.createDAG("testBasicTaskFailure", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } private List<StepCheck[]> testTaskMultipleFailures() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1); testConf.setBoolean(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_DO_FAIL, "v1"), true); testConf.set(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_TASK_INDEX, "v1"), "0"); testConf.setInt(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_UPTO_TASK_ATTEMPT, "v1"), 1); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.RETRY_DEPENDENCY), createStep("v1 : 000000_2", CriticalPathDependency.RETRY_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleTestDAG.createDAG("testTaskMultipleFailures", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } private List<StepCheck[]> testBasicInputFailureWithExit() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v2"), true); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, "v2"), true); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v2"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v2"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v2"), "0"); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v2 : 000000_1", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleTestDAG.createDAG("testBasicInputFailureWithExit", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } private List<StepCheck[]> testBasicInputFailureWithoutExit() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v2"), true); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v2"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v2"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v2"), "0"); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleTestDAG.createDAG("testBasicInputFailureWithoutExit", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } private List<StepCheck[]> testMultiVersionInputFailureWithExit() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v2"), true); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, "v2"), true); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v2"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v2"), "0,1"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v2"), "0"); testConf.setInt(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, "v2"), 1); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v2 : 000000_1", CriticalPathDependency.DATA_DEPENDENCY), createStep("v1 : 000000_2", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v2 : 000000_2", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleTestDAG.createDAG("testMultiVersionInputFailureWithExit", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } private List<StepCheck[]> testMultiVersionInputFailureWithoutExit() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleTestDAG.TEZ_SIMPLE_DAG_NUM_TASKS, 1); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v2"), true); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v2"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v2"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v2"), "0"); testConf.setInt(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, "v2"), 1); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v1 : 000000_2", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleTestDAG.createDAG("testMultiVersionInputFailureWithoutExit", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } /** * Sets configuration for cascading input failure tests that * use SimpleTestDAG3Vertices. * @param testConf configuration * @param failAndExit whether input failure should trigger attempt exit */ private void setCascadingInputFailureConfig(Configuration testConf, boolean failAndExit, int numTasks) { // v2 attempt0 succeeds. // v2 all tasks attempt1 input0 fail up to version 0. testConf.setInt(SimpleTestDAG3Vertices.TEZ_SIMPLE_DAG_NUM_TASKS, numTasks); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v2"), true); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, "v2"), failAndExit); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v2"), "-1"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v2"), "1"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v2"), "0"); testConf.setInt(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, "v2"), 0); //v3 task0 attempt0 all inputs fails up to version 0. testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v3"), true); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, "v3"), failAndExit); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v3"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v3"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v3"), "-1"); testConf.setInt(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, "v3"), 0); } /** * Test cascading input failure without exit. Expecting success. * v1 -- v2 -- v3 * v3 all-tasks attempt0 input0 fails. Wait. Triggering v2 rerun. * v2 task0 attempt1 input0 fails. Wait. Triggering v1 rerun. * v1 attempt1 rerun and succeeds. v2 accepts v1 attempt1 output. v2 attempt1 succeeds. * v3 attempt0 accepts v2 attempt1 output. * * AM vertex succeeded order is v1, v2, v1, v2, v3. * @throws Exception */ private List<StepCheck[]> testCascadingInputFailureWithoutExitSuccess() throws Exception { Configuration testConf = new Configuration(false); setCascadingInputFailureConfig(testConf, false, 1); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v2 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v2 : 000000_1", CriticalPathDependency.DATA_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleTestDAG3Vertices.createDAG( "testCascadingInputFailureWithoutExitSuccess", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } /** * Test cascading input failure with exit. Expecting success. * v1 -- v2 -- v3 * v3 all-tasks attempt0 input0 fails. v3 attempt0 exits. Triggering v2 rerun. * v2 task0 attempt1 input0 fails. v2 attempt1 exits. Triggering v1 rerun. * v1 attempt1 rerun and succeeds. v2 accepts v1 attempt1 output. v2 attempt2 succeeds. * v3 attempt1 accepts v2 attempt2 output. * * AM vertex succeeded order is v1, v2, v3, v1, v2, v3. * @throws Exception */ private List<StepCheck[]> testCascadingInputFailureWithExitSuccess() throws Exception { Configuration testConf = new Configuration(false); setCascadingInputFailureConfig(testConf, true, 1); StepCheck[] check = { createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v2 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v2 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v2 : 000000_2", CriticalPathDependency.DATA_DEPENDENCY), createStep("v3 : 000000_1", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleTestDAG3Vertices.createDAG( "testCascadingInputFailureWithExitSuccess", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } /** * 1 NM is running and can run 4 containers based on YARN mini cluster defaults and * Tez defaults for AM/task memory * v3 task0 reports read errors against both tasks of v2. This re-starts both of them. * Now all 4 slots are occupied 1 AM + 3 tasks * Now retries of v2 report read error against 1 task of v1. That re-starts. * Retry of v1 task has no space - so it preempts the least priority task (current tez logic) * v3 is preempted and re-run. Shows up on critical path as preempted failure. * Also v1 retry attempts note show that it caused preemption of v3 * @throws Exception */ private List<StepCheck[]> testInternalPreemption() throws Exception { Configuration testConf = new Configuration(false); setCascadingInputFailureConfig(testConf, false, 2); StepCheck[] check = { createStep("v1 : 00000[01]_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v2 : 00000[01]_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY, TaskAttemptTerminationCause.INTERNAL_PREEMPTION, null), createStep("v2 : 00000[01]_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY, null, Collections.singletonList("preemption of v3")), createStep("v2 : 00000[01]_1", CriticalPathDependency.DATA_DEPENDENCY), createStep("v3 : 000000_1", CriticalPathDependency.DATA_DEPENDENCY) }; DAG dag = SimpleTestDAG3Vertices.createDAG( "testInternalPreemption", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } /** * Input failure of v3 causes rerun of both both v1 and v2 vertices. * v1 v2 * \ / * v3 * * @throws Exception */ private List<StepCheck[]> testInputFailureCausesRerunOfTwoVerticesWithoutExit() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleVTestDAG.TEZ_SIMPLE_V_DAG_NUM_TASKS, 1); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v3"), true); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, "v3"), false); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v3"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v3"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v3"), "-1"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, "v3"), "1"); StepCheck[] check = { // use regex for either vertices being possible on the path createStep("v[12] : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v[12] : 000000_[01]", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v[12] : 000000_[012]", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v[12] : 000000_[12]", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v[12] : 000000_2", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleVTestDAG.createDAG( "testInputFailureCausesRerunOfTwoVerticesWithoutExit", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } /** * Downstream(v3) attempt failure of a vertex connected with * 2 upstream vertices.. * v1 v2 * \ / * v3 * * @throws Exception */ private List<StepCheck[]> testAttemptOfDownstreamVertexConnectedWithTwoUpstreamVerticesFailure() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleVTestDAG.TEZ_SIMPLE_V_DAG_NUM_TASKS, 1); testConf.setBoolean(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_DO_FAIL, "v3"), true); testConf.set(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_TASK_INDEX, "v3"), "0"); testConf.setInt(TestProcessor.getVertexConfName( TestProcessor.TEZ_FAILING_PROCESSOR_FAILING_UPTO_TASK_ATTEMPT, "v3"), 1); StepCheck[] check = { // use regex for either vertices being possible on the path createStep("v[12] : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v3 : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v3 : 000000_1", CriticalPathDependency.RETRY_DEPENDENCY), createStep("v3 : 000000_2", CriticalPathDependency.RETRY_DEPENDENCY), }; DAG dag = SimpleVTestDAG.createDAG( "testAttemptOfDownstreamVertexConnectedWithTwoUpstreamVerticesFailure", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } /** * Input failure of v2,v3 trigger v1 rerun. * Both v2 and v3 report error on v1 and dont exit. So one of them triggers next * version of v1 and also consume the output of the next version. While the other * consumes the output of the next version of v1. * Reruns can send output to 2 downstream vertices. * v1 * / \ * v2 v3 * * Also covers multiple consumer vertices report failure against same producer task. * @throws Exception */ private List<StepCheck[]> testInputFailureRerunCanSendOutputToTwoDownstreamVertices() throws Exception { Configuration testConf = new Configuration(false); testConf.setInt(SimpleReverseVTestDAG.TEZ_SIMPLE_REVERSE_V_DAG_NUM_TASKS, 1); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v2"), true); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, "v2"), false); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v2"), "-1"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v2"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v2"), "-1"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, "v2"), "0"); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL, "v3"), true); testConf.setBoolean(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_DO_FAIL_AND_EXIT, "v3"), false); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_INDEX, "v3"), "-1"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_TASK_ATTEMPT, "v3"), "0"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_INPUT_INDEX, "v3"), "-1"); testConf.set(TestInput.getVertexConfName( TestInput.TEZ_FAILING_INPUT_FAILING_UPTO_INPUT_ATTEMPT, "v3"), "0"); StepCheck[] check = { // use regex for either vertices being possible on the path createStep("v1 : 000000_0", CriticalPathDependency.INIT_DEPENDENCY), createStep("v[23] : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), createStep("v1 : 000000_1", CriticalPathDependency.OUTPUT_RECREATE_DEPENDENCY), createStep("v[23] : 000000_0", CriticalPathDependency.DATA_DEPENDENCY), }; DAG dag = SimpleReverseVTestDAG.createDAG( "testInputFailureRerunCanSendOutputToTwoDownstreamVertices", testConf); runDAG(dag, DAGStatus.State.SUCCEEDED); return Collections.singletonList(check); } }