TestOrderedWordCount.java example

Explorer
tez-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.examples;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;

import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.tez.client.CallerContext;
import org.apache.tez.client.TezClientUtils;
import org.apache.tez.client.TezClient;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.TezUtilsInternal;
import org.apache.tez.common.security.DAGAccessControls;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.Edge;
import org.apache.tez.dag.api.PreWarmVertex;
import org.apache.tez.dag.api.ProcessorDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezException;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.Vertex;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.dag.api.client.StatusGetOpts;
import org.apache.tez.hadoop.shim.HadoopShim;
import org.apache.tez.hadoop.shim.HadoopShimsLoader;
import org.apache.tez.mapreduce.examples.helpers.SplitsInClientOptionParser;
import org.apache.tez.mapreduce.hadoop.MRHelpers;
import org.apache.tez.mapreduce.hadoop.MRInputHelpers;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.mapreduce.input.MRInputLegacy;
import org.apache.tez.mapreduce.output.MROutputLegacy;
import org.apache.tez.mapreduce.processor.map.MapProcessor;
import org.apache.tez.mapreduce.processor.reduce.ReduceProcessor;
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration;
import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig;
import org.apache.tez.runtime.library.partitioner.HashPartitioner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;

/**
 * An MRR job built on top of word count to return words sorted by
 * their frequency of occurrence.
 *
 * Use -DUSE_TEZ_SESSION=true to run jobs in a session mode.
 * If multiple input/outputs are provided, this job will process each pair
 * as a separate DAG in a sequential manner.
 * Use -DINTER_JOB_SLEEP_INTERVAL=<N> where N is the sleep interval in seconds
 * between the sequential DAGs.
 */
public class TestOrderedWordCount extends Configured implements Tool {

  private static Logger LOG = LoggerFactory.getLogger(TestOrderedWordCount.class);

  private static final String DAG_VIEW_ACLS = "tez.testorderedwordcount.view-acls";
  private static final String DAG_MODIFY_ACLS = "tez.testorderedwordcount.modify-acls";
  /**
   * IS_MAX_IPC_DATA_SET_BY_USER is a boolean value which is set to true when MAX_IPC_DATA_LENGTH is set by user
   * use -Dtez.testorderedwordcount.ipc.maximum.data.length to set the maximum IPC Data limit in MB
   * use -Dtez.testorderedwordcount.exceed.ipc.limit in MB to exceed the MAX_IPC_DATA_LENGTH value
   * IPC_PAYLOAD value is a random string generated for each vertex such that MAX_IPC_DATA_LENGTH is violated
   * NO_OF_VERTICES is the total number of vertices in testOrderedWordCount dag
   */
  private static final String IS_MAX_IPC_DATA_SET_BY_USER = "tez.testorderedwordcount.is-max-ipc-set-by-user";
  private static final String MAX_IPC_DATA_LENGTH = "tez.testorderedwordcount.ipc.maximum.data.length";
  private static final String EXCEED_IPC_DATA_LIMIT = "tez.testorderedwordcount.exceed.ipc.limit";
  private static final String IPC_PAYLOAD = "tez.testorderedwordcount.ipc.payload";
  private static final int NO_OF_VERTICES = 3;

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable>{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      if (conf.getBoolean(IS_MAX_IPC_DATA_SET_BY_USER, false)) {
        LOG.info("Max IPC Data Length set : " + conf.getInt(MAX_IPC_DATA_LENGTH, -1) + " MB," +
            " Exceed the Max IPC Data Length : " + conf.getInt(EXCEED_IPC_DATA_LIMIT, 3) + " MB," +
            " Total Dag Payload sent through IPC : "
            + (conf.getInt(MAX_IPC_DATA_LENGTH, -1) + conf.getInt(EXCEED_IPC_DATA_LIMIT, 3)) + " MB," +
            " Each Vertex Processor payload : " +
            ((conf.getInt(MAX_IPC_DATA_LENGTH, -1) + conf.getInt(EXCEED_IPC_DATA_LIMIT, 3))/NO_OF_VERTICES)+" MB");
      }
    }

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,IntWritable, Text> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(result, key);
    }
  }

  /**
   * Shuffle ensures ordering based on count of employees per department
   * hence the final reducer is a no-op and just emits the department name
   * with the employee count per department.
   */
  public static class MyOrderByNoOpReducer
      extends Reducer<IntWritable, Text, Text, IntWritable> {

    public void reduce(IntWritable key, Iterable<Text> values,
        Context context
        ) throws IOException, InterruptedException {
      for (Text word : values) {
        context.write(word, key);
      }
    }
  }

  private Credentials credentials = new Credentials();

  @VisibleForTesting
  public DAG createDAG(FileSystem fs, Configuration conf,
      Map<String, LocalResource> commonLocalResources, Path stagingDir,
      int dagIndex, String inputPath, String outputPath,
      boolean generateSplitsInClient,
      boolean useMRSettings,
      int intermediateNumReduceTasks,
      int maxDataLengthThroughIPC,
      int exceedDataLimit) throws Exception {

    Configuration mapStageConf = new JobConf(conf);
    mapStageConf.set(MRJobConfig.MAP_CLASS_ATTR,
        TokenizerMapper.class.getName());
    MRHelpers.translateMRConfToTez(mapStageConf, !useMRSettings);

    Configuration iReduceStageConf = new JobConf(conf);
    // TODO replace with auto-reduce parallelism
    iReduceStageConf.setInt(MRJobConfig.NUM_REDUCES, 2);
    iReduceStageConf.set(MRJobConfig.REDUCE_CLASS_ATTR,
        IntSumReducer.class.getName());
    iReduceStageConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, Text.class.getName());
    iReduceStageConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS,
        IntWritable.class.getName());
    iReduceStageConf.setBoolean("mapred.mapper.new-api", true);
    MRHelpers.translateMRConfToTez(iReduceStageConf, !useMRSettings);

    Configuration finalReduceConf = new JobConf(conf);
    finalReduceConf.setInt(MRJobConfig.NUM_REDUCES, 1);
    finalReduceConf.set(MRJobConfig.REDUCE_CLASS_ATTR,
        MyOrderByNoOpReducer.class.getName());
    finalReduceConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, IntWritable.class.getName());
    finalReduceConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, Text.class.getName());
    MRHelpers.translateMRConfToTez(finalReduceConf, !useMRSettings);

    MRHelpers.configureMRApiUsage(mapStageConf);
    MRHelpers.configureMRApiUsage(iReduceStageConf);
    MRHelpers.configureMRApiUsage(finalReduceConf);

    List<Vertex> vertices = new ArrayList<Vertex>();

    String mapStageHistoryText = TezUtils.convertToHistoryText("Initial Tokenizer Vertex",
        mapStageConf);
    DataSourceDescriptor dsd;
    if (generateSplitsInClient) {
      mapStageConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
          TextInputFormat.class.getName());
      mapStageConf.set(FileInputFormat.INPUT_DIR, inputPath);
      mapStageConf.setBoolean("mapred.mapper.new-api", true);
      dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(mapStageConf, stagingDir,
          true);
    } else {
      dsd = MRInputLegacy.createConfigBuilder(mapStageConf, TextInputFormat.class,
          inputPath).build();
    }
    dsd.getInputDescriptor().setHistoryText(TezUtils.convertToHistoryText(
        "HDFS Input " + inputPath, mapStageConf));

    Map<String, String> mapEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, mapEnv, true);
    Map<String, String> reduceEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, reduceEnv, false);

    Configuration copyMapStageConf = new Configuration(mapStageConf);
    setMaxDataLengthConf(copyMapStageConf, maxDataLengthThroughIPC, exceedDataLimit);

    Vertex mapVertex;
    ProcessorDescriptor mapProcessorDescriptor =
        ProcessorDescriptor.create(MapProcessor.class.getName())
            .setUserPayload(
                TezUtils.createUserPayloadFromConf(copyMapStageConf))
            .setHistoryText(mapStageHistoryText);
    if (!useMRSettings) {
      mapVertex = Vertex.create("initialmap", mapProcessorDescriptor);
    } else {
      mapVertex = Vertex.create("initialmap", mapProcessorDescriptor, -1,
          MRHelpers.getResourceForMRMapper(mapStageConf));
      mapVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRMapper(mapStageConf));
      mapVertex.setTaskEnvironment(mapEnv);
    }
    mapVertex.addTaskLocalFiles(commonLocalResources)
        .addDataSource("MRInput", dsd);
    vertices.add(mapVertex);

    Configuration copyiReduceStageConf = new Configuration(iReduceStageConf);
    setMaxDataLengthConf(copyiReduceStageConf, maxDataLengthThroughIPC, exceedDataLimit);

    String iReduceStageHistoryText = TezUtils.convertToHistoryText("Intermediate Summation Vertex",
        iReduceStageConf);
    ProcessorDescriptor iReduceProcessorDescriptor = ProcessorDescriptor.create(
        ReduceProcessor.class.getName())
        .setUserPayload(TezUtils.createUserPayloadFromConf(copyiReduceStageConf))
        .setHistoryText(iReduceStageHistoryText);

    Vertex intermediateVertex;
    if (!useMRSettings) {
      intermediateVertex = Vertex.create("intermediate_reducer", iReduceProcessorDescriptor,
          intermediateNumReduceTasks);
    } else {
      intermediateVertex = Vertex.create("intermediate_reducer", iReduceProcessorDescriptor,
          intermediateNumReduceTasks, MRHelpers.getResourceForMRReducer(iReduceStageConf));
      intermediateVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(iReduceStageConf));
      intermediateVertex.setTaskEnvironment(reduceEnv);
    }
    intermediateVertex.addTaskLocalFiles(commonLocalResources);
    vertices.add(intermediateVertex);

    Configuration copyFinalReduceConf = new Configuration(finalReduceConf);
    setMaxDataLengthConf(copyFinalReduceConf, maxDataLengthThroughIPC, exceedDataLimit);

    String finalReduceStageHistoryText = TezUtils.convertToHistoryText("Final Sorter Vertex",
        finalReduceConf);
    UserPayload finalReducePayload = TezUtils.createUserPayloadFromConf(copyFinalReduceConf);
    Vertex finalReduceVertex;

    ProcessorDescriptor finalReduceProcessorDescriptor =
        ProcessorDescriptor.create(
            ReduceProcessor.class.getName())
            .setUserPayload(finalReducePayload)
            .setHistoryText(finalReduceStageHistoryText);
    if (!useMRSettings) {
      finalReduceVertex = Vertex.create("finalreduce", finalReduceProcessorDescriptor, 1);
    } else {
      finalReduceVertex = Vertex.create("finalreduce", finalReduceProcessorDescriptor, 1,
          MRHelpers.getResourceForMRReducer(finalReduceConf));
      finalReduceVertex.setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(finalReduceConf));
      finalReduceVertex.setTaskEnvironment(reduceEnv);
    }
    finalReduceVertex.addTaskLocalFiles(commonLocalResources);
    finalReduceVertex.addDataSink("MROutput",
        MROutputLegacy.createConfigBuilder(finalReduceConf, TextOutputFormat.class, outputPath)
            .build());
    finalReduceVertex.getDataSinks().get(0).getOutputDescriptor().setHistoryText(
        TezUtils.convertToHistoryText("HDFS Output " + outputPath, finalReduceConf));
    vertices.add(finalReduceVertex);

    DAG dag = DAG.create("OrderedWordCount" + dagIndex);
    for (int i = 0; i < vertices.size(); ++i) {
      dag.addVertex(vertices.get(i));
    }

    OrderedPartitionedKVEdgeConfig edgeConf1 = OrderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(),
            HashPartitioner.class.getName()).setFromConfiguration(iReduceStageConf)
	    .configureInput().useLegacyInput().done().build();
    dag.addEdge(
        Edge.create(dag.getVertex("initialmap"), dag.getVertex("intermediate_reducer"),
            edgeConf1.createDefaultEdgeProperty()));

    OrderedPartitionedKVEdgeConfig edgeConf2 = OrderedPartitionedKVEdgeConfig
        .newBuilder(IntWritable.class.getName(), Text.class.getName(),
            HashPartitioner.class.getName()).setFromConfiguration(finalReduceConf)
            .configureInput().useLegacyInput().done().build();
    dag.addEdge(
        Edge.create(dag.getVertex("intermediate_reducer"), dag.getVertex("finalreduce"),
            edgeConf2.createDefaultEdgeProperty()));

    updateDAGACls(conf, dag, dagIndex);

    return dag;
  }
  private void setMaxDataLengthConf(Configuration config, int maxDataLengthThroughIPC, int exceedDataLimit) {
    /**
     * if -Dtez.testorderedwordcount.ipc.maximum.data.length is set by user,
     * this function sets necessary configurations as below:
     * IS_MAX_IPC_DATA_SET_BY_USER is set to true
     * EXCEED_IPC_DATA_LIMIT = <N> MB is used to test successful dag submission when MAX_IPC_DATA_LENGTH exceeds by N
     * Each vertex processor payload can be set to IPC_PAYLOAD so that the cumulative dag payload exceeds
     * the tez.testorderedwordcount.ipc.maximum.data.length set
     */
    if (maxDataLengthThroughIPC > 0) {
      config.setBoolean(IS_MAX_IPC_DATA_SET_BY_USER, true);
      config.setInt(EXCEED_IPC_DATA_LIMIT, exceedDataLimit);
      int payloadSize;
      payloadSize = (((maxDataLengthThroughIPC * 1024 * 1024) + (exceedDataLimit * 1024 * 1024)) / NO_OF_VERTICES);
      String payload = RandomStringUtils.randomAlphanumeric(payloadSize);
      config.set(IPC_PAYLOAD, payload);
    }
  }

  private void updateDAGACls(Configuration conf, DAG dag, int dagIndex) {
    LOG.info("Checking DAG specific ACLS");
    DAGAccessControls accessControls = null;
    String suffix = "." + dagIndex;
    if (conf.get(DAG_VIEW_ACLS + suffix) != null
        || conf.get(DAG_MODIFY_ACLS + suffix) != null) {
      accessControls = new DAGAccessControls(
          conf.get(DAG_VIEW_ACLS + suffix), conf.get(DAG_MODIFY_ACLS + suffix));

    } else if (conf.get(DAG_VIEW_ACLS) != null
      || conf.get(DAG_MODIFY_ACLS) != null) {
      accessControls = new DAGAccessControls(
          conf.get(DAG_VIEW_ACLS), conf.get(DAG_MODIFY_ACLS));
    }
    if (accessControls != null) {
      LOG.info("Setting DAG specific ACLS");
      dag.setAccessControls(accessControls);
    }
  }


  private static void printUsage() {
    String options = " [-generateSplitsInClient true/<false>]";
    System.err.println("Usage: testorderedwordcount <in> <out>" + options);
    System.err.println("Usage (In Session Mode):"
        + " testorderedwordcount <in1> <out1> ... <inN> <outN>" + options);
    ToolRunner.printGenericCommandUsage(System.err);
  }


  @Override
  public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    boolean generateSplitsInClient;

    SplitsInClientOptionParser splitCmdLineParser = new SplitsInClientOptionParser();
    try {
      generateSplitsInClient = splitCmdLineParser.parse(otherArgs, false);
      otherArgs = splitCmdLineParser.getRemainingArgs();
    } catch (ParseException e1) {
      System.err.println("Invalid options");
      printUsage();
      return 2;
    }

    boolean useTezSession = conf.getBoolean("USE_TEZ_SESSION", true);
    long interJobSleepTimeout = conf.getInt("INTER_JOB_SLEEP_INTERVAL", 0)
        * 1000;

    boolean retainStagingDir = conf.getBoolean("RETAIN_STAGING_DIR", false);
    boolean useMRSettings = conf.getBoolean("USE_MR_CONFIGS", true);
    // TODO needs to use auto reduce parallelism
    int intermediateNumReduceTasks = conf.getInt("IREDUCE_NUM_TASKS", 2);
    int maxDataLengthThroughIPC = conf.getInt(MAX_IPC_DATA_LENGTH, -1);
    int exceedDataLimit = conf.getInt(EXCEED_IPC_DATA_LIMIT, 3);
    if (maxDataLengthThroughIPC > 0) {
      conf.setInt(CommonConfigurationKeys.IPC_MAXIMUM_DATA_LENGTH, maxDataLengthThroughIPC * 1024 * 1024);
    }

    if (((otherArgs.length%2) != 0)
        || (!useTezSession && otherArgs.length != 2)) {
      printUsage();
      return 2;
    }

    List<String> inputPaths = new ArrayList<String>();
    List<String> outputPaths = new ArrayList<String>();
    TezConfiguration tezConf = new TezConfiguration(conf);

    for (int i = 0; i < otherArgs.length; i+=2) {
      FileSystem inputPathFs = new Path(otherArgs[i]).getFileSystem(tezConf);
      inputPaths.add(inputPathFs.makeQualified(new Path(otherArgs[i])).toString());
      FileSystem outputPathFs = new Path(otherArgs[i+1]).getFileSystem(tezConf);
      outputPaths.add(outputPathFs.makeQualified(new Path(otherArgs[i+1])).toString());
    }

    UserGroupInformation.setConfiguration(conf);
    HadoopShim hadoopShim = new HadoopShimsLoader(tezConf).getHadoopShim();
    TestOrderedWordCount instance = new TestOrderedWordCount();

    FileSystem fs = FileSystem.get(conf);

    String stagingDirStr =  conf.get(TezConfiguration.TEZ_AM_STAGING_DIR,
            TezConfiguration.TEZ_AM_STAGING_DIR_DEFAULT) + Path.SEPARATOR + 
            Long.toString(System.currentTimeMillis());
    Path stagingDir = new Path(stagingDirStr);
    FileSystem pathFs = stagingDir.getFileSystem(tezConf);
    pathFs.mkdirs(new Path(stagingDirStr));

    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stagingDirStr);
    stagingDir = pathFs.makeQualified(new Path(stagingDirStr));
    
    TokenCache.obtainTokensForNamenodes(instance.credentials, new Path[] {stagingDir}, conf);
    TezClientUtils.ensureStagingDirExists(tezConf, stagingDir);

    // No need to add jar containing this class as assumed to be part of
    // the tez jars.

    // TEZ-674 Obtain tokens based on the Input / Output paths. For now assuming staging dir
    // is the same filesystem as the one used for Input/Output.
    
    if (useTezSession) {
      LOG.info("Creating Tez Session");
      tezConf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
    } else {
      tezConf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, false);
    }
    TezClient tezSession = TezClient.create("OrderedWordCountSession", tezConf,
        null, instance.credentials);
    tezSession.start();
    if (tezSession.getAppMasterApplicationId() != null) {
      TezUtilsInternal.setHadoopCallerContext(hadoopShim, tezSession.getAppMasterApplicationId());
    }

    DAGStatus dagStatus = null;
    DAGClient dagClient = null;
    String[] vNames = { "initialmap", "intermediate_reducer",
      "finalreduce" };

    Set<StatusGetOpts> statusGetOpts = EnumSet.of(StatusGetOpts.GET_COUNTERS);
    try {
      for (int dagIndex = 1; dagIndex <= inputPaths.size(); ++dagIndex) {
        if (dagIndex != 1
            && interJobSleepTimeout > 0) {
          try {
            LOG.info("Sleeping between jobs, sleepInterval="
                + (interJobSleepTimeout/1000));
            Thread.sleep(interJobSleepTimeout);
          } catch (InterruptedException e) {
            LOG.info("Main thread interrupted. Breaking out of job loop");
            break;
          }
        }

        String inputPath = inputPaths.get(dagIndex-1);
        String outputPath = outputPaths.get(dagIndex-1);

        if (fs.exists(new Path(outputPath))) {
          throw new FileAlreadyExistsException("Output directory "
              + outputPath + " already exists");
        }
        LOG.info("Running OrderedWordCount DAG"
            + ", dagIndex=" + dagIndex
            + ", inputPath=" + inputPath
            + ", outputPath=" + outputPath);

        Map<String, LocalResource> localResources =
          new TreeMap<String, LocalResource>();
        
        DAG dag = instance.createDAG(fs, tezConf, localResources,
            stagingDir, dagIndex, inputPath, outputPath,
            generateSplitsInClient, useMRSettings, intermediateNumReduceTasks,
            maxDataLengthThroughIPC,exceedDataLimit);
        String callerType = "TestOrderedWordCount";
        String callerId = tezSession.getAppMasterApplicationId() == null ?
            ( "UnknownApp_" + System.currentTimeMillis() + dagIndex ) :
            ( tezSession.getAppMasterApplicationId().toString() + "_" + dagIndex);
        dag.setCallerContext(CallerContext.create("Tez", callerId, callerType,
            "TestOrderedWordCount Job"));

        boolean doPreWarm = dagIndex == 1 && useTezSession
            && conf.getBoolean("PRE_WARM_SESSION", true);
        int preWarmNumContainers = 0;
        if (doPreWarm) {
          preWarmNumContainers = conf.getInt("PRE_WARM_NUM_CONTAINERS", 0);
          if (preWarmNumContainers <= 0) {
            doPreWarm = false;
          }
        }
        if (doPreWarm) {
          LOG.info("Pre-warming Session");
          PreWarmVertex preWarmVertex = PreWarmVertex.create("PreWarm", preWarmNumContainers, dag
              .getVertex("initialmap").getTaskResource());
          preWarmVertex.addTaskLocalFiles(dag.getVertex("initialmap").getTaskLocalFiles());
          preWarmVertex.setTaskEnvironment(dag.getVertex("initialmap").getTaskEnvironment());
          preWarmVertex.setTaskLaunchCmdOpts(dag.getVertex("initialmap").getTaskLaunchCmdOpts());
          
          tezSession.preWarm(preWarmVertex);
        }

        if (useTezSession) {
          LOG.info("Waiting for TezSession to get into ready state");
          waitForTezSessionReady(tezSession);
          LOG.info("Submitting DAG to Tez Session, dagIndex=" + dagIndex);
          dagClient = tezSession.submitDAG(dag);
          LOG.info("Submitted DAG to Tez Session, dagIndex=" + dagIndex);
        } else {
          LOG.info("Submitting DAG as a new Tez Application");
          dagClient = tezSession.submitDAG(dag);
        }

        while (true) {
          dagStatus = dagClient.getDAGStatus(statusGetOpts);
          if (dagStatus.getState() == DAGStatus.State.RUNNING ||
              dagStatus.getState() == DAGStatus.State.SUCCEEDED ||
              dagStatus.getState() == DAGStatus.State.FAILED ||
              dagStatus.getState() == DAGStatus.State.KILLED ||
              dagStatus.getState() == DAGStatus.State.ERROR) {
            break;
          }
          try {
            Thread.sleep(500);
          } catch (InterruptedException e) {
            // continue;
          }
        }


        while (dagStatus.getState() != DAGStatus.State.SUCCEEDED &&
            dagStatus.getState() != DAGStatus.State.FAILED &&
            dagStatus.getState() != DAGStatus.State.KILLED &&
            dagStatus.getState() != DAGStatus.State.ERROR) {
          if (dagStatus.getState() == DAGStatus.State.RUNNING) {
            ExampleDriver.printDAGStatus(dagClient, vNames);
          }
          try {
            try {
              Thread.sleep(1000);
            } catch (InterruptedException e) {
              // continue;
            }
            dagStatus = dagClient.getDAGStatus(statusGetOpts);
          } catch (TezException e) {
            LOG.error("Failed to get application progress. Exiting");
            return -1;
          }
        }
        ExampleDriver.printDAGStatus(dagClient, vNames,
            true, true);
        LOG.info("DAG " + dagIndex + " completed. "
            + "FinalState=" + dagStatus.getState());
        if (dagStatus.getState() != DAGStatus.State.SUCCEEDED) {
          LOG.info("DAG " + dagIndex + " diagnostics: "
            + dagStatus.getDiagnostics());
        }
      }
    } catch (Exception e) {
      LOG.error("Error occurred when submitting/running DAGs", e);
      throw e;
    } finally {
      if (!retainStagingDir) {
        pathFs.delete(stagingDir, true);
      }
      LOG.info("Shutting down session");
      tezSession.stop();
    }

    if (!useTezSession) {
      ExampleDriver.printDAGStatus(dagClient, vNames);
      LOG.info("Application completed. " + "FinalState=" + dagStatus.getState());
    }
    return dagStatus.getState() == DAGStatus.State.SUCCEEDED ? 0 : 1;
  }

  private static void waitForTezSessionReady(TezClient tezSession)
    throws IOException, TezException, InterruptedException {
    tezSession.waitTillReady();
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new TezConfiguration(), new TestOrderedWordCount(), args);
    System.exit(res);
  }
}