/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.committer;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tez.common.TezUtils;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.dag.api.client.VertexStatus;
import org.apache.tez.mapreduce.hadoop.MRConfig;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.runtime.api.OutputCommitter;
import org.apache.tez.runtime.api.OutputCommitterContext;

import java.io.IOException;

/**
 * Implements the {@link OutputCommitter} and provides MapReduce-compatible
 * output commit operations for MapReduce-compatible data sinks.
 */
@Public
public class MROutputCommitter extends OutputCommitter {

  private static final Logger LOG = LoggerFactory.getLogger(MROutputCommitter.class);

  private org.apache.hadoop.mapreduce.OutputCommitter committer = null;
  private JobContext jobContext = null;
  private volatile boolean initialized = false;
  private JobConf jobConf = null;
  private boolean newApiCommitter;

  public MROutputCommitter(OutputCommitterContext committerContext) {
    super(committerContext);
  }

  @Override
  public void initialize() throws IOException {
    UserPayload userPayload = getContext().getOutputUserPayload();
    if (!userPayload.hasPayload()) {
      jobConf = new JobConf();
    } else {
      jobConf = new JobConf(TezUtils.createConfFromUserPayload(userPayload));
    }
    // Read all credentials into the credentials instance stored in JobConf.
    jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());
    jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, getContext().getDAGAttemptNumber());
    committer = getOutputCommitter(getContext());
    jobContext = getJobContextFromVertexContext(getContext());
    initialized = true;
  }
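
  /*
   * Illustrative note (not part of the original source): the user payload
   * consumed by initialize() is typically produced on the client side when
   * the MR-style data sink is configured, e.g. via MROutput's config builder,
   * which serializes the JobConf into the sink descriptor. A minimal sketch,
   * where TextOutputFormat, tezConf, outputPath, and vertex are placeholders
   * for the job's actual output format, configuration, path, and vertex:
   *
   *   DataSinkDescriptor sink = MROutput
   *       .createConfigBuilder(new Configuration(tezConf),
   *           TextOutputFormat.class, outputPath)
   *       .build();
   *   vertex.addDataSink("MROutput", sink);
   *
   * TezUtils.createConfFromUserPayload(...) above performs the inverse of
   * that serialization.
   */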
  @Override
  public void setupOutput() throws IOException {
    if (!initialized) {
      throw new RuntimeException("Committer not initialized");
    }
    committer.setupJob(jobContext);
  }

  @Override
  public void commitOutput() throws IOException {
    if (!initialized) {
      throw new RuntimeException("Committer not initialized");
    }
    committer.commitJob(jobContext);
  }

  @Override
  public void abortOutput(VertexStatus.State finalState) throws IOException {
    if (!initialized) {
      throw new RuntimeException("Committer not initialized");
    }
    JobStatus.State jobState = getJobStateFromVertexStatusState(finalState);
    committer.abortJob(jobContext, jobState);
  }

  @SuppressWarnings("rawtypes")
  private org.apache.hadoop.mapreduce.OutputCommitter getOutputCommitter(
      OutputCommitterContext context) {
    org.apache.hadoop.mapreduce.OutputCommitter committer = null;
    newApiCommitter = false;
    if (jobConf.getBoolean("mapred.reducer.new-api", false)
        || jobConf.getBoolean("mapred.mapper.new-api", false)) {
      newApiCommitter = true;
    }
    LOG.info("Committer for " + getContext().getVertexName() + ":" + getContext().getOutputName()
        + " using " + (newApiCommitter ? "new" : "old") + " mapred API");

    if (newApiCommitter) {
      TaskAttemptID taskAttemptID = new TaskAttemptID(
          Long.toString(context.getApplicationId().getClusterTimestamp()),
          context.getApplicationId().getId(),
          ((jobConf.getBoolean(MRConfig.IS_MAP_PROCESSOR, false) ?
              TaskType.MAP : TaskType.REDUCE)),
          0, context.getDAGAttemptNumber());
      TaskAttemptContext taskContext = new TaskAttemptContextImpl(jobConf, taskAttemptID);
      try {
        OutputFormat outputFormat = ReflectionUtils.newInstance(
            taskContext.getOutputFormatClass(), jobConf);
        committer = outputFormat.getOutputCommitter(taskContext);
      } catch (Exception e) {
        throw new TezUncheckedException(e);
      }
    } else {
      committer = ReflectionUtils.newInstance(jobConf.getClass(
          "mapred.output.committer.class", FileOutputCommitter.class,
          org.apache.hadoop.mapred.OutputCommitter.class), jobConf);
    }
    LOG.info("OutputCommitter for outputName=" + context.getOutputName()
        + ", vertexName=" + context.getVertexName()
        + ", outputCommitterClass=" + committer.getClass().getName());
    return committer;
  }
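
  /*
   * Configuration sketch (illustrative, not part of the original source):
   * with the old mapred API, the committer class resolved above can be
   * overridden in the job configuration, falling back to FileOutputCommitter
   * otherwise:
   *
   *   // MyCommitter is a hypothetical org.apache.hadoop.mapred.OutputCommitter.
   *   jobConf.setOutputCommitter(MyCommitter.class); // sets "mapred.output.committer.class"
   *
   * With the new API, the committer comes from the configured OutputFormat's
   * getOutputCommitter(TaskAttemptContext) instead.
   */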
  // FIXME we are using ApplicationId as DAG id
  private JobContext getJobContextFromVertexContext(OutputCommitterContext context)
      throws IOException {
    JobID jobId = TypeConverter.fromYarn(context.getApplicationId());
    return new MRJobContextImpl(jobConf, jobId);
  }

  private JobStatus.State getJobStateFromVertexStatusState(VertexStatus.State state) {
    switch (state) {
      case INITED:
        return JobStatus.State.PREP;
      case RUNNING:
        return JobStatus.State.RUNNING;
      case SUCCEEDED:
        return JobStatus.State.SUCCEEDED;
      case KILLED:
        return JobStatus.State.KILLED;
      case FAILED:
      case ERROR:
        return JobStatus.State.FAILED;
      default:
        throw new TezUncheckedException("Unknown VertexStatus.State: " + state);
    }
  }

  private static class MRJobContextImpl extends org.apache.hadoop.mapred.JobContextImpl {
    public MRJobContextImpl(JobConf jobConf, JobID jobId) {
      super(jobConf, jobId);
    }
  }

  @SuppressWarnings("deprecation")
  @Override
  public boolean isTaskRecoverySupported() {
    if (!initialized) {
      throw new RuntimeException("Committer not initialized");
    }
    return committer.isRecoverySupported();
  }

  @Override
  public void recoverTask(int taskIndex, int attemptId) throws IOException {
    if (!initialized) {
      throw new RuntimeException("Committer not initialized");
    }
    TaskAttemptID taskAttemptID = new TaskAttemptID(
        Long.toString(getContext().getApplicationId().getClusterTimestamp())
            + String.valueOf(getContext().getVertexIndex()),
        getContext().getApplicationId().getId(),
        ((jobConf.getBoolean(MRConfig.IS_MAP_PROCESSOR, false) ?
            TaskType.MAP : TaskType.REDUCE)),
        taskIndex, attemptId);
    TaskAttemptContext taskContext = new TaskAttemptContextImpl(jobConf, taskAttemptID);
    committer.recoverTask(taskContext);
  }
}
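
/*
 * Lifecycle sketch (illustrative, driven by the Tez framework rather than by
 * user code): initialize() is called first, then setupOutput() before tasks
 * run; on vertex success the framework invokes commitOutput(), and on failure
 * or kill it invokes abortOutput(finalState), mirroring the
 * setupJob/commitJob/abortJob contract of the wrapped MapReduce
 * OutputCommitter.
 */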