/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.output;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.mapreduce.hadoop.mapred.MRReporter;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Output;
import org.apache.tez.runtime.api.OutputContext;
import org.apache.tez.runtime.library.api.IOInterruptedException;
import org.apache.tez.runtime.library.api.KeyValueWriterWithBasePath;

/**
 * {@link MultiMROutput} is an {@link Output} which allows key/value pairs
 * to be written by a processor to different output files.
 *
 * It is compatible with all standard Apache Hadoop MapReduce
 * OutputFormat implementations.
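 *
 * A minimal usage sketch from inside a processor's run() method (the
 * output name {@code "out"} and the keys, values, and base paths are
 * illustrative, not part of this class's API):
 * <pre>{@code
 * MultiMROutput output = (MultiMROutput) outputs.get("out");
 * KeyValueWriterWithBasePath writer = output.getWriter();
 * // each distinct relative basePath is routed to its own output file
 * writer.write(key1, value1, "part-a");
 * writer.write(key2, value2, "part-b");
 * }</pre>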
 */
@Public
public class MultiMROutput extends MROutput {

  Map<String, org.apache.hadoop.mapreduce.RecordWriter<?, ?>> newRecordWriters;
  Map<String, org.apache.hadoop.mapred.RecordWriter<?, ?>> oldRecordWriters;

  public MultiMROutput(OutputContext outputContext, int numPhysicalOutputs) {
    super(outputContext, numPhysicalOutputs);
  }

  @Override
  public List<Event> initialize() throws IOException, InterruptedException {
    List<Event> events = super.initializeBase();
    if (useNewApi) {
      newRecordWriters = new HashMap<>();
    } else {
      oldRecordWriters = new HashMap<>();
    }
    return events;
  }

  /**
   * Create an
   * {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}.
   *
   * @param conf Configuration for the {@link MROutput}
   * @param outputFormat OutputFormat derived class
   * @param outputPath Output path
   * @param useLazyOutputFormat whether to wrap the output format in a
   *        LazyOutputFormat, so that output files are created only when
   *        records are actually written
   * @return {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}
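   *
   * A sketch of typical wiring into a DAG (the vertex, the sink name
   * {@code "multiOut"}, and the output path are illustrative):
   * <pre>{@code
   * DataSinkDescriptor sink = MultiMROutput.createConfigBuilder(
   *     new Configuration(), TextOutputFormat.class,
   *     "/illustrative/output/path", false).build();
   * vertex.addDataSink("multiOut", sink);
   * }</pre>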
   */
  public static MROutputConfigBuilder createConfigBuilder(Configuration conf,
      Class<?> outputFormat, String outputPath, boolean useLazyOutputFormat) {
    return MROutput.createConfigBuilder(conf, outputFormat,
        outputPath, useLazyOutputFormat)
        .setOutputClassName(MultiMROutput.class.getName());
  }

  @Override
  public KeyValueWriterWithBasePath getWriter() throws IOException {
    return new KeyValueWriterWithBasePath() {

      @SuppressWarnings("unchecked")
      @Override
      public void write(Object key, Object value) throws IOException {
        throw new UnsupportedOperationException(
            "Write without basePath isn't supported.");
      }

      @SuppressWarnings("unchecked")
      @Override
      public void write(Object key, Object value, String basePath)
          throws IOException {
        if (basePath == null) {
          throw new UnsupportedOperationException(
              "Write without basePath isn't supported.");
        }
        if (basePath.length() > 0 && basePath.charAt(0) == '/') {
          // The base path can't be an absolute path starting with "/".
          // Otherwise, the task's temporary files would be written
          // outside the output committer's task work path.
          throw new UnsupportedOperationException(
              "Write with absolute basePath isn't supported.");
        }
        if (useNewApi) {
          try {
            getNewRecordWriter(newApiTaskAttemptContext, basePath).write(
                key, value);
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOInterruptedException(
                "Interrupted while writing next key-value", e);
          }
        } else {
          getOldRecordWriter(basePath).write(key, value);
        }
        outputRecordCounter.increment(1);
        getContext().notifyProgress();
      }
    };
  }

  /**
   * Call this in the processor before finishing to ensure that all outputs
   * have been flushed. Must be called before commit.
   * @throws IOException
   */
  @Override
  public void flush() throws IOException {
    if (flushed.getAndSet(true)) {
      return;
    }
    try {
      if (useNewApi) {
        for (RecordWriter writer : newRecordWriters.values()) {
          writer.close(newApiTaskAttemptContext);
        }
      } else {
        for (org.apache.hadoop.mapred.RecordWriter writer :
            oldRecordWriters.values()) {
          writer.close(null);
        }
      }
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while closing record writer", e);
    }
  }

  @SuppressWarnings("unchecked")
  private synchronized RecordWriter getNewRecordWriter(
      TaskAttemptContext taskContext, String baseFileName)
      throws IOException, InterruptedException {

    // look for record-writer in the cache
    RecordWriter writer = newRecordWriters.get(baseFileName);

    // If not in cache, create a new one
    if (writer == null) {
      // get the record writer from context output format
      taskContext.getConfiguration().set(
          MRJobConfig.FILEOUTPUTFORMAT_BASE_OUTPUT_NAME, baseFileName);
      try {
        writer = ((OutputFormat) ReflectionUtils.newInstance(
            taskContext.getOutputFormatClass(),
            taskContext.getConfiguration())).getRecordWriter(taskContext);
      } catch (ClassNotFoundException e) {
        throw new IOException(e);
      }
      // add the record-writer to the cache
      newRecordWriters.put(baseFileName, writer);
    }
    return writer;
  }

  @SuppressWarnings("unchecked")
  private synchronized org.apache.hadoop.mapred.RecordWriter getOldRecordWriter(
      String baseFileName) throws IOException {

    // look for record-writer in the cache
    org.apache.hadoop.mapred.RecordWriter writer =
        oldRecordWriters.get(baseFileName);

    // If not in cache, create a new one
    if (writer == null) {
      FileSystem fs = FileSystem.get(jobConf);
      String finalName = getOutputName(baseFileName);
      writer = oldOutputFormat.getRecordWriter(
          fs, jobConf, finalName, new MRReporter(getContext().getCounters()));
      // add the record-writer to the cache
      oldRecordWriters.put(baseFileName, writer);
    }
    return writer;
  }
}