/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.output;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.mapreduce.hadoop.mapred.MRReporter;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Output;
import org.apache.tez.runtime.api.OutputContext;
import org.apache.tez.runtime.library.api.IOInterruptedException;
import org.apache.tez.runtime.library.api.KeyValueWriterWithBasePath;

/**
 * {@link MultiMROutput} is an {@link Output} which allows key/value pairs
 * to be written by a processor to different output files.
 *
 * It is compatible with all standard Apache Hadoop MapReduce
 * OutputFormat implementations.
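 *
 * A minimal usage sketch from inside a processor's run() method (the
 * output name {@code "out"} and the keys, values, and base paths are
 * illustrative, not part of this class's API):
 * <pre>{@code
 * MultiMROutput output = (MultiMROutput) outputs.get("out");
 * KeyValueWriterWithBasePath writer = output.getWriter();
 * // each distinct relative basePath is routed to its own output file
 * writer.write(key1, value1, "part-a");
 * writer.write(key2, value2, "part-b");
 * }</pre>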
 */
@Public
public class MultiMROutput extends MROutput {

  Map<String, org.apache.hadoop.mapreduce.RecordWriter<?, ?>> newRecordWriters;
  Map<String, org.apache.hadoop.mapred.RecordWriter<?, ?>> oldRecordWriters;

  public MultiMROutput(OutputContext outputContext, int numPhysicalOutputs) {
    super(outputContext, numPhysicalOutputs);
  }

  @Override
  public List<Event> initialize() throws IOException, InterruptedException {
    List<Event> events = super.initializeBase();
    if (useNewApi) {
      newRecordWriters = new HashMap<>();
    } else {
      oldRecordWriters = new HashMap<>();
    }
    return events;
  }

  /**
   * Create an
   * {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}.
   *
   * @param conf Configuration for the {@link MROutput}
   * @param outputFormat OutputFormat derived class
   * @param outputPath Output path
   * @param useLazyOutputFormat whether to wrap the output format in a
   *        LazyOutputFormat, so that output files are created only when
   *        records are actually written
   * @return {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}
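   *
   * A sketch of typical wiring into a DAG (the vertex, the sink name
   * {@code "multiOut"}, and the output path are illustrative):
   * <pre>{@code
   * DataSinkDescriptor sink = MultiMROutput.createConfigBuilder(
   *     new Configuration(), TextOutputFormat.class,
   *     "/illustrative/output/path", false).build();
   * vertex.addDataSink("multiOut", sink);
   * }</pre>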
   */
  public static MROutputConfigBuilder createConfigBuilder(Configuration conf,
      Class<?> outputFormat, String outputPath, boolean useLazyOutputFormat) {
    return MROutput.createConfigBuilder(conf, outputFormat,
        outputPath, useLazyOutputFormat)
        .setOutputClassName(MultiMROutput.class.getName());
  }

  @Override
  public KeyValueWriterWithBasePath getWriter() throws IOException {
    return new KeyValueWriterWithBasePath() {

      @SuppressWarnings("unchecked")
      @Override
      public void write(Object key, Object value) throws IOException {
        throw new UnsupportedOperationException(
            "Write without basePath isn't supported.");
      }

      @SuppressWarnings("unchecked")
      @Override
      public void write(Object key, Object value, String basePath)
          throws IOException {
        if (basePath == null) {
          throw new UnsupportedOperationException(
              "Write without basePath isn't supported.");
        }
        if (basePath.length() > 0 && basePath.charAt(0) == '/') {
          // The base path can't be an absolute path starting with "/".
          // Otherwise, the task's temporary files would be written
          // outside the output committer's task work path.
          throw new UnsupportedOperationException(
              "Write with absolute basePath isn't supported.");
        }
        if (useNewApi) {
          try {
            getNewRecordWriter(newApiTaskAttemptContext, basePath).write(
                key, value);
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOInterruptedException(
                "Interrupted while writing next key-value", e);
          }
        } else {
          getOldRecordWriter(basePath).write(key, value);
        }
        outputRecordCounter.increment(1);
        getContext().notifyProgress();
      }
    };
  }

  /**
   * Call this in the processor before finishing to ensure that all outputs
   * have been flushed. Must be called before commit.
   * @throws IOException
   */
  @Override
  public void flush() throws IOException {
    if (flushed.getAndSet(true)) {
      return;
    }
    try {
      if (useNewApi) {
        for (RecordWriter writer : newRecordWriters.values()) {
          writer.close(newApiTaskAttemptContext);
        }
      } else {
        for (org.apache.hadoop.mapred.RecordWriter writer :
            oldRecordWriters.values()) {
          writer.close(null);
        }
      }
    } catch (InterruptedException e) {
      throw new IOException("Interrupted while closing record writer", e);
    }
  }

  @SuppressWarnings("unchecked")
  private synchronized RecordWriter getNewRecordWriter(
      TaskAttemptContext taskContext, String baseFileName)
      throws IOException, InterruptedException {

    // look for record-writer in the cache
    RecordWriter writer = newRecordWriters.get(baseFileName);

    // If not in cache, create a new one
    if (writer == null) {
      // get the record writer from context output format
      taskContext.getConfiguration().set(
          MRJobConfig.FILEOUTPUTFORMAT_BASE_OUTPUT_NAME, baseFileName);
      try {
        writer = ((OutputFormat) ReflectionUtils.newInstance(
            taskContext.getOutputFormatClass(),
            taskContext.getConfiguration())).getRecordWriter(taskContext);
      } catch (ClassNotFoundException e) {
        throw new IOException(e);
      }
      // add the record-writer to the cache
      newRecordWriters.put(baseFileName, writer);
    }
    return writer;
  }

  @SuppressWarnings("unchecked")
  private synchronized org.apache.hadoop.mapred.RecordWriter getOldRecordWriter(
      String baseFileName) throws IOException {

    // look for record-writer in the cache
    org.apache.hadoop.mapred.RecordWriter writer =
        oldRecordWriters.get(baseFileName);

    // If not in cache, create a new one
    if (writer == null) {
      FileSystem fs = FileSystem.get(jobConf);
      String finalName = getOutputName(baseFileName);
      writer = oldOutputFormat.getRecordWriter(
          fs, jobConf, finalName, new MRReporter(getContext().getCounters()));
      // add the record-writer to the cache
      oldRecordWriters.put(baseFileName, writer);
    }
    return writer;
  }
}