/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tez.examples; import com.google.common.base.Preconditions; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.ToolRunner; import org.apache.tez.client.TezClient; import org.apache.tez.dag.api.DAG; import org.apache.tez.dag.api.Edge; import org.apache.tez.dag.api.EdgeManagerPluginDescriptor; import org.apache.tez.dag.api.EdgeProperty; import org.apache.tez.dag.api.ProcessorDescriptor; import org.apache.tez.dag.api.TezConfiguration; import org.apache.tez.dag.api.UserPayload; import org.apache.tez.dag.api.Vertex; import org.apache.tez.dag.api.VertexManagerPluginDescriptor; import org.apache.tez.mapreduce.input.MRInput; import org.apache.tez.mapreduce.output.MROutput; import org.apache.tez.mapreduce.processor.SimpleMRProcessor; import org.apache.tez.runtime.api.ProcessorContext; import org.apache.tez.runtime.library.api.KeyValueReader; import org.apache.tez.runtime.library.api.KeyValueWriter; import org.apache.tez.runtime.library.api.Partitioner; import 
org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig; import org.apache.tez.runtime.library.cartesianproduct.CartesianProductEdgeManager; import org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager; import org.apache.tez.runtime.library.conf.UnorderedKVEdgeConfig; import org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig; import org.apache.tez.runtime.library.processor.SimpleProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.StringTokenizer;

/**
 * This DAG does a cartesian product of two text inputs and then filters the results according to
 * the third text input.
 *
 * <pre>
 *   V1    V2    V3
 *    \     |     /
 *   CP\  CP|    / Broadcast
 *      \   |   /
 *      Vertex 4
 * </pre>
 *
 * Vertices 1~3 are tokenizers and each of them tokenizes input from one directory. In the
 * partitioned case, CustomPartitioner separates tokens into 2 partitions according to the parity
 * of each token's first char. Vertex 4 does the cartesian product of the input from vertex 1 and
 * vertex 2, and generates KV pairs where keys are vertex 1 tokens and values are vertex 2 tokens.
 * Then vertex 4 outputs the KV pairs whose keys appear in the vertex 3 tokens.
*/ public class CartesianProduct extends TezExampleBase { private static final String INPUT = "Input1"; private static final String OUTPUT = "Output"; private static final String VERTEX1 = "Vertex1"; private static final String VERTEX2 = "Vertex2"; private static final String VERTEX3 = "Vertex3"; private static final String VERTEX4 = "Vertex4"; private static final String PARTITIONED = "-partitioned"; private static final String UNPARTITIONED = "-unpartitioned"; private static final Logger LOG = LoggerFactory.getLogger(CartesianProduct.class); private static final int numPartition = 2; private static final String[] cpSources = new String[] {VERTEX1, VERTEX2}; public static class TokenProcessor extends SimpleProcessor { public TokenProcessor(ProcessorContext context) { super(context); } @Override public void run() throws Exception { Preconditions.checkArgument(getInputs().size() == 1); Preconditions.checkArgument(getOutputs().size() == 1); KeyValueReader kvReader = (KeyValueReader) getInputs().get(INPUT).getReader(); KeyValueWriter kvWriter = (KeyValueWriter) getOutputs().get(VERTEX4).getWriter(); while (kvReader.next()) { StringTokenizer itr = new StringTokenizer(kvReader.getCurrentValue().toString()); while (itr.hasMoreTokens()) { kvWriter.write(new Text(itr.nextToken()), new IntWritable(1)); } } } } public static class JoinProcessor extends SimpleMRProcessor { public JoinProcessor(ProcessorContext context) { super(context); } @Override public void run() throws Exception { KeyValueWriter kvWriter = (KeyValueWriter) getOutputs().get(OUTPUT).getWriter(); KeyValueReader kvReader1 = (KeyValueReader) getInputs().get(VERTEX1).getReader(); KeyValueReader kvReader2 = (KeyValueReader) getInputs().get(VERTEX2).getReader(); KeyValueReader kvReader3 = (KeyValueReader) getInputs().get(VERTEX3).getReader(); Set<String> v2TokenSet = new HashSet<>(); Set<String> v3TokenSet = new HashSet<>(); while (kvReader2.next()) { v2TokenSet.add(kvReader2.getCurrentKey().toString()); } while 
(kvReader3.next()) { v3TokenSet.add(kvReader3.getCurrentKey().toString()); } while (kvReader1.next()) { String left = kvReader1.getCurrentKey().toString(); if (v3TokenSet.contains(left)) { for (String right : v2TokenSet) { kvWriter.write(left, right); } } } } } public static class CustomPartitioner implements Partitioner { @Override public int getPartition(Object key, Object value, int numPartitions) { return key.toString().charAt(0) % numPartition; } } private DAG createDAG(TezConfiguration tezConf, String inputPath1, String inputPath2, String inputPath3, String outputPath, boolean isPartitioned) throws IOException { Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName())); // turn off groupSplit so that each input file incurs one task v1.addDataSource(INPUT, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath1) .groupSplits(false).build()); Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName())); v2.addDataSource(INPUT, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath2) .groupSplits(false).build()); Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(TokenProcessor.class.getName())); v3.addDataSource(INPUT, MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath3) .groupSplits(false).build()); CartesianProductConfig cartesianProductConfig; if (isPartitioned) { Map<String, Integer> vertexPartitionMap = new HashMap<>(); for (String vertex : cpSources) { vertexPartitionMap.put(vertex, numPartition); } cartesianProductConfig = new CartesianProductConfig(vertexPartitionMap); } else { cartesianProductConfig = new CartesianProductConfig(Arrays.asList(cpSources)); } UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf); Vertex v4 = Vertex.create(VERTEX4, ProcessorDescriptor.create(JoinProcessor.class.getName())); v4.addDataSink(OUTPUT, 
MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath) .build()); v4.setVertexManagerPlugin( VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()) .setUserPayload(userPayload)); EdgeManagerPluginDescriptor cpEdgeManager = EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName()); cpEdgeManager.setUserPayload(userPayload); EdgeProperty cpEdgeProperty; if (isPartitioned) { UnorderedPartitionedKVEdgeConfig cpEdgeConf = UnorderedPartitionedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName(), CustomPartitioner.class.getName()).build(); cpEdgeProperty = cpEdgeConf.createDefaultCustomEdgeProperty(cpEdgeManager); } else { UnorderedKVEdgeConfig edgeConf = UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build(); cpEdgeProperty = edgeConf.createDefaultCustomEdgeProperty(cpEdgeManager); } EdgeProperty broadcastEdgeProperty; UnorderedKVEdgeConfig broadcastEdgeConf = UnorderedKVEdgeConfig.newBuilder(Text.class.getName(), IntWritable.class.getName()).build(); broadcastEdgeProperty = broadcastEdgeConf.createDefaultBroadcastEdgeProperty(); return DAG.create("CartesianProduct") .addVertex(v1).addVertex(v2).addVertex(v3).addVertex(v4) .addEdge(Edge.create(v1, v4, cpEdgeProperty)) .addEdge(Edge.create(v2, v4, cpEdgeProperty)) .addEdge(Edge.create(v3, v4, broadcastEdgeProperty)); } @Override protected void printUsage() { System.err.println("Usage: args: ["+PARTITIONED + "|" + UNPARTITIONED + " <input_dir1> <input_dir2> <input_dir3> <output_dir>"); } @Override protected int validateArgs(String[] otherArgs) { return (otherArgs.length != 5 || (!otherArgs[0].equals(PARTITIONED) && !otherArgs[0].equals(UNPARTITIONED))) ? 
-1 : 0; } @Override protected int runJob(String[] args, TezConfiguration tezConf, TezClient tezClient) throws Exception { DAG dag = createDAG(tezConf, args[1], args[2], args[3], args[4], args[0].equals(PARTITIONED)); return runDag(dag, isCountersLog(), LOG); } public static void main(String[] args) throws Exception { int res = ToolRunner.run(new Configuration(), new CartesianProduct(), args); System.exit(res); } }