package eu.dnetlib.iis.common.spark.pipe;

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;

import eu.dnetlib.iis.common.utils.AvroUtils;

/**
 * Spark job that works exactly like a Hadoop streaming job: input records are
 * piped through an external mapper script and an external reducer script.<br/>
 * <br/>
 * Job parameters:<br/>
 * -inputAvroPath - directory containing the Avro input files<br/>
 * -inputAvroSchemaClass - Avro schema class of the input records<br/>
 * -outputAvroPath - directory for the Avro output files<br/>
 * -outputAvroSchemaClass - Avro schema class of the produced output records<br/>
 * -mapperScript - path to the map script<br/>
 * -mapperScriptArgs - arguments for the map script<br/>
 * -reducerScript - path to the reduce script<br/>
 * -reducerScriptArgs - arguments for the reduce script<br/>
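 * <br/>
 * An illustrative invocation; the jar name, paths and schema classes below are
 * placeholders, not real artifacts:<br/>
 * <pre>
 * spark-submit --class eu.dnetlib.iis.common.spark.pipe.SparkPipeMapReduce some-job.jar \
 *     -inputAvroPath=/data/input -inputAvroSchemaClass=org.example.avro.InputRecord \
 *     -outputAvroPath=/data/output -outputAvroSchemaClass=org.example.avro.OutputRecord \
 *     -mapperScript=scripts/mapper.py -mapperScriptArgs="--threshold 5" \
 *     -reducerScript=scripts/reducer.py -reducerScriptArgs="--mode sum"
 * </pre>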
 *
 * @author madryk
 *
 */
public final class SparkPipeMapReduce {

    //------------------------ CONSTRUCTORS -------------------

    private SparkPipeMapReduce() {}

    //------------------------ LOGIC --------------------------

    public static void main(String[] args) throws IOException, ClassNotFoundException {

        SparkPipeMapReduceParameters params = parseParameters(args);
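
        // Serialize with Kryo, using an Avro-compatible registrator so that
        // Avro records can be handled by Kryo when shuffled between stages.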
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", "pl.edu.icm.sparkutils.avro.AvroCompatibleKryoRegistrator");
Class<? extends GenericRecord> outputAvroClass = Class.forName(params.outputAvroSchemaClass).asSubclass(GenericRecord.class);
Schema inputSchema = AvroUtils.toSchema(params.inputAvroSchemaClass);
Schema outputSchema = AvroUtils.toSchema(params.outputAvroSchemaClass);
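
        // The Hadoop Job is never submitted; it is used only as a builder for the
        // Configuration that carries the Avro input and output key schemas.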
        Job job = Job.getInstance();
        AvroJob.setInputKeySchema(job, inputSchema);
        AvroJob.setOutputKeySchema(job, outputSchema);

        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            sc.addFile(params.mapperScript);
            sc.addFile(params.reducerScript);

            String mapperScriptName = new File(params.mapperScript).getName();
            String reducerScriptName = new File(params.reducerScript).getName();

            SparkPipeExecutor pipeExecutor = new SparkPipeExecutor();
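
            // AvroKeyInputFormat is passed as a raw class, so the call below is an
            // unchecked invocation returning a raw RDD; the cast is safe because the
            // input format produces AvroKey<GenericRecord> keys with NullWritable values.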
            @SuppressWarnings("unchecked")
            JavaPairRDD<AvroKey<GenericRecord>, NullWritable> inputRecords =
                    (JavaPairRDD<AvroKey<GenericRecord>, NullWritable>) sc.newAPIHadoopFile(params.inputAvroPath,
                            AvroKeyInputFormat.class, GenericRecord.class, NullWritable.class, job.getConfiguration());
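
            // Records are piped through the external mapper script and then the external
            // reducer script, mirroring the map and reduce phases of Hadoop streaming.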
            JavaPairRDD<String, String> mappedRecords =
                    pipeExecutor.doMap(inputRecords, mapperScriptName, params.mapperScriptArgs);
            JavaPairRDD<AvroKey<GenericRecord>, NullWritable> reducedRecords =
                    pipeExecutor.doReduce(mappedRecords, reducerScriptName, params.reducerScriptArgs, outputAvroClass);

            reducedRecords.saveAsNewAPIHadoopFile(params.outputAvroPath, AvroKey.class, NullWritable.class,
                    AvroKeyOutputFormat.class, job.getConfiguration());
        }
    }

    //------------------------ PRIVATE --------------------------

    private static SparkPipeMapReduceParameters parseParameters(String[] args) {
        SparkPipeMapReduceParameters params = new SparkPipeMapReduceParameters();

        JCommander jcommander = new JCommander(params);
        jcommander.parse(args);

        return params;
    }

    @Parameters(separators = "=")
    private static class SparkPipeMapReduceParameters {

        @Parameter(names = "-inputAvroPath", required = true)
        private String inputAvroPath;

        @Parameter(names = "-inputAvroSchemaClass", required = true)
        private String inputAvroSchemaClass;

        @Parameter(names = "-outputAvroPath", required = true)
        private String outputAvroPath;

        @Parameter(names = "-outputAvroSchemaClass", required = true)
        private String outputAvroSchemaClass;

        @Parameter(names = "-mapperScript", required = true)
        private String mapperScript;

        @Parameter(names = "-mapperScriptArgs", required = false)
        private String mapperScriptArgs;

        @Parameter(names = "-reducerScript", required = true)
        private String reducerScript;

        @Parameter(names = "-reducerScriptArgs", required = false)
        private String reducerScriptArgs;
    }
}