package gr.iti.mklab.visual.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

/**
 * A Hadoop visual feature extraction job.
 *
 * @author Katerina Andreadou
 */
public class VisualJob extends Configured implements Tool {

    // LOCAL CONFIGURATION
    //public final static String LEARNING_FILES_PATH = "/home/kandreadou/webservice/learning_files/";

    // AMAZON ELASTIC MAPREDUCE CONFIGURATION
    //https://s3.amazonaws.com/gr.iti.mklab/learningfiles/pq_1024_64x8_rp_ivf_8192k.csv
    public final static String LEARNING_FILES_PATH = "";
    public final static boolean IS_LOCAL = false;

    /**
     * Main entry point that uses the {@link org.apache.hadoop.util.ToolRunner} class to run the Hadoop job.
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new VisualJob(), args);
        System.exit(res);
    }

    @Override
    public int run(String[] args) throws Exception {
        String inputPath = args[0];
        String outputPath = args[1];

        if (!IS_LOCAL && args.length >= 3) {
            String configFile = args[2];
            if (configFile != null) {
                getConf().addResource(configFile);
            }
            // The learning files have to be uploaded to the S3 bucket first.
            // Then, when starting the job, they have to be added to the Hadoop distributed cache.
            DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_0.csv#surf_l2_128c_0.csv"), getConf());
            DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_1.csv#surf_l2_128c_1.csv"), getConf());
            DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_2.csv#surf_l2_128c_2.csv"), getConf());
            DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/surf_l2_128c_3.csv#surf_l2_128c_3.csv"), getConf());
            DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/pca_surf_4x128_32768to1024.txt#pca_surf_4x128_32768to1024.txt"), getConf());
            DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/qcoarse_1024d_8192k.csv#qcoarse_1024d_8192k.csv"), getConf());
            DistributedCache.addCacheFile(new URI("s3n://gr-mklab/learningfiles/pq_1024_64x8_rp_ivf_8192k.csv#pq_1024_64x8_rp_ivf_8192k.csv"), getConf());
        }

        Job job = createJob(inputPath, outputPath);
        return job.waitForCompletion(true) ? 0 : -1;
    }
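
    /*
     * Example invocation (a sketch, not taken from this repository -- the jar name and the
     * input/output paths below are assumptions): the learning files are uploaded to the S3
     * bucket once, and the job is then submitted with the input path, output path and an
     * optional Hadoop configuration file as arguments, e.g.:
     *
     *   hadoop jar visual-extraction-job.jar gr.iti.mklab.visual.mapreduce.VisualJob \
     *       s3n://gr-mklab/input/urls.txt s3n://gr-mklab/output/features conf.xml
     */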

    private Job createJob(String inputPath, String outputPath) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf);
        job.setJarByClass(VisualJob.class);
        job.setNumReduceTasks(90);

        // Delete any previous output so the job can be re-run with the same output path.
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FloatArrayWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(VisualThreadedMapper.class);
        job.setReducerClass(VisualReducer.class);

        return job;
    }

    public static float[] castToFloat(double[] doubleArray) {
        float[] floatArray = new float[doubleArray.length];
        for (int i = 0; i < doubleArray.length; i++) {
            floatArray[i] = (float) doubleArray[i];
        }
        return floatArray;
    }
}
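
/*
 * Note: FloatArrayWritable, the map output value class wired up above, is defined elsewhere
 * in this package. For reference only -- this is an assumption about its shape, not this
 * repository's actual implementation -- such a value type is commonly a thin ArrayWritable
 * wrapper around FloatWritable elements, e.g.:
 *
 *   public class FloatArrayWritable extends ArrayWritable {
 *       public FloatArrayWritable() { super(FloatWritable.class); }
 *       public FloatArrayWritable(FloatWritable[] values) { super(FloatWritable.class, values); }
 *   }
 */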