package mapreduce;

import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;

// cc ImportFromFile MapReduce job that reads from a file and writes into a table.
// vv ImportFromFile
public class ImportFromFile {
  // ^^ ImportFromFile
  private static final Log LOG = LogFactory.getLog(ImportFromFile.class);
  // vv ImportFromFile
  public static final String NAME = "ImportFromFile"; // co ImportFromFile-1-Name Define a job name for later use.
  public enum Counters { LINES }
  // ^^ ImportFromFile

  /**
   * Implements the <code>Mapper</code> that takes the lines from the input
   * and outputs <code>Put</code> instances.
   */
  // vv ImportFromFile
  static class ImportMapper
    extends Mapper<LongWritable, Text, ImmutableBytesWritable, Mutation> { // co ImportFromFile-2-Mapper Define the mapper class, extending the provided Hadoop class.

    private byte[] family = null;
    private byte[] qualifier = null;
    // ^^ ImportFromFile

    /**
     * Prepares the column family and qualifier.
     *
     * @param context The task context.
     * @throws IOException When an operation fails - not possible here.
     * @throws InterruptedException When the task is aborted.
     */
    // vv ImportFromFile
    @Override
    protected void setup(Context context)
        throws IOException, InterruptedException {
      String column = context.getConfiguration().get("conf.column");
      byte[][] colkey = KeyValue.parseColumn(Bytes.toBytes(column));
      family = colkey[0];
      if (colkey.length > 1) {
        qualifier = colkey[1];
      }
    }
    // ^^ ImportFromFile

    /**
     * Maps the input.
     *
     * @param offset The current offset into the input file.
     * @param line The current line of the file.
     * @param context The task context.
     * @throws IOException When mapping the input fails.
     */
    // vv ImportFromFile
    @Override
    public void map(LongWritable offset, Text line, Context context) // co ImportFromFile-3-Map The map() function transforms the key/value provided by the InputFormat to what is needed by the OutputFormat.
        throws IOException {
      try {
        String lineString = line.toString();
        byte[] rowkey = DigestUtils.md5(lineString); // co ImportFromFile-4-RowKey The row key is the MD5 hash of the line to generate a random key.
        Put put = new Put(rowkey);
        put.addColumn(family, qualifier, Bytes.toBytes(lineString)); // co ImportFromFile-5-Put Store the original data in a column in the given table.
        context.write(new ImmutableBytesWritable(rowkey), put);
        context.getCounter(Counters.LINES).increment(1);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }
  // ^^ ImportFromFile

  /**
   * Parse the command line parameters.
   *
   * @param args The parameters to parse.
   * @return The parsed command line.
   * @throws ParseException When the parsing of the parameters fails.
   */
  // vv ImportFromFile
  private static CommandLine parseArgs(String[] args) throws ParseException { // co ImportFromFile-6-ParseArgs Parse the command line parameters using the Apache Commons CLI classes. These are already part of HBase and therefore are handy to process the job specific parameters.
    Options options = new Options();
    Option o = new Option("t", "table", true,
      "table to import into (must exist)");
    o.setArgName("table-name");
    o.setRequired(true);
    options.addOption(o);
    o = new Option("c", "column", true,
      "column to store row data into (must exist)");
    o.setArgName("family:qualifier");
    o.setRequired(true);
    options.addOption(o);
    o = new Option("i", "input", true,
      "the directory or file to read from");
    o.setArgName("path-in-HDFS");
    o.setRequired(true);
    options.addOption(o);
    options.addOption("d", "debug", false, "switch on DEBUG log level");
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;
    try {
      cmd = parser.parse(options, args);
    } catch (Exception e) {
      System.err.println("ERROR: " + e.getMessage() + "\n");
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(NAME + " ", options, true);
      System.exit(-1);
    }
    // ^^ ImportFromFile
    if (cmd.hasOption("d")) {
      Logger log = Logger.getLogger("mapreduce");
      log.setLevel(Level.DEBUG);
    }
    // vv ImportFromFile
    return cmd;
  }
  // ^^ ImportFromFile

  /**
   * Main entry point.
   *
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  // vv ImportFromFile
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] otherArgs =
      new GenericOptionsParser(conf, args).getRemainingArgs(); // co ImportFromFile-7-Args Give the command line arguments to the generic parser first to handle "-Dxyz" properties.
    CommandLine cmd = parseArgs(otherArgs);
    // ^^ ImportFromFile
    // check debug flag and other options
    if (cmd.hasOption("d")) conf.set("conf.debug", "true");
    // get details
    // vv ImportFromFile
    String table = cmd.getOptionValue("t");
    String input = cmd.getOptionValue("i");
    String column = cmd.getOptionValue("c");
    conf.set("conf.column", column);

    Job job = Job.getInstance(conf,
      "Import from file " + input + " into table " + table); // co ImportFromFile-8-JobDef Define the job with the required classes.
    job.setJarByClass(ImportFromFile.class);
    job.setMapperClass(ImportMapper.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    job.setNumReduceTasks(0); // co ImportFromFile-9-MapOnly This is a map only job, therefore tell the framework to bypass the reduce step.
    FileInputFormat.addInputPath(job, new Path(input));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
// ^^ ImportFromFile
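
/*
 * Example invocation (a sketch, not part of the original source): the jar
 * name, table, column, and input path below are placeholders and must be
 * adapted to the local build and cluster; the HBase client jars also need
 * to be on the job's classpath.
 *
 *   hadoop jar hbase-book-examples.jar mapreduce.ImportFromFile \
 *     -t testtable -c data:json -i /user/hadoop/input/test-data.txt
 *
 * The -t table with the -c family (and optional qualifier) must already
 * exist; -i points to the HDFS file or directory whose lines are stored
 * one per row, keyed by the MD5 hash of each line.
 */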