package mapreduce;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

import java.io.IOException;

// cc AnalyzeSnapshotData MapReduce job that reads the data from a snapshot and analyzes it.
public class AnalyzeSnapshotData {

  private static final Log LOG = LogFactory.getLog(AnalyzeSnapshotData.class);

  public static final String NAME = "AnalyzeSnapshotData";
  public enum Counters { ROWS, COLS, ERROR, VALID }

  /**
   * Implements the <code>Mapper</code> that reads the data and extracts the
   * required information.
   */
  static class AnalyzeMapper extends TableMapper<Text, IntWritable> {
    private JSONParser parser = new JSONParser();
    private IntWritable ONE = new IntWritable(1);

    /**
     * Maps the input.
     *
     * @param row The row key.
     * @param columns The columns of the row.
     * @param context The task context.
     * @throws IOException When mapping the input fails.
     */
    @Override
    public void map(ImmutableBytesWritable row, Result columns, Context context)
    throws IOException {
      context.getCounter(Counters.ROWS).increment(1);
      String value = null;
      try {
        for (Cell cell : columns.listCells()) {
          context.getCounter(Counters.COLS).increment(1);
          value = Bytes.toStringBinary(cell.getValueArray(),
            cell.getValueOffset(), cell.getValueLength());
          JSONObject json = (JSONObject) parser.parse(value);
          String author = (String) json.get("author");
          if (context.getConfiguration().get("conf.debug") != null)
            System.out.println("Author: " + author);
          context.write(new Text(author), ONE);
          context.getCounter(Counters.VALID).increment(1);
        }
      } catch (Exception e) {
        e.printStackTrace();
        System.err.println("Row: " + Bytes.toStringBinary(row.get()) +
          ", JSON: " + value);
        context.getCounter(Counters.ERROR).increment(1);
      }
    }
    /*
    {
      "updated": "Mon, 14 Sep 2009 17:09:02 +0000",
      "links": [{
        "href": "http://www.webdesigndev.com/",
        "type": "text/html",
        "rel": "alternate"
      }],
      "title": "Web Design Tutorials | Creating a Website | Learn Adobe Flash, Photoshop and Dreamweaver",
      "author": "outernationalist",
      "comments": "http://delicious.com/url/e104984ea5f37cf8ae70451a619c9ac0",
      "guidislink": false,
      "title_detail": {
        "base": "http://feeds.delicious.com/v2/rss/recent?min=1&count=100",
        "type": "text/plain",
        "language": null,
        "value": "Web Design Tutorials | Creating a Website | Learn Adobe Flash, Photoshop and Dreamweaver"
      },
      "link": "http://www.webdesigndev.com/",
      "source": {},
      "wfw_commentrss": "http://feeds.delicious.com/v2/rss/url/e104984ea5f37cf8ae70451a619c9ac0",
      "id": "http://delicious.com/url/e104984ea5f37cf8ae70451a619c9ac0#outernationalist"
    }
    */
  }

  /**
   * Implements the <code>Reducer</code> part of the process.
   */
  static class AnalyzeReducer
  extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Aggregates the counts.
     *
     * @param key The author.
     * @param values The counts for the author.
     * @param context The current task context.
     * @throws IOException When reading or writing the data fails.
     * @throws InterruptedException When the task is aborted.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
      Context context) throws IOException, InterruptedException {
      int count = 0;
      for (IntWritable one : values) count++;
      if (context.getConfiguration().get("conf.debug") != null)
        System.out.println("Author: " + key.toString() + ", Count: " + count);
      context.write(key, new IntWritable(count));
    }
  }

  /**
   * Parse the command line parameters.
   *
   * @param args The parameters to parse.
   * @return The parsed command line.
   * @throws ParseException When the parsing of the parameters fails.
   */
  private static CommandLine parseArgs(String[] args) throws ParseException {
    Options options = new Options();
    Option o = new Option("t", "table", true,
      "table to snapshot (must exist)");
    o.setArgName("table-name");
    o.setRequired(true);
    options.addOption(o);
    o = new Option("s", "snapshot", true,
      "name of the snapshot");
    o.setArgName("snapshot-name");
    options.addOption(o);
    o = new Option("b", "restoredir", true,
      "name of restore directory");
    o.setArgName("restoredir-name");
    options.addOption(o);
    o = new Option("c", "column", true,
      "column to read data from (must exist)");
    o.setArgName("family:qualifier");
    options.addOption(o);
    o = new Option("o", "output", true,
      "the directory to write to");
    o.setArgName("path-in-HDFS");
    o.setRequired(true);
    options.addOption(o);
    options.addOption("x", "cleanup", false,
      "remove snapshot after job completion");
    options.addOption("d", "debug", false,
      "switch on DEBUG log level");
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;
    try {
      cmd = parser.parse(options, args);
    } catch (Exception e) {
      System.err.println("ERROR: " + e.getMessage() + "\n");
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(NAME + " ", options, true);
      System.exit(-1);
    }
    if (cmd.hasOption("d")) {
      Logger log = Logger.getLogger("mapreduce");
      log.setLevel(Level.DEBUG);
      System.out.println("DEBUG ON");
    }
    return cmd;
  }

  /**
   * Main entry point.
   *
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    // vv AnalyzeSnapshotData
    Configuration conf = HBaseConfiguration.create();
    String[] otherArgs =
      new GenericOptionsParser(conf, args).getRemainingArgs();
    CommandLine cmd = parseArgs(otherArgs);
    if (cmd.hasOption("d")) conf.set("conf.debug", "true");
    String table = cmd.getOptionValue("t");
    long time = System.currentTimeMillis();
    String tmpName = "snapshot-" + table + "-" + time; // co AnalyzeSnapshotData-1-TmpName Compute a name for the snapshot and restore directory, if not specified otherwise.
    String snapshot = cmd.getOptionValue("s", tmpName);
    Path restoreDir = new Path(cmd.getOptionValue("b", "/tmp/" + tmpName));
    String column = cmd.getOptionValue("c");
    String output = cmd.getOptionValue("o");
    // The "x" flag takes no argument, so check for its presence; reading it
    // with getOptionValue("x") would always yield null (and cleanup false).
    boolean cleanup = cmd.hasOption("x");
    /*...*/
    // ^^ AnalyzeSnapshotData
    Scan scan = new Scan();
    if (column != null) {
      byte[][] colkey = KeyValue.parseColumn(Bytes.toBytes(column));
      if (colkey.length > 1) {
        scan.addColumn(colkey[0], colkey[1]);
      } else {
        scan.addFamily(colkey[0]);
      }
    }
    // vv AnalyzeSnapshotData
    Connection connection = ConnectionFactory.createConnection(conf);
    Admin admin = connection.getAdmin();
    LOG.info("Performing snapshot of table " + table + " as " + snapshot);
    admin.snapshot(snapshot, TableName.valueOf(table)); // co AnalyzeSnapshotData-2-Snap Create a snapshot of the table.
    LOG.info("Setting up job");
    Job job = Job.getInstance(conf, "Analyze data in snapshot " + table);
    job.setJarByClass(AnalyzeSnapshotData.class);
    TableMapReduceUtil.initTableSnapshotMapperJob(snapshot, scan,
      AnalyzeMapper.class, Text.class, IntWritable.class, job, true,
      restoreDir); // co AnalyzeSnapshotData-2-Util Set up the snapshot mapper phase using the supplied utility.
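    // Note: initTableSnapshotMapperJob restores the snapshot into restoreDir
    // and configures TableSnapshotInputFormat, so the mappers read the
    // snapshot's store files directly from HDFS instead of going through the
    // region servers; the trailing boolean adds the HBase dependency jars.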
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
      JSONParser.class);
    job.setReducerClass(AnalyzeReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(1);
    FileOutputFormat.setOutputPath(job, new Path(output));

    // Run the job first and keep the result, so the optional cleanup below
    // is still reachable before the process exits.
    boolean success = job.waitForCompletion(true);

    if (cleanup) {
      LOG.info("Cleaning up snapshot and restore directory");
      admin.deleteSnapshot(snapshot); // co AnalyzeSnapshotData-3-Cleanup Optionally clean up after the job is complete.
      restoreDir.getFileSystem(conf).delete(restoreDir, true);
    }
    admin.close();
    connection.close();
    System.exit(success ? 0 : 1);
    // ^^ AnalyzeSnapshotData
  }
}
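/*
  Example invocation (a sketch; the jar name, table, and column are
  placeholders, not taken from this project's build):

    hadoop jar hbase-book-examples.jar mapreduce.AnalyzeSnapshotData \
      -t testtable -c data:json -o analysis-output -x

  When -s and -b are omitted, the job derives the snapshot name and the
  restore directory under /tmp from the table name and the current time.
*/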