package mapreduce;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import java.io.IOException;
// cc AnalyzeSnapshotData MapReduce job that reads the data from a snapshot and analyzes it.
public class AnalyzeSnapshotData {
private static final Log LOG = LogFactory.getLog(AnalyzeSnapshotData.class);
public static final String NAME = "AnalyzeSnapshotData";
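// Custom job counters: ROWS tracks rows mapped, COLS tracks cells seen,
// VALID tracks cells whose JSON parsed successfully, and ERROR tracks rows
// that failed to parse.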
public enum Counters { ROWS, COLS, ERROR, VALID }
/**
* Implements the <code>Mapper</code> that reads the data and extracts the
* required information.
*/
static class AnalyzeMapper extends TableMapper<Text, IntWritable> {
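// TableMapper fixes the input types to ImmutableBytesWritable/Result; the
// two type parameters given here are the mapper's output key and value
// types (the author name and a count of one).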
private JSONParser parser = new JSONParser();
private final IntWritable ONE = new IntWritable(1);
/**
* Maps the input.
*
* @param row The row key.
* @param columns The columns of the row.
* @param context The task context.
* @throws IOException When mapping the input fails.
*/
@Override
public void map(ImmutableBytesWritable row, Result columns, Context context)
throws IOException {
context.getCounter(Counters.ROWS).increment(1);
String value = null;
try {
for (Cell cell : columns.listCells()) {
context.getCounter(Counters.COLS).increment(1);
value = Bytes.toStringBinary(cell.getValueArray(),
cell.getValueOffset(), cell.getValueLength());
JSONObject json = (JSONObject) parser.parse(value);
String author = (String) json.get("author");
if (context.getConfiguration().get("conf.debug") != null)
System.out.println("Author: " + author);
context.write(new Text(author), ONE);
context.getCounter(Counters.VALID).increment(1);
}
} catch (Exception e) {
e.printStackTrace();
System.err.println("Row: " + Bytes.toStringBinary(row.get()) +
", JSON: " + value);
context.getCounter(Counters.ERROR).increment(1);
}
}
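// Sample of a JSON document as stored in the scanned column value; the
// mapper above only extracts the "author" field from it.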
/*
{
"updated": "Mon, 14 Sep 2009 17:09:02 +0000",
"links": [{
"href": "http://www.webdesigndev.com/",
"type": "text/html",
"rel": "alternate"
}],
"title": "Web Design Tutorials | Creating a Website | Learn Adobe
Flash, Photoshop and Dreamweaver",
"author": "outernationalist",
"comments": "http://delicious.com/url/e104984ea5f37cf8ae70451a619c9ac0",
"guidislink": false,
"title_detail": {
"base": "http://feeds.delicious.com/v2/rss/recent?min=1&count=100",
"type": "text/plain",
"language": null,
"value": "Web Design Tutorials | Creating a Website | Learn Adobe
Flash, Photoshop and Dreamweaver"
},
"link": "http://www.webdesigndev.com/",
"source": {},
"wfw_commentrss": "http://feeds.delicious.com/v2/rss/url/
e104984ea5f37cf8ae70451a619c9ac0",
"id": "http://delicious.com/url/
e104984ea5f37cf8ae70451a619c9ac0#outernationalist"
}
*/
}
/**
* Implements the <code>Reducer</code> part of the process.
*/
static class AnalyzeReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
/**
* Aggregates the counts.
*
* @param key The author.
* @param values The counts for the author.
* @param context The current task context.
* @throws IOException When reading or writing the data fails.
* @throws InterruptedException When the task is aborted.
*/
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable one : values) count++;
if (context.getConfiguration().get("conf.debug") != null)
System.out.println("Author: " + key.toString() + ", Count: " + count);
context.write(key, new IntWritable(count));
}
}
/**
* Parse the command line parameters.
*
* @param args The parameters to parse.
* @return The parsed command line.
* @throws ParseException When the parsing of the parameters fails.
*/
private static CommandLine parseArgs(String[] args) throws ParseException {
Options options = new Options();
Option o = new Option("t", "table", true,
"table to snapshot (must exist)");
o.setArgName("table-name");
o.setRequired(true);
options.addOption(o);
o = new Option("s", "snapshot", true, "name of the snapshot");
o.setArgName("snapshot-name");
options.addOption(o);
o = new Option("b", "restoredir", true, "name of restore directory");
o.setArgName("restoredir-name");
options.addOption(o);
o = new Option("c", "column", true,
"column to read data from (must exist)");
o.setArgName("family:qualifier");
options.addOption(o);
o = new Option("o", "output", true, "the directory to write to");
o.setArgName("path-in-HDFS");
o.setRequired(true);
options.addOption(o);
options.addOption("x", "cleanup", false,
"remove snapshot after job completion");
options.addOption("d", "debug", false, "switch on DEBUG log level");
CommandLineParser parser = new PosixParser();
CommandLine cmd = null;
try {
cmd = parser.parse(options, args);
} catch (Exception e) {
System.err.println("ERROR: " + e.getMessage() + "\n");
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(NAME + " ", options, true);
System.exit(-1);
}
if (cmd.hasOption("d")) {
Logger log = Logger.getLogger("mapreduce");
log.setLevel(Level.DEBUG);
System.out.println("DEBUG ON");
}
return cmd;
}
/**
* Main entry point.
*
* @param args The command line parameters.
* @throws Exception When running the job fails.
*/
public static void main(String[] args) throws Exception {
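// Example invocation (jar name, table, and column are placeholders, and the
// HBase classpath is assumed to be set up, e.g. via HADOOP_CLASSPATH):
//   hadoop jar <examples-jar> mapreduce.AnalyzeSnapshotData \
//     -t testtable -c data:json -o analyze-output -x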
// vv AnalyzeSnapshotData
Configuration conf = HBaseConfiguration.create();
String[] otherArgs =
new GenericOptionsParser(conf, args).getRemainingArgs();
CommandLine cmd = parseArgs(otherArgs);
if (cmd.hasOption("d")) conf.set("conf.debug", "true");
String table = cmd.getOptionValue("t");
long time = System.currentTimeMillis();
String tmpName = "snapshot-" + table + "-" + time; // co AnalyzeSnapshotData-1-TmpName Compute a name for the snapshot and restore directory, if not specified otherwise.
String snapshot = cmd.getOptionValue("s", tmpName);
Path restoreDir = new Path(cmd.getOptionValue("b", "/tmp/" + tmpName));
String column = cmd.getOptionValue("c");
String output = cmd.getOptionValue("o");
boolean cleanup = cmd.hasOption("x"); // "x" is a flag without an argument, so test for its presence.
/*...*/
// ^^ AnalyzeSnapshotData
Scan scan = new Scan();
if (column != null) {
byte[][] colkey = KeyValue.parseColumn(Bytes.toBytes(column));
if (colkey.length > 1) {
scan.addColumn(colkey[0], colkey[1]);
} else {
scan.addFamily(colkey[0]);
}
}
// vv AnalyzeSnapshotData
Connection connection = ConnectionFactory.createConnection(conf);
Admin admin = connection.getAdmin();
LOG.info("Performing snapshot of table " + table + " as " + snapshot);
admin.snapshot(snapshot, TableName.valueOf(table)); // co AnalyzeSnapshotData-2-Snap Create a snapshot of the table.
LOG.info("Setting up job");
Job job = Job.getInstance(conf, "Analyze data in snapshot " + table);
job.setJarByClass(AnalyzeSnapshotData.class);
TableMapReduceUtil.initTableSnapshotMapperJob(snapshot, scan,
AnalyzeMapper.class, Text.class, IntWritable.class, job, true,
restoreDir); // co AnalyzeSnapshotData-2-Util Set up the snapshot mapper phase using the supplied utility.
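// The utility call above restores the snapshot into restoreDir and sets up
// TableSnapshotInputFormat, so the mappers read the snapshot's store files
// directly from the filesystem instead of going through the region servers.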
TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
JSONParser.class);
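// Ship the jar that contains the json-simple parser with the job so it is
// available on the task classpath.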
job.setReducerClass(AnalyzeReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setNumReduceTasks(1);
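// A single reducer writes all per-author counts into one output file.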
FileOutputFormat.setOutputPath(job, new Path(output));
boolean success = job.waitForCompletion(true);
if (cleanup) {
LOG.info("Cleaning up snapshot and restore directory");
admin.deleteSnapshot(snapshot); // co AnalyzeSnapshotData-3-Cleanup Optionally clean up after the job is complete.
restoreDir.getFileSystem(conf).delete(restoreDir, true);
}
admin.close();
connection.close();
System.exit(success ? 0 : 1);
// ^^ AnalyzeSnapshotData
}
}