package mapreduce;

import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

// cc ImportJsonFromFile Example job that reads from a file and writes into a table.
// vv ImportJsonFromFile
public class ImportJsonFromFile {
  private static final Log LOG = LogFactory.getLog(ImportJsonFromFile.class);

  public static final String NAME = "ImportJsonFromFile";

  /**
   * Implements the <code>Mapper</code> that takes the lines from the input
   * and outputs <code>Put</code> instances.
   */
  static class ImportMapper
  extends Mapper<LongWritable, Text, ImmutableBytesWritable, Mutation> {

    private JSONParser parser = new JSONParser();

    /* Sample input record (one JSON object per line):
    {
      "updated": "Mon, 14 Sep 2009 17:09:02 +0000",
      "links": [{
        "href": "http://www.webdesigndev.com/",
        "type": "text/html",
        "rel": "alternate"
      }],
      "title": "Web Design Tutorials | Creating a Website | Learn Adobe Flash, Photoshop and Dreamweaver",
      "author": "outernationalist",
      "comments": "http://delicious.com/url/e104984ea5f37cf8ae70451a619c9ac0",
      "guidislink": false,
      "title_detail": {
        "base": "http://feeds.delicious.com/v2/rss/recent?min=1&count=100",
        "type": "text/plain",
        "language": null,
        "value": "Web Design Tutorials | Creating a Website | Learn Adobe Flash, Photoshop and Dreamweaver"
      },
      "link": "http://www.webdesigndev.com/",
      "source": {},
      "wfw_commentrss": "http://feeds.delicious.com/v2/rss/url/e104984ea5f37cf8ae70451a619c9ac0",
      "id": "http://delicious.com/url/e104984ea5f37cf8ae70451a619c9ac0#outernationalist"
    }
    */

    /**
     * Maps the input.
     *
     * @param offset The current offset into the input file.
     * @param line The current line of the file.
     * @param context The task context.
     * @throws java.io.IOException When mapping the input fails.
     */
    @Override
    public void map(LongWritable offset, Text line, Context context)
    throws IOException {
      try {
        // parse the line as JSON and use the MD5 of the "link" field
        // as the row key
        JSONObject json = (JSONObject) parser.parse(line.toString());
        String link = (String) json.get("link");
        byte[] md5Url = DigestUtils.md5(link);
        Put put = new Put(md5Url);
        // store the link in the "data" column family
        put.addColumn(Bytes.toBytes("data"), Bytes.toBytes("link"),
          Bytes.toBytes(link));
        context.write(new ImmutableBytesWritable(md5Url), put);
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Parse the command line parameters.
   *
   * @param args The parameters to parse.
   * @return The parsed command line.
   * @throws org.apache.commons.cli.ParseException When the parsing of the
   *   parameters fails.
   */
  private static CommandLine parseArgs(String[] args) throws ParseException {
    // create options
    Options options = new Options();
    Option o = new Option("t", "table", true,
      "table to import into (must exist)");
    o.setRequired(true);
    options.addOption(o);
    o = new Option("i", "input", true,
      "the directory in DFS to read files from");
    o.setRequired(true);
    options.addOption(o);
    options.addOption("d", "debug", false, "switch on DEBUG log level");
    // check if we are missing parameters
    if (args.length == 0) {
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp(NAME + " ", options, true);
      System.exit(-1);
    }
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = parser.parse(options, args);
    // check debug flag first
    if (cmd.hasOption("d")) {
      Logger log = Logger.getLogger("mapreduce");
      log.setLevel(Level.DEBUG);
    }
    return cmd;
  }

  /**
   * Main entry point.
   *
   * @param args The command line parameters.
   * @throws Exception When running the job fails.
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] otherArgs =
      new GenericOptionsParser(conf, args).getRemainingArgs();
    CommandLine cmd = parseArgs(otherArgs);
    // check debug flag and other options
    if (cmd.hasOption("d")) conf.set("conf.debug", "true");
    // get details
    String table = cmd.getOptionValue("t");
    String input = cmd.getOptionValue("i");
    // create job and set classes etc.
    Job job = Job.getInstance(conf,
      "Import from file " + input + " into table " + table);
    job.setJarByClass(ImportJsonFromFile.class);
    job.setMapperClass(ImportMapper.class);
    job.setOutputFormatClass(TableOutputFormat.class);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    // map-only job: the mapper writes Puts directly to the table
    job.setNumReduceTasks(0);
    FileInputFormat.addInputPath(job, new Path(input));
    // run the job
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
// ^^ ImportJsonFromFile
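
/*
 * Example invocation (a sketch, not part of the original example): the jar
 * name, table name, and input path below are placeholders. The target table
 * must exist with a "data" column family before the job runs, for instance
 * created in the HBase shell:
 *
 *   hbase> create 'importtable', 'data'
 *
 * Then run the map-only import job:
 *
 *   $ hadoop jar hbase-book-examples.jar mapreduce.ImportJsonFromFile \
 *       -t importtable -i /user/hbase/input/delicious-rss
 */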