package bulkimport;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
public class BulkImportJobExample {
private static Log LOG = LogFactory.getLog(BulkImportJobExample.class);
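/** Configuration key that controls whether malformed JSON lines are skipped (default) or cause the import to fail. */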
final static String SKIP_LINES_CONF_KEY = "skip.bad.lines";
/**
* Verbose input sampler that provides some insight into the sampling process.
*
* @param <K> The key type.
* @param <V> The value type.
*/
public static class VerboseInputSampler<K, V> extends InputSampler<K, V> {
private static Log LOG = LogFactory.getLog(VerboseInputSampler.class);
public VerboseInputSampler(Configuration conf) {
super(conf);
}
/**
* Fixes a potential overlap of the generated regions/splits for a dataset with many identical keys. For instance,
* let the samples be {1,1,1,1,3,3,3,5,6} and the number of partitions be 3. The original implementation would produce
* the splits 1-1, 3-3, 3-6; note the overlap between the second and third partition.
*
* @param job
* @param sampler
* @param <K>
* @param <V>
* @throws IOException
* @throws ClassNotFoundException
* @throws InterruptedException
*/
@SuppressWarnings("unchecked")
public static <K, V> void writePartitionFile(Job job, InputSampler.Sampler<K, V> sampler)
throws IOException, ClassNotFoundException, InterruptedException {
LinkedList<K> splits = new LinkedList<K>();
Configuration conf = job.getConfiguration();
final InputFormat inf =
ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
int numPartitions = job.getNumReduceTasks();
K[] samples = (K[])sampler.getSample(inf, job);
LOG.info("Using " + samples.length + " samples");
RawComparator<K> comparator = (RawComparator<K>) job.getGroupingComparator();
Arrays.sort(samples, comparator);
Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
FileSystem fs = dst.getFileSystem(conf);
if (fs.exists(dst)) fs.delete(dst, false);
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
NullWritable nullValue = NullWritable.get();
float stepSize = samples.length / (float) numPartitions;
K lastKey = null;
K currentKey = null;
int lastKeyIndex = -1;
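// Select numPartitions - 1 boundary keys at step-sized offsets into the sorted
// samples, skipping duplicates of the previous boundary so that no two of the
// resulting partitions/regions overlap.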
for (int i = 1; i < numPartitions; ++i) {
int currentKeyOffset = Math.round(stepSize * i);
if (lastKeyIndex > currentKeyOffset) {
long keyOffset = lastKeyIndex - currentKeyOffset;
float errorRate = keyOffset / (float) samples.length;
LOG.warn(
String.format("Partitions overlap. Consider using a different sampler, " +
"increasing the number of samples, or taking samples from more splits. " +
"Next sample would have been %s; the key overlaps by a distance of %d (factor %f).", samples[currentKeyOffset], keyOffset, errorRate));
currentKeyOffset = lastKeyIndex + 1;
}
currentKey = samples[currentKeyOffset];
while (lastKey != null && comparator.compare(currentKey, lastKey) == 0) {
currentKeyOffset++;
if (currentKeyOffset >= samples.length) {
LOG.info("Last 10 elements:");
for (int d = samples.length - 1; d > samples.length - 11; d--) {
LOG.debug(samples[d]);
}
throw new IOException("Not enough samples, stopped at partition " + i);
}
currentKey = samples[currentKeyOffset];
}
writer.append(currentKey, nullValue);
lastKey = currentKey;
lastKeyIndex = currentKeyOffset;
splits.add(currentKey);
}
writer.close();
LOG.info("********************************************* ");
LOG.info(" START KEYs for new Regions: ");
for (K split : splits) {
LOG.info("* " + split.toString());
}
}
public static class VerboseRandomSampler<K, V> implements Sampler<K, V> {
int numSamples;
int maxSplitsSampled;
double freq;
public VerboseRandomSampler(double freq, int numSamples) {
// No explicit split limit given: allow sampling from all splits.
this(freq, numSamples, Integer.MAX_VALUE);
}
public VerboseRandomSampler(double freq, int numSamples, int maxSplitsSampled) {
this.freq = freq;
this.numSamples = numSamples;
this.maxSplitsSampled = maxSplitsSampled;
}
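/**
* Randomly samples keys from the (shuffled) input splits. Keys are collected until
* the requested number of samples is reached; after that, newly seen keys replace
* random existing samples, reservoir-style, and the sampling frequency is lowered.
*/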
public Object[] getSample(InputFormat inf, Job job) throws IOException, InterruptedException {
long counter = 0;
List<InputSplit> splits = inf.getSplits(job);
ArrayList<K> samples = new ArrayList<K>(numSamples);
int splitsToSample = Math.min(maxSplitsSampled, splits.size());
Random r = new Random();
long seed = r.nextLong();
r.setSeed(seed);
LOG.debug("Seed: " + seed);
// shuffle splits
for (int i = 0; i < splits.size(); ++i) {
InputSplit tmp = splits.get(i);
int j = r.nextInt(splits.size());
splits.set(i, splits.get(j));
splits.set(j, tmp);
}
LOG.info(String.format("tTaking %d samples with frequency: %f and maximum splits: %d", numSamples, freq, maxSplitsSampled));
// our target rate is in terms of the maximum number of sample splits,
// but we accept the possibility of sampling additional splits to hit
// the target sample keyset
for (int i = 0; i < splitsToSample || (i < splits.size() && samples.size() < numSamples); ++i) {
TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
job.getConfiguration(), new TaskAttemptID());
RecordReader<K, V> reader = inf.createRecordReader(splits.get(i), samplingContext);
reader.initialize(splits.get(i), samplingContext);
while (reader.nextKeyValue()) {
if (r.nextDouble() <= freq) {
if (samples.size() < numSamples) {
if (counter % 1000 == 0)
LOG.info(String.format("Fill: Collected %d samples from %d splits", counter, i));
counter++;
samples.add(ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null));
} else {
// When exceeding the maximum number of samples, replace a
// random element with this one, then adjust the frequency
// to reflect the possibility of existing elements being
// pushed out
int ind = r.nextInt(numSamples);
if (ind != numSamples) {
samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
reader.getCurrentKey(), null));
if (counter % 1000 == 0)
LOG.info(String.format("Replace Random: Collected %d samples from %d splits", counter, i));
counter++;
}
freq *= (numSamples - 1) / (double) numSamples;
}
}
}
reader.close();
}
return (K[]) samples.toArray();
}
}
}
/**
* Wrap a LineRecordReader to parse JSON data, line by line.
*/
static class DeliciousRecordReader extends RecordReader<ImmutableBytesWritable, Put> {
private LineRecordReader lineRecordReader = null;
private JSONParser parser;
private ImmutableBytesWritable currentKey = null;
private Put currentValue = null;
private boolean skipBadLines = true;
private int badLineCount = 0;
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
lineRecordReader = new LineRecordReader();
lineRecordReader.initialize(inputSplit, taskAttemptContext);
currentKey = new ImmutableBytesWritable();
parser = new JSONParser();
skipBadLines = taskAttemptContext.getConfiguration().getBoolean(
SKIP_LINES_CONF_KEY, true);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
boolean next = lineRecordReader.nextKeyValue();
if (next) {
String line = lineRecordReader.getCurrentValue().toString();
try {
JSONObject json = (JSONObject) parser.parse(line);
String author = (String) json.get("author");
String link = (String) json.get("link");
// Hand the parsed record to the framework. The row key and column layout
// below are illustrative assumptions; the original code leaves the actual
// mapping of the JSON fields open.
currentKey = new ImmutableBytesWritable(Bytes.toBytes(link));
currentValue = new Put(currentKey.get());
currentValue.add(Bytes.toBytes("data"), Bytes.toBytes("author"),
Bytes.toBytes(author));
} catch (ParseException e) {
if (skipBadLines) {
System.err.println("Bad line at offset: " +
lineRecordReader.getCurrentKey().get() +
":\n" + e.getMessage());
badLineCount++;
} else {
throw new IOException(e);
}
}
}
return next;
}
@Override
public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException {
return currentKey;
}
@Override
public Put getCurrentValue() throws IOException, InterruptedException {
return currentValue;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return lineRecordReader.getProgress();
}
@Override
public void close() throws IOException {
lineRecordReader.close();
if (badLineCount > 0) {
System.err.println("Number of bad lines encountered: " + badLineCount);
}
}
}
/**
* Dedicated input format to parse Delicious RSS feed data. Can be used
* for the actual job, but also for the input sampler.
*/
static class DeliciousInputFormat
extends FileInputFormat<ImmutableBytesWritable, Put> {
@Override
public RecordReader<ImmutableBytesWritable, Put> createRecordReader(
InputSplit split, TaskAttemptContext context) {
return new DeliciousRecordReader();
}
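// Splittability is left to FileInputFormat, which treats every input file as splittable.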
@Override
protected boolean isSplitable(JobContext context, Path file) {
return super.isSplitable(context, file);
}
}
static class BulkImportMapper
extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
private JSONParser parser;
private boolean skipBadLines;
private Counter badLineCount;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
parser = new JSONParser();
skipBadLines = context.getConfiguration().getBoolean(
SKIP_LINES_CONF_KEY, true);
badLineCount = context.getCounter("BulkImportJobExample", "Bad Lines");
}
@Override
protected void map(LongWritable offset, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
try {
JSONObject json = (JSONObject) parser.parse(line);
String author = (String) json.get("author");
String link = (String) json.get("link");
if (author == null || link == null) {
badLineCount.increment(1);
return;
}
// Emit one Put per JSON line. The row key and column layout are illustrative
// assumptions; the original code leaves the mapping of the parsed fields open.
byte[] rowkey = Bytes.toBytes(link);
Put put = new Put(rowkey);
put.add(Bytes.toBytes("data"), Bytes.toBytes("author"), Bytes.toBytes(author));
context.write(new ImmutableBytesWritable(rowkey), put);
} catch (ParseException e) {
if (skipBadLines) {
System.err.println("Bad line at offset: " + offset.get() +
":\n" + e.getMessage());
badLineCount.increment(1);
return;
} else {
throw new IOException(e);
}
}
}
}
/*
Example of a single record from the Delicious RSS feed (JSON, shown pretty-printed):
{
"updated": "Tue, 08 Sep 2009 23:28:55 +0000",
"links": [
{
"href": "http://www.theatermania.com/broadway/",
"type": "text/html",
"rel": "alternate"
}
],
"title": "TheaterMania",
"author": "mcasas1",
"comments": "http://delicious.com/url/b5b3cbf9a9176fe43c27d7b4af94a422",
"guidislink": false,
"title_detail": {
"base": "http://feeds.delicious.com/v2/rss/recent?min=1&count=100",
"type": "text/plain",
"language": null,
"value": "TheaterMania"
},
"link": "http://www.theatermania.com/broadway/",
"source": {
},
"wfw_commentrss": "http://feeds.delicious.com/v2/rss/url/b5b3cbf9a9176fe43c27d7b4af94a422",
"id": "http://delicious.com/url/b5b3cbf9a9176fe43c27d7b4af94a422#mcasas1",
"tags": [
{
"term": "NYC",
"scheme": "http://delicious.com/mcasas1/",
"label": null
}
]
}
*/
public static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Path inputDir = new Path(args[0]);
Path outputDir = new Path(args[1]);
boolean createPartitionFile = Boolean.parseBoolean(args[2]);
Job job = Job.getInstance(conf,
"Import delicious RSS feed into Hush tables.");
job.setJarByClass(BulkImportJobExample.class);
job.setInputFormatClass(TextInputFormat.class);
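// The job reads the raw JSON lines with TextInputFormat; the DeliciousInputFormat
// defined above could alternatively be wired in here to hand pre-built Puts to
// the job and to the input sampler.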
// conf.setLong("hbase.hregion.max.filesize", 64 * 1024);
FileInputFormat.setInputPaths(job, inputDir);
job.setMapperClass(BulkImportMapper.class);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(Put.class);
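// Route the map output through TotalOrderPartitioner and PutSortReducer so that
// HFileOutputFormat receives its KeyValues in fully sorted order, ready to be
// bulk loaded into HBase (e.g. with completebulkload) afterwards.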
job.setPartitionerClass(TotalOrderPartitioner.class);
job.setReducerClass(PutSortReducer.class);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(KeyValue.class);
job.setOutputFormatClass(HFileOutputFormat.class);
HFileOutputFormat.setOutputPath(job, outputDir);
HFileOutputFormat.setCompressOutput(job, true);
HFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
job.getConfiguration().set("hfile.compression", "gz");
//job.getConfiguration().setFloat("mapred.job.shuffle.input.buffer.percent", 0.5f);
//job.setNumReduceTasks(30);
Path partitionsPath = new Path(job.getWorkingDirectory(),
"partitions_" + System.currentTimeMillis());
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
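// TotalOrderPartitioner reads its region boundaries from this file; it is written
// below by sampling the input when the third command line argument is true.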
if (createPartitionFile) {
VerboseInputSampler.Sampler<KeyValue, ImmutableBytesWritable> sampler =
new VerboseInputSampler.VerboseRandomSampler<KeyValue, ImmutableBytesWritable>(0.05, 1000000, 30); // use 0.1 for real sampling
LOG.info("Sampling key space");
VerboseInputSampler.writePartitionFile(job, sampler);
LOG.info("Samping done");
}
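// Ship the partition file to every task via the distributed cache and symlink it
// under the partitioner's default file name in the task working directory.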
URI cacheUri = new URI(partitionsPath.toString() + "#" +
TotalOrderPartitioner.DEFAULT_PATH);
DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
DistributedCache.createSymlink(job.getConfiguration());
return job;
}
private static void usage(final String errorMsg) {
if (errorMsg != null && errorMsg.length() > 0) {
System.err.println("ERROR: " + errorMsg);
}
System.err.println("Usage: ");
System.err.println("foo.sh <input> <output> [flag]");
System.err.println(" input: hdfs input directory");
System.err.println(" output: hdfs output directory");
System.err.println(" flag: true - create partitions file, false - do nothing.");
}
public static void main(String[] args) throws Exception {
Configuration conf = HBaseConfiguration.create();
if (args.length < 3) {
usage("Wrong number of arguments: " + args.length);
System.exit(-1);
}
Job job = createSubmittableJob(conf, args);
if (job != null) System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}