package bulkimport;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.PutSortReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

/**
 * Example bulk import job: parses Delicious RSS feed data (one JSON document
 * per line), writes HFiles via {@link HFileOutputFormat}, and uses a
 * {@link TotalOrderPartitioner} with a sampled partition file to keep the
 * reducer output totally ordered.
 */
public class BulkImportJobExample {
  private static final Log LOG = LogFactory.getLog(BulkImportJobExample.class);

  final static String SKIP_LINES_CONF_KEY = "skip.bad.lines";

  /**
   * Verbose input sampler that provides some insight into the sampling process.
   *
   * @param <K> The key type.
   * @param <V> The value type.
   */
  public static class VerboseInputSampler<K, V> extends InputSampler<K, V> {
    private static final Log LOG = LogFactory.getLog(VerboseInputSampler.class);

    public VerboseInputSampler(Configuration conf) {
      super(conf);
    }

    /**
     * Fixes a potential overlap of the generated regions/splits for a dataset
     * with many identical keys. For instance, given the samples
     * {1,1,1,1,3,3,3,5,6} and three partitions, the original implementation
     * produces the splits 1-1, 3-3, 3-6; note the overlap between the second
     * and third partition.
     *
     * @param job the job to write the partition file for
     * @param sampler the sampler providing the keys
     * @param <K> The key type.
     * @param <V> The value type.
     * @throws IOException when sampling the input or writing the partition file fails
     * @throws ClassNotFoundException when the input format cannot be instantiated
     * @throws InterruptedException when the sampling is interrupted
     */
    @SuppressWarnings("unchecked")
    public static <K, V> void writePartitionFile(Job job,
        InputSampler.Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
      LinkedList<K> splits = new LinkedList<K>();
      Configuration conf = job.getConfiguration();
      final InputFormat inf =
        ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
      int numPartitions = job.getNumReduceTasks();
      K[] samples = (K[]) sampler.getSample(inf, job);
      LOG.info("Using " + samples.length + " samples");

      RawComparator<K> comparator =
        (RawComparator<K>) job.getGroupingComparator();
      Arrays.sort(samples, comparator);
      Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
      FileSystem fs = dst.getFileSystem(conf);
      if (fs.exists(dst)) fs.delete(dst, false);
      SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst,
        job.getMapOutputKeyClass(), NullWritable.class);
      NullWritable nullValue = NullWritable.get();
      float stepSize = samples.length / (float) numPartitions;

      K lastKey = null;
      K currentKey = null;
      int lastKeyIndex = -1;
      for (int i = 1; i < numPartitions; ++i) {
        int currentKeyOffset = Math.round(stepSize * i);
        if (lastKeyIndex > currentKeyOffset) {
          long keyOffset = lastKeyIndex - currentKeyOffset;
          float errorRate = (float) keyOffset / samples.length;
          LOG.warn(String.format(
            "Partitions overlap. Consider using a different Sampler " +
            "and/or increase the number of samples and/or use more splits to " +
            "take samples from. Next sample would have been %s, the key " +
            "overlaps by a distance of %d (factor %f)",
            samples[currentKeyOffset], keyOffset, errorRate));
          currentKeyOffset = lastKeyIndex + 1;
          if (currentKeyOffset >= samples.length) {
            throw new IOException("Not enough samples, stopped at partition " + i);
          }
        }
        currentKey = samples[currentKeyOffset];
        while (lastKey != null && comparator.compare(currentKey, lastKey) == 0) {
          currentKeyOffset++;
          if (currentKeyOffset >= samples.length) {
            LOG.info("Last 10 elements:");
            for (int d = samples.length - 1; d >= 0 && d > samples.length - 11; d--) {
              LOG.debug(samples[d]);
            }
            throw new IOException("Not enough samples, stopped at partition " + i);
          }
          currentKey = samples[currentKeyOffset];
        }
        writer.append(currentKey, nullValue);
        lastKey = currentKey;
        lastKeyIndex = currentKeyOffset;
        splits.add(currentKey);
      }
      writer.close();

      LOG.info("********************************************* ");
      LOG.info(" START KEYs for new Regions: ");
      for (K split : splits) {
        LOG.info("* " + split.toString());
      }
    }

    public static class VerboseRandomSampler<K, V> implements Sampler<K, V> {
      int numSamples;
      int maxSplitsSampled;
      double freq;

      public VerboseRandomSampler(double freq, int numSamples) {
        this.freq = freq;
        this.numSamples = numSamples;
      }

      public VerboseRandomSampler(double freq, int numSamples,
          int maxSplitsSampled) {
        this.freq = freq;
        this.numSamples = numSamples;
        this.maxSplitsSampled = maxSplitsSampled;
      }

      public Object[] getSample(InputFormat inf, Job job)
          throws IOException, InterruptedException {
        long counter = 0;
        List<InputSplit> splits = inf.getSplits(job);
        ArrayList<K> samples = new ArrayList<K>(numSamples);
        int splitsToSample = Math.min(maxSplitsSampled, splits.size());

        Random r = new Random();
        long seed = r.nextLong();
        r.setSeed(seed);
        LOG.debug("Seed: " + seed);

        // shuffle splits
        for (int i = 0; i < splits.size(); ++i) {
          InputSplit tmp = splits.get(i);
          int j = r.nextInt(splits.size());
          splits.set(i, splits.get(j));
          splits.set(j, tmp);
        }
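        // Note (added comment, not in the original example): the shuffle above
        // makes the first splitsToSample entries a random subset of all input
        // splits. The loop below follows the approach of Hadoop's
        // InputSampler.RandomSampler: every key is accepted with probability
        // freq; once numSamples keys have been collected, a newly accepted key
        // replaces a randomly chosen existing sample and freq is decayed so
        // that early and late records remain equally likely to survive.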
        LOG.info(String.format(
          "Taking %d samples with frequency: %f and maximum splits: %d",
          numSamples, freq, maxSplitsSampled));

        // our target rate is in terms of the maximum number of sample splits,
        // but we accept the possibility of sampling additional splits to hit
        // the target sample keyset
        for (int i = 0;
             i < splitsToSample || (i < splits.size() && samples.size() < numSamples);
             ++i) {
          TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
            job.getConfiguration(), new TaskAttemptID());
          RecordReader<K, V> reader =
            inf.createRecordReader(splits.get(i), samplingContext);
          reader.initialize(splits.get(i), samplingContext);
          while (reader.nextKeyValue()) {
            if (r.nextDouble() <= freq) {
              if (samples.size() < numSamples) {
                if (counter % 1000 == 0) {
                  LOG.info(String.format(
                    "Fill: Collected %d samples from %d splits", counter, i));
                }
                counter++;
                samples.add(ReflectionUtils.copy(job.getConfiguration(),
                  reader.getCurrentKey(), null));
              } else {
                // When exceeding the maximum number of samples, replace a
                // random element with this one, then adjust the frequency
                // to reflect the possibility of existing elements being
                // pushed out
                int ind = r.nextInt(numSamples);
                if (ind != numSamples) {
                  samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                    reader.getCurrentKey(), null));
                  if (counter % 1000 == 0) {
                    LOG.info(String.format(
                      "Replace Random: Collected %d samples from %d splits",
                      counter, i));
                  }
                  counter++;
                }
                freq *= (numSamples - 1) / (double) numSamples;
              }
            }
          }
          reader.close();
        }
        return samples.toArray();
      }
    }
  }

  /**
   * Wrap a LineRecordReader to parse JSON data, line by line.
   */
  static class DeliciousRecordReader
      extends RecordReader<ImmutableBytesWritable, Put> {
    private LineRecordReader lineRecordReader = null;
    private JSONParser parser;
    private ImmutableBytesWritable currentKey = null;
    private Put currentValue = null;
    private boolean skipBadLines = true;
    private int badLineCount = 0;

    @Override
    public void initialize(InputSplit inputSplit,
        TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
      lineRecordReader = new LineRecordReader();
      lineRecordReader.initialize(inputSplit, taskAttemptContext);
      currentKey = new ImmutableBytesWritable();
      parser = new JSONParser();
      skipBadLines = taskAttemptContext.getConfiguration().getBoolean(
        SKIP_LINES_CONF_KEY, true);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
      boolean next = lineRecordReader.nextKeyValue();
      if (next) {
        String line = lineRecordReader.getCurrentValue().toString();
        try {
          JSONObject json = (JSONObject) parser.parse(line);
          String author = (String) json.get("author");
          String link = (String) json.get("link");
          // NOTE: the original example stops after extracting the fields. The
          // key/value construction below is an illustrative completion; keying
          // rows by the bookmarked URL and using a "data" column family are
          // assumptions, not part of the original schema.
          if (author == null || link == null) {
            badLineCount++;
            return nextKeyValue(); // treat records missing either field as bad lines
          }
          byte[] rowKey = Bytes.toBytes(link);
          currentKey.set(rowKey);
          currentValue = new Put(rowKey);
          currentValue.add(Bytes.toBytes("data"), Bytes.toBytes("author"),
            Bytes.toBytes(author));
          currentValue.add(Bytes.toBytes("data"), Bytes.toBytes("link"),
            Bytes.toBytes(link));
        } catch (ParseException e) {
          if (skipBadLines) {
            System.err.println("Bad line at offset: " +
              lineRecordReader.getCurrentKey().get() + ":\n" + e.getMessage());
            badLineCount++;
            return nextKeyValue(); // skip the unparsable line
          } else {
            throw new IOException(e);
          }
        }
      }
      return next;
    }

    @Override
    public ImmutableBytesWritable getCurrentKey()
        throws IOException, InterruptedException {
      return currentKey;
    }

    @Override
    public Put getCurrentValue() throws IOException, InterruptedException {
      return currentValue;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return lineRecordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
      lineRecordReader.close();
      if (badLineCount > 0) {
        System.err.println("Number of bad lines encountered: " + badLineCount);
      }
    }
  }
  /**
   * Dedicated input format to parse Delicious RSS feed data. Can be used for
   * the actual job, but also for the input sampler.
   */
  static class DeliciousInputFormat
      extends FileInputFormat<ImmutableBytesWritable, Put> {
    @Override
    public RecordReader<ImmutableBytesWritable, Put> createRecordReader(
        InputSplit split, TaskAttemptContext context) {
      return new DeliciousRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
      return super.isSplitable(context, file);
    }
  }

  static class BulkImportMapper
      extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    private JSONParser parser;
    private boolean skipBadLines;
    private Counter badLineCount;

    @Override
    protected void setup(Context context)
        throws IOException, InterruptedException {
      parser = new JSONParser();
      skipBadLines = context.getConfiguration().getBoolean(
        SKIP_LINES_CONF_KEY, true);
      badLineCount = context.getCounter("BulkImportJobExample", "Bad Lines");
    }

    @Override
    protected void map(LongWritable offset, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      try {
        JSONObject json = (JSONObject) parser.parse(line);
        String author = (String) json.get("author");
        String link = (String) json.get("link");
        // NOTE: the original example only parses the line and emits nothing.
        // The emission below is an illustrative completion; the row key layout
        // and the "data" column family are assumptions, not part of the
        // original schema.
        if (author == null || link == null) {
          badLineCount.increment(1);
          return;
        }
        byte[] rowKey = Bytes.toBytes(link);
        Put put = new Put(rowKey);
        put.add(Bytes.toBytes("data"), Bytes.toBytes("author"),
          Bytes.toBytes(author));
        put.add(Bytes.toBytes("data"), Bytes.toBytes("link"),
          Bytes.toBytes(link));
        context.write(new ImmutableBytesWritable(rowKey), put);
      } catch (ParseException e) {
        if (skipBadLines) {
          System.err.println("Bad line at offset: " + offset.get() + ":\n" +
            e.getMessage());
          badLineCount.increment(1);
          return;
        } else {
          throw new IOException(e);
        }
      }
    }
  }
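  /*
   * Note (added, not in the original example): the mapper and record reader
   * above assume a column family named "data". Before the generated HFiles
   * can be loaded, the target table has to exist with a matching family,
   * created for instance from the HBase shell (the table name is a
   * placeholder):
   *
   *   create '<tablename>', 'data'
   */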
  /*
   * Example input record from the Delicious RSS feed (one JSON document per line):
   *
   * {
   *   "updated": "Tue, 08 Sep 2009 23:28:55 +0000",
   *   "links": [ {
   *     "href": "http://www.theatermania.com/broadway/",
   *     "type": "text/html",
   *     "rel": "alternate"
   *   } ],
   *   "title": "TheaterMania",
   *   "author": "mcasas1",
   *   "comments": "http://delicious.com/url/b5b3cbf9a9176fe43c27d7b4af94a422",
   *   "guidislink": false,
   *   "title_detail": {
   *     "base": "http://feeds.delicious.com/v2/rss/recent?min=1&count=100",
   *     "type": "text/plain",
   *     "language": null,
   *     "value": "TheaterMania"
   *   },
   *   "link": "http://www.theatermania.com/broadway/",
   *   "source": { },
   *   "wfw_commentrss": "http://feeds.delicious.com/v2/rss/url/b5b3cbf9a9176fe43c27d7b4af94a422",
   *   "id": "http://delicious.com/url/b5b3cbf9a9176fe43c27d7b4af94a422#mcasas1",
   *   "tags": [ {
   *     "term": "NYC",
   *     "scheme": "http://delicious.com/mcasas1/",
   *     "label": null
   *   } ]
   * }
   */

  public static Job createSubmittableJob(Configuration conf, String[] args)
      throws IOException, ClassNotFoundException, InterruptedException,
      URISyntaxException {
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean createPartitionFile = Boolean.parseBoolean(args[2]);

    Job job = Job.getInstance(conf,
      "Import Delicious RSS feed into Hush tables.");
    job.setJarByClass(BulkImportJobExample.class);

    job.setInputFormatClass(TextInputFormat.class);
    // conf.setLong("hbase.hregion.max.filesize", 64 * 1024);
    FileInputFormat.setInputPaths(job, inputDir);

    job.setMapperClass(BulkImportMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(Put.class);

    job.setPartitionerClass(TotalOrderPartitioner.class);

    job.setReducerClass(PutSortReducer.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);

    job.setOutputFormatClass(HFileOutputFormat.class);
    HFileOutputFormat.setOutputPath(job, outputDir);
    HFileOutputFormat.setCompressOutput(job, true);
    HFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    job.getConfiguration().set("hfile.compression", "gz");
    //job.getConfiguration().setFloat("mapred.job.shuffle.input.buffer.percent", 0.5f);
    //job.setNumReduceTasks(30);

    Path partitionsPath = new Path(job.getWorkingDirectory(),
      "partitions_" + System.currentTimeMillis());
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
      partitionsPath);

    if (createPartitionFile) {
      // Caveat: writePartitionFile() samples keys through the job's input
      // format, which is TextInputFormat here and therefore yields file
      // offsets, while the partition file is declared with the map output key
      // class (ImmutableBytesWritable). For the sampling pass to produce row
      // keys, an input format such as DeliciousInputFormat (above) would have
      // to be used instead.
      VerboseInputSampler.Sampler<KeyValue, ImmutableBytesWritable> sampler =
        new VerboseInputSampler.VerboseRandomSampler<KeyValue, ImmutableBytesWritable>(
          0.05, 1000000, 30); // use 0.1 for real sampling
      LOG.info("Sampling key space");
      VerboseInputSampler.writePartitionFile(job, sampler);
      LOG.info("Sampling done");
    }

    URI cacheUri = new URI(partitionsPath.toString() + "#" +
      TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    return job;
  }

  private static void usage(final String errorMsg) {
    if (errorMsg != null && errorMsg.length() > 0) {
      System.err.println("ERROR: " + errorMsg);
    }
    System.err.println("Usage: ");
    System.err.println("  BulkImportJobExample <input> <output> <flag>");
    System.err.println("  input: hdfs input directory");
    System.err.println("  output: hdfs output directory");
    System.err.println("  flag: true - create partitions file, false - do nothing.");
  }
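  /*
   * Sketch (added, not part of the original example): HFileOutputFormat only
   * writes the store files; after the job completes they still have to be
   * handed to the region servers. Assuming the target table already exists
   * with the matching column families, this can be done with the
   * completebulkload tool shipped with HBase, or programmatically along the
   * lines of:
   *
   *   HTable table = new HTable(conf, "<tablename>");
   *   new LoadIncrementalHFiles(conf).doBulkLoad(outputDir, table);
   *
   * (org.apache.hadoop.hbase.client.HTable and
   * org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles)
   */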
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    if (args.length < 3) {
      usage("Wrong number of arguments: " + args.length);
      System.exit(-1);
    }
    Job job = createSubmittableJob(conf, args);
    if (job != null) {
      System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
  }
}
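// Added note (not in the original source): with the class packaged into the
// application jar (the jar name below is a placeholder), the job can be
// launched for example as
//
//   hadoop jar <hush-jar> bulkimport.BulkImportJobExample <input> <output> true
//
// where the third argument controls whether the partition file is sampled and
// written before the job is submitted, as described in usage() above.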