/**
 * Copyright 2011 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.index.mapreduce;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.index.util.IndexUtils;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class TableIndexer {

  private static final String TABLE_NAME_TO_INDEX = "tablename.to.index";
  final static String BULK_OUTPUT_CONF_KEY = "import.bulk.output";
  private final static int DEFAULT_CACHING = 500;
  private final static int DEFAULT_VERSIONS = 1;

  // The index details can be passed as a comma separated list, e.g.
  // IDX1=>cf1:[q1->datatype&length],[q2],[q3];cf2:[q1->datatype&length],
  //   [q2->datatype&length],[q3->datatype&length]#IDX2=>cf1:q5,q5
  private static Map<String, List<String>> cfs = new HashMap<String, List<String>>();

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
      System.out.println("Caching and versions not specified");
      System.exit(-1);
    }
    int caching = -1;
    int versions = -1;
    try {
      caching = Integer.parseInt(otherArgs[0]);
    } catch (NumberFormatException nfe) {
      caching = DEFAULT_CACHING;
    }
    try {
      versions = Integer.parseInt(otherArgs[1]);
    } catch (NumberFormatException nfe) {
      versions = DEFAULT_VERSIONS;
    }
    String[] tableName = conf.getStrings(TABLE_NAME_TO_INDEX);
    if (tableName == null) {
      System.out.println("Wrong usage. Pass the table as -Dtablename.to.index='table1' "
          + "-Dtable.columns.index='IDX1=>cf1:[q1->datatype&length],[q2],"
          + "[q3];cf2:[q1->datatype&length],[q2->datatype&length],[q3->datatype&length]"
          + "#IDX2=>cf1:q5,q5'");
      System.out.println("The format used here is: ");
      System.out.println("IDX1 - Index name");
      System.out.println("cf1 - Column family name");
      System.out.println("q1 - Qualifier");
      System.out.println("datatype - Datatype (Int, String, Double, Float)");
      System.out.println("length - Length of the value");
      System.out.println("Column families should be separated by ';'");
      System.out.println("The qualifier, its datatype and its length should be enclosed in '[]'."
          + " The qualifier details are specified with '->' following the qualifier name,"
          + " and the details are separated by '&'");
      System.out.println("If the qualifier details are not specified, default values are used.");
      System.out.println("'#' is used to separate the details of two indexes");
      System.out.println("Pass the scanner caching and max versions as arguments.");
      System.exit(-1);
    }
    String tableNameToIndex = tableName[0];
    IndexUtils.createIndexTable(tableNameToIndex, conf, cfs);
    createMapReduceJob(tableNameToIndex, conf, caching, versions);
  }

  private static void createMapReduceJob(String tableNameToIndex, Configuration conf, int caching,
      int versions) throws IOException, InterruptedException, ClassNotFoundException {
    // Set the details to TableInputFormat
    Scan s = new Scan();
    s.setCaching(caching);
    s.setMaxVersions(versions);
    conf.set(TableInputFormat.INPUT_TABLE, tableNameToIndex);
    Set<Entry<String, List<String>>> entrySet = cfs.entrySet();
    for (Entry<String, List<String>> entry : entrySet) {
      List<String> quals = entry.getValue();
      addColumn(quals, Bytes.toBytes(entry.getKey()), s);
    }
    Job job = new Job(conf, "CreateIndex");
    String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);

    TableMapReduceUtil.initTableMapperJob(
        tableNameToIndex,             // input table
        s,                            // Scan instance to control CF and attribute selection
        IndexCreationMapper.class,    // mapper class
        ImmutableBytesWritable.class, // mapper output key
        Put.class,                    // mapper output value
        job);

    TableMapReduceUtil.initTableReducerJob(
        IndexUtils.getIndexTableName(tableNameToIndex), // output table
        null,                                           // reducer class
        job);

    if (hfileOutPath != null) {
      HTable table = new HTable(conf, tableNameToIndex);
      job.setReducerClass(KeyValueSortReducer.class);
      Path outputDir = new Path(hfileOutPath);
      FileOutputFormat.setOutputPath(job, outputDir);
      HFileOutputFormat.configureIncrementalLoad(job, table);
    } else {
      job.setNumReduceTasks(0);
    }
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
        com.google.common.base.Preconditions.class);
    job.waitForCompletion(true);
    assert job.isComplete();
  }

  private static void addColumn(List<String> quals, byte[] cf, Scan s) {
    for (String q : quals) {
      s.addColumn(cf, Bytes.toBytes(q));
    }
  }
}
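
// A sketch of a possible invocation, assuming the 'hbase' launcher script is on the
// PATH; the table name 'table1', the index spec, the caching value (500) and the max
// versions value (1) below are illustrative only. GenericOptionsParser consumes the
// -D properties first, leaving caching and versions as the two positional arguments
// read in main():
//
//   hbase org.apache.hadoop.hbase.index.mapreduce.TableIndexer \
//       -Dtablename.to.index='table1' \
//       -Dtable.columns.index='IDX1=>cf1:[q1->String&10],[q2];cf2:[q1->Int&4]' \
//       500 1
//
// Additionally setting -Dimport.bulk.output=/tmp/index_hfiles (a hypothetical path)
// makes the job write HFiles suitable for bulk loading into the index table instead
// of writing Puts directly.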