/*
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.client.coprocessor;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.AggregateProtocol;
import org.apache.hadoop.hbase.coprocessor.ColumnInterpreter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
/**
* This client class is for invoking the aggregate functions deployed on the
* Region Server side via the AggregateProtocol. This class will implement the
* supporting functionality for summing/processing the individual results
* obtained from the AggregateProtocol for each region.
* <p>
* This will serve as the client side handler for invoking the aggregate
* functions.
* <ul>
* For all aggregate functions,
* <li>start row < end row is an essential condition (if they are not
* {@link HConstants#EMPTY_BYTE_ARRAY})
* <li>Column family can't be null. In case where multiple families are
* provided, an IOException will be thrown. An optional column qualifier can
* also be defined.
* <li>For methods to find maximum, minimum, sum, rowcount, it returns the
* parameter type. For average and std, it returns a double value. For row
* count, it returns a long value.
*/
public class AggregationClient {
private static final Log log = LogFactory.getLog(AggregationClient.class);
Configuration conf;
/**
* Constructor with Conf object
* @param cfg
*/
public AggregationClient(Configuration cfg) {
this.conf = cfg;
}
/**
* It gives the maximum value of a column for a given column family for the
* given range. In case qualifier is null, a max of all values for the given
* family is returned.
* @param tableName
* @param ci
* @param scan
* @return max val <R>
* @throws Throwable
* The caller is supposed to handle the exception as they are thrown
* & propagated to it.
*/
public <R, S> R max(final byte[] tableName, final ColumnInterpreter<R, S> ci,
final Scan scan) throws Throwable {
validateParameters(scan);
class MaxCallBack implements Batch.Callback<R> {
R max = null;
R getMax() {
return max;
}
@Override
public synchronized void update(byte[] region, byte[] row, R result) {
max = (max == null || (result != null && ci.compare(max, result) < 0)) ? result : max;
}
}
MaxCallBack aMaxCallBack = new MaxCallBack();
HTable table = null;
try {
table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateProtocol, R>() {
@Override
public R call(AggregateProtocol instance) throws IOException {
return instance.getMax(ci, scan);
}
}, aMaxCallBack);
} finally {
if (table != null) {
table.close();
}
}
return aMaxCallBack.getMax();
}
private void validateParameters(Scan scan) throws IOException {
if (scan == null
|| (Bytes.equals(scan.getStartRow(), scan.getStopRow()) && !Bytes
.equals(scan.getStartRow(), HConstants.EMPTY_START_ROW))
|| ((Bytes.compareTo(scan.getStartRow(), scan.getStopRow()) > 0) &&
!Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW))) {
throw new IOException(
"Agg client Exception: Startrow should be smaller than Stoprow");
} else if (scan.getFamilyMap().size() != 1) {
throw new IOException("There must be only one family.");
}
}
/**
* It gives the minimum value of a column for a given column family for the
* given range. In case qualifier is null, a min of all values for the given
* family is returned.
* @param tableName
* @param ci
* @param scan
* @return min val <R>
* @throws Throwable
*/
public <R, S> R min(final byte[] tableName, final ColumnInterpreter<R, S> ci,
final Scan scan) throws Throwable {
validateParameters(scan);
class MinCallBack implements Batch.Callback<R> {
private R min = null;
public R getMinimum() {
return min;
}
@Override
public synchronized void update(byte[] region, byte[] row, R result) {
min = (min == null || (result != null && ci.compare(result, min) < 0)) ? result : min;
}
}
MinCallBack minCallBack = new MinCallBack();
HTable table = null;
try {
table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateProtocol, R>() {
@Override
public R call(AggregateProtocol instance) throws IOException {
return instance.getMin(ci, scan);
}
}, minCallBack);
} finally {
if (table != null) {
table.close();
}
}
log.debug("Min fom all regions is: " + minCallBack.getMinimum());
return minCallBack.getMinimum();
}
/**
* It gives the row count, by summing up the individual results obtained from
* regions. In case the qualifier is null, FirstKEyValueFilter is used to
* optimised the operation. In case qualifier is provided, I can't use the
* filter as it may set the flag to skip to next row, but the value read is
* not of the given filter: in this case, this particular row will not be
* counted ==> an error.
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S> long rowCount(final byte[] tableName,
final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
validateParameters(scan);
class RowNumCallback implements Batch.Callback<Long> {
private final AtomicLong rowCountL = new AtomicLong(0);
public long getRowNumCount() {
return rowCountL.get();
}
@Override
public void update(byte[] region, byte[] row, Long result) {
rowCountL.addAndGet(result.longValue());
}
}
RowNumCallback rowNum = new RowNumCallback();
HTable table = null;
try {
table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateProtocol, Long>() {
@Override
public Long call(AggregateProtocol instance) throws IOException {
return instance.getRowNum(ci, scan);
}
}, rowNum);
} finally {
if (table != null) {
table.close();
}
}
return rowNum.getRowNumCount();
}
/**
* It sums up the value returned from various regions. In case qualifier is
* null, summation of all the column qualifiers in the given family is done.
* @param tableName
* @param ci
* @param scan
* @return sum <S>
* @throws Throwable
*/
public <R, S> S sum(final byte[] tableName, final ColumnInterpreter<R, S> ci,
final Scan scan) throws Throwable {
validateParameters(scan);
class SumCallBack implements Batch.Callback<S> {
S sumVal = null;
public S getSumResult() {
return sumVal;
}
@Override
public synchronized void update(byte[] region, byte[] row, S result) {
sumVal = ci.add(sumVal, result);
}
}
SumCallBack sumCallBack = new SumCallBack();
HTable table = null;
try {
table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateProtocol, S>() {
@Override
public S call(AggregateProtocol instance) throws IOException {
return instance.getSum(ci, scan);
}
}, sumCallBack);
} finally {
if (table != null) {
table.close();
}
}
return sumCallBack.getSumResult();
}
/**
* It computes average while fetching sum and row count from all the
* corresponding regions. Approach is to compute a global sum of region level
* sum and rowcount and then compute the average.
* @param tableName
* @param scan
* @throws Throwable
*/
private <R, S> Pair<S, Long> getAvgArgs(final byte[] tableName,
final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
validateParameters(scan);
class AvgCallBack implements Batch.Callback<Pair<S, Long>> {
S sum = null;
Long rowCount = 0l;
public Pair<S, Long> getAvgArgs() {
return new Pair<S, Long>(sum, rowCount);
}
@Override
public synchronized void update(byte[] region, byte[] row, Pair<S, Long> result) {
sum = ci.add(sum, result.getFirst());
rowCount += result.getSecond();
}
}
AvgCallBack avgCallBack = new AvgCallBack();
HTable table = null;
try {
table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(),
scan.getStopRow(),
new Batch.Call<AggregateProtocol, Pair<S, Long>>() {
@Override
public Pair<S, Long> call(AggregateProtocol instance)
throws IOException {
return instance.getAvg(ci, scan);
}
}, avgCallBack);
} finally {
if (table != null) {
table.close();
}
}
return avgCallBack.getAvgArgs();
}
/**
* This is the client side interface/handle for calling the average method for
* a given cf-cq combination. It was necessary to add one more call stack as
* its return type should be a decimal value, irrespective of what
* columninterpreter says. So, this methods collects the necessary parameters
* to compute the average and returs the double value.
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S> double avg(final byte[] tableName,
final ColumnInterpreter<R, S> ci, Scan scan) throws Throwable {
Pair<S, Long> p = getAvgArgs(tableName, ci, scan);
return ci.divideForAvg(p.getFirst(), p.getSecond());
}
/**
* It computes a global standard deviation for a given column and its value.
* Standard deviation is square root of (average of squares -
* average*average). From individual regions, it obtains sum, square sum and
* number of rows. With these, the above values are computed to get the global
* std.
* @param tableName
* @param scan
* @return
* @throws Throwable
*/
private <R, S> Pair<List<S>, Long> getStdArgs(final byte[] tableName,
final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
validateParameters(scan);
class StdCallback implements Batch.Callback<Pair<List<S>, Long>> {
long rowCountVal = 0l;
S sumVal = null, sumSqVal = null;
public Pair<List<S>, Long> getStdParams() {
List<S> l = new ArrayList<S>();
l.add(sumVal);
l.add(sumSqVal);
Pair<List<S>, Long> p = new Pair<List<S>, Long>(l, rowCountVal);
return p;
}
@Override
public synchronized void update(byte[] region, byte[] row, Pair<List<S>, Long> result) {
sumVal = ci.add(sumVal, result.getFirst().get(0));
sumSqVal = ci.add(sumSqVal, result.getFirst().get(1));
rowCountVal += result.getSecond();
}
}
StdCallback stdCallback = new StdCallback();
HTable table = null;
try {
table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(),
scan.getStopRow(),
new Batch.Call<AggregateProtocol, Pair<List<S>, Long>>() {
@Override
public Pair<List<S>, Long> call(AggregateProtocol instance)
throws IOException {
return instance.getStd(ci, scan);
}
}, stdCallback);
} finally {
if (table != null) {
table.close();
}
}
return stdCallback.getStdParams();
}
/**
* This is the client side interface/handle for calling the std method for a
* given cf-cq combination. It was necessary to add one more call stack as its
* return type should be a decimal value, irrespective of what
* columninterpreter says. So, this methods collects the necessary parameters
* to compute the std and returns the double value.
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S> double std(final byte[] tableName, ColumnInterpreter<R, S> ci,
Scan scan) throws Throwable {
Pair<List<S>, Long> p = getStdArgs(tableName, ci, scan);
double res = 0d;
double avg = ci.divideForAvg(p.getFirst().get(0), p.getSecond());
double avgOfSumSq = ci.divideForAvg(p.getFirst().get(1), p.getSecond());
res = avgOfSumSq - (avg) * (avg); // variance
res = Math.pow(res, 0.5);
return res;
}
/**
* It helps locate the region with median for a given column whose weight
* is specified in an optional column.
* From individual regions, it obtains sum of values and sum of weights.
* @param tableName
* @param ci
* @param scan
* @return pair whose first element is a map between start row of the region
* and (sum of values, sum of weights) for the region, the second element is
* (sum of values, sum of weights) for all the regions chosen
* @throws Throwable
*/
private <R, S> Pair<NavigableMap<byte[], List<S>>, List<S>>
getMedianArgs(final byte[] tableName,
final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
validateParameters(scan);
final NavigableMap<byte[], List<S>> map =
new TreeMap<byte[], List<S>>(Bytes.BYTES_COMPARATOR);
class StdCallback implements Batch.Callback<List<S>> {
S sumVal = null, sumWeights = null;
public Pair<NavigableMap<byte[], List<S>>, List<S>> getMedianParams() {
List<S> l = new ArrayList<S>();
l.add(sumVal);
l.add(sumWeights);
Pair<NavigableMap<byte[], List<S>>, List<S>> p =
new Pair<NavigableMap<byte[], List<S>>, List<S>>(map, l);
return p;
}
@Override
public synchronized void update(byte[] region, byte[] row, List<S> result) {
map.put(row, result);
sumVal = ci.add(sumVal, result.get(0));
sumWeights = ci.add(sumWeights, result.get(1));
}
}
StdCallback stdCallback = new StdCallback();
HTable table = null;
try {
table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(),
scan.getStopRow(), new Batch.Call<AggregateProtocol, List<S>>() {
@Override
public List<S> call(AggregateProtocol instance) throws IOException {
return instance.getMedian(ci, scan);
}
}, stdCallback);
} finally {
if (table != null) {
table.close();
}
}
return stdCallback.getMedianParams();
}
/**
* This is the client side interface/handler for calling the median method for a
* given cf-cq combination. This method collects the necessary parameters
* to compute the median and returns the median.
* @param tableName
* @param ci
* @param scan
* @return R the median
* @throws Throwable
*/
public <R, S> R median(final byte[] tableName, ColumnInterpreter<R, S> ci,
Scan scan) throws Throwable {
Pair<NavigableMap<byte[], List<S>>, List<S>> p = getMedianArgs(tableName, ci, scan);
byte[] startRow = null;
byte[] colFamily = scan.getFamilies()[0];
NavigableSet<byte[]> quals = scan.getFamilyMap().get(colFamily);
NavigableMap<byte[], List<S>> map = p.getFirst();
S sumVal = p.getSecond().get(0);
S sumWeights = p.getSecond().get(1);
double halfSumVal = ci.divideForAvg(sumVal, 2L);
double movingSumVal = 0;
boolean weighted = false;
if (quals.size() > 1) {
weighted = true;
halfSumVal = ci.divideForAvg(sumWeights, 2L);
}
for (Map.Entry<byte[], List<S>> entry : map.entrySet()) {
S s = weighted ? entry.getValue().get(1) : entry.getValue().get(0);
double newSumVal = movingSumVal + ci.divideForAvg(s, 1L);
if (newSumVal > halfSumVal) break; // we found the region with the median
movingSumVal = newSumVal;
startRow = entry.getKey();
}
// scan the region with median and find it
Scan scan2 = new Scan(scan);
// inherit stop row from method parameter
if (startRow != null) scan2.setStartRow(startRow);
HTable table = null;
ResultScanner scanner = null;
try {
table = new HTable(conf, tableName);
int cacheSize = scan2.getCaching();
if (!scan2.getCacheBlocks() || scan2.getCaching() < 2) {
scan2.setCacheBlocks(true);
cacheSize = 5;
scan2.setCaching(cacheSize);
}
scanner = table.getScanner(scan2);
Result[] results = null;
byte[] qualifier = quals.pollFirst();
// qualifier for the weight column
byte[] weightQualifier = weighted ? quals.pollLast() : qualifier;
R value = null;
do {
results = scanner.next(cacheSize);
if (results != null && results.length > 0) {
for (int i = 0; i < results.length; i++) {
Result r = results[i];
// retrieve weight
KeyValue kv = r.getColumnLatest(colFamily, weightQualifier);
R newValue = ci.getValue(colFamily, weightQualifier, kv);
S s = ci.castToReturnType(newValue);
double newSumVal = movingSumVal + ci.divideForAvg(s, 1L);
// see if we have moved past the median
if (newSumVal > halfSumVal) {
return value;
}
movingSumVal = newSumVal;
kv = r.getColumnLatest(colFamily, qualifier);
value = ci.getValue(colFamily, qualifier, kv);
}
}
} while (results != null && results.length > 0);
} finally {
if (scanner != null) {
scanner.close();
}
if (table != null) {
table.close();
}
}
return null;
}
}