/*
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.client.coprocessor;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.AggregateProtocol;
import org.apache.hadoop.hbase.coprocessor.ColumnInterpreter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
/**
* This client class is for invoking the aggregate functions deployed on the
* Region Server side via the AggregateProtocol. This class will implement the
* supporting functionality for summing/processing the individual results
* obtained from the AggregateProtocol for each region.
* <p>
* This will serve as the client side handler for invoking the aggregate
* functions.
* <ul>
* For all aggregate functions,
* <li>start row < end row is an essential condition (if they are not
* {@link HConstants#EMPTY_BYTE_ARRAY})
* <li>Column family can't be null. In case where multiple families are
* provided, an IOException will be thrown. An optional column qualifier can
* also be defined.
* <li>For methods to find maximum, minimum, sum, rowcount, it returns the
* parameter type. For average and std, it returns a double value. For row
* count, it returns a long value.
*/
public class AggregationClient {
private static final Log log = LogFactory.getLog(AggregationClient.class);
Configuration conf;
/**
* Constructor with Conf object
* @param cfg
*/
public AggregationClient(Configuration cfg) {
this.conf = cfg;
}
/**
* It gives the maximum value of a column for a given column family for the
* given range. In case qualifier is null, a max of all values for the given
* family is returned.
* @param tableName
* @param ci
* @param scan
* @return max val <R>
* @throws Throwable
* The caller is supposed to handle the exception as they are thrown
* & propagated to it.
*/
public <R, S> R max(final byte[] tableName, final ColumnInterpreter<R, S> ci,
final Scan scan) throws Throwable {
validateParameters(scan);
HTable table = new HTable(conf, tableName);
class MaxCallBack implements Batch.Callback<R> {
R max = null;
R getMax() {
return max;
}
@Override
public synchronized void update(byte[] region, byte[] row, R result) {
max = ci.compare(max, result) < 0 ? result : max;
}
}
MaxCallBack aMaxCallBack = new MaxCallBack();
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
.getStopRow(), new Batch.Call<AggregateProtocol, R>() {
@Override
public R call(AggregateProtocol instance) throws IOException {
return instance.getMax(ci, scan);
}
}, aMaxCallBack);
return aMaxCallBack.getMax();
}
private void validateParameters(Scan scan) throws IOException {
if (scan == null
|| (Bytes.equals(scan.getStartRow(), scan.getStopRow()) && !Bytes
.equals(scan.getStartRow(), HConstants.EMPTY_START_ROW))
|| Bytes.compareTo(scan.getStartRow(), scan.getStopRow()) > 0) {
throw new IOException(
"Agg client Exception: Startrow should be smaller than Stoprow");
} else if (scan.getFamilyMap().size() != 1) {
throw new IOException("There must be only one family.");
}
}
/**
* It gives the minimum value of a column for a given column family for the
* given range. In case qualifier is null, a min of all values for the given
* family is returned.
* @param tableName
* @param ci
* @param scan
* @return min val <R>
* @throws Throwable
*/
public <R, S> R min(final byte[] tableName, final ColumnInterpreter<R, S> ci,
final Scan scan) throws Throwable {
validateParameters(scan);
class MinCallBack implements Batch.Callback<R> {
private R min = null;
public R getMinimum() {
return min;
}
@Override
public synchronized void update(byte[] region, byte[] row, R result) {
min = (min == null || ci.compare(result, min) < 0) ? result : min;
}
}
HTable table = new HTable(conf, tableName);
MinCallBack minCallBack = new MinCallBack();
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
.getStopRow(), new Batch.Call<AggregateProtocol, R>() {
@Override
public R call(AggregateProtocol instance) throws IOException {
return instance.getMin(ci, scan);
}
}, minCallBack);
log.debug("Min fom all regions is: " + minCallBack.getMinimum());
return minCallBack.getMinimum();
}
/**
* It gives the row count, by summing up the individual results obtained from
* regions. In case the qualifier is null, FirstKEyValueFilter is used to
* optimised the operation. In case qualifier is provided, I can't use the
* filter as it may set the flag to skip to next row, but the value read is
* not of the given filter: in this case, this particular row will not be
* counted ==> an error.
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S> long rowCount(final byte[] tableName,
final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
validateParameters(scan);
class RowNumCallback implements Batch.Callback<Long> {
private final AtomicLong rowCountL = new AtomicLong(0);
public long getRowNumCount() {
return rowCountL.get();
}
@Override
public void update(byte[] region, byte[] row, Long result) {
rowCountL.addAndGet(result.longValue());
}
}
RowNumCallback rowNum = new RowNumCallback();
HTable table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
.getStopRow(), new Batch.Call<AggregateProtocol, Long>() {
@Override
public Long call(AggregateProtocol instance) throws IOException {
return instance.getRowNum(ci, scan);
}
}, rowNum);
return rowNum.getRowNumCount();
}
/**
* It sums up the value returned from various regions. In case qualifier is
* null, summation of all the column qualifiers in the given family is done.
* @param tableName
* @param ci
* @param scan
* @return sum <S>
* @throws Throwable
*/
public <R, S> S sum(final byte[] tableName, final ColumnInterpreter<R, S> ci,
final Scan scan) throws Throwable {
validateParameters(scan);
class SumCallBack implements Batch.Callback<S> {
S sumVal = null;
public S getSumResult() {
return sumVal;
}
@Override
public synchronized void update(byte[] region, byte[] row, S result) {
sumVal = ci.add(sumVal, result);
}
}
SumCallBack sumCallBack = new SumCallBack();
HTable table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
.getStopRow(), new Batch.Call<AggregateProtocol, S>() {
@Override
public S call(AggregateProtocol instance) throws IOException {
return instance.getSum(ci, scan);
}
}, sumCallBack);
return sumCallBack.getSumResult();
}
/**
* It computes average while fetching sum and row count from all the
* corresponding regions. Approach is to compute a global sum of region level
* sum and rowcount and then compute the average.
* @param tableName
* @param scan
* @throws Throwable
*/
private <R, S> Pair<S, Long> getAvgArgs(final byte[] tableName,
final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
validateParameters(scan);
class AvgCallBack implements Batch.Callback<Pair<S, Long>> {
S sum = null;
Long rowCount = 0l;
public Pair<S, Long> getAvgArgs() {
return new Pair<S, Long>(sum, rowCount);
}
@Override
public synchronized void update(byte[] region, byte[] row, Pair<S, Long> result) {
sum = ci.add(sum, result.getFirst());
rowCount += result.getSecond();
}
}
AvgCallBack avgCallBack = new AvgCallBack();
HTable table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
.getStopRow(), new Batch.Call<AggregateProtocol, Pair<S, Long>>() {
@Override
public Pair<S, Long> call(AggregateProtocol instance) throws IOException {
return instance.getAvg(ci, scan);
}
}, avgCallBack);
return avgCallBack.getAvgArgs();
}
/**
* This is the client side interface/handle for calling the average method for
* a given cf-cq combination. It was necessary to add one more call stack as
* its return type should be a decimal value, irrespective of what
* columninterpreter says. So, this methods collects the necessary parameters
* to compute the average and returs the double value.
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S> double avg(final byte[] tableName,
final ColumnInterpreter<R, S> ci, Scan scan) throws Throwable {
Pair<S, Long> p = getAvgArgs(tableName, ci, scan);
return ci.divideForAvg(p.getFirst(), p.getSecond());
}
/**
* It computes a global standard deviation for a given column and its value.
* Standard deviation is square root of (average of squares -
* average*average). From individual regions, it obtains sum, square sum and
* number of rows. With these, the above values are computed to get the global
* std.
* @param tableName
* @param scan
* @return
* @throws Throwable
*/
private <R, S> Pair<List<S>, Long> getStdArgs(final byte[] tableName,
final ColumnInterpreter<R, S> ci, final Scan scan) throws Throwable {
validateParameters(scan);
class StdCallback implements Batch.Callback<Pair<List<S>, Long>> {
long rowCountVal = 0l;
S sumVal = null, sumSqVal = null;
public Pair<List<S>, Long> getStdParams() {
List<S> l = new ArrayList<S>();
l.add(sumVal);
l.add(sumSqVal);
Pair<List<S>, Long> p = new Pair<List<S>, Long>(l, rowCountVal);
return p;
}
@Override
public synchronized void update(byte[] region, byte[] row, Pair<List<S>, Long> result) {
sumVal = ci.add(sumVal, result.getFirst().get(0));
sumSqVal = ci.add(sumSqVal, result.getFirst().get(1));
rowCountVal += result.getSecond();
}
}
StdCallback stdCallback = new StdCallback();
HTable table = new HTable(conf, tableName);
table.coprocessorExec(AggregateProtocol.class, scan.getStartRow(), scan
.getStopRow(),
new Batch.Call<AggregateProtocol, Pair<List<S>, Long>>() {
@Override
public Pair<List<S>, Long> call(AggregateProtocol instance)
throws IOException {
return instance.getStd(ci, scan);
}
}, stdCallback);
return stdCallback.getStdParams();
}
/**
* This is the client side interface/handle for calling the std method for a
* given cf-cq combination. It was necessary to add one more call stack as its
* return type should be a decimal value, irrespective of what
* columninterpreter says. So, this methods collects the necessary parameters
* to compute the std and returns the double value.
* @param tableName
* @param ci
* @param scan
* @return <R, S>
* @throws Throwable
*/
public <R, S> double std(final byte[] tableName, ColumnInterpreter<R, S> ci,
Scan scan) throws Throwable {
Pair<List<S>, Long> p = getStdArgs(tableName, ci, scan);
double res = 0d;
double avg = ci.divideForAvg(p.getFirst().get(0), p.getSecond());
double avgOfSumSq = ci.divideForAvg(p.getFirst().get(1), p.getSecond());
res = avgOfSumSq - (avg) * (avg); // variance
res = Math.pow(res, 0.5);
return res;
}
}