/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import junit.framework.Assert;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.protobuf.RequestConverter;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
* Test the draining servers feature.
*
* This is typically an integration test: a unit test would be to check that the
* master does no assign regions to a regionserver marked as drained.
*
* @see <a href="https://issues.apache.org/jira/browse/HBASE-4298">HBASE-4298</a>
*/
@Category(MediumTests.class)
public class TestDrainingServer {
private static final Log LOG = LogFactory.getLog(TestDrainingServer.class);
private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
private static final int NB_SLAVES = 5;
private static final int COUNT_OF_REGIONS = NB_SLAVES * 2;
/**
* Spin up a cluster with a bunch of regions on it.
*/
@BeforeClass
public static void setUpBeforeClass() throws Exception {
TEST_UTIL.startMiniCluster(NB_SLAVES);
TEST_UTIL.getHBaseCluster().waitForActiveAndReadyMaster();
TEST_UTIL.getConfiguration().setBoolean("hbase.master.enabletable.roundrobin", true);
final List<String> families = new ArrayList<String>(1);
families.add("family");
TEST_UTIL.createRandomTable("table", families, 1, 0, 0, COUNT_OF_REGIONS, 0);
// Ensure a stable env
TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, false);
boolean ready = false;
while (!ready){
waitForAllRegionsOnline();
// Assert that every regionserver has some regions on it.
int i = 0;
ready = true;
while (i < NB_SLAVES && ready){
HRegionServer hrs = TEST_UTIL.getMiniHBaseCluster().getRegionServer(i);
if (ProtobufUtil.getOnlineRegions(hrs).isEmpty()){
ready = false;
}
i++;
}
if (!ready){
TEST_UTIL.getHBaseAdmin().setBalancerRunning(true, true);
Assert.assertTrue("Can't start a balance!", TEST_UTIL.getHBaseAdmin().balancer());
TEST_UTIL.getHBaseAdmin().setBalancerRunning(false, false);
Thread.sleep(100);
}
}
}
private static HRegionServer setDrainingServer(final HRegionServer hrs)
throws KeeperException {
LOG.info("Making " + hrs.getServerName() + " the draining server; " +
"it has " + hrs.getNumberOfOnlineRegions() + " online regions");
ZooKeeperWatcher zkw = hrs.getZooKeeper();
String hrsDrainingZnode =
ZKUtil.joinZNode(zkw.drainingZNode, hrs.getServerName().toString());
ZKUtil.createWithParents(zkw, hrsDrainingZnode);
return hrs;
}
private static HRegionServer unsetDrainingServer(final HRegionServer hrs)
throws KeeperException {
ZooKeeperWatcher zkw = hrs.getZooKeeper();
String hrsDrainingZnode =
ZKUtil.joinZNode(zkw.drainingZNode, hrs.getServerName().toString());
ZKUtil.deleteNode(zkw, hrsDrainingZnode);
return hrs;
}
@AfterClass
public static void tearDownAfterClass() throws Exception {
TEST_UTIL.shutdownMiniCluster();
}
/**
* Test adding server to draining servers and then move regions off it.
* Make sure that no regions are moved back to the draining server.
* @throws IOException
* @throws KeeperException
*/
@Test // (timeout=30000)
public void testDrainingServerOffloading()
throws Exception {
// I need master in the below.
HMaster master = TEST_UTIL.getMiniHBaseCluster().getMaster();
HRegionInfo hriToMoveBack = null;
// Set first server as draining server.
HRegionServer drainingServer =
setDrainingServer(TEST_UTIL.getMiniHBaseCluster().getRegionServer(0));
try {
final int regionsOnDrainingServer =
drainingServer.getNumberOfOnlineRegions();
Assert.assertTrue(regionsOnDrainingServer > 0);
List<HRegionInfo> hris = ProtobufUtil.getOnlineRegions(drainingServer);
for (HRegionInfo hri : hris) {
// Pass null and AssignmentManager will chose a random server BUT it
// should exclude draining servers.
master.moveRegion(null,
RequestConverter.buildMoveRegionRequest(hri.getEncodedNameAsBytes(), null));
// Save off region to move back.
hriToMoveBack = hri;
}
// Wait for regions to come back on line again.
waitForAllRegionsOnline();
Assert.assertEquals(0, drainingServer.getNumberOfOnlineRegions());
} finally {
unsetDrainingServer(drainingServer);
}
// Now we've unset the draining server, we should be able to move a region
// to what was the draining server.
master.moveRegion(null,
RequestConverter.buildMoveRegionRequest(hriToMoveBack.getEncodedNameAsBytes(),
Bytes.toBytes(drainingServer.getServerName().toString())));
// Wait for regions to come back on line again.
waitForAllRegionsOnline();
Assert.assertEquals(1, drainingServer.getNumberOfOnlineRegions());
}
/**
* Test that draining servers are ignored even after killing regionserver(s).
* Verify that the draining server is not given any of the dead servers regions.
* @throws KeeperException
* @throws IOException
*/
@Test (timeout=30000)
public void testDrainingServerWithAbort() throws KeeperException, Exception {
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
waitForAllRegionsOnline();
final long regionCount = TEST_UTIL.getMiniHBaseCluster().countServedRegions();
// Let's get a copy of the regions today.
Collection<HRegion> regions = new ArrayList<HRegion>();
for (int i = 0; i < NB_SLAVES; i++) {
HRegionServer hrs = TEST_UTIL.getMiniHBaseCluster().getRegionServer(i);
regions.addAll( hrs.getCopyOfOnlineRegionsSortedBySize().values() );
}
// Choose the draining server
HRegionServer drainingServer = TEST_UTIL.getMiniHBaseCluster().getRegionServer(0);
final int regionsOnDrainingServer = drainingServer.getNumberOfOnlineRegions();
Assert.assertTrue(regionsOnDrainingServer > 0);
ServerManager sm = master.getServerManager();
Collection<HRegion> regionsBefore = drainingServer.
getCopyOfOnlineRegionsSortedBySize().values();
LOG.info("Regions of drained server are: "+ regionsBefore );
try {
// Add first server to draining servers up in zk.
setDrainingServer(drainingServer);
//wait for the master to receive and manage the event
while (sm.createDestinationServersList().contains(drainingServer.getServerName())) {
Thread.sleep(1);
}
LOG.info("The available servers are: "+ sm.createDestinationServersList());
Assert.assertEquals("Nothing should have happened here.", regionsOnDrainingServer,
drainingServer.getNumberOfOnlineRegions());
Assert.assertFalse("We should not have regions in transition here. List is: " +
master.getAssignmentManager().getRegionStates().getRegionsInTransition(),
master.getAssignmentManager().getRegionStates().isRegionsInTransition());
// Kill a few regionservers.
for (int aborted = 0; aborted <= 2; aborted++) {
HRegionServer hrs = TEST_UTIL.getMiniHBaseCluster().getRegionServer(aborted + 1);
hrs.abort("Aborting");
}
// Wait for regions to come back online again.
waitForAllRegionsOnline();
Collection<HRegion> regionsAfter =
drainingServer.getCopyOfOnlineRegionsSortedBySize().values();
LOG.info("Regions of drained server are: " + regionsAfter);
Assert.assertEquals("Test conditions are not met: regions were" +
" created/deleted during the test. ",
regionCount, TEST_UTIL.getMiniHBaseCluster().countServedRegions());
// Assert the draining server still has the same regions.
StringBuilder result = new StringBuilder();
for (HRegion r: regionsAfter){
if (!regionsBefore.contains(r)){
result.append(r).append(" was added after the drain");
if (regions.contains(r)){
result.append("(existing region");
} else {
result.append("(new region)");
}
result.append("; ");
}
}
for (HRegion r: regionsBefore){
if (!regionsAfter.contains(r)){
result.append(r).append(" was removed after the drain; ");
}
}
Assert.assertTrue("Errors are: "+ result.toString(), result.length()==0);
} finally {
unsetDrainingServer(drainingServer);
}
}
private static void waitForAllRegionsOnline() throws Exception {
// Wait for regions to come back on line again.
boolean done = false;
while (!done) {
Thread.sleep(1);
// Nothing in ZK RIT for a start
ZKAssign.blockUntilNoRIT(TEST_UTIL.getZooKeeperWatcher());
// Then we want all the regions to be marked as available...
if (!isAllRegionsOnline()) continue;
// And without any work in progress on the master side
if (TEST_UTIL.getMiniHBaseCluster().getMaster().
getAssignmentManager().getRegionStates().isRegionsInTransition()) continue;
// nor on the region server side
done = true;
for (JVMClusterUtil.RegionServerThread rs :
TEST_UTIL.getMiniHBaseCluster().getLiveRegionServerThreads()) {
if (!rs.getRegionServer().getRegionsInTransitionInRS().isEmpty()) {
done = false;
}
}
}
}
private static boolean isAllRegionsOnline() {
return TEST_UTIL.getMiniHBaseCluster().countServedRegions() ==
(COUNT_OF_REGIONS + 2 /*catalog regions*/);
}
}