/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import junit.framework.Assert;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ChaosMonkey;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ChaosMonkey.Action;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
 * A system test which does large data ingestion and verification using {@link LoadTestTool},
 * while introducing chaos by hoarding many regions into few servers (unbalancing), then
 * killing some of these servers, and triggering balancer.
 * It's configured using a set of constants on top, which cover this scenario and are
 * reasonable for minicluster. See constants if you want to tweak the test.
 * You can configure how long the test should run by using
 * "hbase.IntegrationTestRebalanceAndKillServersTargeted.runtime" configuration parameter,
 * which is probably most useful on cluster.
 */
@Category(IntegrationTests.class)
public class IntegrationTestRebalanceAndKillServersTargeted extends IngestIntegrationTestBase {
  private static final int NUM_SLAVES_BASE = 4; // number of slaves for the smallest cluster
  private static final long DEFAULT_RUN_TIME = 5 * 60 * 1000; // run for 5 min by default

  /** How often to introduce the chaos. If too frequent, sequence of kills on minicluster
   * can cause test to fail when Put runs out of retries. */
  private static final long CHAOS_EVERY_MS = 65 * 1000;

  /** Chaos driver created and started in {@link #setUp()}, stopped in {@link #tearDown()}. */
  private ChaosMonkey monkey;

  /** This action is too specific to put in ChaosMonkey; put it here */
  static class UnbalanceKillAndRebalanceAction extends ChaosMonkey.Action {
    /** Fractions of servers to get regions and live and die respectively; from all other
     * servers, HOARD_FRC_OF_REGIONS will be removed to the above randomly */
    private static final double FRC_SERVERS_THAT_HOARD_AND_LIVE = 0.1;
    private static final double FRC_SERVERS_THAT_HOARD_AND_DIE = 0.1;
    private static final double HOARD_FRC_OF_REGIONS = 0.8;

    /** Waits between calling unbalance and killing servers, kills and rebalance, and rebalance
     * and restarting the servers; to make sure these events have time to impact the cluster. */
    private static final long WAIT_FOR_UNBALANCE_MS = 2 * 1000;
    private static final long WAIT_FOR_KILLS_MS = 2 * 1000;
    private static final long WAIT_AFTER_BALANCE_MS = 5 * 1000;

    /**
     * Runs one round of chaos: randomly picks the hoarding servers, unbalances regions onto
     * them, kills the "hoard and die" share of them, forces the balancer, and finally
     * restarts the killed servers. Fixed sleeps separate the steps.
     *
     * @throws Exception if any cluster operation fails or the thread is interrupted while
     *   sleeping; an assertion error is raised when the hoarding servers would not leave
     *   at least one other server in the cluster
     */
    @Override
    protected void perform() throws Exception {
      ClusterStatus status = this.cluster.getClusterStatus();
      // Starts out as all servers; hoarders are removed below, leaving the region donors.
      List<ServerName> victimServers = new LinkedList<ServerName>(status.getServers());
      int liveCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_LIVE * victimServers.size());
      int deadCount = (int)Math.ceil(FRC_SERVERS_THAT_HOARD_AND_DIE * victimServers.size());
      int hoarderCount = liveCount + deadCount;
      // At least one server must remain outside the hoarding set.
      Assert.assertTrue(hoarderCount < victimServers.size());

      // Sized for every hoarder (both the ones that live and the ones that die).
      List<ServerName> targetServers = new ArrayList<ServerName>(hoarderCount);
      for (int i = 0; i < hoarderCount; ++i) {
        int victimIx = random.nextInt(victimServers.size());
        targetServers.add(victimServers.remove(victimIx));
      }
      unbalanceRegions(status, victimServers, targetServers, HOARD_FRC_OF_REGIONS);
      Thread.sleep(WAIT_FOR_UNBALANCE_MS);

      // The hoarders were drawn at random, so the first deadCount of them serve as the
      // "hoard and die" group; the count comes from FRC_SERVERS_THAT_HOARD_AND_DIE.
      List<ServerName> doomedServers = targetServers.subList(0, deadCount);
      for (ServerName doomed : doomedServers) {
        killRs(doomed);
      }
      Thread.sleep(WAIT_FOR_KILLS_MS);
      forceBalancer();
      Thread.sleep(WAIT_AFTER_BALANCE_MS);
      // Bring back exactly the servers that were killed above.
      for (ServerName doomed : doomedServers) {
        startRs(doomed);
      }
    }
  }

  /**
   * Runs the base class setup with {@link #NUM_SLAVES_BASE} and starts a chaos monkey that
   * periodically (every {@link #CHAOS_EVERY_MS}) runs {@link UnbalanceKillAndRebalanceAction}.
   */
  @Before
  @SuppressWarnings("unchecked")
  public void setUp() throws Exception {
    super.setUp(NUM_SLAVES_BASE);
    ChaosMonkey.Policy chaosPolicy = new ChaosMonkey.PeriodicRandomActionPolicy(
      CHAOS_EVERY_MS, new UnbalanceKillAndRebalanceAction());
    monkey = new ChaosMonkey(util, chaosPolicy);
    monkey.start();
  }

  /**
   * Stops the chaos monkey, if one was started, and then runs the base class teardown.
   * The base teardown runs even when stopping the monkey throws.
   */
  @After
  public void tearDown() throws Exception {
    try {
      if (monkey != null) {
        monkey.stop("tearDown");
        monkey.waitForStop();
      }
    } finally {
      super.tearDown();
    }
  }

  /**
   * Runs the ingest test from the base class with {@link #DEFAULT_RUN_TIME} as the run time
   * argument while the chaos monkey started in {@link #setUp()} is active.
   */
  @Test
  public void testDataIngest() throws Exception {
    runIngestTest(DEFAULT_RUN_TIME, 2500, 10, 100, 20);
  }
}