/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Abortable;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.LargeTests;
import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.RegionTransition;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSTableDescriptors;
import org.apache.hadoop.hbase.util.JVMClusterUtil;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTable;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@Category(LargeTests.class)
public class TestMasterFailover {
private static final Log LOG = LogFactory.getLog(TestMasterFailover.class);
/**
* Complex test of master failover that exercises as many as practical of the
* different states that regions in transition can be in within ZK.
* <p>
* This tests the proper handling of these states by the failed-over master
* and includes a thorough testing of the timeout code as well.
* <p>
* Starts with a single master and three regionservers.
* <p>
* Creates two tables, enabledTable and disabledTable, each pre-split into
* multiple regions (one per entry in SPLIT_KEYS). The disabledTable is then
* disabled.
* <p>
* After reaching steady-state, the master is killed. We then mock several
* states in ZK.
* <p>
* After mocking them, we will startup a new master which should become the
* active master and also detect that it is a failover. The primary test
* passing condition will be that all regions of the enabled table are
* assigned and all the regions of the disabled table are not assigned.
* <p>
* The different scenarios to be tested are below:
* <p>
* <b>ZK State: OFFLINE</b>
* <p>A node can get into OFFLINE state if</p>
* <ul>
* <li>An RS fails to open a region, so it reverts the state back to OFFLINE
* <li>The Master has created the node while assigning the region to an RS
* but has not yet sent the open RPC
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Master has assigned an enabled region but the RS failed to open it, so
* the region is not assigned anywhere and is sitting in ZK as OFFLINE</li>
* <li>A single mocked OFFLINE node covers both causes above, since the
* failed-over master sees the same ZK state either way</li>
* </ul>
* <p>
* <b>ZK State: CLOSING</b>
* <p>A node can get into CLOSING state if</p>
* <ul>
* <li>An RS has begun to close a region
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Region of enabled table was being closed but did not complete (the
* disabled-table analogue is not mocked in this test)
* </ul>
* <p>
* <b>ZK State: CLOSED</b>
* <p>A node can get into CLOSED state if</p>
* <ul>
* <li>An RS has completed closing a region but the master has not yet
* acknowledged it
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Region of a table that should be enabled was closed on an RS
* <li>Region of a table that should be disabled was closed on an RS
* </ul>
* <p>
* <b>ZK State: OPENING</b>
* <p>A node can get into OPENING state if</p>
* <ul>
* <li>An RS has begun to open a region
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>RS was opening a region of the enabled table but never finished
* </ul>
* <p>
* <b>ZK State: OPENED</b>
* <p>A node can get into OPENED state if</p>
* <ul>
* <li>An RS has finished opening a region but the master has not yet
* acknowledged it
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Region of a table that should be enabled was opened on an RS
* <li>Region of a table that should be disabled was opened on an RS
* </ul>
* @throws Exception
*/
@Test (timeout=180000)
public void testMasterFailoverWithMockedRIT() throws Exception {
final int NUM_MASTERS = 1;
final int NUM_RS = 3;
// Create config to use for this cluster
Configuration conf = HBaseConfiguration.create();
// Drop the assignment timeouts so the failed-over master's timeout monitor
// fires within the test window
conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 3);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 3);
// Start the cluster
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
log("Cluster started");
// Create a ZKW to use in the test
ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(TEST_UTIL);
// get all the master threads
List<MasterThread> masterThreads = cluster.getMasterThreads();
assertEquals(1, masterThreads.size());
// only one master thread, let's wait for it to be initialized
assertTrue(cluster.waitForActiveAndReadyMaster());
HMaster master = masterThreads.get(0).getMaster();
assertTrue(master.isActiveMaster());
assertTrue(master.isInitialized());
// disable load balancing on this master
master.balanceSwitch(false);
// create two tables in META, each with one region per entry in SPLIT_KEYS
byte [] FAMILY = Bytes.toBytes("family");
byte [][] SPLIT_KEYS = new byte [][] {
new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"),
Bytes.toBytes("fff"), Bytes.toBytes("ggg"), Bytes.toBytes("hhh"),
Bytes.toBytes("iii"), Bytes.toBytes("jjj")
};
byte [] enabledTable = Bytes.toBytes("enabledTable");
HTableDescriptor htdEnabled = new HTableDescriptor(enabledTable);
htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
FileSystem filesystem = FileSystem.get(conf);
Path rootdir = filesystem.makeQualified(
new Path(conf.get(HConstants.HBASE_DIR)));
// Write the .tableinfo
FSTableDescriptors.createTableDescriptor(filesystem, rootdir, htdEnabled);
HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getName(), null, null);
createRegion(hriEnabled, rootdir, conf, htdEnabled);
List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
byte [] disabledTable = Bytes.toBytes("disabledTable");
HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
// Write the .tableinfo
FSTableDescriptors.createTableDescriptor(filesystem, rootdir, htdDisabled);
HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getName(), null, null);
createRegion(hriDisabled, rootdir, conf, htdDisabled);
List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
log("Regions in META have been created");
// at this point we only expect the 2 catalog regions (-ROOT- and .META.)
// to be assigned out
assertEquals(2, cluster.countServedRegions());
// Let's just assign everything to first RS
HRegionServer hrs = cluster.getRegionServer(0);
ServerName serverName = hrs.getServerName();
HRegionInfo closingRegion = enabledRegions.remove(0);
// we'll need some regions to already be assigned out properly on live RS
List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
enabledAndAssignedRegions.add(enabledRegions.remove(0));
enabledAndAssignedRegions.add(enabledRegions.remove(0));
enabledAndAssignedRegions.add(closingRegion);
List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
disabledAndAssignedRegions.add(disabledRegions.remove(0));
disabledAndAssignedRegions.add(disabledRegions.remove(0));
// now actually assign them
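// Seeding regionPlans with an explicit RegionPlan pins each region to the
// chosen RS, so assignRegion() opens it there instead of picking a server.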
for (HRegionInfo hri : enabledAndAssignedRegions) {
master.assignmentManager.regionPlans.put(hri.getEncodedName(),
new RegionPlan(hri, null, serverName));
master.assignRegion(hri);
}
for (HRegionInfo hri : disabledAndAssignedRegions) {
master.assignmentManager.regionPlans.put(hri.getEncodedName(),
new RegionPlan(hri, null, serverName));
master.assignRegion(hri);
}
// wait for no more RIT
log("Waiting for assignment to finish");
ZKAssign.blockUntilNoRIT(zkw);
log("Assignment completed");
// Stop the master
log("Aborting master");
cluster.abortMaster(0);
cluster.waitOnMaster(0);
log("Master has aborted");
/*
* Now, let's start mocking up some weird states as described in the method
* javadoc.
*/
List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
log("Beginning to mock scenarios");
// Disable the disabledTable in ZK
ZKTable zktable = new ZKTable(zkw);
zktable.setDisabledTable(Bytes.toString(disabledTable));
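// The failed-over master reads this flag on startup and must leave the
// disabledTable's regions unassigned.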
/*
* ZK = OFFLINE
*/
// Region that should be assigned but is not and is in ZK as OFFLINE
HRegionInfo region = enabledRegions.remove(0);
regionsThatShouldBeOnline.add(region);
ZKAssign.createNodeOffline(zkw, region, serverName);
/*
* ZK = CLOSING
*/
regionsThatShouldBeOnline.add(closingRegion);
ZKAssign.createNodeClosing(zkw, closingRegion, serverName);
/*
* ZK = CLOSED
*/
// Region of enabled table closed but not ack
region = enabledRegions.remove(0);
regionsThatShouldBeOnline.add(region);
int version = ZKAssign.createNodeClosing(zkw, region, serverName);
ZKAssign.transitionNodeClosed(zkw, region, serverName, version);
// Region of disabled table closed but not ack
region = disabledRegions.remove(0);
regionsThatShouldBeOffline.add(region);
version = ZKAssign.createNodeClosing(zkw, region, serverName);
ZKAssign.transitionNodeClosed(zkw, region, serverName, version);
/*
* ZK = OPENING
*/
// RS was opening a region of the enabled table but never finished
region = enabledRegions.remove(0);
regionsThatShouldBeOnline.add(region);
ZKAssign.createNodeOffline(zkw, region, serverName);
ZKAssign.transitionNodeOpening(zkw, region, serverName);
/*
* ZK = OPENED
*/
// Region of enabled table was opened on RS
region = enabledRegions.remove(0);
regionsThatShouldBeOnline.add(region);
ZKAssign.createNodeOffline(zkw, region, serverName);
ProtobufUtil.openRegion(hrs, region);
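// openRegion is asynchronous; poll the znode until the RS transitions it
// to OPENED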
while (true) {
byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
RegionTransition rt = RegionTransition.parseFrom(bytes);
if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
break;
}
Thread.sleep(100);
}
// Region of disable table was opened on RS
region = disabledRegions.remove(0);
regionsThatShouldBeOffline.add(region);
ZKAssign.createNodeOffline(zkw, region, serverName);
ProtobufUtil.openRegion(hrs, region);
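// As above, wait for the RS to transition the znode to OPENED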
while (true) {
byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
RegionTransition rt = RegionTransition.parseFrom(bytes);
if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
break;
}
Thread.sleep(100);
}
/*
* ZK = NONE
*/
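// Nothing to mock here: a region with no transition node is the default case.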
/*
* DONE MOCKING
*/
log("Done mocking data up in ZK");
// Start up a new master
log("Starting up a new master");
master = cluster.startMaster().getMaster();
log("Waiting for master to be ready");
cluster.waitForActiveAndReadyMaster();
log("Master is ready");
// Failover should be completed, now wait for no RIT
log("Waiting for no more RIT");
ZKAssign.blockUntilNoRIT(zkw);
log("No more RIT in ZK, now doing final test verification");
// Grab all the regions that are online across RSs
Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
for (JVMClusterUtil.RegionServerThread rst :
cluster.getRegionServerThreads()) {
onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rst.getRegionServer()));
}
// Now, everything that should be online should be online
for (HRegionInfo hri : regionsThatShouldBeOnline) {
assertTrue(onlineRegions.contains(hri));
}
// Everything that should be offline should not be online
for (HRegionInfo hri : regionsThatShouldBeOffline) {
assertFalse(onlineRegions.contains(hri));
}
log("Done with verification, all passed, shutting down cluster");
// Done, shutdown the cluster
TEST_UTIL.shutdownMiniCluster();
}
/**
* Complex test of master failover that exercises as many as practical of the
* different states that regions in transition can be in within ZK when the
* transition node points to an RS that died while no master was around to
* process it.
* <p>
* This tests the proper handling of these states by the failed-over master
* and includes a thorough testing of the timeout code as well.
* <p>
* Starts with a single master and two regionservers.
* <p>
* Creates two tables, enabledTable and disabledTable, each pre-split into
* multiple regions (one per entry in SPLIT_KEYS). The disabledTable is then
* disabled.
* <p>
* After reaching steady-state, the master is killed. We then mock several
* states in ZK and hard-kill one of the regionservers.
* <p>
* After mocking them and killing an RS, we will startup a new master which
* should become the active master and also detect that it is a failover. The
* primary test passing condition will be that all regions of the enabled
* table are assigned and all the regions of the disabled table are not
* assigned.
* <p>
* The different scenarios to be tested are below:
* <p>
* <b>ZK State: CLOSING</b>
* <p>A node can get into CLOSING state if</p>
* <ul>
* <li>An RS has begun to close a region
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Region was being closed but the RS died before finishing the close
* </ul>
* <p>
* <b>ZK State: CLOSED</b>
* <p>A node can get into CLOSED state if</p>
* <ul>
* <li>An RS has completed closing a region but the master has not yet
* acknowledged it
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Region of enabled table was closed on the dead RS but not acknowledged
* <li>Region of disabled table was closed on the dead RS but not acknowledged
* </ul>
* <p>
* <b>ZK State: OPENING</b>
* <p>A node can get into OPENING state if</p>
* <ul>
* <li>An RS has begun to open a region
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Region of enabled table was OPENING on the RS when it died
* <li>Region of disabled table was OPENING on the RS when it died
* </ul>
* <p>
* <b>ZK State: OPENED</b>
* <p>A node can get into OPENED state if</p>
* <ul>
* <li>An RS has finished opening a region but the master has not yet
* acknowledged it
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Region of a table that should be enabled was opened by a now-dead RS
* <li>Region of a table that should be disabled was opened by a now-dead RS
* </ul>
* <p>
* <b>ZK State: NONE</b>
* <p>A region may have no transition node if</p>
* <ul>
* <li>The server hosting the region died and no master processed it
* </ul>
* <p>We will mock the scenarios</p>
* <ul>
* <li>Region of enabled table was on a dead RS that was not yet processed
* <li>Region of disabled table was on a dead RS that was not yet processed
* </ul>
* @throws Exception
*/
@Test (timeout=180000)
public void testMasterFailoverWithMockedRITOnDeadRS() throws Exception {
final int NUM_MASTERS = 1;
final int NUM_RS = 2;
// Create and start the cluster
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
Configuration conf = TEST_UTIL.getConfiguration();
// Drop the assignment timeouts so the failed-over master's timeout monitor
// fires within the test window
conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 4000);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1);
conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MAXTOSTART, 2);
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
log("Cluster started");
// Create a ZKW to use in the test
ZooKeeperWatcher zkw = new ZooKeeperWatcher(TEST_UTIL.getConfiguration(),
"unittest", new Abortable() {
@Override
public void abort(String why, Throwable e) {
LOG.error("Fatal ZK Error: " + why, e);
org.junit.Assert.assertFalse("Fatal ZK error", true);
}
@Override
public boolean isAborted() {
return false;
}
});
// get all the master threads
List<MasterThread> masterThreads = cluster.getMasterThreads();
assertEquals(1, masterThreads.size());
// only one master thread, let's wait for it to be initialized
assertTrue(cluster.waitForActiveAndReadyMaster());
HMaster master = masterThreads.get(0).getMaster();
assertTrue(master.isActiveMaster());
assertTrue(master.isInitialized());
// disable load balancing on this master
master.balanceSwitch(false);
// create two tables in META, each with one region per entry in SPLIT_KEYS
byte [] FAMILY = Bytes.toBytes("family");
byte [][] SPLIT_KEYS = new byte [][] {
new byte[0], Bytes.toBytes("aaa"), Bytes.toBytes("bbb"),
Bytes.toBytes("ccc"), Bytes.toBytes("ddd"), Bytes.toBytes("eee"),
Bytes.toBytes("fff"), Bytes.toBytes("ggg"), Bytes.toBytes("hhh"),
Bytes.toBytes("iii"), Bytes.toBytes("jjj")
};
byte [] enabledTable = Bytes.toBytes("enabledTable");
HTableDescriptor htdEnabled = new HTableDescriptor(enabledTable);
htdEnabled.addFamily(new HColumnDescriptor(FAMILY));
FileSystem filesystem = FileSystem.get(conf);
Path rootdir = filesystem.makeQualified(
new Path(conf.get(HConstants.HBASE_DIR)));
// Write the .tableinfo
FSTableDescriptors.createTableDescriptor(filesystem, rootdir, htdEnabled);
HRegionInfo hriEnabled = new HRegionInfo(htdEnabled.getName(),
null, null);
createRegion(hriEnabled, rootdir, conf, htdEnabled);
List<HRegionInfo> enabledRegions = TEST_UTIL.createMultiRegionsInMeta(
TEST_UTIL.getConfiguration(), htdEnabled, SPLIT_KEYS);
byte [] disabledTable = Bytes.toBytes("disabledTable");
HTableDescriptor htdDisabled = new HTableDescriptor(disabledTable);
htdDisabled.addFamily(new HColumnDescriptor(FAMILY));
// Write the .tableinfo
FSTableDescriptors.createTableDescriptor(filesystem, rootdir, htdDisabled);
HRegionInfo hriDisabled = new HRegionInfo(htdDisabled.getName(), null, null);
createRegion(hriDisabled, rootdir, conf, htdDisabled);
List<HRegionInfo> disabledRegions = TEST_UTIL.createMultiRegionsInMeta(
TEST_UTIL.getConfiguration(), htdDisabled, SPLIT_KEYS);
log("Regions in META have been created");
// at this point we only expect the 2 catalog regions (-ROOT- and .META.)
// to be assigned out
assertEquals(2, cluster.countServedRegions());
// The first RS will stay online
List<RegionServerThread> regionservers =
cluster.getRegionServerThreads();
HRegionServer hrs = regionservers.get(0).getRegionServer();
// The second RS is going to be hard-killed
RegionServerThread hrsDeadThread = regionservers.get(1);
HRegionServer hrsDead = hrsDeadThread.getRegionServer();
ServerName deadServerName = hrsDead.getServerName();
// we'll need some regions to already be assigned out properly on live RS
List<HRegionInfo> enabledAndAssignedRegions = new ArrayList<HRegionInfo>();
enabledAndAssignedRegions.add(enabledRegions.remove(0));
enabledAndAssignedRegions.add(enabledRegions.remove(0));
List<HRegionInfo> disabledAndAssignedRegions = new ArrayList<HRegionInfo>();
disabledAndAssignedRegions.add(disabledRegions.remove(0));
disabledAndAssignedRegions.add(disabledRegions.remove(0));
// now actually assign them
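// As in the previous test, RegionPlans pin these regions to the live RS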
for (HRegionInfo hri : enabledAndAssignedRegions) {
master.assignmentManager.regionPlans.put(hri.getEncodedName(),
new RegionPlan(hri, null, hrs.getServerName()));
master.assignRegion(hri);
}
for (HRegionInfo hri : disabledAndAssignedRegions) {
master.assignmentManager.regionPlans.put(hri.getEncodedName(),
new RegionPlan(hri, null, hrs.getServerName()));
master.assignRegion(hri);
}
assertTrue(" Table must be enabled.", master.getAssignmentManager()
.getZKTable().isEnabledTable("enabledTable"));
// we also need regions assigned out on the dead server
List<HRegionInfo> enabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
enabledAndOnDeadRegions.add(enabledRegions.remove(0));
enabledAndOnDeadRegions.add(enabledRegions.remove(0));
List<HRegionInfo> disabledAndOnDeadRegions = new ArrayList<HRegionInfo>();
disabledAndOnDeadRegions.add(disabledRegions.remove(0));
disabledAndOnDeadRegions.add(disabledRegions.remove(0));
// set region plan to server to be killed and trigger assign
for (HRegionInfo hri : enabledAndOnDeadRegions) {
master.assignmentManager.regionPlans.put(hri.getEncodedName(),
new RegionPlan(hri, null, deadServerName));
master.assignRegion(hri);
}
for (HRegionInfo hri : disabledAndOnDeadRegions) {
master.assignmentManager.regionPlans.put(hri.getEncodedName(),
new RegionPlan(hri, null, deadServerName));
master.assignRegion(hri);
}
// wait for no more RIT
log("Waiting for assignment to finish");
ZKAssign.blockUntilNoRIT(zkw);
log("Assignment completed");
// Stop the master
log("Aborting master");
cluster.abortMaster(0);
cluster.waitOnMaster(0);
log("Master has aborted");
/*
* Now, let's start mocking up some weird states as described in the method
* javadoc.
*/
List<HRegionInfo> regionsThatShouldBeOnline = new ArrayList<HRegionInfo>();
List<HRegionInfo> regionsThatShouldBeOffline = new ArrayList<HRegionInfo>();
log("Beginning to mock scenarios");
// Disable the disabledTable in ZK
ZKTable zktable = new ZKTable(zkw);
zktable.setDisabledTable(Bytes.toString(disabledTable));
assertTrue(" The enabled table should be identified on master fail over.",
zktable.isEnabledTable("enabledTable"));
/*
* ZK = CLOSING
*/
// Region of enabled table being closed on dead RS but not finished
HRegionInfo region = enabledAndOnDeadRegions.remove(0);
regionsThatShouldBeOnline.add(region);
ZKAssign.createNodeClosing(zkw, region, deadServerName);
LOG.debug("\n\nRegion of enabled table was CLOSING on dead RS\n" +
region + "\n\n");
// Region of disabled table being closed on dead RS but not finished
region = disabledAndOnDeadRegions.remove(0);
regionsThatShouldBeOffline.add(region);
ZKAssign.createNodeClosing(zkw, region, deadServerName);
LOG.debug("\n\nRegion of disabled table was CLOSING on dead RS\n" +
region + "\n\n");
/*
* ZK = CLOSED
*/
// Region of enabled on dead server gets closed but not ack'd by master
region = enabledAndOnDeadRegions.remove(0);
regionsThatShouldBeOnline.add(region);
int version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
LOG.debug("\n\nRegion of enabled table was CLOSED on dead RS\n" +
region + "\n\n");
// Region of disabled on dead server gets closed but not ack'd by master
region = disabledAndOnDeadRegions.remove(0);
regionsThatShouldBeOffline.add(region);
version = ZKAssign.createNodeClosing(zkw, region, deadServerName);
ZKAssign.transitionNodeClosed(zkw, region, deadServerName, version);
LOG.debug("\n\nRegion of disabled table was CLOSED on dead RS\n" +
region + "\n\n");
/*
* ZK = OPENING
*/
// RS was opening a region of enabled table then died
region = enabledRegions.remove(0);
regionsThatShouldBeOnline.add(region);
ZKAssign.createNodeOffline(zkw, region, deadServerName);
ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
LOG.debug("\n\nRegion of enabled table was OPENING on dead RS\n" +
region + "\n\n");
// RS was opening a region of disabled table then died
region = disabledRegions.remove(0);
regionsThatShouldBeOffline.add(region);
ZKAssign.createNodeOffline(zkw, region, deadServerName);
ZKAssign.transitionNodeOpening(zkw, region, deadServerName);
LOG.debug("\n\nRegion of disabled table was OPENING on dead RS\n" +
region + "\n\n");
/*
* ZK = OPENED
*/
// Region of enabled table was opened on dead RS
region = enabledRegions.remove(0);
regionsThatShouldBeOnline.add(region);
ZKAssign.createNodeOffline(zkw, region, deadServerName);
ProtobufUtil.openRegion(hrsDead, region);
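// Poll until the soon-to-be-killed RS transitions the znode to OPENED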
while (true) {
byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
RegionTransition rt = RegionTransition.parseFrom(bytes);
if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
break;
}
Thread.sleep(100);
}
LOG.debug("\n\nRegion of enabled table was OPENED on dead RS\n" +
region + "\n\n");
// Region of disabled table was opened on dead RS
region = disabledRegions.remove(0);
regionsThatShouldBeOffline.add(region);
ZKAssign.createNodeOffline(zkw, region, deadServerName);
ProtobufUtil.openRegion(hrsDead, region);
while (true) {
byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
RegionTransition rt = RegionTransition.parseFrom(bytes);
if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
break;
}
Thread.sleep(100);
}
LOG.debug("\n\nRegion of disabled table was OPENED on dead RS\n" +
region + "\n\n");
/*
* ZK = NONE
*/
// Region of enabled table was open at steady-state on dead RS
region = enabledRegions.remove(0);
regionsThatShouldBeOnline.add(region);
ZKAssign.createNodeOffline(zkw, region, deadServerName);
ProtobufUtil.openRegion(hrsDead, region);
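// Wait for OPENED, then delete the znode to mimic a master having fully
// processed the open; once the RS dies, ZK holds no trace of this region.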
while (true) {
byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
RegionTransition rt = RegionTransition.parseFrom(bytes);
if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
ZKAssign.deleteOpenedNode(zkw, region.getEncodedName());
LOG.debug("DELETED " + rt);
break;
}
Thread.sleep(100);
}
LOG.debug("\n\nRegion of enabled table was open at steady-state on dead RS"
+ "\n" + region + "\n\n");
// Region of disabled table was open at steady-state on dead RS
region = disabledRegions.remove(0);
regionsThatShouldBeOffline.add(region);
ZKAssign.createNodeOffline(zkw, region, deadServerName);
ProtobufUtil.openRegion(hrsDead, region);
while (true) {
byte [] bytes = ZKAssign.getData(zkw, region.getEncodedName());
RegionTransition rt = RegionTransition.parseFrom(bytes);
if (rt != null && rt.getEventType().equals(EventType.RS_ZK_REGION_OPENED)) {
ZKAssign.deleteOpenedNode(zkw, region.getEncodedName());
break;
}
Thread.sleep(100);
}
LOG.debug("\n\nRegion of disabled table was open at steady-state on dead RS"
+ "\n" + region + "\n\n");
/*
* DONE MOCKING
*/
log("Done mocking data up in ZK");
// Kill the RS that had a hard death
log("Killing RS " + deadServerName);
hrsDead.abort("Killing for unit test");
log("RS " + deadServerName + " killed");
// Start up a new master. Wait until regionserver is completely down
// before starting new master because of hbase-4511.
while (hrsDeadThread.isAlive()) {
Threads.sleep(10);
}
log("Starting up a new master");
master = cluster.startMaster().getMaster();
log("Waiting for master to be ready");
assertTrue(cluster.waitForActiveAndReadyMaster());
log("Master is ready");
// Let's add some weird states to master in-memory state
// After HBASE-3181, we need to have some ZK state if we're PENDING_OPEN
// b/c it is impossible for us to get into this state w/o a zk node
// this is not true of PENDING_CLOSE
// PENDING_OPEN and enabled
region = enabledRegions.remove(0);
regionsThatShouldBeOnline.add(region);
master.getAssignmentManager().getRegionStates().updateRegionState(
region, RegionState.State.PENDING_OPEN);
ZKAssign.createNodeOffline(zkw, region, master.getServerName());
// PENDING_OPEN and disabled
region = disabledRegions.remove(0);
regionsThatShouldBeOffline.add(region);
master.getAssignmentManager().getRegionStates().updateRegionState(
region, RegionState.State.PENDING_OPEN);
ZKAssign.createNodeOffline(zkw, region, master.getServerName());
// Failover should be completed, now wait for no RIT
log("Waiting for no more RIT");
ZKAssign.blockUntilNoRIT(zkw);
log("No more RIT in ZK");
long now = System.currentTimeMillis();
final long maxTime = 120000;
boolean done = master.assignmentManager.waitUntilNoRegionsInTransition(maxTime);
if (!done) {
LOG.info("rit=" + master.getAssignmentManager().getRegionStates().getRegionsInTransition());
}
long elapsed = System.currentTimeMillis() - now;
assertTrue("Elapsed=" + elapsed + ", maxTime=" + maxTime + ", done=" + done,
elapsed < maxTime);
log("No more RIT in RIT map, doing final test verification");
// Grab all the regions that are online across RSs
Set<HRegionInfo> onlineRegions = new TreeSet<HRegionInfo>();
for (JVMClusterUtil.RegionServerThread rst :
cluster.getRegionServerThreads()) {
try {
onlineRegions.addAll(ProtobufUtil.getOnlineRegions(rst.getRegionServer()));
} catch (org.apache.hadoop.hbase.regionserver.RegionServerStoppedException e) {
LOG.info("Got RegionServerStoppedException", e);
}
}
// Now, everything that should be online should be online
for (HRegionInfo hri : regionsThatShouldBeOnline) {
assertTrue("region=" + hri.getRegionNameAsString() + ", " + onlineRegions.toString(),
onlineRegions.contains(hri));
}
// Everything that should be offline should not be online
for (HRegionInfo hri : regionsThatShouldBeOffline) {
assertFalse(onlineRegions.contains(hri));
}
log("Done with verification, all passed, shutting down cluster");
// Done, shutdown the cluster
TEST_UTIL.shutdownMiniCluster();
}
HRegion createRegion(final HRegionInfo hri, final Path rootdir, final Configuration c,
final HTableDescriptor htd)
throws IOException {
HRegion r = HRegion.createHRegion(hri, rootdir, c, htd);
// The above call to create a region will create an hlog file. Each
// log file create will also create a running thread to do syncing. We need
// to close out this log else we will have a running thread trying to sync
// the file system continuously which is ugly when dfs is taken away at the
// end of the test.
HRegion.closeHRegion(r);
return r;
}
// TODO: Next test to add is with testing permutations of the RIT or the RS
// killed are hosting ROOT and META regions.
private void log(String string) {
LOG.info("\n\n" + string + " \n\n");
}
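/**
* Tests master failover when the META region's unassigned node in ZK has been
* forced to OPENED, pointing at a regionserver aborted before the restart.
* The new master must detect the stale OPENED node and reassign META.
* @throws Exception
*/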
@Test (timeout=180000)
public void testShouldCheckMasterFailOverWhenMETAIsInOpenedState()
throws Exception {
LOG.info("Starting testShouldCheckMasterFailOverWhenMETAIsInOpenedState");
final int NUM_MASTERS = 1;
final int NUM_RS = 2;
// Start the cluster
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
Configuration conf = TEST_UTIL.getConfiguration();
conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 8000);
conf.setInt("hbase.master.info.port", -1);
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
// Find regionserver carrying meta.
List<RegionServerThread> regionServerThreads =
cluster.getRegionServerThreads();
int count = -1;
HRegion metaRegion = null;
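// Abort regionservers one at a time; the loop exits once it has aborted the
// one carrying META, with count left holding that server's index.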
for (RegionServerThread regionServerThread : regionServerThreads) {
HRegionServer regionServer = regionServerThread.getRegionServer();
metaRegion = regionServer.getOnlineRegion(HRegionInfo.FIRST_META_REGIONINFO.getRegionName());
count++;
regionServer.abort("");
if (null != metaRegion) break;
}
HRegionServer regionServer = cluster.getRegionServer(count);
TEST_UTIL.shutdownMiniHBaseCluster();
// Create a ZKW to use in the test
ZooKeeperWatcher zkw =
HBaseTestingUtility.createAndForceNodeToOpenedState(TEST_UTIL,
metaRegion, regionServer.getServerName());
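// ZK now claims META is OPENED on the aborted RS; the restarted master must
// notice the stale node and reassign META.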
LOG.info("Staring cluster for second time");
TEST_UTIL.startMiniHBaseCluster(NUM_MASTERS, NUM_RS);
HMaster master = TEST_UTIL.getHBaseCluster().getMaster();
while (!master.isInitialized()) {
Thread.sleep(100);
}
// Failover should be completed, now wait for no RIT
log("Waiting for no more RIT");
ZKAssign.blockUntilNoRIT(zkw);
zkw.close();
// Stop the cluster
TEST_UTIL.shutdownMiniCluster();
}
/**
* Simple test of master failover.
* <p>
* Starts with three masters. Kills a backup master. Then kills the active
* master. Ensures the final master becomes active and we can still contact
* the cluster.
* @throws Exception
*/
@Test (timeout=240000)
public void testSimpleMasterFailover() throws Exception {
final int NUM_MASTERS = 3;
final int NUM_RS = 3;
// Start the cluster
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
// get all the master threads
List<MasterThread> masterThreads = cluster.getMasterThreads();
// wait for each to come online
for (MasterThread mt : masterThreads) {
assertTrue(mt.isAlive());
}
// verify only one is the active master and we have right number
int numActive = 0;
int activeIndex = -1;
ServerName activeName = null;
HMaster active = null;
for (int i = 0; i < masterThreads.size(); i++) {
if (masterThreads.get(i).getMaster().isActiveMaster()) {
numActive++;
activeIndex = i;
active = masterThreads.get(activeIndex).getMaster();
activeName = active.getServerName();
}
}
assertEquals(1, numActive);
assertEquals(NUM_MASTERS, masterThreads.size());
LOG.info("Active master " + activeName);
// Check that ClusterStatus reports the correct active and backup masters
assertNotNull(active);
ClusterStatus status = active.getClusterStatus();
assertTrue(status.getMaster().equals(activeName));
assertEquals(2, status.getBackupMastersSize());
assertEquals(2, status.getBackupMasters().size());
// attempt to stop one of the inactive masters
int backupIndex = (activeIndex == 0 ? 1 : activeIndex - 1);
HMaster master = cluster.getMaster(backupIndex);
LOG.debug("\n\nStopping a backup master: " + master.getServerName() + "\n");
cluster.stopMaster(backupIndex, false);
cluster.waitOnMaster(backupIndex);
// Verify there is still exactly one active master and it's the same one
numActive = 0;
for (int i = 0; i < masterThreads.size(); i++) {
if (masterThreads.get(i).getMaster().isActiveMaster()) {
numActive++;
activeIndex = i;
active = masterThreads.get(activeIndex).getMaster();
assertTrue(activeName.equals(active.getServerName()));
}
}
assertEquals(1, numActive);
assertEquals(2, masterThreads.size());
int rsCount = masterThreads.get(activeIndex).getMaster().getClusterStatus().getServersSize();
LOG.info("Active master " + active.getServerName() + " managing " + rsCount + " regions servers");
assertEquals(3, rsCount);
// Check that ClusterStatus reports the correct active and backup masters
assertNotNull(active);
status = active.getClusterStatus();
assertTrue(status.getMaster().equals(activeName));
assertEquals(1, status.getBackupMastersSize());
assertEquals(1, status.getBackupMasters().size());
// kill the active master
LOG.debug("\n\nStopping the active master " + active.getServerName() + "\n");
cluster.stopMaster(activeIndex, false);
cluster.waitOnMaster(activeIndex);
// wait for an active master to show up and be ready
assertTrue(cluster.waitForActiveAndReadyMaster());
LOG.debug("\n\nVerifying backup master is now active\n");
// should only have one master now
assertEquals(1, masterThreads.size());
// and it should be the active master
active = masterThreads.get(0).getMaster();
assertNotNull(active);
status = active.getClusterStatus();
ServerName mastername = status.getMaster();
assertTrue(mastername.equals(active.getServerName()));
assertTrue(active.isActiveMaster());
assertEquals(0, status.getBackupMastersSize());
assertEquals(0, status.getBackupMasters().size());
int rss = status.getServersSize();
LOG.info("Active master " + mastername.getServerName() + " managing " +
rss + " region servers");
assertEquals(3, rss);
// Stop the cluster
TEST_UTIL.shutdownMiniCluster();
}
/**
* return the index of the active master in the cluster
* @throws MasterNotRunningException if no active master found
*/
private int getActiveMasterIndex(MiniHBaseCluster cluster) throws MasterNotRunningException {
// get all the master threads
List<MasterThread> masterThreads = cluster.getMasterThreads();
for (int i = 0; i < masterThreads.size(); i++) {
if (masterThreads.get(i).getMaster().isActiveMaster()) {
return i;
}
}
throw new MasterNotRunningException();
}
/**
* Kill the master and wait for a new active master to show up
* @param cluster the mini-cluster whose active master will be killed
* @return the new active master
* @throws InterruptedException
* @throws IOException
*/
private HMaster killActiveAndWaitForNewActive(MiniHBaseCluster cluster)
throws InterruptedException, IOException {
int activeIndex = getActiveMasterIndex(cluster);
HMaster active = cluster.getMaster();
cluster.stopMaster(activeIndex);
cluster.waitOnMaster(activeIndex);
assertTrue(cluster.waitForActiveAndReadyMaster());
// double check this is actually a new master
HMaster newActive = cluster.getMaster();
assertFalse(active == newActive);
return newActive;
}
/**
* Test that if the master fails, the load balancer maintains its
* state (running or not) when the next master takes over
* @throws Exception
*/
@Test (timeout=240000)
public void testMasterFailoverBalancerPersistence() throws Exception {
final int NUM_MASTERS = 3;
final int NUM_RS = 1;
// Start the cluster
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
assertTrue(cluster.waitForActiveAndReadyMaster());
HMaster active = cluster.getMaster();
// check that the balancer is on by default for the active master
ClusterStatus clusterStatus = active.getClusterStatus();
assertTrue(clusterStatus.isBalancerOn());
active = killActiveAndWaitForNewActive(cluster);
// ensure the load balancer is still running on new master
clusterStatus = active.getClusterStatus();
assertTrue(clusterStatus.isBalancerOn());
// turn off the load balancer
active.balanceSwitch(false);
// once more, kill active master and wait for new active master to show up
active = killActiveAndWaitForNewActive(cluster);
// ensure the load balancer is not running on the new master
clusterStatus = active.getClusterStatus();
assertFalse(clusterStatus.isBalancerOn());
// Stop the cluster
TEST_UTIL.shutdownMiniCluster();
}
}