/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.replication;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.LargeTests;
import org.apache.hadoop.hbase.UnknownScannerException;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import static org.junit.Assert.fail;
@Category(LargeTests.class)
public class TestReplicationQueueFailover extends TestReplicationBase {
private static final Log LOG = LogFactory.getLog(TestReplicationQueueFailover.class);
/**
* Load up multiple tables over 2 region servers and kill a source during
* the upload. The failover happens internally.
*
* WARNING this test sometimes fails because of HBASE-3515
*
* @throws Exception
*/
@Test(timeout=300000)
public void queueFailover() throws Exception {
// killing the RS with .META. can result into failed puts until we solve
// IO fencing
int rsToKill1 =
utility1.getHBaseCluster().getServerWithMeta() == 0 ? 1 : 0;
int rsToKill2 =
utility2.getHBaseCluster().getServerWithMeta() == 0 ? 1 : 0;
// Takes about 20 secs to run the full loading, kill around the middle
Thread killer1 = killARegionServer(utility1, 7500, rsToKill1);
Thread killer2 = killARegionServer(utility2, 10000, rsToKill2);
LOG.info("Start loading table");
int initialCount = utility1.loadTable(htable1, famName);
LOG.info("Done loading table");
killer1.join(5000);
killer2.join(5000);
LOG.info("Done waiting for threads");
Result[] res;
while (true) {
try {
Scan scan = new Scan();
ResultScanner scanner = htable1.getScanner(scan);
res = scanner.next(initialCount);
scanner.close();
break;
} catch (UnknownScannerException ex) {
LOG.info("Cluster wasn't ready yet, restarting scanner");
}
}
// Test we actually have all the rows, we may miss some because we
// don't have IO fencing.
if (res.length != initialCount) {
LOG.warn("We lost some rows on the master cluster!");
// We don't really expect the other cluster to have more rows
initialCount = res.length;
}
int lastCount = 0;
final long start = System.currentTimeMillis();
int i = 0;
while (true) {
if (i==NB_RETRIES-1) {
fail("Waited too much time for queueFailover replication. " +
"Waited "+(System.currentTimeMillis() - start)+"ms.");
}
Scan scan2 = new Scan();
ResultScanner scanner2 = htable2.getScanner(scan2);
Result[] res2 = scanner2.next(initialCount * 2);
scanner2.close();
if (res2.length < initialCount) {
if (lastCount < res2.length) {
i--; // Don't increment timeout if we make progress
} else {
i++;
}
lastCount = res2.length;
LOG.info("Only got " + lastCount + " rows instead of " +
initialCount + " current i=" + i);
Thread.sleep(SLEEP_TIME*2);
} else {
break;
}
}
}
private static Thread killARegionServer(final HBaseTestingUtility utility,
final long timeout, final int rs) {
Thread killer = new Thread() {
public void run() {
try {
Thread.sleep(timeout);
utility.expireRegionServerSession(rs);
} catch (Exception e) {
LOG.error("Couldn't kill a region server", e);
}
}
};
killer.setDaemon(true);
killer.start();
return killer;
}
}