/**
* Copyright 2014 The CmRaft Project
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package com.chicm.cmraft.log;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.chicm.cmraft.common.Configuration;
import com.chicm.cmraft.common.ServerInfo;
import com.chicm.cmraft.core.RaftNode;
import com.chicm.cmraft.core.State;
import com.chicm.cmraft.protobuf.generated.RaftProtos.KeyValuePair;
import com.chicm.cmraft.protobuf.generated.RaftProtos.RaftLogEntry;
import com.chicm.cmraft.rpc.RpcTimeoutException;
import com.chicm.cmraft.util.BlockingHashMap;
import com.google.common.base.Preconditions;
import com.google.protobuf.ByteString;
public class DefaultRaftLog implements RaftLog {
/** Logger shared by this class and its nested classes. */
static final Log LOG = LogFactory.getLog(DefaultRaftLog.class);
/** Configuration key of the root directory under which the data file is stored. */
private static final String RAFT_ROOT_DIR_KEY = "raft.root.dir";
/** Timeout handed to rpcResults.take() while set()/delete() wait for a commit result
 * (presumably milliseconds -- TODO confirm against BlockingHashMap). */
private static final int DEFAULT_COMMIT_TIMEOUT = 5000;
private Configuration conf;
/** Log entries keyed by their log index. */
private SortedMap<Long, RaftLogEntry> entries = new TreeMap<>();
/** Key-value state produced by applying SET/DELETE log entries. */
private ConcurrentHashMap<ByteString, ByteString> keyValues = new ConcurrentHashMap<>();
/** Term reported by getLastLogTerm() while the log is empty. */
private final static long INITIAL_TERM = 0;
private RaftNode node;
/** Index of the highest log entry marked as committed. */
private final AtomicLong commitIndex = new AtomicLong(0);
/** Index of the highest log entry written to the data file by the flush worker. */
private final AtomicLong flushedIndex = new AtomicLong(0);
/** Index of the most recently appended entry. It starts at 0 and is incremented
 * before being assigned to a new entry, so the first log index is 1. */
private final AtomicLong lastApplied = new AtomicLong(0);
/** Per-follower replication state:
 * nextIndex: index of the next log entry to send to that server
 * (initialized to leader last log index + 1);
 * matchIndex: index of the highest log entry known to be replicated on
 * that server (initialized to 0, increases monotonically).
 */
private Map<ServerInfo, FollowerIndexes> followerIndexes = new ConcurrentHashMap<>();
/** AppendEntries RPC response counter per log index, used to decide that an entry
 * is committed once more than half of the servers have responded successfully. */
private ResponseBag<Long> responseBag = new ResponseBag<Long>();
/** Commit results keyed by log index; set()/delete() block on take() until
 * onAppendEntriesResponse() puts a result for the entry's index. */
private BlockingHashMap<Long, Boolean> rpcResults = new BlockingHashMap<>();
/** Number of servers counted for the majority check (local + remote); set in leaderInit(). */
private int nTotalServers = 0;
/** Identity of the local server, taken from the owning node in the constructor. */
private ServerInfo thisServer;
/**
 * Creates the log for the given node, loads any entries found in the data
 * file and starts the background thread that flushes committed entries.
 *
 * @param node the Raft node this log belongs to
 * @param conf configuration used to resolve the storage location
 */
public DefaultRaftLog(RaftNode node, Configuration conf) {
  this.conf = conf;
  this.node = node;
  // The storage path is derived from the server info, so it must be set before loading.
  this.thisServer = node.getServerInfo();
  loadPersistentData();
  startFlushWorker();
}
/**
 * @return the string form of the local server's info, used as a log prefix
 */
public String getServerName() {
  final ServerInfo self = this.thisServer;
  return self.toString();
}
/**
 * @return the info object describing the local server
 */
public ServerInfo getServerInfo() {
  return this.thisServer;
}
/**
 * Prepares leader-side bookkeeping: every remote server gets a fresh
 * FollowerIndexes (nextIndex = lastApplied + 1, matchIndex = 0) and the
 * server count used for the majority check is recomputed.
 */
private void leaderInit() {
  LOG.info(getServerName() + ": LEADER INIT");
  for (ServerInfo peer : node.getRemoteServers()) {
    long nextIndex = getLastApplied() + 1;
    followerIndexes.put(peer, new FollowerIndexes(nextIndex, 0));
  }
  // one for the local server, plus all remote servers
  nTotalServers = 1 + node.getRemoteServers().size();
}
/** Hook called from stateChange() when this node stops being leader; currently a no-op. */
private void cleanupLeaderWorker() {
}
/**
 * Reacts to a node state transition: runs leader cleanup when leadership is
 * lost and leader initialization when leadership is gained. Transitions that
 * do not involve the LEADER state only produce the log line.
 *
 * @param oldState state before the transition
 * @param newState state after the transition
 */
@Override
public void stateChange(State oldState, State newState) {
  LOG.info(getServerName() + ": STATE CHANGE");
  final boolean wasLeader = oldState == State.LEADER;
  final boolean isLeader = newState == State.LEADER;
  if (wasLeader && !isLeader) {
    cleanupLeaderWorker();
  } else if (isLeader && !wasLeader) {
    leaderInit();
  }
}
/**
 * @param follower the follower to look up
 * @return the matchIndex recorded for that follower
 * @throws NullPointerException if no indexes are registered for the follower
 */
@Override
public long getFollowerMatchIndex(ServerInfo follower) {
  FollowerIndexes indexes = followerIndexes.get(follower);
  return indexes.getMatchIndex();
}
// TODO: implement nextIndex logic on appendEntries failure
/**
 * @param follower the follower to look up
 * @return the nextIndex recorded for that follower
 * @throws NullPointerException if no indexes are registered for the follower
 */
public long getFollowerNextIndex(ServerInfo follower) {
  FollowerIndexes indexes = followerIndexes.get(follower);
  return indexes.getNextIndex();
}
/**
 * @param index log index to look up
 * @return the term of the entry stored at {@code index}, or 0 when there is no such entry
 */
@Override
public long getLogTerm(long index) {
  RaftLogEntry entry = entries.get(index);
  return entry == null ? 0 : entry.getTerm();
}
/**
 * Collects the entries stored at the indexes {@code startIndex..endIndex} (both inclusive).
 *
 * @param startIndex first index to return; must be at least 1
 * @param endIndex last index to return; must not exceed lastApplied
 * @return the entries in index order, or an empty list when the range is
 *         rejected or {@code startIndex > endIndex}
 */
@Override
public List<RaftLogEntry> getLogEntries(long startIndex, long endIndex) {
  List<RaftLogEntry> slice = new ArrayList<RaftLogEntry>();
  boolean rangeAccepted = startIndex >= 1 && endIndex <= getLastApplied();
  if (rangeAccepted) {
    long index = startIndex;
    while (index <= endIndex) {
      slice.add(entries.get(index));
      index++;
    }
  }
  return slice;
}
/**
 * @return the current value of the commit index
 */
@Override
public long getCommitIndex() {
  return this.commitIndex.get();
}
/**
 * Overwrites the commit index.
 *
 * @param commitIndex the new commit index
 */
public void setCommitIndex(long commitIndex) {
  final long newValue = commitIndex;
  this.commitIndex.set(newValue);
}
/**
 * @return the current value of lastApplied
 */
@Override
public long getLastApplied() {
  return this.lastApplied.get();
}
/**
 * Returns the term of the entry stored at the lastApplied index.
 *
 * @return that entry's term, or INITIAL_TERM when the log is empty or no
 *         entry is stored at the lastApplied index
 */
@Override
public long getLastLogTerm() {
  if (entries == null || entries.isEmpty()) {
    return INITIAL_TERM;
  }
  // set()/delete() bump lastApplied before the entry is inserted, so the
  // lookup can miss; report the initial term (as getLogTerm does for a
  // missing index) instead of failing with a NullPointerException.
  RaftLogEntry last = entries.get(getLastApplied());
  if (last == null) {
    return INITIAL_TERM;
  }
  return last.getTerm();
}
/**
 * Overwrites lastApplied.
 *
 * @param lastApplied the new lastApplied index
 */
public void setLastApplied(long lastApplied) {
  final long newValue = lastApplied;
  this.lastApplied.set(newValue);
}
/**
 * @return the index of the last entry recorded as flushed
 */
@Override
public long getFlushedIndex() {
  final long flushed = flushedIndex.get();
  return flushed;
}
/**
 * Overwrites the flushed index.
 *
 * @param index index of the last entry recorded as flushed
 */
public void setFlushedIndex(long index) {
  this.flushedIndex.set(index);
}
// for followers
/**
 * Handles an AppendEntries request on a follower.
 *
 * The request is rejected when its term is older than the node's current
 * term, or when {@code prevLogIndex > 0} and the local log has no entry at
 * that index with term {@code prevLogTerm}. Otherwise every received entry is
 * stored, applied to the key-value map and lastApplied is advanced; if the
 * leader's commit index is ahead, the local commit index moves to
 * {@code min(lastApplied, leaderCommit)} and a flush is requested.
 *
 * @param term the leader's term
 * @param leaderId the leader sending the request (not used here)
 * @param leaderCommit the leader's commit index
 * @param prevLogIndex index of the entry preceding the new ones; 0 skips the consistency check
 * @param prevLogTerm expected term of the entry at {@code prevLogIndex}
 * @param leaderEntries entries to append; empty for a heart beat; must not be null
 * @return true if the request was accepted, false if it was rejected
 */
@Override
public boolean appendEntries(long term, ServerInfo leaderId, long leaderCommit,
    long prevLogIndex, long prevLogTerm, List<RaftLogEntry> leaderEntries) {
  LOG.debug(getServerName() + "follower appending entry...");
  Preconditions.checkNotNull(leaderEntries);

  // stale leader
  if (node.getCurrentTerm() > term) {
    return false;
  }

  // consistency check against the entry preceding the new ones
  if (prevLogIndex > 0) {
    RaftLogEntry previous = this.entries.get(prevLogIndex);
    if (previous == null || previous.getTerm() != prevLogTerm) {
      return false;
    }
  }

  // The entries are expected to arrive already sorted by index.
  for (RaftLogEntry incoming : leaderEntries) {
    long incomingIndex = incoming.getIndex();
    entries.put(incomingIndex, incoming);
    applyKeyValueLog(incomingIndex);
    if (incomingIndex > getLastApplied()) {
      setLastApplied(incomingIndex);
    }
  }

  if (leaderCommit > getCommitIndex()) {
    setCommitIndex(Math.min(getLastApplied(), leaderCommit));
    // to-do: need to be done asynchronously
    flushCommitted();
  }

  if (leaderEntries.size() == 0) {
    // heart beat
    LOG.debug("heart beat");
  }
  LOG.debug(getServerName() + "follower appending entry... done");
  return true;
}
// for leaders
/**
 * Handles a follower's AppendEntries response on the leader.
 *
 * The follower's matchIndex is updated first. A successful response is then
 * counted for {@code followerLastApplied}; if that index is above the commit
 * index and the counted responses plus the local server exceed half of all
 * servers, a {@code true} result is published in rpcResults for that index.
 *
 * @param follower the responding follower
 * @param followerTerm the follower's term (not used here)
 * @param success whether the follower accepted the request
 * @param followerLastApplied the follower's lastApplied index
 */
@Override
public void onAppendEntriesResponse(ServerInfo follower, long followerTerm, boolean success,
    long followerLastApplied) {
  LOG.debug(getServerName() + ": onAppendEntriesResponse");
  updateFollowerMatchIndexes(follower, followerLastApplied);
  if (success) {
    responseBag.add(followerLastApplied, 1);
    // the "+ 1" counts the local server, which already holds the entry
    if (followerLastApplied > getCommitIndex()
        && responseBag.get(followerLastApplied) + 1 > nTotalServers / 2) {
      LOG.info(getServerName() + ": committed, index:" + followerLastApplied);
      rpcResults.put(followerLastApplied, true);
    }
  }
}
/**
 * Commits the log entry at {@code index}: moves the commit index to it,
 * applies the entry to the key-value map and requests a flush.
 *
 * @param index index of the entry to commit
 */
private void commitLog(long index) {
  this.commitIndex.set(index);
  replayKeyValueLogEntries(index, index);
  flushCommitted();
  // to-do: need to notify followers after commit
}
/**
 * Raises the matchIndex recorded for {@code follower} to {@code lastApplied}
 * if that value is higher than the current one.
 *
 * If no indexes are registered for the follower, an error is logged and
 * nothing is updated.
 *
 * @param follower the follower whose matchIndex is updated
 * @param lastApplied the lastApplied index reported by the follower
 */
private void updateFollowerMatchIndexes(ServerInfo follower, long lastApplied) {
  FollowerIndexes indexes = followerIndexes.get(follower);
  if (indexes == null) {
    LOG.error("FOLLOWER INDEXES MAP SHOULD BE INITIALIZED BEFORE HERE.");
    // Nothing to update; continuing would dereference null.
    return;
  }
  if (indexes.getMatchIndex() < lastApplied) {
    indexes.setMatchIndex(lastApplied);
  }
}
/**
 * Appends a SET entry for the given pair and tries to commit it.
 *
 * With no remote servers the entry is committed immediately. Otherwise the
 * entry is sent to the followers and this call waits on rpcResults for the
 * entry's index, for at most DEFAULT_COMMIT_TIMEOUT.
 *
 * @param kv key-value pair to store; key and value must both be non-empty
 * @return true if the entry was committed; false if the wait timed out or
 *         the result taken from rpcResults was false
 */
@Override
public boolean set(KeyValuePair kv) {
  LOG.debug(getServerName() + ": set request received");
  Preconditions.checkArgument(!kv.getKey().isEmpty());
  Preconditions.checkArgument(!kv.getValue().isEmpty());

  // lastApplied starts at 0 and is incremented before use, so the first index is 1.
  RaftLogEntry entry = RaftLogEntry.newBuilder()
      .setKv(kv)
      .setIndex(lastApplied.incrementAndGet())
      .setTerm(node.getCurrentTerm())
      .setMode(RaftLogEntry.MutationMode.SET)
      .build();
  long entryIndex = entry.getIndex();
  entries.put(entryIndex, entry);

  boolean committed;
  if (node.getNodeConnectionManager().getRemoteServers().isEmpty()) {
    committed = true;
  } else {
    // replicate to the followers, then wait for the commit result
    node.getNodeConnectionManager().appendEntries(this, entryIndex);
    try {
      committed = rpcResults.take(entryIndex, DEFAULT_COMMIT_TIMEOUT);
    } catch (RpcTimeoutException e) {
      LOG.error(e.getMessage());
      return false;
    }
  }

  if (committed) {
    commitLog(entryIndex);
  }
  LOG.debug(getServerName() + ": set committed, sending response");
  return committed;
}
/**
 * Looks up a key in the key-value map.
 *
 * @param key the key bytes; must be non-null and non-empty
 * @return a copy of the stored value, or null if the key is not present
 */
@Override
public byte[] get(byte[] key) {
  Preconditions.checkNotNull(key);
  Preconditions.checkArgument(key.length > 0);
  ByteString stored = keyValues.get(ByteString.copyFrom(key));
  return stored == null ? null : stored.toByteArray();
}
/**
 * Appends a DELETE entry for the given key and tries to commit it.
 *
 * With no remote servers the entry is committed immediately. Otherwise the
 * entry is sent to the followers and this call waits on rpcResults for the
 * entry's index, for at most DEFAULT_COMMIT_TIMEOUT.
 *
 * @param key the key bytes; must be non-null and non-empty
 * @return true if the entry was committed; false if the key is not present,
 *         the wait timed out, or the result taken from rpcResults was false
 */
@Override
public boolean delete(byte[] key) {
  Preconditions.checkNotNull(key);
  Preconditions.checkArgument(key.length > 0);
  ByteString bsKey = ByteString.copyFrom(key);
  if (!keyValues.containsKey(bsKey)) {
    return false;
  }

  // ByteString is immutable, so the copy made above is reused for the entry.
  KeyValuePair.Builder kvBuilder = KeyValuePair.newBuilder();
  kvBuilder.setKey(bsKey);

  // lastApplied starts at 0 and is incremented before use, so the first index is 1.
  RaftLogEntry.Builder builder = RaftLogEntry.newBuilder();
  builder.setKv(kvBuilder.build());
  builder.setIndex(lastApplied.incrementAndGet());
  builder.setTerm(node.getCurrentTerm());
  builder.setMode(RaftLogEntry.MutationMode.DELETE);
  RaftLogEntry entry = builder.build();
  entries.put(entry.getIndex(), entry);

  boolean committed = true;
  // making rpc call to followers
  if (!node.getNodeConnectionManager().getRemoteServers().isEmpty()) {
    committed = false;
    node.getNodeConnectionManager().appendEntries(this, entry.getIndex());
    // waiting for results
    try {
      committed = rpcResults.take(entry.getIndex(), DEFAULT_COMMIT_TIMEOUT);
    } catch (RpcTimeoutException e) {
      LOG.error(e.getMessage());
      return false;
    }
  }
  if (committed) {
    commitLog(entry.getIndex());
  }
  // Report the actual operation and outcome (this used to say "set committed").
  LOG.debug(getServerName() + ": delete "
      + (committed ? "committed" : "not committed") + ", sending response");
  return committed;
}
/**
 * Returns the pairs currently held in the key-value map.
 *
 * @param pattern currently ignored; every pair is returned
 * @return a new collection with one KeyValuePair per map entry
 */
@Override
public Collection<KeyValuePair> list(byte[] pattern) {
  List<KeyValuePair> result = new ArrayList<>();
  // Iterate entries rather than keys: a key removed concurrently between
  // keySet() and get() would hand a null value to the builder.
  for (Map.Entry<ByteString, ByteString> pair : keyValues.entrySet()) {
    KeyValuePair.Builder builder = KeyValuePair.newBuilder();
    builder.setKey(pair.getKey());
    builder.setValue(pair.getValue());
    result.add(builder.build());
  }
  return result;
}
/**
 * Applies the log entries at {@code startIndex..endIndex} (both inclusive)
 * to the key-value map, in ascending index order.
 *
 * @param startIndex first index to apply; must be within 0..lastApplied
 * @param endIndex last index to apply; must be within 0..lastApplied
 * @throws IllegalArgumentException if either bound is outside 0..lastApplied
 */
private void replayKeyValueLogEntries(long startIndex, long endIndex) {
  Preconditions.checkArgument(startIndex >= 0 && startIndex <= getLastApplied());
  Preconditions.checkArgument(endIndex >= 0 && endIndex <= getLastApplied());
  long current = startIndex;
  while (current <= endIndex) {
    applyKeyValueLog(current);
    current++;
  }
}
/**
 * Applies the log entry stored at {@code index} to the key-value map:
 * a SET entry stores its pair, a DELETE entry removes its key. An entry
 * without a mode, or with any other mode, leaves the map untouched.
 *
 * @param index index of the entry to apply; an entry must exist at that index
 */
private void applyKeyValueLog(long index) {
  RaftLogEntry entry = entries.get(index);
  if (!entry.hasMode()) {
    return;
  }
  switch (entry.getMode()) {
    case SET:
      keyValues.put(entry.getKv().getKey(), entry.getKv().getValue());
      break;
    case DELETE:
      keyValues.remove(entry.getKv().getKey());
      break;
    default:
      break;
  }
}
/** Cached location of the data file; resolved on first use. */
private Path dataFile;
/**
 * Resolves the data file as {@code <root dir>/<host>-<port>/data}, where the
 * root dir is read from the configuration and host/port come from the local
 * server info. The result is cached after the first call.
 *
 * @return the path of the data file
 */
private Path getStorageFilePath() {
  if (dataFile == null) {
    String rootDir = conf.getString(RAFT_ROOT_DIR_KEY).trim();
    String serverDirName = getServerInfo().getHost() + "-" + getServerInfo().getPort();
    Path serverDir = Paths.get(rootDir).resolve(serverDirName);
    dataFile = serverDir.resolve("data");
  }
  return dataFile;
}
/** Set once loadPersistentData() has run, whether or not loading succeeded. */
private volatile boolean persistentDataLoaded = false;
/**
 * Reads the length-delimited log entries from the data file, if it exists.
 *
 * After each entry is stored, the commit, lastApplied and flushed indexes
 * are set to the highest index read so far and the entry at that highest
 * index is applied to the key-value map. A missing file is logged as a
 * warning; any other I/O error is logged and loading is not retried.
 */
private void loadPersistentData() {
  if (!persistentDataLoaded) {
    try (FileInputStream in = new FileInputStream(getStorageFilePath().toFile())) {
      long highestIndex = 0;
      for (;;) {
        try {
          RaftLogEntry.Builder builder = RaftLogEntry.newBuilder();
          boolean entryRead = builder.mergeDelimitedFrom(in);
          if (!entryRead) {
            // end of file
            break;
          }
          RaftLogEntry entry = builder.build();
          long entryIndex = entry.getIndex();
          entries.put(entryIndex, entry);
          if (entryIndex > highestIndex) {
            highestIndex = entryIndex;
          }
          setCommitIndex(highestIndex);
          setLastApplied(highestIndex);
          setFlushedIndex(highestIndex);
          replayKeyValueLogEntries(highestIndex, highestIndex);
        } catch (EOFException e) {
          break;
        }
      }
    } catch (FileNotFoundException e) {
      LOG.warn("no persistant data to load");
    } catch (IOException e) {
      // Loading is not retried after a failure, so the loaded mark is still set below.
      LOG.error("ERROR loadPersistentData" + e.getMessage(), e);
    }
    persistentDataLoaded = true;
  }
}
/** Lock guarding the flush wait condition. */
private final ReentrantLock flushLock = new ReentrantLock();
/** Condition the flush worker waits on until there are committed entries to flush. */
private final Condition needFlush = flushLock.newCondition();
/**
 * Wakes up the flush worker so that it re-checks whether committed entries
 * are waiting to be written to the data file.
 */
private void flushCommitted() {
  // Acquire the lock before entering try, so that finally only ever
  // unlocks a lock this thread actually holds.
  flushLock.lock();
  try {
    needFlush.signal();
  } finally {
    flushLock.unlock();
  }
}
/** Starts the thread named "LogFlushWorker" that runs a LogFlushWorker. */
private void startFlushWorker() {
  Thread flusher = new Thread(new LogFlushWorker(), "LogFlushWorker");
  flusher.start();
}
/**
 * Background task that writes committed log entries to the data file.
 * It waits on needFlush until the commit index is ahead of the flushed
 * index, then appends the missing entries. It exits when interrupted.
 */
class LogFlushWorker implements Runnable {
  @Override
  public void run() {
    while (true) {
      // Acquire the lock before entering try, so that finally only ever
      // unlocks a lock this thread actually holds.
      flushLock.lock();
      try {
        while (getFlushedIndex() >= getCommitIndex()) {
          needFlush.await();
        }
      } catch (InterruptedException e) {
        // Keep the interrupt status visible to whoever owns this thread.
        Thread.currentThread().interrupt();
        LOG.info("Interrupted, LogFlushWorker exiting");
        return;
      } finally {
        flushLock.unlock();
      }
      // The file write happens outside the lock.
      // NOTE(review): a failed flush leaves flushedIndex behind commitIndex,
      // so the loop retries immediately without waiting -- confirm this is intended.
      doFlush();
    }
  }

  /**
   * Appends the entries at flushedIndex + 1 .. commitIndex to the data file
   * in length-delimited form, advancing flushedIndex after each entry.
   * The parent directory is created if it does not exist. I/O errors are
   * logged and end the current flush attempt.
   */
  private void doFlush() {
    Path path = getStorageFilePath();
    boolean append = path.toFile().exists();
    if (!path.getParent().toFile().exists()) {
      try {
        Files.createDirectories(path.getParent());
      } catch (IOException e) {
        LOG.error("ERROR flushCommitted", e);
        return;
      }
    }
    try (FileOutputStream fos = new FileOutputStream(path.toFile(), append)) {
      for (long index = flushedIndex.get() + 1; index <= commitIndex.get(); index++) {
        RaftLogEntry entry = entries.get(index);
        entry.writeDelimitedTo(fos);
        setFlushedIndex(index);
      }
    } catch (IOException e) {
      LOG.error("ERROR flushCommitted", e);
      return;
    }
  }
}
/**
 * Replication indexes tracked for one follower: the next log index to send
 * to it and the highest log index known to be replicated on it.
 */
class FollowerIndexes {
  /** Index of the next log entry to send to the server
      (initialized to leader last log index + 1). */
  private long nextIndex;
  /** Index of the highest log entry known to be replicated on the server
      (initialized to 0, increases monotonically). */
  private long matchIndex;

  /**
   * @param next initial nextIndex
   * @param match initial matchIndex
   */
  public FollowerIndexes(long next, long match) {
    nextIndex = next;
    matchIndex = match;
  }

  /**
   * @return the index of the next log entry to send
   */
  public long getNextIndex() {
    return this.nextIndex;
  }

  /**
   * @return the highest log index known to be replicated
   */
  public long getMatchIndex() {
    return this.matchIndex;
  }

  /**
   * @param nextIndex the new index of the next log entry to send
   */
  public void setNextIndex(long nextIndex) {
    this.nextIndex = nextIndex;
  }

  /**
   * @param matchIndex the new highest replicated log index
   */
  public void setMatchIndex(long matchIndex) {
    this.matchIndex = matchIndex;
  }
}
}