package hdgl.db.server.bsp; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.SortedMap; import java.util.TreeMap; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.WatchedEvent; import org.apache.zookeeper.Watcher; import org.apache.zookeeper.ZooDefs.Ids; import org.apache.zookeeper.ZooKeeper; import hdgl.db.exception.HdglException; import hdgl.db.graph.Edge; import hdgl.db.graph.Entity; import hdgl.db.graph.Vertex; import hdgl.db.protocol.MessagePackWritable; import hdgl.db.protocol.MessageWritable; import hdgl.db.query.QueryContext; import hdgl.db.query.condition.AbstractCondition; import hdgl.db.query.condition.BinaryCondition; import hdgl.db.query.condition.Conjunction; import hdgl.db.query.condition.EqualTo; import hdgl.db.query.condition.LargerThan; import hdgl.db.query.condition.LargerThanOrEqualTo; import hdgl.db.query.condition.LessThan; import hdgl.db.query.condition.LessThanOrEqualTo; import hdgl.db.query.condition.NoRestriction; import hdgl.db.query.condition.NotEqualTo; import hdgl.db.query.stm.StateMachine; import hdgl.db.server.HConf; import hdgl.db.store.GraphStore; import hdgl.util.StringHelper; public class BSPRunner extends Thread implements Watcher { static final String readyFile = "ready0"; static final String dieFile = "die0"; GraphStore graphStore; ZooKeeper zk; String barrierZkRoot; String dieZkRoot; String alivePath; int runnerCount; int superStep = 0; String myname; Object mutex = new Object(); Configuration conf; String lockPath; String diePath; int nodeId; QueryContext ctx; boolean IamPivot = false; boolean IamDiePivot = false; BSPContainer container; int sessionId; private static final org.apache.commons.logging.Log Log = LogFactory .getLog(BSPRunner.class); SortedMap<Long, MessageWritable> received = new TreeMap<Long, MessageWritable>(); SortedMap<Long, MessageWritable> sent = new TreeMap<Long, MessageWritable>(); public BSPRunner(GraphStore graphStore, QueryContext ctx, String zkRoot, int runnerCount, int clientId, int sessionId, BSPContainer container, Configuration conf) throws IOException { super(); this.graphStore = graphStore; this.zk = HConf.getZooKeeper(conf, this); this.barrierZkRoot = StringHelper.makePath(zkRoot, "b"); this.dieZkRoot = StringHelper.makePath(zkRoot, "d"); this.alivePath = StringHelper.makePath(zkRoot, "alive"); this.runnerCount = runnerCount; this.conf = conf; this.myname = "bsp"; this.nodeId = clientId; this.ctx = ctx; this.sessionId = sessionId; this.container = container; this.setDaemon(false); // Log.info("init bsp node " + myname); } public int getSuperStep() { return superStep; } boolean dieNoWait() throws KeeperException, InterruptedException { if(diePath == null){ diePath = zk.create(StringHelper.makePath(dieZkRoot, myname), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL); } // Log.info("bsp node " + nodeId +" is dying"); return zk.getChildren(dieZkRoot, false).size() >= runnerCount; } void dieWait() throws KeeperException, InterruptedException { int lockNumber = StringHelper.getLastInt(diePath); List<String> list = zk.getChildren(dieZkRoot, false); int maxId = -1; for (String cn : list) { int theirNumber = StringHelper.getLastInt(cn); if (theirNumber > maxId) maxId = theirNumber; } if (list.size() < runnerCount || maxId != lockNumber) { IamDiePivot = false; while (true) { synchronized (mutex) { if (zk.exists( StringHelper.makePath(dieZkRoot, dieFile), true) == null) { mutex.wait(); } else { return; } } } } else { IamPivot = true; zk.create(StringHelper.makePath(dieZkRoot, dieFile), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL); return; } } void died() throws KeeperException, InterruptedException{ zk.delete(diePath, -1); if(IamDiePivot){ while (true) { synchronized (mutex) { if (zk.getChildren(dieZkRoot, true).size() > 1) { mutex.wait(); } else { zk.delete( StringHelper.makePath(dieZkRoot, dieFile), -1); return; } } } } } void alive() throws KeeperException, InterruptedException { if (diePath != null) { zk.delete(diePath, -1); diePath = null; } // Log.info("bsp node " + nodeId +" is alive"); } void enterNoWait() throws KeeperException, InterruptedException { lockPath = zk.create(StringHelper.makePath(barrierZkRoot, myname), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL); // Log.info("bsp node " + nodeId +" entering barrier " + superStep); } void enterWait() throws KeeperException, InterruptedException { int lockNumber = StringHelper.getLastInt(lockPath); List<String> list = zk.getChildren(barrierZkRoot, false); int maxId = -1; for (String cn : list) { int theirNumber = StringHelper.getLastInt(cn); if (theirNumber > maxId) maxId = theirNumber; } if (list.size() < runnerCount || maxId != lockNumber) { IamPivot = false; while (true) { synchronized (mutex) { if (zk.exists( StringHelper.makePath(barrierZkRoot, readyFile), this) == null) { mutex.wait(); } else { // Log.info("bsp node " + nodeId // +" has entered barrier " + superStep); return; } } } } else { IamPivot = true; zk.create(StringHelper.makePath(barrierZkRoot, readyFile), new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL); // Log.info("bsp node " + nodeId +" has entered barrier " + // superStep); // Log.info("== all bsp nodes has entered barrier " + // superStep+", pivot: "+nodeId+" =="); return; } } void leaveNoWait() throws InterruptedException, KeeperException { zk.delete(lockPath, -1); // Log.info("bsp node " + nodeId +" leaving barrier " + superStep); } boolean leaveWait() throws KeeperException, InterruptedException { if (IamPivot) { while (true) { synchronized (mutex) { if (zk.getChildren(barrierZkRoot, true).size() > 1) { mutex.wait(); } else { zk.delete( StringHelper.makePath(barrierZkRoot, readyFile), -1); // Log.info("bsp node " + nodeId +" has left barrier " + // superStep); // Log.info("== all bsp nodes has left barrier " + // superStep+ ", pivot: "+nodeId+" =="); return true; } } } } else { while (true) { synchronized (mutex) { if (zk.exists( StringHelper.makePath(barrierZkRoot, readyFile), this) != null) { mutex.wait(); } else { // Log.info("bsp node " + nodeId +" has left barrier " + // superStep); return true; } } } } } void doQuery() throws IOException { if (superStep == 0) { long ovid, vid = 0; int orid, rid = 0; for (Map.Entry<Long, Integer> r : ctx.getIdMap().entrySet()) { ovid = vid; orid = rid; vid = r.getKey(); rid = r.getValue(); if (orid == nodeId) { int state = ctx.getStateMachine().getStartState(); long[] nullpath = new long[0]; for (long id = ovid; id < vid; id++) { sendMessageToVertex(id, state, nullpath); } } } if (rid == nodeId) { int state = ctx.getStateMachine().getStartState(); long[] nullpath = new long[0]; long maxid = graphStore.getVertexCount() + 1; for (long id = vid; id < maxid; id++) { sendMessageToVertex(id, state, nullpath); } } } else { for (Map.Entry<Long, MessageWritable> msg : received.entrySet()) { // Log.info(msg.getKey()+" received message"); doQueryForVertex(msg.getKey(), msg.getValue()); } } } void doQueryForVertex(long vid, MessageWritable msg) throws IOException { Vertex v = graphStore.getVertex(vid); for (int i = 0; i < msg.size(); i++) { int stateId = msg.getState(i); long[] path = msg.getPath(i); StateMachine.State state = ctx.getStateMachine().getState(stateId); for (StateMachine.Condition cond : state.getConditions()) { if (cond.getTest().test(v)) { for (StateMachine.Transition t : cond.getTransitions()) { switch (t.getType()) { case In: e: for (Edge e : v.getInEdges()) { if (t.getTest().test(e)) { long ovid = e.getInVertex().getId(); for (long p : path) { if (ovid == p) { continue e; } } long[] newpath = new long[path.length + 2]; System.arraycopy(newpath, 0, path, 0, path.length); newpath[path.length] = vid; newpath[path.length + 1] = e.getId(); int newState = t.getToState(); sendMessageToVertex(ovid, newState, newpath); } } break; case Out: e: for (Edge e : v.getOutEdges()) { if (t.getTest().test(e)) { long ovid = e.getOutVertex().getId(); for (long p : path) { if (ovid == p) { continue e; } } long[] newpath = new long[path.length + 2]; System.arraycopy(path, 0, newpath, 0, path.length); newpath[path.length] = vid; newpath[path.length + 1] = e.getId(); int newState = t.getToState(); sendMessageToVertex(ovid, newState, newpath); } } break; case Backtrack: throw new HdglException("not implemented"); case Success: long[] result = new long[path.length + 1]; System.arraycopy(path, 0, result, 0, path.length); result[path.length] = vid; container.sendResult(sessionId, result); break; default: } } break; } } } } synchronized public void receiveMessages(MessagePackWritable msgs) { for (int i = 0; i < msgs.size(); i++) { long vid = msgs.getReceiver(i); MessageWritable msg = msgs.getMessage(i); if (received.containsKey(vid)) { received.get(vid).addAll(msg); } else { received.put(vid, msg); } } } void sendMessageToVertex(long vertexId, int newstate, long[] path) { MessageWritable msg; if (!sent.containsKey(vertexId)) { msg = new MessageWritable(); sent.put(vertexId, msg); } else { msg = sent.get(vertexId); } msg.add(newstate, path); } void packAndSendMessage() throws IOException { Map<Integer, MessagePackWritable> packs = new HashMap<Integer, MessagePackWritable>(); Iterator<Map.Entry<Long, Integer>> nodePos = ctx.getIdMap().entrySet() .iterator(); Map.Entry<Long, Integer> range = nodePos.next(); long minId = range.getKey(); long maxId; int currentRegion = range.getValue(); int nextRegion; if (nodePos.hasNext()) { range = nodePos.next(); maxId = range.getKey(); nextRegion = range.getValue(); } else { maxId = graphStore.getVertexCount() + 1; nextRegion = nodeId; } for (Map.Entry<Long, MessageWritable> m : sent.entrySet()) { long id = m.getKey(); int destRegionId; if (id < minId) { destRegionId = nodeId; } else if (id >= minId && id < maxId) { destRegionId = currentRegion; } else { while (id > maxId) { minId = maxId; currentRegion = nextRegion; if (nodePos.hasNext()) { range = nodePos.next(); maxId = range.getKey(); nextRegion = range.getValue(); } else { maxId = graphStore.getVertexCount() + 1; nextRegion = nodeId; } } if (id < minId) { destRegionId = nodeId; } else { destRegionId = currentRegion; } } MessagePackWritable pack; if (packs.containsKey(destRegionId)) { pack = packs.get(destRegionId); } else { pack = new MessagePackWritable(); packs.put(destRegionId, pack); } pack.add(m.getKey(), m.getValue()); } sent.clear(); for (Map.Entry<Integer, MessagePackWritable> pack : packs.entrySet()) { if (pack.getKey() != nodeId) { container.sendMessagePack(sessionId, pack.getKey(), pack.getValue()); } else { receiveMessages(pack.getValue()); } } } @Override public void run() { Throwable throwable = null; try { if (zk.exists(barrierZkRoot, false) == null) { zk.create(barrierZkRoot, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } if (zk.exists(dieZkRoot, false) == null) { zk.create(dieZkRoot, new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); } while (true) { // Log.info("node " + nodeId +" working in step " + superStep); if(zk.exists(alivePath, true) == null){ break; } doQuery(); received.clear(); enterNoWait(); packAndSendMessage(); enterWait(); if (!received.isEmpty()) { alive(); } leaveNoWait(); leaveWait(); if (received.isEmpty()) { if (dieNoWait()) { // Log.info("node " + nodeId +" has died"); break; } } superStep++; if(container.superStepFinish(sessionId, superStep - 1, mutex)){ break; } } dieWait(); died(); } catch (Throwable th) { throwable = th; Log.error("error during bsp", th); } finally { if (throwable == null) { container.finish(sessionId); } else { container.error(sessionId, throwable); } try { zk.close(); } catch (InterruptedException e) { } } } @Override public void process(WatchedEvent e) { synchronized (mutex) { mutex.notify(); } } }