package ch.usi.da.dmap.server;
/*
 * Copyright (c) 2017 Università della Svizzera italiana (USI)
 *
 * This file is part of URingPaxos.
 *
 * URingPaxos is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * URingPaxos is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with URingPaxos.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.DatagramPacket;
import java.net.DatagramSocket;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.SocketException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.log4j.Logger;
import org.apache.thrift.TException;
import org.apache.thrift.server.TServer;
import org.apache.thrift.server.TThreadPoolServer;
import org.apache.thrift.transport.TServerSocket;
import org.apache.thrift.transport.TServerTransport;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.ZooKeeper;

import ch.usi.da.dmap.thrift.gen.Command;
import ch.usi.da.dmap.thrift.gen.CommandType;
import ch.usi.da.dmap.thrift.gen.Dmap;
import ch.usi.da.dmap.thrift.gen.Dmap.Iface;
import ch.usi.da.dmap.thrift.gen.MapError;
import ch.usi.da.dmap.thrift.gen.RangeCommand;
import ch.usi.da.dmap.thrift.gen.RangeResponse;
import ch.usi.da.dmap.thrift.gen.Replica;
import ch.usi.da.dmap.thrift.gen.ReplicaCommand;
import ch.usi.da.dmap.thrift.gen.Response;
import ch.usi.da.dmap.thrift.gen.WrongPartition;
import ch.usi.da.dmap.utils.Pair;
import ch.usi.da.dmap.utils.Utils;
import ch.usi.da.paxos.Util;
import ch.usi.da.paxos.lab.DummyWatcher;
import ch.usi.da.paxos.message.Control;
import ch.usi.da.paxos.message.ControlType;
import ch.usi.da.paxos.ring.ElasticLearnerRole;
import ch.usi.da.paxos.ring.Node;
import ch.usi.da.paxos.ring.RingDescription;
import ch.usi.da.paxos.storage.Decision;

/**
 * Name: DMapReplica<br>
 * Description: Replica of a distributed sorted map. Decisions delivered
 * through {@link #receive(Decision)} are deserialized and applied to a local
 * {@link SortedMap}; replica membership changes are proposed on the default
 * ring and installed into {@link #partitions}. Commands that arrive on the
 * default ring while this replica serves a different partition ring are
 * additionally exchanged with the other replicas over UDP "signals".<br>
 *
 * Creation date: Jan 28, 2017<br>
 * $Id$
 *
 * @author Samuel Benz benz@geoid.ch
 */
public class DMapReplica<K,V> implements Watcher {

    static {
        // get hostname and pid for log file name
        String host = "localhost";
        try {
            Process proc = Runtime.getRuntime().exec("hostname");
            // try-with-resources: the stream is closed even if waitFor()/read() fails
            try (BufferedInputStream in = new BufferedInputStream(proc.getInputStream())) {
                proc.waitFor();
                byte[] b = new byte[in.available()];
                int read = in.read(b);
                if (read > 0) {
                    // only use the bytes actually read; keep "localhost" on empty output
                    host = new String(b, 0, read).replace("\n", "");
                }
            }
        } catch (IOException e) {
            // best effort: fall back to the default host name
        } catch (InterruptedException e) {
            // best effort, but do not lose the interrupt request
            Thread.currentThread().interrupt();
        }
        int pid = 0;
        try {
            pid = Integer.parseInt((new File("/proc/self")).getCanonicalFile().getName());
        } catch (NumberFormatException | IOException e) {
            // best effort: /proc/self is not available on every platform, keep pid 0
        }
        System.setProperty("logfilename", "L" + host + "-" + pid + ".log");
    }

    private final static Logger logger = Logger.getLogger(DMapReplica.class);

    private final static Logger stats = Logger.getLogger("ch.usi.da.paxos.Stats");

    /** Accumulated time (ns) spent inside {@link #receive(Decision)}. */
    private final AtomicLong stat_latency = new AtomicLong();

    /** Number of decisions processed by {@link #receive(Decision)}. */
    private final AtomicLong stat_command = new AtomicLong();

    /** The live map content of this replica. */
    private volatile SortedMap<K,V> db;

    private final Node node;

    private final ZooKeeper zoo;

    /** Ring on which replica (membership) commands are proposed. */
    public int default_ring;

    /** Ring of the partition this replica registered for. */
    public int partition_ring;

    /** Token of the partition this replica registered for. */
    public int token;

    private DatagramSocket signalSender;

    private DatagramSocket signalReceiver;

    /** Pending signal collectors per command id; every access is guarded by {@code synchronized(signals)}. */
    private final Map<Long,FutureResponse> signals = new HashMap<Long,FutureResponse>();

    private final boolean linearizable = true;

    /** Instance number of the last installed partition map. */
    public long partition_version = 0;

    /** Partition map: token to the replicas registered for it. */
    public final Map<Integer, Set<Replica>> partitions = new TreeMap<Integer,Set<Replica>>();

    private final Map<Long, List<Entry<K, V>>> snapshots = new LinkedHashMap<Long,List<Entry<K,V>>>(){
        private static final long serialVersionUID = -2704400124020327063L;
        @Override
        protected boolean removeEldestEntry(Map.Entry<Long, List<Entry<K, V>>> eldest) {
            return size() > 1000; // hold only 1000 snapshots in memory!
        }
    };

    private final Map<Long,SortedMap<K,V>> snapshotsDB = new LinkedHashMap<Long,SortedMap<K,V>>(){
        private static final long serialVersionUID = -2704400124020327063L;
        @Override
        protected boolean removeEldestEntry(Map.Entry<Long,SortedMap<K,V>> eldest) {
            return size() > 1000; // hold only 1000 snapshots in memory!
        }
    };

    /** Responses awaited for locally submitted commands, keyed by command id. */
    private Map<Long,FutureResponse> responses = new ConcurrentHashMap<Long,FutureResponse>();

    /** While true, {@link #receive(Decision)} drops decisions (set during recovery). */
    private boolean ignore_cmd = false;

    /** Instance on the default ring at which {@link #ignore_cmd} is cleared again. */
    private long ignore_cmd_instance = 0;

    /**
     * Creates a replica whose map is ordered by the given comparator.
     * NOTE(review): unlike the three-argument constructor this one does not
     * start the statistics writer thread - confirm that this is intended.
     *
     * @param default_ring ring used for replica (membership) commands
     * @param node the URingPaxos node
     * @param zoo the ZooKeeper client
     * @param comparator ordering of the map keys
     */
    public DMapReplica(int default_ring,Node node,ZooKeeper zoo,Comparator<? super K> comparator) {
        this.default_ring = default_ring;
        this.node = node;
        this.zoo = zoo;
        db = new TreeMap<K,V>(comparator);
    }

    /**
     * Creates a replica with a naturally ordered map and, if the stats logger
     * has INFO enabled, starts a thread that logs throughput once per second.
     *
     * @param default_ring ring used for replica (membership) commands
     * @param node the URingPaxos node
     * @param zoo the ZooKeeper client
     */
    public DMapReplica(int default_ring,Node node,ZooKeeper zoo) {
        this.default_ring = default_ring;
        this.node = node;
        this.zoo = zoo;
        db = new TreeMap<K,V>();
        if(stats.isInfoEnabled()){
            startStatsWriter();
        }
    }

    /** Starts the thread that logs command rate and average latency every second. */
    private void startStatsWriter() {
        final Thread writer = new Thread("ABReceiverStatsWriter"){
            private long last_time = System.nanoTime();
            private long last_sent_count = 0;
            private long last_sent_time = 0;
            @Override
            public void run() {
                while(true){
                    try {
                        long time = System.nanoTime();
                        long sent_count = stat_command.get() - last_sent_count;
                        long sent_time = stat_latency.get() - last_sent_time;
                        float t = (float)(time-last_time)/(1000*1000*1000);
                        // guard both divisions: an idle interval must not log NaN/Infinity
                        float count = t > 0 ? sent_count/t : 0;
                        // average = accumulated latency / number of commands (not / rate)
                        float latency = sent_count > 0 ? (float)sent_time/sent_count : 0;
                        stats.info(String.format("DMapReplica executed %.1f command/s avg. latency %.0f ns",count,latency));
                        last_sent_count += sent_count;
                        last_sent_time += sent_time;
                        last_time = time;
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        break;
                    }
                }
            }
        };
        writer.start();
    }

    /**
     * @return the URingPaxos node of this replica
     */
    public Node getNode(){
        return node;
    }

    /**
     * @return the map of awaited responses, keyed by command id
     */
    public Map<Long,FutureResponse> getResponses() {
        return responses;
    }

    /**
     * Registers this replica for a partition: opens the UDP signal sockets,
     * subscribes the learner to the partition ring, optionally recovers the
     * partition map and data through the recovery client and finally proposes
     * a PUT replica command for itself.
     *
     * @param nodeName name under which the replica is registered
     * @param ring_id ring of the partition
     * @param addr address of this replica; its port is also used for the UDP signal receiver
     * @param token token of the partition
     * @param recovery recovery client, or null to start without recovery
     */
    public void registerPartition(String nodeName,int ring_id,InetSocketAddress addr,int token,RecoveryClient<K,V> recovery){
        this.token = token;
        partition_ring = ring_id;

        // start signal sender /receiver
        String thrift_address = addr.getHostString() + ";" + addr.getPort();
        try {
            signalSender = new DatagramSocket();
            signalReceiver = new DatagramSocket(addr.getPort());
            Thread t = new Thread(new SignalReceiver(signalReceiver));
            t.setName("SignalReceiver");
            t.start();
        } catch (SocketException e) {
            logger.error(e);
        }

        final boolean elastic = node.getLearner() instanceof ElasticLearnerRole && partition_ring != default_ring;

        // subscribing with recovery from trim point requires prepare msg!
        if(recovery != null){
            if(elastic){
                sendControl(ControlType.Prepare,ring_id);
            }
            pause(1000);
        }

        // subscribe learner to partition (ring)
        if(elastic){
            sendControl(ControlType.Subscribe,ring_id);
        }

        // recover state
        if(recovery != null){
            pause(5000);
            // recover partitions
            ignore_cmd = true;
            partition_version = recovery.getPartitionVersion();
            partitions.putAll(recovery.getPartitions());
            logger.info("Recovered partition map: " + partitions);
            // create snapshot
            long snapshotID = recovery.snapshot();
            // install data
            Iterator<Entry<K,V>> entries = recovery.iterator(token, snapshotID);
            while(entries.hasNext()){
                Entry<K,V> e = entries.next();
                db.put(e.getKey(),e.getValue());
            }
            logger.info("Recovered " + db.size() + " DB entries");
            // remove snapshot
            recovery.removeSnapshot(snapshotID);
            ignore_cmd_instance = snapshotID;
        }

        // register partition
        Replica replica = new Replica();
        replica.setName(nodeName);
        replica.setRing(ring_id);
        replica.setToken(token);
        replica.setAddress(thrift_address);
        ReplicaCommand cmd = new ReplicaCommand();
        cmd.setId(1L);
        cmd.setType(CommandType.PUT);
        cmd.setReplica(replica);
        try {
            replica(cmd);
        } catch (TException e) {
            logger.error(this + " register replica " + replica,e);
        }
    }

    /** Sends a control message of the given type on both the default ring and the given ring. */
    private void sendControl(ControlType type,int ring_id){
        Control c = new Control(1,type,node.getGroupID(),ring_id);
        node.getProposer(default_ring).control(c);
        node.getProposer(ring_id).control(c);
    }

    /** Sleeps for the given time; an interrupt ends the pause early and is kept for the caller. */
    private static void pause(long millis){
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    public void splitPartition(){
        //TODO:
        // recover old partition (token left of new one)
        // register new partition (token)
        // unsubscribe old partition ring
        // (release (delete) data in old partition) -> otherwise they are included in the iterators
    }

    public void joinPartition(){
        //TODO:
    }

    /**
     * ZooKeeper callback: re-reads the children of the event path (which also
     * re-arms the watch) and proposes a REMOVE replica command for every known
     * replica whose name is no longer among them.
     *
     * @param event the ZooKeeper event
     */
    @Override
    public void process(WatchedEvent event) {
        final String path = event.getPath();
        if(path == null){
            // session state events carry no path; getChildren(null,..) would throw
            return;
        }
        try {
            List<String> n = zoo.getChildren(path,true);
            for(Entry<Integer,Set<Replica>> e : partitions.entrySet()){
                for(Replica r : e.getValue()){
                    if(!n.contains(r.name)){
                        logger.warn("Replica " + r + " offline!");
                        ReplicaCommand cmd = new ReplicaCommand();
                        cmd.setId(2L);
                        cmd.setType(CommandType.REMOVE);
                        cmd.setReplica(r);
                        replica(cmd);
                    }
                }
            }
        } catch (KeeperException | InterruptedException | TException e) {
            logger.error(this,e);
        }
    }

    /**
     * Applies a decided value to this replica. The value is deserialized and,
     * depending on its type, executed as map command, range command or
     * replica (partition map) command. Results are handed to the matching
     * entry in {@link #getResponses()} if one exists. While recovering, all
     * decisions are dropped until the recovery instance on the default ring
     * has been seen.
     *
     * @param d the decision
     */
    public synchronized void receive(Decision d) {
        if(ignore_cmd){
            if(d.getRing() == default_ring && d.getInstance() == ignore_cmd_instance){
                ignore_cmd = false; // recovered
            }
            return;
        }
        long time = System.nanoTime();
        if(d.getValue() != null){
            long instance = d.getInstance();
            Object o = null;
            try {
                o = Utils.getObject(d.getValue().getValue());
            } catch (ClassNotFoundException | IOException e1) {
                logger.error(e1);
            }
            if(o instanceof Command){
                Command cmd = (Command)o;
                logger.debug("DMapReplica execute " + cmd);
                Object r = null;
                try {
                    r = execute(cmd);
                } catch (TException e) {
                    r = e;
                }
                // send/wait for signal
                if(d.getRing() == default_ring && default_ring != partition_ring){
                    singal(cmd.id,r);
                    if(responses.containsKey(cmd.id) || linearizable) {
                        try {
                            // look the collector up under the same lock every other access uses,
                            // but wait outside of it so the SignalReceiver can add responses
                            FutureResponse pending;
                            synchronized(signals){
                                pending = signals.get(cmd.id);
                            }
                            List<Object> rl = pending.getResponse(); // wait
                            logger.debug("... release wait lock!");
                            FutureResponse waiter = responses.get(cmd.id);
                            if(waiter != null){
                                for(Object orl : rl){
                                    waiter.addResponse(orl);
                                }
                            }
                        } catch (InterruptedException e) {
                            logger.warn("Interrupted while waiting for signals of command " + cmd.id);
                            Thread.currentThread().interrupt();
                        }
                    }
                    synchronized(signals){
                        signals.remove(cmd.id);
                    }
                }
                FutureResponse waiter = responses.get(cmd.id);
                if(waiter != null){
                    waiter.addResponse(r);
                    responses.remove(cmd.id);
                }
            } else if(o instanceof RangeCommand){
                RangeCommand cmd = (RangeCommand)o;
                logger.debug("DMapReplica execute " + cmd);
                Object r = null;
                try {
                    r = range(instance,cmd);
                } catch (TException e) {
                    r = e;
                }
                FutureResponse waiter = responses.get(cmd.id);
                if(waiter != null){
                    waiter.addResponse(r);
                    responses.remove(cmd.id);
                }
            } else if(o instanceof ReplicaCommand){
                // set partition
                ReplicaCommand cmd = (ReplicaCommand)o;
                Replica r = cmd.getReplica();
                if(cmd.getType().equals(CommandType.PUT)){
                    if(partitions.containsKey(r.token)){
                        partitions.get(r.token).add(r);
                    }else{
                        Set<Replica> s = new HashSet<Replica>();
                        s.add(r);
                        partitions.put(r.token,s);
                    }
                }else if(cmd.getType().equals(CommandType.REMOVE)){
                    if(partitions.containsKey(r.token)){
                        partitions.get(r.token).remove(r);
                    }
                }else if(cmd.getType().equals(CommandType.CLEAR)){
                    if(partitions.containsKey(r.token)){
                        partitions.remove(r.token);
                    }
                }
                // the deciding instance is the version of the new partition map
                partition_version = instance;
                logger.info("Install new partition map " + partition_version + ":" + partitions);
            }
        }
        if(stats.isInfoEnabled()){
            long lat = System.nanoTime() - time;
            stat_latency.addAndGet(lat);
            stat_command.incrementAndGet();
        }
    }

    /**
     * Makes sure a signal collector exists for the command and sends the given
     * result as UDP datagram to every replica of the partition map.
     *
     * @param id the command id
     * @param o the (serializable) result to send
     */
    private void singal(Long id, Object o) {
        synchronized (signals){
            if(!signals.containsKey(id)){
                signals.put(id,new FutureResponse(partitions.keySet()));
                logger.debug("Global command wait for partitions: " + partitions.keySet() + " ...");
            }
        }
        // the payload is the same for every replica: serialize it once
        final byte[] buffer;
        try {
            buffer = Utils.getBuffer(o).array();
        } catch (Exception e){
            logger.error(e);
            return;
        }
        for(Entry<Integer,Set<Replica>> e : partitions.entrySet()){
            for(Replica r : e.getValue()){
                try {
                    String[] addr = r.address.split(";");
                    InetAddress ip = InetAddress.getByName(addr[0]);
                    int port = Integer.parseInt(addr[1]);
                    DatagramPacket packet = new DatagramPacket(buffer,0,buffer.length,ip,port);
                    signalSender.send(packet);
                } catch (Exception e1){
                    // one unreachable replica must not prevent signaling the others
                    logger.error(e1);
                }
            }
        }
    }

    /**
     * Executes a map command against the live map or, if the command names a
     * snapshot, against that snapshot.
     *
     * @param cmd the command
     * @return the response; count is 1 if a key or value is returned (or the
     *         value is contained), the map size for SIZE, otherwise 0
     * @throws WrongPartition if the partition version of the command differs from the installed one
     * @throws MapError if the snapshot does not exist or key/value can not be (de)serialized
     * @throws TException declared for the thrift interface
     */
    @SuppressWarnings("unchecked")
    public synchronized Response execute(Command cmd) throws MapError, TException {
        Response response = new Response();
        response.setId(cmd.id);
        response.setCount(0);
        response.setPartition(token);
        if(cmd.getPartition_version() != partition_version){
            WrongPartition p = new WrongPartition();
            p.setErrorMsg(cmd.getPartition_version() + "!=" + partition_version);
            throw p;
        }
        try {
            K key = null;
            if(cmd.isSetKey()){
                key = (K) Utils.getObject(cmd.getKey());
            }
            V value = null;
            if(cmd.isSetValue()){
                value = (V) Utils.getObject(cmd.getValue());
            }
            K retK = null;
            V retV = null;
            SortedMap<K,V> snapshotDB = db;
            if(cmd.isSetSnapshot()){
                long snapshot = cmd.getSnapshot();
                if(snapshotsDB.containsKey(snapshot)){
                    snapshotDB = snapshotsDB.get(snapshot);
                }else{
                    MapError e = new MapError();
                    e.setErrorMsg("Snaphost " + cmd.getSnapshot() + " does not exist!");
                    throw e;
                }
            }
            switch(cmd.type){
            case CLEAR:
                snapshotDB.clear();
                break;
            case CONTAINSVALUE:
                if(snapshotDB.containsValue(value)){
                    response.setCount(1);
                }
                break;
            case GET:
                retV = snapshotDB.get(key);
                break;
            case PUT:
                retV = snapshotDB.put(key,value);
                break;
            case REMOVE:
                retV = snapshotDB.remove(key);
                break;
            case SIZE:
                response.setCount(snapshotDB.size());
                break;
            case FIRSTKEY:
                // firstKey() throws NoSuchElementException on an empty map;
                // answer with count 0 and no key instead
                if(!snapshotDB.isEmpty()){
                    retK = snapshotDB.firstKey();
                }
                break;
            case LASTKEY:
                // same guard as FIRSTKEY
                if(!snapshotDB.isEmpty()){
                    retK = snapshotDB.lastKey();
                }
                break;
            default:
                break;
            }
            if(retK != null){
                response.setKey(Utils.getBuffer(retK));
                response.setCount(1);
            }
            if(retV != null){
                response.setValue(Utils.getBuffer(retV));
                response.setCount(1);
            }
        } catch (ClassNotFoundException | IOException e) {
            logger.error("DMapReplica error: ",e);
            MapError error = new MapError();
            error.setErrorMsg(e.getMessage());
            throw error;
        }
        return response;
    }

    /**
     * Proposes a replica (partition map) command on the default ring.
     *
     * @param cmd the replica command
     * @throws TException if the command can not be serialized
     */
    public void replica(ReplicaCommand cmd) throws TException {
        try {
            getNode().getProposer(default_ring).propose(Utils.getBuffer(cmd).array());
        } catch (IOException e) {
            throw new TException(e);
        }
    }

    /**
     * Executes a range (snapshot) command: creates a snapshot of a key range,
     * deletes a snapshot, returns a slice of a snapshot or its size.
     * The partition version of the command is not checked here (the original
     * code notes that a mismatch is acceptable on snapshots).
     *
     * @param instance the deciding instance; used as id of a newly created snapshot
     * @param cmd the range command
     * @return the response
     * @throws MapError if the snapshot does not exist or data can not be (de)serialized
     * @throws TException declared for the thrift interface
     */
    @SuppressWarnings("unchecked")
    public RangeResponse range(long instance, RangeCommand cmd) throws MapError, TException {
        RangeResponse response = new RangeResponse();
        response.setId(cmd.getId());
        response.setPartition(token);
        List<Entry<K,V>> snapshot;
        SortedMap<K,V> snapshotDB;
        long id;
        try {
            switch(cmd.type){
            case PERSISTRANGE:
                if(cmd.isSetSnapshot() && snapshots.containsKey(cmd.getSnapshot())){
                    //TODO: persist
                }
                break;
            case CREATERANGE:
                // copy the requested part of the live map
                if(cmd.isSetFromkey() && cmd.isSetTokey()){
                    K from = (K) Utils.getObject(cmd.getFromkey());
                    K to = (K) Utils.getObject(cmd.getTokey());
                    snapshotDB = new TreeMap<K,V>(db.subMap(from,to));
                }else if(cmd.isSetFromkey() && !cmd.isSetTokey()){
                    K from = (K) Utils.getObject(cmd.getFromkey());
                    snapshotDB = new TreeMap<K,V>(db.tailMap(from));
                }else if(!cmd.isSetFromkey() && cmd.isSetTokey()){
                    K to = (K) Utils.getObject(cmd.getTokey());
                    snapshotDB = new TreeMap<K,V>(db.headMap(to));
                }else{
                    snapshotDB = new TreeMap<K,V>(db);
                }
                id = instance;
                snapshots.put(id,new ArrayList<Entry<K,V>>(snapshotDB.entrySet()));
                snapshotsDB.put(id,snapshotDB);
                response.setCount(snapshotDB.size());
                response.setSnapshot(id);
                break;
            case DELETERANGE:
                if(cmd.isSetSnapshot()){
                    if(snapshots.containsKey(cmd.getSnapshot())){
                        snapshots.remove(cmd.getSnapshot());
                        snapshotsDB.remove(cmd.getSnapshot());
                        response.setCount(1);
                    }else{
                        MapError e = new MapError();
                        e.setErrorMsg("Snaphost " + cmd.getSnapshot() + " does not exist!");
                        throw e;
                    }
                }
                break;
            case GETRANGE:
                id = cmd.getSnapshot();
                if(snapshots.containsKey(id)){
                    snapshot = snapshots.get(id);
                    int from = 0;
                    int size = snapshot.size();
                    int to = size;
                    // out-of-range indexes fall back to the full snapshot bounds
                    if(cmd.isSetFromid() && cmd.getFromid() >= 0 && cmd.getFromid() <= size){
                        from = cmd.getFromid();
                        if(cmd.isSetToid() && cmd.getToid() > cmd.getFromid() && cmd.getToid() <= size){
                            to = cmd.getToid();
                        }
                    }
                    List<Pair<K,V>> list = new ArrayList<Pair<K,V>>(); //sublist and TreeMap.Entry are not serializable!
                    for(Entry<K,V> e : snapshot.subList(from,to)){
                        list.add(new Pair<K,V>(e.getKey(),e.getValue()));
                    }
                    response.setCount(list.size());
                    response.setValues(Utils.getBuffer(list));
                }else{
                    MapError e = new MapError();
                    e.setErrorMsg("Snaphost " + cmd.getSnapshot() + " does not exist!");
                    throw e;
                }
                break;
            case PARTITIONSIZE:
                id = cmd.getSnapshot();
                if(snapshots.containsKey(id)){
                    snapshot = snapshots.get(id);
                    response.setCount(snapshot.size());
                }else{
                    MapError e = new MapError();
                    e.setErrorMsg("Snaphost " + cmd.getSnapshot() + " does not exist!");
                    throw e;
                }
                break;
            default:
                break;
            }
        } catch (ClassNotFoundException | IOException e) {
            logger.error("DMapReplica error: ",e);
            MapError error = new MapError();
            error.setErrorMsg(e.getMessage());
            throw error;
        }
        return response;
    }

    /**
     * Starts a replica: registers it at ZooKeeper, starts the URingPaxos node,
     * registers the partition, starts the thrift server and the receiver and
     * then runs until a line is read from standard input.
     *
     * @param args map ID, node ID, group ID, default ring, partition ring,
     *             roles, token, [zookeeper], [recovery]
     */
    public static void main(String[] args) {
        try {
            String mapID = "";
            int nodeID = 1;
            int groupID = 1;
            int default_ring = 1;
            int partition_ring = 2;
            String roles = "1:PAL;2:PA";
            int token = 0;
            if(args.length > 6){
                mapID = args[0];
                nodeID = Integer.parseInt(args[1]);
                groupID = Integer.parseInt(args[2]);
                default_ring = Integer.parseInt(args[3]);
                partition_ring = Integer.parseInt(args[4]);
                roles = args[5];
                token = Integer.parseInt(args[6]);
            }else{
                System.err.println("Plese use \"DMapReplica\" \"map ID\" \"node ID\" \"group ID\" \"default ring\" \"partition ring\" \"roles\" \"token\" \"[zookeeper]\" \"[recovery]\"");
                System.exit(1);
            }
            String zoo_host = "127.0.0.1:2181";
            if (args.length > 7) {
                zoo_host = args[7];
            }
            boolean recovery = false;
            if (args.length > 8) {
                if(args[8].contains("1") || args[8].contains("true")){
                    recovery = true;
                }
            }

            //register this node at zookeeper
            final Random rand = new Random();
            final int port = 5000 + rand.nextInt(1000); // assign port between 5000-6000
            final InetAddress ip = Util.getHostAddress();
            final InetSocketAddress addr = new InetSocketAddress(ip,port);
            final String addrs = addr.getHostString() + ";" + addr.getPort();
            final byte[] b = addrs.getBytes(); // store the SocketAddress
            final ZooKeeper zoo = new ZooKeeper(zoo_host,3000,new DummyWatcher());
            Util.checkThenCreateZooNode("/dmap/" + mapID,null,Ids.OPEN_ACL_UNSAFE,CreateMode.PERSISTENT,zoo);
            String nodeName = Util.checkThenCreateZooNode("/dmap/" + mapID + "/node",b,Ids.OPEN_ACL_UNSAFE,CreateMode.EPHEMERAL_SEQUENTIAL,zoo);
            nodeName = nodeName.replace("/dmap/" + mapID + "/","");

            //start URingPaxos node
            List<RingDescription> rings = Util.parseRingsArgument(roles);
            final Node node = new Node(nodeID,groupID,zoo_host,rings);
            node.start();

            //create replica
            DMapReplica<Object,Object> replica = new DMapReplica<Object,Object>(default_ring,node,zoo);
            zoo.register(replica);
            zoo.getChildren("/dmap/" + mapID, true);
            Thread.sleep(5000);
            RecoveryClient<Object,Object> rclient = null;
            if(recovery){
                rclient = new RecoveryClient<Object,Object>(mapID,zoo_host);
            }
            replica.registerPartition(nodeName,partition_ring,addr,token,rclient);

            //start thrift server (proposer)
            @SuppressWarnings({ "rawtypes", "unchecked" })
            final Dmap.Processor<Iface> processor = new Dmap.Processor<Iface>(new ABSender(replica));
            final TServerTransport serverTransport = new TServerSocket(port);
            final TThreadPoolServer.Args serverArgs = new TThreadPoolServer.Args(serverTransport).processor(processor);
            serverArgs.maxWorkerThreads(5000);
            serverArgs.minWorkerThreads(5);
            final TServer server = new TThreadPoolServer(serverArgs);
            Thread s = new Thread() {
                @Override
                public void run() {
                    server.serve();
                };
            };
            s.start();

            //start receiver (learner)
            Thread receiver = new Thread(new ABReceiver(replica));
            receiver.setName("ABReceiver");
            receiver.start();

            // run until a line is entered on stdin
            BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
            in.readLine();
            node.stop();
            zoo.close();
            server.stop();
            System.exit(0);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Receives the UDP signals sent by {@link DMapReplica#singal(Long, Object)}
     * of the replicas and adds every received {@link Response} to the signal
     * collector of its command id, creating the collector if the signal
     * arrives before the command was executed locally.
     */
    class SignalReceiver implements Runnable {

        private final DatagramSocket socket;

        public SignalReceiver(DatagramSocket socket) throws SocketException{
            this.socket = socket;
        }

        @Override
        public void run() {
            while(!socket.isClosed()){
                try {
                    byte[] buffer = new byte[65535];
                    DatagramPacket packet = new DatagramPacket(buffer,buffer.length);
                    socket.receive(packet);
                    Object o = Utils.getObject(Arrays.copyOfRange(packet.getData(),0,packet.getLength()));
                    logger.debug("Signal received " + o);
                    if(o instanceof Response){
                        Response r = (Response)o;
                        synchronized (signals) {
                            FutureResponse pending = signals.get(r.getId());
                            if(pending == null && (responses.containsKey(r.getId()) || linearizable)){
                                // signal received for non wait command
                                pending = new FutureResponse(partitions.keySet());
                                signals.put(r.getId(),pending);
                            }
                            if(pending != null){
                                pending.addResponse(o);
                            }
                        }
                    }//TODO: how to handle Exceptions from one Replica (no cmd.id)?
                } catch (ClassNotFoundException | IOException e) {
                    logger.error(e);
                }
            }
        }
    }

}