package com.zillabyte.motherbrain.flow.collectors.coordinated;

import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.log4j.Logger;
import org.javatuples.Triplet;

import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import com.google.common.collect.SetMultimap;
import com.zillabyte.motherbrain.benchmarking.Benchmark;
import com.zillabyte.motherbrain.flow.Flow;
import com.zillabyte.motherbrain.flow.MapTuple;
import com.zillabyte.motherbrain.flow.collectors.OutputCollector;
import com.zillabyte.motherbrain.flow.collectors.coordinated.support.CoordinatedOutputCollectorSupportFactory;
import com.zillabyte.motherbrain.flow.collectors.coordinated.support.FailedTupleHandler;
import com.zillabyte.motherbrain.flow.collectors.coordinated.support.TupleIdGenerator;
import com.zillabyte.motherbrain.flow.collectors.coordinated.support.TupleIdSet;
import com.zillabyte.motherbrain.flow.collectors.coordinated.support.naive.NaiveCoordinatedOutputCollectorSupportFactory;
import com.zillabyte.motherbrain.flow.operations.AggregationOperation;
import com.zillabyte.motherbrain.flow.operations.Operation;
import com.zillabyte.motherbrain.flow.operations.OperationException;
import com.zillabyte.motherbrain.flow.operations.Source;
import com.zillabyte.motherbrain.flow.rpc.RPCHelper;
import com.zillabyte.motherbrain.relational.DefaultStreamException;
import com.zillabyte.motherbrain.universe.Config;
import com.zillabyte.motherbrain.utils.Utils;

/*****
 * An OutputCollector decorator that adds per-batch coordination: it tracks un-acked
 * tuples, pools and sends acks, applies back-pressure when too many tuples are
 * outstanding, and exchanges coordination tuples (acks, pings/pongs, batch-complete
 * signals) with adjacent upstream and downstream tasks.
 */
public final class CoordinatedOutputCollector implements OutputCollector {

  private static Logger _log = Utils.getLogger(CoordinatedOutputCollector.class);

  private OutputCollector _delegate;
  private ConcurrentHashMap<Object, BatchTracker> _batchTrackers = new ConcurrentHashMap<Object, BatchTracker>();
  private Object _currentBatch;
  private Integer _taskId = -1;
  private Operation _operation;
  private boolean _pressureState = false;
  private long _acks = 0;
  private long _fails = 0;
  private long _emits = 0;
  private long _consumes = 0;
  private long _coordinationTuplesEmitted = 0;
  private long _coordinationTuplesReceived = 0;
  private boolean _inProcessing = false;

  /***
   * Note on the UNACK_THRESHOLD -- The savvy engineer will note that a high UNACK_THRESHOLD can allow
   * downstream operations to run out of memory because tuples get queued up faster than they are
   * processed. To mitigate this, we're using `OffloadableQueue` as the main reactor queue. This queue
   * will, if necessary, resort to disk or other offline storage if it grows large.
   */
  private final Long UNACK_THRESHOLD = Config.getOrDefault("coordinated.collector.unack.pressure.threshold", 20000L); // how many unacked tuples do we allow before we start pausing sources?

  // TODO: fix this. The fail-tuple mechanics are currently broken because when an operation gets overloaded with tuples,
  // the ACKs don't get back in time. So, instead of failing and ignoring pressure states, we'll just make a hack where we
  // don't fail tuples. The 'right' solution here is to not fail a tuple unless we get a ping back from the remote operation.
  private final Long TUPLE_FAIL_MS = Config.getOrDefault("coordinated.collector.tuple.fail.ms", 1000L * 60 * 60 * 5); // how long to wait before considering a tuple failed?

  private final Long BATCH_COMPLETE_FAIL_MS = Config.getOrDefault("coordinated.collector.batch_complete.fail.ms", 1000L * 60); // how long to wait before considering the batch-complete signal dead?
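  // A minimal sketch (hypothetical; the emitting loop itself lives outside this class) of
  // how the pressure threshold is meant to be consumed: emitImpl() flips the pressure
  // flag once the un-acked count crosses UNACK_THRESHOLD, and a source loop is expected
  // to poll inPressureState() and back off until pooled acks drain the count again:
  //
  //   while (hasMoreTuples()) {                 // hasMoreTuples()/nextTuple(): assumed source methods
  //     if (collector.inPressureState()) {
  //       Thread.sleep(50);                     // back off; incoming AckTuples clear the state
  //       continue;
  //     }
  //     collector.emit(nextTuple());
  //   }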
  private final long ACK_POOL_MAX_INTERVAL_MS = Config.getOrDefault("coordinated.collector.ack.pool.max.interval.ms", 1000L * 3); // how long to pool tuple-ids before sending the ack
  private int _ackPoolMaxSize = Config.getOrDefault("coordinated.collector.ack.pool.max.size", 1000); // how many tuples to pool before sending the ack
  private final int FAILED_BATCH_ACK_DETECTED_COUNT_LIMIT = Config.getOrDefault("coordinated.collector.failed.batch_ack.detected.count.limit", -1);
  private final int FAILED_TUPLE_ABSOLUTE_LIMIT = Config.getOrDefault("coordinated.collector.failed.tuple.count", -1);
  private final long FAILED_TUPLE_PERCENTAGE_MINIMUM = Config.getOrDefault("coordinated.collector.failed.tuple.percentage.min", -1);
  private final double FAILED_TUPLE_PERCENTAGE = Config.getOrDefault("coordinated.collector.failed.tuple.percentage", 0.5);
  private static final long QUEUE_NOTIFICATION_PERIOD = Config.getOrDefault("coordinated.collector.queued.notification.period", 1000L * 3);
  private static final int SEND_ACKS_AFTER_N_TUPLES_EMITTED = 10;

  public static final String COORD_DOWNSTREAM_ID = "ZB_DN";
  public static final String COORD_UPSTREAM_ID = "ZB_UP";

  private CoordinatedOutputCollectorSupportFactory _supportFactory;
  private FailedTupleHandler _failedTupleHandler;
  private TupleIdGenerator _tupleIdGenerator;
  private BatchedTuple _currentTuple;
  private Integer _currentSourceTask;
  private Integer _emittedInThisSession = 0;
  private long _lastQueueNotificationSent = 0;

  /****
   * Wraps the given collector and installs the coordination support objects
   * (tuple-id generation and failed-tuple handling).
   */
  public CoordinatedOutputCollector(OutputCollector collector, CoordinatedOutputCollectorSupportFactory supportFactory) {
    _delegate = collector;
    _operation = _delegate.getOperation();
    _supportFactory = supportFactory;
    _tupleIdGenerator = supportFactory.createTupleIdGenerator(this);
    _failedTupleHandler = supportFactory.createFailedTupleHandler(this);

    // if we're an rpc, then have a smaller pool (i.e. send acks immediately)
    if (RPCHelper.isRpcApp(_operation.getTopFlow())) {
      _ackPoolMaxSize = 0;
    }
  }

  public CoordinatedOutputCollector(OutputCollector collector) {
    // TODO: swap out with a better supportFactory later on
    this(collector, new NaiveCoordinatedOutputCollectorSupportFactory());
  }

  public synchronized void setAckPoolMaxSize(int i) {
    _ackPoolMaxSize = i;
  }

  public synchronized int getAckPoolMaxSize() {
    return _ackPoolMaxSize;
  }

  /***
   * Delegated accessors.
   */
  @Override
  public synchronized Operation getOperation() {
    return _delegate.getOperation();
  }

  @Override
  public synchronized void constructTaskOperationInfo(Object context) {
    _delegate.constructTaskOperationInfo(context);
  }

  @Override
  public synchronized Set<Integer> getAllTasks() {
    return _delegate.getAllTasks();
  }

  @Override
  public synchronized SetMultimap<String, Integer> getTaskOperationMap() {
    return _delegate.getTaskOperationMap();
  }

  @Override
  public synchronized Set<Integer> getAdjacentDownStreamTasks() {
    return _delegate.getAdjacentDownStreamTasks();
  }

  @Override
  public synchronized Set<Integer> getAdjacentUpStreamNonLoopTasks() {
    return _delegate.getAdjacentUpStreamNonLoopTasks();
  }
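  // Usage sketch (hypothetical caller; `rawCollector` and `context` are assumed to come
  // from the surrounding execution framework, e.g. a storm-facing instance wrapper):
  //
  //   CoordinatedOutputCollector coord = new CoordinatedOutputCollector(rawCollector);
  //   coord.configure(context);         // binds _taskId and the task/operation maps
  //   coord.setCurrentBatch(batchId);   // sources and aggregators only
  //   coord.emit(tuple);                // tracked as un-acked until downstream acks it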
  /***
   * (Re)configures for new tasks.
   * @param context
   */
  @Override
  public synchronized void configure(Object context) {
    Integer newTaskId = _delegate.getThisTask(context);
    if (!_taskId.equals(newTaskId)) {
      _log.warn("_taskId has changed from " + _taskId + " to " + newTaskId);
      _taskId = newTaskId;
    }
    constructTaskOperationInfo(context);
    for (BatchTracker t : this._batchTrackers.values()) {
      Integer tId = t.getTask();
      if (tId != null && !getAllTasks().contains(tId)) {
        t.removeDeadTask(tId);
      }
    }
  }

  private synchronized void maybeSendQueuedNotifications() {
    // Queue notifications are sent periodically, regardless of size
    if (_lastQueueNotificationSent + QUEUE_NOTIFICATION_PERIOD < System.currentTimeMillis()) {
      // Init
      _lastQueueNotificationSent = System.currentTimeMillis();
      for (BatchTracker tracker : this._batchTrackers.values()) {
        for (Entry<Integer, TupleIdSet> e : tracker.getLocallyQueuedTupleIds().entrySet()) {
          // INIT
          Integer task = e.getKey();
          TupleIdSet set = e.getValue();
          if (set.size() > 0) {
            emitCoordTupleUp(task, new QueuedTuple(tracker.getBatch(), _taskId, set));
          }
        }
      }
    }
  }

  /***
   * Sends any pooled acks whose flush conditions have been met.
   */
  private void maybeSendAcks() {
    maybeSendQueuedNotifications();
    for (BatchTracker tracker : this._batchTrackers.values()) {
      // Processed acks...
      for (Entry<Integer, TupleIdSet> e : tracker.getProcessedTupleIds().entrySet()) {
        // INIT
        Integer task = e.getKey();
        TupleIdSet set = e.getValue();
        Long oldest = set.getOldest();
        int size = set.size();

        // If tuples are older than the threshold, then go ahead and flush them out.
        if (size > 0) {
          if ((ACK_POOL_MAX_INTERVAL_MS == -1)
              || (oldest != null && oldest < System.currentTimeMillis() - ACK_POOL_MAX_INTERVAL_MS)
              || (size >= _ackPoolMaxSize)
              || (_emittedInThisSession > SEND_ACKS_AFTER_N_TUPLES_EMITTED)) {
            // Send the ack
            debug("send ack (threshold exceeded)");
            emitCoordTupleUp(task, new AckTuple(tracker.getBatch(), _taskId, set));
            tracker.removeProcessedTupleIds(task, set);
          }
        }
      }
    }
  }

  /***
   * Explicitly flushes all pooled acks for the given batch.
   */
  private void sendAcks(BatchTracker tracker) {
    // INIT
    Map<Integer, TupleIdSet> acks = tracker.getProcessedTupleIds();
    debug("sendAcks (explicit): " + acks);

    // Send each source an ack
    for (Entry<Integer, TupleIdSet> e : acks.entrySet()) {
      // Tell them that these tuples are acked...
      if (e.getValue().size() > 0) {
        emitCoordTupleUp(e.getKey(), new AckTuple(tracker.getBatch(), _taskId, e.getValue()));
      }
    }

    // Remove the received tuples...
    tracker.removeProcessedTupleIds(acks);
  }

  /****
   * This should only be called when the component is a source or an aggregator.
   */
  public synchronized void setCurrentBatch(Object batchId) {
    this._currentBatch = batchId;
  }

  /****
   * Returns (creating, if needed) the tracker for the given batch.
   */
  private final BatchTracker getTrackerForBatch(final Object batch) {
    if (_batchTrackers.containsKey(batch) == false) {
      if (_operation instanceof AggregationOperation) {
        // TODO: this is a performance hole: we do not want to start a new thread for every batch, as that could easily get out of hand.
        // We need to initialize the store key of the aggregation too if the tracker for this batch was just instantiated.
        ((AggregationOperation) _operation).startAggregationThread(batch);
      }
      _batchTrackers.put(batch, new BatchTracker(batch, this));
    }
    return _batchTrackers.get(batch);
  }
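  // Why the QueuedTuple notifications above matter (see also maybeFailUnackedTuples()):
  // each task periodically advertises the tuple-ids still sitting on its local reactor
  // queue. The upstream task records those ids as "remotely queued" and resets their
  // un-acked timestamps, so a tuple that is merely waiting in a long downstream queue
  // is not mistaken for a failed one.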
  /**
   * Observes tuples before they go into the reactor queue. This helps with the situation where there is a large
   * back-log on the queue, but we need to process certain messages to even see if downstream operations are
   * alive.
   */
  @Override
  public synchronized ObserveIncomingTupleAction observePreQueuedCoordTuple(Object tuple, Integer originTask) throws OperationException {
    debugIPC("observeIncomingPRE: from=" + originTask + " tuple=" + tuple + " totalCoordTupleCount=" + _coordinationTuplesReceived);

    if (tuple instanceof QueuedTuple) {
      // INIT
      _coordinationTuplesReceived++;
      QueuedTuple queuedTuple = ((QueuedTuple) tuple);
      TupleIdSet set = queuedTuple.tupleIdSet();
      BatchTracker tracker = getTrackerForBatch(queuedTuple.batchId());
      tracker.replaceTuplesRemotelyQueuedAt(set, originTask);
      tracker.resetUnAckedTupleTimestamp(originTask, set);

      // no further processing
      return ObserveIncomingTupleAction.STOP;
    }

    if (tuple instanceof BatchedTuple) {
      // We've received a new tuple, but it is going on the queue after this. So, let's mark that it
      // is currently queued, in case the queue happens to be HUGE.
      Object thisBatch = ((BatchedTuple) tuple).batchId();
      BatchTracker tracker = this.getTrackerForBatch(thisBatch);
      tracker.markTupleLocallyQueuedFrom((BatchedTuple) tuple, originTask);

      // Continue..
      return ObserveIncomingTupleAction.CONTINUE;
    }

    // Done
    return ObserveIncomingTupleAction.CONTINUE;
  }

  /**
   * Observes coordination tuples after they come off the reactor queue.
   * @throws OperationException
   */
  @Override
  public synchronized ObserveIncomingTupleAction observePostQueuedCoordTuple(Object tuple, Integer originTask) throws OperationException {
    debugIPC("[" + _operation.instanceName() + ", taskId=" + _taskId + "] observeIncomingPOST: from=" + originTask + " tuple=" + tuple + " totalCoordTupleCount=" + _coordinationTuplesReceived);
    _emittedInThisSession = 0;

    // Periodic checks...
    maybeSendQueuedNotifications();

    /*********************************
     * FROM DOWNSTREAM
     *********************************/

    if (tuple instanceof AckTuple) {
      _coordinationTuplesReceived++;

      // From Downstream. This is telling us the contained tuple msg'ids have been processed.
      AckTuple ackTuple = ((AckTuple) tuple);
      BatchTracker tracker = getTrackerForBatch(ackTuple.batchId());
      debug("USING batch=" + tracker.getBatch() + " collector:");
      debug("[" + _operation.instanceName() + "] RECEIVED ACK TUPLE " + tuple);
      debug("The current batch state is " + tracker.getBatchState());

      // Remove acked tuples...
      tracker.removeRemotelyQueuedTupleIds(ackTuple.fromTaskId(), ackTuple.tupleIdSet());
      tracker.removeUnAckedTupleIds(ackTuple.fromTaskId(), ackTuple.tupleIdSet());
      debug("removed unacked tuples: " + ackTuple.tupleIdSet());
      _acks += ackTuple.tupleIdSet().size();

      // Remove pressure state?
      maybeRemovePressureState();

      // Have all tuples been acked?
      debug("un-acked: " + tracker.getUnAckedTuples());

      // If we're COMPLETING, then maybe check for batch completed..
      if (tracker.getBatchState() == BatchState.COMPLETING) {
        this.maybeCompleteBatch();
      }

      // no further processing
      return ObserveIncomingTupleAction.STOP;
    }

    if (tuple instanceof BatchCompleteAckTuple) {
      _coordinationTuplesReceived++;

      // This is telling us that the batchComplete message has been acked. This means we can clean up resources..
      BatchCompleteAckTuple bca = ((BatchCompleteAckTuple) tuple);
      BatchTracker tracker = getTrackerForBatch(bca.batchId());

      // Remove acked tuples...
      tracker.markThatDownstreamBatchAckHasBeenReceived(bca.fromTaskId());
      tracker.resetBatchCompleteAttempts(bca.fromTaskId());

      // Have all downstreams reported the batch-ack?
      for (Integer task : getAdjacentDownStreamTasks()) {
        if (tracker.hasDownstreamBatchAckBeenReceived(task) == false) {
          // Ack has not been received... do nothing.
          return ObserveIncomingTupleAction.STOP;
        }
      }

      // Else, if we get here, then all downstream batch-acks have been received. We can now clean up.
      tracker.setBatchState(BatchState.ACKED);
      // handleCleanup(tracker);

      // no further processing
      return ObserveIncomingTupleAction.STOP;
    }

    if (tuple instanceof PingTuple) {
      _coordinationTuplesReceived++;

      // We're being pinged, better respond with a pong.
      PingTuple pingTuple = (PingTuple) tuple;
      BatchTracker tracker = getTrackerForBatch(pingTuple.batchId());
      String message = pingTuple.messsage();
      CoordTupleOptions opts = pingTuple.options();
      debug(_operation.instanceName() + "(" + _taskId + ") receiving PING " + pingTuple);

      PongTuple pongTuple = null;
      switch (message) {
      case "agg_safe":
        // query from downstream aggregators asking whether or not _operation is still emitting or is still processing tuples
        // TODO: I'm not sure we actually need to pass in the aggStoreKey here since it's just copied back in the Pong...but the receiver
        // of the Pong (i.e. the originator of this Ping) should know what aggStoreKey it's currently on.
        if (!(_operation instanceof AggregationOperation)) {
          // AggregationOperations give specific permission for downstream operations to start aggregating (see permission_to_agg - a message
          // sent by aggregators to give downstream aggregators permission to start aggregating). Here we only want to check that
          // non-aggregators are no longer emitting/processing tuples.
          Boolean sourceEmitting = Boolean.FALSE;
          if (_operation instanceof Source) {
            if (tracker.getBatchState().equals(BatchState.EMITTING) || tracker.getBatchState().equals(BatchState.COMPLETING)) {
              sourceEmitting = Boolean.TRUE;
            }
          }
          debug("[operation: " + _operation.instanceName() + " task: " + _taskId + " batch: " + tracker.getBatch() + "-" + tracker.getBatchState() + "] in processing: " + _inProcessing);
          debug("[operation: " + _operation.instanceName() + " task: " + _taskId + " batch: " + tracker.getBatch() + "-" + tracker.getBatchState() + "] source emitting?: " + sourceEmitting);
          if (!_inProcessing && !sourceEmitting) {
            pongTuple = new PongTuple(tracker.getBatch(), _taskId, message, CoordTupleOptions.build().addOpt(message, Boolean.TRUE).addOpt("agg_store_key", opts.getInt("agg_store_key")));
          } else {
            pongTuple = new PongTuple(tracker.getBatch(), _taskId, message, CoordTupleOptions.build().addOpt(message, Boolean.FALSE).addOpt("agg_store_key", opts.getInt("agg_store_key")));
          }
        }
        break;

      case "permission_to_agg":
        // message from preceding operations (both in loop-back paths or non-loopback paths) that _operation may start aggregating
        if (_operation instanceof AggregationOperation) {
          // Tell the current operation that the originator of this ping has given it permission to aggregate
          AggregationOperation aggOp = (AggregationOperation) _operation;
          tracker.setTaskAggSafe(aggOp.getIterationStoreKey(tracker.getBatch()), pingTuple.fromTaskId(), Boolean.TRUE);
        }
        break;

      case "done_agg":
        // only called between instances of the same aggregation operation
        // Get the earliest start aggregation time out of all sibling instances.
        Long siblingStartTime = opts.getLong("agg_start_time");
        Long myStartTime = ((AggregationOperation) _operation).getStartAggTime(tracker.getBatch(), opts.getInt("agg_store_key"));
        if (myStartTime == null || myStartTime > siblingStartTime) {
          ((AggregationOperation) _operation).setStartAggTime(tracker.getBatch(), opts.getInt("agg_store_key"), siblingStartTime);
        }
        tracker.setSiblingTaskDoneAgg(opts.getInt("agg_store_key"), pingTuple.fromTaskId());
        break;

      case "seen_new_tuples":
        // message from downstream aggregators asking if _operation has processed any tuples since the given time
        Long baseTime = opts.getLong("agg_start_time");
        Long lastBatchedTime = tracker.getLastBatchedTupleSeenAt();
        if (lastBatchedTime == null || lastBatchedTime - baseTime <= 0) {
          pongTuple = new PongTuple(tracker.getBatch(), _taskId, message, CoordTupleOptions.build().addOpt(message, Boolean.FALSE).addOpt("agg_store_key", opts.getInt("agg_store_key")));
        } else {
          pongTuple = new PongTuple(tracker.getBatch(), _taskId, message, CoordTupleOptions.build().addOpt(message, Boolean.TRUE).addOpt("agg_store_key", opts.getInt("agg_store_key")));
        }
        break;

      default:
        throw new RuntimeException("undefined message in ping tuple: " + message);
      }

      if (pongTuple != null) {
        sendPongTupleUpStream(pongTuple, pingTuple.fromTaskId());
      }
      return ObserveIncomingTupleAction.STOP;
    }

    /*********************************
     * FROM UPSTREAM
     *********************************/

    if (tuple instanceof PongTuple) {
      _coordinationTuplesReceived++;
      PongTuple pongTuple = (PongTuple) tuple;
      BatchTracker tracker = getTrackerForBatch(pongTuple.batchId());
      String reply = pongTuple.reply();
      CoordTupleOptions opts = pongTuple.options();
      debug(_operation.instanceName() + "(" + _taskId + ") receiving PONG " + pongTuple);

      switch (reply) {
      case "agg_safe":
        // reply to agg_safe ping telling _operation if the originating task is still emitting or processing tuples
        tracker.setTaskAggSafe(opts.getInt("agg_store_key"), pongTuple.fromTaskId(), opts.getBoolean(reply));
        break;
      case "seen_new_tuples":
        // reply to seen_new_tuples ping telling _operation if the originating task has seen new tuples since the query time
        tracker.setCheckTuple(opts.getInt("agg_store_key"), pongTuple.fromTaskId(), opts.getBoolean(reply));
        break;
      default:
        throw new RuntimeException("undefined reply in pong tuple: " + reply);
      }
      return ObserveIncomingTupleAction.STOP;
    }
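    // Recap of the ping/pong vocabulary handled above (the string constants are the
    // protocol):
    //   "agg_safe"          - asks whether the receiving task is still emitting or
    //                         processing tuples for the batch; answered with a pong.
    //   "permission_to_agg" - grants the receiving aggregator permission to start
    //                         aggregating (no pong).
    //   "done_agg"          - exchanged between sibling instances of one aggregation;
    //                         carries the earliest "agg_start_time" (no pong).
    //   "seen_new_tuples"   - asks whether the receiving task has processed tuples
    //                         since the given time; answered with a pong.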
    if (tuple instanceof BatchCompleteTuple) {
      _coordinationTuplesReceived++;

      // From Upstream. This tuple is telling us the batch is complete.
      BatchCompleteTuple bc = ((BatchCompleteTuple) tuple);
      BatchTracker tracker = getTrackerForBatch(bc.batchId());
      Integer fromTask = bc.fromTaskId();
      debug("USING batch=" + tracker.getBatch() + " collector:");
      debug("[" + _operation.instanceName() + "] RECEIVED BATCH COMPLETE TUPLE " + tuple);

      // Mark that we've received a batch-complete from the task..
      tracker.markThatBatchCompleteHasBeenReceivedFromUpstreamTask(fromTask);

      // Send an ACK to the originating task
      emitCoordTupleUp(fromTask, new BatchCompleteAckTuple(bc.batchId(), _taskId));

      // Sanity
      Set<Integer> upStreamNonLoopTasks = getAdjacentUpStreamNonLoopTasks();
      if (upStreamNonLoopTasks.size() == 0) {
        throw new IllegalStateException("no upstream tasks, yet we just received a BatchCompleteTuple");
      }
      // TODO: fix this
      // if (upStreamTasks.contains(originTask) == false) {
      //   _log.fatal("we received a BatchCompleteTuple from a task we are not aware of.");
      // }

      for (Integer upTask : upStreamNonLoopTasks) {
        if (tracker.hasReceivedBatchCompleteFromUpstreamTask(upTask) == false) {
          // Nope, keep waiting..
          debug("will not complete this batch because we have not received batch-complete from: " + upTask);
          return ObserveIncomingTupleAction.STOP;
        }
        if (tracker.hasProcessedTuplesFrom(upTask)) {
          debug("will not complete this batch because we still have processed tuples (i.e. unsent acks): " + upTask);
          return ObserveIncomingTupleAction.STOP;
        }
        if (tracker.hasLocallyQueuedTuplesFrom(upTask)) {
          debug("will not complete this batch because we still have queued tuples from: " + upTask);
          return ObserveIncomingTupleAction.STOP;
        }
      }

      startCompletingBatch(tracker);
      return ObserveIncomingTupleAction.STOP;
    }

    if (tuple instanceof ExplicitAckRequestTuple) {
      _coordinationTuplesReceived++;

      // From Upstream. This tuple is telling us it is done emitting. This is a good signal to now send some acks.
      ExplicitAckRequestTuple ed = ((ExplicitAckRequestTuple) tuple);
      BatchTracker tracker = getTrackerForBatch(ed.batchId());
      sendAcks(tracker);

      // no further processing
      return ObserveIncomingTupleAction.STOP;
    }

    if (tuple instanceof BatchedTuple) {
      _consumes++;
      _inProcessing = true;
      _currentTuple = (BatchedTuple) tuple;
      _currentBatch = ((BatchedTuple) tuple).batchId();
      _currentSourceTask = originTask;
      BatchTracker tracker = getTrackerForBatch(_currentBatch);
      tracker.setLastBatchedTupleSeenAt();
      tracker.removeLocallyQueuedTupleFrom(_currentTuple, _currentSourceTask);
      _delegate.observeIncomingTuple(_currentTuple);
      return ObserveIncomingTupleAction.CONTINUE;
    }
    // Else: unknown tuple type.
    throw new RuntimeException("unknown tuple type: " + tuple);
  }

  public synchronized Boolean aggregationSafe(Object batch, Set<String> pingOperations, Integer aggStoreKey) {
    Map<Integer, Boolean> taskAggSafe = getTrackerForBatch(batch).getAggregationSafe(aggStoreKey);
    SetMultimap<String, Integer> taskOpMap = getTaskOperationMap();
    for (String op : pingOperations) {
      for (Integer task : taskOpMap.get(op)) {
        Boolean aggSafe = taskAggSafe.get(task);
        if (aggSafe == null || !aggSafe) {
          return Boolean.FALSE;
        }
      }
    }
    return Boolean.TRUE;
  }

  public synchronized Boolean siblingsDoneAggregating(Object batch, Integer aggStoreKey) {
    Set<Integer> siblingsDoneAgg = getTrackerForBatch(batch).getSiblingsDoneAggregation(aggStoreKey);
    if (siblingsDoneAgg == null) {
      return Boolean.FALSE;
    }
    SetMultimap<String, Integer> taskOpMap = getTaskOperationMap();
    for (Integer task : taskOpMap.get(_operation.namespaceName())) {
      if (task.equals(_taskId)) continue; // obviously we are done ourselves if we are asking
      if (!siblingsDoneAgg.contains(task)) {
        return Boolean.FALSE;
      }
    }
    return Boolean.TRUE;
  }

  public synchronized Boolean upStreamReceivedNewTuples(Object batch, Integer aggStoreKey) {
    Map<Integer, Boolean> taskCheckTuple = getTrackerForBatch(batch).getLastTupleCheck(aggStoreKey);
    SetMultimap<String, Integer> taskOpMap = getTaskOperationMap();
    for (String op : _operation.opsFromSelfToSelf()) {
      for (Integer task : taskOpMap.get(op)) {
        // null-safe: a missing reply counts as 'no new tuples seen'
        if (Boolean.TRUE.equals(taskCheckTuple.get(task))) {
          return Boolean.TRUE;
        }
      }
    }
    return Boolean.FALSE;
  }

  public synchronized Boolean pongRepliesReceived(Object batch, Set<String> pingOperations, String message, Integer aggStoreKey) {
    Map<Integer, Boolean> checkMap;
    switch (message) {
    case "agg_safe":
      checkMap = getTrackerForBatch(batch).getAggregationSafe(aggStoreKey);
      break;
    case "seen_new_tuples":
      checkMap = getTrackerForBatch(batch).getLastTupleCheck(aggStoreKey);
      break;
    default:
      throw new RuntimeException("undefined message \"" + message + "\" in replies received");
    }
    if (checkMap == null) {
      return Boolean.FALSE;
    }
    for (String op : pingOperations) {
      // _log.info(getTaskOperationMap());
      // _log.info(op+" "+_operation.instanceName()+" "+getTaskOperationMap().get(op));
      for (Integer task : getTaskOperationMap().get(op)) {
        if (!checkMap.containsKey(task)) {
          return Boolean.FALSE;
        }
      }
    }
    return Boolean.TRUE;
  }

  /***
   * Cleans up the tracker and forgets the batch.
   */
  private void handleCleanup(BatchTracker tracker) {
    tracker.cleanup();
    this._batchTrackers.remove(tracker.getBatch());
  }

  /***
   * Transitions the batch from EMITTING to COMPLETING.
   */
  private void startCompletingBatch(BatchTracker tracker) {
    // Sanity
    if (tracker.getBatchState() == BatchState.COMPLETING || tracker.getBatchState() == BatchState.COMPLETED || tracker.getBatchState() == BatchState.ACKED) {
      return;
    }
    if (tracker.getBatchState() != BatchState.EMITTING) {
      throw new IllegalStateException("batch=" + tracker.getBatch() + " " + _operation.instanceName() + " unable to transition to COMPLETING from: " + tracker.getBatchState());
    }

    // Invoke the callback...
    _log.info("[" + _operation.instanceName() + "] batch=" + tracker.getBatch() + " transitioning to COMPLETING");
    tracker.setBatchState(BatchState.COMPLETING);
    try {
      this._operation.onBatchCompleting(tracker.getBatch());
    } catch (OperationException e) {
      Throwables.propagate(e);
    }

    // Tell downstreams to send their acks NOW
    maybeSendAckRequestsTuples(tracker);
    maybeCompleteBatch(tracker);
  }
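  // Batch lifecycle recap, as implemented here: a tracker's batch is EMITTING (the only
  // state startCompletingBatch() will transition from); it moves to COMPLETING once every
  // upstream task has sent a BatchCompleteTuple and its tuples are drained; it moves to
  // COMPLETED (below) once the operation grants permission and no un-acked tuples remain,
  // at which point BatchCompleteTuples are sent downstream; it finally moves to ACKED
  // once every downstream task returns a BatchCompleteAckTuple.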
  private void maybeCompleteBatch(BatchTracker tracker) {
    // Make sure we only do this once...
    if (tracker.getBatchState() == BatchState.COMPLETED) {
      //_log.warn("Ignored duplicate attempt to complete batch: " + tracker.getBatch() );
      return;
    }
    if (tracker.getBatchState() != BatchState.COMPLETING) {
      // Ignore
      debug("Ignoring maybeCompleteBatch because we are in state: " + tracker.getBatchState());
      return;
    }

    // Will the operation allow us to complete the batch?
    if (_operation.permissionToCompleteBatch(tracker.getBatch()) == false) {
      debug("Ignoring maybeCompleteBatch because the operation refused permission");
      return;
    }

    // It's possible the above callback emitted some more stuff.. in that case, we'll exit out
    // and watch for the acks to come back later...
    debug("unacked tuples: " + tracker.getUnAckedTuples());
    if (!tracker.hasNoUnAckedTuples()) {
      debug("not completing batch because we have unacked tuples.");
      return;
    }

    // If we get here, then all tuples have been acked; we can transition to COMPLETED.
    _log.info("[" + _operation.instanceName() + "] batch=" + tracker.getBatch() + " transitioning to COMPLETED");
    tracker.setBatchState(BatchState.COMPLETED);
    _operation.onThisBatchCompleted(tracker.getBatch());

    // Send the batch complete tuples
    sendBatchCompleteTuples(tracker);
  }

  public synchronized void maybeCompleteBatch(Object batch) {
    maybeCompleteBatch(getTrackerForBatch(batch));
  }

  private void maybeCompleteBatch() {
    for (BatchTracker t : this._batchTrackers.values()) {
      maybeCompleteBatch(t);
    }
  }

  // private void completeBatch(BatchTracker tracker) {
  //
  //   // Make sure we only do this once...
  //   if (tracker.getBatchState() == BatchState.COMPLETED) {
  //     _log.warn("Ignored multiple attempt to complete batch: " + tracker.getBatch() );
  //     return;
  //   }
  //
  //   // If we get here, then all upstreams have said this batch is completed. Note that the operation may emit new tuples.
  //   try {
  //     _operation.onBatchCompleted(tracker.getBatch());
  //   } catch (OperationException e) {
  //     Throwables.propagate(e);
  //   }
  //
  //   // Mark state as completed. This node will itself send a BatchCompleteTuple when downstream tasks ack all outstanding tuples
  //   debug("BATCH COMPLETE: " + tracker.getBatch());
  //   tracker.setBatchState(BatchState.COMPLETED);
  //
  //   // If no outstanding tuples, then tell downstreams about completed batch
  //   if (tracker.hasNoUnAckedTuples()) {
  //     // Have all tuples been acked?
  //     maybeResendBatchCompleteTuples(tracker);
  //   } else {
  //     // Otherwise, ask for acks sooner than later..
  //     sendEmitDoneTuples(tracker);
  //   }
  // }

  public synchronized void handleChecks(Object context) {
    // Has storm changed anything on us?
    configure(context);
    handleChecks();
  }

  public synchronized void handleChecks() {
    // Sanity
    if (_inProcessing) return;

    Benchmark.markBegin("coordinated.collector.handle_checks");
    try {
      // Sanity checks..
      debug("handleChecks " + this._operation.instanceName());
      maybeFailUnackedTuples();
      maybeThrowSanityErrors();

      // Send queued notifications...
      maybeSendQueuedNotifications();

      // Maybe we're done with pressure state?
      maybeRemovePressureState();

      // Maybe send acks that we've been pooling up..
      maybeSendAcks();

      // Maybe complete the batch..
      maybeCompleteBatch();
    } finally {
      Benchmark.markEnd("coordinated.collector.handle_checks");
    }
  }

  private void maybeRemovePressureState() {
    if (_pressureState) {
      if (this.getUnackedCount() == 0) {
        _pressureState = false;
      }
    }
  }

  /****
   * Fails tuples that have been un-acked longer than TUPLE_FAIL_MS, unless they are
   * known to still be sitting on a remote queue.
   */
  private void maybeFailUnackedTuples() {
    // INIT
    Long cutoffTime = System.currentTimeMillis() - TUPLE_FAIL_MS;
    for (BatchTracker tracker : this._batchTrackers.values()) {
      for (Entry<Integer, TupleIdSet> e : tracker.getUnAckedTuples().entrySet()) {
        // INIT
        Integer destTask = e.getKey();
        TupleIdSet set = e.getValue();
        TupleIdSet oldTuples = set.getTupleIdsOlderThan(cutoffTime);

        // Are the oldTuples currently on the remote queue? If so, they are still queued, not failed.
        TupleIdSet queuedTuples = tracker.getRemoteQueuedTuples(destTask);
        TupleIdSet failures = oldTuples.clone();
        failures.removeAll(queuedTuples);

        if (failures.size() > 0) {
          _fails += failures.size();
          _failedTupleHandler.handleFailedTupleIds(this, tracker.getBatch(), failures);
          tracker.removeUnAckedTupleIds(destTask, failures);
          tracker.removeRemotelyQueuedTupleIds(destTask, failures);
        }
      }
    }
  }

  /***
   * Sanity checks across all trackers.
   */
  private void maybeThrowSanityErrors() {
    for (BatchTracker tracker : _batchTrackers.values()) {
      maybeThrowSanityErrors(tracker);
    }
  }

  /***
   * Throws DeadNodeDetectedException if a downstream task looks dead.
   */
  private void maybeThrowSanityErrors(BatchTracker tracker) {
    for (Integer outTask : getAdjacentDownStreamTasks()) {
      // Are all tuples dying?
      if (tracker.getFailedTupleCount(outTask) > FAILED_TUPLE_ABSOLUTE_LIMIT && FAILED_TUPLE_ABSOLUTE_LIMIT != -1) {
        throw new DeadNodeDetectedException("The task=" + outTask + " appears to be dead. Tuples sent to it have failed more than " + tracker.getFailedTupleCount(outTask) + " times");
      }

      // Are a percentage of tuples dying?
      long num = tracker.getFailedTupleCount(outTask);
      long den = tracker.getSentTupleCount(outTask);
      if (den > FAILED_TUPLE_PERCENTAGE_MINIMUM && FAILED_TUPLE_PERCENTAGE_MINIMUM > 0) {
        double p = (double) num / den; // floating-point division; num/den alone would truncate to 0
        if (p > FAILED_TUPLE_PERCENTAGE && FAILED_TUPLE_PERCENTAGE != -1) {
          throw new DeadNodeDetectedException("The task=" + outTask + " appears to be dead. Tuples sent to it have failed more than " + (p * 100) + "% of the time.");
        }
      }

      // Sanity... are we getting any BATCH_COMPLETE acks?
      int attempts = tracker.getDownstreamBatchCompleteAttempts(outTask);
      if (attempts > FAILED_BATCH_ACK_DETECTED_COUNT_LIMIT && FAILED_BATCH_ACK_DETECTED_COUNT_LIMIT != -1) {
        throw new DeadNodeDetectedException("The task=" + outTask + " appears to be dead. It has not responded to any BATCH_COMPLETE messages after " + attempts + " attempts");
      }
    }
  }
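  // Worked example for the percentage check above (hypothetical values; with the
  // shipped default of -1 for coordinated.collector.failed.tuple.percentage.min the
  // check is disabled entirely): if the minimum is overridden to 1000, a task that was
  // sent 2000 tuples of which 1200 failed gives p = 1200.0 / 2000 = 0.6, which exceeds
  // FAILED_TUPLE_PERCENTAGE (default 0.5), so DeadNodeDetectedException is thrown.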
  /***
   * Sends (or re-sends) BatchCompleteTuples to downstream tasks that have not yet acked.
   */
  private void sendBatchCompleteTuples(BatchTracker tracker) {
    debug("[" + _operation.instanceName() + "] sendBatchComplete(): " + getAdjacentDownStreamTasks() + " batch=" + tracker.getBatch());
    for (Integer outTask : getAdjacentDownStreamTasks()) {
      // Has the batch complete already been acked?
      if (tracker.hasDownstreamBatchAckBeenReceived(outTask)) {
        continue;
      }

      // Send the batch complete tuples, mark when we sent them, just in case it fails..
      Long sent = tracker.getDownstreamBatchCompleteSentAt(outTask);
      if (sent + BATCH_COMPLETE_FAIL_MS < System.currentTimeMillis()) {
        emitCoordTupleDown(outTask, new BatchCompleteTuple(tracker.getBatch(), _taskId));
        tracker.markThatBatchCompleteHasBeenSentToDownstreamTask(outTask, System.currentTimeMillis());
      }
    }
  }

  public synchronized void sendPingTuplesUpStream(Object batchId, Set<String> pingOperations, String message, CoordTupleOptions options) {
    for (String op : pingOperations) {
      for (Integer task : getTaskOperationMap().get(op)) {
        sendPingTupleUpStream(batchId, task, message, options);
      }
    }
  }

  public synchronized void sendPingTupleUpStream(Object batchId, Integer outTask, String message, CoordTupleOptions options) {
    emitCoordTupleUp(outTask, new PingTuple(batchId, _taskId, message, options));
  }

  public synchronized void sendPingTuplesDownStream(Object batchId, Set<String> pingOperations, String message, CoordTupleOptions options) {
    for (String op : pingOperations) {
      for (Integer task : getTaskOperationMap().get(op)) {
        sendPingTupleDownStream(batchId, task, message, options);
      }
    }
  }

  public synchronized void sendPingTupleDownStream(Object batchId, Integer outTask, String message, CoordTupleOptions options) {
    emitCoordTupleDown(outTask, new PingTuple(batchId, _taskId, message, options));
  }

  private void sendPongTupleUpStream(PongTuple pongTuple, Integer outTask) {
    emitCoordTupleDown(outTask, pongTuple);
  }

  /***
   * Emits on the named stream.
   */
  @Override
  public synchronized void emit(String streamName, MapTuple tuple) throws OperationException {
    emitImpl(_operation.prefixifyStreamName(streamName), tuple);
  }

  /***
   * Emits on the default stream.
   */
  @Override
  public synchronized void emit(MapTuple t) throws OperationException {
    try {
      this.emitImpl(_delegate.getDefaultStream(), t);
    } catch (DefaultStreamException e) {
      throw (OperationException) new OperationException(_operation, e).setUserMessage("No emit stream given for operation \"" + _operation.namespaceName() + "\".");
    }
  }

  /***
   * Emits the tuple as a BatchedTuple and tracks it as un-acked for every task it was sent to.
   */
  private synchronized void emitImpl(String streamName, MapTuple tuple) throws OperationException {
    Benchmark.markBegin("coordinated.collector.emit");
    try {
      // INIT
      _emits++;
      _emittedInThisSession++;
      debugIPC("emit (" + streamName + "): " + tuple);
      Object batch = this._currentBatch;
      if (batch == null) {
        throw new IllegalStateException("batch has not been set!");
      }
      BatchTracker tracker = getTrackerForBatch(batch);

      // Sanity
      if (_pressureState) {
        //_log.warn("operation " + _operation.instanceName() + " continues to emit, even though we are in a pressure state!");
      }
      maybeThrowSanityErrors(tracker);

      // Emit the tuple, as normal. The delegate will return a list of tasks that the tuple has been sent to.
      BatchedTuple bt = new BatchedTuple(tuple, _tupleIdGenerator.getTupleIdFor(tuple), batch);
      List<Integer> tasks = _delegate.emitAndGetTasks(streamName, bt);
      for (Integer task : tasks) {
        tracker.addUnAckedTupleSentTo(bt, task);
        tracker.incSentTupleCount(task);
      }

      // Does this put us into a pressure state?
      if (this.isAtUnackedThreshold()) {
        // Enter pressure state..
        _pressureState = true;

        // Tell all downstreams to send their acks..
        maybeSendAckRequestsTuples();
      }
    } finally {
      Benchmark.markEnd("coordinated.collector.emit");
    }
  }
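  // Ack-request round trip (recap of the mechanism implemented above and below): when
  // emitImpl() crosses the un-acked threshold it enters the pressure state and sends
  // ExplicitAckRequestTuples downstream; the receiver flushes its pooled acks
  // immediately (sendAcks()); the returning AckTuples drain this task's un-acked set;
  // and maybeRemovePressureState() lifts the pressure once the count reaches zero.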
  /***
   * Asks idle downstream tasks to flush their pooled acks sooner than later.
   */
  private void maybeSendAckRequestsTuples(BatchTracker tracker) {
    // We send the ExplicitAckRequestTuple to tell the operations to send their acks sooner than later.
    for (Integer outTask : getAdjacentDownStreamTasks()) {
      // Is the remote operation's queue empty (and ours too)?
      if (tracker.getRemoteQueuedTuples(outTask).size() == 0 && tracker.getLocallyQueuedTupleSize() == 0) {
        emitCoordTupleDown(outTask, new ExplicitAckRequestTuple(tracker.getBatch(), _taskId));
      }
    }
  }

  private void maybeSendAckRequestsTuples() {
    for (BatchTracker tracker : this._batchTrackers.values()) {
      maybeSendAckRequestsTuples(tracker);
    }
  }

  /***
   * Explicitly begins completing the given batch.
   * @param batch
   */
  public synchronized void explicitlyCompleteBatch(Object batch) {
    // Init
    debug("batch completed (explicitly) batch=" + batch);
    startCompletingBatch(getTrackerForBatch(batch));
  }

  /***
   *
   */
  @Override
  public synchronized void observeIncomingTuple(MapTuple tuple) {
    if (tuple instanceof BatchedTuple == false) {
      throw new IllegalStateException("expected batched tuple");
    }
  }

  /***
   *
   */
  @Override
  public synchronized void emitDirect(Integer taskId, String streamId, List<?> rawTuple) {
    // TODO: we can handle emitDirects, but we need to use other streamIds
    throw new IllegalStateException("you should not call emitDirect() with a CoordinatedOutputCollector, otherwise coordination will fail");
  }

  /***
   *
   */
  @Override
  public synchronized List<Integer> emitAndGetTasks(String streamName, MapTuple t) {
    throw new IllegalStateException("you should not call emitAndGetTasks() with a CoordinatedOutputCollector, otherwise coordination will fail");
  }

  /***
   *
   */
  @Override
  public synchronized Integer getThisTask(Object context) {
    return _delegate.getThisTask(context);
  }

  /***
   *
   */
  @Override
  public synchronized String getDefaultStream() throws DefaultStreamException {
    return _delegate.getDefaultStream();
  }

  /***
   *
   */
  @Override
  public synchronized void resetCounter() {
    _delegate.resetCounter();
  }

  /***
   *
   */
  @Override
  public synchronized long getCounter() {
    return _delegate.getCounter();
  }

  /**
   * Marks the current tuple processed and maybe flushes pooled acks.
   * @throws OperationException
   */
  @Override
  public synchronized void onAfterTuplesEmitted() throws OperationException {
    debug("onAfterTuplesEmitted");
    BatchTracker tracker = getTrackerForBatch(_currentBatch);
    _inProcessing = false;
    tracker.markTupleProcessedFrom(_currentTuple, _currentSourceTask);
    maybeSendAcks();
    _delegate.onAfterTuplesEmitted();
  }

  /***
   * @param task
   * @param coordTuple
   */
  private void emitCoordTupleDown(Integer task, BaseCoordTuple coordTuple) {
    _coordinationTuplesEmitted++;
    debug("coord: task=" + task + " tuple=" + coordTuple);
    handleEmitDirect(task, COORD_DOWNSTREAM_ID, coordTuple);
  }

  private void emitCoordTupleUp(Integer task, BaseCoordTuple coordTuple) {
    _coordinationTuplesEmitted++;
    debug("coord: task=" + task + " tuple=" + coordTuple);
    handleEmitDirect(task, COORD_UPSTREAM_ID, coordTuple);
  }

  private void handleEmitDirect(Integer task, String stream, BaseCoordTuple coordTuple) {
    Benchmark.markBegin("coordinated.collector.emit_direct");
    try {
      _delegate.emitDirect(task, stream, Lists.newArrayList(coordTuple));
    } finally {
      Benchmark.markEnd("coordinated.collector.emit_direct");
    }
  }

  private void debugTrackers() {
    synchronized (CoordinatedOutputCollector.class) {
      for (BatchTracker t : this._batchTrackers.values()) {
        debug(t.toString());
      }
    }
  }

  /***
   * @param flow
   * @return true if the flow should use the coordinated framework
   */
  public static final boolean shouldUse(Flow flow) {
    if (Config.getOrDefault("storm.coordinated", Boolean.TRUE).booleanValue()) {
      return true;
    }
    if (flow.isCoordinated()) {
      return true;
    }
    return false;
  }

  public synchronized CoordinatedOutputCollectorSupportFactory getSupportFactory() {
    return this._supportFactory;
  }

  public synchronized boolean isAtUnackedThreshold() {
    return getUnackedCount() > UNACK_THRESHOLD;
  }
  private synchronized long getUnackedCount() {
    long size = 0;
    for (BatchTracker tracker : this._batchTrackers.values()) {
      for (TupleIdSet set : tracker.getUnAckedTuples().values()) {
        size += set.size();
      }
    }
    return size;
  }

  public synchronized Triplet<Integer, Integer, Integer> getUnackedAndLocalQueueAndRemoteQueeuCount_ThreadUnsafe() {
    int unackedSize = 0;
    int localQueueSize = 0;
    int remoteQueueSize = 0;
    for (BatchTracker tracker : this._batchTrackers.values()) {
      for (TupleIdSet set : tracker.getUnAckedTuples().values()) {
        unackedSize += set.size();
      }
      for (TupleIdSet set : tracker.getLocallyQueuedTupleIds().values()) {
        localQueueSize += set.size();
      }
      for (TupleIdSet set : tracker.getRemotelyQueuedTupleIds().values()) {
        remoteQueueSize += set.size();
      }
    }
    return new Triplet<Integer, Integer, Integer>(unackedSize, localQueueSize, remoteQueueSize);
  }

  public synchronized Integer getTaskId() {
    return this._taskId;
  }

  /***
   * @return true if we are currently applying back-pressure
   */
  public synchronized boolean inPressureState() {
    return _pressureState;
  }

  public synchronized long getFailCount() {
    return this._fails;
  }

  public synchronized long getAckCount() {
    return this._acks;
  }

  public synchronized long getEmitCount() {
    return this._emits;
  }

  public synchronized long getConsumeCount() {
    return this._consumes;
  }

  public synchronized long getCoordConsumeCount() {
    return _coordinationTuplesReceived;
  }

  public synchronized long getCoordEmitCount() {
    return _coordinationTuplesEmitted;
  }

  /***
   * @param m
   */
  public void debug(Object m) {
    // System.err.println(this._operation.instanceName()+" (task " + this._taskId + "-"+this._operation.type()+"): " + m);
    // _log.info(this._operation.instanceName()+" (task " + this._taskId + "): " + m);
    // _log.debug(string);
  }

  private void debugIPC(Object m) {
    debug(m);
  }

  public synchronized ConcurrentHashMap<Object, BatchTracker> getTrackers() {
    return this._batchTrackers;
  }

  @Override
  public Object getCurrentBatch() {
    return _currentBatch;
  }
}