// BatchingStrategy.java example
//
// Explorer
// parseq-master
package com.linkedin.parseq.batching;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.linkedin.parseq.Context;
import com.linkedin.parseq.Task;
import com.linkedin.parseq.batching.BatchImpl.BatchBuilder;
import com.linkedin.parseq.batching.BatchImpl.BatchEntry;
import com.linkedin.parseq.batching.BatchImpl.BatchPromise;
import com.linkedin.parseq.internal.ContextImpl;
import com.linkedin.parseq.internal.PlanContext;
import com.linkedin.parseq.promise.CountDownPromiseListener;
import com.linkedin.parseq.promise.PromiseListener;
import com.linkedin.parseq.promise.Promises;
import com.linkedin.parseq.promise.SettablePromise;
import com.linkedin.parseq.trace.Relationship;
import com.linkedin.parseq.trace.ShallowTraceBuilder;
import com.linkedin.parseq.trace.TraceBuilder;

/**
 * {@code BatchingStrategy} helps build "batching clients" in ParSeq. "Client" means an object that given {@code K key}
 * provides a task that returns {@code T value}. "Batching" means that it can group together keys to resolve values
 * in batches. The benefit of this approach is that batching happens transparently in the background and user's code
 * does not have to deal with logic needed to implement batching.
 * <p>
 * Example of a batching client might be ParSeq client for a key-value store that provides batch get operation. For
 * the sake of simplicity of the example we are using dummy, synchronous key-value store interface:
 * <blockquote><pre>
 *  interface KVStore {
 *    String get(Long key);
 *    Map{@code <Long, String>} batchGet(Collection{@code <Long>} keys);
 *  }
 * </pre></blockquote>
 *
 * We can then implement a {@code BatchingStrategy} in the following way:
 * <blockquote><pre>
 *  public static class BatchingKVStoreClient extends BatchingStrategy{@code <Integer, Long, String>} {
 *    private final KVStore _store;
 *    public BatchingKVStoreClient(KVStore store) {
 *      _store = store;
 *    }
 *
 *    {@code @Override}
 *    public void executeBatch(Integer group, Batch{@code <Long, String>} batch) {
 *      Map{@code <Long, String>} batchResult = _store.batchGet(batch.keys());
 *      batch.foreach((key, promise) {@code ->} promise.done(batchResult.get(key)));
 *    }
 *
 *    {@code @Override}
 *    public Integer classify(Long entry) {
 *      return 0;
 *    }
 *  }
 * </pre></blockquote>
 *
 * In the above example there is an assumption that all keys can be grouped together. This is why the method {@code classify()}
 * trivially returns a constant {@code 0}. In practice {@code classify()} returns a group for a key. Keys that have
 * the same group will be batched together.
 * <p>
 * The interaction between ParSeq and {@code BatchingStrategy} is the following:
 * <ol>
 *   <li>{@code batchable(String desc, K key)} is invoked to create Task instance</li>
 *   <li>Plan is started by {@code Engine.run()}</li>
 *   <li>When Task returned by {@code batchable(String desc, K key)} is started, the key {@code K} is remembered by a {@code BatchingStrategy}</li>
 *   <li>When Plan can't make immediate progress {@code BatchingStrategy} will be invoked to run batchable operations:
 *   <ol>
 *     <li>Every {@code K key} is classified using {@code classify(K key)} method</li>
 *     <li>Keys, together with adequate Promises, are batched together based on {@code G group} returned by previous step</li>
 *     <li>Method {@code executeBatch(G group, Batch<K, T> batch)} is invoked for every batch</li>
 *   </ol>
 *   {@code executeBatch(G group, Batch<K, T> batch)} invocations are executed
 *   in the context of their own Task instances with description given by {@code getBatchName(G group, Batch<K, T> batch)}.
 *   The implementation of {@code BatchingStrategy} has to be fast because it is executed sequentially with respect to tasks belonging
 *   to the plan. It means that no other task will be executed until {@code BatchingStrategy} completes. Typically {@code classify(K key)}
 *   is a synchronous and fast operation whilst {@code executeBatch(G group, Batch<K, T> batch)} returns quickly and completes
 *   promises asynchronously.
 * </ol>
 *
 * @author Jaroslaw Odzga (jodzga@linkedin.com)
 *
 * @param <G> Type of a Group
 * @param <K> Type of a Key
 * @param <T> Type of a Value
 *
 * @see SimpleBatchingStrategy
 * @see TaskBatchingStrategy
 */
public abstract class BatchingStrategy<G, K, T> {

  public static final int DEFAULT_MAX_BATCH_SIZE = 1024;

  private static final Logger LOGGER = LoggerFactory.getLogger(BatchingStrategy.class);
  private static final int DEFAULT_KEY_SIZE = 1;

  // Batches that are still being assembled, keyed by the id of the plan whose
  // batchable tasks contributed the keys. An entry is created lazily by
  // batchable(...) and removed by handleBatch(...).
  private final ConcurrentMap<Long, GroupBatchBuilder> _batches =
      new ConcurrentHashMap<>();

  private final BatchSizeMetric _batchSizeMetric = new BatchSizeMetric();
  private final BatchAggregationTimeMetric _batchAggregationTimeMetric = new BatchAggregationTimeMetric();

  /**
   * This method returns Task that returns value for a single key allowing this strategy to batch operations.
   * <p>
   * When the returned task runs, the key is classified and added to the batch of its group for the
   * current plan. If that addition yields a batch that is ready to be executed, the batch is run
   * immediately from within the task; otherwise the key waits until {@code handleBatch} is invoked
   * for the plan.
   * @param desc description of the task
   * @param key key
   * @return Task that returns value for a single key allowing this strategy to batch operations
   */
  public Task<T> batchable(final String desc, final K key) {
    Task<T> batchableTask = Task.async(desc, ctx -> {
      final BatchPromise<T> result = new BatchPromise<>();
      final Long planId = ctx.getPlanId();
      final GroupBatchBuilder builder = _batches.computeIfAbsent(planId, k -> new GroupBatchBuilder());
      final G group = classify(key);
      // add(...) returns a non-null batch only when there is a batch that must be executed right away
      final Batch<K, T> fullBatch = builder.add(group, key, ctx.getShallowTraceBuilder(), result);
      if (fullBatch != null) {
        try {
          ctx.run(taskForBatch(group, fullBatch, true));
        } catch (Throwable t) {
          //we don't care if some of promises have already been completed
          //all we care is that all remaining promises have been failed
          fullBatch.failAll(t);
        }
      }
      return result;
    });
    batchableTask.getShallowTraceBuilder().setTaskType("batched");
    return batchableTask;
  }

  /**
   * This method returns Task that returns value for a single key allowing this strategy to batch operations.
   * The task description is derived from the key's string representation.
   * @param key key
   * @return Task that returns value for a single key allowing this strategy to batch operations
   */
  public Task<T> batchable(final K key) {
    // Plain concatenation yields the same text as key.toString() for non-null keys but does not
    // throw for a null key, which keeps this overload consistent with batchable(String, K).
    return batchable("batchableTaskForKey: " + key, key);
  }

  /**
   * Creates the task that executes a single batch.
   * <p>
   * As a side effect of creating the task the batch size is recorded in the batch size metric and,
   * if debug logging is enabled, the group and keys are logged.
   * @param group group the batch belongs to
   * @param batch batch to be executed
   * @param hasParent when {@code false}, the first entry trace encountered is linked to the batch
   * task with {@code CHILD_OF}; all other entry traces (or all of them when {@code true}) are linked
   * with {@code POTENTIAL_CHILD_OF}
   * @return task that executes the batch
   */
  private Task<?> taskForBatch(final G group, final Batch<K, T> batch, final boolean hasParent) {
    _batchSizeMetric.record(batch.batchSize());
    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug(debugInfo(group, batch));
    }
    return Task.async(getBatchName(group, batch), ctx -> {
      final SettablePromise<T> result = Promises.settable();
      // One shared listener, created with the number of keys in the batch, is attached to every
      // entry's internal promise. NOTE(review): presumably it resolves `result` once that many
      // promises have completed - confirm against CountDownPromiseListener.
      final PromiseListener<T> countDownListener =
          new CountDownPromiseListener<T>(batch.keySize(), result, null);

      boolean assignedParent = false;
      final TraceBuilder traceBuilder = ctx.getTraceBuilder();
      for (BatchEntry<T> entry : batch.values()) {
        for (ShallowTraceBuilder shallowTraceBuilder: entry.getShallowTraceBuilders()) {
          if (!assignedParent && !hasParent) {
            traceBuilder.addRelationship(Relationship.CHILD_OF, ctx.getShallowTraceBuilder(), shallowTraceBuilder);
            assignedParent = true;
          } else {
            traceBuilder.addRelationship(Relationship.POTENTIAL_CHILD_OF, ctx.getShallowTraceBuilder(), shallowTraceBuilder);
          }
        }
        BatchPromise<T> promise = entry.getPromise();
        promise.getInternal().addListener(countDownListener);
        // entry promises are triggered only after the batch-level promise has been resolved
        result.addListener(v -> promise.trigger());
      }

      try {
        executeBatchWithContext(group, batch, ctx);
      } catch (Throwable t) {
        // a failure of the user supplied implementation fails the whole batch
        batch.failAll(t);
      }

      ctx.getShallowTraceBuilder().setSystemHidden(true);

      return result;
    });
  }

  /**
   * Runs the given batch as a task in a new context created from the plan context.
   * Any failure thrown while starting the task fails all promises of the batch.
   * @param planContext context of the plan the batch belongs to
   * @param group group the batch belongs to
   * @param batch batch to be executed
   */
  private void runBatch(final PlanContext planContext, G group, final Batch<K, T> batch) {
    try {
      new ContextImpl(planContext, taskForBatch(group, batch, false)).runTask();
    } catch (Throwable t) {
      //we don't care if some of promises have already been completed
      //all we care is that all remaining promises have been failed
      batch.failAll(t);
    }
  }

  /**
   * Removes the batches accumulated so far for the given plan and runs every one of them.
   * Does nothing if no keys are pending for the plan.
   * @param planContext context of the plan whose pending batches should be executed
   */
  void handleBatch(final PlanContext planContext) {
    final GroupBatchBuilder batchBuilder = _batches.remove(planContext.getId());
    if (batchBuilder != null) {
      batchBuilder.batches().forEach((group, builder) -> runBatch(planContext, group, builder.build()));
    }
  }

  /**
   * Builds a multi-line, human readable description of a batch (its group and keys) for debug logging.
   * @param group group the batch belongs to
   * @param batch batch to be described
   * @return description of the group and the keys of the batch
   */
  private String debugInfo(G group, Batch<K, T> batch) {
    final StringBuilder debugInfo = new StringBuilder("\n");
    debugInfo.append("group: ")
      .append(group)
      .append("\n")
      .append("batch keys: \n");
    batch.keys().forEach(key -> debugInfo.append("    ").append(key).append("\n"));
    return debugInfo.toString();
  }

  /**
   * Returns the metric in which this strategy records the size of every batch it creates a task for.
   * @return batch size metric of this strategy
   */
  public BatchSizeMetric getBatchSizeMetric() {
    return _batchSizeMetric;
  }

  /**
   * Returns the batch aggregation time metric that is handed to every batch builder created by this strategy.
   * @return batch aggregation time metric of this strategy
   */
  public BatchAggregationTimeMetric getBatchAggregationTimeMetric() {
    return _batchAggregationTimeMetric;
  }

  /**
   * This method will be called for every {@code Batch}.
   * Implementation of this method must make sure that all {@code SettablePromise} contained in the {@code Batch}
   * will eventually be resolved - typically asynchronously. Failing to eventually resolve any
   * of the promises may lead to a plan that never completes i.e. appears to hang and may lead to
   * a memory leak.
   * @param group group that represents the batch
   * @param batch batch contains collection of {@code SettablePromise} that eventually need to be resolved - typically asynchronously
   */
  public abstract void executeBatch(G group, Batch<K, T> batch);

  /**
   * Variant of {@link #executeBatch(Object, Batch)} that additionally receives the {@code Context}
   * of the task executing the batch. The default implementation ignores the context and delegates
   * to {@link #executeBatch(Object, Batch)}.
   * @param group group that represents the batch
   * @param batch batch to be executed
   * @param ctx context of the task that executes the batch
   */
  protected void executeBatchWithContext(G group, Batch<K, T> batch, Context ctx) {
    executeBatch(group, batch);
  }

  /**
   * Classify the {@code K Key} and by doing so assign it to a {@code G group}.
   * If two keys are classified by the same group then they will belong to the same {@code Batch}.
   * This method needs to be thread safe.
   * @param key key to be classified
   * @return Group that represents a batch the key will belong to
   */
  public abstract G classify(K key);

  /**
   * Overriding this method allows specifying maximum batch size for a given group.
   * Default value is {@value #DEFAULT_MAX_BATCH_SIZE}.
   * @param group group for which maximum batch size needs to be decided
   * @return maximum batch size for a given group
   */
  public int maxBatchSizeForGroup(G group) {
    return DEFAULT_MAX_BATCH_SIZE;
  }

  /**
   * Overriding this method allows specifying size of the key for a given group.
   * Default value is 1. This method is used when calculating batch size and making sure
   * that it does not exceed max batch size for a group.
   * @param group group the key has been classified to
   * @param key key whose size needs to be decided
   * @return size of the given key
   * @see #maxBatchSizeForGroup(Object)
   */
  public int keySize(G group, K key) {
    return DEFAULT_KEY_SIZE;
  }

  /**
   * Overriding this method allows providing custom name for a batch. Name will appear in the
   * ParSeq trace as a description of the task that executes the batch.
   * @param group group to be described
   * @param batch batch to be described
   * @return name for the batch and group
   */
  public String getBatchName(G group, Batch<K, T> batch) {
    return "batch(keys: " + batch.keySize() + ", size: " + batch.batchSize() + ")";
  }

  /**
   * Holds, for a single plan, one batch builder per group.
   */
  private class GroupBatchBuilder {
    private final Map<G, BatchBuilder<K, T>> _batchesByGroup =
        new HashMap<>();

    /**
     * Adds new entry to a batch specified by a given group and returns
     * a batch that has to be executed now or null if no batch needs to be executed yet.
     * @param group group the key has been classified to
     * @param key key to be added
     * @param traceBuilder trace builder of the task that requested the key
     * @param promise promise to be resolved with the value for the key
     * @return batch that has to be executed now or null otherwise
     */
    Batch<K, T> add(G group, K key, ShallowTraceBuilder traceBuilder, BatchPromise<T> promise) {
      final int size = keySize(group, key);
      final BatchBuilder<K, T> builder =
        _batchesByGroup.computeIfAbsent(group, x -> newBatchBuilder(group));
      //invariant: builder is not full - it is maintained by the fact that max batch size >= 1
      //and that we remove builder from the map after adding to it entry that makes it full
      if (builder.add(key, traceBuilder, promise, size)) {
        if (builder.isFull()) {
          _batchesByGroup.remove(group);
          return builder.build();
        } else {
          return null;
        }
      } else {
        //entry did not fit into the current builder so it is placed in a fresh one
        final BatchBuilder<K, T> newBuilder = newBatchBuilder(group);
        //this will be successful because builder is empty and first add is always successful as per builder contract
        newBuilder.add(key, traceBuilder, promise, size);
        if (newBuilder.isFull()) {
          //the current builder stays in the map and keeps accumulating
          return newBuilder.build();
        } else {
          //return larger batch, keep the smaller one accumulating in the map
          if (builder.batchSize() > newBuilder.batchSize()) {
            _batchesByGroup.put(group, newBuilder);
            return builder.build();
          } else {
            return newBuilder.build();
          }
        }
      }
    }

    /**
     * Returns the builders of batches that have not been executed yet, keyed by group.
     * @return map from group to the builder of its pending batch
     */
    Map<G, BatchBuilder<K, T>> batches() {
      return _batchesByGroup;
    }

    /**
     * Creates an empty builder limited to the maximum batch size of the given group.
     */
    private BatchBuilder<K, T> newBatchBuilder(G group) {
      return new BatchBuilder<>(maxBatchSizeForGroup(group), _batchAggregationTimeMetric);
    }

  }

}