package com.zillabyte.motherbrain.flow.operations.builtin;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Map;

import net.sf.json.JSONObject;

import com.google.common.collect.Maps;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnel;
import com.google.common.hash.PrimitiveSink;
import com.zillabyte.motherbrain.flow.Fields;
import com.zillabyte.motherbrain.flow.MapTuple;
import com.zillabyte.motherbrain.flow.collectors.OutputCollector;
import com.zillabyte.motherbrain.flow.config.OperationConfig;
import com.zillabyte.motherbrain.flow.operations.Function;
import com.zillabyte.motherbrain.flow.operations.OperationException;
import com.zillabyte.motherbrain.flow.operations.multilang.operations.MultilangHandler;
import com.zillabyte.motherbrain.universe.Config;

/**
 * Operation that suppresses repeated tuples within a batch.
 *
 * <p>One Bloom filter is kept per batch id. A tuple is emitted only if the
 * filter for its batch reports that it has not been seen; the tuple is then
 * recorded in the filter. Because a Bloom filter can report false positives,
 * a tuple that was never seen may occasionally be dropped.
 *
 * <p>When unique fields are supplied, only those fields take part in the
 * comparison; otherwise the string form of {@code MapTuple.values()} is used.
 */
public class Unique extends Function {

  private static final long serialVersionUID = -1116914507247257811L;

  /** Fixed charset so the hashed bytes do not depend on the JVM's platform default. */
  private static final Charset FUNNEL_CHARSET = StandardCharsets.UTF_8;

  /** Expected number of insertions used to size each Bloom filter. */
  private Integer _expectedSize = Config.getOrDefault("unique.bloom.filter.size", 5_000_000);

  /** Bloom filters keyed by batch id; created in {@link #prepare()}. */
  private transient Map<Object, BloomFilter<MapTuple>> _filters = null;

  /** Fields that define uniqueness, or {@code null} to use every value of the tuple. */
  private Fields _uniqueFields = null;

  /**
   * Creates the operation.
   *
   * @param name   operation name, passed to the superclass
   * @param config operation configuration, passed to the superclass
   * @param fields fields that define uniqueness; when {@code null} or empty the
   *               whole tuple is compared and no field routing is configured
   */
  public Unique(String name, OperationConfig config, Fields fields) {
    super(name, config);
    if (fields != null && fields.size() > 0) {
      _uniqueFields = fields;
      setIncomingRouteByFields(fields);
    }
  }

  /**
   * Creates the operation from its JSON description.
   *
   * <p>Reads {@code name} and {@code group_fields} from the node. If the node
   * has a {@code config} object containing {@code expected_size}, that value
   * overrides the configured default filter size.
   *
   * @param node JSON description of the operation
   */
  public Unique(JSONObject node) {
    this(
        node.getString("name"),
        MultilangHandler.getConfig(node),
        new Fields(node.getJSONArray("group_fields")));
    if (node.has("config")) {
      JSONObject config = node.getJSONObject("config");
      if (config.has("expected_size")) {
        _expectedSize = config.getInt("expected_size");
      }
    }
  }

  /**
   * Feeds the uniqueness key of a tuple into a hash sink.
   *
   * <p>Declared as a static nested class so that it holds only the field list
   * and not a reference to the enclosing operation.
   */
  private static final class TupleFunnel implements Funnel<MapTuple> {

    private static final long serialVersionUID = 1L;

    /** Fields to hash, or {@code null} to hash the string form of all values. */
    private final Fields _fields;

    TupleFunnel(Fields fields) {
      _fields = fields;
    }

    @Override
    public void funnel(MapTuple from, PrimitiveSink into) {
      if (_fields == null) {
        into.putString(from.values().toString(), FUNNEL_CHARSET);
        return;
      }
      for (String f : _fields) {
        Object value = from.get(f);
        if (value == null) {
          // Distinct marker: a real value never has a negative length.
          into.putInt(-1);
        } else {
          String text = value.toString();
          // Length prefix keeps ("ab","c") and ("a","bc") from producing the same bytes.
          into.putInt(text.length());
          into.putString(text, FUNNEL_CHARSET);
        }
      }
    }
  }

  /**
   * Returns the Bloom filter for the given batch, creating it on first use.
   *
   * @param batch batch id used as the map key
   * @return the filter associated with {@code batch}
   */
  private BloomFilter<MapTuple> getFilter(Object batch) {
    BloomFilter<MapTuple> filter = _filters.get(batch);
    if (filter == null) {
      logger().info("Creating unique filter with max expected capacity of: " + _expectedSize);
      filter = BloomFilter.create(new TupleFunnel(_uniqueFields), _expectedSize);
      _filters.put(batch, filter);
    }
    return filter;
  }

  /** Initializes the (transient) per-batch filter map. */
  @Override
  public void prepare() {
    _filters = Maps.newHashMap();
  }

  /**
   * Limits parallelism to 1 when no unique fields are set; otherwise defers to
   * the superclass.
   */
  @Override
  public int getMaxParallelism() {
    return (this._uniqueFields == null) ? 1 : super.getMaxParallelism();
  }

  /** Discards the filter of a finished batch. */
  @Override
  public void onThisBatchCompleted(Object batchId) {
    _filters.remove(batchId);
  }

  /**
   * Emits the tuple unless the current batch's filter reports it as already seen.
   *
   * @param t incoming tuple
   * @param c collector that supplies the current batch id and receives emitted tuples
   */
  @Override
  protected void process(MapTuple t, OutputCollector c) throws OperationException, InterruptedException {
    BloomFilter<MapTuple> filter = getFilter(c.getCurrentBatch());
    if (filter.mightContain(t)) {
      // Probably a duplicate (or a Bloom-filter false positive): drop it.
      return;
    }
    filter.put(t);
    c.emit(t);
  }
}