package gobblin.compaction.verify;
import com.google.common.base.Splitter;
import gobblin.compaction.audit.AuditCountClient;
import gobblin.compaction.audit.AuditCountClientFactory;
import gobblin.compaction.mapreduce.MRCompactor;
import gobblin.compaction.parser.CompactionPathParser;
import gobblin.configuration.State;
import gobblin.dataset.FileSystemDataset;
import gobblin.util.ClassAliasResolver;
import lombok.extern.slf4j.Slf4j;
import org.joda.time.DateTime;
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Map;
/**
* Use {@link AuditCountClient} to retrieve all record count across different tiers
* Compare one specific tier (gobblin-tier) with all other refernce tiers and determine
* if verification should be passed based on a pre-defined threshold.
*/
@Slf4j
public class CompactionAuditCountVerifier implements CompactionVerifier<FileSystemDataset> {
public static final String COMPACTION_COMPLETENESS_THRESHOLD = MRCompactor.COMPACTION_PREFIX + "completeness.threshold";
public static final double DEFAULT_COMPACTION_COMPLETENESS_THRESHOLD = 0.99;
public static final String PRODUCER_TIER = "producer.tier";
public static final String ORIGIN_TIER = "origin.tier";
public static final String GOBBLIN_TIER = "gobblin.tier";
private Collection<String> referenceTiers;
private Collection<String> originTiers;
private String producerTier;
private String gobblinTier;
private double threshold;
private final State state;
private final AuditCountClient auditCountClient;
/**
* Constructor with default audit count client
*/
public CompactionAuditCountVerifier (State state) {
this (state, getClientFactory (state).createAuditCountClient(state));
}
/**
* Constructor with user specified audit count client
*/
public CompactionAuditCountVerifier (State state, AuditCountClient client) {
this.auditCountClient = client;
this.state = state;
// retrieve all tiers information
if (client != null) {
this.threshold =
state.getPropAsDouble(COMPACTION_COMPLETENESS_THRESHOLD, DEFAULT_COMPACTION_COMPLETENESS_THRESHOLD);
this.producerTier = state.getProp(PRODUCER_TIER);
this.gobblinTier = state.getProp(GOBBLIN_TIER);
this.originTiers = Splitter.on(",").omitEmptyStrings().trimResults().splitToList(state.getProp(ORIGIN_TIER));
this.referenceTiers = new HashSet<>(originTiers);
this.referenceTiers.add(producerTier);
}
}
/**
* Obtain a client factory
* @param state job state
* @return a factory which creates {@link AuditCountClient}.
* If no factory is set or an error occurred, a {@link EmptyAuditCountClientFactory} is
* returned which creates a <code>null</code> {@link AuditCountClient}
*/
private static AuditCountClientFactory getClientFactory (State state) {
if (!state.contains(AuditCountClientFactory.AUDIT_COUNT_CLIENT_FACTORY)) {
return new EmptyAuditCountClientFactory ();
}
try {
String factoryName = state.getProp(AuditCountClientFactory.AUDIT_COUNT_CLIENT_FACTORY);
ClassAliasResolver<AuditCountClientFactory> conditionClassAliasResolver = new ClassAliasResolver<>(AuditCountClientFactory.class);
AuditCountClientFactory factory = conditionClassAliasResolver.resolveClass(factoryName).newInstance();
return factory;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
* Verify a specific dataset by following below steps
* 1) Retrieve a tier-to-count mapping
* 2) Read count from {@link CompactionAuditCountVerifier#gobblinTier}
* 3) Read count from all other {@link CompactionAuditCountVerifier#referenceTiers}
* 4) Compare count retrieved from steps 2) and 3), if any of (gobblin/refenence) >= threshold, return true, else return false
* @param dataset Dataset needs to be verified
* @return If verification is succeeded
*/
public boolean verify (FileSystemDataset dataset) {
if (auditCountClient == null) {
log.debug("No audit count client specified, skipped");
return true;
}
CompactionPathParser.CompactionParserResult result = new CompactionPathParser(this.state).parse(dataset);
DateTime startTime = result.getTime();
DateTime endTime = startTime.plusHours(1);
String datasetName = result.getDatasetName();
try {
Map<String, Long> countsByTier = auditCountClient.fetch (datasetName, startTime.getMillis(), endTime.getMillis());
for (String tier: referenceTiers) {
if (passed (datasetName, countsByTier, tier)) {
return true;
}
}
} catch (IOException e) {
log.error(e.toString());
}
return false;
}
/**
* Compare record count between {@link CompactionAuditCountVerifier#gobblinTier} and {@link CompactionAuditCountVerifier#referenceTiers}.
* @param datasetName the name of dataset
* @param countsByTier the tier-to-count mapping retrieved by {@link AuditCountClient#fetch(String, long, long)}
* @param referenceTier the tiers we wants to compare against
* @return If any of (gobblin/refenence) >= threshold, return true, else return false
*/
private boolean passed (String datasetName, Map<String, Long> countsByTier, String referenceTier) {
if (!countsByTier.containsKey(this.gobblinTier)) {
log.warn(String
.format("Failed to get audit count for topic %s, tier %s", datasetName, this.gobblinTier));
return false;
}
if (!countsByTier.containsKey(referenceTier)) {
log.warn(String.format("Failed to get audit count for topic %s, tier %s", datasetName, referenceTier));
return false;
}
long originCount = countsByTier.get(referenceTier);
long gobblinCount = countsByTier.get(this.gobblinTier);
if ((double) gobblinCount / (double) originCount < this.threshold) {
log.warn(String.format("Verification failed for %s : gobblin count = %d, originCount count = %d (%f)",
datasetName, gobblinCount, originCount, (double) gobblinCount / (double) originCount));
return false;
}
return true;
}
public String getName() {
return this.getClass().getName() + "(" + this.auditCountClient.getClass().getName() + ")";
}
private static class EmptyAuditCountClientFactory implements AuditCountClientFactory {
public AuditCountClient createAuditCountClient (State state) {
return null;
}
}
}