package org.gbif.occurrence.cli.crawl;

import org.gbif.api.model.common.paging.PagingResponse;
import org.gbif.api.model.common.search.SearchResponse;
import org.gbif.api.model.crawler.DatasetProcessStatus;
import org.gbif.api.model.crawler.FinishReason;
import org.gbif.api.model.occurrence.Occurrence;
import org.gbif.api.model.occurrence.search.OccurrenceSearchParameter;
import org.gbif.api.model.occurrence.search.OccurrenceSearchRequest;
import org.gbif.api.service.occurrence.OccurrenceSearchService;
import org.gbif.api.service.registry.DatasetProcessStatusService;
import org.gbif.occurrence.ws.client.OccurrenceWsClientModule;
import org.gbif.registry.ws.client.guice.RegistryWsClientModule;
import org.gbif.ws.client.guice.AnonymousAuthModule;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Properties;
import java.util.UUID;
import java.util.function.Consumer;
import java.util.function.Function;

import com.google.inject.Guice;
import com.google.inject.Injector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Service that checks for previous crawls and sends delete messages if predefined conditions are met.
 */
public class PreviousCrawlsManagerService {

  private static final Logger LOG = LoggerFactory.getLogger(PreviousCrawlsManagerService.class);

  private static final String SQL_WITH_CLAUSE = "WITH t1 AS (" +
          " SELECT datasetKey, count(DISTINCT crawlID) crawlCount" +
          " FROM %s" +
          " WHERE protocol = 'DWC_ARCHIVE'" +
          " GROUP BY datasetKey" +
          " HAVING crawlCount > 1 )";

  private static final String SQL_QUERY = " SELECT o.datasetKey, o.crawlId, count(*) AS crawlCount" +
          " FROM %s o JOIN t1 ON o.datasetKey = t1.datasetKey" +
          " GROUP BY o.datasetKey, o.crawlId" +
          " ORDER BY o.datasetKey, o.crawlId";

  private static final String SQL_QUERY_SINGLE_DATASET = "SELECT datasetkey, crawlid, count(*) AS crawlCount" +
          " FROM %s WHERE datasetkey = ?" +
          " GROUP BY datasetkey, crawlid";
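  /*
   * For illustration only (the real table name is taken from the configuration at runtime): with a
   * hypothetical table named occurrence_hdfs, getSqlCommand below expands to roughly:
   *
   *   WITH t1 AS (
   *     SELECT datasetKey, count(DISTINCT crawlID) crawlCount
   *     FROM occurrence_hdfs
   *     WHERE protocol = 'DWC_ARCHIVE'
   *     GROUP BY datasetKey
   *     HAVING crawlCount > 1 )
   *   SELECT o.datasetKey, o.crawlId, count(*) AS crawlCount
   *   FROM occurrence_hdfs o JOIN t1 ON o.datasetKey = t1.datasetKey
   *   GROUP BY o.datasetKey, o.crawlId
   *   ORDER BY o.datasetKey, o.crawlId
   *
   * i.e. per-crawl record counts, restricted to datasets that have records from more than one crawl.
   */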
  private static final Function<String, String> getSqlCommand =
          tableName -> String.format(SQL_WITH_CLAUSE, tableName) + String.format(SQL_QUERY, tableName);

  private static final Function<String, String> getSqlCommandSingleDataset =
          tableName -> String.format(SQL_QUERY_SINGLE_DATASET, tableName);

  private final PreviousCrawlsManagerConfiguration config;
  private final DeletePreviousCrawlsService deletePreviousCrawlsService;

  private DatasetProcessStatusService datasetProcessStatusService;
  private OccurrenceSearchService occurrenceSearchService;

  public PreviousCrawlsManagerService(PreviousCrawlsManagerConfiguration config,
                                      DeletePreviousCrawlsService deletePreviousCrawlsService) {
    this.config = config;
    this.deletePreviousCrawlsService = deletePreviousCrawlsService;
  }

  /**
   * Builds the Registry and Occurrence web service clients.
   */
  private void prepare() {
    Properties properties = new Properties();
    properties.setProperty("registry.ws.url", config.registryWsUrl);
    // the occurrence WS client is configured with the same base URL as the registry WS
    properties.setProperty("occurrence.ws.url", config.registryWsUrl);
    properties.setProperty("httpTimeout", "30000");

    Injector injector = Guice.createInjector(
            new RegistryWsClientModule(properties),
            new AnonymousAuthModule(),
            new OccurrenceWsClientModule(properties));
    datasetProcessStatusService = injector.getInstance(DatasetProcessStatusService.class);
    occurrenceSearchService = injector.getInstance(OccurrenceSearchService.class);
  }

  /**
   * Starts the service.
   *
   * @param resultHandler handler used to serialize the results
   */
  public void start(Consumer<Object> resultHandler) {
    prepare();
    Object report;
    if (config.datasetKey != null) {
      report = runOnSingleDataset(UUID.fromString(config.datasetKey));
    } else {
      report = getAllDatasetWithMoreThanOneCrawl();
    }
    resultHandler.accept(report);
  }

  private DatasetRecordCountInfo runOnSingleDataset(UUID datasetKey) {
    DatasetRecordCountInfo datasetRecordCountInfo = getDatasetCrawlInfo(datasetKey);
    if (shouldRunDeletion(datasetRecordCountInfo)) {
      int numberOfMessagesEmitted = deletePreviousCrawlsService.deleteOccurrenceInPreviousCrawls(datasetKey,
              datasetRecordCountInfo.getLastCompleteCrawlId());
      LOG.info("Number of delete messages emitted: {}", numberOfMessagesEmitted);
    }
    return datasetRecordCountInfo;
  }

  /**
   * Based on the {@link PreviousCrawlsManagerConfiguration} and a {@link DatasetRecordCountInfo}, decides if we
   * should trigger the deletion of occurrence records that belong to previous crawl(s).
   *
   * @param datasetRecordCountInfo record counts of the dataset to check
   *
   * @return true if the deletion should run, false otherwise
   */
  private boolean shouldRunDeletion(DatasetRecordCountInfo datasetRecordCountInfo) {
    if (!config.delete) {
      return false;
    }

    if (datasetRecordCountInfo.getDiffSolrLastCrawlPercentage() > config.automaticRecordDeletionThreshold) {
      LOG.info("No automatic deletion. Percentage of records to remove ({}) higher than the configured threshold ({}).",
              datasetRecordCountInfo.getDiffSolrLastCrawlPercentage(), config.automaticRecordDeletionThreshold);
      return false;
    }
    return true;
  }
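  /*
   * Worked example for shouldRunDeletion, with hypothetical numbers: if Solr holds 1000 records for a
   * dataset and its last complete crawl processed 900 fragments, then getDiffSolrLastCrawl() = 100 and
   * getDiffSolrLastCrawlPercentage() = 100 / 1000 * 100 = 10. With an automaticRecordDeletionThreshold
   * of 30, 10 <= 30, so the deletion of records from previous crawls is triggered (provided
   * config.delete is set).
   */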
  /**
   * Gets the {@link DatasetRecordCountInfo}, including its list of {@link DatasetCrawlInfo}, for a single dataset.
   *
   * @param datasetKey key of the dataset to check
   *
   * @return the {@link DatasetRecordCountInfo} of the dataset
   */
  private DatasetRecordCountInfo getDatasetCrawlInfo(UUID datasetKey) {
    DatasetRecordCountInfo datasetRecordCountInfo = getDatasetRecordCountInfo(datasetKey);
    List<DatasetCrawlInfo> datasetCrawlInfoList = new ArrayList<>();
    datasetRecordCountInfo.setCrawlInfo(datasetCrawlInfoList);

    try (Connection conn = config.hive.buildHiveConnection();
         PreparedStatement stmt = conn.prepareStatement(getSqlCommandSingleDataset.apply(config.hiveOccurrenceTable))) {
      stmt.setString(1, datasetKey.toString());
      try (ResultSet rs = stmt.executeQuery()) {
        while (rs.next()) {
          // column 2 = crawlId, column 3 = crawlCount
          datasetCrawlInfoList.add(new DatasetCrawlInfo(rs.getInt(2), rs.getInt(3)));
        }
      }
    } catch (SQLException e) {
      LOG.error("Error while getting crawl information for dataset {}", datasetKey, e);
    }
    return datasetRecordCountInfo;
  }

  /**
   * Gets a {@link DatasetRecordCountInfo} for each dataset that has records coming from more than one crawl.
   *
   * @return map of dataset key to {@link DatasetRecordCountInfo}
   */
  private Map<UUID, DatasetRecordCountInfo> getAllDatasetWithMoreThanOneCrawl() {
    String sql = getSqlCommand.apply(config.hiveOccurrenceTable);
    Map<UUID, DatasetRecordCountInfo> crawlInfo = new HashMap<>();
    try (Connection conn = config.hive.buildHiveConnection();
         Statement stmt = conn.createStatement();
         ResultSet rs = stmt.executeQuery(sql)) {
      UUID currentDatasetKey = null;
      DatasetRecordCountInfo currentDatasetRecordCountInfo = null;
      List<DatasetCrawlInfo> currentDatasetCrawlInfoList = new ArrayList<>();
      // rows are ordered by datasetKey, so a change of key starts a new DatasetRecordCountInfo
      while (rs.next()) {
        if (!UUID.fromString(rs.getString(1)).equals(currentDatasetKey)) {
          currentDatasetKey = UUID.fromString(rs.getString(1));
          currentDatasetCrawlInfoList = new ArrayList<>();
          currentDatasetRecordCountInfo = getDatasetRecordCountInfo(currentDatasetKey);
          currentDatasetRecordCountInfo.setCrawlInfo(currentDatasetCrawlInfoList);
          crawlInfo.put(currentDatasetKey, currentDatasetRecordCountInfo);
        }
        currentDatasetCrawlInfoList.add(new DatasetCrawlInfo(rs.getInt(2), rs.getInt(3)));
      }
    } catch (SQLException e) {
      LOG.error("Error while generating the crawls report", e);
    }
    return crawlInfo;
  }
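  /*
   * Illustration of the grouping in getAllDatasetWithMoreThanOneCrawl, with hypothetical values.
   * The report query returns one row per (datasetKey, crawlId):
   *
   *   datasetKey                            crawlId  crawlCount
   *   59096d20-0000-0000-0000-000000000000  1        1000
   *   59096d20-0000-0000-0000-000000000000  2        980
   *
   * Both rows are folded into a single DatasetRecordCountInfo holding two DatasetCrawlInfo entries.
   */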
  /**
   * Loads information related to the Solr count and the last crawls of a dataset.
   *
   * @param datasetKey key of the dataset to check
   *
   * @return a new {@link DatasetRecordCountInfo}, never null
   */
  private DatasetRecordCountInfo getDatasetRecordCountInfo(UUID datasetKey) {
    DatasetRecordCountInfo datasetRecordCountInfo = new DatasetRecordCountInfo(datasetKey);

    // get the record count from Solr
    OccurrenceSearchRequest osReq = new OccurrenceSearchRequest();
    osReq.addDatasetKeyFilter(datasetKey);
    osReq.setLimit(1);
    SearchResponse<Occurrence, OccurrenceSearchParameter> occResponse = occurrenceSearchService.search(osReq);
    datasetRecordCountInfo.setSolrCount(occResponse.getCount() != null ? occResponse.getCount() : 0);

    // check the crawl status and try to get the latest successful crawl
    PagingResponse<DatasetProcessStatus> processStatus =
            datasetProcessStatusService.listDatasetProcessStatus(datasetKey, null);
    Optional<DatasetProcessStatus> lastCompletedCrawl = processStatus.getResults()
            .stream()
            .filter(dps -> FinishReason.NORMAL == dps.getFinishReason() && dps.getPagesFragmentedSuccessful() > 0)
            .findFirst();

    if (lastCompletedCrawl.isPresent()) {
      DatasetProcessStatus datasetProcessStatus = lastCompletedCrawl.get();
      datasetRecordCountInfo.setLastCompleteCrawlId(datasetProcessStatus.getCrawlJob().getAttempt());
      datasetRecordCountInfo.setFragmentEmittedCount(datasetProcessStatus.getFragmentsEmitted());
      datasetRecordCountInfo.setFragmentProcessCount(datasetProcessStatus.getFragmentsProcessed());
    }
    return datasetRecordCountInfo;
  }

  /**
   * Record counts of a dataset as seen by Solr and by the crawl history.
   */
  private static class DatasetRecordCountInfo {

    private UUID datasetKey;
    private int lastCompleteCrawlId;
    private long fragmentEmittedCount;
    private long fragmentProcessCount;
    private long solrCount;
    private List<DatasetCrawlInfo> crawlInfo;

    public DatasetRecordCountInfo() {}

    public DatasetRecordCountInfo(UUID datasetKey) {
      this.datasetKey = datasetKey;
    }

    public List<DatasetCrawlInfo> getCrawlInfo() {
      return crawlInfo;
    }

    public void setCrawlInfo(List<DatasetCrawlInfo> crawlInfo) {
      this.crawlInfo = crawlInfo;
    }

    public UUID getDatasetKey() {
      return datasetKey;
    }

    public void setDatasetKey(UUID datasetKey) {
      this.datasetKey = datasetKey;
    }

    public int getLastCompleteCrawlId() {
      return lastCompleteCrawlId;
    }

    public void setLastCompleteCrawlId(int lastCompleteCrawlId) {
      this.lastCompleteCrawlId = lastCompleteCrawlId;
    }

    public long getFragmentEmittedCount() {
      return fragmentEmittedCount;
    }

    public void setFragmentEmittedCount(long fragmentEmittedCount) {
      this.fragmentEmittedCount = fragmentEmittedCount;
    }

    public long getFragmentProcessCount() {
      return fragmentProcessCount;
    }

    public void setFragmentProcessCount(long fragmentProcessCount) {
      this.fragmentProcessCount = fragmentProcessCount;
    }

    public long getSolrCount() {
      return solrCount;
    }

    public void setSolrCount(long solrCount) {
      this.solrCount = solrCount;
    }

    /**
     * Number of records in Solr that are not accounted for by the last complete crawl.
     */
    public long getDiffSolrLastCrawl() {
      return solrCount - fragmentProcessCount;
    }

    /**
     * Sum of the record counts of all crawls except the last complete one.
     */
    public long getSumAllPreviousCrawl() {
      if (crawlInfo == null || crawlInfo.isEmpty()) {
        return 0;
      }
      return crawlInfo.stream()
              .filter(dci -> dci.getCrawlId() != lastCompleteCrawlId)
              .mapToLong(DatasetCrawlInfo::getCount)
              .sum();
    }

    /**
     * {@link #getDiffSolrLastCrawl()} expressed as a percentage of the Solr count.
     */
    public double getDiffSolrLastCrawlPercentage() {
      if (solrCount == 0) {
        return 0;
      }
      return (double) getDiffSolrLastCrawl() / (double) solrCount * 100d;
    }
  }
}