package org.gbif.occurrence.persistence; import org.gbif.dwc.terms.GbifTerm; import org.gbif.occurrence.persistence.api.DatasetDeletionService; import org.gbif.occurrence.persistence.api.OccurrenceKeyPersistenceService; import org.gbif.occurrence.persistence.api.OccurrencePersistenceService; import org.gbif.occurrence.persistence.hbase.Columns; import java.util.Iterator; import java.util.List; import java.util.UUID; import com.google.common.collect.Lists; import com.google.inject.Inject; import com.google.inject.Singleton; import org.apache.hadoop.hbase.util.Bytes; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static com.google.common.base.Preconditions.checkNotNull; /** * An implementation of the DatasetDeletionService for deletion of datasets in HBase. */ @Singleton public class DatasetDeletionServiceImpl implements DatasetDeletionService { private static final Logger LOG = LoggerFactory.getLogger(DatasetDeletionServiceImpl.class); public static final int KEYS_BATCH_SIZE = 100000; private final OccurrencePersistenceService occurrenceService; private final OccurrenceKeyPersistenceService occurrenceKeyService; @Inject public DatasetDeletionServiceImpl(OccurrencePersistenceService occurrenceService, OccurrenceKeyPersistenceService occurrenceKeyService) { this.occurrenceService = checkNotNull(occurrenceService, "occurrenceService can't be null"); this.occurrenceKeyService = checkNotNull(occurrenceKeyService, "occurrenceKeyService can't be null"); } @Override public void deleteDataset(UUID datasetKey) { checkNotNull(datasetKey, "datasetKey can't be null"); LOG.debug("Deleting dataset for datasetKey [{}]", datasetKey); // lookup all occurrence ids from lookup table using dataset prefix // deleteByColumn(Bytes.toBytes(datasetKey.toString()), GbifTerm.datasetKey); LOG.debug("Completed deletion of dataset for datasetKey [{}]", datasetKey); } /** * Deletes both the secondary indexes ("lookups") as well as the occurrence proper for the scan that matches the * given column and value. Note any exceptions thrown during the deletions will cause this method to fail, leaving * deletions in an incomplete state. * * @param columnValue value to match * @param column interpreted column on which to match values */ private void deleteByColumn(byte[] columnValue, GbifTerm column) { LOG.debug("Starting delete by column for [{}]", column); int deleteCount = 0; Iterator<Integer> keyIterator = occurrenceService.getKeysByColumn(columnValue, Columns.column(column)); List<Integer> keys = Lists.newArrayListWithCapacity(KEYS_BATCH_SIZE); while (keyIterator.hasNext()) { int key = keyIterator.next(); // TODO: this is critical, but causes extreme performance problems (full scan of lookups per deleted key) occurrenceKeyService.deleteKey(key, null); keys.add(key); if ((keys.size() % KEYS_BATCH_SIZE) == 0) { LOG.debug("Writing batch of [{}] deletes", keys.size()); occurrenceService.delete(keys); deleteCount += keys.size(); keys = Lists.newArrayListWithCapacity(KEYS_BATCH_SIZE); } } LOG.debug("Writing batch of [{}] deletes", keys.size()); occurrenceService.delete(keys); deleteCount += keys.size(); LOG.debug("Finished delete by column for [{}] giving [{}] total rows deleted", column, deleteCount); } }