package org.gbif.occurrence.cli.registry.sync; import org.gbif.api.model.occurrence.Occurrence; import org.gbif.api.model.registry.Dataset; import org.gbif.api.model.registry.Organization; import org.gbif.common.messaging.api.messages.DeleteDatasetOccurrencesMessage; import org.gbif.common.messaging.api.messages.OccurrenceDeletionReason; import org.gbif.common.messaging.api.messages.OccurrenceMutatedMessage; import java.io.IOException; import java.util.Set; import java.util.UUID; import java.util.concurrent.ExecutionException; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; import com.google.common.collect.Sets; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.util.Bytes; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A MapReduce Mapper that synchronizes all occurrences (called with this mapper) with the registry. */ public class OccurrenceRegistryMapper extends AbstractOccurrenceRegistryMapper { private static final Logger LOG = LoggerFactory.getLogger(OccurrenceRegistryMapper.class); private static final int MAX_DATASET_CACHE = 1000; private static final int MAX_ORGANIZATION_CACHE = 1000; private LoadingCache<UUID, Dataset> datasetCache; private LoadingCache<UUID, Organization> organizationCache; private static final Set<UUID> DELETED_DATASETS = Sets.newHashSet(); private int numRecords = 0; @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); datasetCache = CacheBuilder.newBuilder() .maximumSize(MAX_DATASET_CACHE) .build( new CacheLoader<UUID, Dataset>() { public Dataset load(UUID datasetKey) { return datasetService.get(datasetKey); } }); organizationCache = CacheBuilder.newBuilder() .maximumSize(MAX_ORGANIZATION_CACHE) .build( new CacheLoader<UUID, Organization>() { public Organization load(UUID key) { return orgService.get(key); } }); } @Override public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException { UUID datasetKey = UUID.fromString(Bytes.toString(values.getValue(SyncCommon.OCC_CF, SyncCommon.DK_COL))); if (DELETED_DATASETS.contains(datasetKey)) { return; } try { Dataset dataset = datasetCache.get(datasetKey); if (dataset.getDeleted() != null) { DELETED_DATASETS.add(datasetKey); try { LOG.info("Sending delete dataset message for dataset [{}]", datasetKey); messagePublisher.send(new DeleteDatasetOccurrencesMessage(datasetKey, OccurrenceDeletionReason.DATASET_MANUAL)); } catch (IOException e) { LOG.warn("Failed to send update message", e); } return; } Organization publishingOrg = organizationCache.get(dataset.getPublishingOrganizationKey()); if (occurrenceMutator.requiresUpdate(dataset, publishingOrg, values)) { Occurrence origOcc = occurrencePersistenceService.get(Bytes.toInt(row.get())); // we have no clone or other easy copy method Occurrence updatedOcc = occurrencePersistenceService.get(Bytes.toInt(row.get())); occurrenceMutator.mutateOccurrence(updatedOcc, dataset, publishingOrg); occurrencePersistenceService.update(updatedOcc); int crawlId = Bytes.toInt(values.getValue(SyncCommon.OCC_CF, SyncCommon.CI_COL)); OccurrenceMutatedMessage msg = OccurrenceMutatedMessage.buildUpdateMessage(datasetKey, origOcc, updatedOcc, crawlId); try { //TODO use generateUpdateMessage LOG.debug( "Sending update for key [{}], publishing org changed from [{}] to [{}] and host country from [{}] to [{}]", datasetKey, origOcc.getPublishingOrgKey(), updatedOcc.getPublishingOrgKey(), origOcc.getPublishingCountry(), updatedOcc.getPublishingCountry()); messagePublisher.send(msg); } catch (IOException e) { LOG.warn("Failed to send update message", e); } } numRecords++; if (numRecords % 10000 == 0) { context.setStatus("mapper processed " + numRecords + " records so far"); } } catch (ExecutionException e) { LOG.warn("Failed to get Dataset/Organization data for datasetKey {}", datasetKey, e); } } }