package org.gbif.occurrence.ws.resources; import org.gbif.api.model.occurrence.Occurrence; import org.gbif.api.model.registry.Organization; import org.gbif.api.service.occurrence.OccurrenceService; import org.gbif.api.service.registry.OrganizationService; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.Random; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.regex.Pattern; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.inject.Inject; import com.google.inject.name.Named; import com.yammer.metrics.Metrics; import com.yammer.metrics.core.Meter; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.util.Bytes; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A reader that scans the featured occurrence table, and produces the list of the occurrences to feature. * Only occurrences with scientific names are featured. * Class is public to allow Guice wiring, but can only be used in this package. */ public class FeaturedOccurrenceReader { private static final byte[] CF = Bytes.toBytes("o"); private static final byte[] COL = Bytes.toBytes("v"); private static final Pattern COMMA_PATTERN = Pattern.compile(","); private static final Logger LOG = LoggerFactory.getLogger(FeaturedOccurrenceReader.class); private static final int PAGE_SIZE = 50; // The randomization modulus value used when creating the sampled data. See the README. private static final int RANDOMIZATION_MODULUS = 5; private final OccurrenceService occurrenceService; private final OrganizationService organizationService; private final Connection connection; private final TableName tableName; private final Random random = new Random(); private final Meter requests = Metrics.newMeter(FeaturedOccurrenceReader.class, "requests", "requests", TimeUnit.SECONDS); private final Meter missingOccurrences = Metrics.newMeter(FeaturedOccurrenceReader.class, "missingOccurrences", "missingOccurrences", TimeUnit.SECONDS); private final Meter registryFailures = Metrics.newMeter(FeaturedOccurrenceReader.class, "registryFailures", "registryFailures", TimeUnit.SECONDS); @Inject public FeaturedOccurrenceReader(OccurrenceService occurrenceService, @Named("featured_table_pool") Connection connection, @Named("featured_table_name") String tableName, OrganizationService organizationService) { this.occurrenceService = occurrenceService; this.connection = connection; this.tableName = TableName.valueOf(tableName); this.organizationService = organizationService; } private Result getRow(String key) throws IOException { try (Table htable = connection.getTable(tableName)){ return htable.get(new Get(Bytes.toBytes(key))); } } private void appendOccurrence(List<Occurrence> results, String key) throws IOException { Result row = getRow(key); String[] occurrenceIds = COMMA_PATTERN.split(Bytes.toString(row.getValue(CF, COL))); int randomIndex = random.nextInt(occurrenceIds.length); Occurrence occ = occurrenceService.get(Integer.parseInt(occurrenceIds[randomIndex])); if (occ == null) { missingOccurrences.mark(); // keep logging low - issues will become apparent quickly LOG.debug("No featured occurrence record found for key: {}", occurrenceIds[randomIndex]); } else { // filter for only good ones if (occ.getScientificName() != null && occ.getDecimalLatitude() != null && occ.getDecimalLongitude() != null && !occ.hasSpatialIssue() && occ.getPublishingOrgKey() != null) { results.add(occ); } } } /** * Converts the results into featured occurrences. */ private List<FeaturedOccurrence> asFeatured(List<Occurrence> results) { List<FeaturedOccurrence> featured = Lists.newArrayList(); Map<UUID, Organization> orgCache = Maps.newHashMap(); // reduce registry calls for (Occurrence o : results) { try { Organization org = orgCache.get(o.getPublishingOrgKey()); if (org == null) { org = organizationService.get(o.getPublishingOrgKey()); if (org != null) { orgCache.put(org.getKey(), org); } else { LOG.warn("Suspicious that registry reports no org[" + o.getPublishingOrgKey() + "] for occurrence[" + o.getKey() + "]"); registryFailures.mark(); // not communication, but still erroneous continue; } } if (org != null) { featured.add(new FeaturedOccurrence(o.getKey(), o.getDecimalLatitude(), o.getDecimalLongitude(), o.getScientificName(), org.getTitle(), org.getKey(), o.getLastInterpreted())); } } catch (Exception e) { registryFailures.mark(); LOG.error("Unable to read organizaton[{}] from registry", o.getPublishingOrgKey(), e); } } return featured; } /** * Generates a collection of points to use. * Deliberately package only visible. */ List<FeaturedOccurrence> featuredOccurrences() throws IOException { requests.mark(); List<Occurrence> results = Lists.newArrayList(); for (int i = 0; i < PAGE_SIZE; i++) { // randomly pick a cell, from which we then randomly select an occurrence // this gives a reasonable effect, String randomKey = random.nextInt(RANDOMIZATION_MODULUS) + ":" + random.nextInt(RANDOMIZATION_MODULUS); appendOccurrence(results, randomKey); } return asFeatured(results); } }