package org.gbif.occurrence.download.file; import org.gbif.api.vocabulary.Extension; import org.gbif.api.vocabulary.OccurrenceIssue; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; import org.gbif.occurrence.common.TermUtils; import org.gbif.occurrence.common.download.DownloadUtils; import org.gbif.occurrence.common.json.MediaSerDeserUtils; import org.gbif.occurrence.download.inject.DownloadWorkflowModule; import org.gbif.occurrence.persistence.hbase.Columns; import org.gbif.occurrence.persistence.hbase.ExtResultReader; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.Map; import java.util.Set; import javax.annotation.Nonnull; import javax.annotation.Nullable; import com.beust.jcommander.internal.Sets; import com.google.common.base.Joiner; import com.google.common.base.Optional; import com.google.common.base.Preconditions; import com.google.inject.Inject; import com.google.inject.name.Named; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.util.Bytes; import static org.gbif.occurrence.common.download.DownloadUtils.DELIMETERS_MATCH_PATTERN; /** * Reads a occurrence record from HBase and return it in a Map<String,Object>. */ public class OccurrenceMapReader { private static final Joiner SEMICOLON_JOINER = Joiner.on(';').skipNulls(); private final String occurrenceTableName; private final Connection connection; /** * Utility to build an API Occurrence record as a Map<String,Object> from an HBase row. * * @return A complete occurrence, or null */ public static Map<String, String> buildInterpretedOccurrenceMap(@Nullable Result row) { if (row == null || row.isEmpty()) { return null; } else { Map<String, String> occurrence = new HashMap<String, String>(); for (Term term : TermUtils.interpretedTerms()) { if (TermUtils.isInterpretedDate(term)) { occurrence.put(term.simpleName(), toISO8601Date(ExtResultReader.getDate(row, term))); } else if (TermUtils.isInterpretedDouble(term)) { Double value = ExtResultReader.getDouble(row, term); occurrence.put(term.simpleName(), value != null ? value.toString() : null); } else if (TermUtils.isInterpretedNumerical(term)) { Integer value = ExtResultReader.getInteger(row, term); occurrence.put(term.simpleName(), value != null ? value.toString() : null); } else if (term == GbifTerm.issue) { occurrence.put(GbifTerm.issue.simpleName(), extractOccurrenceIssues(row)); } else if (term == GbifTerm.mediaType) { occurrence.put(GbifTerm.mediaType.simpleName(), extractMediaTypes(row)); } else if (!TermUtils.isComplexType(term)) { occurrence.put(term.simpleName(), getCleanString(row, term)); } } occurrence.put(GbifTerm.hasGeospatialIssues.simpleName(), Boolean.toString(hasGeospatialIssues(row))); occurrence.put(GbifTerm.hasCoordinate.simpleName(), Boolean.toString(occurrence.get(DwcTerm.decimalLatitude.simpleName()) != null && occurrence.get(DwcTerm.decimalLongitude.simpleName()) != null)); occurrence.put(GbifTerm.repatriated.simpleName(), getRepatriated(row).orNull()); return occurrence; } } /** * Utility to build an API Occurrence record as a Map<String,Object> from an HBase row. * * @return A complete occurrence, or null */ public static Map<String, String> buildOccurrenceMap(@Nullable Result row, Collection<Term> terms) { if (row == null || row.isEmpty()) { return null; } else { Map<String, String> occurrence = new HashMap<String, String>(); for (Term term : terms) { if (TermUtils.isInterpretedDate(term)) { occurrence.put(term.simpleName(), toISO8601Date(ExtResultReader.getDate(row, term))); } else if (TermUtils.isInterpretedDouble(term)) { Double value = ExtResultReader.getDouble(row, term); occurrence.put(term.simpleName(), value != null ? value.toString() : null); } else if (TermUtils.isInterpretedNumerical(term)) { Integer value = ExtResultReader.getInteger(row, term); occurrence.put(term.simpleName(), value != null ? value.toString() : null); } else if (term == GbifTerm.issue) { occurrence.put(GbifTerm.issue.simpleName(), extractOccurrenceIssues(row)); } else if (term == GbifTerm.mediaType) { occurrence.put(GbifTerm.mediaType.simpleName(), extractMediaTypes(row)); } else if (term == GbifTerm.hasGeospatialIssues) { occurrence.put(GbifTerm.hasGeospatialIssues.simpleName(), Boolean.toString(hasGeospatialIssues(row))); } else if (term == GbifTerm.hasCoordinate) { occurrence.put(GbifTerm.hasCoordinate.simpleName(), Boolean.toString(occurrence.get(DwcTerm.decimalLatitude.simpleName()) != null && occurrence.get(DwcTerm.decimalLongitude.simpleName()) != null)); } else if (term == GbifTerm.repatriated) { occurrence.put(GbifTerm.repatriated.simpleName(), getRepatriated(row).orNull()); } else if (!TermUtils.isComplexType(term)) { occurrence.put(term.simpleName(), getCleanString(row, term)); } } return occurrence; } } /** * Validates if the occurrence record it's a repatriated record. */ private static Optional<String> getRepatriated(Result result) { String publishingCountry = ExtResultReader.getString(result,Columns.column(GbifTerm.publishingCountry)); String countryCode = ExtResultReader.getString(result,Columns.column(DwcTerm.countryCode)); if (publishingCountry != null && countryCode != null) { return Optional.of(Boolean.toString(publishingCountry.equalsIgnoreCase(countryCode))); } return Optional.absent(); } /** * Extracts the media types from the hbase result. */ private static String extractMediaTypes(Result result) { Optional<byte[]> val = Optional.fromNullable(result.getValue(Columns.CF, Bytes.toBytes(Columns.column(Extension.MULTIMEDIA)))); return val.isPresent() ? SEMICOLON_JOINER.join(MediaSerDeserUtils.extractMediaTypes(val.get())) : ""; } /** * Extracts the spatial issues from the hbase result. */ private static String extractOccurrenceIssues(Result result) { Set<String> issues = Sets.newHashSet(); for (OccurrenceIssue issue : OccurrenceIssue.values()) { byte[] val = result.getValue(Columns.CF, Bytes.toBytes(Columns.column(issue))); if (val != null) { issues.add(issue.name()); } } return SEMICOLON_JOINER.join(issues); } /** * Extracts the spatial issues from the HBase result. */ private static Boolean hasGeospatialIssues(Result result) { for (OccurrenceIssue issue : OccurrenceIssue.GEOSPATIAL_RULES) { String column = Columns.column(issue); byte[] val = result.getValue(Columns.CF, Bytes.toBytes(column)); if (val != null) { return true; } } return false; } /** * Utility to build an API Occurrence from an HBase row. * * @return A complete occurrence, or null */ public static Map<String, String> buildVerbatimOccurrenceMap(@Nullable Result row) { if (row == null || row.isEmpty()) { return null; } Map<String, String> occurrence = new HashMap<String, String>(); for (Term term : TermUtils.verbatimTerms()) { occurrence.put(term.simpleName(), getCleanVerbatimString(row, term)); } return occurrence; } /** * Cleans specials characters from a string value. * Removes tabs, line breaks and new lines. */ public static String getCleanString(Result row, Term term) { return cleanString(Optional.fromNullable(ExtResultReader.getString(row, term))); } /** * Cleans specials characters from a string value. * Removes tabs, line breaks and new lines. */ public static String getCleanVerbatimString(Result row, Term term) { return cleanString(Optional.fromNullable(ExtResultReader.getString(row, Columns.verbatimColumn(term)))); } private static String cleanString(Optional<String> value) { return value.isPresent() ? DELIMETERS_MATCH_PATTERN.matcher(value.get()).replaceAll(" ") : value.orNull(); } /** * Converts a date object into a String in IS0 8601 format. */ public static String toISO8601Date(Date date) { return date != null ? new SimpleDateFormat(DownloadUtils.ISO_8601_FORMAT).format(date) : null; } @Inject public OccurrenceMapReader(@Named(DownloadWorkflowModule.DefaultSettings.OCC_HBASE_TABLE_KEY) String tableName, Connection connection ) { occurrenceTableName = tableName; this.connection = connection; } /** * Reads an occurrence record from HBase into Map. * The occurrence record */ public Result get(@Nonnull Integer key) throws IOException { Preconditions.checkNotNull(key, "Occurrence key can't be null"); try (Table table = connection.getTable(TableName.valueOf(occurrenceTableName))) { return table.get(new Get(Bytes.toBytes(key))); } } }