package org.gbif.occurrence.download.hive; import org.gbif.dwc.terms.DcTerm; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.GbifInternalTerm; import org.gbif.dwc.terms.GbifTerm; import org.gbif.dwc.terms.Term; import java.util.List; import java.util.Set; import javax.annotation.Nullable; import com.google.common.base.Predicate; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Sets; /** * This class serves to document the terms used in various stages of processing. Please note that changes to this * class do not influence processing, although they define the formats for various Hive tables. * <p/> * Processing is complex procedure where, e.g. several verbatim fields are inspected, and depending on their content * will influence different fields in the interpreted view. One example might be a verbatim view with dwc:eventDate * populated, but in the interpreted view dwc:eventDate, dwc:day, dwc:month and dwc:year are present. * <p/> * The code in this class is intended as a more intuitive replacement for {@link org.gbif.occurrence.common.TermUtils} * and should be merged into that class when ready. */ public final class Terms { /** * The list of only the Dublin Core properties, excluding classes, such as Location. */ private static final List<DcTerm> DC_PROPERTIES = dcPropertyTerms(); /** * The list of Darwin Core properties applicable to occurrence records, excluding classes such as Taxon and terms * that are not relevant to occurrence records. */ private static final List<DwcTerm> DwC_PROPERTIES = dwcPropertyTerms(); /** * The list of GBIF properties applicable to occurrence records, excluding any classes and terms that are not * relevant to occurrence records. */ private static final List<GbifTerm> GBIF_PROPERTIES = gbifPropertyTerms(); /** * The list of terms that are subject to interpretation and <strong>may</strong> not be present in the * interpreted record. For example, dwc:maximumDepthInMeters may be present on a verbatim record, and subject to * interpretation, but (at the time of writing) is not be surfaced on the interpreted object but instead contributes * to the gbif:depth term. */ private static final Set<Term> TERMS_SUBJECT_TO_INTERPRETATION = termsSubjectToInterpretation(); /** * The terms that are present only due to explicit interpretation. These are often typed explicitly, such as Dates * or are the result of a routine that has analyzed various verbatim fields and interpreted them into new values, * such as the dwc:kingdom ... dwc:scientificName fields which are subject to a nub lookup. */ private static final Set<Term> TERMS_POPULATED_BY_INTERPRETATION = termsPopulatedByInterpretation(); /** * The terms that are subject to interpretation but not present on the interpreted occurrence. */ private static final Set<Term> TERMS_REMOVED_DURING_INTERPRETATION = Sets.difference(TERMS_SUBJECT_TO_INTERPRETATION, TERMS_POPULATED_BY_INTERPRETATION); /** * Utility to strip out classes from the complete Dublin Core enumeration. * * @return the complete list of property terms of Dublin Core, excluding any "class" terms such as Location. */ private static List<DcTerm> dcPropertyTerms() { return ImmutableList.copyOf(Iterables.<DcTerm>filter(Lists.newArrayList(DcTerm.values()), new Predicate<DcTerm>() { @Override public boolean apply(DcTerm t) { return !t.isClass(); } })); } /** * Utility to strip out classes from the GBIF enumeration. * * @return the complete list of property terms of the GBIF namespace, excluding any "class" terms and terms not * relevant to occurrences. */ private static List<GbifTerm> gbifPropertyTerms() { // the following have no place on occurrence final Set<GbifTerm> exclusions = ImmutableSet.of(GbifTerm.infraspecificMarker, GbifTerm.isExtinct, GbifTerm.isFreshwater, GbifTerm.isHybrid, GbifTerm.isMarine, GbifTerm.isPlural, GbifTerm.isPreferredName, GbifTerm.isTerrestrial, GbifTerm.livingPeriod, GbifTerm.lifeForm, GbifTerm.ageInDays, GbifTerm.sizeInMillimeter, GbifTerm.massInGram, GbifTerm.organismPart, GbifTerm.appendixCITES, GbifTerm.typeDesignatedBy, GbifTerm.typeDesignationType, GbifTerm.canonicalName, GbifTerm.nameType, GbifTerm.verbatimLabel, GbifTerm.infraspecificMarker); //We should handle deprecated terms here. Waiting for GBIF-132 return ImmutableList.copyOf(Iterables.<GbifTerm>filter(Lists.newArrayList(GbifTerm.values()), new Predicate<GbifTerm>() { @Override public boolean apply(GbifTerm t) { return !t.isClass() && !exclusions.contains(t); } })); } /** * Utility to strip out all classes from the DwC terms and all properties that are not applicable to an occurrence * record. * * @return the complete list of property terms of Darwin Core, excluding any "class" terms such as Taxon and terms * not relevant to occurrence records. */ private static List<DwcTerm> dwcPropertyTerms() { // the following are only used in extensions final Set<DwcTerm> exclusions = ImmutableSet.<DwcTerm>builder() .addAll(DwcTerm.listByGroup(DwcTerm.GROUP_MEASUREMENTORFACT)) .addAll(DwcTerm.listByGroup(DwcTerm.GROUP_RESOURCERELATIONSHIP)) .build(); return ImmutableList.copyOf(Iterables.<DwcTerm>filter( // Remove any that are "classes", or explicitly omitted Lists.newArrayList(DwcTerm.values()), new Predicate<DwcTerm>() { @Override public boolean apply(DwcTerm t) { return !t.isClass() && !exclusions.contains(t); } })); } /** * Lists all the terms which are populated on the occurrence object by interpretation, explicit processing or are * internally generated. These are all explicit java properties on the * {@link org.gbif.api.model.occurrence.Occurrence} class. * * @return the terms with values that will only be populated following some interpretation */ private static Set<Term> termsPopulatedByInterpretation() { return ImmutableSet.<Term>of(DwcTerm.decimalLatitude, DwcTerm.decimalLongitude, DwcTerm.continent, DwcTerm.waterBody, DwcTerm.stateProvince, DwcTerm.countryCode, DwcTerm.dateIdentified, DwcTerm.eventDate, DwcTerm.year, DwcTerm.month, DwcTerm.day, DwcTerm.kingdom, DwcTerm.phylum, DwcTerm.class_, DwcTerm.order, DwcTerm.family, DwcTerm.genus, DwcTerm.subgenus, GbifTerm.species, DwcTerm.scientificName, DwcTerm.taxonRank, // DwcTerm.verbatimCoordinates, GbifTerm.genericName, DwcTerm.specificEpithet, DwcTerm.infraspecificEpithet, DwcTerm.basisOfRecord, DwcTerm.individualCount, DwcTerm.sex, DwcTerm.lifeStage, DwcTerm.establishmentMeans, GbifTerm.taxonKey, DwcTerm.typeStatus, GbifTerm.typifiedName, GbifTerm.kingdomKey, GbifTerm.phylumKey, GbifTerm.classKey, GbifTerm.orderKey, GbifTerm.familyKey, GbifTerm.genusKey, GbifTerm.subgenusKey, GbifTerm.speciesKey, GbifTerm.datasetKey, GbifTerm.publishingCountry, GbifTerm.lastInterpreted, DcTerm.modified, DwcTerm.coordinateUncertaintyInMeters, DwcTerm.coordinatePrecision, GbifTerm.elevation, GbifTerm.elevationAccuracy, GbifTerm.depth, GbifTerm.depthAccuracy, GbifInternalTerm.unitQualifier, GbifTerm.issue, DcTerm.references, GbifTerm.datasetKey, GbifTerm.publishingCountry, GbifTerm.protocol, GbifTerm.lastCrawled, GbifTerm.lastParsed); } /** * Lists all terms that are subject to interpretation. Some of the terms may be present on the interpreted * occurrence record, but others will not. This simply lists those that will be interpreted. * * @return the set of terms that will be processed by interpretation routines, and may disappear from the record */ private static Set<Term> termsSubjectToInterpretation() { // any term that is populated by interpretation has to be subject to interpretation if present on the // verbatim record return ImmutableSet.<Term>builder() .addAll(termsPopulatedByInterpretation()) .add(DwcTerm.decimalLatitude, DwcTerm.decimalLongitude, DwcTerm.verbatimLatitude, DwcTerm.verbatimLongitude, DwcTerm.verbatimCoordinates, DwcTerm.geodeticDatum, DwcTerm.coordinateUncertaintyInMeters, DwcTerm.coordinatePrecision, DwcTerm.continent, DwcTerm.waterBody, DwcTerm.stateProvince, DwcTerm.country, DwcTerm.countryCode, DwcTerm.scientificName, DwcTerm.scientificNameAuthorship, DwcTerm.taxonRank, DwcTerm.kingdom, DwcTerm.phylum, DwcTerm.class_, DwcTerm.order, DwcTerm.family, DwcTerm.genus, DwcTerm.subgenus, GbifTerm.genericName, DwcTerm.specificEpithet, DwcTerm.infraspecificEpithet, DcTerm.modified, DwcTerm.dateIdentified, DwcTerm.eventDate, DwcTerm.year, DwcTerm.month, DwcTerm.day, DwcTerm.minimumDepthInMeters, DwcTerm.maximumDepthInMeters, DwcTerm.minimumElevationInMeters, DwcTerm.maximumElevationInMeters, DwcTerm.associatedMedia) .build(); } /** * Returns the list of all terms which can be present in the verbatim view of an Occurrence. This is defined as: * <ul> * <li>The GBIF ID</li> * <li>The complete Dublin Core terms excluding "class" terms</li> * <li>The Darwin Core terms excluding "class" terms and any not suitable for occurrence records</li> * </ul> */ public static List<Term> verbatimTerms() { return ImmutableList.<Term>builder().add(GbifTerm.gbifID).addAll(DC_PROPERTIES).addAll(DwC_PROPERTIES).build(); } /** * Returns the list of all terms which can be present in the interpreted view of an Occurrence. This is defined as * <ul> * <li>The GBIF ID</li> * <li>DC terms that are not removed during interpretation</li> * <li>DwC terms that are not removed during interpretation</li> * <li>The remaining GBIF terms not removed during interpretation and not deprecated</li> * </ul> */ public static List<Term> interpretedTerms() { return ImmutableList.<Term>builder().add(GbifTerm.gbifID).addAll( // add all Dublin Core terms that are not stripped during interpretation Iterables.filter(DC_PROPERTIES, new Predicate<Term>() { @Override public boolean apply(@Nullable Term t) { return !TERMS_REMOVED_DURING_INTERPRETATION.contains(t); } })).addAll( // add all Darwin Core terms that are not stripped during interpretation Iterables.filter(DwC_PROPERTIES, new Predicate<Term>() { @Override public boolean apply(@Nullable Term t) { return !TERMS_REMOVED_DURING_INTERPRETATION.contains(t); } })).addAll( // add all GBIF terms that are not stripped during interpretation Iterables.filter(GBIF_PROPERTIES, new Predicate<Term>() { @Override public boolean apply(@Nullable Term t) { // strip the GBIF id as we've already added that // Do not include GbifTerm.coordinateAccuracy since it is deprecated (POR-3061) return !TERMS_REMOVED_DURING_INTERPRETATION.contains(t) && GbifTerm.gbifID != t && GbifTerm.coordinateAccuracy !=t; } })).build(); } private Terms() { // empty constructor } }