package org.gbif.occurrence.download.hive;
import org.gbif.api.vocabulary.Extension;
import org.gbif.api.vocabulary.OccurrenceIssue;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifInternalTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.terms.Term;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
/**
 * This provides the definition required to construct the occurrence HDFS table, for use as a Hive table.
 * The table is populated by a query which scans the HBase backed table, but along the way converts some fields
 * (e.g. into Hive arrays), which requires some UDF voodoo captured here.
 * <p/>
 * Note to developers: it is not easy to find a perfectly clean solution to this work. Here we try to favour
 * long-term code maintainability over all else. For that reason, functional programming idioms are not used even
 * though they would reduce the lines of code. The trade-off is that there are several levels of method chaining
 * here, which can be difficult to follow as they are not intuitive at first glance. Similarly, we don't attempt
 * to push all complexity (e.g. complex UDF chaining) into the freemarker templates, as that becomes unmanageable.
 * <p/>
 * Developers, please adhere to the above design goals when modifying this class, and design for simple
 * maintenance.
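 * <p/>
 * A minimal sketch of how the definition is consumed (the real consumers are the freemarker templates
 * mentioned above):
 * <pre>
 *   for (InitializableField field : OccurrenceHDFSTableDefinition.definition()) {
 *     // each field carries the term, the Hive column name, the Hive data type and an
 *     // optional HQL initializer expression used when populating the table
 *   }
 * </pre>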
*/
public class OccurrenceHDFSTableDefinition {
/**
* Private constructor.
*/
private OccurrenceHDFSTableDefinition() {
//hidden constructor
}
/**
* Assemble the mapping for verbatim fields.
*
* @return the list of fields that are used in the verbatim context
*/
private static List<InitializableField> verbatimFields() {
ImmutableList.Builder<InitializableField> builder = ImmutableList.builder();
for (Term t : DownloadTerms.DOWNLOAD_VERBATIM_TERMS) {
builder.add(verbatimField(t));
}
return builder.build();
}
/**
* @return a string for constructing the media type field using a UDF
*/
private static String mediaTypeInitializer() {
    // important(!): collects the media types from the multimedia extension column
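    // illustrative output (the actual column name comes from HiveColumns; ext_multimedia is an assumption):
    //   collectMediaTypes(ext_multimedia)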
return "collectMediaTypes(" + HiveColumns.columnFor(Extension.MULTIMEDIA) + ")";
}
/**
* @return a string for constructing the hasCoordinate field
*/
private static String hasCoordinateInitializer() {
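    // illustrative output (lowercase column names per HiveColumns, as seen in repatriatedInitializer below):
    //   (decimallatitude IS NOT NULL AND decimallongitude IS NOT NULL)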
return "("
+ HiveColumns.columnFor(DwcTerm.decimalLatitude)
+ " IS NOT NULL AND "
+ HiveColumns.columnFor(DwcTerm.decimalLongitude)
+ " IS NOT NULL)";
}
/**
* @return a string for constructing the repatriated field
*/
private static String repatriatedInitializer() {
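    // illustrative output:
    //   IF(publishingcountry IS NOT NULL AND countrycode IS NOT NULL,
    //      countrycode != publishingcountry, NULL)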
return "IF("
+ HiveColumns.columnFor(GbifTerm.publishingCountry)
+ " IS NOT NULL AND "
+ HiveColumns.columnFor(DwcTerm.countryCode)
+ " IS NOT NULL, countrycode != publishingcountry, NULL )";
}
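  /**
   * @return a string invoking the cleanDelimiters UDF on the given column, aliased back to the same column,
   * e.g. cleanDelimiters(v_occurrenceid) AS v_occurrenceid (column name illustrative only)
   */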
private static String cleanDelimitersInitializer(String column) {
return "cleanDelimiters(" + column + ") AS " + column;
}
/**
* @return a string for constructing the hasGeospatialIssues field
*/
private static String hasGeospatialIssuesInitializer() {
// example output:
// (COALESCE(zero_coordinate,0) + COALESCE(country_coordinate_mismatch,0)) > 0
StringBuilder statement = new StringBuilder("(");
for (int i = 0; i < OccurrenceIssue.GEOSPATIAL_RULES.size(); i++) {
OccurrenceIssue issue = OccurrenceIssue.GEOSPATIAL_RULES.get(i);
statement.append("COALESCE(").append(HiveColumns.columnFor(issue)).append(",0)");
if (i + 1 < OccurrenceIssue.GEOSPATIAL_RULES.size()) {
statement.append(" + ");
}
}
statement.append(") > 0");
return statement.toString();
}
/**
 * @return a complex string which turns the individual issue columns into a Hive array, formatted nicely in the
 * freemarker output to aid debugging.
*/
private static String issueInitializer() {
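    // illustrative overall shape of the generated expression:
    //   removeNulls(
    //     array(
    //       IF(zero_coordinate IS NOT NULL, 'ZERO_COORDINATE', NULL),
    //       ...
    //     )
    //   )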
StringBuilder statement = new StringBuilder("removeNulls(\n").append(" array(\n");
for (int i = 0; i < OccurrenceIssue.values().length; i++) {
OccurrenceIssue issue = OccurrenceIssue.values()[i];
// example: "IF(zero_coordinate IS NOT NULL, 'ZERO_COORDINATE', NULL),"
statement.append(" IF(")
.append(HiveColumns.columnFor(issue))
.append(" IS NOT NULL, '")
.append(issue.toString().toUpperCase())
.append("', NULL)");
if (i + 1 < OccurrenceIssue.values().length) {
statement.append(",\n");
} else {
statement.append("\n");
}
}
statement.append(" )\n");
statement.append(" )");
return statement.toString();
}
/**
* Assemble the mapping for interpreted fields, taking note that in reality, many are mounted onto the verbatim
* HBase columns.
*
* @return the list of fields that are used in the interpreted context
*/
private static List<InitializableField> interpretedFields() {
// the following terms are manipulated when transposing from HBase to hive by using UDFs and custom HQL
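    // e.g. GbifTerm.issue is not read from a single column, but is assembled from the
    // individual issue columns by issueInitializer()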
Map<Term, String> initializers = ImmutableMap.<Term, String>of(GbifTerm.hasGeospatialIssues,
hasGeospatialIssuesInitializer(),
GbifTerm.hasCoordinate,
hasCoordinateInitializer(),
GbifTerm.issue,
issueInitializer(),
GbifTerm.repatriated,
repatriatedInitializer());
ImmutableList.Builder<InitializableField> builder = ImmutableList.builder();
for (Term t : DownloadTerms.DOWNLOAD_INTERPRETED_TERMS_HDFS) {
// if there is custom handling registered for the term, use it
if (initializers.containsKey(t)) {
builder.add(interpretedField(t, initializers.get(t)));
} else {
builder.add(interpretedField(t)); // will just use the term name as usual
}
}
return builder.build();
}
/**
 * The internal fields stored in HBase which we wish to expose through Hive. The fragment and fragment hash
 * fields are excluded and not present.
*
* @return the list of fields that are exposed through Hive
*/
private static List<InitializableField> internalFields() {
ImmutableList.Builder<InitializableField> builder = ImmutableList.builder();
for (GbifInternalTerm t : GbifInternalTerm.values()) {
if (!DownloadTerms.EXCLUSIONS.contains(t)) {
builder.add(interpretedField(t));
}
}
return builder.build();
}
/**
* The fields stored in HBase which represent an extension.
*
* @return the list of fields that are exposed through Hive
*/
private static List<InitializableField> extensions() {
// only MULTIMEDIA is supported, but coded for future use
Set<Extension> extensions = ImmutableSet.of(Extension.MULTIMEDIA);
ImmutableList.Builder<InitializableField> builder = ImmutableList.builder();
builder.add(interpretedField(GbifTerm.mediaType, mediaTypeInitializer()));
for (Extension e : extensions) {
builder.add(new InitializableField(GbifTerm.Multimedia,
HiveColumns.columnFor(e),
HiveDataTypes.TYPE_STRING
    // always TYPE_STRING, as the extension content has a custom serialization
));
}
return builder.build();
}
/**
 * Generates the conceptual definition for the occurrence tables when used in Hive.
 *
 * @return the list of fields, with their types.
*/
public static List<InitializableField> definition() {
return ImmutableList.<InitializableField>builder()
.add(keyField())
.addAll(verbatimFields())
.addAll(internalFields())
.addAll(interpretedFields())
.addAll(extensions())
.build();
}
/**
 * Constructs the field for the primary key, which is a special case in that it requires a custom mapping.
*/
private static InitializableField keyField() {
return new InitializableField(GbifTerm.gbifID,
HiveColumns.columnFor(GbifTerm.gbifID),
HiveDataTypes.typeForTerm(GbifTerm.gbifID, true)
// verbatim context
);
}
/**
* Constructs a Field for the given term, when used in the verbatim context.
*/
private static InitializableField verbatimField(Term term) {
String column = HiveColumns.VERBATIM_COL_PREFIX + term.simpleName().toLowerCase();
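    // e.g. DwcTerm.catalogNumber -> v_catalognumber, assuming the prefix is "v_"
    // (the actual value comes from HiveColumns.VERBATIM_COL_PREFIX)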
return new InitializableField(term, column,
// no escape needed, due to prefix
HiveDataTypes.typeForTerm(term, true), // verbatim context
cleanDelimitersInitializer(column) //remove delimiters '\n', '\t', etc.
);
}
/**
 * Constructs a Field for the given term, when used in the interpreted context, with no custom
 * initializer.
*/
private static InitializableField interpretedField(Term term) {
    if (HiveDataTypes.TYPE_STRING.equals(HiveDataTypes.typeForTerm(term, false))) {
      // strings get the cleanDelimiters initializer to strip tab, newline etc. characters
      return interpretedField(term, cleanDelimitersInitializer(HiveColumns.columnFor(term)));
    }
    return interpretedField(term, null); // no custom initializer
}
/**
 * Constructs a Field for the given term, when used in the interpreted context, setting it up with the
 * given initializer.
*/
private static InitializableField interpretedField(Term term, String initializer) {
return new InitializableField(term,
HiveColumns.columnFor(term),
// note that Columns takes care of whether this is mounted on a verbatim or an interpreted
    // column in HBase for us
HiveDataTypes.typeForTerm(term, false),
// not verbatim context
initializer);
}
}