package org.gbif.occurrence.hive.udf;
import org.gbif.api.vocabulary.Country;
import org.gbif.api.vocabulary.OccurrenceIssue;
import org.gbif.common.parsers.core.OccurrenceParseResult;
import org.gbif.common.parsers.core.ParseResult;
import org.gbif.occurrence.processor.guice.ApiClientConfiguration;
import org.gbif.occurrence.processor.interpreting.CoordinateInterpreter;
import org.gbif.occurrence.processor.interpreting.LocationInterpreter;
import org.gbif.occurrence.processor.interpreting.result.CoordinateResult;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import com.beust.jcommander.internal.Lists;
import com.google.common.base.Strings;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * A UDF that uses the GBIF API to verify coordinates look sensible.
 * If coordinates aren't available then country is interpreted only.
 * If coordinates are present, then country and coordinates are returned only if they don't contradict, otherwise they
 * are BOTH dropped.
 * Note: This is used for the GBIF EU BON analysis.
 * <p>
 * The result is a struct of (latitude, longitude, country) where country is the ISO 2-letter code.
 * <p>
 * The interpreters are created lazily on the first call and then reused by this UDF instance, so the API URL
 * seen on the first evaluation is the one used for all later evaluations.
 */
@Description(name = "parseCoordinates", value = "_FUNC_(apiUrl, latitude, longitude, verbatim_country)")
public class CoordinateCountryParseUDF extends GenericUDF {

  /** Number of arguments the function accepts: apiUrl, latitude, longitude, verbatim_country. */
  private static final int ARG_LENGTH = 4;
  private static final Logger LOG = LoggerFactory.getLogger(CoordinateCountryParseUDF.class);

  /** One converter per argument, each turning the Hive value into a writable string; set in initialize(). */
  private ObjectInspectorConverters.Converter[] converters;

  // Both interpreters are volatile: init() checks locInterpreter outside the lock, and without volatile a
  // reader could observe a non-null reference before the writes made inside the lock are visible.
  private volatile LocationInterpreter locInterpreter;
  private volatile CoordinateInterpreter coordInterpreter;
  private final Object lock = new Object();

  /**
   * Returns the location interpreter, creating both interpreters first if necessary.
   *
   * @param apiWs base URL of the GBIF API; only used when the interpreters have not been created yet
   */
  public LocationInterpreter getLocInterpreter(URI apiWs) {
    init(apiWs);
    return locInterpreter;
  }

  /**
   * Returns the coordinate interpreter, creating both interpreters first if necessary.
   *
   * @param apiWs base URL of the GBIF API; only used when the interpreters have not been created yet
   */
  public CoordinateInterpreter getCoordInterpreter(URI apiWs) {
    init(apiWs);
    return coordInterpreter;
  }

  /**
   * Lazily creates the coordinate and location interpreters exactly once per UDF instance.
   */
  private void init(URI apiWs) {
    if (locInterpreter == null) {
      synchronized (lock) {
        // while we were waiting for the lock, another thread may have instantiated the object
        if (locInterpreter == null) {
          LOG.info("Create new coordinate & location interpreter using API at {}", apiWs);
          ApiClientConfiguration cfg = new ApiClientConfiguration();
          cfg.url = apiWs;
          coordInterpreter = new CoordinateInterpreter(cfg.newApiClient());
          // assigned last: a non-null locInterpreter signals that both interpreters are ready
          locInterpreter = new LocationInterpreter(coordInterpreter);
        }
      }
    }
  }

  /**
   * Interprets the given coordinates and verbatim country.
   *
   * @param arguments apiUrl, latitude, longitude and verbatim country, in that order
   * @return a list of (latitude, longitude, ISO country code); latitude and longitude are null when no
   *         coordinates were supplied, and all three are null when the coordinates could not be interpreted
   *         or were flagged with a geospatial issue
   * @throws HiveException if the apiUrl argument is null or an argument cannot be evaluated
   */
  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    assert arguments.length == ARG_LENGTH;

    Object apiArg = arguments[0].get();
    if (apiArg == null) {
      // fail with a clear message instead of a bare NullPointerException
      throw new HiveException("parseCoordinates requires a non-null apiUrl argument");
    }
    URI api = URI.create(apiArg.toString());

    // Interpret the country to pass in to the geo lookup
    Object countryArg = arguments[3].get();
    String country = countryArg == null ? null : converters[3].convert(countryArg).toString();
    Country interpretedCountry = Country.UNKNOWN;
    if (country != null) {
      ParseResult<Country> r = getLocInterpreter(api).interpretCountry(country);
      if (r.isSuccessful() && r.getPayload() != null) {
        interpretedCountry = r.getPayload();
      }
    }
    String iso = Country.UNKNOWN == interpretedCountry ? null : interpretedCountry.getIso2LetterCode();

    Object latArg = arguments[1].get();
    Object lngArg = arguments[2].get();
    if (latArg == null || lngArg == null
        || Strings.isNullOrEmpty(latArg.toString()) || Strings.isNullOrEmpty(lngArg.toString())) {
      // no coords to dispute the iso
      return newResult(null, null, iso);
    }

    String latitude = converters[1].convert(latArg).toString();
    String longitude = converters[2].convert(lngArg).toString();

    // while we have interpreted the country to try and pass something sensible to the CoordinateInterpreter,
    // it will not infer countries if we pass in UNKNOWN, as it likes NULL.
    interpretedCountry = Country.UNKNOWN == interpretedCountry ? null : interpretedCountry;

    OccurrenceParseResult<CoordinateResult> response = getCoordInterpreter(api)
      .interpretCoordinate(latitude, longitude, null, interpretedCountry);

    if (response != null && response.isSuccessful() && !hasSpatialIssue(response.getIssues())) {
      CoordinateResult cc = response.getPayload();
      // we use the result, which often includes interpreted countries
      if (cc.getCountry() == null || Country.UNKNOWN == cc.getCountry()) {
        iso = null;
      } else {
        iso = cc.getCountry().getIso2LetterCode();
      }
      return newResult(cc.getLatitude(), cc.getLongitude(), iso);
    }

    // interpretation failed or the coordinates contradict the country: drop everything
    return newResult(null, null, null);
  }

  /**
   * Builds the three element struct value in the field order declared by initialize().
   */
  private static List<Object> newResult(Object latitude, Object longitude, Object country) {
    List<Object> result = new ArrayList<Object>(3);
    result.add(latitude);
    result.add(longitude);
    result.add(country);
    return result;
  }

  /**
   * Returns true if any of the given issues is one of OccurrenceIssue.GEOSPATIAL_RULES.
   */
  private static boolean hasSpatialIssue(Collection<OccurrenceIssue> issues) {
    for (OccurrenceIssue rule : OccurrenceIssue.GEOSPATIAL_RULES) {
      if (issues.contains(rule)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Renders the function call with the given argument expressions.
   */
  @Override
  public String getDisplayString(String[] strings) {
    assert strings.length == ARG_LENGTH;
    return "parseCoordinates(" + strings[0] + ", " + strings[1] + ", " + strings[2] + ", " + strings[3] + ')';
  }

  /**
   * Validates the argument count, prepares a string converter for every argument and declares the
   * (latitude, longitude, country) struct returned by evaluate().
   *
   * @throws UDFArgumentException if the function is not called with exactly four arguments
   */
  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    if (arguments.length != ARG_LENGTH) {
      throw new UDFArgumentException("parseCoordinates takes four arguments");
    }

    converters = new ObjectInspectorConverters.Converter[arguments.length];
    for (int i = 0; i < arguments.length; i++) {
      converters[i] = ObjectInspectorConverters
        .getConverter(arguments[i], PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    }

    return ObjectInspectorFactory
      .getStandardStructObjectInspector(Arrays.asList("latitude", "longitude", "country"), Arrays
        .<ObjectInspector>asList(PrimitiveObjectInspectorFactory.javaDoubleObjectInspector,
                                 PrimitiveObjectInspectorFactory.javaDoubleObjectInspector,
                                 PrimitiveObjectInspectorFactory.javaStringObjectInspector));
  }
}