package org.gbif.occurrence.solr; import org.gbif.dwc.terms.DwcTerm; import org.gbif.dwc.terms.Term; import org.gbif.occurrence.download.hive.DownloadTerms; import org.gbif.occurrence.download.hive.HiveColumns; import org.gbif.occurrence.download.hive.HiveDataTypes; import org.gbif.occurrence.search.writer.FullTextFieldBuilder; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.Collection; import java.util.Map; import java.util.Set; import javax.annotation.Nullable; import com.google.common.base.Function; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; import com.google.common.collect.Collections2; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Sets; import freemarker.cache.ClassTemplateLoader; import freemarker.template.Configuration; import freemarker.template.Template; import freemarker.template.TemplateException; /** * This provides the definition required to construct the occurrence hdfs table, for use as a Hive table. * The table is populated by a query which scans the HBase backed table, but along the way converts some fields to * e.g. Hive arrays which requires some UDF voodoo captured here. * <p/> * Note to developers: It is not easy to find a perfectly clean solution to this work. Here we try and favour long * term code management over all else. For that reason, Functional programming idioms are not used even though they * would reduce lines of code. However, they come at a cost in that there are several levels of method chaining * here, and it becomes difficult to follow as they are not so intuitive on first glance. Similarly, we don't attempt * to push all complexities into the freemarker templates (e.g. complex UDF chaining) as it becomes unmanageable. * <p/> * Developers please adhere to the above design goals when modifying this class, and consider developing for simple * maintenance. */ public class OccurrenceSearchFieldsDefinition { private static final Set<String> NON_SEARCHABLE_HIVE_TYPES = ImmutableSet.<String>of(HiveDataTypes.TYPE_BIGINT, HiveDataTypes.TYPE_ARRAY_STRING, HiveDataTypes.TYPE_INT, HiveDataTypes.TYPE_BOOLEAN); private static final Set<Term> TEMPORAL_FIELDS = ImmutableSet.<Term>of(DwcTerm.year, DwcTerm.month, DwcTerm.day); private static final Predicate<Term> NON_SEARCHABLE_TYPES = new Predicate<Term>() { @Override public boolean apply(@Nullable Term input) { return !NON_SEARCHABLE_HIVE_TYPES.contains(HiveDataTypes.typeForTerm(input, false)) //don't discard temporal INT fields: year, month and day || (HiveDataTypes.typeForTerm(input, false) == HiveDataTypes.TYPE_INT && TEMPORAL_FIELDS.contains(input)); } }; private static final Set<Term> SEARCH_FIELDS = Sets.difference(Sets.filter(DownloadTerms.DOWNLOAD_INTERPRETED_TERMS_HDFS, NON_SEARCHABLE_TYPES), FullTextFieldBuilder.NON_FULL_TEXT_TERMS); private static final String HIVE_OUT_DIR = "hive-scripts/"; /** * Private constructor. */ private OccurrenceSearchFieldsDefinition(){ //hidden constructor } /** * Generates the conceptual definition for the occurrence tables when used in hive. * * @return a list of fields, with the types. */ public static Collection<String> definition() { return Collections2.transform(SEARCH_FIELDS, new Function<Term, String>() { @Override public String apply(@Nullable Term input) { return HiveColumns.columnFor(input); } }); } /** * Generates HQL which create a Hive table on the HBase table. */ private static void generateHQL(Configuration cfg, File outDir) throws IOException, TemplateException { try (FileWriter out = new FileWriter(new File(outDir, "import_hive_to_avro.q"))) { Template template = cfg.getTemplate("import_hive_to_avro.ftl"); Map<String, Object> data = ImmutableMap.<String, Object>of("fields", definition()); template.process(data, out); } } public static void main(String[] args) { try { Preconditions.checkState(1 == args.length, "Output path for HQL files is required"); File outDir = new File(args[0]); Preconditions.checkState(outDir.exists() && outDir.isDirectory(), "Output directory must exist"); // create the sub directories into which we will write File createTablesDir = new File(outDir, HIVE_OUT_DIR); createTablesDir.mkdirs(); Configuration cfg = new Configuration(); cfg.setTemplateLoader(new ClassTemplateLoader(OccurrenceSearchFieldsDefinition.class, "/templates")); // generates HQL for the coordinator jobs to create the tables to be queried generateHQL(cfg, createTablesDir); } catch (Exception e) { // Hard exit for safety, and since this is used in build pipelines, any generation error could have // catastophic effects - e.g. partially complete scripts being run, and resulting in inconsistent // data. System.err.println("*** Aborting JVM ***"); System.err.println("Unexpected error building the templated HQL files. " + "Exiting JVM as a precaution, after dumping technical details."); e.printStackTrace(); System.exit(-1); } } }