package org.gbif.occurrence.download.oozie; import org.gbif.api.model.occurrence.Download; import org.gbif.api.model.occurrence.predicate.Predicate; import org.gbif.api.service.registry.OccurrenceDownloadService; import org.gbif.occurrence.common.download.DownloadUtils; import org.gbif.occurrence.download.conf.WorkflowConfiguration; import org.gbif.occurrence.download.inject.DownloadWorkflowModule; import org.gbif.occurrence.download.query.HiveQueryVisitor; import org.gbif.occurrence.download.query.QueryBuildingException; import org.gbif.occurrence.download.query.SolrQueryVisitor; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.Properties; import com.google.common.base.Strings; import com.google.common.base.Throwables; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import com.google.inject.name.Named; import org.apache.commons.lang3.StringEscapeUtils; import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.response.QueryResponse; import org.codehaus.jackson.map.DeserializationConfig; import org.codehaus.jackson.map.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static com.google.common.base.Preconditions.checkArgument; /** * This class sets the following parameters required by the download workflow: * - is_small_download: define if the occurrence download must be processed as a small(Solr) or a big (Hive) download.\ * This parameter is calculated by executing a Solr query that counts the number of records. * - solr_query: query to process small download, it's a translation of the predicate filter. * - hive_query: query to process big download, it's a translation of the predicate filter. * - hive_db: this parameter is read from a properties file. * - download_key: download primary key, it's generated from the Oozie workflow id. * - download_table_name: base name to use when creating hive tables and files, it's the download_key, but the '-' * it's replaced by '_'. */ public class DownloadPrepareAction { private static final Logger LOG = LoggerFactory.getLogger(DownloadPrepareAction.class); // arbitrary record count that represents and error counting the records of the input query private static final int ERROR_COUNT = -1; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper().configure(DeserializationConfig.Feature.FAIL_ON_UNKNOWN_PROPERTIES, false); private static final String OOZIE_ACTION_OUTPUT_PROPERTIES = "oozie.action.output.properties"; private static final String IS_SMALL_DOWNLOAD = "is_small_download"; private static final String SOLR_QUERY = "solr_query"; private static final String HIVE_DB = "hive_db"; private static final String HIVE_QUERY = "hive_query"; private static final String DOWNLOAD_KEY = "download_key"; //'-' is not allowed in a Hive table name. // This value will hold the same value as the DOWNLOAD_KEY but the - is replaced by an '_'. private static final String DOWNLOAD_TABLE_NAME = "download_table_name"; private final SolrClient solrClient; // Holds the value of the maximum number of records that a small download can have. private final int smallDownloadLimit; private final OccurrenceDownloadService occurrenceDownloadService; private final WorkflowConfiguration workflowConfiguration; /** * Entry point: receives as argument the predicate filter and the Oozie workflow id. */ public static void main(String[] args) throws Exception { checkArgument(args.length > 0 || Strings.isNullOrEmpty(args[0]), "The solr query argument hasn't been specified"); DownloadPrepareAction occurrenceCount = getInjector().getInstance(DownloadPrepareAction.class); occurrenceCount.updateDownloadData(args[0], DownloadUtils.workflowToDownloadId(args[1])); } /** * Utility method that creates a instance of a Guice Injector containing the OccurrenceSearchCountModule. */ private static Injector getInjector() { try { return Guice.createInjector(new DownloadWorkflowModule(new WorkflowConfiguration())); } catch (IllegalArgumentException e) { LOG.error("Error creating Guice module", e); throw Throwables.propagate(e); } } /** * Default/injectable constructor. */ @Inject public DownloadPrepareAction( SolrClient solrClient, @Named(DownloadWorkflowModule.DefaultSettings.MAX_RECORDS_KEY) int smallDownloadLimit, OccurrenceDownloadService occurrenceDownloadService, WorkflowConfiguration workflowConfiguration ) { this.solrClient = solrClient; this.smallDownloadLimit = smallDownloadLimit; this.occurrenceDownloadService = occurrenceDownloadService; this.workflowConfiguration = workflowConfiguration; } /** * Method that determines if the Solr Query produces a "small" download file. */ public Boolean isSmallDownloadCount(long recordCount) { return recordCount != ERROR_COUNT && recordCount <= smallDownloadLimit; } /** * Update the oozie workflow data/parameters and persists the record of the occurrence download. * * @param rawPredicate to be executed * @param downloadKey workflow id * * @throws java.io.IOException in case of error reading or writing the 'oozie.action.output.properties' file */ public void updateDownloadData(String rawPredicate, String downloadKey) throws IOException, QueryBuildingException { Predicate predicate = OBJECT_MAPPER.readValue(rawPredicate, Predicate.class); String solrQuery = new SolrQueryVisitor().getQuery(predicate); long recordCount = getRecordCount(solrQuery); String oozieProp = System.getProperty(OOZIE_ACTION_OUTPUT_PROPERTIES); if (oozieProp != null) { File propFile = new File(oozieProp); Properties props = new Properties(); try (OutputStream os = new FileOutputStream(propFile)) { props.setProperty(IS_SMALL_DOWNLOAD, isSmallDownloadCount(recordCount).toString()); props.setProperty(SOLR_QUERY, solrQuery); props.setProperty(HIVE_QUERY, StringEscapeUtils.escapeXml10(new HiveQueryVisitor().getHiveQuery(predicate))); props.setProperty(DOWNLOAD_KEY, downloadKey); // '-' is replaced by '_' because it's not allowed in hive table names props.setProperty(DOWNLOAD_TABLE_NAME, downloadKey.replaceAll("-", "_")); props.setProperty(HIVE_DB, workflowConfiguration.getHiveDb()); props.store(os, ""); } catch (FileNotFoundException e) { LOG.error("Error reading properties file", e); throw Throwables.propagate(e); } } else { throw new IllegalStateException(OOZIE_ACTION_OUTPUT_PROPERTIES + " System property not defined"); } if (recordCount >= 0) { updateTotalRecordsCount(downloadKey, recordCount); } } /** * Executes the Solr query and returns the number of records found. * If an error occurs 'ERROR_COUNT' is returned. */ private long getRecordCount(String solrQuery) { try { QueryResponse response = solrClient.query(new SolrQuery(solrQuery)); return response.getResults().getNumFound(); } catch (Exception e) { LOG.error("Error getting the records count", e); return ERROR_COUNT; } } /** * Updates the record count of the download entity. */ private void updateTotalRecordsCount(String downloadKey, long recordCount) { try { LOG.info("Updating record count({}) of download {}", recordCount, downloadKey); Download download = occurrenceDownloadService.get(downloadKey); if (download == null) { LOG.error("Download {} was not found!", downloadKey); } else { download.setTotalRecords(recordCount); occurrenceDownloadService.update(download); } } catch (Exception ex) { LOG.error(String.format("Error updating record count for download workflow %s, reported count is %,d", downloadKey, recordCount), ex); } } }