package org.gbif.occurrence.download.file.dwca;

import org.gbif.api.model.occurrence.Download;
import org.gbif.api.model.occurrence.predicate.Predicate;
import org.gbif.api.model.registry.Citation;
import org.gbif.api.model.registry.Contact;
import org.gbif.api.model.registry.Dataset;
import org.gbif.api.model.registry.DatasetOccurrenceDownloadUsage;
import org.gbif.api.model.registry.Identifier;
import org.gbif.api.model.registry.eml.DataDescription;
import org.gbif.api.service.registry.DatasetOccurrenceDownloadUsageService;
import org.gbif.api.service.registry.DatasetService;
import org.gbif.api.service.registry.OccurrenceDownloadService;
import org.gbif.api.vocabulary.ContactType;
import org.gbif.api.vocabulary.DatasetType;
import org.gbif.api.vocabulary.IdentifierType;
import org.gbif.api.vocabulary.Language;
import org.gbif.api.vocabulary.License;
import org.gbif.hadoop.compress.d2.D2CombineInputStream;
import org.gbif.hadoop.compress.d2.D2Utils;
import org.gbif.hadoop.compress.d2.zip.ModalZipOutputStream;
import org.gbif.occurrence.common.download.DownloadException;
import org.gbif.occurrence.download.conf.WorkflowConfiguration;
import org.gbif.occurrence.download.file.DownloadJobConfiguration;
import org.gbif.occurrence.download.license.LicenseSelector;
import org.gbif.occurrence.download.license.LicenseSelectors;
import org.gbif.occurrence.download.util.HeadersFileUtil;
import org.gbif.occurrence.download.util.RegistryClientUtil;
import org.gbif.occurrence.query.HumanFilterBuilder;
import org.gbif.occurrence.query.TitleLookup;
import org.gbif.occurrence.query.TitleLookupModule;
import org.gbif.registry.metadata.EMLWriter;
import org.gbif.utils.file.CompressionUtil;
import org.gbif.utils.file.FileUtils;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Ordering;
import com.google.common.io.ByteStreams;
import com.google.common.io.Closer;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.sun.jersey.api.client.UniformInterfaceException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.gbif.occurrence.download.file.dwca.DwcDownloadsConstants.CITATIONS_FILENAME;
import static org.gbif.occurrence.download.file.dwca.DwcDownloadsConstants.INTERPRETED_FILENAME;
import static org.gbif.occurrence.download.file.dwca.DwcDownloadsConstants.METADATA_FILENAME;
import static org.gbif.occurrence.download.file.dwca.DwcDownloadsConstants.MULTIMEDIA_FILENAME;
import static org.gbif.occurrence.download.file.dwca.DwcDownloadsConstants.RIGHTS_FILENAME;
import static org.gbif.occurrence.download.file.dwca.DwcDownloadsConstants.VERBATIM_FILENAME;

/**
 * Creates a DwC archive for occurrence downloads based on the Hive query result files generated
 * during the Oozie workflow. It creates a local archive folder with an occurrence data file and a dataset subfolder
 * that contains an EML metadata file per dataset involved.
 */
public class DwcaArchiveBuilder {

  private static final Logger LOG = LoggerFactory.getLogger(DwcaArchiveBuilder.class);

  // The CRC file is created by the FileSystem.copyMerge function
  private static final String CRC_FILE_FMT = ".%s.crc";
  private static final String DOWNLOAD_CONTACT_SERVICE = "GBIF Download Service";
  private static final String DOWNLOAD_CONTACT_EMAIL = "support@gbif.org";
  private static final String METADATA_DESC_HEADER_FMT =
    "A dataset containing all occurrences available in GBIF matching the query:\n%s"
    + "\nThe dataset includes records from the following constituent datasets. "
    + "The full metadata for each constituent is also included in this archive:\n";
  private static final String CITATION_HEADER =
    "Please cite this data as follows, and pay attention to the rights documented in the rights.txt:\n"
    + "Please respect the rights declared for each dataset in the download: ";
  private static final String DATASET_TITLE_FMT = "GBIF Occurrence Download %s";
  private static final String RIGHTS =
    "The data included in this download are provided to the user under a %s license (%s), please read the license terms and conditions to understand the implications of its usage and sharing.\n\nData from some individual datasets included in this download may be licensed under less restrictive terms; review the details below.";
  private static final String DATA_DESC_FORMAT = "Darwin Core Archive";
  private static final Splitter TAB_SPLITTER = Splitter.on('\t').trimResults();
  private static final EMLWriter EML_WRITER = EMLWriter.newInstance(true);

  private final DatasetService datasetService;
  private final DatasetOccurrenceDownloadUsageService datasetUsageService;
  private final OccurrenceDownloadService occurrenceDownloadService;
  private final TitleLookup titleLookup;
  private final Dataset dataset;
  private final File archiveDir;
  private final WorkflowConfiguration workflowConfiguration;
  private final FileSystem sourceFs;
  private final FileSystem targetFs;
  private final DownloadJobConfiguration configuration;
  private final LicenseSelector licenseSelector = LicenseSelectors.getMostRestrictiveLicenseSelector(License.CC_BY_4_0);
  private final List<Constituent> constituents = Lists.newArrayList();
  private final Ordering<Constituent> constituentsOrder =
    Ordering.natural().onResultOf(new Function<Constituent, Integer>() {
      public Integer apply(Constituent c) {
        return c.records;
      }
    });

  public static void buildArchive(DownloadJobConfiguration configuration, RegistryClientUtil registryClientUtil)
    throws IOException {
    buildArchive(configuration, new WorkflowConfiguration(), registryClientUtil);
  }

  public static void buildArchive(DownloadJobConfiguration configuration,
                                  WorkflowConfiguration workflowConfiguration,
                                  RegistryClientUtil registryClientUtil) throws IOException {
    String tmpDir = workflowConfiguration.getTempDir();

    // create temporary, local, download specific directory
    File archiveDir = new File(tmpDir, configuration.getDownloadKey());
    String registryWs = workflowConfiguration.getRegistryWsUrl();

    // create registry client and services
    DatasetService datasetService = registryClientUtil.setupDatasetService(registryWs);
    DatasetOccurrenceDownloadUsageService datasetUsageService = registryClientUtil.setupDatasetUsageService(registryWs);
    OccurrenceDownloadService occurrenceDownloadService = registryClientUtil.setupOccurrenceDownloadService(registryWs);

    Injector inj = Guice.createInjector(new TitleLookupModule(true, workflowConfiguration.getApiUrl()));
    TitleLookup titleLookup = inj.getInstance(TitleLookup.class);

    FileSystem sourceFs = configuration.isSmallDownload()
      ? FileSystem.getLocal(workflowConfiguration.getHadoopConf())
      : FileSystem.get(workflowConfiguration.getHadoopConf());
    FileSystem targetFs = FileSystem.get(workflowConfiguration.getHadoopConf());

    // build archive
    DwcaArchiveBuilder generator = new DwcaArchiveBuilder(datasetService, datasetUsageService,
      occurrenceDownloadService, sourceFs, targetFs, archiveDir, titleLookup, configuration, workflowConfiguration);
    generator.buildArchive(new File(tmpDir, configuration.getDownloadKey() + ".zip"));
  }

  private static String writeCitation(Writer citationWriter, Dataset dataset, UUID constituentId) throws IOException {
    // citation
    String citationLink = null;
    if (dataset.getCitation() != null && !Strings.isNullOrEmpty(dataset.getCitation().getText())) {
      citationWriter.write('\n' + dataset.getCitation().getText());
      if (!Strings.isNullOrEmpty(dataset.getCitation().getIdentifier())) {
        citationLink = ", " + dataset.getCitation().getIdentifier();
        citationWriter.write(citationLink);
      }
    } else {
      LOG.error(String.format("Constituent dataset misses mandatory citation for id: %s", constituentId));
    }
    if (dataset.getDoi() != null) {
      citationWriter.write(" " + dataset.getDoi());
    }
    return citationLink;
  }

  /**
   * Write rights text.
   */
  private static void writeRights(Writer rightsWriter, Dataset dataset, String citationLink) throws IOException {
    // write rights
    rightsWriter.write("\n\nDataset: " + dataset.getTitle());
    if (!Strings.isNullOrEmpty(citationLink)) {
      rightsWriter.write(citationLink);
    }
    rightsWriter.write("\nRights as supplied: ");
    if (dataset.getLicense() != null && dataset.getLicense().isConcrete()) {
      rightsWriter.write(dataset.getLicense().getLicenseUrl());
    } else {
      rightsWriter.write("Not supplied");
    }
  }

  @VisibleForTesting
  protected DwcaArchiveBuilder(DatasetService datasetService,
                               DatasetOccurrenceDownloadUsageService datasetUsageService,
                               OccurrenceDownloadService occurrenceDownloadService,
                               FileSystem sourceFs,
                               FileSystem targetFs,
                               File archiveDir,
                               TitleLookup titleLookup,
                               DownloadJobConfiguration configuration,
                               WorkflowConfiguration workflowConfiguration) {
    this.datasetService = datasetService;
    this.datasetUsageService = datasetUsageService;
    this.occurrenceDownloadService = occurrenceDownloadService;
    this.sourceFs = sourceFs;
    this.targetFs = targetFs;
    this.archiveDir = archiveDir;
    this.titleLookup = titleLookup;
    dataset = new Dataset();
    this.configuration = configuration;
    this.workflowConfiguration = workflowConfiguration;
  }
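  // Illustrative usage (a sketch, not part of this class's API): the download workflow is expected to call
  // one of the static buildArchive(...) entry points above once the Hive result files exist. The objects
  // shown here are assumed to be supplied by that workflow; only the final call is taken from this class.
  //
  //   DownloadJobConfiguration configuration = ...;  // download key, filter and result file locations
  //   RegistryClientUtil registryClientUtil = ...;   // pre-configured registry web service clients
  //   DwcaArchiveBuilder.buildArchive(configuration, registryClientUtil);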
  /**
   * Main method to assemble the dwc archive and do all the work until we have a final zip file.
   *
   * @param zipFile the final zip file holding the entire archive
   */
  public void buildArchive(File zipFile) throws DownloadException {
    LOG.info("Start building the archive {} ", zipFile.getPath());
    try {
      if (zipFile.exists()) {
        zipFile.delete();
      }
      if (!configuration.isSmallDownload()) {
        // oozie might try several times to run this job, so make sure our filesystem is clean
        cleanupFS();

        // create the temp archive dir
        archiveDir.mkdirs();
      }

      // metadata, citation and rights
      License downloadLicense = addConstituentMetadata();

      // persist the License assigned to the download
      persistDownloadLicense(configuration.getDownloadKey(), downloadLicense);

      // metadata about the entire archive data
      generateMetadata();

      // meta.xml
      DwcArchiveUtils.createArchiveDescriptor(archiveDir);

      // zip up
      LOG.info("Zipping archive {}", archiveDir.toString());
      CompressionUtil.zipDir(archiveDir, zipFile, true);

      // add the large download data files to the zip stream
      if (!configuration.isSmallDownload()) {
        appendPreCompressedFiles(zipFile);
      }
      targetFs.moveFromLocalFile(new Path(zipFile.getPath()),
                                 new Path(workflowConfiguration.getHdfsOutputPath(), zipFile.getName()));

    } catch (IOException e) {
      throw new DownloadException(e);

    } finally {
      // always cleanUp temp dir
      cleanupFS();
    }
  }

  public void createEmlFile(UUID constituentId, File emlDir) throws IOException {
    Closer closer = Closer.create();
    try {
      // store dataset EML as constituent metadata
      InputStream in = closer.register(datasetService.getMetadataDocument(constituentId));
      if (in != null) {
        // copy into archive, reading stream from registry services
        OutputStream out = closer.register(new FileOutputStream(new File(emlDir, constituentId + ".xml")));
        ByteStreams.copy(in, out);
      } else {
        LOG.error("Found no EML for datasetId {}", constituentId);
      }
    } catch (FileNotFoundException ex) {
      LOG.error("Error creating eml file", ex);
    } catch (IOException ex) {
      LOG.error("Error creating eml file", ex);
    } finally {
      closer.close();
    }
  }

  /**
   * Creates the dataset description.
   */
  @VisibleForTesting
  protected String getDatasetDescription() {
    StringBuilder description = new StringBuilder();
    // transform json filter into predicate instance and then into human readable string
    String humanQuery = configuration.getFilter();
    try {
      ObjectMapper mapper = new ObjectMapper();
      Predicate p = mapper.readValue(configuration.getFilter(), Predicate.class);
      humanQuery = new HumanFilterBuilder(titleLookup).humanFilterString(p);
    } catch (Exception e) {
      LOG.error("Failed to transform JSON query into human query: {}", configuration.getFilter(), e);
    }

    description.append(String.format(METADATA_DESC_HEADER_FMT, humanQuery));
    List<Constituent> byRecords = constituentsOrder.sortedCopy(constituents);
    for (Constituent c : byRecords) {
      description.append(c.records + " records from " + c.title + '\n');
    }
    return description.toString();
  }

  protected DataDescription createDataDescription() {
    // link back to archive
    DataDescription dataDescription = new DataDescription();
    dataDescription.setFormat(DATA_DESC_FORMAT);
    dataDescription.setCharset(Charsets.UTF_8.displayName());
    try {
      dataDescription.setUrl(new URI(workflowConfiguration.getDownloadLink(configuration.getDownloadKey())));
    } catch (URISyntaxException e) {
      LOG.error(String.format("Wrong url %s", workflowConfiguration.getDownloadLink(configuration.getDownloadKey())), e);
    }
    return dataDescription;
  }
  /**
   * Rewrites the zip file by opening the original and appending the pre-compressed content on the fly.
   */
  private void appendPreCompressedFiles(File zipFile) throws IOException {
    LOG.info("Appending pre-compressed occurrence content to the Zip: " + zipFile.getAbsolutePath());

    File tempZip = new File(archiveDir, zipFile.getName() + ".part");
    boolean renameOk = zipFile.renameTo(tempZip);
    if (renameOk) {
      try (
        ZipInputStream zin = new ZipInputStream(new FileInputStream(tempZip));
        ModalZipOutputStream out = new ModalZipOutputStream(new BufferedOutputStream(new FileOutputStream(zipFile)))
      ) {
        // copy existing entries
        ZipEntry entry = zin.getNextEntry();
        while (entry != null) {
          out.putNextEntry(new org.gbif.hadoop.compress.d2.zip.ZipEntry(entry.getName()),
                           ModalZipOutputStream.MODE.DEFAULT);
          ByteStreams.copy(zin, out);
          entry = zin.getNextEntry();
        }

        // NOTE: hive lowercases all the paths
        appendPreCompressedFile(out, new Path(configuration.getInterpretedDataFileName()), INTERPRETED_FILENAME,
                                HeadersFileUtil.getInterpretedTableHeader());
        appendPreCompressedFile(out, new Path(configuration.getVerbatimDataFileName()), VERBATIM_FILENAME,
                                HeadersFileUtil.getVerbatimTableHeader());
        appendPreCompressedFile(out, new Path(configuration.getMultimediaDataFileName()), MULTIMEDIA_FILENAME,
                                HeadersFileUtil.getMultimediaTableHeader());

      } finally {
        // we've rewritten so remove the original
        if (tempZip != null) {
          tempZip.delete();
        }
      }
    } else {
      throw new IllegalStateException("Unable to rename existing zip, to allow appending occurrence data");
    }
  }

  /**
   * Appends the compressed files found within the directory to the zip stream as the named file.
   */
  private void appendPreCompressedFile(ModalZipOutputStream out, Path dir, String filename, String headerRow)
    throws IOException {
    RemoteIterator<LocatedFileStatus> files = sourceFs.listFiles(dir, false);
    List<InputStream> parts = Lists.newArrayList();

    // Add the header first, which must also be compressed
    ByteArrayOutputStream header = new ByteArrayOutputStream();
    D2Utils.compress(new ByteArrayInputStream(headerRow.getBytes()), header);
    parts.add(new ByteArrayInputStream(header.toByteArray()));

    // Locate the streams to the compressed content on HDFS
    while (files.hasNext()) {
      LocatedFileStatus fs = files.next();
      Path path = fs.getPath();
      if (path.toString().endsWith(D2Utils.FILE_EXTENSION)) {
        LOG.info("Deflated content to merge: " + path);
        parts.add(sourceFs.open(path));
      }
    }

    // create the Zip entry, and write the compressed bytes
    org.gbif.hadoop.compress.d2.zip.ZipEntry ze = new org.gbif.hadoop.compress.d2.zip.ZipEntry(filename);
    out.putNextEntry(ze, ModalZipOutputStream.MODE.PRE_DEFLATED);
    try (D2CombineInputStream in = new D2CombineInputStream(parts)) {
      ByteStreams.copy(in, out);
      in.close(); // important so counts are accurate
      ze.setSize(in.getUncompressedLength()); // important to set the sizes and CRC
      ze.setCompressedSize(in.getCompressedLength());
      ze.setCrc(in.getCrc32());
    } finally {
      out.closeEntry();
    }
  }
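  // Sketch of the pre-deflated append performed above (the file names are illustrative): each Hive result
  // directory holds one or more deflated part files ending in D2Utils.FILE_EXTENSION, e.g.
  //
  //   <hive-result-dir>/part-00000<D2Utils.FILE_EXTENSION>
  //   <hive-result-dir>/part-00001<D2Utils.FILE_EXTENSION>
  //
  // D2CombineInputStream concatenates these parts (plus the compressed header row) so that the merged bytes
  // can be written into a single zip entry in MODE.PRE_DEFLATED, i.e. without inflating and re-deflating the
  // data; the combined uncompressed size, compressed size and CRC-32 are then set on the entry.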
  /**
   * Adds an eml file per dataset involved into a subfolder "dataset" which is supported by our dwc archive reader.
   * Creates a rights.txt and citation.txt file targeted at humans to quickly yield an overview about rights and
   * datasets involved.
   * This method returns the License that must be assigned to the occurrence download file.
   */
  private License addConstituentMetadata() throws IOException {
    Path citationSrc = new Path(configuration.getCitationDataFileName());

    LOG.info("Adding constituent dataset metadata to archive, based on: {}", citationSrc);

    // now read the dataset citation table and create an EML file per datasetId
    // first copy from HDFS to local file
    if (!sourceFs.exists(citationSrc)) {
      LOG.warn("No citation file directory existing on HDFS, skip creating of dataset metadata {}", citationSrc);
      return licenseSelector.getSelectedLicense();
    }

    Map<UUID, Integer> srcDatasets = readDatasetCounts(citationSrc);

    File emlDir = new File(archiveDir, "dataset");
    if (!srcDatasets.isEmpty()) {
      emlDir.mkdir();
    }
    Closer closer = Closer.create();

    Writer rightsWriter = closer.register(FileUtils.startNewUtf8File(new File(archiveDir, RIGHTS_FILENAME)));
    Writer citationWriter = closer.register(FileUtils.startNewUtf8File(new File(archiveDir, CITATIONS_FILENAME)));

    // write fixed citations header
    citationWriter.write(CITATION_HEADER);

    // now iterate over constituent UUIDs
    for (Entry<UUID, Integer> dsEntry : srcDatasets.entrySet()) {
      UUID constituentId = dsEntry.getKey();
      LOG.info("Processing constituent dataset: {}", constituentId);
      // catch errors for each uuid to make sure one broken dataset does not bring down the entire process
      try {
        Dataset srcDataset = datasetService.get(constituentId);
        licenseSelector.collectLicense(srcDataset.getLicense());

        // citation
        String citationLink = writeCitation(citationWriter, srcDataset, constituentId);
        // rights
        writeRights(rightsWriter, srcDataset, citationLink);
        // eml file
        createEmlFile(constituentId, emlDir);

        // add as constituent for later
        constituents.add(new Constituent(srcDataset.getTitle(), dsEntry.getValue()));

        // add original author as content provider to main dataset description
        Contact provider = DwcaContactsUtil.getContentProviderContact(srcDataset);
        if (provider != null) {
          dataset.getContacts().add(provider);
        }
      } catch (UniformInterfaceException e) {
        LOG.error(String.format("Registry client http exception: %d \n %s",
                                e.getResponse().getStatus(), e.getResponse().getEntity(String.class)), e);
      } catch (Exception e) {
        LOG.error("Error creating download file", e);
        return licenseSelector.getSelectedLicense();
      }
    }
    closer.close();

    return licenseSelector.getSelectedLicense();
  }
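  // For reference, each constituent dataset contributes one entry to the two human-readable files written
  // above; the layout below is derived from writeCitation/writeRights, with placeholder values in angle
  // brackets (optional pieces may be absent):
  //
  //   CITATIONS_FILENAME:  <citation text>, <citation identifier> <DOI>
  //   RIGHTS_FILENAME:     Dataset: <dataset title>, <citation identifier>
  //                        Rights as supplied: <license URL, or "Not supplied">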
  /**
   * Creates a single EML metadata file for the entire archive.
   * Make sure we execute this method AFTER building the constituents metadata which adds to our dataset instance.
   */
  private void generateMetadata() {
    LOG.info("Add query dataset metadata to archive");
    try {
      // a random UUID is used below because the downloadKey is not a string in UUID format
      Download download = occurrenceDownloadService.get(configuration.getDownloadKey());

      String downloadUniqueID = configuration.getDownloadKey();
      if (download.getDoi() != null) {
        downloadUniqueID = download.getDoi().getDoiName();
        dataset.setDoi(download.getDoi());
        Identifier identifier = new Identifier();
        identifier.setCreated(download.getCreated());
        identifier.setIdentifier(configuration.getDownloadKey());
        identifier.setType(IdentifierType.GBIF_PORTAL);
        dataset.setIdentifiers(Lists.newArrayList(identifier));
      }

      dataset.setKey(UUID.randomUUID());
      dataset.setTitle(String.format(DATASET_TITLE_FMT, downloadUniqueID));
      dataset.setDescription(getDatasetDescription());
      dataset.setCreated(download.getCreated());
      Citation citation = new Citation(String.format(DATASET_TITLE_FMT, downloadUniqueID), downloadUniqueID);
      dataset.setCitation(citation);
      // can we derive a link from the query to set the dataset.homepage?
      dataset.setPubDate(download.getCreated());
      dataset.setDataLanguage(Language.ENGLISH);
      dataset.setType(DatasetType.OCCURRENCE);
      dataset.getDataDescriptions().add(createDataDescription());
      //TODO: use new license field once available
      if (download.getLicense().isConcrete()) {
        dataset.setRights(String.format(RIGHTS, download.getLicense().getLicenseTitle(),
                                        download.getLicense().getLicenseUrl()));
      }
      dataset.getContacts()
        .add(DwcaContactsUtil.createContact(DOWNLOAD_CONTACT_SERVICE, DOWNLOAD_CONTACT_EMAIL,
                                            ContactType.ORIGINATOR, true));
      dataset.getContacts()
        .add(DwcaContactsUtil.createContact(DOWNLOAD_CONTACT_SERVICE, DOWNLOAD_CONTACT_EMAIL,
                                            ContactType.ADMINISTRATIVE_POINT_OF_CONTACT, true));
      dataset.getContacts()
        .add(DwcaContactsUtil.createContact(DOWNLOAD_CONTACT_SERVICE, DOWNLOAD_CONTACT_EMAIL,
                                            ContactType.METADATA_AUTHOR, true));

      File eml = new File(archiveDir, METADATA_FILENAME);
      try (Writer writer = FileUtils.startNewUtf8File(eml)) {
        EML_WRITER.writeTo(dataset, writer);
      }

    } catch (Exception e) {
      LOG.error("Failed to write query result dataset EML file", e);
    }
  }

  /**
   * Removes all temporary file system artifacts but the final zip archive.
   */
  private void cleanupFS() throws DownloadException {
    LOG.info("Cleaning up archive directory {}", archiveDir.getPath());
    if (archiveDir.exists()) {
      FileUtils.deleteDirectoryRecursively(archiveDir);
    }
  }

  /**
   * Persists the dataset usage information and swallows any exception to avoid an error during the file building.
   */
  private void persistDatasetUsage(Integer count, String downloadKey, UUID datasetKey) {
    try {
      Dataset dataset = datasetService.get(datasetKey);
      if (dataset != null) { // the dataset still exists
        DatasetOccurrenceDownloadUsage datasetUsage = new DatasetOccurrenceDownloadUsage();
        datasetUsage.setDatasetKey(datasetKey);
        datasetUsage.setNumberRecords(count);
        datasetUsage.setDownloadKey(downloadKey);
        datasetUsage.setDatasetDOI(dataset.getDoi());
        if (dataset.getCitation() != null && dataset.getCitation().getText() != null) {
          datasetUsage.setDatasetCitation(dataset.getCitation().getText());
        }
        datasetUsage.setDatasetTitle(dataset.getTitle());
        datasetUsageService.create(datasetUsage);
      }
    } catch (Exception e) {
      LOG.error("Error persisting dataset usage information, downloadKey: {}, datasetKey: {}",
                downloadKey, datasetKey, e);
    }
  }
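  // The dataset usages persisted above are derived from the Hive citation table, whose rows are read by
  // readDatasetCounts() below. Each non-empty line is expected to be tab-separated, a dataset UUID followed
  // by its record count; the values in this example are made up:
  //
  //   11111111-2222-3333-4444-555555555555<TAB>123456
  //   aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee<TAB>42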
  /**
   * Persists the download license that was assigned to the occurrence download.
   *
   * @param downloadKey key of the occurrence download to update
   * @param license     license to assign to the download
   */
  private void persistDownloadLicense(String downloadKey, License license) {
    try {
      Download download = occurrenceDownloadService.get(downloadKey);
      download.setLicense(license);
      occurrenceDownloadService.update(download);
    } catch (Exception ex) {
      LOG.error("Error updating download license, downloadKey: {}, license: {}", downloadKey, license, ex);
    }
  }

  /**
   * Creates a Map of dataset UUIDs to their record counts.
   */
  private Map<UUID, Integer> readDatasetCounts(Path citationSrc) throws IOException {
    // the hive query result is a directory with one or more files - read them all into a uuid set
    Map<UUID, Integer> srcDatasets = Maps.newHashMap(); // map of uuids to occurrence counts
    FileStatus[] citFiles = sourceFs.listStatus(citationSrc);
    int invalidUuids = 0;
    Closer closer = Closer.create();
    for (FileStatus fs : citFiles) {
      if (!fs.isDirectory()) {
        BufferedReader citationReader =
          new BufferedReader(new InputStreamReader(sourceFs.open(fs.getPath()), Charsets.UTF_8));
        closer.register(citationReader);
        try {
          String line = citationReader.readLine();
          while (line != null) {
            if (!Strings.isNullOrEmpty(line)) {
              // we also catch errors for every dataset so we don't break the loop
              try {
                Iterator<String> iter = TAB_SPLITTER.split(line).iterator();
                // play safe and make sure we got a uuid - even though our api doesn't require it
                UUID key = UUID.fromString(iter.next());
                Integer count = Integer.parseInt(iter.next());
                srcDatasets.put(key, count);
                // small downloads persist dataset usages while building the citations file
                if (!configuration.isSmallDownload()) {
                  persistDatasetUsage(count, configuration.getDownloadKey(), key);
                }
              } catch (IllegalArgumentException e) {
                // ignore invalid UUIDs
                LOG.info("Found invalid UUID as datasetId {}", line);
                invalidUuids++;
              }
            }
            line = citationReader.readLine();
          }
        } finally {
          closer.close();
        }
      }
    }

    if (invalidUuids > 0) {
      LOG.info("Found {} invalid dataset UUIDs", invalidUuids);
    } else {
      LOG.info("All {} dataset UUIDs are valid", srcDatasets.size());
    }
    return srcDatasets;
  }

  /**
   * Removes the file .occurrence.txt.crc that is created by the function FileUtil.copyMerge.
   * This method is a temporary change to fix the issue http://dev.gbif.org/issues/browse/OCC-306.
   */
  private void removeDataCRCFile(String destFileName) {
    File occCRCDataFile = new File(archiveDir, String.format(CRC_FILE_FMT, destFileName));
    if (occCRCDataFile.exists()) {
      occCRCDataFile.delete();
    }
  }

  /**
   * Simple, local representation for a constituent dataset.
   */
  static class Constituent {

    private final String title;
    private final int records;

    Constituent(String title, int records) {
      this.title = title;
      this.records = records;
    }
  }
}