/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.publisher;
import java.io.Closeable;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.Collection;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.State;
import gobblin.configuration.WorkUnitState;
/**
 * Base contract for publishing task output data together with its metadata. A publisher may be used for either
 * task level or job level publishing.
 *
 * <p>
 * Concrete publishers receive their configuration as a {@link State} through the constructor and must implement
 * {@link #publishData(Collection)}, {@link #publishMetadata(Collection)} and {@link #close()}.
 * </p>
 */
public abstract class DataPublisher implements Closeable {

  /** Configuration handed to this publisher at construction time. */
  protected final State state;

  /**
   * Creates a publisher backed by the given configuration.
   *
   * @param state the {@link State} stored by this publisher and exposed through {@link #getState()}.
   */
  public DataPublisher(State state) {
    this.state = state;
  }

  /**
   * Legacy initialization hook.
   *
   * @throws IOException if initialization fails.
   * @deprecated {@link DataPublisher} initialization should be done in the constructor.
   */
  @Deprecated
  public abstract void initialize()
      throws IOException;

  /**
   * Publish the data for the given tasks.
   *
   * @param states is a {@link Collection} of {@link WorkUnitState}s whose data should be published.
   * @throws IOException if there is a problem with publishing the data.
   */
  public abstract void publishData(Collection<? extends WorkUnitState> states)
      throws IOException;

  /**
   * Publish the metadata (e.g., schema) for the given tasks. Checkpoints should not be published as part of
   * metadata; they are published by Gobblin runtime after the metadata and data are published.
   *
   * @param states is a {@link Collection} of {@link WorkUnitState}s whose metadata should be published.
   * @throws IOException if there is a problem with publishing the metadata.
   */
  public abstract void publishMetadata(Collection<? extends WorkUnitState> states)
      throws IOException;

  /**
   * Publish both the metadata and the output data for the given tasks.
   *
   * <p>
   * The order is controlled by {@link #shouldPublishMetadataFirst()}: when it returns {@code true},
   * {@link #publishMetadata(Collection)} runs before {@link #publishData(Collection)}; otherwise the data is
   * published first and the metadata second.
   * </p>
   *
   * @param states is a {@link Collection} of {@link WorkUnitState}s.
   * @throws IOException if there is a problem with publishing the metadata or the data.
   */
  public void publish(Collection<? extends WorkUnitState> states)
      throws IOException {
    // Handle the data-first publishers up front and leave early.
    if (!shouldPublishMetadataFirst()) {
      publishData(states);
      publishMetadata(states);
      return;
    }

    publishMetadata(states);
    publishData(states);
  }

  /**
   * @return the {@link State} this publisher was constructed with.
   */
  public State getState() {
    return state;
  }

  /**
   * Reflectively create a {@link DataPublisher} through its public single-argument {@link State} constructor.
   *
   * @param dataPublisherClass A concrete class that extends {@link DataPublisher}.
   * @param state A {@link State} used to instantiate the {@link DataPublisher}.
   * @return A {@link DataPublisher} instance.
   * @throws ReflectiveOperationException if the constructor cannot be found or invoking it fails.
   */
  public static DataPublisher getInstance(Class<? extends DataPublisher> dataPublisherClass, State state)
      throws ReflectiveOperationException {
    final Constructor<? extends DataPublisher> stateConstructor = dataPublisherClass.getConstructor(State.class);
    return stateConstructor.newInstance(state);
  }

  /**
   * Returns true if the implementation of {@link DataPublisher} is thread-safe.
   *
   * <p>
   * For a thread-safe {@link DataPublisher}, this method should return
   * {@code this.getClass() == <class>.class} to ensure that any extensions must explicitly be marked as thread
   * safe. Because {@link DataPublisher} itself is abstract, this default evaluates to {@code false} for every
   * concrete subclass that does not override it.
   * </p>
   */
  public boolean isThreadSafe() {
    return DataPublisher.class == getClass();
  }

  /**
   * Return true if the current publisher can be skipped.
   *
   * <p>
   * The answer is read from {@link ConfigurationKeys#DATA_PUBLISHER_CAN_BE_SKIPPED} in this publisher's
   * {@link State}, falling back to {@link ConfigurationKeys#DEFAULT_DATA_PUBLISHER_CAN_BE_SKIPPED}.
   * For a publisher that can be skipped, it should not have any effect on state persistence. It will be skipped
   * when a job is cancelled, and all finished tasks are configured to be committed.
   * </p>
   */
  public boolean canBeSkipped() {
    final boolean skippable = state.getPropAsBoolean(
        ConfigurationKeys.DATA_PUBLISHER_CAN_BE_SKIPPED,
        ConfigurationKeys.DEFAULT_DATA_PUBLISHER_CAN_BE_SKIPPED);
    return skippable;
  }

  /**
   * Generally metadata should be published before the data it represents, but this allows subclasses to override
   * if they are dependent on data getting published first.
   *
   * @return {@code true} by default, meaning {@link #publish(Collection)} publishes metadata before data.
   */
  protected boolean shouldPublishMetadataFirst() {
    return true;
  }
}