/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.source;
import gobblin.configuration.SourceState;
import gobblin.configuration.WorkUnitState;
import java.io.IOException;
import java.util.List;
import gobblin.source.extractor.Extractor;
import gobblin.source.workunit.WorkUnit;
/**
* An interface for classes that the end users implement to work with a data source from which
* schema and data records can be extracted.
*
* <p>
* An implementation of this interface should contain all the logic required to work with a
* specific data source. This usually includes work determination and partitioning, and details
* of the connection protocol to work with the data source.
* </p>
*
* @author kgoodhop
*
* @param <S> output schema type
* @param <D> output record type
*/
public interface Source<S, D> {

  /**
   * Produces the {@link WorkUnit}s for the current run; each one covers a portion of the data
   * to be extracted.
   *
   * <p>
   *   Every returned {@link WorkUnit} is used to instantiate a {@link WorkUnitState}, which is
   *   then handed to {@link #getExtractor(WorkUnitState)} to obtain the {@link Extractor} that
   *   pulls schema and data records from the source. A {@link WorkUnit} should therefore carry
   *   every property its {@link Extractor} needs in order to work.
   * </p>
   *
   * <p>
   *   To extract data incrementally, implementations typically derive the current list of
   *   {@link WorkUnit}s from those of the previous run, which can be looked up through
   *   {@link SourceState#getPreviousWorkUnitStates}.
   * </p>
   *
   * @param state see {@link SourceState}
   * @return a list of {@link WorkUnit}s
   */
  List<WorkUnit> getWorkunits(SourceState state);

  /**
   * Creates the {@link Extractor} for a given {@link WorkUnitState}.
   *
   * <p>
   *   The returned {@link Extractor} may store arbitrary key-value pairs in the
   *   {@link WorkUnitState}; these are persisted to the state store and loaded again in the
   *   next scheduled job run.
   * </p>
   *
   * @param state a {@link WorkUnitState} carrying the properties needed by the returned
   *              {@link Extractor}
   * @return an {@link Extractor} used to extract schema and data records from the data source
   * @throws IOException if it fails to create an {@link Extractor}
   */
  Extractor<S, D> getExtractor(WorkUnitState state) throws IOException;

  /**
   * Shuts down this {@link Source} instance.
   *
   * <p>
   *   Called once when the job completes. Any properties (key-value pairs) added to the given
   *   {@link SourceState} are persisted and made available to the next scheduled job run
   *   through {@link #getWorkunits(SourceState)}. An implementation that requires neither
   *   cleanup nor reporting may simply provide a default implementation of this method.
   * </p>
   *
   * @param state see {@link SourceState}
   */
  void shutdown(SourceState state);
}