/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.configuration;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import gobblin.source.workunit.Extract;
import gobblin.source.workunit.WorkUnit;
import lombok.Getter;
/**
* A container for all metadata related to a particular source. This includes all properties
* defined in job configuration files and all properties from tasks of the previous run.
*
* <p>
* Properties can be overwritten at runtime and persisted upon job completion. Persisted
* properties will be loaded in the next run and made available for use by the
* {@link gobblin.source.Source}.
* </p>
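*
* <p>
* A minimal, hypothetical sketch of how a {@link gobblin.source.Source} implementation might use a
* {@code SourceState}; the property keys, namespace, and table name below are illustrative only:
* </p>
* <pre>{@code
* public List<WorkUnit> getWorkunits(SourceState state) {
*   // Job configuration properties are available directly on the state.
*   String table = state.getProp("my.source.table", "my_table");
*   // Work unit states persisted by the previous run (empty on the first run).
*   long watermark = 0L;
*   for (WorkUnitState previous : state.getPreviousWorkUnitStates()) {
*     watermark = Math.max(watermark, previous.getPropAsLong("my.source.watermark", 0L));
*   }
*   // (see the deprecation note on createExtract for the preferred variant)
*   WorkUnit workUnit = WorkUnit.create(
*       state.createExtract(Extract.TableType.SNAPSHOT_ONLY, "my_namespace", table));
*   workUnit.setProp("my.source.watermark", watermark);
*   return Lists.newArrayList(workUnit);
* }
* }</pre>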
*
* @author kgoodhop
*/
public class SourceState extends State {
private static final Set<Extract> EXTRACT_SET = Sets.newConcurrentHashSet();
private static final DateTimeFormatter DTF =
DateTimeFormat.forPattern("yyyyMMddHHmmss").withLocale(Locale.US).withZone(DateTimeZone.UTC);
@Getter
private final Map<String, SourceState> previousDatasetStatesByUrns;
@Getter
private final List<WorkUnitState> previousWorkUnitStates = Lists.newArrayList();
/**
* Default constructor.
*/
public SourceState() {
this.previousDatasetStatesByUrns = ImmutableMap.of();
}
/**
* Constructor.
*
* @param properties job configuration properties
* @param previousWorkUnitStates an {@link Iterable} of {@link WorkUnitState}s of the previous job run
*/
public SourceState(State properties, Iterable<WorkUnitState> previousWorkUnitStates) {
super.addAll(properties);
this.previousDatasetStatesByUrns = ImmutableMap.of();
for (WorkUnitState workUnitState : previousWorkUnitStates) {
this.previousWorkUnitStates.add(new ImmutableWorkUnitState(workUnitState));
}
}
/**
* Constructor.
*
* @param properties job configuration properties
* @param previousDatasetStatesByUrns a {@link Map} from dataset URNs to the dataset states
*        (as {@link SourceState}s) of the previous job run
* @param previousWorkUnitStates an {@link Iterable} of {@link WorkUnitState}s of the previous job run
*/
public SourceState(State properties, Map<String, ? extends SourceState> previousDatasetStatesByUrns,
Iterable<WorkUnitState> previousWorkUnitStates) {
super.addAll(properties.getProperties());
this.previousDatasetStatesByUrns = ImmutableMap.copyOf(previousDatasetStatesByUrns);
for (WorkUnitState workUnitState : previousWorkUnitStates) {
this.previousWorkUnitStates.add(new ImmutableWorkUnitState(workUnitState));
}
}
/**
* Get the {@link SourceState} of the previous job run.
*
* <p>
* This is a convenience method for existing jobs that do not use the new feature that allows output data to
* be committed on a per-dataset basis. Use of this method assumes that the job deals with a single dataset,
* which uses the default dataset URN defined by {@link ConfigurationKeys#DEFAULT_DATASET_URN}.
* </p>
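*
* <p>
* A hypothetical example (the property key below is illustrative only):
* </p>
* <pre>{@code
* SourceState previous = state.getPreviousSourceState();
* if (previous != null) {
*   long lastExtractTime = previous.getPropAsLong("my.source.last.extract.time", 0L);
* }
* }</pre>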
*
* @return {@link SourceState} of the previous job run or {@code null} if no previous {@link SourceState} is found
*/
public SourceState getPreviousSourceState() {
return getPreviousDatasetState(ConfigurationKeys.DEFAULT_DATASET_URN);
}
/**
* Get the state (in the form of a {@link SourceState}) of a dataset identified by a dataset URN
* of the previous job run.
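*
* <p>
* A hypothetical lookup; the dataset URN and property key below are illustrative only:
* </p>
* <pre>{@code
* SourceState datasetState = state.getPreviousDatasetState("urn:my:dataset");
* if (datasetState != null) {
*   long recordsWritten = datasetState.getPropAsLong("my.dataset.records.written", 0L);
* }
* }</pre>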
*
* @param datasetUrn the dataset URN
* @return the dataset state (in the form of a {@link SourceState}) of the previous job run
* or {@code null} if no previous dataset state is found for the given dataset URN
*/
public SourceState getPreviousDatasetState(String datasetUrn) {
if (!this.previousDatasetStatesByUrns.containsKey(datasetUrn)) {
return null;
}
return new ImmutableSourceState(this.previousDatasetStatesByUrns.get(datasetUrn));
}
/**
* Get a {@link Map} from dataset URNs (as specified by {@link ConfigurationKeys#DATASET_URN_KEY})
* to the {@link WorkUnitState}s with those dataset URNs.
*
* <p>
* {@link WorkUnitState}s that do not have {@link ConfigurationKeys#DATASET_URN_KEY} set will be added
* to the dataset state belonging to {@link ConfigurationKeys#DEFAULT_DATASET_URN}.
* </p>
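*
* <p>
* A minimal sketch of iterating over the returned map (variable names are illustrative):
* </p>
* <pre>{@code
* for (Map.Entry<String, Iterable<WorkUnitState>> entry
*     : state.getPreviousWorkUnitStatesByDatasetUrns().entrySet()) {
*   String datasetUrn = entry.getKey();
*   for (WorkUnitState workUnitState : entry.getValue()) {
*     // Process the previous work unit states of this dataset, e.g., to compute per-dataset watermarks.
*   }
* }
* }</pre>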
*
* @return a {@link Map} from dataset URNs to the {@link WorkUnitState}s with those dataset URNs
*/
public Map<String, Iterable<WorkUnitState>> getPreviousWorkUnitStatesByDatasetUrns() {
Map<String, Iterable<WorkUnitState>> previousWorkUnitStatesByDatasetUrns = Maps.newHashMap();
for (WorkUnitState workUnitState : this.previousWorkUnitStates) {
String datasetUrn =
workUnitState.getProp(ConfigurationKeys.DATASET_URN_KEY, ConfigurationKeys.DEFAULT_DATASET_URN);
if (!previousWorkUnitStatesByDatasetUrns.containsKey(datasetUrn)) {
previousWorkUnitStatesByDatasetUrns.put(datasetUrn, Lists.<WorkUnitState> newArrayList());
}
((List<WorkUnitState>) previousWorkUnitStatesByDatasetUrns.get(datasetUrn)).add(workUnitState);
}
return ImmutableMap.copyOf(previousWorkUnitStatesByDatasetUrns);
}
/**
* Create a new properly populated {@link Extract} instance.
*
* <p>
* This method should always return a new unique {@link Extract} instance.
* </p>
*
* @param type {@link gobblin.source.workunit.Extract.TableType}
* @param namespace namespace of the table this extract belongs to
* @param table name of the table this extract belongs to
* @return a new unique {@link Extract} instance
*
* @deprecated Use {@link gobblin.source.extractor.extract.AbstractSource#createExtract(
* gobblin.source.workunit.Extract.TableType, String, String)}
*/
@Deprecated
public synchronized Extract createExtract(Extract.TableType type, String namespace, String table) {
Extract extract = new Extract(this, type, namespace, table);
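// If an equal Extract has already been created by this class, assign it an extract ID (a UTC timestamp
// in yyyyMMddHHmmss format) and keep advancing the ID by one second until the Extract becomes unique.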
while (EXTRACT_SET.contains(extract)) {
if (Strings.isNullOrEmpty(extract.getExtractId())) {
extract.setExtractId(DTF.print(new DateTime()));
} else {
DateTime extractDateTime = DTF.parseDateTime(extract.getExtractId());
extract.setExtractId(DTF.print(extractDateTime.plusSeconds(1)));
}
}
EXTRACT_SET.add(extract);
return extract;
}
/**
* Create a new {@link WorkUnit} instance from a given {@link Extract}.
*
* @param extract given {@link Extract}
* @return a new {@link WorkUnit} instance
*
* @deprecated Properties in SourceState should not be added to a WorkUnit. Having each WorkUnit contain a copy of
* SourceState is a waste of memory. Use {@link WorkUnit#create(Extract)}.
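* For example, a minimal replacement (assuming {@code extract} is an {@link Extract} obtained elsewhere):
* <pre>{@code
* WorkUnit workUnit = WorkUnit.create(extract);
* }</pre>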
*/
@Deprecated
public WorkUnit createWorkUnit(Extract extract) {
return new WorkUnit(this, extract);
}
@Override
public void write(DataOutput out) throws IOException {
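// Serialization layout: the number of previous WorkUnitStates, followed by each WorkUnitState,
// followed by the base State.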
out.writeInt(this.previousWorkUnitStates.size());
for (WorkUnitState state : this.previousWorkUnitStates) {
state.write(out);
}
super.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
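// Read back in the order written by write(): the count, each WorkUnitState (stored as immutable), then the base State.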
int size = in.readInt();
for (int i = 0; i < size; i++) {
WorkUnitState workUnitState = new WorkUnitState();
workUnitState.readFields(in);
this.previousWorkUnitStates.add(new ImmutableWorkUnitState(workUnitState));
}
super.readFields(in);
}
@Override
public boolean equals(Object object) {
if (!(object instanceof SourceState)) {
return false;
}
SourceState other = (SourceState) object;
return super.equals(other) && this.previousDatasetStatesByUrns.equals(other.previousDatasetStatesByUrns)
&& this.previousWorkUnitStates.equals(other.previousWorkUnitStates);
}
@Override
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
result = prime * result + this.previousDatasetStatesByUrns.hashCode();
result = prime * result + this.previousWorkUnitStates.hashCode();
return result;
}
/**
* An immutable version of {@link SourceState} that disables all methods that may change the
* internal state of a {@link SourceState}.
*/
private static class ImmutableSourceState extends SourceState {
public ImmutableSourceState(SourceState sourceState) {
super(sourceState, sourceState.previousDatasetStatesByUrns, sourceState.previousWorkUnitStates);
}
@Override
public void readFields(DataInput in) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void setId(String id) {
throw new UnsupportedOperationException();
}
@Override
public void setProp(String key, Object value) {
throw new UnsupportedOperationException();
}
@Override
public synchronized void appendToListProp(String key, String value) {
throw new UnsupportedOperationException();
}
@Override
public void addAll(State otherState) {
throw new UnsupportedOperationException();
}
@Override
public void addAll(Properties properties) {
throw new UnsupportedOperationException();
}
}
}