/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.util.dataset;
import java.util.Map;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import gobblin.configuration.State;
import gobblin.configuration.StateUtils;
import gobblin.source.workunit.WorkUnit;
public class DatasetUtils {
private static final Logger LOG = LoggerFactory.getLogger(DatasetUtils.class);
public static final String DATASET = "dataset";
/**
* A configuration key that allows a user to specify config parameters on a dataset specific level. The value of this
* config should be a JSON array. Each entry should be a {@link JsonObject} and should contain a
* {@link com.google.gson.JsonPrimitive} that identifies the dataset. All configs in each dataset entry will
* be added to the {@link WorkUnit}s for that dataset.
*
* <p>
* An example value could be: "[{"dataset" : "myDataset1", "writer.partition.columns" : "header.memberId"},
* {"dataset" : "myDataset2", "writer.partition.columns" : "auditHeader.time"}]".
* </p>
*
* <p>
* The "dataset" field also allows regular expressions. For example, one can specify key, value
* "dataset" : "myDataset.*". In this case all datasets whose name matches the pattern "myDataset.*" will have
* all the specified config properties added to their {@link WorkUnit}s. If more a dataset matches multiple
* "dataset"s then the properties from all the {@link JsonObject}s will be added to their {@link WorkUnit}s.
* </p>
*/
public static final String DATASET_SPECIFIC_PROPS = DATASET + ".specific.props";
/**
* For backward compatibility.
*/
private static final String KAFKA_TOPIC_SPECIFIC_STATE = "kafka.topic.specific.state";
private DatasetUtils() {}
/**
* Given a {@link Iterable} of dataset identifiers (e.g., name, URN, etc.), return a {@link Map} that links each
* dataset with the extra configuration information specified in the state via {@link #DATASET_SPECIFIC_PROPS}.
*/
public static Map<String, State> getDatasetSpecificProps(Iterable<String> datasets, State state) {
if (!Strings.isNullOrEmpty(state.getProp(DATASET_SPECIFIC_PROPS))
|| !Strings.isNullOrEmpty(state.getProp(KAFKA_TOPIC_SPECIFIC_STATE))) {
Map<String, State> datasetSpecificConfigMap = Maps.newHashMap();
JsonArray array = !Strings.isNullOrEmpty(state.getProp(DATASET_SPECIFIC_PROPS))
? state.getPropAsJsonArray(DATASET_SPECIFIC_PROPS) : state.getPropAsJsonArray(KAFKA_TOPIC_SPECIFIC_STATE);
// Iterate over the entire JsonArray specified by the config key
for (JsonElement datasetElement : array) {
// Check that each entry in the JsonArray is a JsonObject
Preconditions.checkArgument(datasetElement.isJsonObject(),
"The value for property " + DATASET_SPECIFIC_PROPS + " is malformed");
JsonObject object = datasetElement.getAsJsonObject();
// Only process JsonObjects that have a dataset identifier
if (object.has(DATASET)) {
JsonElement datasetNameElement = object.get(DATASET);
Preconditions.checkArgument(datasetNameElement.isJsonPrimitive(), "The value for property "
+ DATASET_SPECIFIC_PROPS + " is malformed, the " + DATASET + " field must be a string");
// Iterate through each dataset that matches the value of the JsonObjects DATASET field
for (String dataset : Iterables.filter(datasets, new DatasetPredicate(datasetNameElement.getAsString()))) {
// If an entry already exists for a dataset, add it to the current state, else create a new state
if (datasetSpecificConfigMap.containsKey(dataset)) {
datasetSpecificConfigMap.get(dataset).addAll(StateUtils.jsonObjectToState(object, DATASET));
} else {
datasetSpecificConfigMap.put(dataset, StateUtils.jsonObjectToState(object, DATASET));
}
}
} else {
LOG.warn("Skipping JsonElement " + datasetElement + " as it is does not contain a field with key " + DATASET);
}
}
return datasetSpecificConfigMap;
}
return Maps.newHashMap();
}
/**
* Implementation of {@link Predicate} that takes in a dataset regex via its constructor. It returns true in the
* {@link #apply(String)} method only if the dataset regex matches the specified dataset identifier.
*/
private static class DatasetPredicate implements Predicate<String> {
private final Pattern datasetPattern;
private DatasetPredicate(String datasetRegex) {
this.datasetPattern = Pattern.compile(datasetRegex, Pattern.CASE_INSENSITIVE);
}
@Override
public boolean apply(String input) {
return this.datasetPattern.matcher(input).matches();
}
}
}