/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.data.management.copy.replication;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import com.google.common.base.Optional;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigRenderOptions;
import gobblin.configuration.ConfigurationKeys;
import gobblin.data.management.copy.CopyConfiguration;
import lombok.extern.slf4j.Slf4j;
/**
 * Generates the {@link ConfigBasedDataset}s for a single dataset configuration given in {@link Config} format.
 *
 * <p>One configuration can yield several {@link ConfigBasedDataset}s. For example, in pull mode, if two replicas
 * (say replica1 and replica2) live on the same copy-to cluster, two {@link ConfigBasedDataset}s are generated: one
 * for copying from the copy-from {@link EndPoint} to replica1, the other for copying from the copy-from
 * {@link EndPoint} to replica2.
 *
 * <p>Datasets are generated once, in the constructor. If the replication configuration cannot be built or the
 * execution cluster cannot be determined, the error is logged and the resulting dataset list is empty.
 *
 * @author mitu
 */
@Slf4j
public class ConfigBasedMultiDatasets {

  private final Properties props;
  private final List<ConfigBasedDataset> datasets = new ArrayList<>();

  /**
   * Job-level property that switches dataset generation to push mode. When it is set to {@code true}, data is only
   * replicated when
   * <ol>
   *   <li>the dataset is not configured for pull mode in the config store, and</li>
   *   <li>the copy-to cluster of a route matches the job property
   *       {@link ConfigurationKeys#WRITER_FILE_SYSTEM_URI}.</li>
   * </ol>
   * When the property is absent or not {@code true}, pull mode is used.
   */
  public static final String REPLICATION_PUSH_MODE = CopyConfiguration.COPY_PREFIX + ".replicationPushMode";

  /**
   * Builds the {@link ReplicationConfiguration} from {@code c} and generates the datasets relevant to the cluster
   * this code is executing on, in push or pull mode depending on {@link #REPLICATION_PUSH_MODE}.
   *
   * @param c raw dataset configuration
   * @param props job-level properties; also handed to every generated {@link ConfigBasedDataset}
   */
  public ConfigBasedMultiDatasets(Config c, Properties props) {
    this.props = props;

    try {
      FileSystem executionCluster = FileSystem.get(new Configuration());
      URI executionClusterURI = executionCluster.getUri();

      ReplicationConfiguration rc = ReplicationConfiguration.buildFromConfig(c);

      // getProperty (unlike containsKey) also honours a Properties defaults chain;
      // parseBoolean(null) is false, so a missing key falls through to pull mode.
      if (Boolean.parseBoolean(this.props.getProperty(REPLICATION_PUSH_MODE))) {
        generateDatasetInPushMode(rc, executionClusterURI);
      } else {
        // default pull mode
        generateDatasetInPullMode(rc, executionClusterURI);
      }
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
      log.error("Can not create Replication Configuration from raw config "
          + c.root().render(ConfigRenderOptions.defaults().setComments(false).setOriginComments(false)), e);
    } catch (IOException ioe) {
      log.error("Can not decide current execution cluster ", ioe);
    }
  }

  /**
   * Push mode: for every end point (replicas and source) located on the execution cluster, adds one dataset per
   * push route whose copy-to end point is a {@link HadoopFsEndPoint} on the cluster named by
   * {@link ConfigurationKeys#WRITER_FILE_SYSTEM_URI}.
   *
   * @param rc replication configuration of the dataset
   * @param executionClusterURI file system URI of the cluster this code runs on
   */
  private void generateDatasetInPushMode(ReplicationConfiguration rc, URI executionClusterURI) {
    if (rc.getCopyMode() == ReplicationCopyMode.PULL) {
      log.info("Skip process pull mode dataset with meta data{} as job level property specify push mode ", rc.getMetaData());
      return;
    }

    String pushModeTargetCluster = this.props.getProperty(ConfigurationKeys.WRITER_FILE_SYSTEM_URI);
    if (pushModeTargetCluster == null) {
      // Without a target cluster no route can match; make the otherwise silent no-op visible.
      log.warn("Push mode is set but property {} is missing, no dataset generated for meta data {}",
          ConfigurationKeys.WRITER_FILE_SYSTEM_URI, rc.getMetaData());
      return;
    }

    // PUSH mode
    CopyRouteGenerator cpGen = rc.getCopyRouteGenerator();

    // Both the replicas and the original source may act as the push origin.
    List<EndPoint> pushCandidates = new ArrayList<>(rc.getReplicas());
    pushCandidates.add(rc.getSource());

    for (EndPoint pushFrom : pushCandidates) {
      if (!needGenerateCopyEntity(pushFrom, executionClusterURI)) {
        continue;
      }

      Optional<List<CopyRoute>> copyRoutes = cpGen.getPushRoutes(rc, pushFrom);
      if (!copyRoutes.isPresent()) {
        log.warn("In Push mode, did not found any copyRoute for dataset with meta data {}", rc.getMetaData());
        // Only this candidate lacks routes; keep evaluating the remaining candidates.
        continue;
      }

      for (CopyRoute cr : copyRoutes.get()) {
        if (cr.getCopyTo() instanceof HadoopFsEndPoint) {
          HadoopFsEndPoint ep = (HadoopFsEndPoint) cr.getCopyTo();
          if (ep.getFsURI().toString().equals(pushModeTargetCluster)) {
            this.datasets.add(new ConfigBasedDataset(rc, this.props, cr));
          }
        }
      }
    }
  }

  /**
   * Pull mode: for every replica located on the execution cluster, adds one dataset for its pull route, if the
   * route generator provides one.
   *
   * @param rc replication configuration of the dataset
   * @param executionClusterURI file system URI of the cluster this code runs on
   */
  private void generateDatasetInPullMode(ReplicationConfiguration rc, URI executionClusterURI) {
    if (rc.getCopyMode() == ReplicationCopyMode.PUSH) {
      log.info("Skip process push mode dataset with meta data{} as job level property specify pull mode ", rc.getMetaData());
      return;
    }

    // PULL mode
    CopyRouteGenerator cpGen = rc.getCopyRouteGenerator();
    for (EndPoint replica : rc.getReplicas()) {
      if (needGenerateCopyEntity(replica, executionClusterURI)) {
        Optional<CopyRoute> copyRoute = cpGen.getPullRoute(rc, replica);
        if (copyRoute.isPresent()) {
          this.datasets.add(new ConfigBasedDataset(rc, this.props, copyRoute.get()));
        }
      }
    }
  }

  /**
   * @return the datasets generated at construction time; this is the internal list itself, not a copy, and it is
   *         empty when nothing matched or when construction failed
   */
  public List<ConfigBasedDataset> getConfigBasedDatasetList() {
    return this.datasets;
  }

  /**
   * Tells whether the given end point lives on the execution cluster.
   *
   * @param e end point to check
   * @param executionClusterURI file system URI of the cluster this code runs on
   * @return {@code true} only if {@code e} is a {@link HadoopFsEndPoint} whose file system URI equals
   *         {@code executionClusterURI}
   */
  private boolean needGenerateCopyEntity(EndPoint e, URI executionClusterURI) {
    if (!(e instanceof HadoopFsEndPoint)) {
      return false;
    }

    HadoopFsEndPoint ep = (HadoopFsEndPoint) e;
    return ep.getFsURI().equals(executionClusterURI);
  }
}