/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package gobblin.data.management.copy.hive;

import java.io.IOException;
import java.util.Map;
import java.util.Properties;

import lombok.Data;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.mapred.InputFormat;

import com.google.common.collect.Maps;

import gobblin.data.management.copy.RecursivePathFinder;
import gobblin.util.PathUtils;

/**
 * Contains data for a Hive location, as well as additional data when
 * {@link #HIVE_DATASET_COPY_ADDITIONAL_PATHS_RECURSIVELY_ENABLED} is set to true.
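 *
 * <p>A minimal usage sketch (assumes a Hive {@link Table}, a {@link FileSystem} and job {@link Properties}
 * have already been obtained):
 * <pre>{@code
 *   HiveLocationDescriptor descriptor = HiveLocationDescriptor.forTable(table, fs, properties);
 *   Map<Path, FileStatus> filesAtLocation = descriptor.getPaths();
 * }</pre>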
*/
@Data
public class HiveLocationDescriptor {

  /** When true, additionally list paths under the location recursively and include them in {@link #getPaths()}. */
  public static final String HIVE_DATASET_COPY_ADDITIONAL_PATHS_RECURSIVELY_ENABLED =
      HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.additional.paths.recursively.enabled";
  /** Selects the {@link PathFindingMethod} used to list this location. */
  public static final String HIVE_LOCATION_LISTING_METHOD =
      HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.location.listing.method";
  /** When true, hidden paths are skipped by the {@link PathFindingMethod#RECURSIVE} listing. */
  public static final String SKIP_HIDDEN_PATHS =
      HiveDatasetFinder.HIVE_DATASET_PREFIX + ".copy.locations.listing.skipHiddenPaths";

  public static final String DEFAULT_SKIP_HIDDEN_PATHS = Boolean.toString(false);
  public static final String DEFAULT_HIVE_LOCATION_LISTING_METHOD = PathFindingMethod.INPUT_FORMAT.name();
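
  /** Strategies for listing the files under a Hive location. */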
  public enum PathFindingMethod {
    INPUT_FORMAT, RECURSIVE
  }

  protected final Path location;
  protected final InputFormat<?, ?> inputFormat;
  protected final FileSystem fileSystem;
  protected final Properties properties;
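
  /**
   * Lists the paths at this Hive location. Depending on {@link #HIVE_LOCATION_LISTING_METHOD}, the paths are
   * obtained either via the configured {@link InputFormat} or by listing the location recursively on the
   * {@link FileSystem}.
   *
   * @return a map from each discovered {@link Path} to its {@link FileStatus}
   * @throws IOException if the location is a glob but a recursive listing is required, or if listing fails
   */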
  public Map<Path, FileStatus> getPaths() throws IOException {
    PathFindingMethod pathFindingMethod = PathFindingMethod.valueOf(
        this.properties.getProperty(HIVE_LOCATION_LISTING_METHOD, DEFAULT_HIVE_LOCATION_LISTING_METHOD).toUpperCase());
    Map<Path, FileStatus> result = Maps.newHashMap();

    if (pathFindingMethod == PathFindingMethod.INPUT_FORMAT) {
      // Ask the InputFormat for the input paths under this location.
      for (Path path : HiveUtils.getPaths(this.inputFormat, this.location)) {
        result.put(path, this.fileSystem.getFileStatus(path));
      }
      // Optionally add all paths found by a recursive listing of the location.
      boolean useHiveLocationDescriptorWithAdditionalData =
          Boolean.parseBoolean(this.properties.getProperty(HIVE_DATASET_COPY_ADDITIONAL_PATHS_RECURSIVELY_ENABLED, "false"));
      if (useHiveLocationDescriptorWithAdditionalData) {
        if (PathUtils.isGlob(this.location)) {
          throw new IOException("Cannot get additional data for glob pattern path " + this.location);
        }
        RecursivePathFinder finder = new RecursivePathFinder(this.fileSystem, this.location, this.properties);
        for (FileStatus status : finder.getPaths(false)) {
          result.put(status.getPath(), status);
        }
      }
      return result;
    } else if (pathFindingMethod == PathFindingMethod.RECURSIVE) {
      if (PathUtils.isGlob(this.location)) {
        throw new IOException("Cannot use recursive listing for globbed locations.");
      }
      boolean skipHiddenPaths =
          Boolean.parseBoolean(this.properties.getProperty(SKIP_HIDDEN_PATHS, DEFAULT_SKIP_HIDDEN_PATHS));
      RecursivePathFinder finder = new RecursivePathFinder(this.fileSystem, this.location, this.properties);
      for (FileStatus status : finder.getPaths(skipHiddenPaths)) {
        result.put(status.getPath(), status);
      }
      return result;
    } else {
      throw new IOException("Hive location listing method not recognized: " + pathFindingMethod);
    }
  }
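
  /**
   * Creates a {@link HiveLocationDescriptor} for the data location of the given Hive {@link Table}.
   */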
  public static HiveLocationDescriptor forTable(Table table, FileSystem fs, Properties properties) throws IOException {
    return new HiveLocationDescriptor(table.getDataLocation(), HiveUtils.getInputFormat(table.getTTable().getSd()), fs,
        properties);
  }
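
  /**
   * Creates a {@link HiveLocationDescriptor} for the data location of the given Hive {@link Partition}.
   */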
  public static HiveLocationDescriptor forPartition(Partition partition, FileSystem fs, Properties properties)
      throws IOException {
    return new HiveLocationDescriptor(partition.getDataLocation(),
        HiveUtils.getInputFormat(partition.getTPartition().getSd()), fs, properties);
  }
}