/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.util;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.common.primitives.Longs;
/**
* Utility class for listing files on a {@link FileSystem}.
*
* @see FileSystem
*/
public class FileListUtils {

  private static final Logger LOG = LoggerFactory.getLogger(FileListUtils.class);

  /**
   * Orders {@link FileStatus}es by modification time, most recently modified first.
   */
  public static final Comparator<FileStatus> LATEST_MOD_TIME_ORDER = new Comparator<FileStatus>() {
    @Override
    public int compare(FileStatus file1, FileStatus file2) {
      // Arguments are swapped on purpose to obtain a descending (latest-first) order.
      return Longs.compare(file2.getModificationTime(), file1.getModificationTime());
    }
  };

  /**
   * A {@link PathFilter} that accepts every {@link Path}.
   */
  public static final PathFilter NO_OP_PATH_FILTER = new PathFilter() {
    @Override
    public boolean accept(Path path) {
      return true;
    }
  };

  /**
   * Lists all files under the specified path, without any filtering.
   *
   * @param fs the {@link FileSystem} to list
   * @param path the root {@link Path} of the listing
   * @return the {@link FileStatus}es of all files found under {@code path}
   * @throws IOException if the status of {@code path} cannot be obtained or a directory cannot be listed
   */
  public static List<FileStatus> listFilesRecursively(FileSystem fs, Path path) throws IOException {
    return listFilesRecursively(fs, path, NO_OP_PATH_FILTER);
  }

  /**
   * Lists all files under each of the specified paths, without any filtering. Results are concatenated in the
   * iteration order of {@code paths}.
   *
   * @param fs the {@link FileSystem} to list
   * @param paths the root {@link Path}s of the listing
   * @return the {@link FileStatus}es of all files found under any of the {@code paths}
   * @throws IOException if the status of a path cannot be obtained or a directory cannot be listed
   */
  public static List<FileStatus> listFilesRecursively(FileSystem fs, Iterable<Path> paths) throws IOException {
    List<FileStatus> results = Lists.newArrayList();
    for (Path path : paths) {
      results.addAll(listFilesRecursively(fs, path));
    }
    return results;
  }

  /**
   * Helper method to list out all files under a specified path. The specified {@link PathFilter} is treated as a file
   * filter, that is it is only applied to file {@link Path}s.
   *
   * @param fs the {@link FileSystem} to list
   * @param path the root {@link Path} of the listing
   * @param fileFilter the filter a file must pass to be included in the result
   * @return the {@link FileStatus}es of all accepted files found under {@code path}
   * @throws IOException if the status of {@code path} cannot be obtained or a directory cannot be listed
   */
  public static List<FileStatus> listFilesRecursively(FileSystem fs, Path path, PathFilter fileFilter)
      throws IOException {
    return listFilesRecursively(fs, path, fileFilter, false);
  }

  /**
   * Helper method to list out all files under a specified path. If applyFilterToDirectories is false, the supplied
   * {@link PathFilter} will only be applied to files.
   *
   * @param fs the {@link FileSystem} to list
   * @param path the root {@link Path} of the listing
   * @param fileFilter the filter a file must pass to be included in the result
   * @param applyFilterToDirectories if true, {@code fileFilter} is also used when listing the contents of each
   *        directory, so entries it rejects (sub-directories included) are not visited
   * @return the {@link FileStatus}es of all accepted files found under {@code path}
   * @throws IOException if the status of {@code path} cannot be obtained or a directory cannot be listed
   */
  public static List<FileStatus> listFilesRecursively(FileSystem fs, Path path, PathFilter fileFilter,
      boolean applyFilterToDirectories) throws IOException {
    return listFilesRecursivelyHelper(fs, Lists.<FileStatus> newArrayList(), fs.getFileStatus(path), fileFilter,
        applyFilterToDirectories);
  }

  /**
   * Recursive worker for {@code listFilesRecursively}: appends to {@code files} every file at or below
   * {@code fileStatus} that is accepted by {@code fileFilter}, and returns {@code files}.
   */
  private static List<FileStatus> listFilesRecursivelyHelper(FileSystem fs, List<FileStatus> files,
      FileStatus fileStatus, PathFilter fileFilter, boolean applyFilterToDirectories)
      throws FileNotFoundException, IOException {
    if (fileStatus.isDirectory()) {
      PathFilter listingFilter = applyFilterToDirectories ? fileFilter : NO_OP_PATH_FILTER;
      for (FileStatus status : fs.listStatus(fileStatus.getPath(), listingFilter)) {
        // Always recurse: the recursive call descends into directories and applies the file filter to plain files,
        // so no file is ever added without having been checked against fileFilter.
        listFilesRecursivelyHelper(fs, files, status, fileFilter, applyFilterToDirectories);
      }
    } else if (fileFilter.accept(fileStatus.getPath())) {
      files.add(fileStatus);
    }
    return files;
  }

  /**
   * Method to list out all files, or directory if no file exists, under a specified path.
   *
   * @param fs the {@link FileSystem} to list
   * @param path the root {@link Path} of the listing
   * @return the {@link FileStatus}es of all files and empty directories found under {@code path}
   * @throws IOException if the status of {@code path} cannot be obtained or a directory cannot be listed
   */
  public static List<FileStatus> listMostNestedPathRecursively(FileSystem fs, Path path) throws IOException {
    return listMostNestedPathRecursively(fs, path, NO_OP_PATH_FILTER);
  }

  /**
   * Method to list out all files, or directory if no file exists, under each of the specified paths. Results are
   * concatenated in the iteration order of {@code paths}.
   *
   * @param fs the {@link FileSystem} to list
   * @param paths the root {@link Path}s of the listing
   * @return the {@link FileStatus}es of all files and empty directories found under any of the {@code paths}
   * @throws IOException if the status of a path cannot be obtained or a directory cannot be listed
   */
  public static List<FileStatus> listMostNestedPathRecursively(FileSystem fs, Iterable<Path> paths)
      throws IOException {
    List<FileStatus> results = Lists.newArrayList();
    for (Path path : paths) {
      results.addAll(listMostNestedPathRecursively(fs, path));
    }
    return results;
  }

  /**
   * Method to list out all files, or directory if no file exists, under a specified path.
   * The specified {@link PathFilter} is treated as a file filter, that is it is only applied to file {@link Path}s.
   *
   * @param fs the {@link FileSystem} to list
   * @param path the root {@link Path} of the listing
   * @param fileFilter the filter a file must pass to be included; empty directories are included without being
   *        checked against it
   * @return the {@link FileStatus}es of all accepted files and all empty directories found under {@code path}
   * @throws IOException if the status of {@code path} cannot be obtained or a directory cannot be listed
   */
  public static List<FileStatus> listMostNestedPathRecursively(FileSystem fs, Path path, PathFilter fileFilter)
      throws IOException {
    return listMostNestedPathRecursivelyHelper(fs, Lists.<FileStatus> newArrayList(), fs.getFileStatus(path),
        fileFilter);
  }

  /**
   * Recursive worker for {@code listMostNestedPathRecursively}: appends to {@code files} every accepted file and
   * every empty directory at or below {@code fileStatus}, and returns {@code files}.
   */
  private static List<FileStatus> listMostNestedPathRecursivelyHelper(FileSystem fs, List<FileStatus> files,
      FileStatus fileStatus, PathFilter fileFilter) throws IOException {
    if (fileStatus.isDirectory()) {
      FileStatus[] curFileStatus = fs.listStatus(fileStatus.getPath());
      if (ArrayUtils.isEmpty(curFileStatus)) {
        // An empty directory is itself the most nested path on this branch.
        files.add(fileStatus);
      } else {
        for (FileStatus status : curFileStatus) {
          listMostNestedPathRecursivelyHelper(fs, files, status, fileFilter);
        }
      }
    } else if (fileFilter.accept(fileStatus.getPath())) {
      files.add(fileStatus);
    }
    return files;
  }

  /**
   * Helper method to list out all paths under a specified path. If the {@link org.apache.hadoop.fs.FileSystem} is
   * unable to list the contents of a relevant directory, will log an error and skip.
   *
   * <p>The filter is applied to every path visited, files and directories alike, including {@code path} itself.
   * A directory rejected by the filter is left out of the result but its contents are still visited.</p>
   *
   * @param fs the {@link FileSystem} to list
   * @param path the root {@link Path} of the listing
   * @param fileFilter the filter a path must pass to be included in the result
   * @return the {@link FileStatus}es of all accepted paths at or below {@code path}
   * @throws IOException if the status of {@code path} cannot be obtained
   */
  public static List<FileStatus> listPathsRecursively(FileSystem fs, Path path, PathFilter fileFilter)
      throws IOException {
    return listPathsRecursivelyHelper(fs, Lists.<FileStatus> newArrayList(), fs.getFileStatus(path), fileFilter);
  }

  /**
   * Recursive worker for {@code listPathsRecursively}: appends to {@code files} every accepted path at or below
   * {@code fileStatus}, skipping (with an error log) the contents of directories that cannot be listed, and returns
   * {@code files}.
   */
  private static List<FileStatus> listPathsRecursivelyHelper(FileSystem fs, List<FileStatus> files,
      FileStatus fileStatus, PathFilter fileFilter) {
    if (fileFilter.accept(fileStatus.getPath())) {
      files.add(fileStatus);
    }
    if (fileStatus.isDirectory()) {
      try {
        for (FileStatus status : fs.listStatus(fileStatus.getPath())) {
          listPathsRecursivelyHelper(fs, files, status, fileFilter);
        }
      } catch (IOException ioe) {
        // Best effort: keep what has been collected so far, but record the cause so the failure can be diagnosed.
        LOG.error("Could not list contents of path " + fileStatus.getPath(), ioe);
      }
    }
    return files;
  }
}