/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.source.extractor.extract.google;
import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.googleapis.json.GoogleJsonError;
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.services.drive.Drive;
import com.google.api.services.drive.model.File;
import com.google.api.services.drive.model.FileList;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.io.Closer;
import static gobblin.configuration.ConfigurationKeys.*;
import static gobblin.source.extractor.extract.google.GoogleCommonKeys.*;
import gobblin.configuration.State;
import gobblin.util.HadoopUtils;
import gobblin.util.io.SeekableFSInputStream;
/**
* A {@link FileSystem} implementation that provides the {@link FileSystem} interface for an Google Drive server.
* <ul>
* <li>Note that {@link GoogleDriveFileSystem} currently only supports list, get, delete use cases.
* <li>Google drive has two different identifier. File ID and File name -- where folder is just different mime-type of File.
* As File name can be duplicate under same folder, all path that GoogleDriveFileSystem takes assumes that it's a File ID.
* <li>It is the caller's responsibility to call {@link #close()} on this {@link FileSystem} to disconnect the session.
*
* {@link GoogleDriveFileSystem} does not cache instance and {@link FileSystem#get(Configuration)} creates a new {@link GoogleDriveFileSystem} everytime
* instead of cached copy.
* </ul>
*/
public class GoogleDriveFileSystem extends FileSystem {
private static final Logger LOG = LoggerFactory.getLogger(GoogleDriveFileSystem.class);
static final String PAGE_SIZE = GOOGLE_SOURCE_PREFIX + "fs_helper.page_size"; //for paginated API
static final String FOLDER_MIME_TYPE = "application/vnd.google-apps.folder";
static final int DEFAULT_PAGE_SIZE = 50;
private Drive client;
private final Closer closer;
private int pageSize = DEFAULT_PAGE_SIZE;
public GoogleDriveFileSystem(Drive client) {
this();
this.client = client;
}
public GoogleDriveFileSystem(Drive client, int pageSize) {
this(client);
Preconditions.checkArgument(pageSize > 0, "pageSize should be positive number");
this.pageSize = pageSize;
}
public GoogleDriveFileSystem() {
super();
this.closer = Closer.create();
}
@Override
public synchronized void initialize(URI uri, Configuration conf) throws IOException {
if (this.client == null) {
super.initialize(uri, conf);
State state = HadoopUtils.getStateFromConf(conf);
Credential credential = new GoogleCommon.CredentialBuilder(state.getProp(SOURCE_CONN_PRIVATE_KEY), state.getPropAsList(API_SCOPES))
.fileSystemUri(state.getProp(PRIVATE_KEY_FILESYSTEM_URI))
.proxyUrl(state.getProp(SOURCE_CONN_USE_PROXY_URL))
.port(state.getProp(SOURCE_CONN_USE_PROXY_PORT))
.serviceAccountId(state.getProp(SOURCE_CONN_USERNAME))
.build();
this.client = new Drive.Builder(credential.getTransport(),
GoogleCommon.getJsonFactory(),
credential)
.setApplicationName(Preconditions.checkNotNull(state.getProp(APPLICATION_NAME),
"ApplicationName is required"))
.build();
this.pageSize = state.getPropAsInt(PAGE_SIZE, DEFAULT_PAGE_SIZE);
}
}
@Override
public FSDataInputStream open(Path path, int bufferSize) throws IOException {
return closer.register(new FSDataInputStream(
new SeekableFSInputStream(
new BufferedInputStream(
client.files().get(toFileId(path)).executeMediaAsInputStream(), bufferSize))));
}
@Override
public FSDataInputStream open(Path path) throws IOException {
return closer.register(new FSDataInputStream(
new SeekableFSInputStream(
new BufferedInputStream(
client.files().get(toFileId(path)).executeMediaAsInputStream()))));
}
@Override
public boolean delete(Path path, boolean recursive) throws IOException {
Preconditions.checkArgument(recursive, "Non-recursive is not supported.");
String fileId = toFileId(path);
LOG.debug("Deleting file: " + fileId);
try {
client.files().delete(fileId).execute();
} catch (GoogleJsonResponseException e) {
GoogleJsonError error = e.getDetails();
if (404 == error.getCode()) { //Non-existing file id
return false;
}
throw e;
}
return true;
}
@Override
public FileStatus[] listStatus(Path path) throws FileNotFoundException, IOException {
String folderId = toFileId(path);
List<File> fileMetadata = lsFileMetadata(folderId, null);
if (fileMetadata.isEmpty()) {
throw new FileNotFoundException();
}
FileStatus[] statusArr = new FileStatus[fileMetadata.size()];
int idx = 0;
for (File metadata: fileMetadata) {
FileStatus status = toFileStatus(metadata);
statusArr[idx++] = status;
}
return statusArr;
}
private List<File> lsFileMetadata(String folderId, String fileId) throws IOException {
String pageToken = null;
List<File> result = new ArrayList<>();
Optional<String> query = buildQuery(folderId, fileId);
do {
Drive.Files.List request = client.files()
.list()
.setFields("files/id,files/mimeType,files/modifiedTime,files/size,files/permissions")
.setPageSize(pageSize);
if (query.isPresent()) {
request = request.setQ(query.get());
}
if (pageToken != null) {
request = request.setPageToken(pageToken);
}
LOG.info("Google drive List request: " + request);
if (LOG.isDebugEnabled()) {
LOG.debug("Google drive List request: " + request);
}
FileList fileList = null;
try {
fileList = request.execute();
} catch (GoogleJsonResponseException e) {
GoogleJsonError error = e.getDetails();
if (404 == error.getCode()) {
throw new FileNotFoundException("File not found. Request: " + request);
}
throw e;
}
pageToken = fileList.getNextPageToken();
List<File> files = fileList.getFiles();
if (files == null || files.isEmpty()) {
return result;
}
result.addAll(files);
} while (pageToken != null);
return result;
}
/**
* Build query for Google drive.
* @see https://developers.google.com/drive/v3/web/search-parameters
*
* @param folderId
* @param fileName
* @return Query
*/
@VisibleForTesting
Optional<String> buildQuery(String folderId, String fileName) {
if (StringUtils.isEmpty(folderId) && StringUtils.isEmpty(fileName)) {
return Optional.absent();
}
StringBuilder query = new StringBuilder();
if (StringUtils.isNotEmpty(folderId)) {
query.append("'").append(folderId).append("'")
.append(" in parents");
}
if (StringUtils.isNotEmpty(fileName)) {
if (query.length() > 0) {
query.append(" and ");
}
query.append("name contains ")
.append("'").append(fileName).append("'");
}
return Optional.of(query.toString());
}
/**
* org.apache.hadoop.fs.Path assumes that there separator in file system naming and "/" is the separator.
* When org.apache.hadoop.fs.Path sees "/" in path String, it splits into parent and name. As fileID is a random
* String determined by Google and it can contain "/" itself, this method check if parent and name is separated and
* restore "/" back to file ID.
*
* @param p
* @return
*/
public static String toFileId(Path p) {
if (p.isRoot()) {
return "";
}
final String format = "%s" + Path.SEPARATOR + "%s";
if (p.getParent() != null && StringUtils.isEmpty(p.getParent().getName())) {
return p.getName();
}
return String.format(format, toFileId(p.getParent()), p.getName());
}
@Override
public void close() throws IOException {
super.close();
closer.close();
}
@Override
public FileStatus getFileStatus(Path p) throws IOException {
Preconditions.checkNotNull(p);
String fileId = toFileId(p);
File metadata = client.files().get(fileId)
.setFields("id,mimeType,modifiedTime,size,permissions")
.execute();
return toFileStatus(metadata);
}
private FileStatus toFileStatus(File metadata) {
return new FileStatus(metadata.getSize() == null ? 0L : metadata.getSize(),
FOLDER_MIME_TYPE.equals(metadata.getMimeType()),
-1,
-1,
metadata.getModifiedTime().getValue(),
new Path(metadata.getId()));
}
//Below are unsupported methods
@Override
public void setWorkingDirectory(Path new_dir) {
throw new UnsupportedOperationException();
}
@Override
public Path getWorkingDirectory() {
throw new UnsupportedOperationException();
}
@Override
public boolean mkdirs(Path f, FsPermission permission) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize,
short replication, long blockSize, Progressable progress) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public boolean rename(Path src, Path dst) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public URI getUri() {
throw new UnsupportedOperationException();
}
}