/**
* Copyright 2015 StreamSets Inc.
*
* Licensed under the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.streamsets.pipeline.lib.io;
import com.streamsets.pipeline.api.impl.Utils;
import com.streamsets.pipeline.config.FileRollMode;
import com.streamsets.pipeline.config.PostProcessingOptions;
import com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService;
import com.streamsets.pipeline.lib.util.GlobFilePathUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ScheduledExecutorService;
public class GlobFileContextProvider extends BaseFileContextProvider {
private static final Logger LOG = LoggerFactory.getLogger(GlobFileContextProvider.class);
/**
 * Couples one configured {@link MultiFileInfo} with the {@link FileFinder} used to resolve it
 * into concrete paths.
 *
 * In {@code FileRollMode.PATTERN} mode the finder is pointed at the parent of the configured path
 * and filters for directories only; in every other mode it is pointed at the configured path itself
 * and filters for regular files only.
 */
private static class GlobFileInfo implements Closeable {
  private final MultiFileInfo globFileInfo;
  private final FileFinder fileFinder;
  private final Path finderPath;

  /**
   * @param globFileInfo the configured file definition this instance resolves
   * @param executor executor handed to the asynchronous finder; not used when
   *                 {@code scanIntervalSecs} is zero
   * @param scanIntervalSecs zero selects a {@link SynchronousFileFinder}, any other value an
   *                         {@link AsynchronousFileFinder} created with that interval
   */
  public GlobFileInfo(MultiFileInfo globFileInfo, ScheduledExecutorService executor, int scanIntervalSecs) {
    this.globFileInfo = globFileInfo;

    Path configuredPath = Paths.get(globFileInfo.getFileFullPath());
    FileFilterOption filterOption;
    if (globFileInfo.getFileRollMode() == FileRollMode.PATTERN) {
      // Periodic pattern roll mode: glob on the parent path and keep directories only.
      this.finderPath = configuredPath.getParent();
      filterOption = FileFilterOption.FILTER_DIRECTORIES_ONLY;
    } else {
      this.finderPath = configuredPath;
      filterOption = FileFilterOption.FILTER_REGULAR_FILES_ONLY;
    }

    if (scanIntervalSecs == 0) {
      this.fileFinder = new SynchronousFileFinder(finderPath, filterOption);
    } else {
      this.fileFinder = new AsynchronousFileFinder(finderPath, scanIntervalSecs, executor, filterOption);
    }
  }

  /**
   * Builds the {@link MultiFileInfo} for a path returned by {@link #find()}.
   *
   * In {@code FileRollMode.PATTERN} mode the given path is the resolved parent, so the file name
   * of the configured path is appended to it; otherwise the path is used as is.
   *
   * @param path a path produced by the finder
   * @return a copy of the configured file definition bound to the resolved path
   */
  public MultiFileInfo getFileInfo(Path path) {
    String resolvedPath = path.toString();
    if (globFileInfo.getFileRollMode() == FileRollMode.PATTERN) {
      resolvedPath = resolvedPath + File.separatorChar + Paths.get(globFileInfo.getFileFullPath()).getFileName();
    }
    return new MultiFileInfo(globFileInfo, resolvedPath);
  }

  /**
   * Delegates to the underlying finder.
   *
   * @return whatever {@link FileFinder#find()} returns
   * @throws IOException propagated from the finder
   */
  public Set<Path> find() throws IOException {
    return fileFinder.find();
  }

  /**
   * Asks the finder to forget the path this file definition was discovered under: the parent
   * path in {@code FileRollMode.PATTERN} mode, the full path otherwise.
   *
   * @param multiFileInfo the file definition to forget
   * @return the result of {@link FileFinder#forget(Path)}
   */
  public boolean forget(MultiFileInfo multiFileInfo) {
    Path discoveredPath = Paths.get(multiFileInfo.getFileFullPath());
    if (multiFileInfo.getFileRollMode() == FileRollMode.PATTERN) {
      discoveredPath = discoveredPath.getParent();
    }
    return fileFinder.forget(discoveredPath);
  }

  @Override
  public void close() throws IOException {
    fileFinder.close();
  }

  @Override
  public String toString() {
    return Utils.format("GlobFileInfo [finderPath='{}']", finderPath);
  }
}
private final List<GlobFileInfo> globFileInfos;
private final Charset charset;
private final int maxLineLength;
private final PostProcessingOptions postProcessing;
private final String archiveDir;
private final FileEventPublisher eventPublisher;
private int scanIntervalSecs;
private boolean inPreviewMode;
private boolean allowForLateDirectoryCreation;
// Paths that did not exist when checked, each mapped to every file definition waiting for it.
// The key is derived from the parent directory of the configured file path, so definitions that
// share a parent share a key; the value is therefore a list, otherwise a later definition would
// silently replace an earlier one.
private final Map<Path, List<MultiFileInfo>> nonExistingPaths = new HashMap<>();
// Null when scanIntervalSecs is zero (synchronous finders need no executor).
private ScheduledExecutorService executor;
// Only created when late directory creation is allowed and at least one path is missing.
private DirectoryPathCreationWatcher directoryWatcher = null;

/**
 * Creates the provider and registers every given file definition.
 *
 * A definition whose base path exists is registered immediately, either as a file context or as
 * a glob to be resolved later. A definition whose base path is missing is either rejected with
 * an {@link IOException} or, when {@code allowForLateDirectoryCreation} is set, parked until the
 * path shows up.
 *
 * If construction fails after resources have been acquired, the executor, the finders and the
 * file contexts created so far are released before the exception is rethrown.
 *
 * @param allowForLateDirectoryCreation whether missing base paths are tolerated and watched for
 * @param fileInfos file definitions to tail
 * @param scanIntervalSecs finder scan interval; zero makes finders synchronous and no executor is created
 * @param charset passed to every {@link FileContext}
 * @param maxLineLength passed to every {@link FileContext}
 * @param postProcessing passed to every {@link FileContext}
 * @param archiveDir passed to every {@link FileContext}
 * @param eventPublisher passed to every {@link FileContext}
 * @param inPreviewMode passed to every {@link FileContext}
 * @throws IOException if a base path is missing and late creation is not allowed, or if
 *                     registering a definition fails
 */
public GlobFileContextProvider(
    boolean allowForLateDirectoryCreation,
    List<MultiFileInfo> fileInfos,
    int scanIntervalSecs,
    Charset charset,
    int maxLineLength,
    PostProcessingOptions postProcessing,
    String archiveDir,
    FileEventPublisher eventPublisher,
    boolean inPreviewMode) throws IOException {
  super();
  globFileInfos = new CopyOnWriteArrayList<>();
  fileContexts = new ArrayList<>();
  this.allowForLateDirectoryCreation = allowForLateDirectoryCreation;
  this.scanIntervalSecs = scanIntervalSecs;
  this.charset = charset;
  this.maxLineLength = maxLineLength;
  this.postProcessing = postProcessing;
  this.archiveDir = archiveDir;
  this.eventPublisher = eventPublisher;
  this.inPreviewMode = inPreviewMode;

  // if scan interval is zero the GlobFileInfo will work synchronously and it won't require an executor
  executor = (scanIntervalSecs == 0) ? null :
      new SafeScheduledExecutorService(fileInfos.size() / 3 + 1, "File Finder");

  try {
    for (MultiFileInfo fileInfo : fileInfos) {
      if (!checkForNonExistingPath(fileInfo)) {
        addToContextOrGlobFileInfo(fileInfo);
      }
    }
    if (allowForLateDirectoryCreation && !nonExistingPaths.isEmpty()) {
      directoryWatcher = new DirectoryPathCreationWatcher(nonExistingPaths.keySet(), this.scanIntervalSecs);
    }
  } catch (IOException | RuntimeException ex) {
    // The caller never receives a reference to this instance, so nobody else can close it.
    releaseAfterFailedConstruction();
    throw ex;
  }
  LOG.debug("Created");
}

/**
 * Best-effort release of everything acquired by a constructor run that is about to fail.
 * Never throws, so the original failure is the one the caller sees.
 */
private void releaseAfterFailedConstruction() {
  if (executor != null) {
    executor.shutdownNow();
  }
  for (GlobFileInfo globFileInfo : globFileInfos) {
    try {
      globFileInfo.close();
    } catch (IOException | RuntimeException ex) {
      LOG.warn("Could not close '{}': {}", globFileInfo, ex.toString(), ex);
    }
  }
  for (FileContext fileContext : fileContexts) {
    try {
      fileContext.close();
    } catch (RuntimeException ex) {
      LOG.warn("Could not close '{}': {}", fileContext, ex.toString(), ex);
    }
  }
}

/**
 * Registers a file definition whose base path exists.
 *
 * A {@code FileRollMode.PATTERN} definition whose path has no glob wildcard once the
 * <code>${PATTERN}</code> token is stripped needs no globbing and becomes a {@link FileContext}
 * right away. Everything else is wrapped in a {@link GlobFileInfo} and resolved by a finder.
 *
 * @param fileInfo the file definition to register
 * @throws IOException declared for the context construction path
 */
private void addToContextOrGlobFileInfo(MultiFileInfo fileInfo) throws IOException {
  // String.replace performs the same literal removal the former regex did, without compiling a pattern.
  if (fileInfo.getFileRollMode() == FileRollMode.PATTERN
      && !GlobFilePathUtil.hasGlobWildcard(fileInfo.getFileFullPath().replace("${PATTERN}", ""))) {
    fileContexts.add(
        new FileContext(
            fileInfo,
            charset,
            maxLineLength,
            postProcessing,
            archiveDir,
            eventPublisher,
            inPreviewMode
        )
    );
  } else {
    // executor is already null when scanIntervalSecs == 0, which is what a synchronous GlobFileInfo expects.
    globFileInfos.add(new GlobFileInfo(fileInfo, executor, scanIntervalSecs));
  }
}

/**
 * Checks whether the base path of a file definition exists and, if it does not, either fails or
 * parks the definition for later.
 *
 * @param multiFileInfo the file definition to check
 * @return {@code true} if the path is missing and the definition was parked in
 *         {@code nonExistingPaths}; {@code false} if the path exists
 * @throws IOException if the path is missing and late directory creation is not allowed
 */
private boolean checkForNonExistingPath(MultiFileInfo multiFileInfo) throws IOException {
  Path pathToSearchFor = GlobFilePathUtil.getPivotPath(Paths.get(multiFileInfo.getFileFullPath()).getParent());
  if (Files.exists(pathToSearchFor)) {
    return false;
  }
  if (!allowForLateDirectoryCreation) {
    throw new IOException(Utils.format("Path does not exist:{}", pathToSearchFor));
  }
  // Append rather than overwrite: several definitions may be waiting on the same path.
  List<MultiFileInfo> waiting = nonExistingPaths.get(pathToSearchFor);
  if (waiting == null) {
    waiting = new ArrayList<>();
    nonExistingPaths.put(pathToSearchFor, waiting);
  }
  waiting.add(multiFileInfo);
  return true;
}

/**
 * Registers every parked file definition whose path the directory watcher now reports.
 *
 * @throws IOException if the watcher or the registration of a definition fails
 */
private void findCreatedDirectories() throws IOException {
  if (!allowForLateDirectoryCreation || nonExistingPaths.isEmpty() || directoryWatcher == null) {
    return;
  }
  for (Path foundPath : directoryWatcher.find()) {
    List<MultiFileInfo> waiting = nonExistingPaths.get(foundPath);
    if (waiting == null) {
      // Nothing is parked under this path (any more); skip instead of failing on a null definition.
      LOG.debug("Found Path '{}' with no pending file definition", foundPath);
      continue;
    }
    // Drop each definition as soon as it is registered so a failure midway cannot register it twice.
    Iterator<MultiFileInfo> pending = waiting.iterator();
    while (pending.hasNext()) {
      addToContextOrGlobFileInfo(pending.next());
      pending.remove();
    }
    nonExistingPaths.remove(foundPath);
    LOG.debug("Found Path '{}'", foundPath);
  }
}
// Records, for each file context created by findNewFileContexts(), the glob definition it came from.
private Map<FileContext, GlobFileInfo> fileToGlobFile = new HashMap<>();

/**
 * Asks every registered glob definition for newly found paths and creates a {@link FileContext}
 * for each one, remembering which glob definition produced it.
 *
 * @throws IOException if a finder or the creation of a file context fails
 */
private void findNewFileContexts() throws IOException {
  // globFileInfos is created as a CopyOnWriteArrayList, so this loop walks a snapshot of it.
  for (GlobFileInfo source : globFileInfos) {
    for (Path discovered : source.find()) {
      MultiFileInfo resolvedInfo = source.getFileInfo(discovered);
      FileContext created = new FileContext(
          resolvedInfo,
          charset,
          maxLineLength,
          postProcessing,
          archiveDir,
          eventPublisher,
          inPreviewMode
      );
      fileContexts.add(created);
      fileToGlobFile.put(created, source);
      LOG.debug("Found '{}'", created);
    }
  }
}
/**
 * Closes and removes every file context that is no longer active.
 *
 * For a context that was discovered through a glob definition, the definition's finder is told
 * to forget the path, and the bookkeeping entry for the context is dropped so closed contexts do
 * not accumulate in {@code fileToGlobFile}. If anything was purged, the loop indices are reset
 * and a new loop is started.
 */
@Override
public void purge() {
  boolean purgedAtLeastOne = false;
  Iterator<FileContext> iterator = fileContexts.iterator();
  while (iterator.hasNext()) {
    FileContext fileContext = iterator.next();
    if (fileContext.isActive()) {
      continue;
    }
    fileContext.close();
    iterator.remove();
    // remove() instead of containsKey()/get(): the context is gone for good, so its mapping must
    // go too, otherwise the map grows with every purged file and keeps the closed context reachable.
    GlobFileInfo origin = fileToGlobFile.remove(fileContext);
    if (origin != null) {
      origin.forget(fileContext.getMultiFileInfo());
    }
    LOG.debug("Removed '{}'", fileContext);
    purgedAtLeastOne = true;
  }
  if (purgedAtLeastOne) {
    //reset loop counter to be within boundaries.
    resetCurrentAndStartingIdx();
    startNewLoop();
  }
}
/**
* Sets the file offsets to use for the next read. To work correctly, the last returned offsets should be used or
* an empty <code>Map</code> if there are none.
* <p>
* If a reader is already live, the corresponding set offset is ignored as we cache all the contextual information
* of live readers.
* <p>
* Before delegating to the superclass, this method refreshes the set of file contexts, in this order:
* it registers file definitions whose previously missing directory has appeared, creates contexts for newly
* found files, and purges contexts that are no longer active. These are the only calls to the discovery
* methods visible in this class.
*
* @param offsets directory offsets; validated with <code>Utils.checkNotNull</code>.
* @throws java.io.IOException thrown if there was an IO error while preparing file offsets.
*/
@Override
public void setOffsets(Map<String, String> offsets) throws IOException {
Utils.checkNotNull(offsets, "offsets");
LOG.trace("setOffsets()");
// We look for created directory paths here
findCreatedDirectories();
// we look for new files only here
findNewFileContexts();
// we purge file only here
purge();
// Hand the offsets to the base provider only after the context list is up to date.
super.setOffsets(offsets);
// purge() already starts a new loop when it removed something; this call starts one unconditionally.
startNewLoop();
}
/**
 * Releases everything this provider owns: the finder executor (if any), the directory watcher
 * (if any), every glob definition's finder, and finally the file contexts via the superclass.
 *
 * The superclass close runs in a {@code finally} block, so an unchecked exception thrown while
 * shutting down the executor, the watcher or a finder cannot leave the file contexts open.
 * An {@link IOException} from an individual finder is logged and does not stop the others
 * from being closed.
 */
@Override
public void close() {
  LOG.debug("Closed");
  try {
    if (executor != null) {
      executor.shutdownNow();
    }
    if (directoryWatcher != null) {
      directoryWatcher.close();
    }
    for (GlobFileInfo globFileInfo : globFileInfos) {
      try {
        globFileInfo.close();
      } catch (IOException ex) {
        LOG.warn("Could not close '{}': {}", globFileInfo, ex.toString(), ex);
      }
    }
  } finally {
    //Close File Contexts
    super.close();
  }
}
}