/**
 * Copyright 2015 StreamSets Inc.
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.streamsets.pipeline.lib.io;

import com.streamsets.pipeline.api.impl.Utils;
import com.streamsets.pipeline.config.PostProcessingOptions;
import com.streamsets.pipeline.lib.util.ThreadUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Closeable;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * The <code>MultiFileReader</code> is a reader that reads multiple files in 'tail -f' mode while
 * keeping track of the current offsets and detecting if the files have been renamed.
 * <p/>
 * It builds on top of the {@link SingleLineLiveFileReader}, adding support for reading data from multiple files
 * in different directories.
 * <p/>
 * Directories are read in round-robin fashion to avoid starvation.
 * <p/>
 * The usage pattern is:
 * <p/>
 * <pre>
 *   offsetMap = ....
 *   reader.setOffsets(offsetMap);
 *   chunk = reader.next(timeoutInMillis);
 *   if (chunk != null) {
 *     ....
 *   }
 *   offsetMap = reader.getOffsets();
 * </pre>
 * <p/>
 * The offsetMap must be kept/persisted by the caller to ensure current offsets are not lost.
 */
public class MultiFileReader implements Closeable {
  private static final Logger LOG = LoggerFactory.getLogger(MultiFileReader.class);

  private static final long MAX_YIELD_TIME = Long.parseLong(System.getProperty("MultiFileReader.yield.ms", "500"));

  private final FileContextProvider fileContextProvider;
  private final List<FileEvent> events;
  private boolean open;
  private boolean inPreviewMode;

  /**
   * Creates a <code>MultiFileReader</code> that will scan/read multiple directories for data.
   *
   * @param fileInfos a list with the information for each directory to scan/read.
   * @param charset the data charset (for all files).
   * @param maxLineLength the maximum line length (for all files).
   * @param postProcessing the post-processing option for fully read files.
   * @param archiveDir the archive directory; required when <code>postProcessing</code> is ARCHIVE.
   * @param globbing if <code>true</code>, file paths are treated as glob patterns.
   * @param scanIntervalSecs interval, in seconds, between directory scans (used when globbing).
   * @param allowForLateDirectoryCreation if <code>true</code>, directories may be created after the reader starts.
   * @param inPreviewMode whether the reader is running in preview mode.
   * @throws IOException thrown if there was an IO error while creating the reader.
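   *
   * <p>
   * A minimal construction sketch (illustrative values; assumes the caller has built the
   * {@code MultiFileInfo} list elsewhere):
   * </p>
   * <pre>
   *   List&lt;MultiFileInfo&gt; fileInfos = ....
   *   MultiFileReader reader = new MultiFileReader(
   *       fileInfos,
   *       StandardCharsets.UTF_8,       // charset
   *       1024,                         // maxLineLength
   *       PostProcessingOptions.NONE,   // no post-processing, so no archiveDir needed
   *       null,                         // archiveDir
   *       true,                         // globbing
   *       10,                           // scanIntervalSecs
   *       false,                        // allowForLateDirectoryCreation
   *       false                         // inPreviewMode
   *   );
   * </pre>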
   */
  public MultiFileReader(
      List<MultiFileInfo> fileInfos,
      Charset charset,
      int maxLineLength,
      PostProcessingOptions postProcessing,
      String archiveDir,
      boolean globbing,
      int scanIntervalSecs,
      boolean allowForLateDirectoryCreation,
      boolean inPreviewMode
  ) throws IOException {
    Utils.checkNotNull(fileInfos, "fileInfos");
    Utils.checkArgument(!fileInfos.isEmpty(), "fileInfos cannot be empty");
    Utils.checkNotNull(charset, "charset");
    Utils.checkArgument(maxLineLength > 1, "maxLineLength must be greater than one");
    Utils.checkNotNull(postProcessing, "postProcessing");
    Utils.checkArgument(
        postProcessing != PostProcessingOptions.ARCHIVE || (archiveDir != null && !archiveDir.isEmpty()),
        "archiveDir cannot be empty if postProcessing is ARCHIVE");
    archiveDir = (postProcessing == PostProcessingOptions.ARCHIVE) ? archiveDir : null;
    this.inPreviewMode = inPreviewMode;
    events = new ArrayList<>(fileInfos.size() * 2);
    FileEventPublisher eventPublisher = new FileEventPublisher() {
      @Override
      public void publish(FileEvent event) {
        events.add(event);
      }
    };
    // ExactFileContextProvider assumes the file infos are exact paths that are already present.
    // GlobFileContextProvider (used by FileTailSource) supports wildcards and late directory creation.
    fileContextProvider = (globbing)
        ? new GlobFileContextProvider(
            allowForLateDirectoryCreation,
            fileInfos,
            scanIntervalSecs,
            charset,
            maxLineLength,
            postProcessing,
            archiveDir,
            eventPublisher,
            inPreviewMode
        )
        : new ExactFileContextProvider(
            fileInfos,
            charset,
            maxLineLength,
            postProcessing,
            archiveDir,
            eventPublisher,
            inPreviewMode
        );
    open = true;
  }

  /**
   * Sets the file offsets to use for the next read. To work correctly, the last returned offsets should be used, or
   * an empty <code>Map</code> if there are none.
   * <p/>
   * If a reader is already live, the corresponding set offset is ignored as we cache all the contextual information
   * of live readers.
   *
   * @param offsets directory offsets.
   * @throws IOException thrown if there was an IO error while preparing file offsets.
   */
  public void setOffsets(Map<String, String> offsets) throws IOException {
    Utils.checkState(open, "Not open");
    fileContextProvider.setOffsets(offsets);
    // we reset the events on every setOffsets().
    events.clear();
  }

  /**
   * Purges invalid file entries.
   */
  public void purge() {
    fileContextProvider.purge();
  }

  /**
   * Returns the current file offsets. The returned offsets should be set before the next read.
   *
   * @return the current file offsets.
   * @throws IOException thrown if there was an IO error while preparing file offsets.
   */
  public Map<String, String> getOffsets() throws IOException {
    Utils.checkState(open, "Not open");
    return fileContextProvider.getOffsets();
  }

  /**
   * Returns all file events (start and end) since the last {@link #setOffsets(java.util.Map)} call.
   *
   * @return all file events.
   */
  public List<FileEvent> getEvents() {
    return events;
  }

  // whether we are past the timeout
  private boolean isTimeout(long startTime, long maxWaitTimeMillis) {
    return (System.currentTimeMillis() - startTime) > maxWaitTimeMillis;
  }

  // remaining time until the timeout, returns zero if already timed out
  private long getRemainingWaitTime(long startTime, long maxWaitTimeMillis) {
    long remaining = maxWaitTimeMillis - (System.currentTimeMillis() - startTime);
    return (remaining > 0) ? remaining : 0;
  }

  /**
   * Reads the next {@link LiveFileChunk} from the directories, waiting the specified time for one.
   *
   * @param waitMillis number of milliseconds to block waiting for a chunk.
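   *                   A non-positive value is treated as zero; even then, the call performs at least
   *                   one full pass over the directories before returning.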
   * @return the next chunk, or <code>null</code> if there is no next chunk and the waiting time passed.
   */
  public LiveFileChunk next(long waitMillis) {
    Utils.checkState(open, "Not open");
    waitMillis = (waitMillis > 0) ? waitMillis : 0;
    long startTime = System.currentTimeMillis();
    LiveFileChunk chunk = null;
    boolean exit = false;
    fileContextProvider.startNewLoop();
    while (!exit) {
      if (!fileContextProvider.didFullLoop()) {
        FileContext fileContext = fileContextProvider.next();
        try {
          LiveFileReader reader = fileContext.getReader();
          if (reader != null) {
            if (reader.hasNext()) {
              chunk = reader.next(0);
              if (LOG.isTraceEnabled()) {
                LOG.trace("next(): directory '{}', file '{}', offset '{}' got data '{}'",
                    fileContext.getMultiFileInfo().getFileFullPath(),
                    reader.getLiveFile(),
                    reader.getOffset(),
                    chunk != null
                );
              }
            } else {
              if (LOG.isTraceEnabled()) {
                LOG.trace("next(): directory '{}', file '{}', offset '{}' EOF reached",
                    fileContext.getMultiFileInfo().getFileFullPath(),
                    reader.getLiveFile(),
                    reader.getOffset()
                );
              }
            }
            fileContext.releaseReader(false);
          } else {
            if (LOG.isTraceEnabled()) {
              LOG.trace("next(): directory '{}', no reader available",
                  fileContext.getMultiFileInfo().getFileFullPath());
            }
          }
        } catch (IOException ex) {
          LOG.error("Error while reading file: {}", ex.toString(), ex);
          try {
            fileContext.releaseReader(true);
          } catch (IOException ex1) {
            LOG.warn("Error while releasing reader in error: {}", ex1.toString(), ex1);
          }
        }
      }
      // check exit conditions (we have a chunk, or we waited waitMillis and timed out)
      exit = chunk != null;
      if (!exit) {
        // if we looped through all dir contexts in this call we yield the CPU
        if (fileContextProvider.didFullLoop()) {
          exit = isTimeout(startTime, waitMillis);
          if (!exit && LOG.isTraceEnabled()) {
            LOG.trace("next(): looped through all directories, yielding CPU");
          }
          exit = exit || !ThreadUtil.sleep(Math.min(getRemainingWaitTime(startTime, waitMillis), MAX_YIELD_TIME));
          fileContextProvider.startNewLoop();
        }
      }
    }
    return chunk;
  }

  /**
   * Determines the offset lag for each active file being read.
   *
   * @param offsetMap the current offsets for file keys.
   * @return map of file key to offset lag.
   */
  public Map<String, Long> getOffsetsLag(Map<String, String> offsetMap) throws IOException {
    return fileContextProvider.getOffsetsLag(offsetMap);
  }

  /**
   * Determines the number of files yet to be processed.
   *
   * @return map of file key (one per directory where files are located) to the number of pending files.
   */
  public Map<String, Long> getPendingFiles() throws IOException {
    return fileContextProvider.getPendingFiles();
  }

  /**
   * Closes all open readers.
   */
  @Override
  public void close() throws IOException {
    if (open) {
      open = false;
      fileContextProvider.close();
    }
  }
}
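
/*
 * Illustrative read loop (a sketch mirroring the usage pattern in the class Javadoc;
 * loadOffsets(), saveOffsets() and process() are hypothetical caller-side helpers):
 *
 *   Map<String, String> offsets = loadOffsets();   // empty map on the first run
 *   reader.setOffsets(offsets);
 *   LiveFileChunk chunk = reader.next(1000);
 *   if (chunk != null) {
 *     process(chunk);
 *   }
 *   saveOffsets(reader.getOffsets());              // persist so current offsets are not lost
 */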