/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db.commitlog;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.cliffc.high_scale_lib.NonBlockingHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.db.ColumnFamily;
import org.apache.cassandra.db.Mutation;
import org.apache.cassandra.io.FSWriteError;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.PureJavaCrc32;
import org.apache.cassandra.utils.WaitQueue;

/*
 * A single commit log file on disk. Manages creation of the file and writing mutations to disk,
 * as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
 * files are initially allocated to a fixed size and can grow to accommodate a larger value if necessary.
 */
public class CommitLogSegment
{
    private static final Logger logger = LoggerFactory.getLogger(CommitLogSegment.class);

    private final static long idBase = System.currentTimeMillis();
    private final static AtomicInteger nextId = new AtomicInteger(1);

    // The commit log entry overhead in bytes (int: length + long: head checksum + long: tail checksum)
    static final int ENTRY_OVERHEAD_SIZE = 4 + 8 + 8;

    // The commit log (chained) sync marker/header size in bytes (int: length + long: checksum [segmentId, position])
    static final int SYNC_MARKER_SIZE = 4 + 8;

    // The current AppendLock object - i.e. the one all threads adding new log records should use to synchronise
    private final AtomicReference<AppendLock> appendLock = new AtomicReference<>(new AppendLock());

    private final AtomicInteger allocatePosition = new AtomicInteger();

    // Everything before this offset has been synced and written.  The SYNC_MARKER_SIZE bytes after
    // each sync are reserved, and point forwards to the next such offset.  The final
    // sync marker in a segment will be zeroed out, or point to EOF.
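    //
    // An illustrative layout of the marker chain on disk (offsets are invented for the example):
    //
    //     [marker @ 0:    int next = 4000, long crc]    // covers records in [12, 4000)
    //     [records 12..4000)
    //     [marker @ 4000: int next = 0,    long 0  ]    // zeroed: nothing synced past here yet
    //
    // Each marker's crc covers (segmentId, marker offset), as computed in sync() below.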
    private volatile int lastSyncedOffset;

    // the amount of the tail of the file we have allocated but not used - this is used when we discard a log segment
    // to ensure nobody writes to it after we've decided we're done with it
    private int discardedTailFrom;

    // a signal for writers to wait on to confirm the log message they provided has been written to disk
    private final WaitQueue syncComplete = new WaitQueue();

    // a map of Cf->last dirty position; together with cfClean below, this permits marking Cfs clean
    // whilst the log is still in use
    private final NonBlockingHashMap<UUID, AtomicInteger> cfDirty = new NonBlockingHashMap<>(1024);

    // a map of Cf->last clean position
    private final ConcurrentHashMap<UUID, AtomicInteger> cfClean = new ConcurrentHashMap<>();

    public final long id;

    private final File logFile;
    private final RandomAccessFile logFileAccessor;
    private final MappedByteBuffer buffer;

    public final CommitLogDescriptor descriptor;

    /**
     * @return a newly minted segment file
     */
    static CommitLogSegment freshSegment()
    {
        return new CommitLogSegment(null);
    }

    static long getNextId()
    {
        return idBase + nextId.getAndIncrement();
    }

    /**
     * Constructs a new segment file.
     *
     * @param filePath if not null, recycles the existing file by renaming it and truncating it to CommitLog.SEGMENT_SIZE.
     */
    CommitLogSegment(String filePath)
    {
        id = getNextId();
        descriptor = new CommitLogDescriptor(id);
        logFile = new File(DatabaseDescriptor.getCommitLogLocation(), descriptor.fileName());
        boolean isCreating = true;

        try
        {
            if (filePath != null)
            {
                File oldFile = new File(filePath);
                if (oldFile.exists())
                {
                    logger.debug("Re-using discarded CommitLog segment for {} from {}", id, filePath);
                    if (!oldFile.renameTo(logFile))
                        throw new IOException("Rename from " + filePath + " to " + id + " failed");
                    isCreating = false;
                }
            }

            // Open the segment file
            logFileAccessor = new RandomAccessFile(logFile, "rw");

            if (isCreating)
                logger.debug("Creating new commit log segment {}", logFile.getPath());

            // Map the segment, extending or truncating it to the standard segment size.
            // (We may have restarted after a segment size configuration change, leaving "incorrectly"
            // sized segments on disk.)
            logFileAccessor.setLength(DatabaseDescriptor.getCommitLogSegmentSize());

            buffer = logFileAccessor.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, DatabaseDescriptor.getCommitLogSegmentSize());

            // mark the initial sync marker as uninitialised
            buffer.putInt(0, 0);
            buffer.putLong(4, 0);

            allocatePosition.set(SYNC_MARKER_SIZE);
        }
        catch (IOException e)
        {
            throw new FSWriteError(e, logFile);
        }
    }
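    // A minimal sketch of the writer-side lifecycle, as seen by a caller such as CommitLog.add
    // (illustrative only; everything outside this file is an assumption):
    //
    //     Allocation alloc = new Allocation();
    //     if (!segment.allocate(mutation, size, alloc))
    //         ...                      // segment full: advance to a fresh segment and retry
    //     try
    //     {
    //         ...                      // serialise the mutation into alloc.getBuffer()
    //     }
    //     finally
    //     {
    //         alloc.markWritten();     // MUST happen, or sync() would block forever
    //     }
    //     alloc.awaitDiskSync();       // only for commit modes that wait for durability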
    /**
     * Allocate space in this buffer for the provided mutation, and populate the provided
     * Allocation object, returning true on success. False indicates there is not enough room in
     * this segment, and a new segment is needed.
     */
    boolean allocate(Mutation mutation, int size, Allocation alloc)
    {
        final AppendLock appendLock = lockForAppend();
        try
        {
            int position = allocate(size);
            if (position < 0)
            {
                appendLock.unlock();
                return false;
            }
            alloc.buffer = (ByteBuffer) buffer.duplicate().position(position).limit(position + size);
            alloc.position = position;
            alloc.segment = this;
            alloc.appendLock = appendLock;
            markDirty(mutation, position);
            return true;
        }
        catch (Throwable t)
        {
            appendLock.unlock();
            throw t;
        }
    }

    // obtain the current AppendLock and lock it for record appending
    private AppendLock lockForAppend()
    {
        while (true)
        {
            AppendLock appendLock = this.appendLock.get();
            if (appendLock.lock())
                return appendLock;
        }
    }

    // allocate bytes in the segment, or return -1 if not enough space
    private int allocate(int size)
    {
        while (true)
        {
            int prev = allocatePosition.get();
            int next = prev + size;
            if (next >= buffer.capacity())
                return -1;
            if (allocatePosition.compareAndSet(prev, next))
                return prev;
        }
    }

    // ensures no more of this segment is writeable, by allocating any unused section at the end and marking it discarded
    synchronized void discardUnusedTail()
    {
        if (discardedTailFrom > 0)
            return;
        while (true)
        {
            int prev = allocatePosition.get();
            int next = buffer.capacity();
            if (allocatePosition.compareAndSet(prev, next))
            {
                discardedTailFrom = prev;
                return;
            }
        }
    }

    /**
     * Forces a disk flush for this segment file.
     */
    synchronized void sync()
    {
        try
        {
            // check we have more work to do
            if (allocatePosition.get() <= lastSyncedOffset + SYNC_MARKER_SIZE)
                return;

            // allocate a new sync marker; this is necessary in itself, and also serves to demarcate
            // the point at which we can safely consider records to have been completely written
            int nextMarker;
            nextMarker = allocate(SYNC_MARKER_SIZE);
            boolean close = false;
            if (nextMarker < 0)
            {
                // ensure no more of this CLS is writeable, and mark ourselves for closing
                discardUnusedTail();
                close = true;

                if (discardedTailFrom < buffer.capacity() - SYNC_MARKER_SIZE)
                {
                    // if there's room in the discard section to write an empty header, use that as the nextMarker
                    nextMarker = discardedTailFrom;
                }
                else
                {
                    // not enough space left in the buffer, so mark the next sync marker as the EOF position
                    nextMarker = buffer.capacity();
                }
            }

            // swap the append lock
            AppendLock curAppendLock = appendLock.get();
            appendLock.set(new AppendLock());
            curAppendLock.expireAndWaitForCompletion();

            // write previous sync marker to point to next sync marker
            // we don't chain the crcs here to ensure this method is idempotent if it fails
            int offset = lastSyncedOffset;
            final PureJavaCrc32 crc = new PureJavaCrc32();
            crc.update((int) (id & 0xFFFFFFFFL));
            crc.update((int) (id >>> 32));
            crc.update(offset);
            buffer.putInt(offset, nextMarker);
            buffer.putLong(offset + 4, crc.getValue());

            // zero out the next sync marker so the replayer can cleanly exit
            if (nextMarker < buffer.capacity())
            {
                buffer.putInt(nextMarker, 0);
                buffer.putLong(nextMarker + 4, 0);
            }

            // actually perform the sync and signal those waiting for it
            buffer.force();
            syncComplete.signalAll();

            if (close)
            {
                close();
                nextMarker = buffer.capacity();
            }

            lastSyncedOffset = nextMarker;
        }
        catch (Exception e) // MappedByteBuffer.force() does not declare IOException but can actually throw it
        {
            throw new FSWriteError(e, getPath());
        }
    }

    public boolean isFullySynced()
    {
        return lastSyncedOffset == buffer.capacity();
    }
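    // A minimal sketch of how a reader could walk and validate the marker chain written by
    // sync() above (illustrative only; the real logic lives in the commit log replayer):
    //
    //     int offset = 0;
    //     while (offset < buffer.capacity() - SYNC_MARKER_SIZE)
    //     {
    //         int next = buffer.getInt(offset);
    //         PureJavaCrc32 crc = new PureJavaCrc32();
    //         crc.update((int) (id & 0xFFFFFFFFL));
    //         crc.update((int) (id >>> 32));
    //         crc.update(offset);
    //         if (next == 0 || crc.getValue() != buffer.getLong(offset + 4))
    //             break;   // unsynced or torn marker: stop here
    //         // records in [offset + SYNC_MARKER_SIZE, next) are durable
    //         offset = next;
    //     }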
    /**
     * Completely discards a segment file by deleting it. (Potentially blocking operation)
     */
    void delete()
    {
        FileUtils.deleteWithConfirm(logFile);
    }

    /**
     * Recycles an unneeded segment file for reuse.
     *
     * @return a new CommitLogSegment representing the newly reusable segment.
     */
    CommitLogSegment recycle()
    {
        try
        {
            sync();
        }
        catch (FSWriteError e)
        {
            logger.error("I/O error flushing {} {}", this, e.getMessage());
            throw e;
        }
        close();
        return new CommitLogSegment(getPath());
    }

    /**
     * @return the current ReplayPosition for this log segment
     */
    public ReplayPosition getContext()
    {
        return new ReplayPosition(id, allocatePosition.get());
    }

    /**
     * @return the file path to this segment
     */
    public String getPath()
    {
        return logFile.getPath();
    }

    /**
     * @return the file name of this segment
     */
    public String getName()
    {
        return logFile.getName();
    }

    /**
     * Close the segment file.
     */
    void close()
    {
        try
        {
            FileUtils.clean(buffer);
            logFileAccessor.close();
        }
        catch (IOException e)
        {
            throw new FSWriteError(e, getPath());
        }
    }

    void markDirty(Mutation mutation, int allocatedPosition)
    {
        for (ColumnFamily columnFamily : mutation.getColumnFamilies())
        {
            // check for deleted CFs
            CFMetaData cfm = columnFamily.metadata();
            if (cfm.isPurged())
                logger.error("Attempted to write commit log entry for unrecognized column family: {}", columnFamily.id());
            else
                ensureAtleast(cfDirty, cfm.cfId, allocatedPosition);
        }
    }

    /**
     * Marks the ColumnFamily specified by cfId as clean for this log segment. If the
     * given context argument is contained in this file, it will only mark the CF as
     * clean if no newer writes have taken place.
     *
     * @param cfId    the column family ID that is now clean
     * @param context the optional clean offset
     */
    public void markClean(UUID cfId, ReplayPosition context)
    {
        if (!cfDirty.containsKey(cfId))
            return;
        if (context.segment == id)
            markClean(cfId, context.position);
        else if (context.segment > id)
            markClean(cfId, Integer.MAX_VALUE);
    }

    private void markClean(UUID cfId, int position)
    {
        ensureAtleast(cfClean, cfId, position);
        removeCleanFromDirty();
    }

    private static void ensureAtleast(ConcurrentMap<UUID, AtomicInteger> map, UUID cfId, int value)
    {
        AtomicInteger i = map.get(cfId);
        if (i == null)
        {
            AtomicInteger i2 = map.putIfAbsent(cfId, i = new AtomicInteger());
            if (i2 != null)
                i = i2;
        }
        while (true)
        {
            int cur = i.get();
            if (cur > value)
                break;
            if (i.compareAndSet(cur, value))
                break;
        }
    }

    private void removeCleanFromDirty()
    {
        // if we're still allocating from this segment, don't touch anything since it can't be done thread-safely
        if (!isFullySynced())
            return;

        Iterator<Map.Entry<UUID, AtomicInteger>> iter = cfClean.entrySet().iterator();
        while (iter.hasNext())
        {
            Map.Entry<UUID, AtomicInteger> clean = iter.next();
            UUID cfId = clean.getKey();
            AtomicInteger cleanPos = clean.getValue();
            AtomicInteger dirtyPos = cfDirty.get(cfId);
            if (dirtyPos != null && dirtyPos.intValue() < cleanPos.intValue())
            {
                cfDirty.remove(cfId);
                iter.remove();
            }
        }
    }
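    // An illustrative walk-through of the dirty/clean bookkeeping above (positions are invented,
    // and contextAt() is a hypothetical helper building a ReplayPosition within this segment):
    //
    //     segment.markDirty(mutation, 100);          // cfDirty[cfId] advances to >= 100
    //     segment.markClean(cfId, contextAt(50));    // cfClean[cfId] >= 50: still dirty, since 100 > 50
    //     segment.markClean(cfId, contextAt(120));   // cfClean[cfId] >= 120: removable once fully synced
    //
    // ensureAtleast() only ever advances a position, so late-arriving, out-of-order calls
    // cannot move a CF backwards between the clean and dirty states.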
    /**
     * @return a collection of dirty CFIDs for this segment file.
     */
    public Collection<UUID> getDirtyCFIDs()
    {
        removeCleanFromDirty();
        if (cfClean.isEmpty() || cfDirty.isEmpty())
            return cfDirty.keySet();

        List<UUID> r = new ArrayList<>(cfDirty.size());
        for (Map.Entry<UUID, AtomicInteger> dirty : cfDirty.entrySet())
        {
            UUID cfId = dirty.getKey();
            AtomicInteger dirtyPos = dirty.getValue();
            AtomicInteger cleanPos = cfClean.get(cfId);
            if (cleanPos == null || cleanPos.intValue() < dirtyPos.intValue())
                r.add(dirty.getKey());
        }
        return r;
    }

    /**
     * @return true if this segment is unused and safe to recycle or delete
     */
    public boolean isUnused()
    {
        // if it's not fully synced, we assume we're still in use as the active allocatingFrom
        if (!isFullySynced())
            return false;

        removeCleanFromDirty();
        return cfDirty.isEmpty();
    }

    /**
     * Check to see if a certain ReplayPosition is contained by this segment file.
     *
     * @param context the replay position to be checked
     * @return true if the replay position is contained by this segment file.
     */
    public boolean contains(ReplayPosition context)
    {
        return context.segment == id;
    }

    // For debugging, not fast
    public String dirtyString()
    {
        StringBuilder sb = new StringBuilder();
        for (UUID cfId : getDirtyCFIDs())
        {
            CFMetaData m = Schema.instance.getCFMetaData(cfId);
            sb.append(m == null ? "<deleted>" : m.cfName).append(" (").append(cfId).append("), ");
        }
        return sb.toString();
    }

    @Override
    public String toString()
    {
        return "CommitLogSegment(" + getPath() + ')';
    }

    public static class CommitLogSegmentFileComparator implements Comparator<File>
    {
        public int compare(File f, File f2)
        {
            CommitLogDescriptor desc = CommitLogDescriptor.fromFileName(f.getName());
            CommitLogDescriptor desc2 = CommitLogDescriptor.fromFileName(f2.getName());
            // compare the long ids directly; subtracting and casting to int can overflow and invert the order
            return Long.compare(desc.id, desc2.id);
        }
    }

    /**
     * A relatively simple class for synchronising flushes with log message writers:
     * Log writers take the readLock prior to allocating themselves space in the segment;
     * once they complete writing the record they release the read lock. A call to sync()
     * will first check the position up to which we have allocated space, then allocate a new AppendLock object,
     * take the writeLock of the previous AppendLock, and invalidate it for further log writes. All appends are
     * redirected to the new AppendLock so they do not block; only the sync() blocks waiting to obtain the writeLock.
     * Once it obtains the lock it is guaranteed that all writes up to the allocation position it checked at
     * the start have been completely written.
     */
    private static final class AppendLock
    {
        final ReadWriteLock syncLock = new ReentrantReadWriteLock();
        final Lock logLock = syncLock.readLock();

        // set once sync() has swapped in a new AppendLock; no further appends may acquire this one
        boolean expired;

        // returns false if the lock could not be acquired for adding a log record;
        // a new AppendLock object will already be available, so fetch appendLock.get()
        // and retry
        boolean lock()
        {
            if (!logLock.tryLock())
                return false;
            if (expired)
            {
                logLock.unlock();
                return false;
            }
            return true;
        }

        // release the lock so that a sync() may complete
        void unlock()
        {
            logLock.unlock();
        }

        void expireAndWaitForCompletion()
        {
            // wait for in-flight log records to complete (take the writeLock)
            syncLock.writeLock().lock();
            expired = true;
            // release the lock immediately; effectively a NOOP since appends use tryLock() and check expired
            syncLock.writeLock().unlock();
        }
    }
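    // A minimal sketch of the lock handoff sync() performs with the class above (illustrative only):
    //
    //     AppendLock old = appendLock.get();
    //     appendLock.set(new AppendLock());     // new appends acquire the fresh lock and proceed
    //     old.expireAndWaitForCompletion();     // blocks only until in-flight appends unlock()
    //
    // Writers therefore never block on sync(): a failed lock() just means the lock was swapped,
    // so lockForAppend() loops and acquires the replacement.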
    /**
     * A simple class for tracking information about the portion of a segment that has been allocated to a log write.
     * The constructor leaves the fields uninitialized for population by allocate(), so that it can be
     * stack-allocated by escape analysis in CommitLog.add.
     */
    static final class Allocation
    {
        private CommitLogSegment segment;
        private AppendLock appendLock;
        private int position;
        private ByteBuffer buffer;

        CommitLogSegment getSegment()
        {
            return segment;
        }

        ByteBuffer getBuffer()
        {
            return buffer;
        }

        // markWritten() MUST be called once we are done with the segment or the CL will never flush
        void markWritten()
        {
            appendLock.unlock();
        }

        void awaitDiskSync()
        {
            while (segment.lastSyncedOffset < position)
            {
                WaitQueue.Signal signal = segment.syncComplete.register();
                if (segment.lastSyncedOffset < position)
                    signal.awaitUninterruptibly();
                else
                    signal.cancel(); // don't leak the registration if the sync raced ahead of us
            }
        }
    }
}