/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db.commitlog;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.cliffc.high_scale_lib.NonBlockingHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.db.ColumnFamily;
import org.apache.cassandra.db.Mutation;
import org.apache.cassandra.io.FSWriteError;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.PureJavaCrc32;
import org.apache.cassandra.utils.WaitQueue;
/*
* A single commit log file on disk. Manages creation of the file and writing mutations to disk,
* as well as tracking the last mutation position of any "dirty" CFs covered by the segment file. Segment
 * files are initially allocated to a fixed size and can grow to accommodate a larger value if necessary.
*/
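/*
 * Illustrative lifecycle sketch, assuming a caller shaped like CommitLog.add (the
 * mutation, size and serialization steps below are placeholders, not real fields):
 *
 *   CommitLogSegment segment = CommitLogSegment.freshSegment();
 *   CommitLogSegment.Allocation alloc = new CommitLogSegment.Allocation();
 *   if (segment.allocate(mutation, size, alloc))      // reserves space and takes the append lock
 *   {
 *       alloc.getBuffer().put(serializedMutation);    // write the record into the mapped region
 *       alloc.markWritten();                          // releases the append lock; sync() may now proceed
 *       alloc.awaitDiskSync();                        // optionally block until the record is on disk
 *   }
 *   // periodically, from the sync thread:
 *   segment.sync();
 *   if (segment.isUnused())
 *       segment.recycle();                            // or segment.delete()
 */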
public class CommitLogSegment
{
private static final Logger logger = LoggerFactory.getLogger(CommitLogSegment.class);
    private static final long idBase = System.currentTimeMillis();
    private static final AtomicInteger nextId = new AtomicInteger(1);
// The commit log entry overhead in bytes (int: length + long: head checksum + long: tail checksum)
static final int ENTRY_OVERHEAD_SIZE = 4 + 8 + 8;
    // The commit log (chained) sync marker/header size in bytes (int: offset of the next sync marker + long: CRC of [segmentId, marker offset])
static final int SYNC_MARKER_SIZE = 4 + 8;
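    // Resulting on-disk layout (illustrative sketch):
    //   [marker 0][record][record]...[marker 1][record]...[marker N: zeroed, or pointing at EOF]
    // Each marker's int holds the offset of the next marker, so a replayer can hop from
    // marker to marker and treat only the records behind a valid marker as fully synced.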
// The current AppendLock object - i.e. the one all threads adding new log records should use to synchronise
private final AtomicReference<AppendLock> appendLock = new AtomicReference<>(new AppendLock());
private final AtomicInteger allocatePosition = new AtomicInteger();
// Everything before this offset has been synced and written. The SYNC_MARKER_SIZE bytes after
// each sync are reserved, and point forwards to the next such offset. The final
// sync marker in a segment will be zeroed out, or point to EOF.
private volatile int lastSyncedOffset;
// the amount of the tail of the file we have allocated but not used - this is used when we discard a log segment
// to ensure nobody writes to it after we've decided we're done with it
private int discardedTailFrom;
// a signal for writers to wait on to confirm the log message they provided has been written to disk
private final WaitQueue syncComplete = new WaitQueue();
    // a map of Cf->last dirty (written) position in this segment
private final NonBlockingHashMap<UUID, AtomicInteger> cfDirty = new NonBlockingHashMap<>(1024);
    // a map of Cf->last clean (flushed) position; kept separately so Cfs can be marked clean whilst the log is still in use
private final ConcurrentHashMap<UUID, AtomicInteger> cfClean = new ConcurrentHashMap<>();
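    // Worked example of the dirty/clean bookkeeping (illustrative positions): if CF X last
    // appended at position 100 (cfDirty[X] = 100) and a flush later covers it through
    // position 150 (cfClean[X] = 150), then dirty < clean and removeCleanFromDirty() drops
    // X from both maps once the segment is fully synced, letting isUnused() return true.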
public final long id;
private final File logFile;
private final RandomAccessFile logFileAccessor;
private final MappedByteBuffer buffer;
public final CommitLogDescriptor descriptor;
/**
* @return a newly minted segment file
*/
static CommitLogSegment freshSegment()
{
return new CommitLogSegment(null);
}
static long getNextId()
{
return idBase + nextId.getAndIncrement();
}
/**
* Constructs a new segment file.
*
     * @param filePath if not null, recycles the existing file by renaming it and resizing it to the configured segment size.
*/
CommitLogSegment(String filePath)
{
id = getNextId();
descriptor = new CommitLogDescriptor(id);
logFile = new File(DatabaseDescriptor.getCommitLogLocation(), descriptor.fileName());
boolean isCreating = true;
try
{
if (filePath != null)
{
File oldFile = new File(filePath);
if (oldFile.exists())
{
logger.debug("Re-using discarded CommitLog segment for {} from {}", id, filePath);
if (!oldFile.renameTo(logFile))
throw new IOException("Rename from " + filePath + " to " + id + " failed");
isCreating = false;
}
}
            // Open the segment file
logFileAccessor = new RandomAccessFile(logFile, "rw");
if (isCreating)
logger.debug("Creating new commit log segment {}", logFile.getPath());
// Map the segment, extending or truncating it to the standard segment size.
// (We may have restarted after a segment size configuration change, leaving "incorrectly"
// sized segments on disk.)
logFileAccessor.setLength(DatabaseDescriptor.getCommitLogSegmentSize());
buffer = logFileAccessor.getChannel().map(FileChannel.MapMode.READ_WRITE, 0, DatabaseDescriptor.getCommitLogSegmentSize());
// mark the initial header as uninitialised
buffer.putInt(0, 0);
buffer.putLong(4, 0);
allocatePosition.set(SYNC_MARKER_SIZE);
}
catch (IOException e)
{
throw new FSWriteError(e, logFile);
}
}
/**
     * Allocates space in this buffer for the provided mutation and populates the provided
     * Allocation object, returning true on success. False indicates there is not enough room in
     * this segment, and a new segment is needed.
*/
boolean allocate(Mutation mutation, int size, Allocation alloc)
{
final AppendLock appendLock = lockForAppend();
try
{
int position = allocate(size);
if (position < 0)
{
appendLock.unlock();
return false;
}
alloc.buffer = (ByteBuffer) buffer.duplicate().position(position).limit(position + size);
alloc.position = position;
alloc.segment = this;
alloc.appendLock = appendLock;
markDirty(mutation, position);
return true;
}
catch (Throwable t)
{
appendLock.unlock();
throw t;
}
}
// obtain the current AppendLock and lock it for record appending
private AppendLock lockForAppend()
{
while (true)
{
AppendLock appendLock = this.appendLock.get();
if (appendLock.lock())
return appendLock;
}
}
// allocate bytes in the segment, or return -1 if not enough space
private int allocate(int size)
{
while (true)
{
int prev = allocatePosition.get();
int next = prev + size;
if (next >= buffer.capacity())
return -1;
if (allocatePosition.compareAndSet(prev, next))
return prev;
}
}
    // Ensures no more of this segment is writeable by allocating any unused section at the end and marking it discarded.
synchronized void discardUnusedTail()
{
if (discardedTailFrom > 0)
return;
while (true)
{
int prev = allocatePosition.get();
int next = buffer.capacity();
if (allocatePosition.compareAndSet(prev, next))
{
discardedTailFrom = prev;
return;
}
}
}
/**
* Forces a disk flush for this segment file.
*/
synchronized void sync()
{
try
{
// check we have more work to do
if (allocatePosition.get() <= lastSyncedOffset + SYNC_MARKER_SIZE)
return;
            // allocate a new sync marker; this is necessary in itself, but also serves to demarcate
            // the point up to which we can safely consider records to have been completely written
            int nextMarker = allocate(SYNC_MARKER_SIZE);
boolean close = false;
if (nextMarker < 0)
{
// ensure no more of this CLS is writeable, and mark ourselves for closing
discardUnusedTail();
close = true;
if (discardedTailFrom < buffer.capacity() - SYNC_MARKER_SIZE)
{
// if there's room in the discard section to write an empty header, use that as the nextMarker
nextMarker = discardedTailFrom;
}
else
{
// not enough space left in the buffer, so mark the next sync marker as the EOF position
nextMarker = buffer.capacity();
}
}
// swap the append lock
AppendLock curAppendLock = appendLock.get();
appendLock.set(new AppendLock());
curAppendLock.expireAndWaitForCompletion();
// write previous sync marker to point to next sync marker
// we don't chain the crcs here to ensure this method is idempotent if it fails
int offset = lastSyncedOffset;
final PureJavaCrc32 crc = new PureJavaCrc32();
crc.update((int) (id & 0xFFFFFFFFL));
crc.update((int) (id >>> 32));
crc.update(offset);
buffer.putInt(offset, nextMarker);
buffer.putLong(offset + 4, crc.getValue());
            // zero out the next sync marker so the replayer can cleanly exit
if (nextMarker < buffer.capacity())
{
buffer.putInt(nextMarker, 0);
buffer.putLong(nextMarker + 4, 0);
}
// actually perform the sync and signal those waiting for it
buffer.force();
syncComplete.signalAll();
if (close)
{
close();
nextMarker = buffer.capacity();
}
lastSyncedOffset = nextMarker;
}
catch (Exception e) // MappedByteBuffer.force() does not declare IOException but can actually throw it
{
throw new FSWriteError(e, getPath());
}
}
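    // Replay sketch for the marker protocol above (reasoning, not additional behaviour):
    // a replayer walks the marker chain from the start of the segment; the CRC over
    // [segmentId, marker offset] rejects stale markers left behind by a recycled file,
    // and a zeroed marker (or one pointing at EOF) is the clean end of replayable data.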
public boolean isFullySynced()
{
return lastSyncedOffset == buffer.capacity();
}
/**
* Completely discards a segment file by deleting it. (Potentially blocking operation)
*/
void delete()
{
FileUtils.deleteWithConfirm(logFile);
}
/**
* Recycle processes an unneeded segment file for reuse.
*
* @return a new CommitLogSegment representing the newly reusable segment.
*/
CommitLogSegment recycle()
{
try
{
sync();
}
catch (FSWriteError e)
{
logger.error("I/O error flushing {} {}", this, e.getMessage());
throw e;
}
close();
return new CommitLogSegment(getPath());
}
/**
* @return the current ReplayPosition for this log segment
*/
public ReplayPosition getContext()
{
return new ReplayPosition(id, allocatePosition.get());
}
/**
* @return the file path to this segment
*/
public String getPath()
{
return logFile.getPath();
}
/**
* @return the file name of this segment
*/
public String getName()
{
return logFile.getName();
}
/**
* Close the segment file.
*/
void close()
{
try
{
FileUtils.clean(buffer);
logFileAccessor.close();
}
catch (IOException e)
{
throw new FSWriteError(e, getPath());
}
}
void markDirty(Mutation mutation, int allocatedPosition)
{
for (ColumnFamily columnFamily : mutation.getColumnFamilies())
{
            // check for a deleted CF
CFMetaData cfm = columnFamily.metadata();
if (cfm.isPurged())
logger.error("Attempted to write commit log entry for unrecognized column family: {}", columnFamily.id());
else
                ensureAtLeast(cfDirty, cfm.cfId, allocatedPosition);
}
}
/**
* Marks the ColumnFamily specified by cfId as clean for this log segment. If the
* given context argument is contained in this file, it will only mark the CF as
* clean if no newer writes have taken place.
*
* @param cfId the column family ID that is now clean
* @param context the optional clean offset
*/
public void markClean(UUID cfId, ReplayPosition context)
{
if (!cfDirty.containsKey(cfId))
return;
if (context.segment == id)
markClean(cfId, context.position);
else if (context.segment > id)
markClean(cfId, Integer.MAX_VALUE);
}
private void markClean(UUID cfId, int position)
{
        ensureAtLeast(cfClean, cfId, position);
removeCleanFromDirty();
}
    private static void ensureAtLeast(ConcurrentMap<UUID, AtomicInteger> map, UUID cfId, int value)
{
AtomicInteger i = map.get(cfId);
if (i == null)
{
AtomicInteger i2 = map.putIfAbsent(cfId, i = new AtomicInteger());
if (i2 != null)
i = i2;
}
while (true)
{
int cur = i.get();
if (cur > value)
break;
if (i.compareAndSet(cur, value))
break;
}
}
private void removeCleanFromDirty()
{
// if we're still allocating from this segment, don't touch anything since it can't be done thread-safely
if (!isFullySynced())
return;
Iterator<Map.Entry<UUID, AtomicInteger>> iter = cfClean.entrySet().iterator();
while (iter.hasNext())
{
Map.Entry<UUID, AtomicInteger> clean = iter.next();
UUID cfId = clean.getKey();
AtomicInteger cleanPos = clean.getValue();
AtomicInteger dirtyPos = cfDirty.get(cfId);
if (dirtyPos != null && dirtyPos.intValue() < cleanPos.intValue())
{
cfDirty.remove(cfId);
iter.remove();
}
}
}
/**
* @return a collection of dirty CFIDs for this segment file.
*/
public Collection<UUID> getDirtyCFIDs()
{
removeCleanFromDirty();
if (cfClean.isEmpty() || cfDirty.isEmpty())
return cfDirty.keySet();
List<UUID> r = new ArrayList<>(cfDirty.size());
for (Map.Entry<UUID, AtomicInteger> dirty : cfDirty.entrySet())
{
UUID cfId = dirty.getKey();
AtomicInteger dirtyPos = dirty.getValue();
AtomicInteger cleanPos = cfClean.get(cfId);
if (cleanPos == null || cleanPos.intValue() < dirtyPos.intValue())
r.add(dirty.getKey());
}
return r;
}
/**
* @return true if this segment is unused and safe to recycle or delete
*/
public boolean isUnused()
{
// if it's not fully synced, we assume we're still in use as the active allocatingFrom
if (!isFullySynced())
return false;
removeCleanFromDirty();
return cfDirty.isEmpty();
}
/**
* Check to see if a certain ReplayPosition is contained by this segment file.
*
* @param context the replay position to be checked
* @return true if the replay position is contained by this segment file.
*/
public boolean contains(ReplayPosition context)
{
return context.segment == id;
}
// For debugging, not fast
public String dirtyString()
{
StringBuilder sb = new StringBuilder();
for (UUID cfId : getDirtyCFIDs())
{
CFMetaData m = Schema.instance.getCFMetaData(cfId);
sb.append(m == null ? "<deleted>" : m.cfName).append(" (").append(cfId).append("), ");
}
return sb.toString();
}
@Override
public String toString()
{
return "CommitLogSegment(" + getPath() + ')';
}
public static class CommitLogSegmentFileComparator implements Comparator<File>
{
public int compare(File f, File f2)
{
CommitLogDescriptor desc = CommitLogDescriptor.fromFileName(f.getName());
CommitLogDescriptor desc2 = CommitLogDescriptor.fromFileName(f2.getName());
            return Long.compare(desc.id, desc2.id); // avoid overflow from subtracting long ids
}
}
/**
     * A relatively simple class for synchronising calls to sync() with log record writers:
     * log writers take the read lock prior to allocating themselves space in the segment;
     * once they complete writing the record they release the read lock. A call to sync()
     * will first check the position up to which we have allocated space, then allocate a new AppendLock object,
     * take the writeLock of the previous AppendLock, and invalidate it for further log writes. All appends are
     * redirected to the new AppendLock so they do not block; only sync() blocks waiting to obtain the writeLock.
     * Once it obtains the lock it is guaranteed that all writes up to the allocation position it checked at
     * the start have been completed.
*/
private static final class AppendLock
{
final ReadWriteLock syncLock = new ReentrantReadWriteLock();
final Lock logLock = syncLock.readLock();
        // set once this AppendLock has been superseded by a sync(); appenders must fetch the current lock and retry
        boolean expired;
        // returns false if the lock could not be acquired for adding a log record;
        // a new AppendLock object will already be available, so fetch appendLock.get()
        // and retry
boolean lock()
{
if (!logLock.tryLock())
return false;
if (expired)
{
logLock.unlock();
return false;
}
return true;
}
        // release the read lock so that a pending sync() may proceed
void unlock()
{
logLock.unlock();
}
void expireAndWaitForCompletion()
{
// wait for log records to complete (take writeLock)
syncLock.writeLock().lock();
expired = true;
            // release the write lock immediately; appenders use tryLock() and check the expired flag, so none can successfully acquire this lock again
syncLock.writeLock().unlock();
}
}
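    /*
     * Illustrative interleaving of a writer with sync() (assumed schedule, not extra code):
     *
     *   writer thread                            sync thread
     *   -------------                            -----------
     *   lockForAppend()      // read lock held
     *   allocate(size); write the record         appendLock.set(new AppendLock())
     *                                            old.expireAndWaitForCompletion() // blocks on write lock
     *   unlock()             // markWritten()
     *                                            // write lock acquired: all earlier records are complete
     *                                            write sync marker; buffer.force()
     */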
/**
* A simple class for tracking information about the portion of a segment that has been allocated to a log write.
     * The constructor leaves the fields uninitialized for population by CommitLogSegment.allocate(), so that it can be
* stack-allocated by escape analysis in CommitLog.add.
*/
static final class Allocation
{
private CommitLogSegment segment;
private AppendLock appendLock;
private int position;
private ByteBuffer buffer;
CommitLogSegment getSegment()
{
return segment;
}
ByteBuffer getBuffer()
{
return buffer;
}
// markWritten() MUST be called once we are done with the segment or the CL will never flush
void markWritten()
{
appendLock.unlock();
}
        void awaitDiskSync()
        {
            while (segment.lastSyncedOffset < position)
            {
                // register-then-recheck avoids a lost wakeup between the check and the wait
                WaitQueue.Signal signal = segment.syncComplete.register();
                if (segment.lastSyncedOffset < position)
                    signal.awaitUninterruptibly();
                else
                    signal.cancel(); // already synced; cancel the registration rather than leak it
            }
        }
}
}