/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.io.sstable;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.db.ArrayBackedSortedColumns;
import org.apache.cassandra.db.ColumnFamily;
import org.apache.cassandra.db.ColumnIndex;
import org.apache.cassandra.db.ColumnSerializer;
import org.apache.cassandra.db.CounterCell;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.DeletionTime;
import org.apache.cassandra.db.OnDiskAtom;
import org.apache.cassandra.db.RangeTombstone;
import org.apache.cassandra.db.RowIndexEntry;
import org.apache.cassandra.db.compaction.AbstractCompactedRow;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.io.FSWriteError;
import org.apache.cassandra.io.compress.CompressedSequentialWriter;
import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
import org.apache.cassandra.io.sstable.metadata.MetadataType;
import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
import org.apache.cassandra.io.util.*;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.FilterFactory;
import org.apache.cassandra.utils.IFilter;
import org.apache.cassandra.utils.Pair;
import org.apache.cassandra.utils.StreamingHistogram;
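/**
 * Writes out a new sstable: rows must be appended in partitioner (token) order, and as each row is
 * written the primary index, bloom filter, index summary and statistics components are built alongside
 * the data file.
 *
 * A minimal usage sketch; the path, key count, repairedAt value and the keys/columnFamilyFor names are
 * illustrative placeholders, not part of this class:
 * <pre>{@code
 * SSTableWriter writer = new SSTableWriter(tmpDataFilePath, estimatedKeys, repairedAt);
 * try
 * {
 *     for (DecoratedKey key : keys)                  // keys assumed sorted in partitioner order
 *         writer.append(key, columnFamilyFor(key));  // columnFamilyFor is a hypothetical helper
 *     SSTableReader reader = writer.closeAndOpenReader();
 * }
 * catch (Throwable t)
 * {
 *     writer.abort();                                // removes the temporary components written so far
 *     throw t;
 * }
 * }</pre>
 */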
public class SSTableWriter extends SSTable
{
private static final Logger logger = LoggerFactory.getLogger(SSTableWriter.class);
// not very random, but the only value that can't be mistaken for a legal column-name length
public static final int END_OF_ROW = 0x0000;
private IndexWriter iwriter;
private SegmentedFile.Builder dbuilder;
private final SequentialWriter dataFile;
private DecoratedKey lastWrittenKey;
private FileMark dataMark;
private final MetadataCollector sstableMetadataCollector;
private final long repairedAt;
public SSTableWriter(String filename, long keyCount, long repairedAt)
{
this(filename,
keyCount,
repairedAt,
Schema.instance.getCFMetaData(Descriptor.fromFilename(filename)),
StorageService.getPartitioner(),
new MetadataCollector(Schema.instance.getCFMetaData(Descriptor.fromFilename(filename)).comparator));
}
private static Set<Component> components(CFMetaData metadata)
{
Set<Component> components = new HashSet<Component>(Arrays.asList(Component.DATA,
Component.PRIMARY_INDEX,
Component.STATS,
Component.SUMMARY,
Component.TOC,
Component.DIGEST));
if (metadata.getBloomFilterFpChance() < 1.0)
components.add(Component.FILTER);
if (metadata.compressionParameters().sstableCompressor != null)
{
components.add(Component.COMPRESSION_INFO);
}
else
{
// it would feel safer to actually add this component later in maybeWriteDigest(),
// but the components are unmodifiable after construction
components.add(Component.CRC);
}
return components;
}
public SSTableWriter(String filename,
long keyCount,
long repairedAt,
CFMetaData metadata,
IPartitioner partitioner,
MetadataCollector sstableMetadataCollector)
{
super(Descriptor.fromFilename(filename),
components(metadata),
metadata,
partitioner);
this.repairedAt = repairedAt;
if (compression)
{
dataFile = SequentialWriter.open(getFilename(),
descriptor.filenameFor(Component.COMPRESSION_INFO),
metadata.compressionParameters(),
sstableMetadataCollector);
dbuilder = SegmentedFile.getCompressedBuilder((CompressedSequentialWriter) dataFile);
}
else
{
dataFile = SequentialWriter.open(new File(getFilename()), new File(descriptor.filenameFor(Component.CRC)));
dbuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode());
}
iwriter = new IndexWriter(keyCount, dataFile);
this.sstableMetadataCollector = sstableMetadataCollector;
}
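    /**
     * mark() and resetAndTruncate() bracket a speculative append: mark() records the current position in
     * both the data file and the index file, and resetAndTruncate() truncates both back to that position,
     * discarding whatever was written in between. A sketch of the pattern, with illustrative key/cf names
     * and error handling reduced to the essentials:
     * <pre>{@code
     * writer.mark();
     * try
     * {
     *     writer.append(key, cf);
     * }
     * catch (Throwable t)
     * {
     *     writer.resetAndTruncate(); // roll the sstable back to the state before the failed append
     * }
     * }</pre>
     */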
public void mark()
{
dataMark = dataFile.mark();
iwriter.mark();
}
public void resetAndTruncate()
{
dataFile.resetAndTruncate(dataMark);
iwriter.resetAndTruncate();
}
    /**
     * Performs sanity checks on the given key and determines where its row will start.
     *
     * @param decoratedKey the key about to be appended; must be non-null and must sort after the last written key
     * @return the position in the data file before any data for this row is written
     */
private long beforeAppend(DecoratedKey decoratedKey)
{
assert decoratedKey != null : "Keys must not be null"; // empty keys ARE allowed b/c of indexed column values
if (lastWrittenKey != null && lastWrittenKey.compareTo(decoratedKey) >= 0)
throw new RuntimeException("Last written key " + lastWrittenKey + " >= current key " + decoratedKey + " writing into " + getFilename());
return (lastWrittenKey == null) ? 0 : dataFile.getFilePointer();
}
private void afterAppend(DecoratedKey decoratedKey, long dataEnd, RowIndexEntry index)
{
sstableMetadataCollector.addKey(decoratedKey.getKey());
lastWrittenKey = decoratedKey;
last = lastWrittenKey;
if (first == null)
first = lastWrittenKey;
if (logger.isTraceEnabled())
logger.trace("wrote " + decoratedKey + " at " + dataEnd);
iwriter.append(decoratedKey, index, dataEnd);
dbuilder.addPotentialBoundary(dataEnd);
}
    /**
     * Appends an already-compacted row to this writer.
     *
     * @param row the compacted row to write
     * @return null if the row was compacted away entirely; otherwise, the PK index entry for this row
     */
public RowIndexEntry append(AbstractCompactedRow row)
{
long startPosition = beforeAppend(row.key);
RowIndexEntry entry;
try
{
entry = row.write(startPosition, dataFile.stream);
if (entry == null)
return null;
}
catch (IOException e)
{
throw new FSWriteError(e, dataFile.getPath());
}
long endPosition = dataFile.getFilePointer();
sstableMetadataCollector.update(endPosition - startPosition, row.columnStats());
afterAppend(row.key, endPosition, entry);
return entry;
}
public void append(DecoratedKey decoratedKey, ColumnFamily cf)
{
if (decoratedKey.getKey().remaining() > FBUtilities.MAX_UNSIGNED_SHORT)
{
logger.error("Key size {} exceeds maximum of {}, skipping row",
decoratedKey.getKey().remaining(),
FBUtilities.MAX_UNSIGNED_SHORT);
return;
}
long startPosition = beforeAppend(decoratedKey);
long endPosition;
try
{
RowIndexEntry entry = rawAppend(cf, startPosition, decoratedKey, dataFile.stream);
endPosition = dataFile.getFilePointer();
afterAppend(decoratedKey, endPosition, entry);
}
catch (IOException e)
{
throw new FSWriteError(e, dataFile.getPath());
}
sstableMetadataCollector.update(endPosition - startPosition, cf.getColumnStats());
}
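    /**
     * Serializes a single row at the current position of {@code out}: the ColumnIndex.Builder handles
     * serialization of the row contents (building the column index as it goes) and END_OF_ROW terminates
     * the row. The returned RowIndexEntry records the row's start position and top-level deletion so the
     * row can be added to the primary index.
     */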
public static RowIndexEntry rawAppend(ColumnFamily cf, long startPosition, DecoratedKey key, DataOutputPlus out) throws IOException
{
assert cf.hasColumns() || cf.isMarkedForDelete();
ColumnIndex.Builder builder = new ColumnIndex.Builder(cf, key.getKey(), out);
ColumnIndex index = builder.build(cf);
out.writeShort(END_OF_ROW);
return RowIndexEntry.create(startPosition, cf.deletionInfo().getTopLevelDeletion(), index);
}
    /**
     * Appends a pre-serialized row read from {@code in} (e.g. an incoming stream), re-serializing its atoms
     * into the data file while collecting the statistics (timestamps, tombstones, min/max column names)
     * needed for this sstable's metadata.
     *
     * @throws IOException if a read from the DataInput fails
     * @throws FSWriteError if a write to the dataFile fails
     */
public long appendFromStream(DecoratedKey key, CFMetaData metadata, DataInput in, Descriptor.Version version) throws IOException
{
long currentPosition = beforeAppend(key);
ColumnStats.MaxLongTracker maxTimestampTracker = new ColumnStats.MaxLongTracker(Long.MAX_VALUE);
ColumnStats.MinLongTracker minTimestampTracker = new ColumnStats.MinLongTracker(Long.MIN_VALUE);
ColumnStats.MaxIntTracker maxDeletionTimeTracker = new ColumnStats.MaxIntTracker(Integer.MAX_VALUE);
List<ByteBuffer> minColumnNames = Collections.emptyList();
List<ByteBuffer> maxColumnNames = Collections.emptyList();
StreamingHistogram tombstones = new StreamingHistogram(TOMBSTONE_HISTOGRAM_BIN_SIZE);
boolean hasLegacyCounterShards = false;
ColumnFamily cf = ArrayBackedSortedColumns.factory.create(metadata);
cf.delete(DeletionTime.serializer.deserialize(in));
ColumnIndex.Builder columnIndexer = new ColumnIndex.Builder(cf, key.getKey(), dataFile.stream);
if (cf.deletionInfo().getTopLevelDeletion().localDeletionTime < Integer.MAX_VALUE)
{
tombstones.update(cf.deletionInfo().getTopLevelDeletion().localDeletionTime);
maxDeletionTimeTracker.update(cf.deletionInfo().getTopLevelDeletion().localDeletionTime);
minTimestampTracker.update(cf.deletionInfo().getTopLevelDeletion().markedForDeleteAt);
maxTimestampTracker.update(cf.deletionInfo().getTopLevelDeletion().markedForDeleteAt);
}
Iterator<RangeTombstone> rangeTombstoneIterator = cf.deletionInfo().rangeIterator();
while (rangeTombstoneIterator.hasNext())
{
RangeTombstone rangeTombstone = rangeTombstoneIterator.next();
tombstones.update(rangeTombstone.getLocalDeletionTime());
minTimestampTracker.update(rangeTombstone.timestamp());
maxTimestampTracker.update(rangeTombstone.timestamp());
maxDeletionTimeTracker.update(rangeTombstone.getLocalDeletionTime());
minColumnNames = ColumnNameHelper.minComponents(minColumnNames, rangeTombstone.min, metadata.comparator);
maxColumnNames = ColumnNameHelper.maxComponents(maxColumnNames, rangeTombstone.max, metadata.comparator);
}
Iterator<OnDiskAtom> iter = metadata.getOnDiskIterator(in, ColumnSerializer.Flag.PRESERVE_SIZE, Integer.MIN_VALUE, version);
try
{
while (iter.hasNext())
{
OnDiskAtom atom = iter.next();
if (atom == null)
break;
if (atom instanceof CounterCell)
{
atom = ((CounterCell) atom).markLocalToBeCleared();
hasLegacyCounterShards = hasLegacyCounterShards || ((CounterCell) atom).hasLegacyShards();
}
int deletionTime = atom.getLocalDeletionTime();
if (deletionTime < Integer.MAX_VALUE)
tombstones.update(deletionTime);
minTimestampTracker.update(atom.timestamp());
maxTimestampTracker.update(atom.timestamp());
minColumnNames = ColumnNameHelper.minComponents(minColumnNames, atom.name(), metadata.comparator);
maxColumnNames = ColumnNameHelper.maxComponents(maxColumnNames, atom.name(), metadata.comparator);
maxDeletionTimeTracker.update(atom.getLocalDeletionTime());
                columnIndexer.add(atom); // this also writes the atom to disk
}
columnIndexer.maybeWriteEmptyRowHeader();
dataFile.stream.writeShort(END_OF_ROW);
}
catch (IOException e)
{
throw new FSWriteError(e, dataFile.getPath());
}
sstableMetadataCollector.updateMinTimestamp(minTimestampTracker.get())
.updateMaxTimestamp(maxTimestampTracker.get())
.updateMaxLocalDeletionTime(maxDeletionTimeTracker.get())
.addRowSize(dataFile.getFilePointer() - currentPosition)
.addColumnCount(columnIndexer.writtenAtomCount())
.mergeTombstoneHistogram(tombstones)
.updateMinColumnNames(minColumnNames)
.updateMaxColumnNames(maxColumnNames)
.updateHasLegacyCounterShards(hasLegacyCounterShards);
afterAppend(key, currentPosition, RowIndexEntry.create(currentPosition, cf.deletionInfo().getTopLevelDeletion(), columnIndexer.build()));
return currentPosition;
}
/**
* After failure, attempt to close the index writer and data file before deleting all temp components for the sstable
*/
public void abort()
{
assert descriptor.type.isTemporary;
if (iwriter == null && dataFile == null)
return;
if (iwriter != null)
iwriter.abort();
        if (dataFile != null)
dataFile.abort();
Set<Component> components = SSTable.componentsFor(descriptor);
try
{
if (!components.isEmpty())
SSTable.delete(descriptor, components);
}
catch (FSWriteError e)
{
logger.error(String.format("Failed deleting temp components for %s", descriptor), e);
throw e;
}
}
// we use this method to ensure any managed data we may have retained references to during the write are no
// longer referenced, so that we do not need to enclose the expensive call to closeAndOpenReader() in a transaction
public void isolateReferences()
{
// currently we only maintain references to first/last/lastWrittenKey from the data provided; all other
// data retention is done through copying
first = getMinimalKey(first);
last = lastWrittenKey = getMinimalKey(last);
}
private Descriptor makeTmpLinks()
{
// create temp links if they don't already exist
Descriptor link = descriptor.asType(Descriptor.Type.TEMPLINK);
if (!new File(link.filenameFor(Component.PRIMARY_INDEX)).exists())
{
FileUtils.createHardLink(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)), new File(link.filenameFor(Component.PRIMARY_INDEX)));
FileUtils.createHardLink(new File(descriptor.filenameFor(Component.DATA)), new File(link.filenameFor(Component.DATA)));
}
return link;
}
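    /**
     * Opens a reader over the portion of this still-incomplete sstable that has already been flushed to
     * disk, so it can serve reads before the write finishes. The reader is built over hard links (TEMPLINK)
     * of the primary index and data files and is opened with OpenReason.EARLY.
     *
     * @return the early-opened reader, or null if no key is fully readable yet
     */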
public SSTableReader openEarly(long maxDataAge)
{
StatsMetadata sstableMetadata = (StatsMetadata) sstableMetadataCollector.finalizeMetadata(partitioner.getClass().getCanonicalName(),
metadata.getBloomFilterFpChance(),
repairedAt).get(MetadataType.STATS);
// find the max (exclusive) readable key
IndexSummaryBuilder.ReadableBoundary boundary = iwriter.getMaxReadable();
if (boundary == null)
return null;
assert boundary.indexLength > 0 && boundary.dataLength > 0;
Descriptor link = makeTmpLinks();
        // open the reader early, giving it a FINAL descriptor type so that it is indistinguishable to other consumers
SegmentedFile ifile = iwriter.builder.complete(link.filenameFor(Component.PRIMARY_INDEX), boundary.indexLength);
SegmentedFile dfile = dbuilder.complete(link.filenameFor(Component.DATA), boundary.dataLength);
SSTableReader sstable = SSTableReader.internalOpen(descriptor.asType(Descriptor.Type.FINAL),
components, metadata,
partitioner, ifile,
dfile, iwriter.summary.build(partitioner, boundary),
iwriter.bf.sharedCopy(), maxDataAge, sstableMetadata, SSTableReader.OpenReason.EARLY);
// now it's open, find the ACTUAL last readable key (i.e. for which the data file has also been flushed)
sstable.first = getMinimalKey(first);
sstable.last = getMinimalKey(boundary.lastKey);
return sstable;
}
    public enum FinishType
{
CLOSE(null, true),
NORMAL(SSTableReader.OpenReason.NORMAL, true),
EARLY(SSTableReader.OpenReason.EARLY, false), // no renaming
FINISH_EARLY(SSTableReader.OpenReason.NORMAL, true); // tidy up an EARLY finish
final SSTableReader.OpenReason openReason;
public final boolean isFinal;
FinishType(SSTableReader.OpenReason openReason, boolean isFinal)
{
this.openReason = openReason;
this.isFinal = isFinal;
}
}
public SSTableReader closeAndOpenReader()
{
return closeAndOpenReader(System.currentTimeMillis());
}
public SSTableReader closeAndOpenReader(long maxDataAge)
{
return finish(FinishType.NORMAL, maxDataAge, this.repairedAt);
}
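    /**
     * Finishes the write (closing files and writing final metadata as required by the finish type) and
     * opens the result as an SSTableReader.
     *
     * @param finishType controls whether the components are renamed to their final names and with which
     *                   OpenReason the reader is returned
     * @param maxDataAge the max data age to record on the returned reader
     * @param repairedAt the repairedAt value to write into the sstable metadata; a negative value means
     *                   "use the value this writer was constructed with"
     */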
public SSTableReader finish(FinishType finishType, long maxDataAge, long repairedAt)
{
assert finishType != FinishType.CLOSE;
Pair<Descriptor, StatsMetadata> p;
p = close(finishType, repairedAt < 0 ? this.repairedAt : repairedAt);
Descriptor desc = p.left;
StatsMetadata metadata = p.right;
if (finishType == FinishType.EARLY)
desc = makeTmpLinks();
// finalize in-memory state for the reader
SegmentedFile ifile = iwriter.builder.complete(desc.filenameFor(Component.PRIMARY_INDEX), finishType.isFinal);
SegmentedFile dfile = dbuilder.complete(desc.filenameFor(Component.DATA), finishType.isFinal);
SSTableReader sstable = SSTableReader.internalOpen(desc.asType(Descriptor.Type.FINAL),
components,
this.metadata,
partitioner,
ifile,
dfile,
iwriter.summary.build(partitioner),
iwriter.bf.sharedCopy(),
maxDataAge,
metadata,
finishType.openReason);
sstable.first = getMinimalKey(first);
sstable.last = getMinimalKey(last);
if (finishType.isFinal)
{
iwriter.bf.close();
iwriter.summary.close();
// try to save the summaries to disk
sstable.saveSummary(iwriter.builder, dbuilder);
iwriter = null;
dbuilder = null;
}
return sstable;
}
    // Close the writer and return the descriptor of the new sstable and its metadata
public Pair<Descriptor, StatsMetadata> close()
{
return close(FinishType.CLOSE, this.repairedAt);
}
private Pair<Descriptor, StatsMetadata> close(FinishType type, long repairedAt)
{
switch (type)
{
case EARLY: case CLOSE: case NORMAL:
iwriter.close();
dataFile.close();
if (type == FinishType.CLOSE)
iwriter.bf.close();
}
// write sstable statistics
        Map<MetadataType, MetadataComponent> metadataComponents =
            sstableMetadataCollector.finalizeMetadata(partitioner.getClass().getCanonicalName(),
                                                      metadata.getBloomFilterFpChance(),
                                                      repairedAt);
// remove the 'tmp' marker from all components
Descriptor descriptor = this.descriptor;
if (type.isFinal)
{
dataFile.writeFullChecksum(descriptor);
writeMetadata(descriptor, metadataComponents);
// save the table of components
SSTable.appendTOC(descriptor, components);
descriptor = rename(descriptor, components);
}
return Pair.create(descriptor, (StatsMetadata) metadataComponents.get(MetadataType.STATS));
}
private static void writeMetadata(Descriptor desc, Map<MetadataType, MetadataComponent> components)
{
SequentialWriter out = SequentialWriter.open(new File(desc.filenameFor(Component.STATS)));
try
{
desc.getMetadataSerializer().serialize(components, out.stream);
}
catch (IOException e)
{
throw new FSWriteError(e, out.getPath());
}
finally
{
out.close();
}
}
static Descriptor rename(Descriptor tmpdesc, Set<Component> components)
{
Descriptor newdesc = tmpdesc.asType(Descriptor.Type.FINAL);
rename(tmpdesc, newdesc, components);
return newdesc;
}
public static void rename(Descriptor tmpdesc, Descriptor newdesc, Set<Component> components)
{
for (Component component : Sets.difference(components, Sets.newHashSet(Component.DATA, Component.SUMMARY)))
{
FileUtils.renameWithConfirm(tmpdesc.filenameFor(component), newdesc.filenameFor(component));
}
// do -Data last because -Data present should mean the sstable was completely renamed before crash
FileUtils.renameWithConfirm(tmpdesc.filenameFor(Component.DATA), newdesc.filenameFor(Component.DATA));
// rename it without confirmation because summary can be available for loadNewSSTables but not for closeAndOpenReader
FileUtils.renameWithOutConfirm(tmpdesc.filenameFor(Component.SUMMARY), newdesc.filenameFor(Component.SUMMARY));
}
public long getFilePointer()
{
return dataFile.getFilePointer();
}
public long getOnDiskFilePointer()
{
return dataFile.getOnDiskFilePointer();
}
/**
* Encapsulates writing the index and filter for an SSTable. The state of this object is not valid until it has been closed.
*/
class IndexWriter
{
private final SequentialWriter indexFile;
public final SegmentedFile.Builder builder;
public final IndexSummaryBuilder summary;
public final IFilter bf;
private FileMark mark;
IndexWriter(long keyCount, final SequentialWriter dataFile)
{
indexFile = SequentialWriter.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)));
builder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode());
summary = new IndexSummaryBuilder(keyCount, metadata.getMinIndexInterval(), Downsampling.BASE_SAMPLING_LEVEL);
bf = FilterFactory.getFilter(keyCount, metadata.getBloomFilterFpChance(), true);
            // register listeners to be alerted when the data files are flushed, so the index summary can
            // track how much of the index and data files is durably readable (used by the early-open path)
indexFile.setPostFlushListener(new Runnable()
{
public void run()
{
summary.markIndexSynced(indexFile.getLastFlushOffset());
}
});
dataFile.setPostFlushListener(new Runnable()
{
public void run()
{
summary.markDataSynced(dataFile.getLastFlushOffset());
}
});
}
// finds the last (-offset) decorated key that can be guaranteed to occur fully in the flushed portion of the index file
IndexSummaryBuilder.ReadableBoundary getMaxReadable()
{
return summary.getLastReadableBoundary();
}
public void append(DecoratedKey key, RowIndexEntry indexEntry, long dataEnd)
{
bf.add(key.getKey());
long indexStart = indexFile.getFilePointer();
try
{
ByteBufferUtil.writeWithShortLength(key.getKey(), indexFile.stream);
metadata.comparator.rowIndexEntrySerializer().serialize(indexEntry, indexFile.stream);
}
catch (IOException e)
{
throw new FSWriteError(e, indexFile.getPath());
}
long indexEnd = indexFile.getFilePointer();
if (logger.isTraceEnabled())
logger.trace("wrote index entry: " + indexEntry + " at " + indexStart);
summary.maybeAddEntry(key, indexStart, indexEnd, dataEnd);
builder.addPotentialBoundary(indexStart);
}
public void abort()
{
summary.close();
indexFile.abort();
bf.close();
}
        /**
         * Closes the index and bloom filter, making the public state of this writer valid for consumption.
         */
public void close()
{
if (components.contains(Component.FILTER))
{
String path = descriptor.filenameFor(Component.FILTER);
try
{
// bloom filter
FileOutputStream fos = new FileOutputStream(path);
DataOutputStreamPlus stream = new DataOutputStreamPlus(new BufferedOutputStream(fos));
FilterFactory.serialize(bf, stream);
stream.flush();
fos.getFD().sync();
stream.close();
}
catch (IOException e)
{
throw new FSWriteError(e, path);
}
}
// index
long position = indexFile.getFilePointer();
indexFile.close(); // calls force
FileUtils.truncate(indexFile.getPath(), position);
}
public void mark()
{
mark = indexFile.mark();
}
public void resetAndTruncate()
{
            // we can't un-set the bloom filter addition, but extra keys in there are harmless.
            // we can't reset dbuilder either, but that is the last thing called in afterAppend so
            // we assume that if that worked then we won't be trying to reset.
indexFile.resetAndTruncate(mark);
}
@Override
public String toString()
{
return "IndexWriter(" + descriptor + ")";
}
}
}