/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.cassandra.db.compaction;

import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;

import com.google.common.base.Throwables;

import org.apache.cassandra.db.*;
import org.apache.cassandra.io.sstable.*;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.io.util.RandomAccessReader;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.OutputHandler;
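
/**
 * Rewrites an sstable, skipping any rows whose on-disk representation turns out to be
 * corrupt. The primary index is read in step with the data file so that a row whose
 * header (key or size) is damaged can be retried from the index's view of it; rows
 * that appear out of key order are diverted to a second, correctly ordered sstable.
 * Rows that cannot be salvaged are left to the pre-scrub snapshot for manual recovery.
 *
 * A minimal usage sketch (assumes the caller already has the ColumnFamilyStore and
 * SSTableReader in hand; exception handling elided):
 *
 *   Scrubber scrubber = new Scrubber(cfs, sstable);
 *   try
 *   {
 *       scrubber.scrub();
 *   }
 *   finally
 *   {
 *       scrubber.close();
 *   }
 *   SSTableReader scrubbed = scrubber.getNewSSTable(); // null if nothing was recovered
 */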
public class Scrubber implements Closeable
{
public final ColumnFamilyStore cfs;
public final SSTableReader sstable;
public final File destination;
private final CompactionController controller;
private final boolean isCommutative;
private final int expectedBloomFilterSize;
private final RandomAccessReader dataFile;
private final RandomAccessReader indexFile;
private final ScrubInfo scrubInfo;
private SSTableWriter writer;
private SSTableReader newSstable;
private SSTableReader newInOrderSstable;
private int goodRows;
private int badRows;
private int emptyRows;
private final OutputHandler outputHandler;
private static final Comparator<AbstractCompactedRow> acrComparator = new Comparator<AbstractCompactedRow>()
{
public int compare(AbstractCompactedRow r1, AbstractCompactedRow r2)
{
return r1.key.compareTo(r2.key);
}
};
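
    // Rows that arrive out of key order during the scan are collected in this set,
    // sorted by key, and written to a separate in-order sstable after the main pass.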
    private final Set<AbstractCompactedRow> outOfOrderRows = new TreeSet<AbstractCompactedRow>(acrComparator);

public Scrubber(ColumnFamilyStore cfs, SSTableReader sstable) throws IOException
{
this(cfs, sstable, new OutputHandler.LogOutput(), false);
    }

public Scrubber(ColumnFamilyStore cfs, SSTableReader sstable, OutputHandler outputHandler, boolean isOffline) throws IOException
{
this.cfs = cfs;
this.sstable = sstable;
this.outputHandler = outputHandler;
        // Pick the directory the scrubbed sstable will be written to.
this.destination = cfs.directories.getDirectoryForNewSSTables();
if (destination == null)
throw new IOException("disk full");
List<SSTableReader> toScrub = Collections.singletonList(sstable);
        // If we run scrub offline, we should never purge tombstones, as we cannot know whether other sstables have data that the tombstones delete.
this.controller = isOffline
? new ScrubController(cfs)
: new CompactionController(cfs, Collections.singleton(sstable), CompactionManager.getDefaultGcBefore(cfs));
this.isCommutative = cfs.metadata.getDefaultValidator().isCommutative();
        // Size the new sstable's bloom filter from the estimated key count.
        this.expectedBloomFilterSize = Math.max(cfs.metadata.getIndexInterval(),
                                                (int) SSTableReader.getApproximateKeyCount(toScrub, cfs.metadata));
// loop through each row, deserializing to check for damage.
// we'll also loop through the index at the same time, using the position from the index to recover if the
// row header (key or data size) is corrupt. (This means our position in the index file will be one row
// "ahead" of the data file.)
this.dataFile = isOffline
? sstable.openDataReader()
: sstable.openDataReader(CompactionManager.instance.getRateLimiter());
this.indexFile = RandomAccessReader.open(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)));
this.scrubInfo = new ScrubInfo(dataFile, sstable);
}
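
    /**
     * Walks the data file row by row, appending every readable row to a new sstable.
     * The index file is read one row "ahead" of the data file; when a row header is
     * corrupt, the key and size recorded in the index are used for a single retry.
     * If the retry also fails, the row is skipped and the scan resumes at the next
     * row position known from the index. Skipping is refused for counter tables,
     * where dropping data is unsafe (see CASSANDRA-2759).
     */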
public void scrub()
{
outputHandler.output("Scrubbing " + sstable);
try
{
ByteBuffer nextIndexKey = ByteBufferUtil.readWithShortLength(indexFile);
{
// throw away variable so we don't have a side effect in the assert
long firstRowPositionFromIndex = RowIndexEntry.serializer.deserialize(indexFile, sstable.descriptor.version).position;
assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex;
}
// TODO errors when creating the writer may leave empty temp files.
writer = CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, sstable);
AbstractCompactedRow prevRow = null;
while (!dataFile.isEOF())
{
if (scrubInfo.isStopRequested())
throw new CompactionInterruptedException(scrubInfo.getCompactionInfo());
long rowStart = dataFile.getFilePointer();
outputHandler.debug("Reading row at " + rowStart);
DecoratedKey key = null;
long dataSize = -1;
try
{
key = sstable.partitioner.decorateKey(ByteBufferUtil.readWithShortLength(dataFile));
if (sstable.descriptor.version.hasRowSizeAndColumnCount)
{
dataSize = dataFile.readLong();
outputHandler.debug(String.format("row %s is %s bytes", ByteBufferUtil.bytesToHex(key.key), dataSize));
}
}
catch (Throwable th)
{
throwIfFatal(th);
// check for null key below
}
ByteBuffer currentIndexKey = nextIndexKey;
long nextRowPositionFromIndex;
try
{
nextIndexKey = indexFile.isEOF() ? null : ByteBufferUtil.readWithShortLength(indexFile);
nextRowPositionFromIndex = indexFile.isEOF()
? dataFile.length()
: RowIndexEntry.serializer.deserialize(indexFile, sstable.descriptor.version).position;
}
catch (Throwable th)
{
outputHandler.warn("Error reading index file", th);
nextIndexKey = null;
nextRowPositionFromIndex = dataFile.length();
}
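                // dataStart is where the row's data begins in the data file; derive the
                // same position from the index entry: row start + 2-byte key length + key
                // bytes, plus an 8-byte row size in formats that still serialize one.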
long dataStart = dataFile.getFilePointer();
long dataStartFromIndex = currentIndexKey == null
? -1
: rowStart + 2 + currentIndexKey.remaining();
if (sstable.descriptor.version.hasRowSizeAndColumnCount)
dataStartFromIndex += 8;
long dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex;
if (!sstable.descriptor.version.hasRowSizeAndColumnCount)
{
dataSize = dataSizeFromIndex;
outputHandler.debug(String.format("row %s is %s bytes", ByteBufferUtil.bytesToHex(key.key), dataSize));
}
else
{
if (currentIndexKey != null)
outputHandler.debug(String.format("Index doublecheck: row %s is %s bytes", ByteBufferUtil.bytesToHex(currentIndexKey), dataSizeFromIndex));
}
assert currentIndexKey != null || indexFile.isEOF();
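                // Remember the writer's position so a partially appended row can be
                // rolled back with resetAndTruncate() if deserialization fails midway.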
writer.mark();
try
{
if (key == null)
throw new IOError(new IOException("Unable to read row key from data file"));
if (dataSize > dataFile.length())
throw new IOError(new IOException("Impossible row size " + dataSize));
SSTableIdentityIterator row = new SSTableIdentityIterator(sstable, dataFile, key, dataSize, true);
AbstractCompactedRow compactedRow = controller.getCompactedRow(row);
if (prevRow != null && acrComparator.compare(prevRow, compactedRow) >= 0)
{
outOfOrderRows.add(compactedRow);
outputHandler.warn(String.format("Out of order row detected (%s found after %s)", compactedRow.key, prevRow.key));
continue;
}
if (writer.append(compactedRow) == null)
emptyRows++;
else
goodRows++;
prevRow = compactedRow;
if (!key.key.equals(currentIndexKey) || dataStart != dataStartFromIndex)
outputHandler.warn("Index file contained a different key or row size; using key from data file");
}
catch (Throwable th)
{
throwIfFatal(th);
outputHandler.warn("Non-fatal error reading row (stacktrace follows)", th);
writer.resetAndTruncate();
if (currentIndexKey != null
&& (key == null || !key.key.equals(currentIndexKey) || dataStart != dataStartFromIndex || dataSize != dataSizeFromIndex))
{
outputHandler.output(String.format("Retrying from row index; data is %s bytes starting at %s",
dataSizeFromIndex, dataStartFromIndex));
key = sstable.partitioner.decorateKey(currentIndexKey);
try
{
SSTableIdentityIterator row = new SSTableIdentityIterator(sstable, dataFile, key, dataSizeFromIndex, true);
AbstractCompactedRow compactedRow = controller.getCompactedRow(row);
if (prevRow != null && acrComparator.compare(prevRow, compactedRow) >= 0)
{
outOfOrderRows.add(compactedRow);
outputHandler.warn(String.format("Out of order row detected (%s found after %s)", compactedRow.key, prevRow.key));
continue;
}
if (writer.append(compactedRow) == null)
emptyRows++;
else
goodRows++;
prevRow = compactedRow;
}
catch (Throwable th2)
{
throwIfFatal(th2);
// Skipping rows is dangerous for counters (see CASSANDRA-2759)
if (isCommutative)
throw new IOError(th2);
outputHandler.warn("Retry failed too. Skipping to next row (retry's stacktrace follows)", th2);
writer.resetAndTruncate();
dataFile.seek(nextRowPositionFromIndex);
badRows++;
}
}
else
{
// Skipping rows is dangerous for counters (see CASSANDRA-2759)
if (isCommutative)
throw new IOError(th);
outputHandler.warn("Row at " + dataStart + " is unreadable; skipping to next");
if (currentIndexKey != null)
dataFile.seek(nextRowPositionFromIndex);
badRows++;
}
}
}

            // Don't leave an empty sstable behind: only open a reader if something was
            // written; otherwise abort the writer so its temporary files are cleaned up.
            if (writer.getFilePointer() > 0)
                newSstable = writer.closeAndOpenReader(sstable.maxDataAge);
            else
                writer.abort();
}
catch (Throwable t)
{
if (writer != null)
writer.abort();
throw Throwables.propagate(t);
}
finally
{
controller.close();
}

        if (!outOfOrderRows.isEmpty())
{
SSTableWriter inOrderWriter = CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, sstable);
for (AbstractCompactedRow row : outOfOrderRows)
inOrderWriter.append(row);
newInOrderSstable = inOrderWriter.closeAndOpenReader(sstable.maxDataAge);
outputHandler.warn(String.format("%d out of order rows found while scrubbing %s; Those have been written (in order) to a new sstable (%s)", outOfOrderRows.size(), sstable, newInOrderSstable));
}

        if (newSstable == null)
{
if (badRows > 0)
outputHandler.warn("No valid rows found while scrubbing " + sstable + "; it is marked for deletion now. If you want to attempt manual recovery, you can find a copy in the pre-scrub snapshot");
else
outputHandler.output("Scrub of " + sstable + " complete; looks like all " + emptyRows + " rows were tombstoned");
}
else
{
outputHandler.output("Scrub of " + sstable + " complete: " + goodRows + " rows in new sstable and " + emptyRows + " empty (tombstoned) rows dropped");
if (badRows > 0)
outputHandler.warn("Unable to recover " + badRows + " rows that were skipped. You can attempt manual recovery from the pre-scrub snapshot. You can also run nodetool repair to transfer the data from a healthy replica, if any");
}
}
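
    /**
     * @return the scrubbed sstable, or null if no valid rows were recovered
     */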
public SSTableReader getNewSSTable()
{
return newSstable;
}
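
    /**
     * @return the sstable holding the out-of-order rows, or null if none were found
     */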
public SSTableReader getNewInOrderSSTable()
{
return newInOrderSstable;
}
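
    // Rethrows genuinely fatal JVM errors; AssertionError and IOError are deliberately
    // excluded, since they indicate per-row corruption handled by the retry/skip logic.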
private void throwIfFatal(Throwable th)
{
if (th instanceof Error && !(th instanceof AssertionError || th instanceof IOError))
throw (Error) th;
    }

public void close()
{
FileUtils.closeQuietly(dataFile);
FileUtils.closeQuietly(indexFile);
    }

public CompactionInfo.Holder getScrubInfo()
{
return scrubInfo;
}
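
    // Exposes scrub progress (bytes read of the data file) to compaction monitoring.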
private static class ScrubInfo extends CompactionInfo.Holder
{
private final RandomAccessReader dataFile;
        private final SSTableReader sstable;

public ScrubInfo(RandomAccessReader dataFile, SSTableReader sstable)
{
this.dataFile = dataFile;
this.sstable = sstable;
        }

public CompactionInfo getCompactionInfo()
{
try
{
return new CompactionInfo(sstable.metadata,
OperationType.SCRUB,
dataFile.getFilePointer(),
dataFile.length());
}
catch (Exception e)
{
                throw new RuntimeException(e);
}
}
}
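
    // A controller that never purges tombstones: offline we cannot know whether other
    // sstables still contain data that a tombstone shadows, so everything is kept.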
private static class ScrubController extends CompactionController
{
public ScrubController(ColumnFamilyStore cfs)
{
super(cfs, Integer.MAX_VALUE);
        }

@Override
public boolean shouldPurge(DecoratedKey key, long delTimestamp)
{
return false;
}
}
}