/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.db; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.security.MessageDigest; import net.nicoulaj.compilecommand.annotations.Inline; import org.apache.cassandra.config.CFMetaData; import org.apache.cassandra.config.ColumnDefinition; import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.db.composites.*; import org.apache.cassandra.db.filter.ColumnSlice; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.CompositeType; import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.FastByteOperations; import org.apache.cassandra.utils.concurrent.OpOrder; import org.apache.cassandra.utils.memory.*; /** * Packs a CellName AND a Cell into one off-heap representation. * Layout is: * * Note we store the ColumnIdentifier in full as bytes. This seems an okay tradeoff for now, as we just * look it back up again when we need to, and in the near future we hope to switch to ints, longs or * UUIDs representing column identifiers on disk, at which point we can switch that here as well. * * [timestamp][value offset][name size]][name extra][name offset deltas][cell names][value][Descendants] * [ 8b ][ 4b ][ 2b ][ 1b ][ each 2b ][ arb < 64k][ arb ][ arbitrary ] * * descendants: any overriding classes will put their state here * name offsets are deltas from their base offset, and don't include the first offset, or the end position of the final entry, * i.e. there will be size - 1 entries, and each is a delta that is added to the offset of the position of the first name * (which is always CELL_NAME_OFFSETS_OFFSET + (2 * (size - 1))). The length of the final name fills up any remaining * space upto the value offset * name extra: lowest 2 bits indicate the clustering size delta (i.e. how many name items are NOT part of the clustering key) * the next 2 bits indicate the CellNameType * the next bit indicates if the column is a static or clustered/dynamic column */ public abstract class AbstractNativeCell extends AbstractCell implements CellName { static final int TIMESTAMP_OFFSET = 4; private static final int VALUE_OFFSET_OFFSET = 12; private static final int CELL_NAME_SIZE_OFFSET = 16; private static final int CELL_NAME_EXTRA_OFFSET = 18; private static final int CELL_NAME_OFFSETS_OFFSET = 19; private static final int CELL_NAME_SIZE_DELTA_MASK = 3; private static final int CELL_NAME_TYPE_SHIFT = 2; private static final int CELL_NAME_TYPE_MASK = 7; private static enum NameType { COMPOUND_DENSE(0 << 2), COMPOUND_SPARSE(1 << 2), COMPOUND_SPARSE_STATIC(2 << 2), SIMPLE_DENSE(3 << 2), SIMPLE_SPARSE(4 << 2); static final NameType[] TYPES = NameType.values(); final int bits; NameType(int bits) { this.bits = bits; } static NameType typeOf(CellName name) { if (name instanceof CompoundDenseCellName) { assert !name.isStatic(); return COMPOUND_DENSE; } if (name instanceof CompoundSparseCellName) return name.isStatic() ? COMPOUND_SPARSE_STATIC : COMPOUND_SPARSE; if (name instanceof SimpleDenseCellName) { assert !name.isStatic(); return SIMPLE_DENSE; } if (name instanceof SimpleSparseCellName) { assert !name.isStatic(); return SIMPLE_SPARSE; } if (name instanceof NativeCell) return ((NativeCell) name).nametype(); throw new AssertionError(); } } private final long peer; // peer is assigned by peer updater in setPeer method AbstractNativeCell() { peer = -1; } public AbstractNativeCell(NativeAllocator allocator, OpOrder.Group writeOp, Cell copyOf) { int size = sizeOf(copyOf); peer = allocator.allocate(size, writeOp); MemoryUtil.setInt(peer, size); construct(copyOf); } protected int sizeOf(Cell cell) { int size = CELL_NAME_OFFSETS_OFFSET + Math.max(0, cell.name().size() - 1) * 2 + cell.value().remaining(); CellName name = cell.name(); for (int i = 0; i < name.size(); i++) size += name.get(i).remaining(); return size; } protected void construct(Cell from) { setLong(TIMESTAMP_OFFSET, from.timestamp()); CellName name = from.name(); int nameSize = name.size(); int offset = CELL_NAME_SIZE_OFFSET; setShort(offset, (short) nameSize); assert nameSize - name.clusteringSize() <= 2; byte cellNameExtraBits = (byte) ((nameSize - name.clusteringSize()) | NameType.typeOf(name).bits); setByte(offset += 2, cellNameExtraBits); offset += 1; short cellNameDelta = 0; for (int i = 1; i < nameSize; i++) { cellNameDelta += name.get(i - 1).remaining(); setShort(offset, cellNameDelta); offset += 2; } for (int i = 0; i < nameSize; i++) { ByteBuffer bb = name.get(i); setBytes(offset, bb); offset += bb.remaining(); } setInt(VALUE_OFFSET_OFFSET, offset); setBytes(offset, from.value()); } // the offset at which to read the short that gives the names private int nameDeltaOffset(int i) { return CELL_NAME_OFFSETS_OFFSET + ((i - 1) * 2); } int valueStartOffset() { return getInt(VALUE_OFFSET_OFFSET); } private int valueEndOffset() { return (int) (internalSize() - postfixSize()); } protected int postfixSize() { return 0; } @Override public ByteBuffer value() { long offset = valueStartOffset(); return getByteBuffer(offset, (int) (internalSize() - (postfixSize() + offset))).order(ByteOrder.BIG_ENDIAN); } private int clusteringSizeDelta() { return getByte(CELL_NAME_EXTRA_OFFSET) & CELL_NAME_SIZE_DELTA_MASK; } public boolean isStatic() { return nametype() == NameType.COMPOUND_SPARSE_STATIC; } NameType nametype() { return NameType.TYPES[(((int) this.getByte(CELL_NAME_EXTRA_OFFSET)) >> CELL_NAME_TYPE_SHIFT) & CELL_NAME_TYPE_MASK]; } public long minTimestamp() { return timestamp(); } public long maxTimestamp() { return timestamp(); } public int clusteringSize() { return size() - clusteringSizeDelta(); } @Override public ColumnIdentifier cql3ColumnName(CFMetaData metadata) { switch (nametype()) { case SIMPLE_SPARSE: return getIdentifier(metadata, get(clusteringSize())); case COMPOUND_SPARSE_STATIC: case COMPOUND_SPARSE: ByteBuffer buffer = get(clusteringSize()); if (buffer.remaining() == 0) return CompoundSparseCellNameType.rowMarkerId; return getIdentifier(metadata, buffer); case SIMPLE_DENSE: case COMPOUND_DENSE: return null; default: throw new AssertionError(); } } public ByteBuffer collectionElement() { return isCollectionCell() ? get(size() - 1) : null; } // we always have a collection element if our clustering size is 2 less than our total size, // and we never have one otherwiss public boolean isCollectionCell() { return clusteringSizeDelta() == 2; } public boolean isSameCQL3RowAs(CellNameType type, CellName other) { switch (nametype()) { case SIMPLE_DENSE: case COMPOUND_DENSE: return type.compare(this, other) == 0; case COMPOUND_SPARSE_STATIC: case COMPOUND_SPARSE: int clusteringSize = clusteringSize(); if (clusteringSize != other.clusteringSize() || other.isStatic() != isStatic()) return false; for (int i = 0; i < clusteringSize; i++) if (type.subtype(i).compare(get(i), other.get(i)) != 0) return false; return true; case SIMPLE_SPARSE: return true; default: throw new AssertionError(); } } public int size() { return getShort(CELL_NAME_SIZE_OFFSET); } public boolean isEmpty() { return size() == 0; } public ByteBuffer get(int i) { return get(i, null); } private ByteBuffer get(int i, AbstractAllocator copy) { // remember to take dense/sparse into account, and only return EOC when not dense int size = size(); assert i >= 0 && i < size(); int cellNamesOffset = nameDeltaOffset(size); int startDelta = i == 0 ? 0 : getShort(nameDeltaOffset(i)); int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset; int length = endDelta - startDelta; if (copy == null) return getByteBuffer(cellNamesOffset + startDelta, length).order(ByteOrder.BIG_ENDIAN); ByteBuffer result = copy.allocate(length); FastByteOperations.UnsafeOperations.copy(null, peer + cellNamesOffset + startDelta, result, 0, length); return result; } private static final ThreadLocal<byte[]> BUFFER = new ThreadLocal<byte[]>() { protected byte[] initialValue() { return new byte[256]; } }; protected void writeComponentTo(MessageDigest digest, int i, boolean includeSize) { // remember to take dense/sparse into account, and only return EOC when not dense int size = size(); assert i >= 0 && i < size(); int cellNamesOffset = nameDeltaOffset(size); int startDelta = i == 0 ? 0 : getShort(nameDeltaOffset(i)); int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset; int componentStart = cellNamesOffset + startDelta; int count = endDelta - startDelta; if (includeSize) FBUtilities.updateWithShort(digest, count); writeMemoryTo(digest, componentStart, count); } protected void writeMemoryTo(MessageDigest digest, int from, int count) { // only batch if we have more than 16 bytes remaining to transfer, otherwise fall-back to single-byte updates int i = 0, batchEnd = count - 16; if (i < batchEnd) { byte[] buffer = BUFFER.get(); while (i < batchEnd) { int transfer = Math.min(count - i, 256); getBytes(from + i, buffer, 0, transfer); digest.update(buffer, 0, transfer); i += transfer; } } while (i < count) digest.update(getByte(from + i++)); } public EOC eoc() { return EOC.NONE; } public Composite withEOC(EOC eoc) { throw new UnsupportedOperationException(); } public Composite start() { throw new UnsupportedOperationException(); } public Composite end() { throw new UnsupportedOperationException(); } public ColumnSlice slice() { throw new UnsupportedOperationException(); } public boolean isPrefixOf(CType type, Composite c) { if (size() > c.size() || isStatic() != c.isStatic()) return false; for (int i = 0; i < size(); i++) { if (type.subtype(i).compare(get(i), c.get(i)) != 0) return false; } return true; } public ByteBuffer toByteBuffer() { // for simple sparse we just return our one name buffer switch (nametype()) { case SIMPLE_DENSE: case SIMPLE_SPARSE: return get(0); case COMPOUND_DENSE: case COMPOUND_SPARSE_STATIC: case COMPOUND_SPARSE: // This is the legacy format of composites. // See org.apache.cassandra.db.marshal.CompositeType for details. ByteBuffer result = ByteBuffer.allocate(cellDataSize()); if (isStatic()) ByteBufferUtil.writeShortLength(result, CompositeType.STATIC_MARKER); for (int i = 0; i < size(); i++) { ByteBuffer bb = get(i); ByteBufferUtil.writeShortLength(result, bb.remaining()); result.put(bb); result.put((byte) 0); } result.flip(); return result; default: throw new AssertionError(); } } protected void updateWithName(MessageDigest digest) { // for simple sparse we just return our one name buffer switch (nametype()) { case SIMPLE_DENSE: case SIMPLE_SPARSE: writeComponentTo(digest, 0, false); break; case COMPOUND_DENSE: case COMPOUND_SPARSE_STATIC: case COMPOUND_SPARSE: // This is the legacy format of composites. // See org.apache.cassandra.db.marshal.CompositeType for details. if (isStatic()) FBUtilities.updateWithShort(digest, CompositeType.STATIC_MARKER); for (int i = 0; i < size(); i++) { writeComponentTo(digest, i, true); digest.update((byte) 0); } break; default: throw new AssertionError(); } } protected void updateWithValue(MessageDigest digest) { int offset = valueStartOffset(); int length = valueEndOffset() - offset; writeMemoryTo(digest, offset, length); } @Override // this is the NAME dataSize, only! public int dataSize() { switch (nametype()) { case SIMPLE_DENSE: case SIMPLE_SPARSE: return valueStartOffset() - nameDeltaOffset(size()); case COMPOUND_DENSE: case COMPOUND_SPARSE_STATIC: case COMPOUND_SPARSE: int size = size(); return valueStartOffset() - nameDeltaOffset(size) + 3 * size + (isStatic() ? 2 : 0); default: throw new AssertionError(); } } public boolean equals(Object obj) { if (obj == this) return true; if (obj instanceof CellName) return equals((CellName) obj); if (obj instanceof Cell) return equals((Cell) obj); return false; } public boolean equals(CellName that) { int size = this.size(); if (size != that.size()) return false; for (int i = 0 ; i < size ; i++) if (!get(i).equals(that.get(i))) return false; return true; } private static final ByteBuffer[] EMPTY = new ByteBuffer[0]; @Override public CellName copy(CFMetaData cfm, AbstractAllocator allocator) { ByteBuffer[] r; switch (nametype()) { case SIMPLE_DENSE: return CellNames.simpleDense(get(0, allocator)); case COMPOUND_DENSE: r = new ByteBuffer[size()]; for (int i = 0; i < r.length; i++) r[i] = get(i, allocator); return CellNames.compositeDense(r); case COMPOUND_SPARSE_STATIC: case COMPOUND_SPARSE: int clusteringSize = clusteringSize(); r = clusteringSize == 0 ? EMPTY : new ByteBuffer[clusteringSize()]; for (int i = 0; i < clusteringSize; i++) r[i] = get(i, allocator); ByteBuffer nameBuffer = get(r.length); ColumnIdentifier name; if (nameBuffer.remaining() == 0) { name = CompoundSparseCellNameType.rowMarkerId; } else { name = getIdentifier(cfm, nameBuffer); } if (clusteringSizeDelta() == 2) { ByteBuffer element = allocator.clone(get(size() - 1)); return CellNames.compositeSparseWithCollection(r, element, name, isStatic()); } return CellNames.compositeSparse(r, name, isStatic()); case SIMPLE_SPARSE: return CellNames.simpleSparse(getIdentifier(cfm, get(0))); } throw new IllegalStateException(); } private static ColumnIdentifier getIdentifier(CFMetaData cfMetaData, ByteBuffer name) { ColumnDefinition def = cfMetaData.getColumnDefinition(name); if (def != null) { return def.name; } else { // it's safe to simply grab based on clusteringPrefixSize() as we are only called if not a dense type AbstractType<?> type = cfMetaData.comparator.subtype(cfMetaData.comparator.clusteringPrefixSize()); return new ColumnIdentifier(HeapAllocator.instance.clone(name), type); } } @Override public Cell withUpdatedName(CellName newName) { throw new UnsupportedOperationException(); } @Override public Cell withUpdatedTimestamp(long newTimestamp) { throw new UnsupportedOperationException(); } protected long internalSize() { return MemoryUtil.getInt(peer); } private void checkPosition(long offset, long size) { assert size >= 0; assert peer > 0 : "Memory was freed"; assert offset >= 0 && offset + size <= internalSize() : String.format("Illegal range: [%d..%d), size: %s", offset, offset + size, internalSize()); } protected final void setByte(long offset, byte b) { checkPosition(offset, 1); MemoryUtil.setByte(peer + offset, b); } protected final void setShort(long offset, short s) { checkPosition(offset, 1); MemoryUtil.setShort(peer + offset, s); } protected final void setInt(long offset, int l) { checkPosition(offset, 4); MemoryUtil.setInt(peer + offset, l); } protected final void setLong(long offset, long l) { checkPosition(offset, 8); MemoryUtil.setLong(peer + offset, l); } protected final void setBytes(long offset, ByteBuffer buffer) { int start = buffer.position(); int count = buffer.limit() - start; if (count == 0) return; checkPosition(offset, count); MemoryUtil.setBytes(peer + offset, buffer); } protected final byte getByte(long offset) { checkPosition(offset, 1); return MemoryUtil.getByte(peer + offset); } protected final void getBytes(long offset, byte[] trg, int trgOffset, int count) { checkPosition(offset, count); MemoryUtil.getBytes(peer + offset, trg, trgOffset, count); } protected final int getShort(long offset) { checkPosition(offset, 2); return MemoryUtil.getShort(peer + offset); } protected final int getInt(long offset) { checkPosition(offset, 4); return MemoryUtil.getInt(peer + offset); } protected final long getLong(long offset) { checkPosition(offset, 8); return MemoryUtil.getLong(peer + offset); } protected final ByteBuffer getByteBuffer(long offset, int length) { checkPosition(offset, length); return MemoryUtil.getByteBuffer(peer + offset, length); } // requires isByteOrderComparable to be true. Compares the name components only; ; may need to compare EOC etc still @Inline public final int compareTo(final Composite that) { if (isStatic() != that.isStatic()) { // Static sorts before non-static no matter what, except for empty which // always sort first if (isEmpty()) return that.isEmpty() ? 0 : -1; if (that.isEmpty()) return 1; return isStatic() ? -1 : 1; } int size = size(); int size2 = that.size(); int minSize = Math.min(size, size2); int startDelta = 0; int cellNamesOffset = nameDeltaOffset(size); for (int i = 0 ; i < minSize ; i++) { int endDelta = i < size - 1 ? getShort(nameDeltaOffset(i + 1)) : valueStartOffset() - cellNamesOffset; long offset = peer + cellNamesOffset + startDelta; int length = endDelta - startDelta; int cmp = FastByteOperations.UnsafeOperations.compareTo(null, offset, length, that.get(i)); if (cmp != 0) return cmp; startDelta = endDelta; } EOC eoc = that.eoc(); if (size == size2) return this.eoc().compareTo(eoc); return size < size2 ? this.eoc().prefixComparisonResult : -eoc.prefixComparisonResult; } public final int compareToSimple(final Composite that) { assert size() == 1 && that.size() == 1; int length = valueStartOffset() - nameDeltaOffset(1); long offset = peer + nameDeltaOffset(1); return FastByteOperations.UnsafeOperations.compareTo(null, offset, length, that.get(0)); } }