/**
 * Copyright The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile.bucket;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hbase.io.hfile.BlockCacheKey;
import org.apache.hadoop.hbase.io.hfile.bucket.BucketCache.BucketEntry;

/**
 * This class allocates a block with a specified size and frees the block when
 * it is evicted. It manages an array of buckets; each bucket is associated
 * with a size and caches elements up to this size. For a completely empty
 * bucket, this size can be re-specified dynamically.
 *
 * This class is not thread safe.
 */
@InterfaceAudience.Private
public final class BucketAllocator {
  static final Log LOG = LogFactory.getLog(BucketAllocator.class);

  private static final class Bucket {
    private long baseOffset;
    private int itemAllocationSize, sizeIndex;
    private int itemCount;
    private int[] freeList;
    private int freeCount, usedCount;

    public Bucket(long offset) {
      baseOffset = offset;
      sizeIndex = -1;
    }

    void reconfigure(int sizeIndex) {
      this.sizeIndex = sizeIndex;
      assert sizeIndex < BUCKET_SIZES.length;
      itemAllocationSize = BUCKET_SIZES[sizeIndex];
      itemCount = (int) (((long) BUCKET_CAPACITY) / (long) itemAllocationSize);
      freeCount = itemCount;
      usedCount = 0;
      freeList = new int[itemCount];
      for (int i = 0; i < freeCount; ++i)
        freeList[i] = i;
    }

    public boolean isUninstantiated() {
      return sizeIndex == -1;
    }

    public int sizeIndex() {
      return sizeIndex;
    }

    public int itemAllocationSize() {
      return itemAllocationSize;
    }

    public boolean hasFreeSpace() {
      return freeCount > 0;
    }

    public boolean isCompletelyFree() {
      return usedCount == 0;
    }

    public int freeCount() {
      return freeCount;
    }

    public int usedCount() {
      return usedCount;
    }

    public int freeBytes() {
      return freeCount * itemAllocationSize;
    }

    public int usedBytes() {
      return usedCount * itemAllocationSize;
    }

    public long baseOffset() {
      return baseOffset;
    }

    /**
     * Allocate a block in this bucket and return the offset representing its
     * position in physical space
     * @return the offset in the IOEngine
     */
    public long allocate() {
      assert freeCount > 0; // Else should not have been called
      assert sizeIndex != -1;
      ++usedCount;
      long offset = baseOffset + (freeList[--freeCount] * itemAllocationSize);
      assert offset >= 0;
      return offset;
    }

    public void addAllocation(long offset) throws BucketAllocatorException {
      offset -= baseOffset;
      if (offset < 0 || offset % itemAllocationSize != 0)
        throw new BucketAllocatorException(
            "Attempt to add allocation for bad offset: " + offset + " base="
                + baseOffset + ", bucket size=" + itemAllocationSize);
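      // Mark the slot at this offset as used: locate its index in the free
      // list and shift the remaining free entries left by one to remove it.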
      int idx = (int) (offset / itemAllocationSize);
      boolean matchFound = false;
      for (int i = 0; i < freeCount; ++i) {
        if (matchFound) freeList[i - 1] = freeList[i];
        else if (freeList[i] == idx) matchFound = true;
      }
      if (!matchFound)
        throw new BucketAllocatorException("Couldn't find match for index "
            + idx + " in free list");
      ++usedCount;
      --freeCount;
    }

    private void free(long offset) {
      offset -= baseOffset;
      assert offset >= 0;
      assert offset < itemCount * itemAllocationSize;
      assert offset % itemAllocationSize == 0;
      assert usedCount > 0;
      assert freeCount < itemCount; // Else duplicate free
      int item = (int) (offset / (long) itemAllocationSize);
      assert !freeListContains(item);
      --usedCount;
      freeList[freeCount++] = item;
    }

    private boolean freeListContains(int blockNo) {
      for (int i = 0; i < freeCount; ++i) {
        if (freeList[i] == blockNo) return true;
      }
      return false;
    }
  }

  final class BucketSizeInfo {
    // A free bucket means it has space to allocate a block;
    // a completely free bucket means it has no block at all.
    private List<Bucket> bucketList, freeBuckets, completelyFreeBuckets;
    private int sizeIndex;

    BucketSizeInfo(int sizeIndex) {
      bucketList = new ArrayList<Bucket>();
      freeBuckets = new ArrayList<Bucket>();
      completelyFreeBuckets = new ArrayList<Bucket>();
      this.sizeIndex = sizeIndex;
    }

    public void instantiateBucket(Bucket b) {
      assert b.isUninstantiated() || b.isCompletelyFree();
      b.reconfigure(sizeIndex);
      bucketList.add(b);
      freeBuckets.add(b);
      completelyFreeBuckets.add(b);
    }

    public int sizeIndex() {
      return sizeIndex;
    }

    /**
     * Find a bucket to allocate a block
     * @return the offset in the IOEngine, or -1 if no bucket with free space
     *         is available
     */
    public long allocateBlock() {
      Bucket b = null;
      if (freeBuckets.size() > 0) // Use up an existing one first...
        b = freeBuckets.get(freeBuckets.size() - 1);
      if (b == null) {
        b = grabGlobalCompletelyFreeBucket();
        if (b != null) instantiateBucket(b);
      }
      if (b == null) return -1;
      long result = b.allocate();
      blockAllocated(b);
      return result;
    }

    void blockAllocated(Bucket b) {
      if (!b.isCompletelyFree()) completelyFreeBuckets.remove(b);
      if (!b.hasFreeSpace()) freeBuckets.remove(b);
    }

    public Bucket findAndRemoveCompletelyFreeBucket() {
      Bucket b = null;
      assert bucketList.size() > 0;
      if (bucketList.size() == 1) {
        // So we never get complete starvation of a bucket for a size
        return null;
      }
      if (completelyFreeBuckets.size() > 0) {
        b = completelyFreeBuckets.get(0);
        removeBucket(b);
      }
      return b;
    }

    private void removeBucket(Bucket b) {
      assert b.isCompletelyFree();
      bucketList.remove(b);
      freeBuckets.remove(b);
      completelyFreeBuckets.remove(b);
    }

    public void freeBlock(Bucket b, long offset) {
      assert bucketList.contains(b); // else we shouldn't have anything to free...
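      // A bucket on the completely-free list has no used blocks, so freeing
      // from it here would indicate a double free.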
      assert !completelyFreeBuckets.contains(b);
      b.free(offset);
      if (!freeBuckets.contains(b)) freeBuckets.add(b);
      if (b.isCompletelyFree()) completelyFreeBuckets.add(b);
    }

    public IndexStatistics statistics() {
      long free = 0, used = 0;
      for (Bucket b : bucketList) {
        free += b.freeCount();
        used += b.usedCount();
      }
      return new IndexStatistics(free, used, BUCKET_SIZES[sizeIndex]);
    }
  }

  // The default block size is 64K, so more bucket sizes are clustered near
  // 64K; adjust these to match your cluster's block size distribution.
  // TODO Make these sizes configurable
  // TODO Support a view of block size distribution statistics
  private static final int[] BUCKET_SIZES = { 4 * 1024 + 1024, 8 * 1024 + 1024,
      16 * 1024 + 1024, 32 * 1024 + 1024, 40 * 1024 + 1024, 48 * 1024 + 1024,
      56 * 1024 + 1024, 64 * 1024 + 1024, 96 * 1024 + 1024, 128 * 1024 + 1024,
      192 * 1024 + 1024, 256 * 1024 + 1024, 384 * 1024 + 1024,
      512 * 1024 + 1024 };

  /**
   * Round up the given block size to a bucket size, and get the corresponding
   * BucketSizeInfo
   * @param blockSize size of the block to allocate
   * @return the matching BucketSizeInfo, or null if the block is larger than
   *         the largest bucket size
   */
  public BucketSizeInfo roundUpToBucketSizeInfo(int blockSize) {
    for (int i = 0; i < BUCKET_SIZES.length; ++i)
      if (blockSize <= BUCKET_SIZES[i]) return bucketSizeInfos[i];
    return null;
  }

  static final int BIG_ITEM_SIZE = (512 * 1024) + 1024; // 512K plus 1K overhead
  public static final int FEWEST_ITEMS_IN_BUCKET = 4;
  // The capacity size for each bucket
  static final long BUCKET_CAPACITY = FEWEST_ITEMS_IN_BUCKET * BIG_ITEM_SIZE;

  private Bucket[] buckets;
  private BucketSizeInfo[] bucketSizeInfos;
  private final long totalSize;
  private long usedSize = 0;

  BucketAllocator(long availableSpace) throws BucketAllocatorException {
    buckets = new Bucket[(int) (availableSpace / (long) BUCKET_CAPACITY)];
    if (buckets.length < BUCKET_SIZES.length)
      throw new BucketAllocatorException(
          "Bucket allocator size too small - must have room for at least "
              + BUCKET_SIZES.length + " buckets");
    bucketSizeInfos = new BucketSizeInfo[BUCKET_SIZES.length];
    for (int i = 0; i < BUCKET_SIZES.length; ++i) {
      bucketSizeInfos[i] = new BucketSizeInfo(i);
    }
    for (int i = 0; i < buckets.length; ++i) {
      buckets[i] = new Bucket(BUCKET_CAPACITY * i);
      bucketSizeInfos[i < BUCKET_SIZES.length ? i : BUCKET_SIZES.length - 1]
          .instantiateBucket(buckets[i]);
    }
    this.totalSize = ((long) buckets.length) * BUCKET_CAPACITY;
  }

  /**
   * Rebuild the allocator's data structures from a persisted map.
   * @param availableSpace capacity of the cache
   * @param map a map of block key to BucketEntry (the block's metadata, such
   *          as offset and length)
   * @param realCacheSize cached data size statistics for the bucket cache
   * @throws BucketAllocatorException
   */
  BucketAllocator(long availableSpace, Map<BlockCacheKey, BucketEntry> map,
      AtomicLong realCacheSize) throws BucketAllocatorException {
    this(availableSpace);

    // Each bucket has an offset and a size index; the buckets are probably too
    // big in our default state, so we reconfigure them according to what we
    // find in the map. We can only reconfigure each bucket once; if it happens
    // more than once we know there's a bug, so we just log the info, throw,
    // and start again...
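    // Track which buckets have already been reconfigured while replaying the
    // persisted map, so conflicting sizes for the same bucket can be detected.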
    boolean[] reconfigured = new boolean[buckets.length];
    for (Map.Entry<BlockCacheKey, BucketEntry> entry : map.entrySet()) {
      long foundOffset = entry.getValue().offset();
      int foundLen = entry.getValue().getLength();
      int bucketSizeIndex = -1;
      for (int i = 0; i < BUCKET_SIZES.length; ++i) {
        if (foundLen <= BUCKET_SIZES[i]) {
          bucketSizeIndex = i;
          break;
        }
      }
      if (bucketSizeIndex == -1) {
        throw new BucketAllocatorException(
            "Can't match bucket size for the block with size " + foundLen);
      }
      int bucketNo = (int) (foundOffset / (long) BUCKET_CAPACITY);
      if (bucketNo < 0 || bucketNo >= buckets.length)
        throw new BucketAllocatorException("Can't find bucket " + bucketNo
            + ", total buckets=" + buckets.length
            + "; did you shrink the cache?");
      Bucket b = buckets[bucketNo];
      if (reconfigured[bucketNo]) {
        if (b.sizeIndex() != bucketSizeIndex)
          throw new BucketAllocatorException(
              "Inconsistent allocation in bucket map");
      } else {
        if (!b.isCompletelyFree())
          throw new BucketAllocatorException("Reconfiguring bucket "
              + bucketNo + " but it's already allocated; corrupt data");
        // Need to remove the bucket from whichever list it's currently in at
        // the moment...
        BucketSizeInfo bsi = bucketSizeInfos[bucketSizeIndex];
        BucketSizeInfo oldbsi = bucketSizeInfos[b.sizeIndex()];
        oldbsi.removeBucket(b);
        bsi.instantiateBucket(b);
        reconfigured[bucketNo] = true;
      }
      realCacheSize.addAndGet(foundLen);
      buckets[bucketNo].addAllocation(foundOffset);
      usedSize += buckets[bucketNo].itemAllocationSize();
      bucketSizeInfos[bucketSizeIndex].blockAllocated(b);
    }
  }

  public String getInfo() {
    StringBuilder sb = new StringBuilder(1024);
    for (int i = 0; i < buckets.length; ++i) {
      Bucket b = buckets[i];
      sb.append(" Bucket ").append(i).append(": ").append(b.itemAllocationSize());
      sb.append(" freeCount=").append(b.freeCount()).append(" used=")
          .append(b.usedCount());
      sb.append('\n');
    }
    return sb.toString();
  }

  public long getUsedSize() {
    return this.usedSize;
  }

  public long getFreeSize() {
    long freeSize = this.totalSize - getUsedSize();
    return freeSize;
  }

  public long getTotalSize() {
    return this.totalSize;
  }

  /**
   * Allocate a block of the specified size and return its offset
   * @param blockSize size of block
   * @throws BucketAllocatorException
   * @throws CacheFullException
   * @return the offset in the IOEngine
   */
  public synchronized long allocateBlock(int blockSize)
      throws CacheFullException, BucketAllocatorException {
    assert blockSize > 0;
    BucketSizeInfo bsi = roundUpToBucketSizeInfo(blockSize);
    if (bsi == null) {
      throw new BucketAllocatorException("Allocation too big size=" + blockSize);
    }
    long offset = bsi.allocateBlock();

    // Ask caller to free up space and try again!
    if (offset < 0)
      throw new CacheFullException(blockSize, bsi.sizeIndex());
    usedSize += BUCKET_SIZES[bsi.sizeIndex()];
    return offset;
  }

  private Bucket grabGlobalCompletelyFreeBucket() {
    for (BucketSizeInfo bsi : bucketSizeInfos) {
      Bucket b = bsi.findAndRemoveCompletelyFreeBucket();
      if (b != null) return b;
    }
    return null;
  }

  /**
   * Free the block at the given offset
   * @param offset block's offset
   * @return size freed
   */
  public synchronized int freeBlock(long offset) {
    int bucketNo = (int) (offset / (long) BUCKET_CAPACITY);
    assert bucketNo >= 0 && bucketNo < buckets.length;
    Bucket targetBucket = buckets[bucketNo];
    bucketSizeInfos[targetBucket.sizeIndex()].freeBlock(targetBucket, offset);
    usedSize -= targetBucket.itemAllocationSize();
    return targetBucket.itemAllocationSize();
  }

  public int sizeIndexOfAllocation(long offset) {
    int bucketNo = (int) (offset / (long) BUCKET_CAPACITY);
    assert bucketNo >= 0 && bucketNo < buckets.length;
    Bucket targetBucket = buckets[bucketNo];
    return targetBucket.sizeIndex();
  }

  public int sizeOfAllocation(long offset) {
    int bucketNo = (int) (offset / (long) BUCKET_CAPACITY);
    assert bucketNo >= 0 && bucketNo < buckets.length;
    Bucket targetBucket = buckets[bucketNo];
    return targetBucket.itemAllocationSize();
  }

  public static int getMaximumAllocationIndex() {
    return BUCKET_SIZES.length;
  }

  static class IndexStatistics {
    private long freeCount, usedCount, itemSize, totalCount;

    public long freeCount() {
      return freeCount;
    }

    public long usedCount() {
      return usedCount;
    }

    public long totalCount() {
      return totalCount;
    }

    public long freeBytes() {
      return freeCount * itemSize;
    }

    public long usedBytes() {
      return usedCount * itemSize;
    }

    public long totalBytes() {
      return totalCount * itemSize;
    }

    public long itemSize() {
      return itemSize;
    }

    public IndexStatistics(long free, long used, long itemSize) {
      setTo(free, used, itemSize);
    }

    public IndexStatistics() {
      setTo(-1, -1, 0);
    }

    public void setTo(long free, long used, long itemSize) {
      this.itemSize = itemSize;
      this.freeCount = free;
      this.usedCount = used;
      this.totalCount = free + used;
    }
  }

  public void dumpToLog() {
    logStatistics();
    StringBuilder sb = new StringBuilder();
    for (Bucket b : buckets) {
      sb.append("Bucket:").append(b.baseOffset).append('\n');
      sb.append(" Size index: " + b.sizeIndex() + "; Free:" + b.freeCount
          + "; used:" + b.usedCount + "; freelist\n");
      for (int i = 0; i < b.freeCount(); ++i)
        sb.append(b.freeList[i]).append(',');
      sb.append('\n');
    }
    LOG.info(sb);
  }

  public void logStatistics() {
    IndexStatistics total = new IndexStatistics();
    IndexStatistics[] stats = getIndexStatistics(total);
    LOG.info("Bucket allocator statistics follow:\n");
    LOG.info(" Free bytes=" + total.freeBytes() + "; used bytes="
        + total.usedBytes() + "; total bytes=" + total.totalBytes());
    for (IndexStatistics s : stats) {
      LOG.info(" Object size " + s.itemSize() + " used=" + s.usedCount()
          + "; free=" + s.freeCount() + "; total=" + s.totalCount());
    }
  }

  public IndexStatistics[] getIndexStatistics(IndexStatistics grandTotal) {
    IndexStatistics[] stats = getIndexStatistics();
    long totalfree = 0, totalused = 0;
    for (IndexStatistics stat : stats) {
      totalfree += stat.freeBytes();
      totalused += stat.usedBytes();
    }
    grandTotal.setTo(totalfree, totalused, 1);
    return stats;
  }

  public IndexStatistics[] getIndexStatistics() {
    IndexStatistics[] stats = new IndexStatistics[BUCKET_SIZES.length];
    for (int i = 0; i < stats.length; ++i)
      stats[i] = bucketSizeInfos[i].statistics();
    return stats;
  }

  public long freeBlock(long[] freeList) {
    long sz = 0;
    for (int i = 0; i < freeList.length; ++i)
      sz += freeBlock(freeList[i]);
    return sz;
  }
}