Optimize TLSF allocator: flat native block storage, alignment mask, SkipLocalsInit

xoofx · xoofx · commit 2d6bdae7cfb2 · 2026-02-22T22:49:37.000+01:00
- Replace grouped Block[] arrays with flat native memory (Block*) for
  single-indirection block access on the hot path
- Pre-compute alignment mask to avoid repeated subtraction in TryAllocate
- Add [module: SkipLocalsInit] to eliminate unnecessary zero-initialization
- Vectorize BinsDirectory.Initialize with Span.Fill instead of scalar loop
- Implement IDisposable to properly free native block buffer
- Make BlockLinks.Undefined a static readonly field
- Enable AggressiveInlining on AlignHelper.AlignUpOffset
- Remove artificial lock() from benchmark for accurate comparison
diff --git a/src/XenoAtom.Allocators.Bench/BenchAllocator.cs b/src/XenoAtom.Allocators.Bench/BenchAllocator.cs
@@ -18,7 +18,7 @@ public class BenchAllocator
     private Random _random = new Random();
 
     private static int[] AllocSizes = [64, 96, 150, 200, 400, 1024, 4096];
-    
+
     private const int AllocationCount = 2048;
 
     [GlobalSetup]
@@ -46,19 +46,13 @@ public void Tlsf()
 
         for (int i = 0; i < AllocationCount; i++)
         {
-            lock (_tlsfAllocator) // Make it more fair to the libc benchmark
-            {
-                var allocate = _tlsfAllocator.Allocate(GetNextRandomSize());
-                localList.Add(allocate);
-            }
+            var allocate = _tlsfAllocator.Allocate(GetNextRandomSize());
+            localList.Add(allocate);
         }
 
         for(int i = 0; i < localList.Count; i++)
         {
-            lock (_tlsfAllocator) // Make it more fair to the libc benchmark
-            {
-                _tlsfAllocator.Free(localList[i]);
-            }
+            _tlsfAllocator.Free(localList[i]);
         }
     }
 
@@ -84,7 +78,7 @@ private unsafe class BasicChunkAllocator : IMemoryChunkAllocator
     {
         private readonly Dictionary<int, MemoryChunk> _chunks = new Dictionary<int, MemoryChunk>();
         private const int ChunkSize = 65536;
-        
+
         public bool TryAllocateChunk(MemorySize minSize, out MemoryChunk chunk)
         {
             var blockSize = (uint)Math.Max(ChunkSize, (int)minSize.Value);
diff --git a/src/XenoAtom.Allocators/AlignHelper.cs b/src/XenoAtom.Allocators/AlignHelper.cs
@@ -24,7 +24,7 @@ public static uint AlignUp(uint value, uint alignment)
         return (value + alignment - 1) & ~(uint)(alignment - 1);
     }
 
-    //[MethodImpl(MethodImplOptions.AggressiveInlining)]
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static uint AlignUpOffset(ulong value, uint alignment)
     {
         Debug.Assert(BitOperations.IsPow2(alignment));
diff --git a/src/XenoAtom.Allocators/ModuleAttributes.cs b/src/XenoAtom.Allocators/ModuleAttributes.cs
@@ -0,0 +1,7 @@
+// Copyright (c) Alexandre Mutel. All rights reserved.
+// Licensed under the BSD-Clause 2 license.
+// See license.txt file in the project root for full license information.
+
+using System.Runtime.CompilerServices;
+
+[module: SkipLocalsInit]
diff --git a/src/XenoAtom.Allocators/TlsfAllocator.cs b/src/XenoAtom.Allocators/TlsfAllocator.cs
@@ -16,11 +16,11 @@ namespace XenoAtom.Allocators;
 
 /// <summary>
 /// This is a TLSF (Two-Level Segregated Fit) allocator following the paper http://www.gii.upv.es/tlsf/files/papers/ecrts04_tlsf.pdf
-/// 
+///
 /// But with the following modifications:
 /// - We are relying on a backend allocator for the chunks.
 /// - We are not storing the block headers in the allocated memory but in separate array as the memory allocated from chunks might not be accessible from CPU (e.g GPU).
-/// 
+///
 /// With its backend allocator, this allocator is dynamic and its size can grow as needed. This allocator doesn't allocate memory by itself,
 /// but use a backend allocator to allocate chunks of memory. It is agnostic of the backend allocator (that can allocate memory from RAM or GPU memory...etc.).
 /// </summary>
@@ -29,13 +29,15 @@ namespace XenoAtom.Allocators;
 /// Note that this class is not thread safe and should be guarded by a lock if used in a multi-threaded environment.
 /// The rationale is that this allocator can be used with Thread Local Storage (TLS) buffers that are not shared between threads and don't need locking.
 /// </remarks>
-public sealed unsafe class TlsfAllocator
+public sealed unsafe class TlsfAllocator : IDisposable
 {
     private readonly IMemoryChunkAllocator _context;
     private readonly uint _alignment;
+    private readonly uint _alignmentMask;
     private UnsafeList<Chunk> _chunks;
-    private UnsafeList<Block[]> _groupedBlocks;
+    private Block* _blocks;
     private int _blockCount;
+    private int _blockCapacity;
     private int _indexToFirstAvailableBlock;
     private BinsDirectory _bins;
 
@@ -47,10 +49,6 @@ public sealed unsafe class TlsfAllocator
     private const int TotalBinCount = BinCount * SubBinCount;
     private const int MinAlignment = 1 << (BaseBin0Log2 - SubBinsLog2); // Minimum alignment is 64 bytes
 
-    private const int GroupedBlockCountLog2 = 10; // TODO: Should we make this configurable? (It might not help with optimizations in that case for indexing a block)
-    private const int GroupedBlockCount = 1 << GroupedBlockCountLog2;
-    private const int GroupedBlockMask = GroupedBlockCount - 1;
-
     /// <summary>
     /// Creates a new instance of <see cref="TlsfAllocator"/>.
     /// </summary>
@@ -77,8 +75,8 @@ public TlsfAllocator(IMemoryChunkAllocator context, in TlsfAllocatorConfig confi
         }
         _context = context;
         _alignment = Math.Max(MinAlignment, alignment);
+        _alignmentMask = _alignment - 1;
         _chunks = new UnsafeList<Chunk>((int)config.PreAllocatedChunkCount);
-        _groupedBlocks = new UnsafeList<Block[]>();
         _indexToFirstAvailableBlock = -1;
         _bins = new BinsDirectory();
     }
@@ -111,7 +109,7 @@ public TlsfAllocation Allocate(uint size)
     public bool TryAllocate(MemorySize size, out TlsfAllocation allocation)
     {
         // We align the size to the alignment (so free blocks are always aligned)
-        size = AlignHelper.AlignUp(size, _alignment);
+        size = (size + _alignmentMask) & ~_alignmentMask;
 
         var firstLevelIndex = Mapping(size, out int secondLevelIndex);
 
@@ -128,7 +126,7 @@ public bool TryAllocate(MemorySize size, out TlsfAllocation allocation)
 
         var offsetIntoChunk = freeBlock.OffsetIntoChunk;
         var newFreeBlockSize = freeBlock.Size - size;
-        
+
         if (newFreeBlockSize > 0)
         {
             // we need to shrink the block and create a new block used
@@ -147,11 +145,11 @@ public bool TryAllocate(MemorySize size, out TlsfAllocation allocation)
             usedBlock.Size = size;
             usedBlock.IsUsed = true;
             usedBlock.FreeLink = BlockLinks.Undefined;
-            
+
             // Insert the new block in the physical order
             usedBlock.PhysicalLink.Next = freeBlockIndex;
             usedBlock.PhysicalLink.Previous = freeBlock.PhysicalLink.Previous;
-            
+
             if (freeBlock.PhysicalLink.Previous < 0)
             {
                 // Relink the beginning of the chunk
@@ -162,7 +160,7 @@ public bool TryAllocate(MemorySize size, out TlsfAllocation allocation)
                 ref var previousBlock = ref GetBlockAt(freeBlock.PhysicalLink.Previous);
                 previousBlock.PhysicalLink.Next = usedBlockIndex;
                 Debug.Assert(previousBlock.OffsetIntoChunk + previousBlock.Size == offsetIntoChunk);
-            } 
+            }
 
             Debug.Assert(usedBlock.OffsetIntoChunk + size == freeBlock.OffsetIntoChunk);
             freeBlock.PhysicalLink.Previous = usedBlockIndex;
@@ -299,6 +297,18 @@ public void Reset()
         _bins.Initialize();
     }
 
+    /// <inheritdoc />
+    public void Dispose()
+    {
+        Reset();
+        if (_blocks != null)
+        {
+            NativeMemory.Free(_blocks);
+            _blocks = null;
+            _blockCapacity = 0;
+        }
+    }
+
     /// <summary>
     /// Dumps the internal state of this allocator to a string.
     /// </summary>
@@ -431,7 +441,7 @@ public void Dump(StringBuilder buffer)
                 buffer.AppendLine($"{$"[{(length == 1 ? firstBlockAvailableIndex : $"{firstBlockAvailableIndex}-{_blockCount - 1}")}]",C1} {$"",C2} {"",C3} {"",C4} {"Avail",C5} {"",C6} {"",C7}");
             }
         }
-        
+
         static string ToBin<T>(T number) where T : unmanaged, IBinaryInteger<T>
         {
             var builder = new StringBuilder();
@@ -475,31 +485,31 @@ private void MarkBlockAsAvailable(ref Block block, int blockIndex)
     private ref Block GetBlockAt(int index)
     {
         Debug.Assert(index >= 0 && index < _blockCount);
-
-        var groupIndex = index >> GroupedBlockCountLog2;
-        var blocks = _groupedBlocks.UnsafeGetRefAt(groupIndex);
-        var localIndex = index & GroupedBlockMask;
-        return ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(blocks), localIndex);
+        return ref Unsafe.Add(ref Unsafe.AsRef<Block>(_blocks), index);
     }
 
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     private ref Block GetOrCreateBlockAt(int index)
     {
-        var groupIndex = index >> GroupedBlockCountLog2;
-
-        while (groupIndex >= _groupedBlocks.Count)
+        if (index >= _blockCapacity)
         {
-            _groupedBlocks.Add(new Block[GroupedBlockCount]);
+            GrowBlocks(index);
         }
 
         if (index >= _blockCount)
         {
             _blockCount = index + 1;
         }
 
-        var blocks = _groupedBlocks.UnsafeGetRefAt(groupIndex);
-        var localIndex = index & GroupedBlockMask;
-        return ref Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(blocks), localIndex);
+        return ref Unsafe.Add(ref Unsafe.AsRef<Block>(_blocks), index);
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    private void GrowBlocks(int minIndex) // Grows the native block buffer so that `minIndex` is addressable; existing contents are preserved by Realloc, new slots are NOT zeroed.
+    {
+        var newCapacity = (int)BitOperations.RoundUpToPowerOf2((uint)Math.Max(_blockCapacity * 2, minIndex + 1)); // at least double, rounded up to a power of two
+        _blocks = (Block*)NativeMemory.Realloc(_blocks, checked((nuint)newCapacity * (nuint)sizeof(Block))); // widen before multiplying: an int product could wrap and under-allocate
+        _blockCapacity = newCapacity;
     }
 
     private void RemoveBlockFromFreeList(ref Block block, int firstLevelIndex, int secondLevelIndex)
@@ -586,7 +596,7 @@ private ref Block TryFindSuitableBlock(uint size, ref int firstLevelIndex, ref i
             chunkEntry.FreeBlockCount++;
             chunkEntry.FirstBlockInPhysicalOrder = blockIndex;
             chunk = localChunk;
-            
+
             Debug.Assert(BitOperations.IsPow2(chunk.Size));
             ref var block = ref GetOrCreateBlockAt(blockIndex);
             block.ChunkIndex = (uint)chunkIndex;
@@ -641,7 +651,7 @@ private ref Block TryFindSuitableBlock(uint size, ref int firstLevelIndex, ref i
             return ref block;
         }
     }
-    
+
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static int Mapping(uint size, out int secondLevelIndex)
     {
@@ -669,17 +679,17 @@ private struct BinsDirectory
         public uint FirstLevelBitmap => _firstLevelBitmap;
 
         public ushort GetSecondLevelBitmap(int index) => _secondLevelBitmap[index];
-        
+
         public BinsDirectory()
         {
             Initialize();
         }
 
         public void Initialize()
         {
-            for (int i = 0; i < TotalBinCount; i++)
+            fixed (int* ptr = _firstFreeBlockIndices)
             {
-                _firstFreeBlockIndices[i] = -1;
+                new Span<int>(ptr, TotalBinCount).Fill(-1);
             }
         }
 
@@ -776,7 +786,7 @@ private string ToDebuggerDisplay()
         {
             return $"Offset: {OffsetIntoChunk}, Size: {Size}, End: {OffsetIntoChunk + Size}, Status: {(IsUsed ? "Used":IsAvailable?"Avail":"Free")}, FreeLink: {FreeLink.Previous}<->{FreeLink.Next},  PhysicalLink: {PhysicalLink.Previous} <-> {PhysicalLink.Next}";
         }
-        
+
         public bool IsUsed
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -796,7 +806,7 @@ public bool IsAvailable
 
     private struct BlockLinks
     {
-        public static BlockLinks Undefined => Unsafe.BitCast<long, BlockLinks>(-1);
+        public static readonly BlockLinks Undefined = Unsafe.BitCast<long, BlockLinks>(-1L);
 
         public int Previous;
         public int Next;

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ public class BenchAllocator`
`18`	`18`	`private Random _random = new Random();`
`19`	`19`
`20`	`20`	`private static int[] AllocSizes = [64, 96, 150, 200, 400, 1024, 4096];`
`21`		`-`
	`21`	`+`
`22`	`22`	`private const int AllocationCount = 2048;`
`23`	`23`
`24`	`24`	`[GlobalSetup]`
`@@ -46,19 +46,13 @@ public void Tlsf()`
`46`	`46`
`47`	`47`	`for (int i = 0; i < AllocationCount; i++)`
`48`	`48`	`{`
`49`		`- lock (_tlsfAllocator) // Make it more fair to the libc benchmark`
`50`		`- {`
`51`		`- var allocate = _tlsfAllocator.Allocate(GetNextRandomSize());`
`52`		`- localList.Add(allocate);`
`53`		`- }`
	`49`	`+ var allocate = _tlsfAllocator.Allocate(GetNextRandomSize());`
	`50`	`+ localList.Add(allocate);`
`54`	`51`	`}`
`55`	`52`
`56`	`53`	`for(int i = 0; i < localList.Count; i++)`
`57`	`54`	`{`
`58`		`- lock (_tlsfAllocator) // Make it more fair to the libc benchmark`
`59`		`- {`
`60`		`- _tlsfAllocator.Free(localList[i]);`
`61`		`- }`
	`55`	`+ _tlsfAllocator.Free(localList[i]);`
`62`	`56`	`}`
`63`	`57`	`}`
`64`	`58`
`@@ -84,7 +78,7 @@ private unsafe class BasicChunkAllocator : IMemoryChunkAllocator`
`84`	`78`	`{`
`85`	`79`	`private readonly Dictionary<int, MemoryChunk> _chunks = new Dictionary<int, MemoryChunk>();`
`86`	`80`	`private const int ChunkSize = 65536;`
`87`		`-`
	`81`	`+`
`88`	`82`	`public bool TryAllocateChunk(MemorySize minSize, out MemoryChunk chunk)`
`89`	`83`	`{`
`90`	`84`	`var blockSize = (uint)Math.Max(ChunkSize, (int)minSize.Value);`
Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ public static uint AlignUp(uint value, uint alignment)`
`24`	`24`	`return (value + alignment - 1) & ~(uint)(alignment - 1);`
`25`	`25`	`}`
`26`	`26`
`27`		`- //[MethodImpl(MethodImplOptions.AggressiveInlining)]`
	`27`	`+ [MethodImpl(MethodImplOptions.AggressiveInlining)]`
`28`	`28`	`public static uint AlignUpOffset(ulong value, uint alignment)`
`29`	`29`	`{`
`30`	`30`	`Debug.Assert(BitOperations.IsPow2(alignment));`