Unverified Commit 62fe6999 authored by PanZezhong1725, committed by GitHub

Merge pull request #828 from InfiniTensor/issue/809

issue/809 Support a pinnable, caching memory allocator
parents 0ead67fc f4966bab
#include "pinnable_block_allocator.hpp"
#include "../../utils.hpp"
#include <algorithm>
#include <infinirt.h>
#include <stdexcept>
namespace infinicore {
// ------------------- Helper functions -------------------
// Round up size to nearest multiple of alignment
inline size_t align_up(size_t size, size_t alignment) {
    return (size + alignment - 1) / alignment * alignment;
}
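// Worked example (explanatory note, not part of the original source):
//   align_up(300, 256)        == 512
//   align_up(256 * 1024, 256) == 256 * 1024   (already aligned)
// allocate() below rounds every request up to a 256-byte boundary this way
// before choosing a size class.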
// ------------------- Constructor -------------------
PinnableBlockAllocator::PinnableBlockAllocator(Device device)
    : device_(device) {
    size_classes_ = {
        {256 * 1024, {}},        // 256 KB
        {1 * 1024 * 1024, {}},   // 1 MB
        {4 * 1024 * 1024, {}},   // 4 MB
        {16 * 1024 * 1024, {}},  // 16 MB
        {64 * 1024 * 1024, {}},  // 64 MB
        {256 * 1024 * 1024, {}}, // 256 MB
    };
}
// ------------------- allocate -------------------
std::byte *PinnableBlockAllocator::allocate(size_t size) {
    std::lock_guard<std::mutex> lock(mutex_);
    // Align size to 256 bytes for GPU
    size = align_up(size, 256);
    std::shared_ptr<Block> block;
    // 1. Try size-class allocation for small/medium requests
    for (auto &cls : size_classes_) {
        if (size <= cls.block_size) {
            if (!cls.free_blocks.empty()) {
                // Reuse a cached block of this class
                block = cls.free_blocks.back();
                cls.free_blocks.pop_back();
                block->in_use = true;
                // Mark the block frozen when reused in pinned mode, matching the large-block path
                block->frozen = block->frozen || pinned_mode_;
                return reinterpret_cast<std::byte *>(block->ptr);
            }
            // Allocate a new block for this class
            block = std::make_shared<Block>();
            block->size = cls.block_size;
            block->frozen = pinned_mode_;
            block->in_use = true;
            INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
            all_blocks_[block->ptr] = block;
            return reinterpret_cast<std::byte *>(block->ptr);
        }
    }
    // 2. Large block allocation
    // Try to reuse a frozen or free large block
    auto it = std::find_if(large_blocks_.begin(), large_blocks_.end(),
                           [size](const std::shared_ptr<Block> &b) { return b->size >= size && !b->in_use; });
    if (it != large_blocks_.end()) {
        block = *it;
        block->in_use = true;
        block->frozen = block->frozen || pinned_mode_;
        return reinterpret_cast<std::byte *>(block->ptr);
    }
    // Allocate a new large block
    block = std::make_shared<Block>();
    block->size = size;
    block->frozen = pinned_mode_;
    block->in_use = true;
    INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
    large_blocks_.push_back(block);
    all_blocks_[block->ptr] = block;
    return reinterpret_cast<std::byte *>(block->ptr);
}
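// (Explanatory note, not part of the original source.) Allocation policy in
// summary: requests up to 256 MB are served from the smallest size class that
// fits, allocating a full class-sized block on a cache miss; anything larger
// becomes a dedicated large block, reused first-fit from large_blocks_ when a
// free block of sufficient size exists.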
// ------------------- deallocate -------------------
void PinnableBlockAllocator::deallocate(std::byte *ptr) {
    if (!ptr) {
        return;
    }
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = all_blocks_.find(reinterpret_cast<void *>(ptr));
    if (it == all_blocks_.end()) {
        throw std::runtime_error("Pointer not allocated by this allocator");
    }
    auto block = it->second;
    if (!block->in_use) {
        throw std::runtime_error("Double free detected in PinnableBlockAllocator");
    }
    block->in_use = false;
    // Return size-class blocks to their class's free list; large blocks simply
    // stay in large_blocks_ and are reused by allocate()
    for (auto &cls : size_classes_) {
        if (block->size == cls.block_size) {
            cls.free_blocks.push_back(block);
            break;
        }
    }
}
// ------------------- trim -------------------
void PinnableBlockAllocator::trim() {
    std::lock_guard<std::mutex> lock(mutex_);
    // Free non-frozen size-class blocks
    for (auto &cls : size_classes_) {
        for (auto it = cls.free_blocks.begin(); it != cls.free_blocks.end();) {
            if (!(*it)->frozen) {
                INFINICORE_CHECK_ERROR(infinirtFree((*it)->ptr));
                all_blocks_.erase((*it)->ptr);
                it = cls.free_blocks.erase(it);
            } else {
                ++it;
            }
        }
    }
    // Free non-frozen large blocks
    for (auto it = large_blocks_.begin(); it != large_blocks_.end();) {
        if (!(*it)->frozen && !(*it)->in_use) {
            INFINICORE_CHECK_ERROR(infinirtFree((*it)->ptr));
            all_blocks_.erase((*it)->ptr);
            it = large_blocks_.erase(it);
        } else {
            ++it;
        }
    }
}
// ------------------- Destructor -------------------
PinnableBlockAllocator::~PinnableBlockAllocator() {
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto &p : all_blocks_) {
        if (p.second->ptr) {
            infinirtFree(p.second->ptr);
        }
    }
    all_blocks_.clear();
    large_blocks_.clear();
    for (auto &cls : size_classes_) {
        cls.free_blocks.clear();
    }
}
} // namespace infinicore
#pragma once
#include "memory_allocator.hpp"
#include "../context_impl.hpp"
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>
namespace infinicore {
class PinnableBlockAllocator : public MemoryAllocator {
    // Represents a single memory block
    struct Block {
        void *ptr = nullptr; // Device pointer
        size_t size = 0;     // Block size in bytes
        bool frozen = false; // True if used in pinned/graph mode
        bool in_use = false; // Whether the block is currently in use
    };

    // A simple size-class allocator for small/medium blocks
    struct SizeClass {
        size_t block_size; // Fixed size for this class
        std::vector<std::shared_ptr<Block>> free_blocks;
    };
public:
    explicit PinnableBlockAllocator(Device device);
    ~PinnableBlockAllocator();

    std::byte *allocate(size_t size) override;
    void deallocate(std::byte *ptr) override;

    // Switch pinned/graph mode on or off
    void set_pin_mode(bool pinned) { pinned_mode_ = pinned; }

    // Release cached blocks that are not pinned back to the device
    void trim();
private:
    Device device_;
    bool pinned_mode_ = false;

    std::vector<SizeClass> size_classes_;
    std::vector<std::shared_ptr<Block>> large_blocks_;
    std::unordered_map<void *, std::shared_ptr<Block>> all_blocks_;

    std::mutex mutex_; // Thread safety
} // namespace infinicore
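// ------------------- Usage sketch (explanatory, not part of this commit) -------------------
// A minimal sketch of how the pinned/graph mode is meant to be driven. The
// Device argument and the "capture work" step are placeholders; only the
// PinnableBlockAllocator calls come from the code in this merge request.
#include "pinnable_block_allocator.hpp"

void pinned_mode_example(infinicore::Device device) {
    infinicore::PinnableBlockAllocator alloc(device);

    // Normal mode: the block goes back to its size-class cache on deallocate
    // and can later be released by trim().
    std::byte *tmp = alloc.allocate(1 << 20);
    alloc.deallocate(tmp);

    // Pinned/graph mode: blocks allocated now are marked frozen, so trim()
    // keeps them resident, e.g. while a captured graph still references them.
    alloc.set_pin_mode(true);
    std::byte *graph_buf = alloc.allocate(8u << 20);
    // ... capture/replay work using graph_buf ...
    alloc.deallocate(graph_buf); // returned to the cache, stays frozen
    alloc.set_pin_mode(false);

    // Frees cached, non-frozen blocks back to the device; frozen blocks stay.
    alloc.trim();
}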
#include "device_caching_allocator.hpp"
#include "stream_ordered_allocator.hpp"
#include <infinirt.h>
#include "../../utils.hpp"
namespace infinicore {
DeviceCachingAllocator::DeviceCachingAllocator(Device device) : MemoryAllocator(), device_(device) {}
StreamOrderedAllocator::StreamOrderedAllocator(Device device) : MemoryAllocator(), device_(device) {}
std::byte *DeviceCachingAllocator::allocate(size_t size) {
std::byte *StreamOrderedAllocator::allocate(size_t size) {
void *ptr = nullptr;
INFINICORE_CHECK_ERROR(infinirtMallocAsync(&ptr, size, context::getStream()));
return (std::byte *)ptr;
}
void DeviceCachingAllocator::deallocate(std::byte *ptr) {
void StreamOrderedAllocator::deallocate(std::byte *ptr) {
INFINICORE_CHECK_ERROR(infinirtFreeAsync(ptr, context::getStream()));
}
} // namespace infinicore
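// (Explanatory note, not part of this commit.) The renamed StreamOrderedAllocator
// above simply forwards to infinirtMallocAsync/infinirtFreeAsync on the current
// stream, so allocations and frees are ordered with the other work queued on that
// stream. A sketch of the calling pattern, with the kernel launch as a placeholder:
//
//     infinicore::StreamOrderedAllocator alloc(device);
//     std::byte *buf = alloc.allocate(n);
//     // ... enqueue kernels on context::getStream() that use buf ...
//     alloc.deallocate(buf); // freed in stream order, no extra sync needed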
@@ -5,10 +5,10 @@
 #include "../context_impl.hpp"
 namespace infinicore {
-class DeviceCachingAllocator : public MemoryAllocator {
+class StreamOrderedAllocator : public MemoryAllocator {
 public:
-    explicit DeviceCachingAllocator(Device device);
-    ~DeviceCachingAllocator() = default;
+    explicit StreamOrderedAllocator(Device device);
+    ~StreamOrderedAllocator() = default;
     std::byte *allocate(size_t size) override;
     void deallocate(std::byte *ptr) override;
......
@@ -2,9 +2,10 @@
 #include "../../utils.hpp"
-#include "../allocators/device_caching_allocator.hpp"
 #include "../allocators/device_pinned_allocator.hpp"
 #include "../allocators/host_allocator.hpp"
+#include "../allocators/pinnable_block_allocator.hpp"
+#include "../allocators/stream_ordered_allocator.hpp"
 namespace infinicore {
 Runtime::Runtime(Device device) : device_(device) {
@@ -14,7 +15,7 @@ Runtime::Runtime(Device device) : device_(device) {
     if (device_.getType() == Device::Type::CPU) {
         device_memory_allocator_ = std::make_unique<HostAllocator>();
     } else {
-        device_memory_allocator_ = std::make_unique<DeviceCachingAllocator>(device);
+        device_memory_allocator_ = std::make_unique<PinnableBlockAllocator>(device);
         pinned_host_memory_allocator_ = std::make_unique<DevicePinnedHostAllocator>(device);
     }
 }
......