Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def linear_w8a8i8(
    input: Tensor,
    weight_packed: Tensor,
    weight_scale: Tensor,
    bias=None,
    out=None,
) -> Tensor:
    r"""Linear layer with weight quantized to int8 and input quantized to int8, using a per-tensor scale."""
    if out is None:
        return Tensor(
            _infinicore.linear_w8a8i8(
                input._underlying,
                weight_packed._underlying,
                weight_scale._underlying,
                None if bias is None else bias._underlying,
            )
        )
    _infinicore.linear_w8a8i8_(
        out._underlying,
        input._underlying,
        weight_packed._underlying,
        weight_scale._underlying,
        None if bias is None else bias._underlying,
    )
    return out
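For context, a usage sketch of the out-of-place and in-place conventions shared by these wrappers; `x`, `w_packed`, `w_scale`, and `bias` are hypothetical tensors created elsewhere with infinicore's own constructors:

```python
# Hypothetical tensors; only the calling convention is taken from the wrapper.
y = linear_w8a8i8(x, w_packed, w_scale, bias=bias)      # allocates a new output
linear_w8a8i8(x, w_packed, w_scale, bias=bias, out=y)   # reuses a pre-allocated buffer
```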
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def silu_and_mul(input: Tensor, out=None) -> Tensor:
    r"""Apply the SiLU-and-Mul (SwiGLU) function.

    Formula: output = SiLU(input_gate) * input_up
    Input shape: [..., 2*d], output shape: [..., d]
    """
    if out is None:
        return Tensor(_infinicore.silu_and_mul(input._underlying))
    _infinicore.silu_and_mul_(out._underlying, input._underlying)
    return out
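Since the op halves the last dimension, a hidden state of shape [..., 2*d] yields [..., d]. A sketch assuming a hypothetical tensor `h` holding a fused gate/up projection output:

```python
# `h` is assumed to have shape [batch, 2*d] (gate and up halves packed together).
act = silu_and_mul(h)        # shape [batch, d]
silu_and_mul(h, out=act)     # in-place variant, writing into `act`
```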
import infinicore.tensor as tensor
from infinicore.lib import _infinicore


def add_rms_norm(a, b, weight, epsilon=1e-5, *, out=None, residual=None):
    """Fused Add and RMS Normalization.

    Args:
        a: First input tensor.
        b: Second input tensor.
        weight: Scale weights.
        epsilon: Small constant for numerical stability; default is 1e-5.
        out: Optional pre-allocated tensor for the normalized result.
        residual: Optional pre-allocated tensor for the sum a + b.

    Returns:
        Tuple of (normalized_result, add_result): (RMSNorm(a + b) * weight, a + b).
        The add_result can be used as the residual for subsequent layers.
    """
    if out is None:
        out = tensor.empty(a.shape, dtype=a.dtype, device=a.device)
    if residual is None:
        residual = tensor.empty(b.shape, dtype=b.dtype, device=b.device)
    _infinicore.add_rms_norm_(
        out._underlying,
        residual._underlying,
        a._underlying,
        b._underlying,
        weight._underlying,
        epsilon,
    )
    return out, residual
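A sketch of how the two return values typically thread through a transformer block; `hidden`, `attn_out`, and `norm_weight` are hypothetical tensors:

```python
# Computes RMSNorm(hidden + attn_out) * norm_weight, plus the raw sum for reuse.
normed, resid = add_rms_norm(hidden, attn_out, norm_weight, epsilon=1e-6)
# `normed` feeds the next sublayer; `resid` (= hidden + attn_out) becomes the
# residual input of the following add_rms_norm call.
```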
from infinicore.lib import _infinicore


def kv_caching(k_cache, v_cache, k, v, past_kv_lengths):
    """Append new key/value tensors to the KV caches, in place, at the positions given by past_kv_lengths."""
    _infinicore.kv_caching_(
        k_cache._underlying,
        v_cache._underlying,
        k._underlying,
        v._underlying,
        past_kv_lengths._underlying,
    )
    return k_cache, v_cache
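A decode-step sketch, assuming `past_kv_lengths` holds one write offset per sequence in the batch (all tensors hypothetical):

```python
# Appends this step's keys/values at each sequence's current length, in place.
k_cache, v_cache = kv_caching(k_cache, v_cache, k, v, past_kv_lengths)
```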
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def paged_attention(
    q: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    block_tables: Tensor,
    cache_lens: Tensor,
    alibi_slopes: Tensor | None = None,
    scale: float = 1.0,
    *,
    out: Tensor | None = None,
):
    if out is None:
        return Tensor(
            _infinicore.paged_attention(
                q._underlying,
                k_cache._underlying,
                v_cache._underlying,
                block_tables._underlying,
                cache_lens._underlying,
                alibi_slopes._underlying if alibi_slopes is not None else None,
                scale,
            )
        )
    _infinicore.paged_attention_(
        out._underlying,
        q._underlying,
        k_cache._underlying,
        v_cache._underlying,
        block_tables._underlying,
        cache_lens._underlying,
        alibi_slopes._underlying if alibi_slopes is not None else None,
        scale,
    )
    return out
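A decode-phase sketch with hypothetical tensors and a hypothetical `head_dim`. Note the wrapper defaults `scale` to 1.0, while 1/sqrt(head_dim) is the conventional softmax scale, so callers normally pass it explicitly:

```python
import math

# block_tables maps each sequence to its cache blocks; cache_lens gives the
# current KV length per sequence. All tensors here are hypothetical.
attn_out = paged_attention(
    q, k_cache, v_cache, block_tables, cache_lens,
    scale=1.0 / math.sqrt(head_dim),
)
```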
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def paged_attention_prefill(
    q: Tensor,
    k_cache: Tensor,
    v_cache: Tensor,
    block_tables: Tensor,
    history_lens: Tensor,
    cu_seqlens_q: Tensor,
    alibi_slopes: Tensor | None = None,
    scale: float = 1.0,
    *,
    out: Tensor | None = None,
):
    alibi_ptr = alibi_slopes._underlying if alibi_slopes is not None else None
    if out is None:
        return Tensor(
            _infinicore.paged_attention_prefill(
                q._underlying,
                k_cache._underlying,
                v_cache._underlying,
                block_tables._underlying,
                history_lens._underlying,
                cu_seqlens_q._underlying,
                alibi_ptr,
                scale,
            )
        )
    _infinicore.paged_attention_prefill_(
        out._underlying,
        q._underlying,
        k_cache._underlying,
        v_cache._underlying,
        block_tables._underlying,
        history_lens._underlying,
        cu_seqlens_q._underlying,
        alibi_ptr,
        scale,
    )
    return out
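The prefill variant works on a packed batch, so it additionally takes `cu_seqlens_q` (cumulative query lengths, as in varlen attention kernels) and `history_lens` (tokens already cached per sequence). A sketch with hypothetical tensors:

```python
import math

out = paged_attention_prefill(
    q, k_cache, v_cache, block_tables, history_lens, cu_seqlens_q,
    scale=1.0 / math.sqrt(head_dim),  # head_dim is a hypothetical int
)
```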
from infinicore.lib import _infinicore
from infinicore.tensor import Tensor


def paged_caching(
    k_cache: Tensor,
    v_cache: Tensor,
    k: Tensor,
    v: Tensor,
    slot_mapping: Tensor,
):
    """Write new key/value tensors into the paged KV caches, in place, at the slots given by slot_mapping."""
    _infinicore.paged_caching_(
        k_cache._underlying,
        v_cache._underlying,
        k._underlying,
        v._underlying,
        slot_mapping._underlying,
    )
    return k_cache, v_cache
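Unlike kv_caching above, which appends at per-sequence lengths, this op scatters each new token to a flat slot index. A sketch with hypothetical tensors:

```python
# slot_mapping[i] is the flat slot in the paged caches for new token i.
k_cache, v_cache = paged_caching(k_cache, v_cache, k, v, slot_mapping)
```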
@@ -42,6 +42,11 @@ class Tensor:
                    getattr(self._underlying, name)
                ),
            )
        else:
            raise AttributeError(
                "{!r} object has no attribute {!r}".format(type(self).__name__, name)
            )
        return getattr(self, name)
@property
......
import concurrent.futures
import importlib
import pathlib
@@ -11,16 +12,32 @@ SRC_DIR_PATH = CURRENT_FILE_PATH.parent.parent / "src"


def _find_and_build_ops():
    ops_path = SRC_DIR_PATH / "infiniop" / "ops"

    with concurrent.futures.ProcessPoolExecutor() as executor:
        futures = []

        for op_dir in ops_path.iterdir():
            ninetoothed_path = op_dir / "ninetoothed"

            if not ninetoothed_path.is_dir():
                continue

            build_file = ninetoothed_path / "build.py"

            if not build_file.exists():
                continue

            futures.append(executor.submit(_build, ninetoothed_path))

        for future in concurrent.futures.as_completed(futures):
            future.result()  # re-raise any exception from a failed build


def _build(ninetoothed_path):
    module_path = ninetoothed_path / "build"
    relative_path = module_path.relative_to(SRC_DIR_PATH)
    import_name = ".".join(relative_path.parts)
    module = importlib.import_module(import_name)
    module.build()

if __name__ == "__main__":
......
@@ -12,7 +12,7 @@ void printUsage() {
    std::cout << "infiniccl-test --<device>" << std::endl
              << std::endl;
    std::cout << " --<device>" << std::endl;
    std::cout << " Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|qy|kunlun|hygon|ali)." << std::endl
              << std::endl;
    std::cout << "The program will run tests on all visible devices of the specified device type."
              << " Use environment variables such as CUDA_VISIBLE_DEVICES to limit visible device IDs.";
@@ -46,6 +46,7 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
    else PARSE_DEVICE("--qy", INFINI_DEVICE_QY)
    else PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
    else PARSE_DEVICE("--hygon", INFINI_DEVICE_HYGON)
    else PARSE_DEVICE("--ali", INFINI_DEVICE_ALI)
    else {
        printUsage();
    }
......
@@ -62,7 +62,7 @@ infiniStatus_t commInitAll(
    for (int i = 0; i < ndevice; i++) {
        rank_list[i] = i;
        CHECK_INTERNAL(cnrtSetDevice(device_ids[i]), cnrtSuccess);
    }
    CHECK_CNCL(cnclInitComms(cncl_comms.data(), ndevice,
......
@@ -4,7 +4,7 @@
#include "../infiniccl_impl.h"

// Windows does not support CUDA
#if (defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)) && defined(ENABLE_CCL) && !defined(_WIN32)
INFINICCL_DEVICE_API_IMPL(cuda)
#else
INFINICCL_DEVICE_API_NOOP(cuda)
......
@@ -27,6 +27,7 @@ __C infiniStatus_t infinicclCommInitAll(
        COMM_INIT_ALL(INFINI_DEVICE_METAX, metax);
        COMM_INIT_ALL(INFINI_DEVICE_MOORE, moore);
        COMM_INIT_ALL(INFINI_DEVICE_KUNLUN, kunlun);
        COMM_INIT_ALL(INFINI_DEVICE_ALI, cuda);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
@@ -53,6 +54,7 @@ __C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
        COMM_DESTROY(INFINI_DEVICE_METAX, metax);
        COMM_DESTROY(INFINI_DEVICE_MOORE, moore);
        COMM_DESTROY(INFINI_DEVICE_KUNLUN, kunlun);
        COMM_DESTROY(INFINI_DEVICE_ALI, cuda);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
@@ -86,6 +88,7 @@ __C infiniStatus_t infinicclAllReduce(
        ALL_REDUCE(INFINI_DEVICE_METAX, metax);
        ALL_REDUCE(INFINI_DEVICE_MOORE, moore);
        ALL_REDUCE(INFINI_DEVICE_KUNLUN, kunlun);
        ALL_REDUCE(INFINI_DEVICE_ALI, cuda);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
@@ -66,6 +66,7 @@ xmake build infinicore-test
./infinicore-test --qy
./infinicore-test --kunlun
./infinicore-test --hygon
./infinicore-test --ali
```
### Customize Test Parameters
......
@@ -42,6 +42,7 @@ void printUsage() {
              << " qy" << std::endl
              << " kunlun" << std::endl
              << " hygon" << std::endl
              << " ali" << std::endl
              << std::endl
              << "Available tests:" << std::endl
              << " basic - Basic memory allocation and deallocation tests" << std::endl
@@ -84,6 +85,8 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
            args.device_type = INFINI_DEVICE_KUNLUN;
        } else if (arg == "--hygon") {
            args.device_type = INFINI_DEVICE_HYGON;
        } else if (arg == "--ali") {
            args.device_type = INFINI_DEVICE_ALI;
        } else if (arg == "--test") {
            if (i + 1 >= argc) {
                std::cerr << "Error: --test requires a test name" << std::endl;
......
@@ -12,12 +12,18 @@ DevicePinnedHostAllocator::~DevicePinnedHostAllocator() {
}

std::byte *DevicePinnedHostAllocator::allocate(size_t size) {
    if (size == 0) {
        return nullptr;
    }
    void *ptr;
    INFINICORE_CHECK_ERROR(infinirtMallocHost(&ptr, size));
    return (std::byte *)ptr;
}

void DevicePinnedHostAllocator::deallocate(std::byte *ptr) {
    if (ptr == nullptr) {
        return;
    }
    if (owner_ == context::getDevice()) {
        INFINICORE_CHECK_ERROR(infinirtFreeHost(ptr));
        gc();
......
@@ -4,10 +4,16 @@

namespace infinicore {

std::byte *HostAllocator::allocate(size_t size) {
    if (size == 0) {
        return nullptr;
    }
    return (std::byte *)std::malloc(size);
}

void HostAllocator::deallocate(std::byte *ptr) {
    if (ptr == nullptr) {
        return;
    }
    std::free(ptr);
}
......
#include "pinnable_block_allocator.hpp"
#include "../context_impl.hpp"
#include "../../utils.hpp"
#include <algorithm>
#include <infinirt.h>
#include <stdexcept>
namespace infinicore {
// ------------------- Helper functions -------------------
// Round up size to nearest multiple of alignment
inline size_t align_up(size_t size, size_t alignment) {
return (size + alignment - 1) / alignment * alignment;
}
// ------------------- Constructor -------------------
PinnableBlockAllocator::PinnableBlockAllocator(Device device)
: device_(device) {
size_classes_ = {
{32 * 1024, {}}, // 32 KB
{256 * 1024, {}}, // 256 KB
{1 * 1024 * 1024, {}}, // 1 MB
{2 * 1024 * 1024, {}}, // 2 MB
{4 * 1024 * 1024, {}}, // 4 MB
{8 * 1024 * 1024, {}}, // 8 MB
{16 * 1024 * 1024, {}}, // 16 MB
{32 * 1024 * 1024, {}}, // 32 MB
{64 * 1024 * 1024, {}}, // 64 MB
{128 * 1024 * 1024, {}}, // 128 MB
{256 * 1024 * 1024, {}}, // 256 MB
};
}
// ------------------- allocate -------------------

std::byte *PinnableBlockAllocator::allocate(size_t size) {
    if (size == 0) {
        return nullptr;
    }
    std::lock_guard<std::mutex> lock(mutex_);

    // Align size to 256 bytes for the GPU
    size = align_up(size, 256);

    std::shared_ptr<Block> block;

    // 1. Try size-class allocation for small/medium requests
    for (auto &cls : size_classes_) {
        if (size <= cls.block_size) {
            if (!cls.free_blocks.empty()) {
                // Pop entries until one is actually free; mark_in_use_() may
                // have flagged blocks that are still sitting on the free list.
                block = cls.free_blocks.back();
                while (block != nullptr && block->in_use) {
                    cls.free_blocks.pop_back();
                    if (cls.free_blocks.empty()) {
                        block = nullptr;
                        break;
                    }
                    block = cls.free_blocks.back();
                }
                if (block != nullptr) {
                    cls.free_blocks.pop_back();
                    block->in_use = true;
                    return reinterpret_cast<std::byte *>(block->ptr);
                }
            }
            // Allocate a new block for this class
            block = std::make_shared<Block>();
            block->size = cls.block_size;
            block->frozen = pinned_mode_;
            block->in_use = true;
            INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
            all_blocks_[block->ptr] = block;
            return reinterpret_cast<std::byte *>(block->ptr);
        }
    }

    // 2. Large block allocation
    // Try to reuse a frozen or free large block
    auto it = std::find_if(large_blocks_.begin(), large_blocks_.end(),
                           [size](const std::shared_ptr<Block> &b) { return b->size >= size && !b->in_use; });
    if (it != large_blocks_.end()) {
        block = *it;
        block->in_use = true;
        block->frozen = block->frozen || pinned_mode_;
        return reinterpret_cast<std::byte *>(block->ptr);
    }

    // Allocate a new large block
    block = std::make_shared<Block>();
    block->size = size;
    block->frozen = pinned_mode_;
    block->in_use = true;
    INFINICORE_CHECK_ERROR(infinirtMalloc(&block->ptr, block->size));
    large_blocks_.push_back(block);
    all_blocks_[block->ptr] = block;
    return reinterpret_cast<std::byte *>(block->ptr);
}
// ------------------- deallocate -------------------

void PinnableBlockAllocator::deallocate(std::byte *ptr) {
    if (ptr == nullptr) {
        return;
    }
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = all_blocks_.find(reinterpret_cast<void *>(ptr));
    if (it == all_blocks_.end()) {
        throw std::runtime_error("Pointer not allocated by this allocator");
    }
    auto block = it->second;
    if (!block->in_use) {
        throw std::runtime_error("Double free detected in PinnableBlockAllocator");
    }
    block->in_use = false;
    // Return size-class blocks to their free list; large blocks simply stay
    // in large_blocks_ and are rediscovered by the search in allocate().
    for (auto &cls : size_classes_) {
        if (block->size == cls.block_size) {
            cls.free_blocks.push_back(block);
            break;
        }
    }
}
size_t PinnableBlockAllocator::mark_in_use_(void *ptr, bool in_use) {
    // Take the lock before touching all_blocks_, consistent with
    // allocate() and deallocate().
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = all_blocks_.find(ptr);
    if (it == all_blocks_.end()) {
        throw std::runtime_error("Pointer not allocated by this allocator");
    }
    it->second->in_use = in_use;
    return it->second->size;
}
// ------------------- trim -------------------

void PinnableBlockAllocator::trim() {
    std::lock_guard<std::mutex> lock(mutex_);

    // Free non-frozen size-class blocks
    for (auto &cls : size_classes_) {
        for (auto it = cls.free_blocks.begin(); it != cls.free_blocks.end();) {
            if (!(*it)->frozen) {
                INFINICORE_CHECK_ERROR(infinirtFree((*it)->ptr));
                all_blocks_.erase((*it)->ptr);
                it = cls.free_blocks.erase(it);
            } else {
                ++it;
            }
        }
    }

    // Free non-frozen large blocks
    for (auto it = large_blocks_.begin(); it != large_blocks_.end();) {
        if (!(*it)->frozen && !(*it)->in_use) {
            INFINICORE_CHECK_ERROR(infinirtFree((*it)->ptr));
            all_blocks_.erase((*it)->ptr);
            it = large_blocks_.erase(it);
        } else {
            ++it;
        }
    }
}
// ------------------- Destructor -------------------

PinnableBlockAllocator::~PinnableBlockAllocator() {
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto &p : all_blocks_) {
        if (p.second->ptr) {
            infinirtFree(p.second->ptr);
        }
    }
    all_blocks_.clear();
    large_blocks_.clear();
    for (auto &cls : size_classes_) {
        cls.free_blocks.clear();
    }
}

} // namespace infinicore
#pragma once

#include "memory_allocator.hpp"

#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>

namespace infinicore {

class PinnableBlockAllocator : public MemoryAllocator {
    // Represents a single memory block
    struct Block {
        void *ptr = nullptr;  // Device pointer
        size_t size = 0;      // Block size in bytes
        bool frozen = false;  // True if used in pinned/graph mode
        bool in_use = false;  // Whether the block is currently in use
    };

    // A simple size-class allocator for small/medium blocks
    struct SizeClass {
        size_t block_size; // Fixed size for this class
        std::vector<std::shared_ptr<Block>> free_blocks;
    };

public:
    PinnableBlockAllocator(Device device);
    ~PinnableBlockAllocator();

    std::byte *allocate(size_t size) override;
    void deallocate(std::byte *ptr) override;

    // Switch pinned/graph mode
    void set_pin_mode(bool pinned) { pinned_mode_ = pinned; }

    // Internal use only: force-set the in_use flag for a memory block.
    // Returns the size of the block.
    size_t mark_in_use_(void *ptr, bool in_use);

    // Release cached, non-frozen blocks back to the device.
    void trim();

private:
    Device device_;
    bool pinned_mode_ = false;
    std::vector<SizeClass> size_classes_;
    std::vector<std::shared_ptr<Block>> large_blocks_;
    std::unordered_map<void *, std::shared_ptr<Block>> all_blocks_;
    std::mutex mutex_; // Thread safety
};

} // namespace infinicore
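To make the reuse policy above concrete, here is a minimal Python model of the same strategy: round the request up, serve small/medium requests from fixed size classes, and fall back to first-fit reuse among large blocks. It is illustrative only; the real device allocations go through infinirtMalloc/infinirtFree as in the C++ above.

```python
SIZE_CLASSES = [32 << 10, 256 << 10, 1 << 20, 2 << 20, 4 << 20]  # abridged list

def align_up(size, alignment=256):
    # Same arithmetic as the C++ align_up helper.
    return (size + alignment - 1) // alignment * alignment

def pick_block(size, free_lists, large_blocks):
    """Model of the block-selection policy in PinnableBlockAllocator::allocate."""
    size = align_up(size)
    for cls in SIZE_CLASSES:          # 1. smallest size class that fits
        if size <= cls:
            if free_lists[cls]:
                return free_lists[cls].pop()  # reuse a cached block
            return ("new", cls)               # fresh block of the class size
    for blk in large_blocks:          # 2. first free large block that fits
        if blk["size"] >= size and not blk["in_use"]:
            return blk
    return ("new", size)              # 3. fresh large block, exact aligned size
```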
#include "device_caching_allocator.hpp"
#include "stream_ordered_allocator.hpp"
#include <infinirt.h>
#include "../../utils.hpp"
namespace infinicore {
DeviceCachingAllocator::DeviceCachingAllocator(Device device) : MemoryAllocator(), device_(device) {}
StreamOrderedAllocator::StreamOrderedAllocator(Device device) : MemoryAllocator(), device_(device) {}
std::byte *DeviceCachingAllocator::allocate(size_t size) {
std::byte *StreamOrderedAllocator::allocate(size_t size) {
if (size == 0) {
return nullptr;
}
void *ptr = nullptr;
INFINICORE_CHECK_ERROR(infinirtMallocAsync(&ptr, size, context::getStream()));
return (std::byte *)ptr;
}
void DeviceCachingAllocator::deallocate(std::byte *ptr) {
void StreamOrderedAllocator::deallocate(std::byte *ptr) {
if (ptr == nullptr) {
return;
}
INFINICORE_CHECK_ERROR(infinirtFreeAsync(ptr, context::getStream()));
}
} // namespace infinicore