Commit 546b4279 authored by limm's avatar limm
Browse files

Add csrc and mmdeploy modules.

parent 502f4fb9
Pipeline #2810 canceled with stages
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_CORE_STACKTRACE_H_
#define MMDEPLOY_SRC_CORE_STACKTRACE_H_
#include <memory>
#include <string>
namespace mmdeploy {
// Captures and renders a stack trace of the current thread.
// Implemented with the pimpl idiom so the header stays free of any
// backtrace-library dependency; all special members are declared here and
// defined out-of-line where Impl is complete.
class Stacktrace {
 public:
  ~Stacktrace();
  // Constructs an empty trace (no capture). NOTE(review): whether this
  // captures anything is decided by the out-of-line definition — confirm.
  Stacktrace() noexcept;
  // Captures the current stack; the int parameter presumably skips that many
  // innermost frames — TODO confirm against the .cpp.
  explicit Stacktrace(int);
  Stacktrace& operator=(const Stacktrace&);
  Stacktrace& operator=(Stacktrace&& other) noexcept;
  Stacktrace(const Stacktrace&);
  Stacktrace(Stacktrace&&) noexcept;
  // Renders the captured frames as a human-readable string.
  std::string to_string() const;

 private:
  struct Impl;  // defined in the .cpp (pimpl)
  std::unique_ptr<Impl> impl_;
};
} // namespace mmdeploy
#endif // MMDEPLOY_SRC_CORE_STACKTRACE_H_
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_TYPES_VALUE_H_
#define MMDEPLOY_TYPES_VALUE_H_
#include <cassert>
#include <cstdint>
#include <iostream>
#include <map>
#include <memory>
#include <type_traits>
#include <variant>
#include <vector>
#include "mmdeploy/core/logger.h"
#include "mmdeploy/core/mpl/priority_tag.h"
#include "mmdeploy/core/mpl/static_any.h"
#include "mmdeploy/core/mpl/type_traits.h"
#include "mmdeploy/core/status_code.h"
namespace mmdeploy {
// Discriminator tag for the alternatives stored in Value's internal union.
// The order here must stay in sync with Value::ValueData and the switch
// statements in Value (copy ctor, _size, _empty, ValueData::destroy).
enum class ValueType : int {
  kNull = 0,  // empty value (default state)
  kBool,
  kInt,      // signed 64-bit integer
  kUInt,     // unsigned 64-bit integer
  kFloat,    // double
  kString,
  kBinary,   // byte blob
  kArray,
  kObject,   // string-keyed map
  kPointer,  // shared_ptr<Value> indirection (transparently unwrapped)
  kDynamic,  // reserved; not handled by Value's copy ctor / ValueData(Type)
  kAny,      // type-erased payload (StaticAny)
};
class Value;
// Byte type for Value::Binary. Uses std::byte where available; for older
// toolchains falls back to an equivalent scoped enum over unsigned char.
// NOTE(review): the guard keys on the GCC version (presumably tracking
// libstdc++'s std::byte support) rather than __cplusplus — confirm this is
// the intended condition for non-GCC compilers.
#if __GNUC__ >= 8
using Byte = std::byte;
#else
enum class Byte : unsigned char {};
#endif
namespace detail {
class ValueRef;
}
// Bidirectional iterator over a Value that holds either an array or an
// object. Both an object iterator and an array iterator are stored; which
// one is meaningful is decided at each use by value_->is_array(). T is
// Value or const Value (see Value::iterator / Value::const_iterator).
template <typename T>
class ValueIterator {
 public:
  using value_type = Value;
  using difference_type = std::ptrdiff_t;
  using pointer = value_type*;
  using reference = value_type&;
  using iterator_category = std::bidirectional_iterator_tag;
  using object_iterator_t = typename T::Object::iterator;
  using array_iterator_t = typename T::Array::iterator;

  ValueIterator() = default;
  // Object-mode iterator: `value` is the container being iterated.
  ValueIterator(T* value, object_iterator_t iter) : value_(value), object_iter_(iter) {}
  // Array-mode iterator.
  ValueIterator(T* value, array_iterator_t iter) : value_(value), array_iter_(iter) {}

  ValueIterator& operator++() {
    if (value_->is_array()) {
      ++array_iter_;
    } else {
      ++object_iter_;
    }
    return *this;
  }
  ValueIterator operator++(int) {
    auto it = *this;
    ++(*this);
    return it;
  }
  // Dereferencing an object iterator yields the mapped value; use key() for
  // the key.
  T& operator*() {
    if (value_->is_array()) {
      return *array_iter_;
    } else {
      return object_iter_->second;
    }
  }
  const T& operator*() const {
    if (value_->is_array()) {
      return *array_iter_;
    } else {
      return object_iter_->second;
    }
  }
  T* operator->() {
    if (value_->is_array()) {
      return &(*array_iter_);
    } else {
      return &object_iter_->second;
    }
  }
  const T* operator->() const {
    if (value_->is_array()) {
      return &(*array_iter_);
    } else {
      return &object_iter_->second;
    }
  }
  // Key of the current element; only valid in object mode, otherwise throws.
  // No return after throw_exception — presumably it is [[noreturn]]; confirm.
  const std::string& key() {
    if (value_->is_object()) {
      return object_iter_->first;
    }
    throw_exception(eInvalidArgument);
  }
  // Compares owner and BOTH sub-iterators; the inactive one was
  // value-initialized on both sides, so the comparison stays well-defined
  // for iterators created by the same container.
  bool operator==(const ValueIterator& other) const {
    return value_ == other.value_ && object_iter_ == other.object_iter_ &&
           array_iter_ == other.array_iter_;
  }
  bool operator!=(const ValueIterator& other) const { return !(*this == other); }

 private:
  T* value_{};
  object_iterator_t object_iter_{};
  array_iterator_t array_iter_{};
};
class Dynamic;
class Value;
// Marker wrapper: a value tagged for storage in Value as a type-erased
// payload (ValueType::kAny). See Value's EraseType constructor.
template <class T>
struct EraseType {
  T value;
};
// Marker wrapper: a value tagged for serialization through the archive
// mechanism instead of type erasure.
template <class T>
struct ArchiveType {
  T value;
};
// Wraps `v` (preserving its value category via the T&& member) so it will be
// stored by type erasure.
template <class T>
EraseType<T&&> cast_by_erasure(T&& v) {
  return {std::forward<T>(v)};
}
// Wraps `v` so it will be handled by the archive path.
template <class T>
ArchiveType<T&&> cast_by_archive(T&& v) {
  return {std::forward<T>(v)};
}
// Trait: true when T is exactly mmdeploy::Value (no reference/cv stripping).
template <typename T>
struct is_value : std::is_same<T, Value> {};
template <typename T>
inline constexpr bool is_value_v = is_value<T>::value;
namespace detail {
// Trait: matches `const T*` only; used to select the const overloads of
// Value::_get_ptr.
template <typename T>
struct is_pointer_to_const : std::false_type {};
template <typename T>
struct is_pointer_to_const<const T*> : std::true_type {};
// Trait: matches `const T&` only; used to select the const Value::_get_ref.
template <typename T>
struct is_const_reference : std::false_type {};
template <typename T>
struct is_const_reference<const T&> : std::true_type {};
}  // namespace detail
// A JSON-like dynamic value, modeled closely after nlohmann::json: a type
// tag (type_) plus a tagged union (data_). Small alternatives (bool,
// integers, double) are stored inline; string/binary/array/object/pointer/
// any are heap-allocated and owned via raw pointers in the union, released
// by ValueData::destroy(type_) from the destructor.
//
// Beyond JSON it supports:
//  - kPointer: a shared_ptr<Value> indirection. Most public accessors call
//    _unwrap() first, which transparently follows pointer chains.
//  - kAny: a type-erased payload (StaticAny) reached via EraseType.
class Value {
 public:
  // STL-style container typedefs.
  using value_type = Value;
  using reference = value_type&;
  using const_reference = const value_type&;
  using difference_type = std::ptrdiff_t;
  using size_type = std::size_t;
  using pointer = value_type*;
  using const_pointer = const value_type*;
  using iterator = ValueIterator<Value>;
  using const_iterator = ValueIterator<const Value>;
  using Type = ValueType;
  // Concrete storage types for each ValueType alternative.
  using Boolean = bool;
  using Integer = int64_t;
  using Unsigned = uint64_t;
  using Float = double;
  using String = std::string;
  using Binary = std::vector<Byte>;
  using Array = std::vector<Value>;
  using Object = std::map<std::string, Value>;
  using Pointer = std::shared_ptr<Value>;
  using Dynamic = ::mmdeploy::Dynamic;
  using Any = ::mmdeploy::StaticAny;
  using ValueRef = detail::ValueRef;
  // Shorthand aliases so users can write Value::kArray etc.
  static constexpr const auto kNull = ValueType::kNull;
  static constexpr const auto kBool = ValueType::kBool;
  static constexpr const auto kInt = ValueType::kInt;
  static constexpr const auto kUInt = ValueType::kUInt;
  static constexpr const auto kFloat = ValueType::kFloat;
  static constexpr const auto kString = ValueType::kString;
  static constexpr const auto kBinary = ValueType::kBinary;
  static constexpr const auto kArray = ValueType::kArray;
  static constexpr const auto kObject = ValueType::kObject;
  static constexpr const auto kPointer = ValueType::kPointer;
  static constexpr const auto kDynamic = ValueType::kDynamic;
  static constexpr const auto kAny = ValueType::kAny;

  // Constructs a default-initialized value of the given type (e.g. an empty
  // array for kArray); ValueData(Type) allocates the heap alternatives.
  Value(const ValueType v) : type_(v), data_(v) {}
  // Default/null construction.
  Value(std::nullptr_t = nullptr) noexcept : Value(ValueType::kNull) {}
  // From a ValueRef (initializer-list machinery): moves or copies the
  // referenced value.
  template <typename T, std::enable_if_t<std::is_same_v<T, ValueRef>, int> = 0>
  Value(const T& ref) : Value(ref.moved_or_copied()) {}
  // Deep copy for heap-backed alternatives. kPointer copies the shared_ptr
  // itself (the pointee is shared, not cloned). kDynamic is unsupported and
  // falls through to the throwing default.
  Value(const Value& other) : type_(other.type_) {
    switch (type_) {
      case ValueType::kNull:
        break;
      case ValueType::kBool:
        data_ = other.data_.boolean;
        break;
      case ValueType::kInt:
        data_ = other.data_.number_integer;
        break;
      case ValueType::kUInt:
        data_ = other.data_.number_unsigned;
        break;
      case ValueType::kFloat:
        data_ = other.data_.number_float;
        break;
      case ValueType::kString:
        data_ = *other.data_.string;
        break;
      case ValueType::kBinary:
        data_ = *other.data_.binary;
        break;
      case ValueType::kArray:
        data_ = *other.data_.array;
        break;
      case ValueType::kObject:
        data_ = *other.data_.object;
        break;
      case ValueType::kPointer:
        data_ = *other.data_.pointer;
        break;
      case ValueType::kAny:
        data_.any = create<Any>(*other.data_.any);
        break;
      default:
        throw_exception(eInvalidArgument);
    }
  }
  // bool is handled by a constrained template (rather than an overload) so
  // that pointers and arithmetic types don't implicitly convert to it.
  template <class T, std::enable_if_t<std::is_same<std::decay_t<T>, bool>::value, bool> = true>
  Value(T&& value) : type_(kBool), data_(Boolean{value}) {}
  // One overload per exact-width integer avoids ambiguous promotions.
  Value(int8_t value) : type_(kInt), data_(Integer{value}) {}
  Value(int16_t value) : type_(kInt), data_(Integer{value}) {}
  Value(int32_t value) : type_(kInt), data_(Integer{value}) {}
  Value(int64_t value) : type_(kInt), data_(Integer{value}) {}
  Value(uint8_t value) : type_(kUInt), data_(Unsigned{value}) {}
  Value(uint16_t value) : type_(kUInt), data_(Unsigned{value}) {}
  Value(uint32_t value) : type_(kUInt), data_(Unsigned{value}) {}
  Value(uint64_t value) : type_(kUInt), data_(Unsigned{value}) {}
  Value(float value) : type_(kFloat), data_(Float{value}) {}
  Value(double value) : type_(kFloat), data_(Float{value}) {}
  Value(Binary value) : type_(kBinary), data_(std::move(value)) {}
  Value(Array value) : type_(kArray), data_(std::move(value)) {}
  Value(Object value) : type_(kObject), data_(std::move(value)) {}
  Value(Pointer value) : type_(kPointer), data_(std::move(value)) {}
  // Anything a String can be built from (const char*, string_view, ...).
  template <class T, std::enable_if_t<std::is_constructible<String, T>::value, bool> = true>
  Value(T&& value) : type_(kString), data_(String{std::forward<T>(value)}) {}
  // Types registered for type erasure are routed through cast_by_erasure.
  template <typename T, std::enable_if_t<is_cast_by_erasure<std::decay_t<T>>::value, bool> = true>
  Value(T&& value) : Value(cast_by_erasure(std::forward<T>(value))) {}
  // Stores an erased payload as kAny.
  template <typename T>
  Value(EraseType<T>&& value) : type_(Type::kAny) {
    data_.any = create<Any>(std::forward<T>(value.value));
  }
  // nlohmann-style brace construction; defined after detail::ValueRef below.
  Value(std::initializer_list<ValueRef> init, bool type_deduction = true,
        Type manual_type = Type::kArray);
  // Move: steal the union bits wholesale, leave `other` as null so its
  // destructor is a no-op.
  Value(Value&& other) noexcept : type_(other.type_), data_(other.data_) {
    other.type_ = ValueType::kNull;
    other.data_ = {};
  }
  // copy-and-swap
  // Unified assignment: `other` is a by-value copy (or move); swapping hands
  // our old payload to it for destruction on scope exit.
  Value& operator=(Value other) noexcept {
    using std::swap;
    swap(type_, other.type_);
    swap(data_, other.data_);
    return *this;
  }
  // The union cannot know its active member; pass it the tag.
  ~Value() { data_.destroy(type_); }

  operator Type() const noexcept { return type(); }
  // All the public queries below unwrap pointer chains first.
  Type type() const noexcept { return _unwrap().type_; }
  bool is_null() const noexcept { return _unwrap()._is_null(); }
  bool is_array() const noexcept { return _unwrap()._is_array(); }
  bool is_object() const noexcept { return _unwrap()._is_object(); }
  // is_any<T>() additionally checks the erased payload's type id.
  template <typename T = void>
  bool is_any() const noexcept {
    return _unwrap()._is_any<T>();
  }
  bool is_boolean() const noexcept { return _unwrap()._is_boolean(); }
  bool is_string() const noexcept { return _unwrap()._is_string(); }
  bool is_binary() const noexcept { return _unwrap()._is_binary(); }
  bool is_number() const noexcept { return _unwrap()._is_number(); }
  bool is_number_integer() const noexcept { return _unwrap()._is_number_integer(); }
  bool is_number_unsigned() const noexcept { return _unwrap()._is_number_unsigned(); }
  bool is_number_float() const noexcept { return _unwrap()._is_number_float(); }
  // Intentionally NOT unwrapped: reports whether this node itself is a
  // pointer (unwrapping would always make this false).
  bool is_pointer() const noexcept { return _is_pointer(); }
  size_t size() const noexcept { return _unwrap()._size(); }
  bool empty() const noexcept { return _unwrap()._empty(); }

 private:
  // Raw (non-unwrapping) predicates on this node's own tag.
  constexpr Type _type() const noexcept { return type_; }
  constexpr bool _is_null() const noexcept { return type_ == Type::kNull; }
  constexpr bool _is_array() const noexcept { return type_ == Type::kArray; }
  constexpr bool _is_object() const noexcept { return type_ == Type::kObject; }
  template <typename T = void>
  constexpr bool _is_any() const noexcept {
    if (type_ != Type::kAny) {
      return false;
    }
    if constexpr (std::is_void_v<T>) {
      return true;
    } else {
      return traits::TypeId<T>::value == data_.any->type();
    }
  }
  constexpr bool _is_boolean() const noexcept { return type_ == Type::kBool; }
  constexpr bool _is_string() const noexcept { return type_ == Type::kString; }
  constexpr bool _is_binary() const noexcept { return type_ == Type::kBinary; }
  constexpr bool _is_number() const noexcept { return _is_number_integer() || _is_number_float(); }
  constexpr bool _is_number_integer() const noexcept {
    return type_ == Type::kInt || type_ == Type::kUInt;
  }
  constexpr bool _is_number_unsigned() const noexcept { return type_ == Type::kUInt; }
  constexpr bool _is_number_float() const noexcept { return type_ == Type::kFloat; }
  constexpr bool _is_pointer() const noexcept { return type_ == Type::kPointer; }
  // nlohmann semantics: null has size 0, scalars have size 1.
  size_t _size() const noexcept {
    switch (_type()) {
      case ValueType::kNull:
        return 0;
      case ValueType::kArray:
        return data_.array->size();
      case ValueType::kObject:
        return data_.object->size();
      default:
        return 1;
    }
  }
  bool _empty() const noexcept {
    switch (_type()) {
      case Type::kNull:
        return true;
      case Type::kArray:
        return data_.array->empty();
      case Type::kObject:
        return data_.object->empty();
      default:
        return false;
    }
  }

 private:
  // Tag-dispatched accessors: the (unused) pointer parameter selects the
  // overload; each returns the payload when the tag matches, else nullptr.
  Boolean* get_impl_ptr(Boolean*) noexcept { return _is_boolean() ? &data_.boolean : nullptr; }
  const Boolean* get_impl_ptr(const Boolean*) const noexcept {
    return _is_boolean() ? &data_.boolean : nullptr;
  }
  Integer* get_impl_ptr(Integer*) noexcept {
    return _is_number_integer() ? &data_.number_integer : nullptr;
  }
  const Integer* get_impl_ptr(const Integer*) const noexcept {
    return _is_number_integer() ? &data_.number_integer : nullptr;
  }
  Unsigned* get_impl_ptr(Unsigned*) noexcept {
    return _is_number_unsigned() ? &data_.number_unsigned : nullptr;
  }
  const Unsigned* get_impl_ptr(const Unsigned*) const noexcept {
    return _is_number_unsigned() ? &data_.number_unsigned : nullptr;
  }
  Float* get_impl_ptr(Float*) noexcept {
    return _is_number_float() ? &data_.number_float : nullptr;
  }
  const Float* get_impl_ptr(const Float*) const noexcept {
    return _is_number_float() ? &data_.number_float : nullptr;
  }
  String* get_impl_ptr(String*) noexcept { return _is_string() ? data_.string : nullptr; }
  const String* get_impl_ptr(const String*) const noexcept {
    return _is_string() ? data_.string : nullptr;
  }
  Binary* get_impl_ptr(Binary*) noexcept { return _is_binary() ? data_.binary : nullptr; }
  const Binary* get_impl_ptr(const Binary*) const noexcept {
    return _is_binary() ? data_.binary : nullptr;
  }
  Array* get_impl_ptr(Array*) noexcept { return _is_array() ? data_.array : nullptr; }
  const Array* get_impl_ptr(const Array*) const noexcept {
    return _is_array() ? data_.array : nullptr;
  }
  Object* get_impl_ptr(Object*) noexcept { return _is_object() ? data_.object : nullptr; }
  const Object* get_impl_ptr(const Object*) const noexcept {
    return _is_object() ? data_.object : nullptr;
  }
  Pointer* get_impl_ptr(Pointer*) noexcept { return _is_pointer() ? data_.pointer : nullptr; }
  const Pointer* get_impl_ptr(const Pointer*) const noexcept {
    return _is_pointer() ? data_.pointer : nullptr;
  }
  Any* get_impl_ptr(Any*) noexcept { return _is_any() ? data_.any : nullptr; }
  const Any* get_impl_ptr(const Any*) const noexcept { return _is_any() ? data_.any : nullptr; }
  // Extracts a typed pointer from a kAny payload; nullptr on tag or type
  // mismatch (static_any_cast presumably checks the type id — confirm).
  template <typename T>
  T* get_erased_ptr(EraseType<T>*) noexcept {
    return _is_any() ? static_any_cast<T>(data_.any) : nullptr;
  }
  template <typename T>
  const T* get_erased_ptr(const EraseType<T>*) const noexcept {
    return _is_any() ? static_any_cast<T>(const_cast<const Any*>(data_.any)) : nullptr;
  }
  // Shared implementation for get_ref / _get_ref: pointer lookup + throw on
  // mismatch. The extra parens in decltype((...)) preserve reference-ness.
  template <typename T, typename This>
  static auto get_ref_impl(This& obj)
      -> decltype((*obj.template get_ptr<std::add_pointer_t<T>>())) {
    auto p = obj.template get_ptr<std::add_pointer_t<T>>();
    if (p) {
      return *p;
    }
    throw_exception(eInvalidArgument);
  }
  // _get_ptr overload set: SFINAE picks get_impl_ptr for the built-in
  // alternatives and get_erased_ptr for erased types; the trailing return
  // type removes non-viable overloads from the set.
  template <typename T, std::enable_if_t<std::is_pointer<T>::value, bool> = true>
  auto _get_ptr() noexcept -> decltype(std::declval<Value&>().get_impl_ptr(std::declval<T>())) {
    return get_impl_ptr(static_cast<T>(nullptr));
  }
  template <typename T, std::enable_if_t<detail::is_pointer_to_const<T>::value, bool> = true>
  auto _get_ptr() const noexcept
      -> decltype(std::declval<const Value&>().get_impl_ptr(std::declval<T>())) {
    return get_impl_ptr(static_cast<T>(nullptr));
  }
  template <typename T, std::enable_if_t<std::is_pointer<T>::value, bool> = true>
  auto _get_ptr() noexcept -> decltype(std::declval<Value&>().get_erased_ptr(std::declval<T>())) {
    return get_erased_ptr(static_cast<T>(nullptr));
  }
  template <typename T, std::enable_if_t<detail::is_pointer_to_const<T>::value, bool> = true>
  auto _get_ptr() const noexcept
      -> decltype(std::declval<const Value&>().get_erased_ptr(std::declval<T>())) {
    return get_erased_ptr(static_cast<T>(nullptr));
  }
  // T* -> EraseType<T>*
  template <
      typename T, typename T0 = std::remove_pointer_t<T>,
      std::enable_if_t<std::is_pointer<T>::value && is_cast_by_erasure<T0>::value, bool> = true>
  auto _get_ptr() noexcept
      -> decltype(std::declval<Value&>().get_erased_ptr(std::declval<EraseType<T0>*>())) {
    return get_erased_ptr(static_cast<EraseType<T0>*>(nullptr));
  }
  // const T* -> const EraseType<T>*
  template <typename T, typename T0 = std::remove_const_t<std::remove_pointer_t<T>>,
            std::enable_if_t<detail::is_pointer_to_const<T>::value && is_cast_by_erasure<T0>::value,
                             bool> = true>
  auto _get_ptr() const noexcept
      -> decltype(std::declval<Value&>().get_erased_ptr(std::declval<const EraseType<T0>*>())) {
    return get_erased_ptr(static_cast<const EraseType<T0>*>(nullptr));
  }
  // Detection idiom: has_get_ptr<T> is std::true_type iff _get_ptr<T*> is
  // well-formed for this Value.
  template <typename T>
  static auto test_get_ptr(T) -> decltype(std::declval<Value&>()._get_ptr<T>(), std::true_type{});
  static std::false_type test_get_ptr(...);
  template <typename T>
  using has_get_ptr = decltype(test_get_ptr(std::declval<std::add_pointer_t<T>>()));
  template <typename T, std::enable_if_t<std::is_reference<T>::value, bool> = true>
  auto _get_ref() -> decltype((get_ref_impl<T>(std::declval<Value&>()))) {
    return get_ref_impl<T>(*this);
  }
  template <typename T, std::enable_if_t<detail::is_const_reference<T>::value, bool> = true>
  auto _get_ref() const -> decltype((get_ref_impl<T>(std::declval<Value&>()))) {
    return get_ref_impl<T>(*this);
  }
  // _get<T> overloads, selected by the kind of T:
  // (1) T == Value: plain copy.
  template <typename T,
            std::enable_if_t<std::is_same<std::remove_const_t<T>, Value>::value, bool> = true>
  Value _get() const {
    return *this;
  }
  // (2) non-arithmetic types reachable via _get_ptr: copy out by const ref.
  template <typename T,
            std::enable_if_t<!std::is_arithmetic<T>::value && has_get_ptr<T>::value, bool> = true>
  auto _get() const
      -> std::remove_reference_t<decltype(std::declval<Value&>()._get_ref<const T&>())> {
    return get_ref<const T&>();
  }
  // (3) arithmetic T: converting read from any numeric/bool alternative.
  template <typename T, std::enable_if_t<std::is_arithmetic<T>::value, bool> = true>
  T _get() const {
    switch (_type()) {
      case kInt:
        return static_cast<T>(*_get_ptr<const Integer*>());
      case kUInt:
        return static_cast<T>(*_get_ptr<const Unsigned*>());
      case kFloat:
        return static_cast<T>(*_get_ptr<const Float*>());
      case kBool:
        return static_cast<T>(*_get_ptr<const Boolean*>());
      default:
        throw_exception(eInvalidArgument);
    }
  }
  // (4) const char*: borrows the internal string's buffer — the pointer is
  // only valid while this Value (and its string) lives.
  template <typename T, std::enable_if_t<std::is_same<T, const char*>::value, bool> = true>
  const char* _get() const {
    if (_is_string()) {
      return data_.string->c_str();
    }
    throw_exception(eInvalidArgument);
  }
  template <typename T>
  T& _get_to(T& v) const {
    v = get<T>();
    return v;
  }

 public:
  // Public accessors; each unwraps pointer chains, then forwards to the
  // private implementation. Trailing-return decltypes keep them SFINAE-
  // friendly for unsupported T.
  template <typename T>
  auto get_ptr() noexcept -> decltype(std::declval<Value&>()._get_ptr<T>()) {
    return _unwrap()._get_ptr<T>();
  }
  template <typename T>
  auto get_ptr() const noexcept -> decltype(std::declval<const Value&>()._get_ptr<T>()) {
    return _unwrap()._get_ptr<T>();
  }
  template <typename T>
  auto get_ref() -> decltype((std::declval<Value&>()._get_ref<T>())) {
    return _unwrap()._get_ref<T>();
  }
  template <typename T>
  auto get_ref() const -> decltype((std::declval<const Value&>()._get_ref<T>())) {
    return _unwrap()._get_ref<T>();
  }
  template <typename T>
  auto get() -> decltype(std::declval<Value&>()._get<T>()) {
    return _unwrap()._get<T>();
  }
  template <typename T>
  auto get() const -> decltype(std::declval<const Value&>()._get<T>()) {
    return _unwrap()._get<T>();
  }
  template <typename T>
  auto get_to(T& v) const -> decltype((std::declval<const Value&>()._get_to(v))) {
    return _unwrap()._get_to(v);
  }
  // Direct container access; throws if the value is not of that kind.
  Array& array() & { return get_ref<Array&>(); }
  Array&& array() && { return static_cast<Array&&>(get_ref<Array&>()); }
  const Array& array() const& { return get_ref<const Array&>(); }
  const Array&& array() const&& { return static_cast<const Array&&>(get_ref<const Array&>()); }
  Object& object() & { return get_ref<Object&>(); }
  Object&& object() && { return static_cast<Object&&>(get_ref<Object&>()); }
  const Object& object() const& { return get_ref<const Object&>(); }
  const Object&& object() const&& { return static_cast<const Object&&>(get_ref<const Object&>()); }
  // Array indexing (no bounds check beyond vector's) and object indexing
  // (non-const key form default-inserts, like std::map).
  value_type& operator[](size_t idx) & {
    return static_cast<value_type&>(_unwrap()._subscript(idx));
  }
  value_type&& operator[](size_t idx) && {
    return static_cast<value_type&&>(_unwrap()._subscript(idx));
  }
  const value_type& operator[](size_t idx) const& {
    return static_cast<const value_type&>(_unwrap()._subscript(idx));
  }
  const value_type&& operator[](size_t idx) const&& {
    return static_cast<const value_type&&>(_unwrap()._subscript(idx));
  }
  value_type& operator[](const Object::key_type& idx) & {
    return static_cast<value_type&>(_unwrap()._subscript(idx));
  }
  value_type&& operator[](const Object::key_type& idx) && {
    return static_cast<value_type&&>(_unwrap()._subscript(idx));
  }
  const value_type& operator[](const Object::key_type& idx) const& {
    return static_cast<const value_type&>(_unwrap()._subscript(idx));
  }
  const value_type&& operator[](const Object::key_type& idx) const&& {
    return static_cast<const value_type&&>(_unwrap()._subscript(idx));
  }
  reference front() { return _unwrap()._front(); }
  const_reference front() const { return _unwrap()._front(); }
  reference back() { return _unwrap()._back(); }
  const_reference back() const { return _unwrap()._back(); }
  void push_back(Value&& val) { _unwrap()._push_back(std::move(val)); }
  void push_back(const Value& val) { _unwrap()._push_back(val); }
  template <typename Key>
  bool contains(Key&& key) const {
    return _unwrap()._contains(std::forward<Key>(key));
  }
  template <typename Key>
  iterator find(Key&& key) {
    return _unwrap()._find(std::forward<Key>(key));
  }
  template <typename Key>
  const_iterator find(Key&& key) const {
    return _unwrap()._find(std::forward<Key>(key));
  }
  // Object lookup with fallback: returns default_value when key is absent.
  template <typename T>
  T value(const typename Object::key_type& key, const T& default_value) const {
    return _unwrap()._value(key, default_value);
  }
  iterator begin() { return _unwrap()._begin(); }
  iterator end() { return _unwrap()._end(); }
  const_iterator begin() const { return _unwrap()._begin(); }
  const_iterator end() const { return _unwrap()._end(); }
  // Shallow-merges object v into this object (see _update).
  void update(const_reference v) { return _unwrap()._update(v); }

 private:
  reference _front() {
    if (_is_array()) {
      return (*data_.array).front();
    }
    throw_exception(eInvalidArgument);
  }
  const_reference _front() const {
    if (_is_array()) {
      return (*data_.array).front();
    }
    throw_exception(eInvalidArgument);
  }
  reference _back() {
    if (_is_array()) {
      return (*data_.array).back();
    }
    throw_exception(eInvalidArgument);
  }
  const_reference _back() const {
    if (_is_array()) {
      return (*data_.array).back();
    }
    throw_exception(eInvalidArgument);
  }
  // push_back on null implicitly converts the value to an empty array first
  // (nlohmann behavior).
  void _push_back(Value&& val) {
    if (!(_is_null() || _is_array())) {
      throw_exception(eInvalidArgument);
    }
    if (_is_null()) {
      *this = Type::kArray;
    }
    data_.array->push_back(std::move(val));
  }
  void _push_back(const Value& val) {
    if (!(_is_null() || _is_array())) {
      throw_exception(eInvalidArgument);
    }
    if (_is_null()) {
      *this = Type::kArray;
    }
    data_.array->push_back(val);
  }
  // contains() is permissive (false for non-objects); find() throws instead.
  template <typename Key>
  bool _contains(Key&& key) const {
    return _is_object() && data_.object->find(std::forward<Key>(key)) != data_.object->end();
  }
  template <typename Key>
  iterator _find(Key&& key) {
    if (_is_object()) {
      auto iter = data_.object->find(std::forward<Key>(key));
      return {this, iter};
    }
    throw_exception(eInvalidArgument);
  }
  template <typename Key>
  const_iterator _find(Key&& key) const {
    if (_is_object()) {
      auto iter = data_.object->find(std::forward<Key>(key));
      return {this, iter};
    }
    throw_exception(eInvalidArgument);
  }
  template <typename T>
  T _value(const typename Object::key_type& key, const T& default_value) const {
    if (_is_object()) {
      const auto it = _find(key);
      if (it != _end()) {
        return (*it)._get<T>();
      }
      return default_value;
    }
    throw_exception(eInvalidArgument);
  }
  // Iteration is only defined for arrays and objects.
  iterator _begin() {
    if (_is_array()) {
      return {this, data_.array->begin()};
    } else if (_is_object()) {
      return {this, data_.object->begin()};
    } else {
      throw_exception(eInvalidArgument);
    }
  }
  iterator _end() {
    if (_is_array()) {
      return {this, data_.array->end()};
    } else if (_is_object()) {
      return {this, data_.object->end()};
    } else {
      throw_exception(eInvalidArgument);
    }
  }
  const_iterator _begin() const {
    if (_is_array()) {
      return {this, data_.array->begin()};
    } else if (_is_object()) {
      return {this, data_.object->begin()};
    } else {
      throw_exception(eInvalidArgument);
    }
  }
  const_iterator _end() const {
    if (_is_array()) {
      return {this, data_.array->end()};
    } else if (_is_object()) {
      return {this, data_.object->end()};
    } else {
      throw_exception(eInvalidArgument);
    }
  }
  // Merge: null becomes an empty object first; existing keys are
  // overwritten (shallow, no recursive merge — see the free update()
  // function at the end of this header for a depth-limited deep merge).
  void _update(const_reference v) {
    if (_is_null()) {
      type_ = ValueType::kObject;
      data_.object = create<Object>();
    }
    if (!(_is_object() && v._is_object())) {
      throw_exception(eInvalidArgument);
    }
    for (auto it = v._begin(); it != v._end(); ++it) {
      data_.object->operator[](it.key()) = *it;
    }
  }
  // Follows kPointer links until reaching a non-pointer node; a null
  // shared_ptr terminates the walk at the pointer node itself.
  Value& _unwrap() {
    auto p = this;
    while (p->_is_pointer() && *p->data_.pointer) {
      p = p->data_.pointer->get();
    }
    return *p;
  }
  const Value& _unwrap() const {
    auto p = this;
    while (p->_is_pointer() && *p->data_.pointer) {
      p = p->data_.pointer->get();
    }
    return *p;
  }

 private:
  // Centralized heap alloc/free for the union's owned payloads.
  template <typename T, typename... Args>
  static T* create(Args&&... args) {
    return new T(std::forward<Args>(args)...);
  }
  template <typename T>
  static void release(T* ptr) {
    delete ptr;
  }
  value_type& _subscript(size_t idx) {
    if (_is_array()) {
      return (*data_.array)[idx];
    }
    throw_exception(eInvalidArgument);
  }
  const value_type& _subscript(size_t idx) const {
    if (_is_array()) {
      return (*data_.array)[idx];
    }
    throw_exception(eInvalidArgument);
  }
  // Keyed subscript on null converts to an empty object (like nlohmann).
  reference _subscript(const Object::key_type& key) {
    if (_is_null()) {
      type_ = Type::kObject;
      data_.object = create<Object>();
    }
    if (_is_object()) {
      return (*data_.object)[key];
    }
    throw_exception(eInvalidArgument);
  }
  const_reference _subscript(const Object::key_type& key) const {
    if (_is_object()) {
      return (*data_.object)[key];
    }
    throw_exception(eInvalidArgument);
  }

 private:
  // Tagged union; the active member is tracked by Value::type_. Heap-backed
  // members are raw owning pointers, released via destroy(type).
  union ValueData {
    Boolean boolean;
    Integer number_integer;
    Unsigned number_unsigned;
    Float number_float;
    String* string;
    Binary* binary;
    Array* array;
    Object* object;
    Dynamic* dynamic;
    Pointer* pointer;
    Any* any;
    ValueData() = default;
    ValueData(Boolean v) noexcept : boolean(v) {}
    ValueData(Integer v) noexcept : number_integer(v) {}
    ValueData(Unsigned v) noexcept : number_unsigned(v) {}
    ValueData(Float v) noexcept : number_float(v) {}
    // Default-initializes the alternative for the given tag; kDynamic is not
    // supported here and hits the throwing default.
    ValueData(Type type) {
      switch (type) {
        case Type::kBool:
          boolean = Boolean{};
          break;
        case Type::kInt:
          number_integer = Integer{};
          break;
        case Type::kUInt:
          number_unsigned = Unsigned{};
          break;
        case Type::kFloat:
          number_float = Float{};
          break;
        case Type::kString:
          string = create<String>();
          break;
        case Type::kBinary:
          binary = create<Binary>();
          break;
        case Type::kArray:
          array = create<Array>();
          break;
        case Type::kObject:
          object = create<Object>();
          break;
        case Type::kPointer:
          pointer = create<Pointer>();
          break;
        case Type::kAny:
          any = create<Any>();
          break;
        case Type::kNull:
          object = nullptr;
          break;
        default:
          throw_exception(eNotSupported);
      }
    }
    ValueData(const String& value) { string = create<String>(value); }
    ValueData(String&& value) { string = create<String>(std::move(value)); }
    ValueData(const Binary& value) { binary = create<Binary>(value); }
    ValueData(Binary&& value) { binary = create<Binary>(std::move(value)); }
    ValueData(const Object& value) { object = create<Object>(value); }
    ValueData(Object&& value) { object = create<Object>(std::move(value)); }
    ValueData(const Array& value) { array = create<Array>(value); }
    ValueData(Array&& value) { array = create<Array>(std::move(value)); }
    ValueData(const Pointer& value) { pointer = create<Pointer>(value); }
    ValueData(Pointer&& value) { pointer = create<Pointer>(std::move(value)); }
    // nlohmann/json used an iterative implementation
    // Frees the alternative selected by `t`; deep recursion on nested
    // arrays/objects happens via the elements' own destructors.
    void destroy(ValueType t) {
      switch (t) {
        case ValueType::kString:
          release(string);
          break;
        case ValueType::kBinary:
          release(binary);
          break;
        case ValueType::kArray:
          release(array);
          break;
        case ValueType::kObject:
          release(object);
          break;
        case ValueType::kPointer:
          release(pointer);
          break;
        case ValueType::kAny:
          release(any);
          break;
        default:
          break;
      }
    }
  };
  ValueType type_ = ValueType::kNull;  // active alternative of data_
  ValueData data_ = {};
};
namespace detail {
// Helper for brace-initializing Values (mirrors nlohmann::json's json_ref):
// binds either an owned temporary or a borrowed lvalue, and
// moved_or_copied() later moves or copies accordingly.
class ValueRef {
 public:
  // Rvalue: take ownership by moving into owned_value_.
  ValueRef(Value&& value)
      : owned_value_(std::move(value)), value_ref_(&owned_value_), is_rvalue_(true) {}
  // Lvalue: borrow. const is cast away, but moved_or_copied() only copies
  // through this pointer when is_rvalue_ is false, so the referent is never
  // mutated.
  ValueRef(const Value& value) : value_ref_(const_cast<Value*>(&value)), is_rvalue_(false) {}
  // Nested brace list: materialize an owned Value from it.
  ValueRef(std::initializer_list<ValueRef> init)
      : owned_value_(init), value_ref_(&owned_value_), is_rvalue_(true) {}
  // Anything else Value can be constructed from: build an owned Value.
  template <typename... Args, std::enable_if_t<std::is_constructible_v<Value, Args...>, int> = 0>
  ValueRef(Args&&... args)
      : owned_value_(std::forward<Args>(args)...), value_ref_(&owned_value_), is_rvalue_(true) {}
  ValueRef(ValueRef&&) = default;
  ValueRef(const ValueRef&) = delete;
  ValueRef& operator=(const ValueRef&) = delete;
  ValueRef& operator=(ValueRef&&) = delete;
  ~ValueRef() = default;
  // Moves out the owned value, or copies the borrowed one. mutable
  // owned_value_ allows the move from a const method (single-use contract).
  Value moved_or_copied() const {
    if (is_rvalue_) {
      return std::move(*value_ref_);
    }
    return *value_ref_;
  }
  const Value& operator*() const { return *static_cast<const Value*>(value_ref_); }
  const Value* operator->() const { return static_cast<const Value*>(value_ref_); }

 private:
  mutable Value owned_value_;
  Value* value_ref_ = nullptr;
  const bool is_rvalue_ = true;
};
} // namespace detail
// Brace-list construction with nlohmann-style type deduction: the list is an
// object iff every element is a 2-element array whose first element is a
// string (i.e. {{"key", value}, ...}); otherwise it is an array. Callers can
// force a kind via (init, false, manual_type).
inline Value::Value(std::initializer_list<ValueRef> init, bool type_deduction, Type manual_type) {
  bool is_an_object = true;
  for (const auto& x : init) {
    if (!(x->_is_array() && x->_size() == 2 && x->_front()._is_string())) {
      is_an_object = false;
      break;
    }
  }
  if (!type_deduction) {
    // Forced array always wins; forced object on a non-object shape throws.
    if (manual_type == Type::kArray) {
      is_an_object = false;
    }
    if (manual_type == Type::kObject && !is_an_object) {
      throw_exception(eInvalidArgument);
    }
  }
  if (is_an_object) {
    type_ = Type::kObject;
    data_ = Type::kObject;
    for (const auto& x : init) {
      // Each element is a ["key", value] pair; move key and value out of the
      // materialized temporary.
      auto e = x.moved_or_copied();
      data_.object->emplace(std::move(*((*e.data_.array)[0].data_.string)),
                            std::move((*e.data_.array)[1]));
    }
  } else {
    type_ = Type::kArray;
    // Array ctor copies/moves each ValueRef via Value's ValueRef ctor.
    data_.array = create<Array>(init.begin(), init.end());
  }
}
// Wraps `v` in a shared node, producing a kPointer Value that refers to it.
inline Value make_pointer(Value v) {
  auto node = std::make_shared<Value>(std::move(v));
  return Value(std::move(node));
}
// Depth-limited recursive merge of `src` into `dst`. Keys missing from
// `dst` are inserted; when both sides hold objects for the same key and
// depth allows, they are merged recursively. Otherwise existing entries in
// `dst` are kept as-is. A negative depth is a no-op.
inline void update(Value::Object& dst, const Value::Object& src, int depth) {
  if (depth < 0) {
    return;
  }
  for (auto it = src.begin(); it != src.end(); ++it) {
    const auto [pos, inserted] = dst.insert(*it);
    const bool both_objects = pos->second.is_object() && it->second.is_object();
    if (!inserted && both_objects) {
      update(pos->second.object(), it->second.object(), depth - 1);
    }
  }
}
} // namespace mmdeploy
#endif // MMDEPLOY_TYPES_VALUE_H_
# Copyright (c) OpenMMLab. All rights reserved.
# Device backends: the CPU implementation is always built; CUDA and ACL
# (Ascend NPU) are added only when selected in the build configuration.
add_subdirectory(cpu)
if ("cuda" IN_LIST MMDEPLOY_TARGET_DEVICES)
add_subdirectory(cuda)
endif ()
if ("acl" IN_LIST MMDEPLOY_TARGET_BACKENDS)
add_subdirectory(acl)
endif ()
# Copyright (c) OpenMMLab. All rights reserved.
# ACL device module: compiles every .cpp in this directory into an mmdeploy
# module library.
project(mmdeploy_acl_device)
file(GLOB_RECURSE SRCS "*.cpp")
mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
// Copyright (c) OpenMMLab. All rights reserved.
#include "mmdeploy/core/device_impl.h"
namespace mmdeploy::framework {
// Static-initialization hook: registers "npu" as an alias of the "cpu"
// platform so code addressing the NPU device resolves to the CPU platform
// implementation (presumably because ACL memory is host-accessible here —
// TODO confirm the rationale).
class AclPlatformRegisterer {
 public:
  AclPlatformRegisterer() { gPlatformRegistry().AddAlias("npu", "cpu"); }
};
// Global instance whose constructor runs at load time to perform the
// registration.
AclPlatformRegisterer g_acl_platform_registerer;
}  // namespace mmdeploy::framework
# Copyright (c) OpenMMLab. All rights reserved.
# CPU device module: builds the host platform implementation and links the
# threading library (CpuStreamImpl runs tasks on a worker thread).
project(mmdeploy_cpu_device)
file(GLOB_RECURSE SRCS "*.cpp")
mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
target_link_libraries(${PROJECT_NAME} PRIVATE Threads::Threads)
add_library(mmdeploy::device::cpu ALIAS ${PROJECT_NAME})
// Copyright (c) OpenMMLab. All rights reserved.
#include "cpu_device.h"
#include <cassert>
#include <cstdlib>
#include <cstring>
namespace mmdeploy::framework {
// Host memory block for the CPU platform. Either owns an aligned allocation
// (first Init overload) or wraps externally-provided memory (raw pointer or
// shared_ptr) without taking ownership.
// NOTE(review): calling Init a second time does not release a previous
// owned allocation — callers appear to Init exactly once; confirm.
class CpuHostMemory : public NonCopyable {
 public:
  CpuHostMemory() : size_(), data_(), owned_data_{false} {}
  // Allocates `size` bytes aligned to `alignment` (raised to at least
  // sizeof(void*)). The allocation size is rounded up to a multiple of the
  // alignment, as required by aligned_alloc.
  Result<void> Init(size_t size, size_t alignment) {
    alignment = std::max(alignment, sizeof(void*));
    auto space = (size + alignment - 1) / alignment * alignment;
#ifdef _MSC_VER
    data_ = _aligned_malloc(space, alignment);
#elif defined(ANDROID)
    // posix_memalign leaves the out-pointer unspecified on failure, so the
    // return code must be checked explicitly.
    if (posix_memalign(&data_, alignment, space) != 0) {
      data_ = nullptr;
    }
#else
    data_ = aligned_alloc(alignment, space);
#endif
    if (!data_) {
      return Status(eOutOfMemory);
    }
    aligned_data_ = data_;
    size_ = size;
    owned_data_ = true;
    return success();
  }
  // Wraps externally-managed memory; keeps the shared_ptr alive for the
  // lifetime of this object but never frees the memory itself.
  Result<void> Init(size_t size, std::shared_ptr<void> data) {
    size_ = size;
    external_ = std::move(data);
    data_ = external_.get();
    owned_data_ = false;
    return success();
  }
  // Wraps a raw external pointer; the caller retains ownership.
  Result<void> Init(size_t size, void* data) {
    size_ = size;
    data_ = data;
    owned_data_ = false;
    return success();
  }
  ~CpuHostMemory() {
    if (data_) {
      if (owned_data_) {
#ifdef _MSC_VER
        _aligned_free(data_);
#else
        std::free(data_);
#endif
        owned_data_ = false;
      }
      data_ = nullptr;
    }
    external_.reset();
    size_ = 0;
  }
  size_t size() const { return size_; }
  // For owned allocations returns the aligned pointer; for external memory
  // returns the wrapped pointer as-is.
  void* data() const { return owned_data_ ? aligned_data_ : data_; }

 private:
  size_t size_;
  void* data_;
  void* aligned_data_{nullptr};
  bool owned_data_;
  std::shared_ptr<void> external_;  // keeps external shared memory alive
};
////////////////////////////////////////////////////////////////////////////////
/// CpuPlatformImpl
// Binds the calling context to `device`. The CPU platform has no per-thread
// device state, so this only reports the "previous" device (echoing the
// requested one) and always succeeds.
Result<void> CpuPlatformImpl::BindDevice(Device device, Device* prev) {
  // do nothing
  if (prev) {
    *prev = device;
  }
  return success();
}
// Factory for CPU buffer implementations bound to `device`.
shared_ptr<BufferImpl> CpuPlatformImpl::CreateBuffer(Device device) {
  auto buffer = std::make_shared<CpuBufferImpl>(device);
  return buffer;
}
// Factory for CPU stream implementations bound to `device`.
shared_ptr<StreamImpl> CpuPlatformImpl::CreateStream(Device device) {
  auto stream = std::make_shared<CpuStreamImpl>(device);
  return stream;
}
// Factory for CPU event implementations bound to `device`.
shared_ptr<EventImpl> CpuPlatformImpl::CreateEvent(Device device) {
  auto event = std::make_shared<CpuEventImpl>(device);
  return event;
}
int CpuPlatformImpl::GetPlatformId() const noexcept { return 0; }
const char* CpuPlatformImpl::GetPlatformName() const noexcept { return "cpu"; }
// Validates that [src_offset, src_offset + copy_size) fits in the source and
// [dst_offset, dst_offset + copy_size) fits in the destination.
// The comparisons are rearranged so `offset + copy_size` cannot wrap around
// size_t and falsely pass the bounds check.
bool CpuPlatformImpl::CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset,
                                     size_t dst_offset, size_t copy_size) {
  if (copy_size > src_size || src_offset > src_size - copy_size) {
    return false;
  }
  if (copy_size > dst_size || dst_offset > dst_size - copy_size) {
    return false;
  }
  return true;
}
// Advances `ptr` by `offset` bytes.
inline void* OffsetPtr(void* ptr, size_t offset) {
  auto* bytes = static_cast<uint8_t*>(ptr);
  return bytes + offset;
}
// Const overload: advances `ptr` by `offset` bytes.
inline const void* OffsetPtr(const void* ptr, size_t offset) {
  auto* bytes = static_cast<const uint8_t*>(ptr);
  return bytes + offset;
}
// Shared implementation for all Copy overloads. Performs the memcpy
// synchronously when no stream is given, otherwise queues it on the CPU
// stream's worker thread.
Result<void> CpuPlatformImpl::CopyImpl(const void* src, void* dst, size_t src_size, size_t dst_size,
                                       size_t src_offset, size_t dst_offset, size_t size,
                                       Stream st) {
  if (!CheckCopyParam(src_size, dst_size, src_offset, dst_offset, size)) {
    return Status(eInvalidArgument);
  }
  auto copy_task = [src, dst, src_offset, dst_offset, size] {
    std::memcpy(OffsetPtr(dst, dst_offset), OffsetPtr(src, src_offset), size);
  };
  if (!st) {
    // no stream: copy eagerly on the calling thread
    copy_task();
    return success();
  }
  if (st.GetDevice().platform_id() != 0) {
    return Status(eInvalidArgument);
  }
  if (auto* cpu_stream = static_cast<CpuStreamImpl*>(st.GetNative())) {
    return cpu_stream->Enqueue(std::move(copy_task));
  }
  return Status(eInvalidArgument);
}
// Copies `size` bytes from host memory into `dst` starting at `dst_offset`.
// The destination buffer must live on the CPU platform.
Result<void> CpuPlatformImpl::Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset,
                                   Stream stream) {
  auto* dst_ptr = dst.GetNative();
  if (dst_ptr == nullptr || dst.GetDevice().platform_id() != 0) {
    return Status(eInvalidArgument);
  }
  return CopyImpl(host_ptr, dst_ptr, size, dst.GetSize(), 0, dst_offset, size, stream);
}
// Copies `size` bytes from `src` (starting at `src_offset`) into host memory.
// The source buffer must live on the CPU platform.
Result<void> CpuPlatformImpl::Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,
                                   Stream stream) {
  auto* src_ptr = src.GetNative();
  if (src_ptr == nullptr || src.GetDevice().platform_id() != 0) {
    return Status(eInvalidArgument);
  }
  return CopyImpl(src_ptr, host_ptr, src.GetSize(), size, src_offset, 0, size, stream);
}
// Buffer-to-buffer copy; both buffers must live on the CPU platform.
Result<void> CpuPlatformImpl::Copy(Buffer src, Buffer dst, size_t size, size_t src_offset,
                                   size_t dst_offset, Stream stream) {
  auto* src_ptr = src.GetNative();
  auto* dst_ptr = dst.GetNative();
  if (src_ptr == nullptr || dst_ptr == nullptr) {
    return Status(eInvalidArgument);
  }
  const auto src_platform = src.GetDevice().platform_id();
  if (src_platform != 0 || src_platform != dst.GetDevice().platform_id()) {
    return Status(eInvalidArgument);
  }
  return CopyImpl(src_ptr, dst_ptr, src.GetSize(), dst.GetSize(), src_offset, dst_offset, size,
                  stream);
}
// Lazily creates the per-platform default stream exactly once; any exception
// thrown during construction is reported as eFail.
Result<Stream> CpuPlatformImpl::GetDefaultStream(int32_t device_id) {
  try {
    std::call_once(init_flag_, [&] { default_stream_ = Stream(GetDevice(device_id)); });
    return default_stream_;
  } catch (...) {
    return Status(eFail);
  }
}
////////////////////////////////////////////////////////////////////////////////
/// CpuBufferImpl
CpuBufferImpl::CpuBufferImpl(Device device) : BufferImpl(device) {}
// Returns a pointer to the start of this (sub-)buffer, or nullptr when the
// buffer has not been initialized.
void* CpuBufferImpl::GetNative(ErrorCode* ec) {
  if (memory_) {
    if (ec) *ec = ErrorCode::eSuccess;
    return OffsetPtr(memory_->data(), offset_);
  }
  if (ec) *ec = eInvalidArgument;
  return nullptr;
}
Allocator CpuBufferImpl::GetAllocator() const { return {}; }
// Returns the logical size of this (sub-)buffer; 0 with eInvalidArgument when
// the buffer has no backing memory.
size_t CpuBufferImpl::GetSize(ErrorCode* ec) {
  if (memory_) {
    if (ec) *ec = ErrorCode::eSuccess;
    return size_;
  }
  if (ec) *ec = eInvalidArgument;
  return 0;
}
// int CpuBufferImpl::Fill(uint8_t pattern, size_t size, size_t offset,
// Stream& st) {
// if (!memory_ || !memory_->handle) {
// return Status(eInvalidArgument);
// }
// if (offset + size >= size_) {
// return Status(eInvalidArgument);
// }
// auto task = [=] {
// auto data = OffsetPtr(memory_->handle, offset);
// std::memset(data, pattern, size);
// };
// if (!st) {
// task();
// return M_SUCCESS;
// }
// if (st.GetDevice() != Device()) {
// return Status(eInvalidArgument);
// }
// auto cpu_stream = static_cast<CpuStreamImpl*>(st.GetNative());
// if (!cpu_stream) {
// return Status(eInvalidArgument);
// }
// return cpu_stream->Enqueue(std::move(task));
// }
// Allocates `size` bytes of host memory with the requested alignment.
// A custom allocator is not supported on the CPU platform (asserted).
Result<void> CpuBufferImpl::Init(size_t size, Allocator allocator, size_t alignment,
                                 uint64_t flags) {
  assert(!allocator && "CPU device doesn't support allocators yet");
  memory_ = std::make_shared<CpuHostMemory>();
  OUTCOME_TRY(memory_->Init(size, alignment));
  size_ = size;
  return success();
}
// Wraps externally managed memory; ownership stays with `native`.
Result<void> CpuBufferImpl::Init(size_t size, std::shared_ptr<void> native, uint64_t flags) {
  auto memory = std::make_shared<CpuHostMemory>();
  OUTCOME_TRY(memory->Init(size, std::move(native)));
  memory_ = std::move(memory);
  size_ = size;
  return success();
}
// Creates a view into this buffer sharing the same backing memory.
// `offset` is relative to this buffer's own offset.
Result<BufferImplPtr> CpuBufferImpl::SubBuffer(size_t offset, size_t size, uint64_t flags) {
  // reject sub-buffers of an uninitialized buffer instead of dereferencing null
  if (!memory_) {
    return Status(eInvalidArgument);
  }
  if (offset_ + offset + size > memory_->size()) {
    return Status(eInvalidArgument);
  }
  auto impl = std::make_shared<CpuBufferImpl>(device_);
  impl->memory_ = memory_;  // shared backing storage keeps memory alive
  impl->offset_ = offset_ + offset;
  impl->size_ = size;
  return impl;
}
////////////////////////////////////////////////////////////////////////////////
/// CpuStreamImpl
CpuStreamImpl::CpuStreamImpl(Device device) : StreamImpl(device) {}
// Signals the worker thread to stop and joins it.
CpuStreamImpl::~CpuStreamImpl() {
  {
    std::lock_guard lock(mutex_);
    abort_ = true;
  }
  cv_.notify_one();
  // Init() may never have been called, in which case no worker thread exists;
  // joining a non-joinable std::thread throws std::system_error.
  if (thread_.joinable()) {
    thread_.join();
  }
}
// Starts the worker thread that drains the task queue; `flags` is unused.
Result<void> CpuStreamImpl::Init(uint64_t flags) {
  thread_ = std::thread(&CpuStreamImpl::InternalThreadEntry, this);
  return success();
}
// Wrapping a native stream handle is meaningless for the CPU backend.
Result<void> CpuStreamImpl::Init(std::shared_ptr<void> native, uint64_t flags) {
  return Status(eNotSupported);
}
// Appends a task to the queue and wakes the worker thread. Notifying after
// releasing the lock avoids waking the worker only to block on the mutex.
Result<void> CpuStreamImpl::Enqueue(Task task) {
  std::unique_lock lock{mutex_};
  task_queue_.push(std::move(task));
  lock.unlock();
  cv_.notify_one();
  return success();
}
// Makes work enqueued after this call wait until `event` is signalled.
// NOTE(review): the lambda captures `event` by reference; if the Event is
// destroyed before the queued task runs this dangles — confirm callers keep
// the event alive for the duration of the stream's work.
Result<void> CpuStreamImpl::DependsOn(Event& event) {
  return Enqueue([&] { event.Wait().value(); });
}
// Non-blocking idle check: succeeds only when no tasks remain queued.
Result<void> CpuStreamImpl::Query() {
  std::lock_guard lock(mutex_);
  if (!task_queue_.empty()) {
    return Status(eFail);
  }
  return success();
}
// Blocks until the worker has drained the queue (or the stream is shutting
// down). The trailing notify forwards the wake-up to any other waiter since
// the worker only notifies once per completed task.
Result<void> CpuStreamImpl::Wait() {
  {
    std::unique_lock lock(mutex_);
    cv_.wait(lock, [this] { return task_queue_.empty() || abort_; });
  }
  cv_.notify_one();
  return success();
}
// Queues a CPU kernel (a plain std::function wrapped by CpuKernelImpl).
// The kernel must have been created for this stream's device.
Result<void> CpuStreamImpl::Submit(Kernel& kernel) {
  if (kernel.GetDevice() != GetDevice()) {
    return Status(eInvalidArgument);
  }
  if (auto* task = static_cast<Task*>(kernel.GetNative())) {
    return Enqueue(*task);
  }
  return Status(eInvalidArgument);
}
// The "native handle" of a CPU stream is the implementation object itself.
void* CpuStreamImpl::GetNative(ErrorCode* ec) {
  if (ec) *ec = ErrorCode::eSuccess;
  return this;
}
// Worker loop: executes queued tasks one at a time, outside the lock.
void CpuStreamImpl::InternalThreadEntry() {
  while (true) {
    Task task;
    {
      std::unique_lock lock(mutex_);
      cv_.wait(lock, [this] { return !task_queue_.empty() || abort_; });
      if (abort_) {
        break;
      }
      // The task is only *moved out* here and popped after it has run, so
      // Query()/Wait() never observe an empty queue while a task is still
      // executing.
      task = std::move(task_queue_.front());
    }
    if (task) {
      task();
    }
    {
      std::lock_guard lock(mutex_);
      task_queue_.pop();
    }
    // wake threads blocked in Wait()
    cv_.notify_one();
  }
}
////////////////////////////////////////////////////////////////////////////////
/// CpuEventImpl
CpuEventImpl::CpuEventImpl(Device device) : EventImpl(device) {}
// Arms a fresh promise/future pair; `flags` is unused.
Result<void> CpuEventImpl::Init(uint64_t flags) {
  Reset();
  return success();
};
// There is no native event handle on the CPU backend.
Result<void> CpuEventImpl::Init(std::shared_ptr<void> native, uint64_t flags) {
  return Status(eNotSupported);
};
// Non-blocking readiness check of the recorded completion.
Result<void> CpuEventImpl::Query() {
  const bool ready =
      future_.wait_for(std::chrono::microseconds::zero()) == std::future_status::ready;
  if (!ready) {
    return Status(eNotReady);
  }
  return success();
}
// Re-arms the event and queues a task that fulfils it, i.e. the event becomes
// ready once all work previously enqueued on `stream` has finished.
Result<void> CpuEventImpl::Record(Stream& stream) {
  if (stream.GetDevice() != device_) {
    return Status(eInvalidArgument);
  }
  auto cpu_stream = static_cast<CpuStreamImpl*>(stream.GetNative());
  if (!cpu_stream) return Status(eInvalidArgument);
  Reset();
  return cpu_stream->Enqueue([this] { promise_.set_value(); });
}
// Blocks until the event is signalled (promise fulfilled by the stream task).
Result<void> CpuEventImpl::Wait() {
  future_.wait();
  return success();
};
// Discards any previous state and arms a fresh promise/future pair.
void CpuEventImpl::Reset() {
  promise_ = std::promise<void>();
  future_ = promise_.get_future();
}
// The "native handle" of a CPU event is the implementation object itself.
void* CpuEventImpl::GetNative(ErrorCode* ec) {
  if (ec) *ec = ErrorCode::eSuccess;
  return this;
}
////////////////////////////////////////////////////////////////////////////////
// Wraps an arbitrary callable as a Kernel bound to CPU device 0.
Kernel CreateCpuKernel(std::function<void()> task) {
  return Kernel(std::make_shared<CpuKernelImpl>(gCpuPlatform().GetDevice(0), std::move(task)));
}
////////////////////////////////////////////////////////////////////////////////
/// CpuPlatformRegisterer
// Singleton accessor for the CPU platform implementation.
CpuPlatformImpl& gCpuPlatform() {
  static Platform platform("cpu");
  return Access::get<CpuPlatformImpl>(platform);
}
// Registers the CPU platform with the global registry at static-init time.
class CpuPlatformRegisterer {
 public:
  CpuPlatformRegisterer() {
    gPlatformRegistry().Register([] { return std::make_shared<CpuPlatformImpl>(); });
  }
};
CpuPlatformRegisterer g_cpu_platform_registerer;
} // namespace mmdeploy::framework
// Copyright (c) OpenMMLab. All rights reserved.
#include <condition_variable>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>
#include "mmdeploy/core/device_impl.h"
#include "mmdeploy/core/types.h"
namespace mmdeploy::framework {
// CPU implementation of the platform interface. Platform id is fixed to 0;
// copies are plain memcpy, optionally ordered via a CpuStreamImpl worker.
class CpuPlatformImpl : public PlatformImpl {
 public:
  int GetPlatformId() const noexcept override;
  const char* GetPlatformName() const noexcept override;
  Result<void> BindDevice(Device device, Device* prev) override;
  shared_ptr<BufferImpl> CreateBuffer(Device device) override;
  shared_ptr<StreamImpl> CreateStream(Device device) override;
  shared_ptr<EventImpl> CreateEvent(Device device) override;
  Result<void> Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset,
                    Stream stream) override;
  Result<void> Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,
                    Stream stream) override;
  Result<void> Copy(Buffer src, Buffer dst, size_t size, size_t src_offset, size_t dst_offset,
                    Stream stream) override;
  Result<Stream> GetDefaultStream(int32_t device_id) override;
  // Convenience: a device handle on this platform.
  Device GetDevice(int device_id) const { return Device(GetPlatformId(), device_id); }

 private:
  // Validates that [offset, offset + copy_size) fits in both buffers.
  static bool CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset, size_t dst_offset,
                             size_t copy_size);
  // Shared memcpy implementation; synchronous when `st` is null.
  static Result<void> CopyImpl(const void* src, void* dst, size_t src_size, size_t dst_size,
                               size_t src_offset, size_t dst_offset, size_t size, Stream st);
  Stream default_stream_;     // lazily created by GetDefaultStream
  std::once_flag init_flag_;  // guards default_stream_ creation
};
CpuPlatformImpl& gCpuPlatform();
class CpuHostMemory;
// CPU buffer: a (possibly offset) view into a shared CpuHostMemory block.
class CpuBufferImpl : public BufferImpl {
 public:
  explicit CpuBufferImpl(Device device);
  Result<void> Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) override;
  Result<void> Init(size_t size, std::shared_ptr<void> native, uint64_t flags) override;
  Result<BufferImplPtr> SubBuffer(size_t offset, size_t size, uint64_t flags) override;
  void* GetNative(ErrorCode* ec) override;
  Allocator GetAllocator() const override;
  size_t GetSize(ErrorCode* ec) override;

 private:
  std::shared_ptr<CpuHostMemory> memory_;  // shared with sub-buffers
  size_t offset_{0};                       // byte offset into memory_
  size_t size_{0};                         // logical size of this view
};
// CPU stream: a FIFO of tasks executed by a dedicated worker thread.
class CpuStreamImpl : public StreamImpl {
 public:
  using Task = std::function<void()>;
  explicit CpuStreamImpl(Device device);
  ~CpuStreamImpl() override;
  // Spawns the worker thread; must be called before use.
  Result<void> Init(uint64_t flags) override;
  // Wrapping a native handle is not supported for CPU streams.
  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;
  Result<void> Enqueue(Task task);
  Result<void> DependsOn(Event& event) override;
  Result<void> Query() override;
  Result<void> Wait() override;
  Result<void> Submit(Kernel& kernel) override;
  // Returns `this` (there is no OS-level stream handle).
  void* GetNative(ErrorCode* ec) override;

 private:
  void InternalThreadEntry();
  std::mutex mutex_;             // guards task_queue_ and abort_
  std::condition_variable cv_;
  std::queue<Task> task_queue_;
  std::thread thread_;
  // NOTE(review): this member is never referenced in cpu_device.cpp and may
  // shadow state held by the StreamImpl base — confirm before removing.
  Device device_;
  bool abort_{false};            // set by the destructor to stop the worker
};
// CPU event built on a promise/future pair; Record() re-arms the pair and
// fulfils it from a task queued on the recording stream.
class CpuEventImpl : public EventImpl {
 public:
  explicit CpuEventImpl(Device device);
  ~CpuEventImpl() override = default;
  Result<void> Init(uint64_t flags) override;
  // Native event handles are not supported on the CPU backend.
  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;
  Result<void> Query() override;
  Result<void> Record(Stream& stream) override;
  Result<void> Wait() override;
  // Returns `this` (there is no OS-level event handle).
  void* GetNative(ErrorCode* ec) override;

 private:
  // Arms a fresh promise/future pair.
  void Reset();
  std::shared_future<void> future_;
  std::promise<void> promise_;
};
// CPU kernel: simply wraps a callable; CpuStreamImpl::Submit retrieves it
// through GetNative() and enqueues it.
class CpuKernelImpl : public KernelImpl {
 public:
  using Task = CpuStreamImpl::Task;
  explicit CpuKernelImpl(Device device, Task task) : KernelImpl(device), task_(std::move(task)) {}
  // Returns a pointer to the stored Task.
  void* GetNative(ErrorCode* ec) override {
    if (ec) *ec = ErrorCode::eSuccess;
    return &task_;
  }

 private:
  Task task_;
};
} // namespace mmdeploy::framework
# Copyright (c) OpenMMLab. All rights reserved.
# Build target for the CUDA device backend module.
project(mmdeploy_cuda_device)
set(SRCS cuda_device.cpp)
mmdeploy_add_module(${PROJECT_NAME} "${SRCS}")
# CUDA runtime headers and driver API (cuCtxGetCurrent in cuda_device.cpp).
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PRIVATE ${CUDA_LIBRARIES} cuda)
add_library(mmdeploy::device::cuda ALIAS ${PROJECT_NAME})
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_
#define MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_
#include <cuda_runtime.h>
#include <atomic>
#include <chrono>
#include <list>
#include <mutex>
#include <vector>
#include "mmdeploy/core/logger.h"
#include "mmdeploy/device/cuda/default_allocator.h"
namespace mmdeploy::cuda {
class BuddyAllocator {
public:
using size_type = std::size_t;
BuddyAllocator(size_type size, size_type block_size) {
block_size_ = block_size;
block_count_ = size / block_size_;
if (!IsPowerOfTwo(block_count_)) {
block_count_ = RoundToPowerOfTwo(block_count_);
MMDEPLOY_WARN("Rounding up block_count to next power of 2 {}", block_count_);
}
base_ = LogPowerOfTwo(block_count_);
size_ = block_size_ * block_count_;
memory_ = gDefaultAllocator().Allocate(size_);
tree_.resize(block_count_ * 2);
free_.resize(base_ + 1);
Build(1, 0);
Add(1, 0);
MMDEPLOY_ERROR("size = {}, block_size = {}, block_count = {}", size_, block_size_,
block_count_);
size = size_;
for (int i = 0; i <= base_; ++i) {
MMDEPLOY_ERROR("level {}, size = {}", i, size);
size /= 2;
}
}
~BuddyAllocator() {
for (int i = 0; i < free_.size(); ++i) {
MMDEPLOY_ERROR("free_[{}].size(): {}", i, free_[i].size());
}
gDefaultAllocator().Deallocate(memory_, size_);
}
[[nodiscard]] void* Allocate(size_type n) {
std::lock_guard lock{mutex_};
if (n > size_) {
return nullptr;
}
auto n_level = GetLevel(n);
auto level = n_level;
for (; level >= 0; --level) {
if (!free_[level].empty()) {
break;
}
}
if (level < 0) {
MMDEPLOY_WARN("failed to allocate memory size = {} bytes", n);
return nullptr;
}
for (; level < n_level; ++level) {
auto index = free_[level].front();
Split(index, level);
}
auto index = free_[level].front();
Del(index, level);
auto offset = (index ^ (1 << level)) << (base_ - level);
auto p = static_cast<uint8_t*>(memory_) + offset * block_size_;
return p;
}
void Deallocate(void* p, size_type n) {
std::lock_guard lock{mutex_};
auto offset = static_cast<uint8_t*>(p) - static_cast<uint8_t*>(memory_);
if (offset < 0 || offset % block_size_) {
MMDEPLOY_ERROR("invalid address: {}", p);
}
offset /= static_cast<long>(block_size_);
auto level = GetLevel(n);
auto index = (offset >> (base_ - level)) ^ (1 << level);
Add(index, level);
while (index > 1) {
auto buddy = index ^ 1;
if (tree_[buddy] != free_[level].end()) {
Merge(index, level);
index /= 2;
--level;
} else {
break;
}
}
}
private:
void Add(size_type index, size_type level) {
assert(tree_[index] == free_[level].end());
tree_[index] = free_[level].insert(free_[level].end(), index);
}
void Del(size_type index, size_type level) {
assert(tree_[index] != free_[level].end());
free_[level].erase(tree_[index]);
tree_[index] = free_[level].end();
}
void Split(size_type index, size_type level) {
Del(index, level);
Add(index * 2, level + 1);
Add(index * 2 + 1, level + 1);
}
void Merge(size_type index, size_type level) {
Del(index, level);
Del(index ^ 1, level);
Add(index / 2, level - 1);
}
size_type GetLevel(size_type size) const {
size = RoundToPowerOfTwo((size + block_size_ - 1) / block_size_);
return base_ - LogPowerOfTwo(size);
}
static bool IsPowerOfTwo(size_type n) { return (n & (n - 1)) == 0; }
static size_type RoundToPowerOfTwo(size_type n) {
--n;
n |= (n >> 1);
n |= (n >> 2);
n |= (n >> 4);
n |= (n >> 8);
n |= (n >> 16);
n |= (n >> 32);
return ++n;
}
static size_type LogPowerOfTwo(size_type v) {
size_type r{};
r |= ((v & 0xFFFFFFFF00000000) != 0) << 5;
r |= ((v & 0xFFFF0000FFFF0000) != 0) << 4;
r |= ((v & 0xFF00FF00FF00FF00) != 0) << 3;
r |= ((v & 0xF0F0F0F0F0F0F0F0) != 0) << 2;
r |= ((v & 0xCCCCCCCCCCCCCCCC) != 0) << 1;
r |= ((v & 0xAAAAAAAAAAAAAAAA) != 0);
return r;
}
void Build(size_type index, size_type level) {
if (index < tree_.size()) {
tree_[index] = free_[level].end();
index *= 2;
++level;
Build(index, level);
Build(index + 1, level);
}
}
private:
size_type size_;
size_type block_size_;
size_type block_count_;
size_type base_;
void* memory_;
std::vector<std::list<size_type>::iterator> tree_;
std::vector<std::list<size_type> > free_;
std::mutex mutex_;
};
// Process-wide buddy allocator: 1 GiB slab split into 64 KiB blocks.
inline BuddyAllocator& gBuddyAllocator() {
  static BuddyAllocator v(1U << 30, 1024 * 64);
  return v;
}
} // namespace mmdeploy::cuda
#endif // MMDEPLOY_SRC_DEVICE_CUDA_BUDDY_ALLOCATOR_H_
// Copyright (c) OpenMMLab. All rights reserved.
#include "cuda_device.h"
#include <cuda.h>
#include "mmdeploy/device/device_allocator.h"
namespace mmdeploy::framework {
// Advances `ptr` by `offset` bytes.
inline void* OffsetPtr(void* ptr, size_t offset) {
  auto* bytes = static_cast<uint8_t*>(ptr);
  return bytes + offset;
}
// Const overload: advances `ptr` by `offset` bytes.
inline const void* OffsetPtr(const void* ptr, size_t offset) {
  auto* bytes = static_cast<const uint8_t*>(ptr);
  return bytes + offset;
}
// Translates the framework's MemcpyKind into the CUDA runtime enum;
// unknown values fall back to cudaMemcpyDefault (direction inferred by UVA).
cudaMemcpyKind MapMemcpyKindToCuda(MemcpyKind kind) {
  switch (kind) {
    case MemcpyKind::HtoD:
      return cudaMemcpyHostToDevice;
    case MemcpyKind::DtoH:
      return cudaMemcpyDeviceToHost;
    case MemcpyKind::DtoD:
      return cudaMemcpyDeviceToDevice;
    default:
      return cudaMemcpyDefault;
  }
}
namespace cuda {
// Thin cudaMalloc/cudaFree allocator; used as the base of the default
// allocator chain.
class Mallocator : public AllocatorImpl {
 public:
  Block Allocate(size_t size) noexcept override {
    if (size == 0) {
      return Block{};
    }
    // `Block block;` left `handle` uninitialized and `size` was set even when
    // cudaMalloc failed, so callers could observe a garbage non-null handle.
    // Return an empty block (null handle, zero size) on failure instead.
    Block block{};
    if (auto status = cudaMalloc(&block.handle, size); status != cudaSuccess) {
      return Block{};
    }
    block.size = size;
    return block;
  }
  void Deallocate(Block& block) noexcept override {
    if (!block.handle) {
      return;
    }
    cudaFree(block.handle);
  }
  // The raw cudaMalloc allocator can free any device pointer.
  bool Owns(const Block& block) const noexcept override { return true; }
};
// Builds the default CUDA allocator stack: raw cudaMalloc, wrapped in a
// caching tree allocator, wrapped in a mutex for thread safety.
Allocator CreateDefaultAllocator() {
  using namespace device_allocator;
  AllocatorImplPtr allocator = std::make_shared<Mallocator>();
  allocator = std::make_shared<Tree>(allocator, -1, .5);
  allocator = std::make_shared<Locked>(allocator);
  MMDEPLOY_DEBUG("Default CUDA allocator initialized");
  return Access::create<Allocator>(allocator);
}
} // namespace cuda
// ! this class doesn't handle device id
// ! this class doesn't handle device id
// Owns (or wraps) a chunk of CUDA device memory backing CudaBufferImpl.
class CudaDeviceMemory : public NonCopyable {
 public:
  explicit CudaDeviceMemory(int device_id) : device_id_(device_id), size_(), owned_block_() {}
  // Allocates `size` bytes via `allocator` on device `device_id_`.
  // cudaMalloc guarantees 256-byte alignment, so only alignments dividing 256
  // can be honored.
  Result<void> Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) {
    if (alignment > 256 || 256 % alignment != 0) {
      return Status(eNotSupported);
    }
    allocator_ = std::move(allocator);
    CudaDeviceGuard guard(device_id_);
    block_ = Access::get<AllocatorImpl>(allocator_).Allocate(size);
    // a null handle is acceptable only for zero-sized allocations
    if (size && !block_.handle) {
      return Status(eOutOfMemory);
    }
    size_ = size;
    owned_block_ = true;
    return success();
  }
  // Wraps externally managed device memory; ownership stays with `data`.
  Result<void> Init(size_t size, std::shared_ptr<void> data, uint64_t flags) {
    size_ = size;
    external_ = std::move(data);
    block_.handle = external_.get();
    block_.size = size;
    owned_block_ = false;
    return success();
  }
  ~CudaDeviceMemory() {
    if (block_.handle) {
      if (owned_block_) {
        // deallocation must happen on the owning device
        CudaDeviceGuard guard(device_id_);
        Access::get<AllocatorImpl>(allocator_).Deallocate(block_);
        owned_block_ = false;
      }
      block_.handle = nullptr;
    }
    external_.reset();
    size_ = 0;
  }
  size_t size() const { return size_; }
  void* data() const { return block_.handle; }
  const Allocator& allocator() const { return allocator_; }

 private:
  int device_id_;
  size_t size_;
  AllocatorImpl::Block block_;
  bool owned_block_;  // true when `allocator_` must free `block_`
  Allocator allocator_;
  std::shared_ptr<void> external_;  // keeps wrapped memory alive
};
// Factory for CUDA buffer implementations.
shared_ptr<BufferImpl> CudaPlatformImpl::CreateBuffer(Device device) {
  return std::make_shared<CudaBufferImpl>(device);
}
// Factory for CUDA stream implementations.
shared_ptr<StreamImpl> CudaPlatformImpl::CreateStream(Device device) {
  return std::make_shared<CudaStreamImpl>(device);
}
// Factory for CUDA event implementations.
shared_ptr<EventImpl> CudaPlatformImpl::CreateEvent(Device device) {
  return std::make_shared<CudaEventImpl>(device);
}
// Makes `device` current for the calling thread, optionally reporting the
// previously current device in `prev`.
Result<void> CudaPlatformImpl::BindDevice(Device device, Device* prev) {
  if (device.platform_id() != platform_id_) {
    return Status(eInvalidArgument);
  }
  // skip null device
  if (device.device_id() == -1) {
    return success();
  }
  int prev_device_id = -1;
  if (prev) {
    // Query the driver first: cudaGetDevice would itself initialize a CUDA
    // context; only report a real previous device when a context exists.
    CUcontext ctx{};
    cuCtxGetCurrent(&ctx);
    if (ctx) {
      cudaGetDevice(&prev_device_id);
      *prev = Device(platform_id_, prev_device_id);
    } else {
      // cuda is not initialized return a null device as previous
      *prev = Device(platform_id_, -1);
    }
  }
  if (device.device_id() != prev_device_id) {
    // NOTE(review): the cudaSetDevice return code is ignored — an invalid
    // device id fails silently here; confirm whether eFail should be returned.
    cudaSetDevice(device.device_id());
  }
  return success();
}
// A copy is valid when the stream runs on a (CUDA) device and each endpoint
// is either host memory or memory on that same device.
bool CudaPlatformImpl::CheckCopyDevice(const Device& src, const Device& dst, const Device& st) {
  return st.is_device() && (src.is_host() || src == st) && (dst.is_host() || dst == st);
}
// Host-to-device copy; the stream's device performs the transfer.
Result<void> CudaPlatformImpl::Copy(const void* host_ptr, Buffer dst, size_t size,
                                    size_t dst_offset, Stream stream) {
  if (!CheckCopyDevice(Device{0, 0}, dst.GetDevice(), stream.GetDevice())) {
    return Status(eInvalidArgument);
  }
  if (size == 0) {
    return success();  // nothing to transfer
  }
  auto* dst_ptr = dst.GetNative();
  if (dst_ptr == nullptr) {
    return Status(eInvalidArgument);
  }
  return CopyImpl(stream.GetDevice(), host_ptr, dst_ptr, size, dst.GetSize(), 0, dst_offset, size,
                  stream);
}
// Device-to-host copy; the stream's device performs the transfer.
Result<void> CudaPlatformImpl::Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,
                                    Stream stream) {
  if (!CheckCopyDevice(src.GetDevice(), Device{0, 0}, stream.GetDevice())) {
    return Status(eInvalidArgument);
  }
  if (size == 0) {
    return success();  // nothing to transfer
  }
  auto* src_ptr = src.GetNative();
  if (src_ptr == nullptr) {
    return Status(eInvalidArgument);
  }
  return CopyImpl(stream.GetDevice(), src_ptr, host_ptr, src.GetSize(), size, src_offset, 0, size,
                  stream);
}
// Buffer-to-buffer copy (device-to-device or mixed with host memory).
Result<void> CudaPlatformImpl::Copy(Buffer src, Buffer dst, size_t size, size_t src_offset,
                                    size_t dst_offset, Stream stream) {
  if (!CheckCopyDevice(src.GetDevice(), dst.GetDevice(), stream.GetDevice())) {
    return Status(eInvalidArgument);
  }
  if (size == 0) {
    return success();  // nothing to transfer
  }
  auto* src_ptr = src.GetNative();
  auto* dst_ptr = dst.GetNative();
  if (src_ptr == nullptr || dst_ptr == nullptr) {
    return Status(eInvalidArgument);
  }
  return CopyImpl(stream.GetDevice(), src_ptr, dst_ptr, src.GetSize(), dst.GetSize(), src_offset,
                  dst_offset, size, stream);
}
// Validates that the copy range fits in both buffers. Rearranged so
// `offset + copy_size` cannot wrap around size_t and falsely pass.
bool CudaPlatformImpl::CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset,
                                      size_t dst_offset, size_t copy_size) {
  if (copy_size > src_size || src_offset > src_size - copy_size) {
    return false;
  }
  if (copy_size > dst_size || dst_offset > dst_size - copy_size) {
    return false;
  }
  return true;
}
// Shared implementation for all Copy overloads. Uses cudaMemcpyAsync when a
// stream is supplied, otherwise a synchronous cudaMemcpy; cudaMemcpyDefault
// lets the runtime infer the transfer direction from the pointers (UVA).
Result<void> CudaPlatformImpl::CopyImpl(Device device, const void* src, void* dst, size_t src_size,
                                        size_t dst_size, size_t src_offset, size_t dst_offset,
                                        size_t size, Stream st) {
  if (!CheckCopyParam(src_size, dst_size, src_offset, dst_offset, size)) {
    return Status(eInvalidArgument);
  }
  auto p_dst = OffsetPtr(dst, dst_offset);
  auto p_src = OffsetPtr(src, src_offset);
  // the copy must be issued with the stream's device current
  CudaDeviceGuard guard(device);
  if (st) {
    auto cuda_stream = ::mmdeploy::framework::GetNative<cudaStream_t>(st);
    // TODO: how about default stream cudaStream_t(0)?
    if (!cuda_stream) {
      return Status(eInvalidArgument);
    }
    auto err = cudaMemcpyAsync(p_dst, p_src, size, cudaMemcpyDefault, cuda_stream);
    if (err != cudaSuccess) {
      return Status(eFail);
    }
  } else {
    auto err = cudaMemcpy(p_dst, p_src, size, cudaMemcpyDefault);
    if (err != cudaSuccess) {
      return Status(eFail);
    }
  }
  return success();
}
// Returns the lazily-initialized default stream for `device_id`.
Result<Stream> CudaPlatformImpl::GetDefaultStream(int32_t device_id) {
  // Reject negative ids explicitly; the previous bare `>=` compare relied on
  // the signed id being promoted to an unsigned type to catch them.
  if (device_id < 0 || static_cast<size_t>(device_id) >= per_device_data_.size()) {
    return Status(eInvalidArgument);
  }
  return per_device_data_[device_id]->default_stream();
}
// One-time lazy initialization of the per-device default stream/allocator,
// performed with the device made current.
void CudaPlatformImpl::PerDeviceData::init() {
  std::call_once(init_flag_, [&] {
    CudaDeviceGuard guard(device_id_);
    default_stream_ = Stream(gCudaPlatform().GetDevice(device_id_));
    default_allocator_ = cuda::CreateDefaultAllocator();
  });
}
// Enumerates the visible CUDA devices and prepares (empty) per-device slots;
// streams/allocators are created lazily by PerDeviceData::init().
CudaPlatformImpl::CudaPlatformImpl() {
  int count{};
  if (auto err = cudaGetDeviceCount(&count); err != cudaSuccess) {
    MMDEPLOY_ERROR("error getting device count: {}", cudaGetErrorString(err));
    throw_exception(eFail);
  }
  per_device_data_storage_.reserve(count);
  per_device_data_.reserve(count);
  for (int device_id = 0; device_id < count; ++device_id) {
    per_device_data_storage_.push_back(std::make_unique<PerDeviceData>(device_id));
    per_device_data_.push_back(per_device_data_storage_.back().get());
  }
}
// Default caching allocator for `device_id`.
// NOTE(review): unlike GetDefaultStream there is no bounds check here; an
// out-of-range id is undefined behavior — confirm callers validate the id.
Allocator CudaPlatformImpl::GetDefaultAllocator(int32_t device_id) {
  return per_device_data_[device_id]->default_allocator();
}
////////////////////////////////////////////////////////////////////////////////
/// CudaStreamImpl
CudaStreamImpl::CudaStreamImpl(Device device) : StreamImpl(device), stream_(), owned_stream_() {}
// Destroys the stream only when this object created it; wrapped external
// streams are merely released.
CudaStreamImpl::~CudaStreamImpl() {
  CudaDeviceGuard guard(device_.device_id());
  if (owned_stream_) {
    if (auto status = cudaStreamDestroy(stream_); status != cudaSuccess) {
      // TODO: signal error
    }
    owned_stream_ = false;
  }
  external_.reset();
}
// Creates an owned, non-blocking stream on this object's device.
// Note: `flags` is currently ignored; cudaStreamNonBlocking is always used.
Result<void> CudaStreamImpl::Init(uint64_t flags) {
  CudaDeviceGuard guard(device_);
  if (auto status = cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking);
      status != cudaSuccess) {
    return Status(eFail);
  }
  owned_stream_ = true;
  return success();
}
// Wraps an externally created cudaStream_t without taking ownership.
Result<void> CudaStreamImpl::Init(std::shared_ptr<void> native, uint64_t flags) {
  // ! nullptr is valid for cudaStream_t
  external_ = std::move(native);
  stream_ = static_cast<cudaStream_t>(external_.get());
  owned_stream_ = false;
  return success();
}
// Makes future work on this stream wait for `event` (must be on same device).
// NOTE(review): the cudaStreamWaitEvent return code is ignored — confirm
// whether a failure here should surface as eFail.
Result<void> CudaStreamImpl::DependsOn(Event& event) {
  if (event.GetDevice() == device_) {
    CudaDeviceGuard guard(device_);
    auto native_event = ::mmdeploy::framework::GetNative<cudaEvent_t>(event);
    cudaStreamWaitEvent(stream_, native_event, 0);
    return success();
  }
  return Status(eInvalidArgument);
}
// Non-blocking check whether all work queued on the stream has completed.
Result<void> CudaStreamImpl::Query() {
  CudaDeviceGuard guard(device_);
  if (cudaStreamQuery(stream_) != cudaSuccess) {
    return Status(eFail);
  }
  return success();
}
// Blocks until all work queued on the stream has completed.
Result<void> CudaStreamImpl::Wait() {
  CudaDeviceGuard guard(device_);
  if (cudaStreamSynchronize(stream_) != cudaSuccess) {
    return Status(eFail);
  }
  return success();
}
// Launches a CUDA kernel (a std::function<void(cudaStream_t)> wrapped by
// CudaKernelImpl) on this stream.
Result<void> CudaStreamImpl::Submit(Kernel& kernel) {
  auto task = ::mmdeploy::framework::GetNative<CudaTask*>(kernel);
  if (task) {
    CudaDeviceGuard guard(device_);
    (*task)(stream_);
    return success();
  }
  return Status(eInvalidArgument);
}
// Exposes the raw cudaStream_t (may be null for the legacy default stream).
void* CudaStreamImpl::GetNative(ErrorCode* ec) {
  if (ec) *ec = ErrorCode::eSuccess;
  return stream_;
}
////////////////////////////////////////////////////////////////////////////////
/// CudaEventImpl
CudaEventImpl::CudaEventImpl(Device device) : EventImpl(device), event_(), owned_event_() {}
// Destroys the event only when this object created it; wrapped external
// events are merely released.
CudaEventImpl::~CudaEventImpl() {
  CudaDeviceGuard guard(device_.device_id());
  if (owned_event_) {
    if (auto status = cudaEventDestroy(event_); status != cudaSuccess) {
      // TODO: signal error
    }
    owned_event_ = false;
  }
  external_.reset();
}
// Creates an owned event on this object's device; `flags` is currently
// ignored (cudaEventCreateWithFlags is always called with 0).
Result<void> CudaEventImpl::Init(uint64_t flags) {
  CudaDeviceGuard guard(device_);
  if (auto status = cudaEventCreateWithFlags(&event_, 0); status != cudaSuccess) {
    return Status(eFail);
  }
  owned_event_ = true;
  return success();
}
// Wraps an externally created cudaEvent_t; unlike streams, a null event
// handle is meaningless and therefore rejected.
Result<void> CudaEventImpl::Init(std::shared_ptr<void> native, uint64_t flags) {
  if (native == nullptr) {
    return Status(eInvalidArgument);
  }
  external_ = std::move(native);
  event_ = static_cast<cudaEvent_t>(external_.get());
  owned_event_ = false;
  return success();
}
// Non-blocking readiness check of the recorded completion.
Result<void> CudaEventImpl::Query() {
  if (cudaEventQuery(event_) != cudaSuccess) {
    return Status(eFail);
  }
  return success();
}
// Records this event on `stream` (must be on the same device), marking the
// completion of all work queued on the stream so far.
// NOTE(review): the cudaEventRecord return code is ignored — confirm whether
// a failure here should surface as eFail.
Result<void> CudaEventImpl::Record(Stream& stream) {
  if (stream.GetDevice() != device_) {
    return Status(eInvalidArgument);
  }
  CudaDeviceGuard guard(device_);
  auto native_stream = ::mmdeploy::framework::GetNative<cudaStream_t>(stream);
  cudaEventRecord(event_, native_stream);
  return success();
}
// Blocks until the event has been signalled.
Result<void> CudaEventImpl::Wait() {
  CudaDeviceGuard guard(device_);
  if (cudaEventSynchronize(event_) != cudaSuccess) {
    return Status(eFail);
  }
  return success();
}
// Exposes the raw cudaEvent_t handle.
void* CudaEventImpl::GetNative(ErrorCode* ec) {
  if (ec) *ec = ErrorCode::eSuccess;
  return event_;
}
////////////////////////////////////////////////////////////////////////////////
/// CudaBufferImpl
CudaBufferImpl::CudaBufferImpl(Device device) : BufferImpl(device) {}
// Allocates `size` bytes of device memory; falls back to the platform's
// default caching allocator when none is supplied.
Result<void> CudaBufferImpl::Init(size_t size, Allocator allocator, size_t alignment,
                                  uint64_t flags) {
  memory_ = std::make_shared<CudaDeviceMemory>(device_.device_id());
  if (!allocator) {
    allocator = gCudaPlatform().GetDefaultAllocator(device_.device_id());
  }
  OUTCOME_TRY(memory_->Init(size, std::move(allocator), alignment, flags));
  size_ = size;
  return success();
}
// Wraps externally managed device memory; ownership stays with `native`.
Result<void> CudaBufferImpl::Init(size_t size, std::shared_ptr<void> native, uint64_t flags) {
  memory_ = std::make_shared<CudaDeviceMemory>(device_.device_id());
  OUTCOME_TRY(memory_->Init(size, std::move(native), flags));
  size_ = size;
  return success();
}
// Creates a view into this buffer sharing the same backing device memory.
// `offset` is relative to this buffer's own offset.
Result<BufferImplPtr> CudaBufferImpl::SubBuffer(size_t offset, size_t size, uint64_t flags) {
  // reject sub-buffers of an uninitialized buffer instead of dereferencing null
  if (!memory_) {
    return Status(eInvalidArgument);
  }
  if (offset_ + offset + size > memory_->size()) {
    return Status(eInvalidArgument);
  }
  auto impl = std::make_shared<CudaBufferImpl>(device_);
  impl->memory_ = memory_;  // shared backing storage keeps memory alive
  impl->offset_ = offset_ + offset;
  impl->size_ = size;
  return impl;
}
size_t CudaBufferImpl::GetSize(ErrorCode* ec) { return size_; }
// Returns a device pointer to the start of this (sub-)buffer, or nullptr when
// the buffer has not been initialized.
void* CudaBufferImpl::GetNative(ErrorCode* ec) {
  if (memory_) {
    if (ec) *ec = ErrorCode::eSuccess;
    return OffsetPtr(memory_->data(), offset_);
  }
  if (ec) *ec = eInvalidArgument;
  return nullptr;
}
Allocator CudaBufferImpl::GetAllocator() const { return memory_->allocator(); }
////////////////////////////////////////////////////////////////////////////////
/// CudaKernelImpl
// Returns a pointer to the stored CudaTask; CudaStreamImpl::Submit invokes it.
void* CudaKernelImpl::GetNative(ErrorCode* ec) {
  if (ec) *ec = ErrorCode::eSuccess;
  return &task_;
}
// Wraps a stream-parameterized callable as a kernel for `device`.
CudaKernelImpl::CudaKernelImpl(Device device, CudaTask task)
    : KernelImpl(device), task_(std::move(task)) {}
////////////////////////////////////////////////////////////////////////////////
/// CudaPlatformRegisterer
// Registers the CUDA platform with the global registry at static-init time.
class CudaPlatformRegisterer {
 public:
  CudaPlatformRegisterer() {
    gPlatformRegistry().Register([] { return std::make_shared<CudaPlatformImpl>(); });
  }
};
CudaPlatformRegisterer g_cuda_platform_registerer;
// Singleton accessor for the CUDA platform implementation.
CudaPlatformImpl& gCudaPlatform() {
  static Platform platform("cuda");
  return Access::get<CudaPlatformImpl>(platform);
}
} // namespace mmdeploy::framework
// Copyright (c) OpenMMLab. All rights reserved.
#include <any>
#include <mutex>
#include "cuda.h"
#include "cuda_runtime.h"
#include "mmdeploy/core/device_impl.h"
#include "mmdeploy/core/types.h"
namespace mmdeploy::framework {
using CudaTask = std::function<void(cudaStream_t)>;
// CUDA implementation of the platform interface. Keeps lazily-initialized
// per-device state (default stream + caching allocator).
class CudaPlatformImpl : public PlatformImpl {
 public:
  CudaPlatformImpl();
  ~CudaPlatformImpl() override {
    // The CUDA driver may have already shutdown before the platform dtor is called.
    // As a workaround, simply leak per device resources and let the driver handle it
    // FIXME: maybe a pair of global mmdeploy_init/deinit function would be a
    // better solution
    for (auto& data : per_device_data_storage_) {
      data.release();
    }
  }
  const char* GetPlatformName() const noexcept override { return "cuda"; }
  Result<void> BindDevice(Device device, Device* prev) override;
  shared_ptr<BufferImpl> CreateBuffer(Device device) override;
  shared_ptr<StreamImpl> CreateStream(Device device) override;
  shared_ptr<EventImpl> CreateEvent(Device device) override;
  Result<void> Copy(const void* host_ptr, Buffer dst, size_t size, size_t dst_offset,
                    Stream stream) override;
  Result<void> Copy(Buffer src, void* host_ptr, size_t size, size_t src_offset,
                    Stream stream) override;
  Result<void> Copy(Buffer src, Buffer dst, size_t size, size_t src_offset, size_t dst_offset,
                    Stream stream) override;
  Result<Stream> GetDefaultStream(int32_t device_id) override;
  // Default caching allocator for the given device.
  Allocator GetDefaultAllocator(int32_t device_id);
  // Convenience: a device handle on this platform.
  Device GetDevice(int device_id) { return Device(platform_id_, device_id); }

 private:
  // Validates that the copy range fits in both buffers.
  static bool CheckCopyParam(size_t src_size, size_t dst_size, size_t src_offset, size_t dst_offset,
                             size_t copy_size);
  // Checks the src/dst/stream device combination is a legal transfer.
  static bool CheckCopyDevice(const Device& src, const Device& dst, const Device& st);
  // Shared memcpy implementation; async when `st` is non-null.
  static Result<void> CopyImpl(Device device, const void* src, void* dst, size_t src_size,
                               size_t dst_size, size_t src_offset, size_t dst_offset, size_t size,
                               Stream st);
  // Lazily-initialized per-device default stream and allocator.
  class PerDeviceData {
   public:
    explicit PerDeviceData(int device_id) : device_id_(device_id) {}
    // One-time initialization; safe to call repeatedly.
    void init();
    Stream& default_stream() {
      init();
      return default_stream_;
    }
    Allocator& default_allocator() {
      init();
      return default_allocator_;
    }

   private:
    int device_id_;
    std::once_flag init_flag_;
    Stream default_stream_;
    Allocator default_allocator_;
  };
  // storage owns the objects; the raw-pointer vector is what gets leaked in
  // the destructor workaround above
  std::vector<std::unique_ptr<PerDeviceData>> per_device_data_storage_;
  std::vector<PerDeviceData*> per_device_data_;
};
CudaPlatformImpl& gCudaPlatform();
class CudaDeviceMemory;
CudaPlatformImpl& gCudaPlatform();
class CudaDeviceMemory;
// Device-memory buffer. Storage is held in a ref-counted CudaDeviceMemory so
// sub-buffers can share the same allocation at different offsets.
class CudaBufferImpl : public BufferImpl {
 public:
  explicit CudaBufferImpl(Device device);
  // Allocate `size` bytes via `allocator` with the requested alignment.
  Result<void> Init(size_t size, Allocator allocator, size_t alignment, uint64_t flags) override;
  // Adopt externally owned native memory instead of allocating.
  Result<void> Init(size_t size, std::shared_ptr<void> native, uint64_t flags) override;
  Result<BufferImplPtr> SubBuffer(size_t offset, size_t size, uint64_t flags) override;
  void* GetNative(ErrorCode* ec) override;
  Allocator GetAllocator() const override;
  size_t GetSize(ErrorCode* ec) override;

 private:
  std::shared_ptr<CudaDeviceMemory> memory_;  // shared with sub-buffers
  size_t offset_{0};                          // byte offset into memory_
  size_t size_{0};                            // visible size in bytes
};
// Stream backed by a cudaStream_t, either created by this object (owned) or
// adopted from an external native handle.
class CudaStreamImpl : public StreamImpl {
 public:
  explicit CudaStreamImpl(Device device);
  ~CudaStreamImpl() override;
  Result<void> Init(uint64_t flags) override;
  // Adopt an external cudaStream_t; `external_` keeps the handle alive.
  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;
  Result<void> DependsOn(Event& event) override;
  Result<void> Query() override;
  Result<void> Wait() override;
  Result<void> Submit(Kernel& kernel) override;
  void* GetNative(ErrorCode* ec) override;

 private:
  cudaStream_t stream_;
  bool owned_stream_;               // true when stream_ was created here
  std::shared_ptr<void> external_;  // keeps an adopted native handle alive
};
// Event backed by a cudaEvent_t, either created by this object (owned) or
// adopted from an external native handle.
class CudaEventImpl : public EventImpl {
 public:
  explicit CudaEventImpl(Device device);
  ~CudaEventImpl() override;
  Result<void> Init(uint64_t flags) override;
  // Adopt an external cudaEvent_t; `external_` keeps the handle alive.
  Result<void> Init(std::shared_ptr<void> native, uint64_t flags) override;
  Result<void> Query() override;
  Result<void> Record(Stream& stream) override;
  Result<void> Wait() override;
  void* GetNative(ErrorCode* ec) override;

 private:
  cudaEvent_t event_;
  bool owned_event_;                // true when event_ was created here
  std::shared_ptr<void> external_;  // keeps an adopted native handle alive
};
// Kernel wrapper around a host callback (CudaTask is
// std::function<void(cudaStream_t)>) to be invoked with a stream handle.
class CudaKernelImpl : public KernelImpl {
 public:
  explicit CudaKernelImpl(Device device, CudaTask task);
  void* GetNative(ErrorCode* ec) override;

 private:
  CudaTask task_;  // the wrapped callback
};
// RAII guard that switches the calling thread to the given CUDA device and
// restores the previous device on destruction.
class CudaDeviceGuard {
 public:
  explicit CudaDeviceGuard(Device device) : CudaDeviceGuard(device.device_id()) {}
  explicit CudaDeviceGuard(int device_id) : device_id_(device_id), prev_device_id_(-1) {
    CUcontext ctx{};
    cuCtxGetCurrent(&ctx);
    // Only query the current device when a driver context already exists --
    // presumably to avoid cudaGetDevice creating a context as a side effect;
    // confirm against CUDA runtime semantics.
    if (ctx) {
      cudaGetDevice(&prev_device_id_);
    }
    if (prev_device_id_ != device_id_) {
      cudaSetDevice(device_id_);
    }
  }
  ~CudaDeviceGuard() {
    // prev_device_id_ stays -1 when no context existed on entry; nothing to
    // restore in that case.
    if (prev_device_id_ >= 0 && prev_device_id_ != device_id_) {
      cudaSetDevice(prev_device_id_);
    }
  }

 private:
  int device_id_;       // device selected for the guarded scope
  int prev_device_id_;  // device active before the guard, or -1
};
} // namespace mmdeploy::framework
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_DEVICE_CUDA_DEFAULT_ALLOCATOR_H_
#define MMDEPLOY_SRC_DEVICE_CUDA_DEFAULT_ALLOCATOR_H_
#include <cuda_runtime.h>
#include <atomic>
#include <chrono>
#include "mmdeploy/core/logger.h"
namespace mmdeploy::cuda {
// Thin CUDA device-memory allocator backed directly by cudaMalloc/cudaFree.
// Keeps running counters so aggregate usage can be reported at shutdown.
class DefaultAllocator {
 public:
  DefaultAllocator() = default;
  ~DefaultAllocator() {
    // NOTE(review): stats are logged at ERROR level, presumably so they show
    // regardless of the configured log level -- confirm this is intended.
    MMDEPLOY_ERROR("=== CUDA Default Allocator ===");
    MMDEPLOY_ERROR("  Allocation: count={}, size={}MB, time={}ms", alloc_count_,
                   alloc_size_ / (1024 * 1024.f), alloc_time_ / 1000000.f);
    MMDEPLOY_ERROR("Deallocation: count={}, size={}MB, time={}ms", dealloc_count_,
                   dealloc_size_ / (1024 * 1024.f), dealloc_time_ / 1000000.f);
  }
  // Allocates n bytes of device memory; returns nullptr on failure.
  [[nodiscard]] void* Allocate(std::size_t n) {
    void* p{};
    auto t0 = std::chrono::high_resolution_clock::now();
    auto ret = cudaMalloc(&p, n);
    auto t1 = std::chrono::high_resolution_clock::now();
    alloc_time_ += (int64_t)std::chrono::duration<double, std::nano>(t1 - t0).count();
    if (ret != cudaSuccess) {
      MMDEPLOY_ERROR("error allocating cuda memory: {}", cudaGetErrorString(ret));
      return nullptr;
    }
    alloc_count_ += 1;
    alloc_size_ += n;
    return p;
  }
  // Frees device memory; n is only used for the statistics.
  void Deallocate(void* p, std::size_t n) {
    auto t0 = std::chrono::high_resolution_clock::now();
    auto ret = cudaFree(p);
    auto t1 = std::chrono::high_resolution_clock::now();
    dealloc_time_ += (int64_t)std::chrono::duration<double, std::nano>(t1 - t0).count();
    if (ret != cudaSuccess) {
      MMDEPLOY_ERROR("error deallocating cuda memory: {}", cudaGetErrorString(ret));
      return;
    }
    dealloc_count_ += 1;
    dealloc_size_ += n;
  }

 private:
  // Value-initialize all counters: std::atomic's default constructor leaves
  // the contained value uninitialized prior to C++20, which would make the
  // statistics reported in the destructor garbage.
  std::atomic<std::size_t> alloc_count_{0};
  std::atomic<std::size_t> alloc_size_{0};
  std::atomic<std::size_t> alloc_time_{0};
  std::atomic<std::size_t> dealloc_count_{0};
  std::atomic<std::size_t> dealloc_size_{0};
  std::atomic<std::size_t> dealloc_time_{0};
};
// Process-wide DefaultAllocator instance, constructed on first use.
inline DefaultAllocator& gDefaultAllocator() {
  static DefaultAllocator instance;
  return instance;
}
} // namespace mmdeploy::cuda
#endif // MMDEPLOY_SRC_DEVICE_CUDA_DEFAULT_ALLOCATOR_H_
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_DEVICE_CUDA_LINEARALLOCATOR_H_
#define MMDEPLOY_SRC_DEVICE_CUDA_LINEARALLOCATOR_H_
#include <cassert>
#include <cstdint>
#include <memory>
#include <mutex>
#include <optional>

#include "default_allocator.h"
namespace mmdeploy::cuda {
class LinearAllocator {
public:
explicit LinearAllocator(std::size_t size) : size_(size) {
base_ = static_cast<uint8_t*>(gDefaultAllocator().Allocate(size));
ptr_ = base_;
}
~LinearAllocator() { gDefaultAllocator().Deallocate(base_, size_); }
[[nodiscard]] void* Allocate(std::size_t n) {
std::optional<std::lock_guard<std::mutex> > lock;
if (mutex_) {
lock.emplace(*mutex_);
}
++count_;
total_ += n;
auto ptr = static_cast<void*>(ptr_);
std::size_t space = base_ + size_ - ptr_;
if (std::align(16, n, ptr, space)) {
MMDEPLOY_ERROR("success n={}, total={}, count={}", n, total_, count_);
ptr_ = static_cast<uint8_t*>(ptr) + n;
return ptr;
}
MMDEPLOY_ERROR("fallback {}, total={}, count={}", n, total_, count_);
return gDefaultAllocator().Allocate(n);
}
void Deallocate(void* _p, std::size_t n) {
std::optional<std::lock_guard<std::mutex> > lock;
if (mutex_) {
lock.emplace(*mutex_);
}
auto p = static_cast<uint8_t*>(_p);
if (!(base_ <= p && p < ptr_)) {
gDefaultAllocator().Deallocate(_p, n);
}
total_ -= n;
--count_;
MMDEPLOY_ERROR("deallocate total={}, count={}", total_, count_);
if (total_ == 0) {
assert(count_ == 0);
ptr_ = base_;
}
}
private:
std::size_t size_;
uint8_t* base_;
uint8_t* ptr_;
std::size_t total_{};
std::size_t count_{};
std::optional<std::mutex> mutex_;
};
// Process-wide LinearAllocator backed by a single 1 GiB slab.
inline LinearAllocator& gLinearAllocator() {
  static LinearAllocator instance{1U << 30};
  return instance;
}
} // namespace mmdeploy::cuda
#endif // MMDEPLOY_SRC_DEVICE_CUDA_LINEARALLOCATOR_H_
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_SRC_CORE_DEVICE_ALLOCATOR_H_
#define MMDEPLOY_SRC_CORE_DEVICE_ALLOCATOR_H_
#include <chrono>
#include <iostream>
#include <map>
#include <mutex>
#include <numeric>
#include <stack>
#include "mmdeploy/core/device_impl.h"
#include "mmdeploy/core/logger.h"
namespace mmdeploy::framework::device_allocator {
// Composes two allocators: requests go to `primary_` first and fall through
// to `fallback_` when the primary cannot satisfy them.
class Fallback : public AllocatorImpl {
 public:
  Fallback(AllocatorImplPtr primary, AllocatorImplPtr fallback)
      : primary_(std::move(primary)), fallback_(std::move(fallback)) {}

  Block Allocate(size_t size) noexcept override {
    auto block = primary_->Allocate(size);
    if (!block.handle) {
      block = fallback_->Allocate(size);
    }
    return block;
  }

  void Deallocate(Block& block) noexcept override {
    // Route the block back to whichever allocator owns it.
    (primary_->Owns(block) ? primary_ : fallback_)->Deallocate(block);
  }

  bool Owns(const Block& block) const noexcept override {
    return primary_->Owns(block) || fallback_->Owns(block);
  }

 private:
  AllocatorImplPtr primary_;
  AllocatorImplPtr fallback_;
};
// TODO: batch allocation
// Fixed-capacity free-list pool for blocks in [min_size_, max_size_]. Every
// physical allocation is max_size_ bytes, so any cached block can serve any
// in-range request.
class Pool : public AllocatorImpl {
 public:
  explicit Pool(AllocatorImplPtr allocator, size_t min_size, size_t max_size, unsigned pool_size)
      : allocator_(std::move(allocator)),
        min_size_(min_size),
        max_size_(max_size),
        pool_size_(pool_size) {
    free_.reserve(pool_size);
  }
  ~Pool() override {
    // Return every cached block to the upstream allocator.
    for (auto handle : free_) {
      Block block(handle, max_size_);
      allocator_->Deallocate(block);
    }
    free_.clear();
  }
  Block Allocate(size_t size) noexcept override {
    if (size < min_size_ || max_size_ < size) {
      return Block{};  // out of range: signal "not handled"
    }
    if (free_.empty()) {
      return allocator_->Allocate(max_size_);  // always full bucket size
    }
    Block block{free_.back(), max_size_};
    free_.pop_back();
    return block;
  }
  void Deallocate(Block& block) noexcept override {
    if (!Owns(block)) {
      return;
    }
    if (free_.size() >= pool_size_) {
      allocator_->Deallocate(block);  // pool full: really free it
      return;
    }
    // Cache the handle and neutralize the block.
    free_.push_back(block.handle);
    block.handle = nullptr;
    block.size = 0;
  }
  bool Owns(const Block& block) const noexcept override {
    return block.handle && min_size_ <= block.size && block.size <= max_size_;
  }

 private:
  AllocatorImplPtr allocator_;
  size_t min_size_;
  size_t max_size_;
  unsigned pool_size_;       // maximum number of cached blocks
  std::vector<void*> free_;  // cached handles, each max_size_ bytes
};
// Caches freed blocks in a size-ordered multimap and reuses the smallest
// cached block that is "close enough" to a request: a block of size S serves
// a request of size N when N * den >= S * num, where num/den ~ threshold.
// The cache holds fewer than `max_bytes` in total.
class Tree : public AllocatorImpl {
  // Resolution used to turn the float threshold into an integer ratio.
  static constexpr auto kQuantizer = 100;

 public:
  Tree(AllocatorImplPtr allocator, size_t max_bytes, float threshold)
      : allocator_(std::move(allocator)), max_tree_bytes_(max_bytes) {
    if (threshold) {
      // Store the threshold as a reduced integer fraction so the reuse test
      // in Allocate stays in integer arithmetic.
      thresh_numerator_ = static_cast<int>(threshold * kQuantizer);
      thresh_denominator_ = kQuantizer;
      auto divisor = std::gcd(thresh_numerator_, thresh_denominator_);
      thresh_numerator_ /= divisor;
      thresh_denominator_ /= divisor;
    }
    // NOTE(review): with threshold == 0 both ratio terms stay 0, the reuse
    // test degenerates to 0 >= 0 and ANY cached block >= the request is
    // reused -- confirm this is the intended meaning of threshold 0.
  }
  ~Tree() override {
    // Flush all cached blocks back to the upstream allocator.
    for (const auto& [size, handle] : tree_) {
      Block block(handle, size);
      allocator_->Deallocate(block);
    }
  }
  Block Allocate(size_t size) noexcept override {
    // Smallest cached block that can hold `size`.
    if (auto it = tree_.lower_bound(size); it != tree_.end()) {
      if (size * thresh_denominator_ >= it->first * thresh_numerator_) {
        Block block(it->second, it->first);
        tree_bytes_ -= it->first;
        tree_.erase(it);
        return block;
      }
    }
    return allocator_->Allocate(size);
  }
  void Deallocate(Block& block) noexcept override {
    auto bytes = tree_bytes_ + block.size;
    if (bytes < max_tree_bytes_) {
      // Cache instead of freeing; ownership moves into the tree and the
      // caller's block is neutralized.
      tree_.insert({block.size, block.handle});
      tree_bytes_ = bytes;
      block.size = 0;
      block.handle = nullptr;
    } else {
      allocator_->Deallocate(block);
    }
  }
  // Claims every block, so in a Fallback chain deallocations route here.
  bool Owns(const Block& block) const noexcept override { return true; }

 private:
  AllocatorImplPtr allocator_;
  // threshold ~ thresh_numerator_ / thresh_denominator_
  int thresh_numerator_{};
  int thresh_denominator_{};
  std::multimap<size_t, void*> tree_;  // size -> handle, ordered by size
  size_t max_tree_bytes_;              // cache capacity in bytes
  size_t tree_bytes_{};                // bytes currently cached
};
// Decorator recording allocation/deallocation counts, byte totals, peak
// outstanding bytes and cumulative wall-clock time; reports on destruction.
class Stats : public AllocatorImpl {
 public:
  explicit Stats(AllocatorImplPtr allocator, std::string name)
      : allocator_(std::move(allocator)), name_(std::move(name)) {}
  ~Stats() override {
    MMDEPLOY_INFO("=== {} ===", name_);
    MMDEPLOY_INFO("  Allocation: count={}, size={}MB, time={}ms", data_.allocation_count,
                  data_.allocated_bytes / (1024 * 1024.f),
                  static_cast<float>(data_.allocation_time));
    MMDEPLOY_INFO("Deallocation: count={}, size={}MB, time={}ms", data_.deallocation_count,
                  data_.deallocated_bytes / (1024 * 1024.f),
                  static_cast<float>(data_.deallocation_time));
    MMDEPLOY_INFO("Peak memory usage: size={}MB", data_.peak / (1024 * 1024.f));
  }
  Block Allocate(size_t size) noexcept override {
    const auto start = std::chrono::high_resolution_clock::now();
    auto block = allocator_->Allocate(size);
    const auto stop = std::chrono::high_resolution_clock::now();
    data_.allocation_time += std::chrono::duration<double, std::milli>(stop - start).count();
    ++data_.allocation_count;
    data_.allocated_bytes += block.size;
    // Peak is the high-water mark of outstanding (allocated - freed) bytes.
    data_.peak = std::max(data_.peak, data_.allocated_bytes - data_.deallocated_bytes);
    return block;
  }
  void Deallocate(Block& block) noexcept override {
    ++data_.deallocation_count;
    data_.deallocated_bytes += block.size;
    const auto start = std::chrono::high_resolution_clock::now();
    allocator_->Deallocate(block);
    const auto stop = std::chrono::high_resolution_clock::now();
    data_.deallocation_time += std::chrono::duration<double, std::milli>(stop - start).count();
  }
  bool Owns(const Block& block) const noexcept override { return allocator_->Owns(block); }
  const char* Name() const noexcept override { return name_.c_str(); }

 private:
  // Aggregated counters; times are in milliseconds.
  struct Data {
    size_t allocation_count{};
    size_t deallocation_count{};
    size_t allocated_bytes{};
    size_t deallocated_bytes{};
    size_t peak{};
    double allocation_time{};
    double deallocation_time{};
  };
  Data data_;
  AllocatorImplPtr allocator_;
  std::string name_;
};
class Locked : public AllocatorImpl {
public:
explicit Locked(AllocatorImplPtr allocator) : allocator_(std::move(allocator)) {}
Block Allocate(size_t size) noexcept override {
std::lock_guard lock(mutex_);
return allocator_->Allocate(size);
}
void Deallocate(Block& block) noexcept override {
std::lock_guard lock(mutex_);
allocator_->Deallocate(block);
}
bool Owns(const Block& block) const noexcept override {
std::lock_guard lock(mutex_);
return allocator_->Owns(block);
}
private:
AllocatorImplPtr allocator_;
mutable std::mutex mutex_;
};
class Segregator : public AllocatorImpl {
public:
Segregator(size_t threshold, AllocatorImplPtr small, AllocatorImplPtr large)
: threshold_(threshold), small_(std::move(small)), large_(std::move(large)) {}
Block Allocate(size_t size) noexcept override {
if (size <= threshold_) {
return small_->Allocate(size);
}
return large_->Allocate(size);
}
void Deallocate(Block& block) noexcept override {
if (block.size <= threshold_) {
return small_->Deallocate(block);
}
return large_->Deallocate(block);
}
bool Owns(const Block& block) const noexcept override {
if (block.size <= threshold_) {
return small_->Owns(block);
}
return large_->Owns(block);
}
private:
size_t threshold_;
AllocatorImplPtr small_;
AllocatorImplPtr large_;
};
// Adapts a value-type allocator exposing Allocate/Deallocate/Owns member
// functions to the AllocatorImpl interface.
template <typename Allocator>
class AllocatorAdapter : public AllocatorImpl {
 public:
  Block Allocate(size_t size) noexcept override { return allocator_.Allocate(size); }
  void Deallocate(Block& block) noexcept override { return allocator_.Deallocate(block); }
  bool Owns(const Block& block) const noexcept override { return allocator_.Owns(block); }

 private:
  Allocator allocator_;  // owned by value; Allocator must be default-constructible
};
// Partitions [min_size, max_size) into buckets of width step_size and
// delegates each request to the allocator created for its bucket.
class Bucketizer : public AllocatorImpl {
 public:
  // Creator receives the inclusive [lo, hi] size range of one bucket.
  using AllocatorCreator = std::function<AllocatorImplPtr(size_t, size_t)>;
  Bucketizer(const AllocatorCreator& creator, size_t min_size, size_t max_size, size_t step_size)
      : min_size_(min_size), max_size_(max_size), step_size_(step_size) {
    for (auto base = min_size_; base < max_size_; base += step_size_) {
      allocator_.push_back(creator(base, base + step_size_ - 1));
    }
  }
  Block Allocate(size_t size) noexcept override {
    // Guard against unsigned underflow of (size - min_size_); the previous
    // `0 <= index` check was a tautology for an unsigned index.
    if (size < min_size_) {
      return Block{};
    }
    auto index = (size - min_size_) / step_size_;
    if (index < allocator_.size()) {
      return allocator_[index]->Allocate(size);
    }
    return Block{};
  }
  void Deallocate(Block& block) noexcept override {
    // Same underflow guard as in Allocate; out-of-range blocks are ignored.
    if (block.size < min_size_) {
      return;
    }
    auto index = (block.size - min_size_) / step_size_;
    if (index < allocator_.size()) {
      allocator_[index]->Deallocate(block);
    }
  }
  bool Owns(const Block& block) const noexcept override {
    return min_size_ <= block.size && block.size < max_size_;
  }

 private:
  std::vector<AllocatorImplPtr> allocator_;  // one allocator per bucket
  size_t min_size_;
  size_t max_size_;
  size_t step_size_;
};
// Builds a Fallback chain: try `primary` first, then `fallback`.
inline AllocatorImplPtr CreateFallback(AllocatorImplPtr primary, AllocatorImplPtr fallback) {
  auto impl = std::make_shared<Fallback>(std::move(primary), std::move(fallback));
  return impl;
}
// Wraps `allocator` in a Stats decorator reporting under `name`.
inline AllocatorImplPtr CreateStats(const std::string& name, AllocatorImplPtr allocator) {
  auto impl = std::make_shared<Stats>(std::move(allocator), name);
  return impl;
}
// Creates a Pool caching up to `pool_size` blocks in [min_size, max_size].
inline AllocatorImplPtr CreatePool(size_t min_size, size_t max_size, unsigned int pool_size,
                                   AllocatorImplPtr allocator) {
  auto impl = std::make_shared<Pool>(std::move(allocator), min_size, max_size, pool_size);
  return impl;
}
// Creates a Segregator splitting requests at `threshold` bytes.
inline AllocatorImplPtr CreateSegregator(size_t threshold, AllocatorImplPtr small,
                                         AllocatorImplPtr large) {
  auto impl = std::make_shared<Segregator>(threshold, std::move(small), std::move(large));
  return impl;
}
// Creates a Bucketizer over [min_size, max_size) with buckets of `step_size`.
inline AllocatorImplPtr CreateBucketizer(size_t min_size, size_t max_size, size_t step_size,
                                         const Bucketizer::AllocatorCreator& creator) {
  auto impl = std::make_shared<Bucketizer>(creator, min_size, max_size, step_size);
  return impl;
}
// Convenience factory: a Bucketizer whose buckets are mutex-guarded Pools
// drawing from a shared upstream allocator.
inline AllocatorImplPtr CreatePoolBucketizer(size_t min_size, size_t max_size, size_t step_size,
                                             unsigned pool_size,
                                             const AllocatorImplPtr& allocator) {
  // The creator is invoked synchronously inside the Bucketizer constructor,
  // so capturing locals here is safe.
  auto creator = [pool_size, &allocator](size_t lo, size_t hi) {
    return std::make_shared<Locked>(CreatePool(lo, hi, pool_size, allocator));
  };
  return CreateBucketizer(min_size, max_size, step_size, creator);
}
} // namespace mmdeploy::framework::device_allocator
#endif // MMDEPLOY_SRC_CORE_DEVICE_ALLOCATOR_H_
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_execution)

# Build the execution module from the scheduler sources and expose it to
# other targets under the mmdeploy::execution alias.
set(SRCS schedulers/schedulers.cpp)
mmdeploy_add_module(${PROJECT_NAME} LIBRARY "${SRCS}")
add_library(mmdeploy::execution ALIAS ${PROJECT_NAME})
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/brycelelbach/wg21_p2300_std_execution/blob/main/include/execution.hpp
#ifndef MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_BULK_H_
#define MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_BULK_H_
#include "closure.h"
#include "concepts.h"
#include "mmdeploy/core/logger.h"
#include "utility.h"
namespace mmdeploy {
namespace __bulk {
// Operation state produced by connecting the fallback bulk sender.
template <typename CvrefSender, typename Shape, typename Func, typename Receiver>
struct _Operation {
  struct type;
};
template <typename CvrefSender, typename Shape, typename Func, typename Receiver>
using Operation = typename _Operation<CvrefSender, Shape, Func, remove_cvref_t<Receiver>>::type;

// Receiver wrapping the downstream receiver; runs func_ over [0, shape_).
template <typename Receiver, typename Shape, typename Func>
struct _Receiver {
  struct type;
};
template <typename Receiver, typename Shape, typename Func>
using receiver_t = typename _Receiver<Receiver, Shape, Func>::type;

template <typename Receiver, typename Shape, typename Func>
struct _Receiver<Receiver, Shape, Func>::type {
  Receiver receiver_;
  Shape shape_;
  Func func_;
  // On upstream completion: invoke func_(i, values...) serially for each i in
  // [0, shape_), then forward the values unchanged to the wrapped receiver.
  template <class... As>
  friend void tag_invoke(set_value_t, type&& self, As&&... as) noexcept {
    MMDEPLOY_DEBUG("fallback Bulk implementation");
    for (Shape i = 0; i < self.shape_; ++i) {
      self.func_(i, as...);
    }
    SetValue(std::move(self.receiver_), (As &&) as...);
  }
};
template <typename CvrefSender, typename Shape, typename Func, typename Receiver>
struct _Operation<CvrefSender, Shape, Func, Receiver>::type {
  // Operation state of the upstream sender connected to the bulk receiver.
  connect_result_t<CvrefSender, receiver_t<Receiver, Shape, Func>> op_state2_;
  friend void tag_invoke(start_t, type& self) { Start(self.op_state2_); }
};

// Sender returned by the fallback Bulk implementation.
template <typename Sender, typename Shape, typename Func>
struct _Sender {
  struct type;
};
template <typename Sender, typename Shape, typename Func>
using sender_t = typename _Sender<remove_cvref_t<Sender>, remove_cvref_t<Shape>, Func>::type;

template <typename Sender, typename Shape, typename Func>
struct _Sender<Sender, Shape, Func>::type {
  // Bulk forwards upstream values unchanged, so the completion signatures are
  // those of the wrapped sender.
  using value_types = completion_signatures_of_t<Sender>;
  template <typename Receiver>
  using _receiver_t = receiver_t<Receiver, Shape, Func>;
  Sender sender_;
  Shape shape_;
  Func func_;
  // Connect by wrapping the downstream receiver, preserving this sender's
  // value category via _copy_cvref_t.
  template <typename Self, typename Receiver, _decays_to<Self, type, int> = 0>
  friend auto tag_invoke(connect_t, Self&& self, Receiver&& receiver)
      -> Operation<_copy_cvref_t<Self, Sender>, Shape, Func, Receiver> {
    return {Connect(((Self &&) self).sender_,
                    _receiver_t<Receiver>{(Receiver &&) receiver, ((Self &&) self).shape_,
                                          ((Self &&) self).func_})};
  }
};
// Customization point object for Bulk. Dispatch preference:
//   1. tag_invoke keyed on the sender's completion scheduler
//   2. tag_invoke on the sender itself
//   3. the generic fallback sender above (serial loop over the shape)
// The former file-scope `using std::enable_if_t;` leaked a name into this
// header's namespace; uses are qualified instead.
struct bulk_t {
  template <typename Sender, typename Shape, typename Func,
            std::enable_if_t<
                _is_sender<Sender> &&
                    _tag_invocable_with_completion_scheduler<bulk_t, Sender, Shape, Func>,
                int> = 0>
  auto operator()(Sender&& sender, Shape&& shape, Func func) const {
    // Delegate to the scheduler-specific implementation.
    auto scheduler = GetCompletionScheduler(sender);
    return tag_invoke(bulk_t{}, std::move(scheduler), (Sender &&) sender, (Shape &&) shape,
                      (Func &&) func);
  }
  template <typename Sender, typename Shape, typename Func,
            std::enable_if_t<
                _is_sender<Sender> &&
                    !_tag_invocable_with_completion_scheduler<bulk_t, Sender, Shape, Func> &&
                    tag_invocable<bulk_t, Sender, Shape, Func>,
                int> = 0>
  auto operator()(Sender&& sender, Shape&& shape, Func func) const {
    return tag_invoke(bulk_t{}, (Sender &&) sender, (Shape &&) shape, (Func &&) func);
  }
  template <typename Sender, typename Shape, typename Func,
            std::enable_if_t<
                _is_sender<Sender> &&
                    !_tag_invocable_with_completion_scheduler<bulk_t, Sender, Shape, Func> &&
                    !tag_invocable<bulk_t, Sender, Shape, Func>,
                int> = 0>
  auto operator()(Sender&& sender, Shape&& shape, Func func) const
      -> sender_t<Sender, Shape, Func> {
    return {(Sender &&) sender, (Shape &&) shape, std::move(func)};
  }
  // Pipeable form: sender | Bulk(shape, func).
  template <typename Shape, typename Func>
  _BinderBack<bulk_t, Shape, Func> operator()(Shape shape, Func fun) const {
    return {{}, {}, {shape, std::move(fun)}};
  }
};
} // namespace __bulk
using __bulk::bulk_t;
inline constexpr bulk_t Bulk{};
} // namespace mmdeploy
#endif // MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_BULK_H_
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/brycelelbach/wg21_p2300_std_execution/blob/main/include/execution.hpp
#include <utility>
#include "concepts.h"
#include "utility.h"
#ifndef MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_CLOSURE_H_
#define MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_CLOSURE_H_
namespace mmdeploy {
namespace __closure {
template <class D>
struct SenderAdaptorClosure;
} // namespace __closure
using __closure::SenderAdaptorClosure;
namespace __closure {
// Composition of two sender adaptor closures: (sender | compose) applies t0_
// first, then t1_.
template <typename T0, typename T1>
struct _Compose : SenderAdaptorClosure<_Compose<T0, T1>> {
  T0 t0_;
  T1 t1_;
  // Rvalue overload: moves the stored closures into the calls.
  template <typename Sender, std::enable_if_t<_is_sender<Sender>, int> = 0>
  std::invoke_result_t<T1, std::invoke_result_t<T0, Sender>> operator()(Sender&& sender) && {
    return ((T1 &&) t1_)(((T0 &&) t0_)((Sender &&) sender));
  }
  // Const-lvalue overload: invokes the stored closures as const lvalues.
  template <typename Sender, std::enable_if_t<_is_sender<Sender>, int> = 0>
  std::invoke_result_t<T1, std::invoke_result_t<T0, Sender>> operator()(Sender&& sender) const& {
    return t1_(t0_((Sender &&) sender));
  }
};
// CRTP opt-in tag: deriving from SenderAdaptorClosure<D> enables the
// operator| overloads below for D.
template <typename D>
struct SenderAdaptorClosure {};

// closure | closure -> composed closure (left applied first).
template <typename T0, typename T1,
          typename = std::enable_if_t<
              std::is_base_of_v<SenderAdaptorClosure<remove_cvref_t<T0>>, remove_cvref_t<T0>> &&
              std::is_base_of_v<SenderAdaptorClosure<remove_cvref_t<T1>>, remove_cvref_t<T1>>>>
_Compose<remove_cvref_t<T0>, remove_cvref_t<T1>> operator|(T0&& t0, T1&& t1) {
  return {(T0 &&) t0, (T1 &&) t1};
}

// sender | closure -> applies the closure to the sender.
template <typename Sender, typename Closure,
          typename = std::enable_if_t<
              _is_sender<Sender> && std::is_base_of_v<SenderAdaptorClosure<remove_cvref_t<Closure>>,
                                                      remove_cvref_t<Closure>>>>
std::invoke_result_t<Closure, Sender> operator|(Sender&& sender, Closure&& closure) {
  return ((Closure &&) closure)((Sender &&) sender);
}
// Partial application of func_ with trailing arguments as_; calling it with a
// sender yields func_(sender, as...). This is what makes `sender | X(args)`
// work for pipeable adaptors.
template <typename Func, typename... As>
struct _BinderBack : SenderAdaptorClosure<_BinderBack<Func, As...>> {
  Func func_;
  std::tuple<As...> as_;
  // Rvalue overload: moves the bound arguments out of the tuple.
  template <typename Sender, std::enable_if_t<_is_sender<Sender>, int> = 0>
  std::invoke_result_t<Func, Sender, As...> operator()(Sender&& sender) && {
    return std::apply(
        [&sender, this](As&... as) { return ((Func &&) func_)((Sender &&) sender, (As &&) as...); },
        as_);
  }
  // Const-lvalue overload: passes the bound arguments by const reference.
  template <typename Sender, std::enable_if_t<_is_sender<Sender>, int> = 0>
  std::invoke_result_t<Func, Sender, As...> operator()(Sender&& sender) const& {
    return std::apply([&sender, this](const As&... as) { return func_((Sender &&) sender, as...); },
                      as_);
  }
};
} // namespace __closure
using __closure::_BinderBack;
} // namespace mmdeploy
#endif // MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_CLOSURE_H_
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_CONCEPTS_H_
#define MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_CONCEPTS_H_
#include "tag_invoke.h"
namespace mmdeploy {
namespace _get_completion_signatures {
// Compile-time probe: returns identity<typename Sender::value_types>. Only
// the type matters; the returned object is empty.
struct get_completion_signatures_t {
  template <typename Sender, typename ValueTypes = typename remove_cvref_t<Sender>::value_types>
  constexpr identity<ValueTypes> operator()(Sender&& sender) const noexcept {
    return {};
  }
};
}  // namespace _get_completion_signatures
using _get_completion_signatures::get_completion_signatures_t;
inline constexpr get_completion_signatures_t GetCompletionSignatures{};

// A sender is anything exposing value_types that is move-constructible.
template <typename Sender>
inline constexpr bool _is_sender = std::is_invocable_v<get_completion_signatures_t, Sender>&&
    std::is_move_constructible_v<remove_cvref_t<Sender>>;

// GetCompletionSignatures is expected to return identity<std::tuple<Types...>>;
template <typename Sender>
using completion_signatures_of_t =
    typename std::invoke_result_t<get_completion_signatures_t, Sender>::type;
namespace _set_value {
// CPO delivering a value completion to a receiver; the receiver's tag_invoke
// overload is required (by the static_assert) to be noexcept.
struct set_value_t {
  template <typename Receiver, typename... Args,
            std::enable_if_t<is_tag_invocable_v<set_value_t, Receiver, Args...>, int> = 0>
  void operator()(Receiver&& receiver, Args&&... args) const noexcept {
    static_assert(is_nothrow_tag_invocable_v<set_value_t, Receiver, Args...>);
    (void)tag_invoke(set_value_t{}, (Receiver &&) receiver, (Args &&) args...);
  }
};
}  // namespace _set_value
using _set_value::set_value_t;
inline constexpr set_value_t SetValue{};
namespace _start {
// CPO that starts an operation state via tag_invoke(start_t, op_state).
struct start_t {
  template <typename Operation, std::enable_if_t<tag_invocable<start_t, Operation&>, int> = 0>
  void operator()(Operation& op_state) const
      noexcept(is_nothrow_tag_invocable_v<start_t, Operation&>) {
    (void)tag_invoke(start_t{}, op_state);
  }
};
}  // namespace _start
using _start::start_t;
inline constexpr start_t Start{};
namespace _connect {
// CPO connecting a sender to a receiver, yielding an operation state.
struct connect_t {
  template <typename Sender, typename Receiver,
            std::enable_if_t<is_tag_invocable_v<connect_t, Sender, Receiver>, int> = 0>
  auto operator()(Sender&& sender, Receiver&& receiver) const
      -> tag_invoke_result_t<connect_t, Sender, Receiver> {
    return tag_invoke(connect_t{}, (Sender &&) sender, (Receiver &&) receiver);
  }
};
}  // namespace _connect
using _connect::connect_t;
inline constexpr connect_t Connect{};
namespace _get_completion_scheduler {
// CPO returning the scheduler a sender completes on, for senders that
// advertise one via tag_invoke.
struct get_completion_scheduler_t {
  template <
      typename Sender,
      std::enable_if_t<is_tag_invocable_v<get_completion_scheduler_t, const Sender&>, int> = 0>
  auto operator()(const Sender& sender) const noexcept
      -> tag_invoke_result_t<get_completion_scheduler_t, const Sender&> {
    return tag_invoke(get_completion_scheduler_t{}, sender);
  }
};
}  // namespace _get_completion_scheduler
using _get_completion_scheduler::get_completion_scheduler_t;
inline constexpr get_completion_scheduler_t GetCompletionScheduler{};
// True when GetCompletionScheduler(sender) is well-formed.
template <typename Sender>
inline constexpr bool _has_completion_scheduler_v =
    std::is_invocable_v<get_completion_scheduler_t, Sender>;
template <typename Sender>
struct _has_completion_scheduler : std::bool_constant<_has_completion_scheduler_v<Sender>> {};
template <typename Sender>
using _completion_scheduler_for = std::invoke_result_t<get_completion_scheduler_t, Sender>;

namespace impl {
// Detects tag_invoke(Func, completion_scheduler, sender, Args...); false
// whenever the sender has no completion scheduler at all.
template <typename Func, typename Sender, typename TArgs, typename SFINAE = void>
struct _tag_invocable_with_completion_scheduler : std::false_type {};
template <typename Func, typename Sender, typename... Args>
struct _tag_invocable_with_completion_scheduler<
    Func, Sender, std::tuple<Args...>, std::enable_if_t<_has_completion_scheduler_v<Sender>>>
    : is_tag_invocable<Func, _completion_scheduler_for<Sender>, Sender, Args...> {};
}  // namespace impl
template <typename Func, typename Sender, typename... Args>
inline constexpr bool _tag_invocable_with_completion_scheduler =
    impl::_tag_invocable_with_completion_scheduler<Func, Sender, std::tuple<Args...>>::value;
// Detects whether T supports std::begin/std::end (i.e. is usable in a
// range-based for loop).
template <typename T, typename = void>
struct _is_range : std::false_type {};

// Specialization chosen when both std::begin(t) and std::end(t) are
// well-formed expressions.
template <typename T>
struct _is_range<
    T, std::void_t<decltype(std::begin(std::declval<T>())), decltype(std::end(std::declval<T>()))>>
    : std::true_type {};

template <typename T>
inline constexpr bool _is_range_v = _is_range<T>::value;
} // namespace mmdeploy
#endif // MMDEPLOY_CSRC_EXPERIMENTAL_EXECUTION_CONCEPTS_H_
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef MMDEPLOY_CSRC_EXECUTION_DYNAMIC_BATCH_H_
#define MMDEPLOY_CSRC_EXECUTION_DYNAMIC_BATCH_H_
#include <atomic>
#include "mmdeploy/execution/then.h"
#include "mmdeploy/execution/utility.h"
namespace mmdeploy {
namespace _dynamic_batch {
// CPO for batch-aware execution of a function over a sender. Schedulers may
// customize it via tag_invoke (preferably keyed on their completion
// scheduler); without a customization it degrades to Then(sender, func).
struct dynamic_batch_t {
  // Type-erased per-context state; destroy_ acts as a virtual destructor.
  struct context_base_t {
    void (*destroy_)(context_base_t*);
  };
  // Holder for scheduler-specific batching state, installed atomically and
  // torn down through destroy_ when the context dies.
  struct context_t {
    std::atomic<context_base_t*> base{};
    ~context_t() {
      if (auto p = base.load()) {
        p->destroy_(p);
      }
    }
  };
  // 1) Customization registered on the sender's completion scheduler.
  template <typename Sender, typename Func,
            std::enable_if_t<
                _tag_invocable_with_completion_scheduler<dynamic_batch_t, Sender, context_t&, Func>,
                int> = 0>
  auto operator()(Sender&& sender, context_t& context, Func func) const {
    auto scheduler = GetCompletionScheduler(sender);
    return tag_invoke(*this, std::move(scheduler), (Sender &&) sender, context, std::move(func));
  }
  // 2) Customization on the sender itself.
  template <typename Sender, typename Func,
            std::enable_if_t<!_tag_invocable_with_completion_scheduler<dynamic_batch_t, Sender,
                                                                       context_t&, Func> &&
                                 tag_invocable<dynamic_batch_t, Sender, context_t&, Func>,
                             int> = 0>
  auto operator()(Sender&& sender, context_t& context, Func func) const {
    return tag_invoke(*this, (Sender &&) sender, context, std::move(func));
  }
  // 3) Fallback: no batching support -- ignore the context, just run func.
  template <typename Sender, typename Context, typename Func,
            std::enable_if_t<
                !_tag_invocable_with_completion_scheduler<dynamic_batch_t, Sender, Context, Func> &&
                    !tag_invocable<dynamic_batch_t, Sender, Context, Func>,
                int> = 0>
  auto operator()(Sender&& sender, Context&&, Func func) const {
    return Then((Sender &&) sender, std::move(func));
  }
};
} // namespace _dynamic_batch
using _dynamic_batch::dynamic_batch_t;
inline constexpr dynamic_batch_t DynamicBatch{};
} // namespace mmdeploy
#endif // MMDEPLOY_CSRC_EXECUTION_DYNAMIC_BATCH_H_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment