jerrrrry / infinicore · Commits · 784139b9

Unverified commit 784139b9, authored Feb 13, 2026 by thatPepe, committed by GitHub on Feb 13, 2026.

Merge pull request #990 from InfiniTensor/demo131

Demo-131 Cuda graph with optimized paged attention

Parents: 3c8fb3c0, 1d6527cb
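
The merge title refers to CUDA graph capture for the paged-attention path; the files shown on this page of the diff cover the new embedding operator and device-dispatch changes rather than the graph code itself. As background only, a minimal capture-and-replay sketch using the standard CUDA runtime API (hypothetical helper names, not code from this PR):

    #include <cuda_runtime.h>

    // Capture a fixed sequence of kernel launches once, then replay it with a
    // single cudaGraphLaunch per step to cut per-kernel launch overhead.
    cudaError_t buildAndRunGraph(cudaStream_t stream, void (*enqueueWork)(cudaStream_t)) {
        cudaGraph_t graph = nullptr;
        cudaGraphExec_t exec = nullptr;

        // Everything issued on `stream` between Begin/EndCapture becomes a graph node.
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        enqueueWork(stream); // e.g. attention + causal-softmax kernels
        cudaStreamEndCapture(stream, &graph);

        // Instantiate once, replay many times (CUDA 12 signature; older toolkits
        // take error-node/log-buffer arguments instead of a flags value).
        cudaGraphInstantiate(&exec, graph, 0);
        cudaGraphLaunch(exec, stream);
        cudaStreamSynchronize(stream);

        cudaGraphExecDestroy(exec);
        cudaGraphDestroy(graph);
        return cudaGetLastError();
    }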
Showing 20 changed files with 1317 additions and 12 deletions.
src/infiniop/ops/causal_softmax/bang/causal_softmax_bang.mlu (+1 / -1)
src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h (+1 / -1)
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu (+9 / -5)
src/infiniop/ops/causal_softmax/operator.cc (+13 / -1)
src/infiniop/ops/clip/operator.cc (+13 / -1)
src/infiniop/ops/conv/operator.cc (+13 / -1)
src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu (+1 / -1)
src/infiniop/ops/dequantize_awq/operator.cc (+17 / -1)
src/infiniop/ops/embedding/cpu/embedding_cpu.cc (+109 / -0)
src/infiniop/ops/embedding/cpu/embedding_cpu.h (+8 / -0)
src/infiniop/ops/embedding/cuda/embedding_kernel.cuh (+116 / -0)
src/infiniop/ops/embedding/embedding.h (+54 / -0)
src/infiniop/ops/embedding/metax/embedding_metax.cuh (+8 / -0)
src/infiniop/ops/embedding/metax/embedding_metax.maca (+217 / -0)
src/infiniop/ops/embedding/moore/embedding_moore.h (+8 / -0)
src/infiniop/ops/embedding/moore/embedding_moore.mu (+227 / -0)
src/infiniop/ops/embedding/moore/embedding_moore_kernel.h (+116 / -0)
src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu (+224 / -0)
src/infiniop/ops/embedding/nvidia/embedding_nvidia.cuh (+8 / -0)
src/infiniop/ops/embedding/operator.cc (+154 / -0)
src/infiniop/ops/causal_softmax/bang/causal_softmax_bang.mlu

@@ -131,7 +131,7 @@ void causalSoftmaxUnion(void *workspace, int core_per_cluster, int cluster_count
     kernel_dim.x = core_per_cluster;
     kernel_dim.y = cluster_count;
     kernel_dim.z = 1;
-    kernel_type = CNRT_FUNC_TYPE_UNION1;
+    kernel_type = cnrtFuncTypeUnion1;
     // Launch kernel
     causalSoftmax<T><<<kernel_dim, kernel_type, queue>>>(
src/infiniop/ops/causal_softmax/moore/causal_softmax_kernel.h

@@ -28,7 +28,7 @@ __device__ void causalSoftmaxKernel(
     // 1 | * * * ... * *     |
     // 2 | * * * ... * * *   |
     // height: 3    col_id->
-    if (width + blockIdx.x >= threadIdx.x + height) {
+    if (width + blockIdx.x >= col + height) {
         if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
             /*
              * MUSA does not support CUDA's native `hexp` function.
src/infiniop/ops/causal_softmax/nvidia/causal_softmax_nvidia.cu

@@ -76,7 +76,15 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                      const void *x,
                                      void *stream_) const {
     cudaStream_t stream = (cudaStream_t)stream_;
-    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
+            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
+            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_2048) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_2048>(
+            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
+            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
             y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
             _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));

@@ -84,10 +92,6 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
             y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
             _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
-    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
-        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
-            y, x, _info.dtype, _info.batch_size, _info.seq_len, _info.total_seq_len,
-            _info.y_stride_b, _info.y_stride_i, _info.x_stride_b, _info.x_stride_i, stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }
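
These hunks widen the maxThreadsPerBlock dispatch so that devices reporting 4096- or 2048-thread blocks (the non-NVIDIA backends routed through this file) hit a matching launchKernel specialization before falling back to 1024/512, and the old trailing 4096 branch is removed. A standalone sketch of the same compile-time dispatch pattern, with illustrative names and stock-CUDA block sizes (1024/512), not code from the PR:

    #include <cuda_runtime.h>

    // Hypothetical sketch: pick a compile-time block size that matches the
    // device's maxThreadsPerBlock, so each specialization knows its blockDim.
    template <unsigned int BLOCK_SIZE>
    __global__ void scaleKernel(float *y, const float *x, size_t n) {
        size_t i = static_cast<size_t>(blockIdx.x) * BLOCK_SIZE + threadIdx.x;
        if (i < n) {
            y[i] = 2.0f * x[i];
        }
    }

    template <unsigned int BLOCK_SIZE>
    cudaError_t launchScale(float *y, const float *x, size_t n, cudaStream_t stream) {
        unsigned int grid = static_cast<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
        scaleKernel<BLOCK_SIZE><<<grid, BLOCK_SIZE, 0, stream>>>(y, x, n);
        return cudaGetLastError();
    }

    cudaError_t launchScaleForDevice(float *y, const float *x, size_t n,
                                     int maxThreadsPerBlock, cudaStream_t stream) {
        // Dispatch on the device limit, largest specialization first.
        if (maxThreadsPerBlock >= 1024) {
            return launchScale<1024>(y, x, n, stream);
        } else if (maxThreadsPerBlock >= 512) {
            return launchScale<512>(y, x, n, stream);
        }
        return cudaErrorInvalidConfiguration;
    }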
src/infiniop/ops/causal_softmax/operator.cc

@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/causal_softmax_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
 #include "nvidia/causal_softmax_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API

@@ -48,6 +48,9 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
 #ifdef ENABLE_ILUVATAR_API
     CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -90,6 +93,9 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
 #ifdef ENABLE_ILUVATAR_API
     GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -137,6 +143,9 @@ __C infiniStatus_t infiniopCausalSoftmax(
 #ifdef ENABLE_ILUVATAR_API
     CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -179,6 +188,9 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
 #ifdef ENABLE_ILUVATAR_API
     DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    DESTROY(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     DESTROY(INFINI_DEVICE_QY, nvidia);
 #endif
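
The same three-line #ifdef ENABLE_ALI_API block recurs in the clip, conv and dequantize_awq operators below. CREATE/GET/CALCULATE/DESTROY (and DELETE) are local switch-case macros, so each added block simply routes the INFINI_DEVICE_ALI device type to the shared nvidia backend. A rough expansion of the added CREATE case is shown here; the descriptor argument names are assumed, while the macro shape itself mirrors the CREATE macro visible verbatim in embedding/operator.cc further down:

    // Assumed expansion of `CREATE(INFINI_DEVICE_ALI, nvidia);` inside the device switch;
    // the y_desc/x_desc argument names are illustrative, not taken from this file.
    case INFINI_DEVICE_ALI:
        return op::causal_softmax::nvidia::Descriptor::create(
            handle,
            reinterpret_cast<op::causal_softmax::nvidia::Descriptor **>(desc_ptr),
            y_desc, x_desc);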
src/infiniop/ops/clip/operator.cc

@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/clip_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/clip_nvidia.cuh"
 #endif
 #ifdef ENABLE_METAX_API

@@ -42,6 +42,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
 #ifdef ENABLE_ILUVATAR_API
     CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -76,6 +79,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
 #ifdef ENABLE_ILUVATAR_API
     GET(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -118,6 +124,9 @@ __C infiniStatus_t infiniopClip(
 #ifdef ENABLE_ILUVATAR_API
     CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif

@@ -154,6 +163,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
 #ifdef ENABLE_ILUVATAR_API
     DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
src/infiniop/ops/conv/operator.cc

@@ -5,7 +5,7 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/conv_cpu.h"
 #endif
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/conv_nvidia.cuh"
 #endif

@@ -45,6 +45,9 @@ __C __export infiniStatus_t infiniopCreateConvDescriptor(infiniopHandle_t handle
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

@@ -76,6 +79,9 @@ infiniopGetConvWorkspaceSize(
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

@@ -115,6 +121,9 @@ __C infiniStatus_t infiniopConv(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

@@ -142,6 +151,9 @@ infiniopDestroyConvDescriptor(infiniopConvDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
src/infiniop/ops/dequantize_awq/nvidia/dequantize_w42f16_nvidia.cu

-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "../../../devices/nvidia/nvidia_handle.cuh"
 #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
src/infiniop/ops/dequantize_awq/operator.cc

@@ -2,7 +2,7 @@
 #include "../../handle.h"
 #include "infiniop/ops/dequantize_awq.h"
-#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API) || defined(ENABLE_ALI_API)
 #include "nvidia/dequantize_w42f16_nvidia.cuh"
 #endif
 #ifdef ENABLE_MOORE_API

@@ -43,6 +43,10 @@ __C infiniStatus_t infiniopCreateDequantizeAWQDescriptor(
 #ifdef ENABLE_QY_API
     CREATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CREATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }

@@ -70,6 +74,10 @@ __C infiniStatus_t infiniopGetDequantizeAWQWorkspaceSize(infiniopDequantizeAWQDe
 #ifdef ENABLE_QY_API
     GET(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    GET(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }

@@ -104,6 +112,10 @@ __C infiniStatus_t infiniopDequantizeAWQ(
 #ifdef ENABLE_QY_API
     CALCULATE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    CALCULATE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }

@@ -132,6 +144,10 @@ infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc) {
 #ifdef ENABLE_QY_API
     DELETE(INFINI_DEVICE_QY, nvidia);
 #endif
+#ifdef ENABLE_ALI_API
+    DELETE(INFINI_DEVICE_ALI, nvidia);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
src/infiniop/ops/embedding/cpu/embedding_cpu.cc (new file)

#include "embedding_cpu.h"
#include "../../../../utils.h"
#include "../../../handle.h"
#include "../../../tensor.h"
#include <cstring>

namespace op::embedding::cpu {

struct Descriptor::Opaque {};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc) {
    auto input_shape = input_desc->shape();
    auto weight_shape = weight_desc->shape();
    CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
    auto output_shape = output_desc->shape();
    size_t embedding_dim = weight_shape[1];
    CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < input_shape.size(); ++i) {
        CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    auto input_dtype = input_desc->dtype();
    auto weight_dtype = weight_desc->dtype();
    CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
    size_t num_indices = 1;
    for (auto dim : input_shape) {
        num_indices *= dim;
    }
    size_t vocab_size = weight_shape[0];
    *desc_ptr = new Descriptor(
        num_indices,
        embedding_dim,
        vocab_size,
        input_dtype,
        weight_dtype,
        new Opaque{},
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *output,
    const void *input,
    const void *weight,
    void *stream) const {
    if (_num_indices == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    size_t element_size = infiniSizeOf(_weight_dtype);
    size_t row_bytes = _embedding_dim * element_size;
    if (_input_dtype == INFINI_DTYPE_I32) {
        const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
        const std::byte *weight_ptr = reinterpret_cast<const std::byte *>(weight);
        std::byte *out_ptr = reinterpret_cast<std::byte *>(output);
        for (size_t i = 0; i < _num_indices; ++i) {
            int32_t idx = indices_ptr[i];
            if (idx >= 0 && static_cast<size_t>(idx) < _vocab_size) {
                std::memcpy(out_ptr + i * row_bytes,
                            weight_ptr + static_cast<size_t>(idx) * row_bytes,
                            row_bytes);
            }
        }
    } else if (_input_dtype == INFINI_DTYPE_I64) {
        const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
        const std::byte *weight_ptr = reinterpret_cast<const std::byte *>(weight);
        std::byte *out_ptr = reinterpret_cast<std::byte *>(output);
        for (size_t i = 0; i < _num_indices; ++i) {
            int64_t idx = indices_ptr[i];
            if (idx >= 0 && static_cast<size_t>(idx) < _vocab_size) {
                std::memcpy(out_ptr + i * row_bytes,
                            weight_ptr + static_cast<size_t>(idx) * row_bytes,
                            row_bytes);
            }
        }
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::embedding::cpu
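
The CPU path above is a plain row gather: each index selects one weight row, which is copied byte-for-byte into the corresponding output row, and out-of-range indices are silently skipped. A self-contained host-side illustration of those semantics (toy sizes, not part of the PR):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        const size_t vocab_size = 4, embedding_dim = 3;
        // Weight row r holds the value r in every column.
        std::vector<float> weight = {
            0, 0, 0,   // row 0
            1, 1, 1,   // row 1
            2, 2, 2,   // row 2
            3, 3, 3};  // row 3
        std::vector<int32_t> indices = {2, 0, 7}; // 7 is out of range
        std::vector<float> output(indices.size() * embedding_dim, -1.0f);

        const size_t row_bytes = embedding_dim * sizeof(float);
        for (size_t i = 0; i < indices.size(); ++i) {
            int32_t idx = indices[i];
            if (idx >= 0 && static_cast<size_t>(idx) < vocab_size) {
                std::memcpy(output.data() + i * embedding_dim,
                            weight.data() + static_cast<size_t>(idx) * embedding_dim,
                            row_bytes);
            } // out-of-range index: output row keeps its previous contents
        }
        for (float v : output) {
            std::printf("%.0f ", v); // prints: 2 2 2 0 0 0 -1 -1 -1
        }
        std::printf("\n");
        return 0;
    }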
src/infiniop/ops/embedding/cpu/embedding_cpu.h (new file)

#ifndef __EMBEDDING_CPU_H__
#define __EMBEDDING_CPU_H__

#include "../embedding.h"

DESCRIPTOR(cpu)

#endif // __EMBEDDING_CPU_H__
src/infiniop/ops/embedding/cuda/embedding_kernel.cuh (new file)

#ifndef __EMBEDDING_CUDA_KERNEL_CUH__
#define __EMBEDDING_CUDA_KERNEL_CUH__

#include <type_traits>

// Helper function to check memory alignment
__forceinline__ __device__ bool is_aligned(const void *ptr, size_t alignment) {
    // Use size_t for pointer arithmetic in device code (more compatible)
    return (reinterpret_cast<size_t>(ptr) % alignment == 0);
}

// Vectorized copy for float type using float4
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat4(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float4 for vectorized access (16 bytes, 4 floats)
    const float4 *src_vec = reinterpret_cast<const float4 *>(src);
    float4 *dst_vec = reinterpret_cast<float4 *>(dst);
    size_t vec_count = embedding_dim / 4;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining elements
    size_t remaining = embedding_dim % 4;
    if (remaining > 0) {
        size_t offset = vec_count * 4;
        for (size_t i = 0; i < remaining; ++i) {
            dst[offset + i] = __ldg(&src[offset + i]);
        }
    }
}

// Vectorized copy for float type using float2 (fallback when not aligned to 16 bytes)
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat2(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float2 for vectorized access (8 bytes, 2 floats)
    const float2 *src_vec = reinterpret_cast<const float2 *>(src);
    float2 *dst_vec = reinterpret_cast<float2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Vectorized copy for half type using half2
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedHalf2(
    half *__restrict__ dst,
    const half *__restrict__ src,
    size_t embedding_dim) {
    // Use half2 for vectorized access (4 bytes, 2 halfs)
    const half2 *src_vec = reinterpret_cast<const half2 *>(src);
    half2 *dst_vec = reinterpret_cast<half2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Vectorized copy for bfloat16 type using bfloat162
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedBFloat162(
    cuda_bfloat16 *__restrict__ dst,
    const cuda_bfloat16 *__restrict__ src,
    size_t embedding_dim) {
    // Use bfloat162 for vectorized access (4 bytes, 2 bfloat16s)
    const cuda_bfloat162 *src_vec = reinterpret_cast<const cuda_bfloat162 *>(src);
    cuda_bfloat162 *dst_vec = reinterpret_cast<cuda_bfloat162 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy using __ldg for read-only weight
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = __ldg(&src_vec[i]);
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = __ldg(&src[embedding_dim - 1]);
    }
}

// Scalar copy fallback with __ldg optimization
template <typename T, typename IndexType>
__forceinline__ __device__ void copyScalar(
    T *__restrict__ dst,
    const T *__restrict__ src,
    size_t embedding_dim) {
    // Scalar copy with __ldg for read-only weight
    for (size_t i = 0; i < embedding_dim; ++i) {
        dst[i] = __ldg(&src[i]);
    }
}

#endif // __EMBEDDING_CUDA_KERNEL_CUH__
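
The float4 path above is only taken when both pointers pass is_aligned(ptr, 16), because a float4 load or store requires a 16-byte-aligned address; misaligned rows fall back to float2 or scalar copies. A tiny host-side illustration of that alignment rule (assumption-level demo, not PR code):

    #include <cstdint>
    #include <cstdio>

    // Host-side twin of the device helper: true when ptr is a multiple of alignment.
    static bool is_aligned_host(const void *ptr, size_t alignment) {
        return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
    }

    int main() {
        alignas(16) float buf[8];
        std::printf("buf     aligned to 16: %d\n", is_aligned_host(buf, 16));     // 1
        std::printf("buf + 1 aligned to 16: %d\n", is_aligned_host(buf + 1, 16)); // 0 (offset 4 bytes)
        std::printf("buf + 4 aligned to 16: %d\n", is_aligned_host(buf + 4, 16)); // 1 (offset 16 bytes)
        return 0;
    }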
src/infiniop/ops/embedding/embedding.h (new file)

#ifndef __EMBEDDING_H__
#define __EMBEDDING_H__
#include "../../../utils.h"
#include "../../operator.h"
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::embedding::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
size_t _num_indices; \
size_t _embedding_dim; \
size_t _vocab_size; \
infiniDtype_t _input_dtype; \
infiniDtype_t _weight_dtype; \
\
Descriptor( \
size_t num_indices, \
size_t embedding_dim, \
size_t vocab_size, \
infiniDtype_t input_dtype, \
infiniDtype_t weight_dtype, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_num_indices(num_indices), \
_embedding_dim(embedding_dim), \
_vocab_size(vocab_size), \
_input_dtype(input_dtype), \
_weight_dtype(weight_dtype) {} \
\
public: \
~Descriptor(); \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \
infiniopTensorDescriptor_t input_desc, \
infiniopTensorDescriptor_t weight_desc); \
\
infiniStatus_t calculate( \
void *output, \
const void *input, \
const void *weight, \
void *stream) const; \
}; \
}
#endif // __EMBEDDING_H__
src/infiniop/ops/embedding/metax/embedding_metax.cuh (new file)

#ifndef __EMBEDDING_METAX_H__
#define __EMBEDDING_METAX_H__

#include "../embedding.h"

DESCRIPTOR(metax)

#endif // __EMBEDDING_METAX_H__
src/infiniop/ops/embedding/metax/embedding_metax.maca (new file)

#include "../../../../utils.h"
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../../../tensor.h"
#include "../cuda/embedding_kernel.cuh"
#include "embedding_metax.cuh"
template <typename T, typename IndexType>
INFINIOP_METAX_KERNEL embeddingKernel(
T *__restrict__ output,
const IndexType *__restrict__ indices,
const T *__restrict__ weight,
size_t num_indices,
size_t embedding_dim,
size_t vocab_size) {
// Calculate global thread index
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_indices) {
// Get the index value
IndexType index_val = __ldg(&indices[idx]);
// Bounds check - handle negative indices gracefully
if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
// Copy embedding vector from weight to output
const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
T *dst = output + idx * embedding_dim;
// Choose optimal copy strategy based on type and alignment
if constexpr (std::is_same_v<T, float>) {
// Check alignment for float4 (16 bytes)
bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
} else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
// Try float2 if not aligned to 16 bytes
copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, half>) {
// Use half2 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
// Use bfloat162 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else {
// Fallback to scalar copy with __ldg
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
}
}
}
namespace op::embedding::metax {
struct Descriptor::Opaque {
std::shared_ptr<device::metax::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
// Validate shapes
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Check output shape matches input shape + embedding_dim
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
// Validate dtypes
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 ||
weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
// Calculate number of indices (supporting batch dimension)
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
auto hc_stream = reinterpret_cast<hcStream_t>(stream);
// Dynamic block size optimization based on embedding_dim for Metax platform
size_t block_size = 256; // Default block size for Metax
if (_embedding_dim <= 64) {
block_size = 512; // Small embedding_dim: use larger block for better occupancy
} else if (_embedding_dim >= 1024) {
block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
}
size_t grid_size = (_num_indices + block_size - 1) / block_size;
// Launch kernel based on dtypes for Metax platform
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
// Use Metax's bfloat16 type
embeddingKernel<__hpcc_bfloat16, int32_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<__hpcc_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __hpcc_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
embeddingKernel<__hpcc_bfloat16, int64_t><<<grid_size, block_size, 0, hc_stream>>>(
reinterpret_cast<__hpcc_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __hpcc_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::metax
src/infiniop/ops/embedding/moore/embedding_moore.h (new file)

#ifndef __EMBEDDING_MOORE_H__
#define __EMBEDDING_MOORE_H__

#include "../embedding.h"

DESCRIPTOR(moore)

#endif // __EMBEDDING_MOORE_H__
src/infiniop/ops/embedding/moore/embedding_moore.mu (new file)

#include "../../../../utils.h"
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../../../tensor.h"
#include "embedding_moore_kernel.h"
#include "embedding_moore.h"
#include <musa_runtime.h>
template <typename T, typename IndexType>
INFINIOP_MOORE_KERNEL embeddingKernel(
T *__restrict__ output,
const IndexType *__restrict__ indices,
const T *__restrict__ weight,
size_t num_indices,
size_t embedding_dim,
size_t vocab_size) {
// Calculate global thread index
size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < num_indices) {
// Get the index value with Moore-optimized memory access
IndexType index_val = indices[idx];
// Bounds check - handle negative indices gracefully
if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
// Copy embedding vector from weight to output
const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
T *dst = output + idx * embedding_dim;
// Choose optimal copy strategy based on type and alignment
if constexpr (std::is_same_v<T, float>) {
// Check alignment for float4 (16 bytes)
bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
} else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
// Try float2 if not aligned to 16 bytes
copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, half>) {
// Use half2 for vectorized access
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
// Use mt_bfloat162 for vectorized access (Moore-specific type)
if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
} else {
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
} else {
// Fallback to scalar copy with Moore-optimized memory access
copyScalar<T, IndexType>(dst, src, embedding_dim);
}
}
}
}
namespace op::embedding::moore {
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
Descriptor::~Descriptor() {
delete _opaque;
}
infiniStatus_t Descriptor::create(
infiniopHandle_t handle,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t output_desc,
infiniopTensorDescriptor_t input_desc,
infiniopTensorDescriptor_t weight_desc) {
auto input_shape = input_desc->shape();
auto weight_shape = weight_desc->shape();
// Validate shapes
CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
// Check output shape matches input shape + embedding_dim
auto output_shape = output_desc->shape();
size_t embedding_dim = weight_shape[1];
CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
for (size_t i = 0; i < input_shape.size(); ++i) {
CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
}
// Validate dtypes
auto input_dtype = input_desc->dtype();
auto weight_dtype = weight_desc->dtype();
CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16, INFINI_STATUS_BAD_TENSOR_DTYPE);
CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
// Calculate number of indices (supporting batch dimension)
size_t num_indices = 1;
for (auto dim : input_shape) {
num_indices *= dim;
}
size_t vocab_size = weight_shape[0];
*desc_ptr = new Descriptor(
num_indices,
embedding_dim,
vocab_size,
input_dtype,
weight_dtype,
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *output,
const void *input,
const void *weight,
void *stream) const {
if (_num_indices == 0) {
return INFINI_STATUS_SUCCESS;
}
auto musa_stream = reinterpret_cast<musaStream_t>(stream);
// Dynamic block size optimization based on embedding_dim
// Moore platform typically has different performance characteristics
size_t block_size = 256; // Default for Moore
if (_embedding_dim <= 64) {
block_size = 512; // Small embedding_dim: use larger block for better occupancy
} else if (_embedding_dim >= 1024) {
block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
} else if (_embedding_dim <= 256) {
block_size = 384; // Medium embedding_dim: balanced configuration
}
size_t grid_size = (_num_indices + block_size - 1) / block_size;
// Launch kernel based on dtypes
// Note: Moore uses __mt_bfloat16 instead of __nv_bfloat16
if (_input_dtype == INFINI_DTYPE_I32) {
const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
// Use Moore's bfloat16 type
embeddingKernel<__mt_bfloat16, int32_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<__mt_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __mt_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else if (_input_dtype == INFINI_DTYPE_I64) {
const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
if (_weight_dtype == INFINI_DTYPE_F32) {
embeddingKernel<float, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<float *>(output),
indices_ptr,
reinterpret_cast<const float *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_F16) {
embeddingKernel<half, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<half *>(output),
indices_ptr,
reinterpret_cast<const half *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else if (_weight_dtype == INFINI_DTYPE_BF16) {
embeddingKernel<__mt_bfloat16, int64_t><<<grid_size, block_size, 0, musa_stream>>>(
reinterpret_cast<__mt_bfloat16 *>(output),
indices_ptr,
reinterpret_cast<const __mt_bfloat16 *>(weight),
_num_indices,
_embedding_dim,
_vocab_size);
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
} else {
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
// Check for kernel launch errors
musaError_t err = musaGetLastError();
if (err != musaSuccess) {
return INFINI_STATUS_INTERNAL_ERROR;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::embedding::moore
src/infiniop/ops/embedding/moore/embedding_moore_kernel.h (new file)

#ifndef __EMBEDDING_MOORE_KERNEL_CUH__
#define __EMBEDDING_MOORE_KERNEL_CUH__

#include <type_traits>

// Helper function to check memory alignment
__forceinline__ __device__ bool is_aligned(const void *ptr, size_t alignment) {
    // Use size_t for pointer arithmetic in device code (more compatible)
    return (reinterpret_cast<size_t>(ptr) % alignment == 0);
}

// Vectorized copy for float type using float4
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat4(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float4 for vectorized access (16 bytes, 4 floats)
    const float4 *src_vec = reinterpret_cast<const float4 *>(src);
    float4 *dst_vec = reinterpret_cast<float4 *>(dst);
    size_t vec_count = embedding_dim / 4;
    // Vectorized copy with __ldg equivalent for Moore platform
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining elements
    size_t remaining = embedding_dim % 4;
    if (remaining > 0) {
        size_t offset = vec_count * 4;
        for (size_t i = 0; i < remaining; ++i) {
            dst[offset + i] = src[offset + i];
        }
    }
}

// Vectorized copy for float type using float2 (fallback when not aligned to 16 bytes)
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedFloat2(
    float *__restrict__ dst,
    const float *__restrict__ src,
    size_t embedding_dim) {
    // Use float2 for vectorized access (8 bytes, 2 floats)
    const float2 *src_vec = reinterpret_cast<const float2 *>(src);
    float2 *dst_vec = reinterpret_cast<float2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy with Moore-optimized memory access
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Vectorized copy for half type using half2
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedHalf2(
    half *__restrict__ dst,
    const half *__restrict__ src,
    size_t embedding_dim) {
    // Use half2 for vectorized access (4 bytes, 2 halfs)
    const half2 *src_vec = reinterpret_cast<const half2 *>(src);
    half2 *dst_vec = reinterpret_cast<half2 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy optimized for Moore architecture
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Vectorized copy for Moore bfloat16 type using bfloat162
template <typename IndexType>
__forceinline__ __device__ void copyVectorizedBFloat162(
    __mt_bfloat16 *__restrict__ dst,
    const __mt_bfloat16 *__restrict__ src,
    size_t embedding_dim) {
    // Use mt_bfloat162 for vectorized access (4 bytes, 2 bfloat16s)
    const __mt_bfloat162 *src_vec = reinterpret_cast<const __mt_bfloat162 *>(src);
    __mt_bfloat162 *dst_vec = reinterpret_cast<__mt_bfloat162 *>(dst);
    size_t vec_count = embedding_dim / 2;
    // Vectorized copy with Moore-specific optimization
    for (size_t i = 0; i < vec_count; ++i) {
        dst_vec[i] = src_vec[i];
    }
    // Copy remaining element if odd
    if (embedding_dim % 2 != 0) {
        dst[embedding_dim - 1] = src[embedding_dim - 1];
    }
}

// Scalar copy fallback with Moore-optimized memory access
template <typename T, typename IndexType>
__forceinline__ __device__ void copyScalar(
    T *__restrict__ dst,
    const T *__restrict__ src,
    size_t embedding_dim) {
    // Scalar copy with Moore read-only weight optimization
    for (size_t i = 0; i < embedding_dim; ++i) {
        dst[i] = src[i];
    }
}

#endif // __EMBEDDING_MOORE_KERNEL_CUH__
src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu (new file)

#include "../../../../utils.h"
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../../../tensor.h"
#include "../cuda/embedding_kernel.cuh"
#include "embedding_nvidia.cuh"
#include <cuda_runtime.h>

template <typename T, typename IndexType>
INFINIOP_CUDA_KERNEL embeddingKernel(
    T *__restrict__ output,
    const IndexType *__restrict__ indices,
    const T *__restrict__ weight,
    size_t num_indices,
    size_t embedding_dim,
    size_t vocab_size) {
    // Calculate global thread index
    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_indices) {
        // Get the index value
        IndexType index_val = __ldg(&indices[idx]);
        // Bounds check - handle negative indices gracefully
        if (index_val >= 0 && static_cast<size_t>(index_val) < vocab_size) {
            // Copy embedding vector from weight to output
            const T *src = weight + static_cast<size_t>(index_val) * embedding_dim;
            T *dst = output + idx * embedding_dim;
            // Choose optimal copy strategy based on type and alignment
            if constexpr (std::is_same_v<T, float>) {
                // Check alignment for float4 (16 bytes)
                bool aligned_16 = is_aligned(src, 16) && is_aligned(dst, 16);
                if (aligned_16 && embedding_dim >= 4 && embedding_dim % 4 == 0) {
                    copyVectorizedFloat4<IndexType>(dst, src, embedding_dim);
                } else if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
                    // Try float2 if not aligned to 16 bytes
                    copyVectorizedFloat2<IndexType>(dst, src, embedding_dim);
                } else {
                    copyScalar<T, IndexType>(dst, src, embedding_dim);
                }
            } else if constexpr (std::is_same_v<T, half>) {
                // Use half2 for vectorized access
                if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
                    copyVectorizedHalf2<IndexType>(dst, src, embedding_dim);
                } else {
                    copyScalar<T, IndexType>(dst, src, embedding_dim);
                }
            } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
                // Use bfloat162 for vectorized access
                if (embedding_dim >= 2 && embedding_dim % 2 == 0) {
                    copyVectorizedBFloat162<IndexType>(dst, src, embedding_dim);
                } else {
                    copyScalar<T, IndexType>(dst, src, embedding_dim);
                }
            } else {
                // Fallback to scalar copy with __ldg
                copyScalar<T, IndexType>(dst, src, embedding_dim);
            }
        }
    }
}

namespace op::embedding::nvidia {

struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc) {
    auto input_shape = input_desc->shape();
    auto weight_shape = weight_desc->shape();
    // Validate shapes
    CHECK_OR_RETURN(weight_shape.size() == 2, INFINI_STATUS_BAD_TENSOR_SHAPE);
    CHECK_OR_RETURN(output_desc->shape().size() == input_shape.size() + 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
    // Check output shape matches input shape + embedding_dim
    auto output_shape = output_desc->shape();
    size_t embedding_dim = weight_shape[1];
    CHECK_OR_RETURN(output_shape.back() == embedding_dim, INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < input_shape.size(); ++i) {
        CHECK_OR_RETURN(output_shape[i] == input_shape[i], INFINI_STATUS_BAD_TENSOR_SHAPE);
    }
    // Validate dtypes
    auto input_dtype = input_desc->dtype();
    auto weight_dtype = weight_desc->dtype();
    CHECK_OR_RETURN(input_dtype == INFINI_DTYPE_I32 || input_dtype == INFINI_DTYPE_I64,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(weight_dtype == INFINI_DTYPE_F32 || weight_dtype == INFINI_DTYPE_F16 || weight_dtype == INFINI_DTYPE_BF16,
                    INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(output_desc->dtype() == weight_dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
    // Calculate number of indices (supporting batch dimension)
    size_t num_indices = 1;
    for (auto dim : input_shape) {
        num_indices *= dim;
    }
    size_t vocab_size = weight_shape[0];
    *desc_ptr = new Descriptor(
        num_indices,
        embedding_dim,
        vocab_size,
        input_dtype,
        weight_dtype,
        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *output,
    const void *input,
    const void *weight,
    void *stream) const {
    if (_num_indices == 0) {
        return INFINI_STATUS_SUCCESS;
    }
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
    // Dynamic block size optimization based on embedding_dim
    // Smaller embedding_dim benefits from larger block size (better occupancy)
    // Larger embedding_dim benefits from smaller block size (more registers per thread)
    size_t block_size = 256; // Default
    if (_embedding_dim <= 64) {
        block_size = 512; // Small embedding_dim: use larger block for better occupancy
    } else if (_embedding_dim >= 1024) {
        block_size = 128; // Large embedding_dim: use smaller block to reduce register pressure
    }
    size_t grid_size = (_num_indices + block_size - 1) / block_size;
    // Launch kernel based on dtypes
    if (_input_dtype == INFINI_DTYPE_I32) {
        const int32_t *indices_ptr = reinterpret_cast<const int32_t *>(input);
        if (_weight_dtype == INFINI_DTYPE_F32) {
            embeddingKernel<float, int32_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<float *>(output),
                indices_ptr,
                reinterpret_cast<const float *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else if (_weight_dtype == INFINI_DTYPE_F16) {
            embeddingKernel<half, int32_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<half *>(output),
                indices_ptr,
                reinterpret_cast<const half *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else if (_weight_dtype == INFINI_DTYPE_BF16) {
            embeddingKernel<cuda_bfloat16, int32_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<cuda_bfloat16 *>(output),
                indices_ptr,
                reinterpret_cast<const cuda_bfloat16 *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    } else if (_input_dtype == INFINI_DTYPE_I64) {
        const int64_t *indices_ptr = reinterpret_cast<const int64_t *>(input);
        if (_weight_dtype == INFINI_DTYPE_F32) {
            embeddingKernel<float, int64_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<float *>(output),
                indices_ptr,
                reinterpret_cast<const float *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else if (_weight_dtype == INFINI_DTYPE_F16) {
            embeddingKernel<half, int64_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<half *>(output),
                indices_ptr,
                reinterpret_cast<const half *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else if (_weight_dtype == INFINI_DTYPE_BF16) {
            embeddingKernel<cuda_bfloat16, int64_t><<<grid_size, block_size, 0, cuda_stream>>>(
                reinterpret_cast<cuda_bfloat16 *>(output),
                indices_ptr,
                reinterpret_cast<const cuda_bfloat16 *>(weight),
                _num_indices,
                _embedding_dim,
                _vocab_size);
        } else {
            return INFINI_STATUS_BAD_TENSOR_DTYPE;
        }
    } else {
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Check for kernel launch errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::embedding::nvidia
src/infiniop/ops/embedding/nvidia/embedding_nvidia.cuh (new file)

#ifndef __EMBEDDING_CUDA_H__
#define __EMBEDDING_CUDA_H__

#include "../embedding.h"

DESCRIPTOR(nvidia)

#endif // __EMBEDDING_CUDA_H__
src/infiniop/ops/embedding/operator.cc (new file)

#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/embedding.h"

#ifdef ENABLE_CPU_API
#include "cpu/embedding_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) || defined(ENABLE_ALI_API)
#include "nvidia/embedding_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/embedding_metax.cuh"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/embedding_moore.h"
#endif

__C infiniStatus_t infiniopCreateEmbeddingDescriptor(
    infiniopHandle_t handle,
    infiniopEmbeddingDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc) {

#define CREATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                    \
        return op::embedding::NAMESPACE::Descriptor::create(                      \
            handle,                                                               \
            reinterpret_cast<op::embedding::NAMESPACE::Descriptor **>(desc_ptr),  \
            output_desc,                                                          \
            input_desc,                                                           \
            weight_desc)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        CREATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CREATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__C infiniStatus_t infiniopEmbedding(
    infiniopEmbeddingDescriptor_t desc,
    void *output,
    const void *input,
    const void *weight,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                                   \
    case CASE:                                                                       \
        return reinterpret_cast<const op::embedding::NAMESPACE::Descriptor *>(desc)  \
            ->calculate(output, input, weight, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        CALCULATE(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__C infiniStatus_t infiniopDestroyEmbeddingDescriptor(
    infiniopEmbeddingDescriptor_t desc) {

#define DESTROY(CASE, NAMESPACE)                                                      \
    case CASE:                                                                        \
        delete reinterpret_cast<const op::embedding::NAMESPACE::Descriptor *>(desc);  \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DESTROY(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_ALI_API
        DESTROY(INFINI_DEVICE_ALI, nvidia);
#endif
#ifdef ENABLE_QY_API
        DESTROY(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_HYGON_API
        DESTROY(INFINI_DEVICE_HYGON, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DESTROY(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_MOORE_API
        DESTROY(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DESTROY
    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
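
Taken together, these three entry points give the usual create/run/destroy lifecycle for the new embedding operator. A hypothetical call sequence (handle creation, tensor descriptors, and device buffers are assumed to be set up beforehand in the usual infiniop way; not code from the PR):

    // Sketch of how a caller would drive the new API: create a descriptor from
    // the tensor descriptors, gather one weight row per index, then destroy it.
    infiniStatus_t runEmbedding(infiniopHandle_t handle,
                                infiniopTensorDescriptor_t output_desc,
                                infiniopTensorDescriptor_t input_desc,
                                infiniopTensorDescriptor_t weight_desc,
                                void *output, const void *input, const void *weight,
                                void *stream) {
        infiniopEmbeddingDescriptor_t desc = nullptr;
        infiniStatus_t status = infiniopCreateEmbeddingDescriptor(
            handle, &desc, output_desc, input_desc, weight_desc);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }
        // output[i, :] = weight[input[i], :] for every index in the input tensor.
        status = infiniopEmbedding(desc, output, input, weight, stream);
        infiniopDestroyEmbeddingDescriptor(desc);
        return status;
    }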