Commit eb34d4d6 authored by wooway777's avatar wooway777
Browse files

issue/900 - adapt to graph and adjust test script

parent 835209e7
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
namespace infinicore::op {
// Embedding lookup op: fills `out` with rows of `weight` selected by the
// indices in `input` (see the free functions embedding/embedding_ below).
class Embedding {
public:
// Kernel entry-point signature: (out, input, weight).
using schema = void (*)(Tensor, Tensor, Tensor);
// Runs the kernel registered for `out`'s device type.
static void execute(Tensor out, Tensor input, Tensor weight);
// Process-wide registry mapping device types to kernel implementations.
static common::OpDispatcher<schema> &dispatcher();
};
INFINICORE_GRAPH_OP_CLASS(Embedding, Tensor, const Tensor &, const Tensor &);
Tensor embedding(Tensor input, Tensor weight);
void embedding_(Tensor out, Tensor input, Tensor weight);
Tensor embedding(const Tensor &input, const Tensor &weight);
void embedding_(Tensor out, const Tensor &input, const Tensor &weight);
} // namespace infinicore::op
......@@ -5,27 +5,19 @@
#include <stdexcept>
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Embedding);
// Accessor for the per-process kernel registry. Function-local static
// (Meyers singleton): constructed on first use, thread-safe since C++11.
common::OpDispatcher<Embedding::schema> &Embedding::dispatcher() {
    static common::OpDispatcher<Embedding::schema> registry;
    return registry;
}
void Embedding::execute(Tensor out, Tensor input, Tensor weight) {
// Check that all tensors are on the same device
// This is critical: if input is on CPU while out/weight are on GPU,
// passing CPU pointer to CUDA kernel will cause memory access errors
// Graph-op node constructor: validates device placement, then dispatches the
// embedding kernel via the graph-op macro for out's device type.
Embedding::Embedding(Tensor out, const Tensor &input, const Tensor &weight) {
// All three tensors must live on the same device; mixing a CPU pointer into a
// GPU kernel launch would cause invalid memory accesses.
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
// Route to the implementation registered for out's device type.
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, input, weight);
}
// Set device context
infinicore::context::setDevice(out->device());
// Use dispatcher to lookup kernel (infiniop implementation)
dispatcher().lookup(out->device().getType())(out, input, weight);
// Public entry point. Per the macro name, this either records the op into a
// graph under construction or runs it eagerly (exact semantics defined by
// INFINICORE_GRAPH_OP_RECORD_OR_RUN elsewhere in the project).
void Embedding::execute(Tensor out, const Tensor &input, const Tensor &weight) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Embedding, out, input, weight);
}
Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the indices to extract
Tensor weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
Tensor embedding(const Tensor &input, // LongTensor of arbitrary shape containing the indices to extract
const Tensor &weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
) {
auto input_shape = input->shape();
auto weight_shape = weight->shape();
......@@ -40,7 +32,7 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i
return inputs_embeds;
}
void embedding_(Tensor out, Tensor input, Tensor weight) {
// In-place variant: writes the embedding result into the caller-provided
// `out` tensor instead of allocating a new one.
void embedding_(Tensor out, const Tensor &input, const Tensor &weight) {
Embedding::execute(out, input, weight);
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/ops/embedding.hpp"
#include <infiniop.h>
namespace infinicore::op::embedding_impl::infiniop {
// Per-thread bounded cache (capacity 100) of infiniop embedding descriptors.
// The eviction callback destroys the descriptor so no infiniop handles leak
// when an entry is dropped.
thread_local common::OpCache<size_t, infiniopEmbeddingDescriptor_t> caches(
100, // capacity
[](infiniopEmbeddingDescriptor_t &desc) {
// Eviction hook: release the descriptor exactly once, then null it out.
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyEmbeddingDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Embedding, 100);
void calculate(Tensor out, Tensor input, Tensor weight) {
// Everything captured at plan() time that run() needs later: the cached
// infiniop descriptor plus graph handles to the three participating tensors.
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor out, input, weight;
};
// Plan phase of the graph op: obtains (from the per-thread cache, creating on
// miss) an infiniop descriptor for this exact (out, input, weight) descriptor
// combination and packages it, with graph tensor handles, into a heap-
// allocated PlannedMeta. Ownership of the returned pointer passes to the
// graph runtime; run() consumes it and cleanup() frees it.
void *plan(Tensor out, const Tensor &input, const Tensor &weight) {
    // Cache key derived from the three tensor descriptors.
    size_t seed = hash_combine(out, input, weight);
    auto device = context::getDevice();
    auto &cache = caches.getCache(device);
    // Get-or-create the descriptor for `seed`; binds the result to
    // `descriptor` (macro defined in infiniop_impl.hpp).
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, Embedding,
        seed, out->desc(), input->desc(), weight->desc());
    // NOTE(review): removed two dead locals (`desc_opt`, `desc`) left over
    // from the pre-macro implementation; they were never read in this scope.
    auto planned = new PlannedMeta{
        descriptor,
        graph::GraphTensor(out),
        graph::GraphTensor(input),
        graph::GraphTensor(weight)};
    return planned;
}
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateEmbeddingDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(), input->desc(), weight->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
// Run phase of the graph op: executes the embedding kernel using the state
// prepared by plan(). Takes the opaque PlannedMeta pointer produced there.
// NOTE(review): the original span contained a malformed infiniopEmbedding
// call — old-version arguments (`desc`, bare `out->data()`, ...) interleaved
// with the new `planned->...` arguments by a bad merge; only the new
// arguments are kept here.
void run(void *planned_meta) {
    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopEmbedding(
        planned->descriptor->desc,
        planned->out->data(), planned->input->data(), planned->weight->data(),
        context::getStream()));
}
// Releases the PlannedMeta allocated by plan() and nulls the caller's slot so
// a repeated cleanup on the same slot becomes a harmless no-op.
void cleanup(void **planned_meta_ptr) {
    auto **meta = reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    delete *meta;
    *meta = nullptr;
}
// Self-registration at static-initialization time: installs `calculate` as
// the embedding kernel for all device types. The `false` flag is forwarded to
// registerAll; presumably "do not overwrite existing entries" — TODO confirm
// against OpDispatcher.
static bool registered = []() {
Embedding::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Embedding, &plan, &run, cleanup);
} // namespace infinicore::op::embedding_impl::infiniop
"""
测试 embedding 是否支持 CUDA Graph 录制
Test if embedding supports CUDA Graph recording
使用方法:
Usage:
python test/infinicore/nn/test_embedding_graph_recording.py
关键验证点:
1. 改动前:indices->to(cpu_device) 会触发同步的 D2H 拷贝,导致图录制失败
2. 改动后:使用设备端 CUDA kernel,完全异步,支持图录制
Key verification points:
1. Before modification: indices->to(cpu_device) triggers synchronous D2H copy, causing graph recording to fail
2. After modification: Uses device-side CUDA kernel, fully asynchronous, supports graph recording
预期结果:
- 改动前:图录制失败,设备端输入可能失败
- 改动后:图录制成功,设备端输入成功
Expected results:
- Before modification: Graph recording fails, device-side input may fail
- After modification: Graph recording succeeds, device-side input succeeds
"""
import infinicore
import torch
import ctypes
def test_embedding_graph_recording():
    """Test if embedding supports CUDA Graph recording.

    Returns True when a torch.cuda.CUDAGraph capture of the embedding op
    succeeds (and the graph replays). Falls back to the event-based async
    verification when the CUDA Graph API is unavailable or raises an
    unrelated error; returns False when capture itself fails.
    """
    print("=" * 60)
    print("Testing Embedding Graph Recording Support")
    print("=" * 60)

    # Check if CUDA is available
    if not torch.cuda.is_available():
        print("⚠ CUDA not available, skipping graph recording test")
        return False

    device = infinicore.device("cuda", 0)

    # Create embedding module
    vocab_size = 1000
    embedding_dim = 128
    embedding = infinicore.nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dim,
        dtype=infinicore.float32,
        device=device,
    )

    # Create device-side input_ids (key point: unsupported before the
    # modification, supported after)
    batch_size = 4
    seq_len = 32
    input_ids_device = infinicore.from_list(
        [[i % vocab_size for i in range(seq_len)] for _ in range(batch_size)],
        dtype=infinicore.int64,
        device=device,
    )

    print(f"\n1. Input tensor information:")
    print(f"   - Shape: {input_ids_device.shape}")
    print(f"   - Device: {input_ids_device.device.type}")
    print(f"   - Dtype: {input_ids_device.dtype}")

    # Attempt CUDA Graph recording
    print(f"\n2. Attempting CUDA Graph recording...")
    # Use PyTorch's CUDA Graph API for testing (simpler and more reliable)
    try:
        # Set device
        infinicore.set_device(device)
        # Note: PyTorch 2.0+ supports torch.cuda.graph
        try:
            print("   Using PyTorch CUDA Graph API for testing...")
            # Warmup (need to execute once before graph recording, including
            # memory allocation)
            warmup_input = input_ids_device
            embedding.forward(warmup_input)
            infinicore.sync_stream()  # Synchronize to ensure warmup completes

            # Pre-allocate the output tensor (CUDA Graph doesn't support
            # dynamic memory allocation).
            # Output shape: input_shape + [embedding_dim]
            output_shape = list(input_ids_device.shape) + [embedding_dim]
            output = infinicore.empty(
                output_shape, dtype=embedding.weight.dtype, device=device
            )

            # Warmup embedding (ensure memory allocation is complete)
            import infinicore.nn.functional as F

            F.embedding(warmup_input, embedding.weight, out=output)
            infinicore.sync_stream()

            # Start graph recording (using the pre-allocated output)
            graph = torch.cuda.CUDAGraph()
            with torch.cuda.graph(graph):
                # Use embedding's out parameter (in-place) with the
                # pre-allocated output
                F.embedding(input_ids_device, embedding.weight, out=output)

            print("   ✓ Graph recording successful!")
            print("   ✓ Embedding supports CUDA Graph recording")

            # Verify the graph can be replayed
            graph.replay()
            infinicore.sync_stream()
            print("   ✓ Graph can be successfully replayed")
            return True
        except AttributeError:
            # PyTorch version may not support torch.cuda.graph
            print(
                "   ⚠ PyTorch version doesn't support torch.cuda.graph, using simplified verification method"
            )
            return test_embedding_async_verification(embedding, input_ids_device)
        except RuntimeError as e:
            error_msg = str(e)
            if "capture" in error_msg.lower() or "graph" in error_msg.lower():
                print(f"   ✗ Graph recording failed: {e}")
                print(
                    "   ✗ Embedding doesn't support CUDA Graph recording (may contain synchronous operations)"
                )
                return False
            else:
                print(f"   ⚠ Graph recording test exception: {e}")
                return test_embedding_async_verification(embedding, input_ids_device)
    except Exception as e:
        print(f"   ⚠ Graph recording test exception: {e}")
        print("   Using simplified verification method...")
        import traceback

        traceback.print_exc()
        return test_embedding_async_verification(embedding, input_ids_device)
def test_embedding_async_verification(embedding, input_ids_device):
    """
    Simplified verification: check if there are synchronous operations.

    Key checkpoints:
    1. Whether the input can be on device (needed CPU before the
       modification, supports device after)
    2. Whether operations are fully asynchronous (no synchronization points)
    """
    print("\n3. Simplified verification: Checking asynchronous operation support")

    # Verification 1: Input can be on device
    if input_ids_device.device.type != "cuda":
        print("   ✗ Input not on device, cannot verify")
        return False
    print("   ✓ Input is on device")

    # Verification 2: Execute forward, check for synchronous operations.
    # Before the modification this would call indices->to(cpu_device),
    # triggering synchronization; after it, the device-side kernel runs
    # fully asynchronously.
    try:
        # Record timing events around the forward call
        start_event = infinicore.DeviceEvent(enable_timing=True)
        end_event = infinicore.DeviceEvent(enable_timing=True)

        start_event.record()
        output = embedding.forward(input_ids_device)
        end_event.record()

        # Don't synchronize immediately; check whether the op is async.
        # If asynchronous, query should return False (not completed yet);
        # if synchronous, it may have already completed.
        import time

        time.sleep(0.001)  # 1ms

        # Check event status
        is_complete = end_event.query()
        if not is_complete:
            print("   ✓ Operation is asynchronous (event not immediately completed)")
        else:
            print(
                "   ⚠ Operation may contain synchronization points (event immediately completed)"
            )

        # Synchronize and measure time
        end_event.synchronize()
        elapsed = start_event.elapsed_time(end_event)

        print(f"   ✓ Forward execution time: {elapsed:.3f} ms")
        print(f"   ✓ Output shape: {output.shape}")
        print(f"   ✓ Output device: {output.device.type}")

        # Verify output correctness
        embedding_dim = embedding.embedding_dim()
        expected_shape = (*input_ids_device.shape, embedding_dim)
        if output.device.type == "cuda" and output.shape == expected_shape:
            print("   ✓ Output on device, shape correct")
            return True
        else:
            print(f"   ✗ Output verification failed")
            print(
                f"     Expected shape: {expected_shape}, actual shape: {output.shape}"
            )
            print(f"     Expected device: cuda, actual device: {output.device.type}")
            return False
    except Exception as e:
        print(f"   ✗ Verification failed: {e}")
        import traceback

        traceback.print_exc()
        return False
def test_embedding_device_input_support():
    """Test if embedding supports device-side input.

    Returns True when forward() accepts a CUDA-resident index tensor
    directly, False on any failure (or when CUDA is unavailable).
    """
    print("\n" + "=" * 60)
    print("Testing Embedding Device-side Input Support")
    print("=" * 60)

    if not torch.cuda.is_available():
        print("⚠ CUDA not available, skipping test")
        return False

    device = infinicore.device("cuda", 0)
    vocab_size = 100
    embedding_dim = 64
    embedding = infinicore.nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dim,
        dtype=infinicore.float32,
        device=device,
    )

    # Test 1: Device-side input (supported after the modification)
    print("\nTest 1: Device-side input")
    try:
        input_ids_device = infinicore.from_list(
            [[1, 2, 3, 4, 5]], dtype=infinicore.int64, device=device
        )
        output = embedding.forward(input_ids_device)
        print(f"   ✓ Device-side input successful")
        print(f"   - Input device: {input_ids_device.device.type}")
        print(f"   - Output device: {output.device.type}")
        print(f"   - Output shape: {output.shape}")
        return True
    except Exception as e:
        print(f"   ✗ Device-side input failed: {e}")
        return False
def main():
    """Main test function: runs both checks and prints a pass/fail summary.

    Returns True only when every sub-test passed.
    """
    print("\n" + "=" * 60)
    print("Embedding Graph Recording Support Verification")
    print("=" * 60)

    results = []

    # Test 1: Graph recording support
    result1 = test_embedding_graph_recording()
    results.append(("CUDA Graph Recording", result1))

    # Test 2: Device-side input support
    result2 = test_embedding_device_input_support()
    results.append(("Device-side Input", result2))

    # Summary
    print("\n" + "=" * 60)
    print("Test Results Summary")
    print("=" * 60)
    all_passed = True
    for test_name, result in results:
        status = "✓ Passed" if result else "✗ Failed"
        print(f"{test_name}: {status}")
        if not result:
            all_passed = False

    print("\n" + "=" * 60)
    if all_passed:
        print("✓ All tests passed! Embedding supports graph recording")
    else:
        print("✗ Some tests failed, embedding may not fully support graph recording")
    print("=" * 60)
    return all_passed
......
# Embedding 图录制支持对比
## 改动前后对比
### ❌ 改动前:不支持图录制
**关键问题代码**(在 `nn::Embedding::forward` 中):
```cpp
// 改动前的实现
Tensor Embedding::forward(const Tensor &indices) const {
auto cpu_device = Device(Device::Type::CPU, 0);
auto indices_cpu = indices->to(cpu_device)->contiguous(); // ❌ 同步操作!
// ... 后续处理
}
```
**问题分析**
1. `indices->to(cpu_device)` 会触发 **同步的 D2H(Device-to-Host)内存拷贝**
2. CUDA Graph 录制要求所有操作都是**异步的**,不能有同步点
3. 同步操作会导致图录制失败或产生错误
**验证方法**
```python
# 改动前:这个操作会失败或产生同步
input_ids_device = infinicore.from_list(..., device="cuda:0") # 设备端输入
output = embedding.forward(input_ids_device) # ❌ 内部会同步拷贝到 CPU
```
---
### ✅ 改动后:支持图录制
**关键改进代码**
```cpp
// 改动后的实现
Tensor Embedding::forward(const Tensor &indices) const {
Tensor indices_contiguous = indices->is_contiguous() ? indices : indices->contiguous();
return op::embedding(indices_contiguous, weight_); // ✅ 直接使用设备端 kernel
}
```
**改进点**
1. **移除了同步操作**:不再调用 `indices->to(cpu_device)`
2. **使用设备端 CUDA kernel**:通过 InfiniOP 调用 `embeddingKernel`,完全在设备端执行
3. **完全异步**:所有操作都在 CUDA stream 上异步执行
**实现位置**
- CUDA Kernel: `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
- Kernel 启动:使用 `cudaStream_t`,完全异步
- 无同步点:没有 `cudaDeviceSynchronize()` 或 D2H 拷贝
**验证方法**
```python
# 改动后:这个操作完全异步,支持图录制
input_ids_device = infinicore.from_list(..., device="cuda:0") # 设备端输入
output = embedding.forward(input_ids_device) # ✅ 直接使用设备端 kernel,无同步
```
---
## 验证方法
### 方法 1: 代码检查
**检查点**
1. ✅ 是否有 `->to(cpu_device)` 调用?
2. ✅ 是否有 `synchronize()` 调用?
3. ✅ 是否有设备端 kernel 实现?
**改动前**
```cpp
// ❌ 有同步操作
auto indices_cpu = indices->to(cpu_device)->contiguous();
```
**改动后**
```cpp
// ✅ 无同步操作,直接使用设备端 kernel
return op::embedding(indices_contiguous, weight_);
```
### 方法 2: CUDA Graph API 测试
运行测试脚本:
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
**预期结果**
- ✅ 改动后:图录制成功
- ❌ 改动前:图录制失败(因为同步操作)
### 方法 3: 设备端输入测试
**关键测试**
```python
# 创建设备端输入
input_ids = infinicore.from_list([[1, 2, 3]], dtype=int64, device="cuda:0")
# 执行 forward
output = embedding.forward(input_ids) # 改动前会失败或同步,改动后成功
```
**改动前**
- 需要先将 `input_ids` 拷贝到 CPU
- 触发同步操作,无法图录制
**改动后**
- 直接使用设备端 `input_ids`
- 完全异步,支持图录制
---
## 技术细节对比
| 特性 | 改动前 | 改动后 |
|------|--------|--------|
| **输入设备** | 必须在 CPU | 支持设备端 |
| **同步操作** | ❌ 有(D2H拷贝) | ✅ 无 |
| **Kernel位置** | CPU 实现 | CUDA kernel |
| **图录制支持** | ❌ 不支持 | ✅ 支持 |
| **Batch维度** | ✅ 支持 | ✅ 支持 |
| **性能** | 较慢(同步开销) | 更快(异步) |
---
## 关键代码位置
### 改动前的问题代码
- `src/infinicore/nn/embedding.cc` (旧版本)
- 第58行:`indices->to(cpu_device)->contiguous()`
### 改动后的实现
- `src/infinicore/nn/embedding.cc` (新版本)
- 第48行:`indices->is_contiguous() ? indices : indices->contiguous()`
- 第52行:`return op::embedding(indices_contiguous, weight_)`
- `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
- CUDA kernel 实现,完全异步 ✅
- `src/infinicore/ops/embedding/embedding_infiniop.cc`
- InfiniOP 包装,调用设备端 kernel ✅
---
## 总结
**改动前的关键问题**
- `indices->to(cpu_device)` 触发同步 D2H 拷贝
- ❌ 无法进行 CUDA Graph 录制
- ❌ 性能较差(同步开销)
**改动后的改进**
- ✅ 移除所有同步操作
- ✅ 使用设备端 CUDA kernel
- ✅ 完全支持 CUDA Graph 录制
- ✅ 性能更好(完全异步)
# Embedding 图录制测试使用指南
## 🚀 快速开始
### 运行测试
```bash
cd /home/zhuyue/codes/InfiniCore
python test/infinicore/nn/test_embedding_graph_recording.py
```
---
## 📊 改动前后对比
### ❌ 改动前:不支持图录制
#### 1. 运行测试
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
#### 2. 预期输出
```
============================================================
Embedding 图录制支持验证
============================================================
============================================================
测试 Embedding 图录制支持
============================================================
1. 输入张量信息:
- Shape: [4, 32]
- Device: cuda
- Dtype: int64
2. 尝试 CUDA Graph 录制...
使用 PyTorch CUDA Graph API 测试...
✗ 图录制失败: [错误信息]
✗ Embedding 不支持 CUDA Graph 录制(可能包含同步操作)
3. 简化验证:检查异步操作支持
✓ 输入在设备上
⚠ 操作可能包含同步点(事件立即完成) ← 关键:说明有同步操作
✓ Forward 执行时间: X.XXX ms
✓ 输出形状: [4, 32, 128]
✓ 输出设备: cuda
✗ 输出验证失败
============================================================
测试 Embedding 设备端输入支持
============================================================
测试 1: 设备端输入
✗ 设备端输入失败: [错误信息]
============================================================
测试结果总结
============================================================
CUDA Graph 录制: ✗ 失败
设备端输入: ✗ 失败
============================================================
✗ 部分测试失败,Embedding 可能不完全支持图录制
============================================================
```
#### 3. 关键失败点
- **图录制失败**:因为代码中有 `indices->to(cpu_device)` 同步操作
- **设备端输入失败**:需要先将输入拷贝到 CPU
- **异步验证显示同步点**:事件立即完成,说明有同步操作
---
### ✅ 改动后:支持图录制
#### 1. 运行测试
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
#### 2. 预期输出
```
============================================================
Embedding 图录制支持验证
============================================================
============================================================
测试 Embedding 图录制支持
============================================================
1. 输入张量信息:
- Shape: [4, 32]
- Device: cuda
- Dtype: int64
2. 尝试 CUDA Graph 录制...
使用 PyTorch CUDA Graph API 测试...
✓ 成功完成图录制!
✓ Embedding 支持 CUDA Graph 录制
✓ 图可以成功重放
============================================================
测试 Embedding 设备端输入支持
============================================================
测试 1: 设备端输入
✓ 设备端输入成功
- 输入设备: cuda
- 输出设备: cuda
- 输出形状: [1, 5, 64]
============================================================
测试结果总结
============================================================
CUDA Graph 录制: ✓ 通过
设备端输入: ✓ 通过
============================================================
✓ 所有测试通过!Embedding 支持图录制
============================================================
```
#### 3. 关键成功点
- **图录制成功**:所有操作都是异步的,无同步点
- **设备端输入成功**:直接支持设备端输入,无需拷贝
- **图可以重放**:验证图录制的正确性
---
## 🔍 如何判断当前是改动前还是改动后?
### 方法 1: 代码检查(最快)
```bash
# 检查是否有同步操作
grep -n "to(cpu_device)" src/infinicore/nn/embedding.cc
# 结果解读:
# - 有输出 → ❌ 改动前(不支持图录制)
# - 无输出 → ✅ 改动后(支持图录制)
```
### 方法 2: 检查设备端实现
```bash
# 检查是否有设备端 CUDA kernel
ls src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu
# 结果解读:
# - 不存在 → ❌ 改动前(不支持图录制)
# - 存在 → ✅ 改动后(支持图录制)
```
### 方法 3: 运行测试(最准确)
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
# 查看 "CUDA Graph 录制" 测试结果:
# - ✓ 通过 → ✅ 改动后(支持图录制)
# - ✗ 失败 → ❌ 改动前(不支持图录制)
```
---
## 📝 测试内容详解
### 测试 1: CUDA Graph 录制
**目的**:验证 embedding 是否可以在 CUDA Graph 中录制
**工作原理**:
1. 使用 PyTorch 的 `torch.cuda.CUDAGraph()` API
2. 在图录制模式下执行 `embedding.forward()`
3. 如果包含同步操作,录制会失败
4. 如果完全异步,录制会成功
**改动前**:
- ❌ 录制失败:因为 `indices->to(cpu_device)` 触发同步
**改动后**:
- ✅ 录制成功:使用设备端 CUDA kernel,完全异步
### 测试 2: 设备端输入支持
**目的**:验证 embedding 是否支持设备端输入
**工作原理**:
1. 创建设备端的 `input_ids`
2. 直接调用 `embedding.forward(input_ids)`
3. 检查是否成功且输出在设备上
**改动前**:
- ❌ 可能需要先将输入拷贝到 CPU(同步操作)
**改动后**:
- ✅ 直接支持设备端输入(完全异步)
### 测试 3: 异步操作验证(备用)
**目的**:当 CUDA Graph API 不可用时,使用事件验证异步性
**工作原理**:
1. 使用 `DeviceEvent` 记录操作时间
2. 检查操作是否立即完成(同步)或异步执行
**改动前**:
- ⚠️ 事件立即完成,说明有同步操作
**改动后**:
- ✅ 事件未立即完成,说明是异步操作
---
## 🛠️ 故障排查
### 问题 1: PyTorch 版本不支持 CUDA Graph
**现象**:
```
⚠ PyTorch 版本不支持 torch.cuda.graph,使用简化验证方法
```
**解决**:
- 需要 PyTorch 2.0+ 版本
- 测试会自动降级到简化验证方法
- 简化验证也能检测是否支持图录制
### 问题 2: CUDA 不可用
**现象**:
```
⚠ CUDA 不可用,跳过图录制测试
```
**解决**:
- 确保 CUDA 设备可用
- 测试需要 CUDA 环境
### 问题 3: 测试失败但不确定原因
**检查清单**:
1. ✅ 确认代码已编译(特别是 CUDA 支持)
2. ✅ 确认 CUDA 设备可用
3. ✅ 检查 `src/infinicore/nn/embedding.cc` 是否还有 `to(cpu_device)`
4. ✅ 检查是否有 `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
---
## 💡 快速验证脚本
创建一个简单的验证脚本:
```bash
#!/bin/bash
# quick_check.sh
cd /home/zhuyue/codes/InfiniCore
echo "=== 1. 代码检查 ==="
if grep -q "to(cpu_device)" src/infinicore/nn/embedding.cc; then
echo "❌ 改动前:发现同步操作 to(cpu_device)"
else
echo "✅ 改动后:无同步操作"
fi
echo ""
echo "=== 2. 设备端实现检查 ==="
if [ -f "src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu" ]; then
echo "✅ 改动后:有设备端 CUDA kernel"
else
echo "❌ 改动前:无设备端 CUDA kernel"
fi
echo ""
echo "=== 3. 运行测试 ==="
python test/infinicore/nn/test_embedding_graph_recording.py
```
使用方法:
```bash
chmod +x quick_check.sh
./quick_check.sh
```
---
## 📋 总结
### 改动前特征
| 检查项 | 结果 |
|--------|------|
| 代码中有 `to(cpu_device)` | ✅ 有 |
| 有设备端 CUDA kernel | ❌ 无 |
| 图录制测试 | ❌ 失败 |
| 设备端输入 | ❌ 失败 |
### 改动后特征
| 检查项 | 结果 |
|--------|------|
| 代码中有 `to(cpu_device)` | ❌ 无 |
| 有设备端 CUDA kernel | ✅ 有 |
| 图录制测试 | ✅ 成功 |
| 设备端输入 | ✅ 成功 |
### 最简单的判断方法
**运行测试脚本**,查看 "CUDA Graph 录制" 测试结果:
- ✅ **通过** → 支持图录制(改动后)
- ❌ **失败** → 不支持图录制(改动前)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment