Commit eb34d4d6 authored by wooway777's avatar wooway777
Browse files

issue/900 - adapt to graph and adjust test script

parent 835209e7
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
namespace infinicore::op {
// Embedding lookup op: fills `out` with rows of `weight` selected by the
// indices in `input` (see the free functions embedding/embedding_ below).
class Embedding {
public:
// Kernel entry-point signature: (out, input, weight).
using schema = void (*)(Tensor, Tensor, Tensor);
// Runs the kernel registered for `out`'s device type.
static void execute(Tensor out, Tensor input, Tensor weight);
// Process-wide registry mapping device types to kernel implementations.
static common::OpDispatcher<schema> &dispatcher();
};
INFINICORE_GRAPH_OP_CLASS(Embedding, Tensor, const Tensor &, const Tensor &);
Tensor embedding(Tensor input, Tensor weight);
void embedding_(Tensor out, Tensor input, Tensor weight);
Tensor embedding(const Tensor &input, const Tensor &weight);
void embedding_(Tensor out, const Tensor &input, const Tensor &weight);
} // namespace infinicore::op
......@@ -5,27 +5,19 @@
#include <stdexcept>
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Embedding);
// Accessor for the per-process kernel registry. Function-local static
// (Meyers singleton): constructed on first use, thread-safe since C++11.
common::OpDispatcher<Embedding::schema> &Embedding::dispatcher() {
    static common::OpDispatcher<Embedding::schema> registry;
    return registry;
}
void Embedding::execute(Tensor out, Tensor input, Tensor weight) {
// Check that all tensors are on the same device
// This is critical: if input is on CPU while out/weight are on GPU,
// passing CPU pointer to CUDA kernel will cause memory access errors
// Graph-op node constructor: validates device placement, then dispatches the
// embedding kernel via the graph-op macro for out's device type.
Embedding::Embedding(Tensor out, const Tensor &input, const Tensor &weight) {
// All three tensors must live on the same device; mixing a CPU pointer into a
// GPU kernel launch would cause invalid memory accesses.
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
// Route to the implementation registered for out's device type.
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, input, weight);
}
// Set device context
infinicore::context::setDevice(out->device());
// Use dispatcher to lookup kernel (infiniop implementation)
dispatcher().lookup(out->device().getType())(out, input, weight);
// Public entry point. Per the macro name, this either records the op into a
// graph under construction or runs it eagerly (exact semantics defined by
// INFINICORE_GRAPH_OP_RECORD_OR_RUN elsewhere in the project).
void Embedding::execute(Tensor out, const Tensor &input, const Tensor &weight) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Embedding, out, input, weight);
}
Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the indices to extract
Tensor weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
Tensor embedding(const Tensor &input, // LongTensor of arbitrary shape containing the indices to extract
const Tensor &weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
) {
auto input_shape = input->shape();
auto weight_shape = weight->shape();
......@@ -40,7 +32,7 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i
return inputs_embeds;
}
void embedding_(Tensor out, Tensor input, Tensor weight) {
// In-place variant: writes the embedding result into the caller-provided
// `out` tensor instead of allocating a new one.
void embedding_(Tensor out, const Tensor &input, const Tensor &weight) {
Embedding::execute(out, input, weight);
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/ops/embedding.hpp"
#include <infiniop.h>
namespace infinicore::op::embedding_impl::infiniop {
// Per-thread bounded cache (capacity 100) of infiniop embedding descriptors.
// The eviction callback destroys the descriptor so no infiniop handles leak
// when an entry is dropped.
thread_local common::OpCache<size_t, infiniopEmbeddingDescriptor_t> caches(
100, // capacity
[](infiniopEmbeddingDescriptor_t &desc) {
// Eviction hook: release the descriptor exactly once, then null it out.
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyEmbeddingDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Embedding, 100);
void calculate(Tensor out, Tensor input, Tensor weight) {
// Everything captured at plan() time that run() needs later: the cached
// infiniop descriptor plus graph handles to the three participating tensors.
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor out, input, weight;
};
// Plan phase of the graph op: obtains (from the per-thread cache, creating on
// miss) an infiniop descriptor for this exact (out, input, weight) descriptor
// combination and packages it, with graph tensor handles, into a heap-
// allocated PlannedMeta. Ownership of the returned pointer passes to the
// graph runtime; run() consumes it and cleanup() frees it.
void *plan(Tensor out, const Tensor &input, const Tensor &weight) {
    // Cache key derived from the three tensor descriptors.
    size_t seed = hash_combine(out, input, weight);
    auto device = context::getDevice();
    auto &cache = caches.getCache(device);
    // Get-or-create the descriptor for `seed`; binds the result to
    // `descriptor` (macro defined in infiniop_impl.hpp).
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
        Descriptor, descriptor, Embedding,
        seed, out->desc(), input->desc(), weight->desc());
    // NOTE(review): removed two dead locals (`desc_opt`, `desc`) left over
    // from the pre-macro implementation; they were never read in this scope.
    auto planned = new PlannedMeta{
        descriptor,
        graph::GraphTensor(out),
        graph::GraphTensor(input),
        graph::GraphTensor(weight)};
    return planned;
}
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateEmbeddingDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(), input->desc(), weight->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
// Run phase of the graph op: executes the embedding kernel using the state
// prepared by plan(). Takes the opaque PlannedMeta pointer produced there.
// NOTE(review): the original span contained a malformed infiniopEmbedding
// call — old-version arguments (`desc`, bare `out->data()`, ...) interleaved
// with the new `planned->...` arguments by a bad merge; only the new
// arguments are kept here.
void run(void *planned_meta) {
    auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
    INFINICORE_CHECK_ERROR(infiniopEmbedding(
        planned->descriptor->desc,
        planned->out->data(), planned->input->data(), planned->weight->data(),
        context::getStream()));
}
// Releases the PlannedMeta allocated by plan() and nulls the caller's slot so
// a repeated cleanup on the same slot becomes a harmless no-op.
void cleanup(void **planned_meta_ptr) {
    auto **meta = reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    delete *meta;
    *meta = nullptr;
}
// Self-registration at static-initialization time: installs `calculate` as
// the embedding kernel for all device types. The `false` flag is forwarded to
// registerAll; presumably "do not overwrite existing entries" — TODO confirm
// against OpDispatcher.
static bool registered = []() {
Embedding::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Embedding, &plan, &run, cleanup);
} // namespace infinicore::op::embedding_impl::infiniop
"""
测试 embedding 是否支持 CUDA Graph 录制
Test if embedding supports CUDA Graph recording
使用方法:
Usage:
python test/infinicore/nn/test_embedding_graph_recording.py
关键验证点:
1. 改动前:indices->to(cpu_device) 会触发同步的 D2H 拷贝,导致图录制失败
2. 改动后:使用设备端 CUDA kernel,完全异步,支持图录制
Key verification points:
1. Before modification: indices->to(cpu_device) triggers synchronous D2H copy, causing graph recording to fail
2. After modification: Uses device-side CUDA kernel, fully asynchronous, supports graph recording
预期结果:
- 改动前:图录制失败,设备端输入可能失败
- 改动后:图录制成功,设备端输入成功
Expected results:
- Before modification: Graph recording fails, device-side input may fail
- After modification: Graph recording succeeds, device-side input succeeds
"""
import infinicore
import torch
import ctypes
def test_embedding_graph_recording():
    """Test if embedding supports CUDA Graph recording.

    Returns True when a torch.cuda.CUDAGraph capture of the embedding op
    succeeds (and the graph replays). Falls back to the event-based async
    verification when the CUDA Graph API is unavailable or raises an
    unrelated error; returns False when capture itself fails.
    """
    print("=" * 60)
    print("Testing Embedding Graph Recording Support")
    print("=" * 60)

    # Check if CUDA is available
    if not torch.cuda.is_available():
        print("⚠ CUDA not available, skipping graph recording test")
        return False

    device = infinicore.device("cuda", 0)

    # Create embedding module
    vocab_size = 1000
    embedding_dim = 128
    embedding = infinicore.nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dim,
        dtype=infinicore.float32,
        device=device,
    )

    # Create device-side input_ids (key point: unsupported before the
    # modification, supported after)
    batch_size = 4
    seq_len = 32
    input_ids_device = infinicore.from_list(
        [[i % vocab_size for i in range(seq_len)] for _ in range(batch_size)],
        dtype=infinicore.int64,
        device=device,
    )

    print(f"\n1. Input tensor information:")
    print(f"   - Shape: {input_ids_device.shape}")
    print(f"   - Device: {input_ids_device.device.type}")
    print(f"   - Dtype: {input_ids_device.dtype}")

    # Attempt CUDA Graph recording
    print(f"\n2. Attempting CUDA Graph recording...")
    # Use PyTorch's CUDA Graph API for testing (simpler and more reliable)
    try:
        # Set device
        infinicore.set_device(device)
        # Note: PyTorch 2.0+ supports torch.cuda.graph
        try:
            print("   Using PyTorch CUDA Graph API for testing...")
            # Warmup (need to execute once before graph recording, including
            # memory allocation)
            warmup_input = input_ids_device
            embedding.forward(warmup_input)
            infinicore.sync_stream()  # Synchronize to ensure warmup completes

            # Pre-allocate the output tensor (CUDA Graph doesn't support
            # dynamic memory allocation).
            # Output shape: input_shape + [embedding_dim]
            output_shape = list(input_ids_device.shape) + [embedding_dim]
            output = infinicore.empty(
                output_shape, dtype=embedding.weight.dtype, device=device
            )

            # Warmup embedding (ensure memory allocation is complete)
            import infinicore.nn.functional as F

            F.embedding(warmup_input, embedding.weight, out=output)
            infinicore.sync_stream()

            # Start graph recording (using the pre-allocated output)
            graph = torch.cuda.CUDAGraph()
            with torch.cuda.graph(graph):
                # Use embedding's out parameter (in-place) with the
                # pre-allocated output
                F.embedding(input_ids_device, embedding.weight, out=output)

            print("   ✓ Graph recording successful!")
            print("   ✓ Embedding supports CUDA Graph recording")

            # Verify the graph can be replayed
            graph.replay()
            infinicore.sync_stream()
            print("   ✓ Graph can be successfully replayed")
            return True
        except AttributeError:
            # PyTorch version may not support torch.cuda.graph
            print(
                "   ⚠ PyTorch version doesn't support torch.cuda.graph, using simplified verification method"
            )
            return test_embedding_async_verification(embedding, input_ids_device)
        except RuntimeError as e:
            error_msg = str(e)
            if "capture" in error_msg.lower() or "graph" in error_msg.lower():
                print(f"   ✗ Graph recording failed: {e}")
                print(
                    "   ✗ Embedding doesn't support CUDA Graph recording (may contain synchronous operations)"
                )
                return False
            else:
                print(f"   ⚠ Graph recording test exception: {e}")
                return test_embedding_async_verification(embedding, input_ids_device)
    except Exception as e:
        print(f"   ⚠ Graph recording test exception: {e}")
        print("   Using simplified verification method...")
        import traceback

        traceback.print_exc()
        return test_embedding_async_verification(embedding, input_ids_device)
def test_embedding_async_verification(embedding, input_ids_device):
    """
    Simplified verification: check if there are synchronous operations.

    Key checkpoints:
    1. Whether the input can be on device (needed CPU before the
       modification, supports device after)
    2. Whether operations are fully asynchronous (no synchronization points)
    """
    print("\n3. Simplified verification: Checking asynchronous operation support")

    # Verification 1: Input can be on device
    if input_ids_device.device.type != "cuda":
        print("   ✗ Input not on device, cannot verify")
        return False
    print("   ✓ Input is on device")

    # Verification 2: Execute forward, check for synchronous operations.
    # Before the modification this would call indices->to(cpu_device),
    # triggering synchronization; after it, the device-side kernel runs
    # fully asynchronously.
    try:
        # Record timing events around the forward call
        start_event = infinicore.DeviceEvent(enable_timing=True)
        end_event = infinicore.DeviceEvent(enable_timing=True)

        start_event.record()
        output = embedding.forward(input_ids_device)
        end_event.record()

        # Don't synchronize immediately; check whether the op is async.
        # If asynchronous, query should return False (not completed yet);
        # if synchronous, it may have already completed.
        import time

        time.sleep(0.001)  # 1ms

        # Check event status
        is_complete = end_event.query()
        if not is_complete:
            print("   ✓ Operation is asynchronous (event not immediately completed)")
        else:
            print(
                "   ⚠ Operation may contain synchronization points (event immediately completed)"
            )

        # Synchronize and measure time
        end_event.synchronize()
        elapsed = start_event.elapsed_time(end_event)

        print(f"   ✓ Forward execution time: {elapsed:.3f} ms")
        print(f"   ✓ Output shape: {output.shape}")
        print(f"   ✓ Output device: {output.device.type}")

        # Verify output correctness
        embedding_dim = embedding.embedding_dim()
        expected_shape = (*input_ids_device.shape, embedding_dim)
        if output.device.type == "cuda" and output.shape == expected_shape:
            print("   ✓ Output on device, shape correct")
            return True
        else:
            print(f"   ✗ Output verification failed")
            print(
                f"     Expected shape: {expected_shape}, actual shape: {output.shape}"
            )
            print(f"     Expected device: cuda, actual device: {output.device.type}")
            return False
    except Exception as e:
        print(f"   ✗ Verification failed: {e}")
        import traceback

        traceback.print_exc()
        return False
def test_embedding_device_input_support():
    """Test if embedding supports device-side input.

    Returns True when forward() accepts a CUDA-resident index tensor
    directly, False on any failure (or when CUDA is unavailable).
    """
    print("\n" + "=" * 60)
    print("Testing Embedding Device-side Input Support")
    print("=" * 60)

    if not torch.cuda.is_available():
        print("⚠ CUDA not available, skipping test")
        return False

    device = infinicore.device("cuda", 0)
    vocab_size = 100
    embedding_dim = 64
    embedding = infinicore.nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dim,
        dtype=infinicore.float32,
        device=device,
    )

    # Test 1: Device-side input (supported after the modification)
    print("\nTest 1: Device-side input")
    try:
        input_ids_device = infinicore.from_list(
            [[1, 2, 3, 4, 5]], dtype=infinicore.int64, device=device
        )
        output = embedding.forward(input_ids_device)
        print(f"   ✓ Device-side input successful")
        print(f"   - Input device: {input_ids_device.device.type}")
        print(f"   - Output device: {output.device.type}")
        print(f"   - Output shape: {output.shape}")
        return True
    except Exception as e:
        print(f"   ✗ Device-side input failed: {e}")
        return False
def main():
    """Main test function: runs both checks and prints a pass/fail summary.

    Returns True only when every sub-test passed.
    """
    print("\n" + "=" * 60)
    print("Embedding Graph Recording Support Verification")
    print("=" * 60)

    results = []

    # Test 1: Graph recording support
    result1 = test_embedding_graph_recording()
    results.append(("CUDA Graph Recording", result1))

    # Test 2: Device-side input support
    result2 = test_embedding_device_input_support()
    results.append(("Device-side Input", result2))

    # Summary
    print("\n" + "=" * 60)
    print("Test Results Summary")
    print("=" * 60)
    all_passed = True
    for test_name, result in results:
        status = "✓ Passed" if result else "✗ Failed"
        print(f"{test_name}: {status}")
        if not result:
            all_passed = False

    print("\n" + "=" * 60)
    if all_passed:
        print("✓ All tests passed! Embedding supports graph recording")
    else:
        print("✗ Some tests failed, embedding may not fully support graph recording")
    print("=" * 60)
    return all_passed
......
# Embedding 图录制支持对比
## 改动前后对比
### ❌ 改动前:不支持图录制
**关键问题代码**(在 `nn::Embedding::forward` 中):
```cpp
// 改动前的实现
Tensor Embedding::forward(const Tensor &indices) const {
auto cpu_device = Device(Device::Type::CPU, 0);
auto indices_cpu = indices->to(cpu_device)->contiguous(); // ❌ 同步操作!
// ... 后续处理
}
```
**问题分析**
1. `indices->to(cpu_device)` 会触发 **同步的 D2H(Device-to-Host)内存拷贝**
2. CUDA Graph 录制要求所有操作都是**异步的**,不能有同步点
3. 同步操作会导致图录制失败或产生错误
**验证方法**
```python
# 改动前:这个操作会失败或产生同步
input_ids_device = infinicore.from_list(..., device="cuda:0") # 设备端输入
output = embedding.forward(input_ids_device) # ❌ 内部会同步拷贝到 CPU
```
---
### ✅ 改动后:支持图录制
**关键改进代码**
```cpp
// 改动后的实现
Tensor Embedding::forward(const Tensor &indices) const {
Tensor indices_contiguous = indices->is_contiguous() ? indices : indices->contiguous();
return op::embedding(indices_contiguous, weight_); // ✅ 直接使用设备端 kernel
}
```
**改进点**
1. **移除了同步操作**:不再调用 `indices->to(cpu_device)`
2. **使用设备端 CUDA kernel**:通过 InfiniOP 调用 `embeddingKernel`,完全在设备端执行
3. **完全异步**:所有操作都在 CUDA stream 上异步执行
**实现位置**
- CUDA Kernel: `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
- Kernel 启动:使用 `cudaStream_t`,完全异步
- 无同步点:没有 `cudaDeviceSynchronize()` 或 D2H 拷贝
**验证方法**
```python
# 改动后:这个操作完全异步,支持图录制
input_ids_device = infinicore.from_list(..., device="cuda:0") # 设备端输入
output = embedding.forward(input_ids_device) # ✅ 直接使用设备端 kernel,无同步
```
---
## 验证方法
### 方法 1: 代码检查
**检查点**
1. ✅ 是否有 `->to(cpu_device)` 调用?
2. ✅ 是否有 `synchronize()` 调用?
3. ✅ 是否有设备端 kernel 实现?
**改动前**
```cpp
// ❌ 有同步操作
auto indices_cpu = indices->to(cpu_device)->contiguous();
```
**改动后**
```cpp
// ✅ 无同步操作,直接使用设备端 kernel
return op::embedding(indices_contiguous, weight_);
```
### 方法 2: CUDA Graph API 测试
运行测试脚本:
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
**预期结果**
- ✅ 改动后:图录制成功
- ❌ 改动前:图录制失败(因为同步操作)
### 方法 3: 设备端输入测试
**关键测试**
```python
# 创建设备端输入
input_ids = infinicore.from_list([[1, 2, 3]], dtype=int64, device="cuda:0")
# 执行 forward
output = embedding.forward(input_ids) # 改动前会失败或同步,改动后成功
```
**改动前**
- 需要先将 `input_ids` 拷贝到 CPU
- 触发同步操作,无法图录制
**改动后**
- 直接使用设备端 `input_ids`
- 完全异步,支持图录制
---
## 技术细节对比
| 特性 | 改动前 | 改动后 |
|------|--------|--------|
| **输入设备** | 必须在 CPU | 支持设备端 |
| **同步操作** | ❌ 有(D2H拷贝) | ✅ 无 |
| **Kernel位置** | CPU 实现 | CUDA kernel |
| **图录制支持** | ❌ 不支持 | ✅ 支持 |
| **Batch维度** | ✅ 支持 | ✅ 支持 |
| **性能** | 较慢(同步开销) | 更快(异步) |
---
## 关键代码位置
### 改动前的问题代码
- `src/infinicore/nn/embedding.cc` (旧版本)
- 第58行:`indices->to(cpu_device)->contiguous()`
### 改动后的实现
- `src/infinicore/nn/embedding.cc` (新版本)
- 第48行:`indices->is_contiguous() ? indices : indices->contiguous()`
- 第52行:`return op::embedding(indices_contiguous, weight_)`
- `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
- CUDA kernel 实现,完全异步 ✅
- `src/infinicore/ops/embedding/embedding_infiniop.cc`
- InfiniOP 包装,调用设备端 kernel ✅
---
## 总结
**改动前的关键问题**
- `indices->to(cpu_device)` 触发同步 D2H 拷贝
- ❌ 无法进行 CUDA Graph 录制
- ❌ 性能较差(同步开销)
**改动后的改进**
- ✅ 移除所有同步操作
- ✅ 使用设备端 CUDA kernel
- ✅ 完全支持 CUDA Graph 录制
- ✅ 性能更好(完全异步)
# Embedding 图录制测试使用指南
## 🚀 快速开始
### 运行测试
```bash
cd /home/zhuyue/codes/InfiniCore
python test/infinicore/nn/test_embedding_graph_recording.py
```
---
## 📊 改动前后对比
### ❌ 改动前:不支持图录制
#### 1. 运行测试
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
#### 2. 预期输出
```
============================================================
Embedding 图录制支持验证
============================================================
============================================================
测试 Embedding 图录制支持
============================================================
1. 输入张量信息:
- Shape: [4, 32]
- Device: cuda
- Dtype: int64
2. 尝试 CUDA Graph 录制...
使用 PyTorch CUDA Graph API 测试...
✗ 图录制失败: [错误信息]
✗ Embedding 不支持 CUDA Graph 录制(可能包含同步操作)
3. 简化验证:检查异步操作支持
✓ 输入在设备上
⚠ 操作可能包含同步点(事件立即完成) ← 关键:说明有同步操作
✓ Forward 执行时间: X.XXX ms
✓ 输出形状: [4, 32, 128]
✓ 输出设备: cuda
✗ 输出验证失败
============================================================
测试 Embedding 设备端输入支持
============================================================
测试 1: 设备端输入
✗ 设备端输入失败: [错误信息]
============================================================
测试结果总结
============================================================
CUDA Graph 录制: ✗ 失败
设备端输入: ✗ 失败
============================================================
✗ 部分测试失败,Embedding 可能不完全支持图录制
============================================================
```
#### 3. 关键失败点
- **图录制失败**:因为代码中有 `indices->to(cpu_device)` 同步操作
- **设备端输入失败**:需要先将输入拷贝到 CPU
- **异步验证显示同步点**:事件立即完成,说明有同步操作
---
### ✅ 改动后:支持图录制
#### 1. 运行测试
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
#### 2. 预期输出
```
============================================================
Embedding 图录制支持验证
============================================================
============================================================
测试 Embedding 图录制支持
============================================================
1. 输入张量信息:
- Shape: [4, 32]
- Device: cuda
- Dtype: int64
2. 尝试 CUDA Graph 录制...
使用 PyTorch CUDA Graph API 测试...
✓ 成功完成图录制!
✓ Embedding 支持 CUDA Graph 录制
✓ 图可以成功重放
============================================================
测试 Embedding 设备端输入支持
============================================================
测试 1: 设备端输入
✓ 设备端输入成功
- 输入设备: cuda
- 输出设备: cuda
- 输出形状: [1, 5, 64]
============================================================
测试结果总结
============================================================
CUDA Graph 录制: ✓ 通过
设备端输入: ✓ 通过
============================================================
✓ 所有测试通过!Embedding 支持图录制
============================================================
```
#### 3. 关键成功点
- **图录制成功**:所有操作都是异步的,无同步点
- **设备端输入成功**:直接支持设备端输入,无需拷贝
- **图可以重放**:验证图录制的正确性
---
## 🔍 如何判断当前是改动前还是改动后?
### 方法 1: 代码检查(最快)
```bash
# 检查是否有同步操作
grep -n "to(cpu_device)" src/infinicore/nn/embedding.cc
# 结果解读:
# - 有输出 → ❌ 改动前(不支持图录制)
# - 无输出 → ✅ 改动后(支持图录制)
```
### 方法 2: 检查设备端实现
```bash
# 检查是否有设备端 CUDA kernel
ls src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu
# 结果解读:
# - 不存在 → ❌ 改动前(不支持图录制)
# - 存在 → ✅ 改动后(支持图录制)
```
### 方法 3: 运行测试(最准确)
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
# 查看 "CUDA Graph 录制" 测试结果:
# - ✓ 通过 → ✅ 改动后(支持图录制)
# - ✗ 失败 → ❌ 改动前(不支持图录制)
```
---
## 📝 测试内容详解
### 测试 1: CUDA Graph 录制
**目的**:验证 embedding 是否可以在 CUDA Graph 中录制
**工作原理**:
1. 使用 PyTorch 的 `torch.cuda.CUDAGraph()` API
2. 在图录制模式下执行 `embedding.forward()`
3. 如果包含同步操作,录制会失败
4. 如果完全异步,录制会成功
**改动前**:
- ❌ 录制失败:因为 `indices->to(cpu_device)` 触发同步
**改动后**:
- ✅ 录制成功:使用设备端 CUDA kernel,完全异步
### 测试 2: 设备端输入支持
**目的**:验证 embedding 是否支持设备端输入
**工作原理**:
1. 创建设备端的 `input_ids`
2. 直接调用 `embedding.forward(input_ids)`
3. 检查是否成功且输出在设备上
**改动前**:
- ❌ 可能需要先将输入拷贝到 CPU(同步操作)
**改动后**:
- ✅ 直接支持设备端输入(完全异步)
### 测试 3: 异步操作验证(备用)
**目的**:当 CUDA Graph API 不可用时,使用事件验证异步性
**工作原理**:
1. 使用 `DeviceEvent` 记录操作时间
2. 检查操作是否立即完成(同步)或异步执行
**改动前**:
- ⚠️ 事件立即完成,说明有同步操作
**改动后**:
- ✅ 事件未立即完成,说明是异步操作
---
## 🛠️ 故障排查
### 问题 1: PyTorch 版本不支持 CUDA Graph
**现象**:
```
⚠ PyTorch 版本不支持 torch.cuda.graph,使用简化验证方法
```
**解决**:
- 需要 PyTorch 2.0+ 版本
- 测试会自动降级到简化验证方法
- 简化验证也能检测是否支持图录制
### 问题 2: CUDA 不可用
**现象**:
```
⚠ CUDA 不可用,跳过图录制测试
```
**解决**:
- 确保 CUDA 设备可用
- 测试需要 CUDA 环境
### 问题 3: 测试失败但不确定原因
**检查清单**:
1. ✅ 确认代码已编译(特别是 CUDA 支持)
2. ✅ 确认 CUDA 设备可用
3. ✅ 检查 `src/infinicore/nn/embedding.cc` 是否还有 `to(cpu_device)`
4. ✅ 检查是否有 `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
---
## 💡 快速验证脚本
创建一个简单的验证脚本:
```bash
#!/bin/bash
# quick_check.sh
cd /home/zhuyue/codes/InfiniCore
echo "=== 1. 代码检查 ==="
if grep -q "to(cpu_device)" src/infinicore/nn/embedding.cc; then
echo "❌ 改动前:发现同步操作 to(cpu_device)"
else
echo "✅ 改动后:无同步操作"
fi
echo ""
echo "=== 2. 设备端实现检查 ==="
if [ -f "src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu" ]; then
echo "✅ 改动后:有设备端 CUDA kernel"
else
echo "❌ 改动前:无设备端 CUDA kernel"
fi
echo ""
echo "=== 3. 运行测试 ==="
python test/infinicore/nn/test_embedding_graph_recording.py
```
使用方法:
```bash
chmod +x quick_check.sh
./quick_check.sh
```
---
## 📋 总结
### 改动前特征
| 检查项 | 结果 |
|--------|------|
| 代码中有 `to(cpu_device)` | ✅ 有 |
| 有设备端 CUDA kernel | ❌ 无 |
| 图录制测试 | ❌ 失败 |
| 设备端输入 | ❌ 失败 |
### 改动后特征
| 检查项 | 结果 |
|--------|------|
| 代码中有 `to(cpu_device)` | ❌ 无 |
| 有设备端 CUDA kernel | ✅ 有 |
| 图录制测试 | ✅ 成功 |
| 设备端输入 | ✅ 成功 |
### 最简单的判断方法
**运行测试脚本**,查看 "CUDA Graph 录制" 测试结果:
- ✅ **通过** → 支持图录制(改动后)
- ❌ **失败** → 不支持图录制(改动前)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment