Commit eb34d4d6 authored by wooway777's avatar wooway777
Browse files

issue/900 - adapt to graph and adjust test script

parent 835209e7
#pragma once #pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp" #include "common/op.hpp"
namespace infinicore::op { namespace infinicore::op {
class Embedding { INFINICORE_GRAPH_OP_CLASS(Embedding, Tensor, const Tensor &, const Tensor &);
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor out, Tensor input, Tensor weight);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor embedding(Tensor input, Tensor weight); Tensor embedding(const Tensor &input, const Tensor &weight);
void embedding_(Tensor out, Tensor input, Tensor weight); void embedding_(Tensor out, const Tensor &input, const Tensor &weight);
} // namespace infinicore::op } // namespace infinicore::op
...@@ -5,27 +5,19 @@ ...@@ -5,27 +5,19 @@
#include <stdexcept> #include <stdexcept>
namespace infinicore::op { namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Embedding);
common::OpDispatcher<Embedding::schema> &Embedding::dispatcher() { Embedding::Embedding(Tensor out, const Tensor &input, const Tensor &weight) {
static common::OpDispatcher<Embedding::schema> dispatcher_;
return dispatcher_;
}
void Embedding::execute(Tensor out, Tensor input, Tensor weight) {
// Check that all tensors are on the same device
// This is critical: if input is on CPU while out/weight are on GPU,
// passing CPU pointer to CUDA kernel will cause memory access errors
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight); INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, input, weight);
}
// Set device context void Embedding::execute(Tensor out, const Tensor &input, const Tensor &weight) {
infinicore::context::setDevice(out->device()); INFINICORE_GRAPH_OP_RECORD_OR_RUN(Embedding, out, input, weight);
// Use dispatcher to lookup kernel (infiniop implementation)
dispatcher().lookup(out->device().getType())(out, input, weight);
} }
Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the indices to extract Tensor embedding(const Tensor &input, // LongTensor of arbitrary shape containing the indices to extract
Tensor weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1 const Tensor &weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
) { ) {
auto input_shape = input->shape(); auto input_shape = input->shape();
auto weight_shape = weight->shape(); auto weight_shape = weight->shape();
...@@ -40,7 +32,7 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i ...@@ -40,7 +32,7 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i
return inputs_embeds; return inputs_embeds;
} }
void embedding_(Tensor out, Tensor input, Tensor weight) { void embedding_(Tensor out, const Tensor &input, const Tensor &weight) {
Embedding::execute(out, input, weight); Embedding::execute(out, input, weight);
} }
......
#include "../../utils.hpp" #include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/embedding.hpp" #include "infinicore/ops/embedding.hpp"
#include <infiniop.h>
namespace infinicore::op::embedding_impl::infiniop { namespace infinicore::op::embedding_impl::infiniop {
thread_local common::OpCache<size_t, infiniopEmbeddingDescriptor_t> caches( INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Embedding, 100);
100, // capacity
[](infiniopEmbeddingDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyEmbeddingDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor input, Tensor weight) { struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor out, input, weight;
};
void *plan(Tensor out, const Tensor &input, const Tensor &weight) {
size_t seed = hash_combine(out, input, weight); size_t seed = hash_combine(out, input, weight);
auto device = context::getDevice(); INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
auto &cache = caches.getCache(device); Descriptor, descriptor, Embedding,
seed, out->desc(), input->desc(), weight->desc());
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(out),
graph::GraphTensor(input),
graph::GraphTensor(weight)};
auto desc_opt = cache.get(seed); return planned;
infiniopEmbeddingDescriptor_t desc = nullptr; }
if (!desc_opt) { void run(void *planned_meta) {
INFINICORE_CHECK_ERROR(infiniopCreateEmbeddingDescriptor( auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
context::getInfiniopHandle(device), &desc,
out->desc(), input->desc(), weight->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
INFINICORE_CHECK_ERROR(infiniopEmbedding( INFINICORE_CHECK_ERROR(infiniopEmbedding(
desc, planned->descriptor->desc,
out->data(), planned->out->data(), planned->input->data(), planned->weight->data(), context::getStream()));
input->data(), }
weight->data(),
context::getStream())); void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
} }
static bool registered = []() { INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Embedding, &plan, &run, cleanup);
Embedding::dispatcher().registerAll(&calculate, false);
return true;
}();
} // namespace infinicore::op::embedding_impl::infiniop } // namespace infinicore::op::embedding_impl::infiniop
""" """
测试 embedding 是否支持 CUDA Graph 录制 Test if embedding supports CUDA Graph recording
使用方法: Usage:
python test/infinicore/nn/test_embedding_graph_recording.py python test/infinicore/nn/test_embedding_graph_recording.py
关键验证点: Key verification points:
1. 改动前:indices->to(cpu_device) 会触发同步的 D2H 拷贝,导致图录制失败 1. Before modification: indices->to(cpu_device) triggers synchronous D2H copy, causing graph recording to fail
2. 改动后:使用设备端 CUDA kernel,完全异步,支持图录制 2. After modification: Uses device-side CUDA kernel, fully asynchronous, supports graph recording
预期结果: Expected results:
- 改动前:图录制失败,设备端输入可能失败 - Before modification: Graph recording fails, device-side input may fail
- 改动后:图录制成功,设备端输入成功 - After modification: Graph recording succeeds, device-side input succeeds
""" """
import infinicore import infinicore
import torch import torch
import ctypes
def test_embedding_graph_recording(): def test_embedding_graph_recording():
"""测试 embedding 是否支持 CUDA Graph 录制""" """Test if embedding supports CUDA Graph recording"""
print("=" * 60) print("=" * 60)
print("测试 Embedding 图录制支持") print("Testing Embedding Graph Recording Support")
print("=" * 60) print("=" * 60)
# 检查是否有 CUDA # Check if CUDA is available
if not torch.cuda.is_available(): if not torch.cuda.is_available():
print("⚠ CUDA 不可用,跳过图录制测试") print("⚠ CUDA not available, skipping graph recording test")
return False return False
device = infinicore.device("cuda", 0) device = infinicore.device("cuda", 0)
# 创建 embedding 模块 # Create embedding module
vocab_size = 1000 vocab_size = 1000
embedding_dim = 128 embedding_dim = 128
embedding = infinicore.nn.Embedding( embedding = infinicore.nn.Embedding(
num_embeddings=vocab_size, num_embeddings=vocab_size,
embedding_dim=embedding_dim, embedding_dim=embedding_dim,
dtype=infinicore.float32, dtype=infinicore.float32,
device=device device=device,
) )
# 创建设备端的 input_ids(这是关键:改动前不支持,改动后支持) # Create device-side input_ids (key point: unsupported before modification, supported after)
batch_size = 4 batch_size = 4
seq_len = 32 seq_len = 32
input_ids_device = infinicore.from_list( input_ids_device = infinicore.from_list(
[[i % vocab_size for i in range(seq_len)] for _ in range(batch_size)], [[i % vocab_size for i in range(seq_len)] for _ in range(batch_size)],
dtype=infinicore.int64, dtype=infinicore.int64,
device=device device=device,
) )
print(f"\n1. 输入张量信息:") print(f"\n1. Input tensor information:")
print(f" - Shape: {input_ids_device.shape}") print(f" - Shape: {input_ids_device.shape}")
print(f" - Device: {input_ids_device.device.type}") print(f" - Device: {input_ids_device.device.type}")
print(f" - Dtype: {input_ids_device.dtype}") print(f" - Dtype: {input_ids_device.dtype}")
# 尝试使用 CUDA Graph 录制 # Attempt CUDA Graph recording
print(f"\n2. 尝试 CUDA Graph 录制...") print(f"\n2. Attempting CUDA Graph recording...")
# 使用 PyTorch CUDA Graph API 进行测试(更简单可靠) # Use PyTorch's CUDA Graph API for testing (simpler and more reliable)
try: try:
# 设置设备 # Set device
infinicore.set_device(device) infinicore.set_device(device)
# 使用 PyTorch CUDA Graph API # Use PyTorch's CUDA Graph API
# 注意:PyTorch 2.0+ 支持 torch.cuda.graph # Note: PyTorch 2.0+ supports torch.cuda.graph
try: try:
# 方法 1: 使用 PyTorch CUDA Graph(推荐) # Method 1: Use PyTorch CUDA Graph (recommended)
print(" 使用 PyTorch CUDA Graph API 测试...") print(" Using PyTorch CUDA Graph API for testing...")
# 创建 warmup 输入 # Create warmup input
warmup_input = input_ids_device warmup_input = input_ids_device
# Warmup(图录制前需要先执行一次,包括内存分配) # Warmup (need to execute once before graph recording, including memory allocation)
warmup_output = embedding.forward(warmup_input) embedding.forward(warmup_input)
infinicore.sync_stream() # 同步确保 warmup 完成 infinicore.sync_stream() # Synchronize to ensure warmup completes
# 预先分配输出张量(CUDA Graph 不支持动态内存分配) # Pre-allocate output tensor (CUDA Graph doesn't support dynamic memory allocation)
# 输出形状: input_shape + [embedding_dim] # Output shape: input_shape + [embedding_dim]
output_shape = list(input_ids_device.shape) + [embedding_dim] output_shape = list(input_ids_device.shape) + [embedding_dim]
output = infinicore.empty( output = infinicore.empty(
output_shape, output_shape, dtype=embedding.weight.dtype, device=device
dtype=embedding.weight.dtype,
device=device
) )
# Warmup embedding(确保内存分配完成) # Warmup embedding (ensure memory allocation is complete)
import infinicore.nn.functional as F import infinicore.nn.functional as F
F.embedding(warmup_input, embedding.weight, out=output) F.embedding(warmup_input, embedding.weight, out=output)
infinicore.sync_stream() infinicore.sync_stream()
# 开始图录制(使用预先分配的 output # Start graph recording (using pre-allocated output)
graph = torch.cuda.CUDAGraph() graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph): with torch.cuda.graph(graph):
# 使用 embedding out 参数(in-place),传入预先分配的 output # Use embedding's out parameter (in-place), passing pre-allocated output
F.embedding(input_ids_device, embedding.weight, out=output) F.embedding(input_ids_device, embedding.weight, out=output)
print(" ✓ 成功完成图录制!") print(" ✓ Graph recording successful!")
print(" ✓ Embedding 支持 CUDA Graph 录制") print(" ✓ Embedding supports CUDA Graph recording")
# 验证图可以重复执行 # Verify graph can be replayed
graph.replay() graph.replay()
infinicore.sync_stream() infinicore.sync_stream()
print(" ✓ 图可以成功重放") print(" ✓ Graph can be successfully replayed")
return True return True
except AttributeError: except AttributeError:
# PyTorch 版本可能不支持 torch.cuda.graph # PyTorch version may not support torch.cuda.graph
print(" ⚠ PyTorch 版本不支持 torch.cuda.graph,使用简化验证方法") print(
" ⚠ PyTorch version doesn't support torch.cuda.graph, using simplified verification method"
)
return test_embedding_async_verification(embedding, input_ids_device) return test_embedding_async_verification(embedding, input_ids_device)
except RuntimeError as e: except RuntimeError as e:
error_msg = str(e) error_msg = str(e)
if "capture" in error_msg.lower() or "graph" in error_msg.lower(): if "capture" in error_msg.lower() or "graph" in error_msg.lower():
print(f" ✗ 图录制失败: {e}") print(f" ✗ Graph recording failed: {e}")
print(" ✗ Embedding 不支持 CUDA Graph 录制(可能包含同步操作)") print(
" ✗ Embedding doesn't support CUDA Graph recording (may contain synchronous operations)"
)
return False return False
else: else:
print(f" ⚠ 图录制测试异常: {e}") print(f" ⚠ Graph recording test exception: {e}")
return test_embedding_async_verification(embedding, input_ids_device) return test_embedding_async_verification(embedding, input_ids_device)
except Exception as e: except Exception as e:
print(f" ⚠ 图录制测试异常: {e}") print(f" ⚠ Graph recording test exception: {e}")
print(" 使用简化验证方法...") print(" Using simplified verification method...")
import traceback import traceback
traceback.print_exc() traceback.print_exc()
return test_embedding_async_verification(embedding, input_ids_device) return test_embedding_async_verification(embedding, input_ids_device)
def test_embedding_async_verification(embedding, input_ids_device): def test_embedding_async_verification(embedding, input_ids_device):
""" """
简化验证:检查是否有同步操作 Simplified verification: Check if there are synchronous operations
关键检查点: Key checkpoints:
1. 输入是否可以在设备上(改动前需要 CPU,改动后支持设备) 1. Whether input can be on device (needed CPU before modification, supports device after)
2. 操作是否完全异步(没有同步点) 2. Whether operations are fully asynchronous (no synchronization points)
""" """
print("\n3. 简化验证:检查异步操作支持") print("\n3. Simplified verification: Checking asynchronous operation support")
# 验证 1: 输入可以在设备上 # Verification 1: Input can be on device
if input_ids_device.device.type != "cuda": if input_ids_device.device.type != "cuda":
print(" ✗ 输入不在设备上,无法验证") print(" ✗ Input not on device, cannot verify")
return False return False
print(" ✓ 输入在设备上") print(" ✓ Input is on device")
# 验证 2: 执行 forward,检查是否有同步操作 # Verification 2: Execute forward, check for synchronous operations
# 如果改动前,这里会调用 indices->to(cpu_device),触发同步 # Before modification, this would call indices->to(cpu_device), triggering synchronization
# 如果改动后,直接使用设备端 kernel,完全异步 # After modification, directly uses device-side kernel, fully asynchronous
try: try:
# 记录开始时间 # Record start time
start_event = infinicore.DeviceEvent(enable_timing=True) start_event = infinicore.DeviceEvent(enable_timing=True)
end_event = infinicore.DeviceEvent(enable_timing=True) end_event = infinicore.DeviceEvent(enable_timing=True)
start_event.record() start_event.record()
output = embedding.forward(input_ids_device) output = embedding.forward(input_ids_device)
end_event.record() end_event.record()
# 不立即同步,检查操作是否异步 # Don't synchronize immediately, check if operation is asynchronous
# 如果操作是异步的,query 应该返回 False(未完成) # If operation is asynchronous, query should return False (not completed)
# 如果操作是同步的,可能已经完成 # If operation is synchronous, may have already completed
# 等待一小段时间 # Wait a short time
import time import time
time.sleep(0.001) # 1ms time.sleep(0.001) # 1ms
# 检查事件状态 # Check event status
is_complete = end_event.query() is_complete = end_event.query()
if not is_complete: if not is_complete:
print(" ✓ 操作是异步的(事件未立即完成)") print(" ✓ Operation is asynchronous (event not immediately completed)")
else: else:
print(" ⚠ 操作可能包含同步点(事件立即完成)") print(
" ⚠ Operation may contain synchronization points (event immediately completed)"
# 同步并测量时间 )
# Synchronize and measure time
end_event.synchronize() end_event.synchronize()
elapsed = start_event.elapsed_time(end_event) elapsed = start_event.elapsed_time(end_event)
print(f" ✓ Forward 执行时间: {elapsed:.3f} ms") print(f" ✓ Forward execution time: {elapsed:.3f} ms")
print(f" ✓ 输出形状: {output.shape}") print(f" ✓ Output shape: {output.shape}")
print(f" ✓ 输出设备: {output.device.type}") print(f" ✓ Output device: {output.device.type}")
# 验证输出正确性 # Verify output correctness
embedding_dim = embedding.embedding_dim() embedding_dim = embedding.embedding_dim()
expected_shape = (*input_ids_device.shape, embedding_dim) expected_shape = (*input_ids_device.shape, embedding_dim)
if output.device.type == "cuda" and output.shape == expected_shape: if output.device.type == "cuda" and output.shape == expected_shape:
print(" ✓ 输出在设备上,形状正确") print(" ✓ Output on device, shape correct")
return True return True
else: else:
print(f" ✗ 输出验证失败") print(f" ✗ Output verification failed")
print(f" 期望形状: {expected_shape}, 实际形状: {output.shape}") print(
print(f" 期望设备: cuda, 实际设备: {output.device.type}") f" Expected shape: {expected_shape}, actual shape: {output.shape}"
)
print(f" Expected device: cuda, actual device: {output.device.type}")
return False return False
except Exception as e: except Exception as e:
print(f" ✗ 验证失败: {e}") print(f" ✗ Verification failed: {e}")
import traceback import traceback
traceback.print_exc() traceback.print_exc()
return False return False
def test_embedding_device_input_support(): def test_embedding_device_input_support():
"""测试 embedding 是否支持设备端输入""" """Test if embedding supports device-side input"""
print("\n" + "=" * 60) print("\n" + "=" * 60)
print("测试 Embedding 设备端输入支持") print("Testing Embedding Device-side Input Support")
print("=" * 60) print("=" * 60)
if not torch.cuda.is_available(): if not torch.cuda.is_available():
print("⚠ CUDA 不可用,跳过测试") print("⚠ CUDA not available, skipping test")
return False return False
device = infinicore.device("cuda", 0) device = infinicore.device("cuda", 0)
vocab_size = 100 vocab_size = 100
embedding_dim = 64 embedding_dim = 64
embedding = infinicore.nn.Embedding( embedding = infinicore.nn.Embedding(
num_embeddings=vocab_size, num_embeddings=vocab_size,
embedding_dim=embedding_dim, embedding_dim=embedding_dim,
dtype=infinicore.float32, dtype=infinicore.float32,
device=device device=device,
) )
# 测试 1: 设备端输入(改动后支持) # Test 1: Device-side input (supported after modification)
print("\n测试 1: 设备端输入") print("\nTest 1: Device-side input")
try: try:
input_ids_device = infinicore.from_list( input_ids_device = infinicore.from_list(
[[1, 2, 3, 4, 5]], [[1, 2, 3, 4, 5]], dtype=infinicore.int64, device=device
dtype=infinicore.int64,
device=device
) )
output = embedding.forward(input_ids_device) output = embedding.forward(input_ids_device)
print(f" ✓ 设备端输入成功") print(f" ✓ Device-side input successful")
print(f" - 输入设备: {input_ids_device.device.type}") print(f" - Input device: {input_ids_device.device.type}")
print(f" - 输出设备: {output.device.type}") print(f" - Output device: {output.device.type}")
print(f" - 输出形状: {output.shape}") print(f" - Output shape: {output.shape}")
return True return True
except Exception as e: except Exception as e:
print(f" ✗ 设备端输入失败: {e}") print(f" ✗ Device-side input failed: {e}")
return False return False
def main(): def main():
"""主测试函数""" """Main test function"""
print("\n" + "=" * 60) print("\n" + "=" * 60)
print("Embedding 图录制支持验证") print("Embedding Graph Recording Support Verification")
print("=" * 60) print("=" * 60)
results = [] results = []
# 测试 1: 图录制支持 # Test 1: Graph recording support
result1 = test_embedding_graph_recording() result1 = test_embedding_graph_recording()
results.append(("CUDA Graph 录制", result1)) results.append(("CUDA Graph Recording", result1))
# 测试 2: 设备端输入支持 # Test 2: Device-side input support
result2 = test_embedding_device_input_support() result2 = test_embedding_device_input_support()
results.append(("设备端输入", result2)) results.append(("Device-side Input", result2))
# 总结 # Summary
print("\n" + "=" * 60) print("\n" + "=" * 60)
print("测试结果总结") print("Test Results Summary")
print("=" * 60) print("=" * 60)
all_passed = True all_passed = True
for test_name, result in results: for test_name, result in results:
status = "✓ 通过" if result else "✗ 失败" status = "✓ Passed" if result else "✗ Failed"
print(f"{test_name}: {status}") print(f"{test_name}: {status}")
if not result: if not result:
all_passed = False all_passed = False
print("\n" + "=" * 60) print("\n" + "=" * 60)
if all_passed: if all_passed:
print("✓ 所有测试通过!Embedding 支持图录制") print("✓ All tests passed! Embedding supports graph recording")
else: else:
print("✗ 部分测试失败,Embedding 可能不完全支持图录制") print("✗ Some tests failed, embedding may not fully support graph recording")
print("=" * 60) print("=" * 60)
return all_passed return all_passed
......
# Embedding 图录制支持对比
## 改动前后对比
### ❌ 改动前:不支持图录制
**关键问题代码**(在 `nn::Embedding::forward` 中):
```cpp
// 改动前的实现
Tensor Embedding::forward(const Tensor &indices) const {
auto cpu_device = Device(Device::Type::CPU, 0);
auto indices_cpu = indices->to(cpu_device)->contiguous(); // ❌ 同步操作!
// ... 后续处理
}
```
**问题分析**
1. `indices->to(cpu_device)` 会触发 **同步的 D2H(Device-to-Host)内存拷贝**
2. CUDA Graph 录制要求所有操作都是**异步的**,不能有同步点
3. 同步操作会导致图录制失败或产生错误
**验证方法**
```python
# 改动前:这个操作会失败或产生同步
input_ids_device = infinicore.from_list(..., device="cuda:0") # 设备端输入
output = embedding.forward(input_ids_device) # ❌ 内部会同步拷贝到 CPU
```
---
### ✅ 改动后:支持图录制
**关键改进代码**
```cpp
// 改动后的实现
Tensor Embedding::forward(const Tensor &indices) const {
Tensor indices_contiguous = indices->is_contiguous() ? indices : indices->contiguous();
return op::embedding(indices_contiguous, weight_); // ✅ 直接使用设备端 kernel
}
```
**改进点**
1. **移除了同步操作**:不再调用 `indices->to(cpu_device)`
2. **使用设备端 CUDA kernel**:通过 InfiniOP 调用 `embeddingKernel`,完全在设备端执行
3. **完全异步**:所有操作都在 CUDA stream 上异步执行
**实现位置**
- CUDA Kernel: `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
- Kernel 启动:使用 `cudaStream_t`,完全异步
- 无同步点:没有 `cudaDeviceSynchronize()` 或 D2H 拷贝
**验证方法**
```python
# 改动后:这个操作完全异步,支持图录制
input_ids_device = infinicore.from_list(..., device="cuda:0") # 设备端输入
output = embedding.forward(input_ids_device) # ✅ 直接使用设备端 kernel,无同步
```
---
## 验证方法
### 方法 1: 代码检查
**检查点**
1. ✅ 是否有 `->to(cpu_device)` 调用?
2. ✅ 是否有 `synchronize()` 调用?
3. ✅ 是否有设备端 kernel 实现?
**改动前**
```cpp
// ❌ 有同步操作
auto indices_cpu = indices->to(cpu_device)->contiguous();
```
**改动后**
```cpp
// ✅ 无同步操作,直接使用设备端 kernel
return op::embedding(indices_contiguous, weight_);
```
### 方法 2: CUDA Graph API 测试
运行测试脚本:
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
**预期结果**
- ✅ 改动后:图录制成功
- ❌ 改动前:图录制失败(因为同步操作)
### 方法 3: 设备端输入测试
**关键测试**
```python
# 创建设备端输入
input_ids = infinicore.from_list([[1, 2, 3]], dtype=int64, device="cuda:0")
# 执行 forward
output = embedding.forward(input_ids) # 改动前会失败或同步,改动后成功
```
**改动前**
- 需要先将 `input_ids` 拷贝到 CPU
- 触发同步操作,无法图录制
**改动后**
- 直接使用设备端 `input_ids`
- 完全异步,支持图录制
---
## 技术细节对比
| 特性 | 改动前 | 改动后 |
|------|--------|--------|
| **输入设备** | 必须在 CPU | 支持设备端 |
| **同步操作** | ❌ 有(D2H拷贝) | ✅ 无 |
| **Kernel位置** | CPU 实现 | CUDA kernel |
| **图录制支持** | ❌ 不支持 | ✅ 支持 |
| **Batch维度** | ✅ 支持 | ✅ 支持 |
| **性能** | 较慢(同步开销) | 更快(异步) |
---
## 关键代码位置
### 改动前的问题代码
- `src/infinicore/nn/embedding.cc` (旧版本)
- 第58行:`indices->to(cpu_device)->contiguous()`
### 改动后的实现
- `src/infinicore/nn/embedding.cc` (新版本)
- 第48行:`indices->is_contiguous() ? indices : indices->contiguous()`
- 第52行:`return op::embedding(indices_contiguous, weight_)`
- `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
- CUDA kernel 实现,完全异步 ✅
- `src/infinicore/ops/embedding/embedding_infiniop.cc`
- InfiniOP 包装,调用设备端 kernel ✅
---
## 总结
**改动前的关键问题**
-`indices->to(cpu_device)` 触发同步 D2H 拷贝
- ❌ 无法进行 CUDA Graph 录制
- ❌ 性能较差(同步开销)
**改动后的改进**
- ✅ 移除所有同步操作
- ✅ 使用设备端 CUDA kernel
- ✅ 完全支持 CUDA Graph 录制
- ✅ 性能更好(完全异步)
# Embedding 图录制测试使用指南
## 🚀 快速开始
### 运行测试
```bash
cd /home/zhuyue/codes/InfiniCore
python test/infinicore/nn/test_embedding_graph_recording.py
```
---
## 📊 改动前后对比
### ❌ 改动前:不支持图录制
#### 1. 运行测试
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
#### 2. 预期输出
```
============================================================
Embedding 图录制支持验证
============================================================
============================================================
测试 Embedding 图录制支持
============================================================
1. 输入张量信息:
- Shape: [4, 32]
- Device: cuda
- Dtype: int64
2. 尝试 CUDA Graph 录制...
使用 PyTorch CUDA Graph API 测试...
✗ 图录制失败: [错误信息]
✗ Embedding 不支持 CUDA Graph 录制(可能包含同步操作)
3. 简化验证:检查异步操作支持
✓ 输入在设备上
⚠ 操作可能包含同步点(事件立即完成) ← 关键:说明有同步操作
✓ Forward 执行时间: X.XXX ms
✓ 输出形状: [4, 32, 128]
✓ 输出设备: cuda
✗ 输出验证失败
============================================================
测试 Embedding 设备端输入支持
============================================================
测试 1: 设备端输入
✗ 设备端输入失败: [错误信息]
============================================================
测试结果总结
============================================================
CUDA Graph 录制: ✗ 失败
设备端输入: ✗ 失败
============================================================
✗ 部分测试失败,Embedding 可能不完全支持图录制
============================================================
```
#### 3. 关键失败点
- **图录制失败**:因为代码中有 `indices->to(cpu_device)` 同步操作
- **设备端输入失败**:需要先将输入拷贝到 CPU
- **异步验证显示同步点**:事件立即完成,说明有同步操作
---
### ✅ 改动后:支持图录制
#### 1. 运行测试
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
#### 2. 预期输出
```
============================================================
Embedding 图录制支持验证
============================================================
============================================================
测试 Embedding 图录制支持
============================================================
1. 输入张量信息:
- Shape: [4, 32]
- Device: cuda
- Dtype: int64
2. 尝试 CUDA Graph 录制...
使用 PyTorch CUDA Graph API 测试...
✓ 成功完成图录制!
✓ Embedding 支持 CUDA Graph 录制
✓ 图可以成功重放
============================================================
测试 Embedding 设备端输入支持
============================================================
测试 1: 设备端输入
✓ 设备端输入成功
- 输入设备: cuda
- 输出设备: cuda
- 输出形状: [1, 5, 64]
============================================================
测试结果总结
============================================================
CUDA Graph 录制: ✓ 通过
设备端输入: ✓ 通过
============================================================
✓ 所有测试通过!Embedding 支持图录制
============================================================
```
#### 3. 关键成功点
- **图录制成功**:所有操作都是异步的,无同步点
- **设备端输入成功**:直接支持设备端输入,无需拷贝
- **图可以重放**:验证图录制的正确性
---
## 🔍 如何判断当前是改动前还是改动后?
### 方法 1: 代码检查(最快)
```bash
# 检查是否有同步操作
grep -n "to(cpu_device)" src/infinicore/nn/embedding.cc
# 结果解读:
# - 有输出 → ❌ 改动前(不支持图录制)
# - 无输出 → ✅ 改动后(支持图录制)
```
### 方法 2: 检查设备端实现
```bash
# 检查是否有设备端 CUDA kernel
ls src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu
# 结果解读:
# - 不存在 → ❌ 改动前(不支持图录制)
# - 存在 → ✅ 改动后(支持图录制)
```
### 方法 3: 运行测试(最准确)
```bash
python test/infinicore/nn/test_embedding_graph_recording.py
# 查看 "CUDA Graph 录制" 测试结果:
# - ✓ 通过 → ✅ 改动后(支持图录制)
# - ✗ 失败 → ❌ 改动前(不支持图录制)
```
---
## 📝 测试内容详解
### 测试 1: CUDA Graph 录制
**目的**:验证 embedding 是否可以在 CUDA Graph 中录制
**工作原理**:
1. 使用 PyTorch 的 `torch.cuda.CUDAGraph()` API
2. 在图录制模式下执行 `embedding.forward()`
3. 如果包含同步操作,录制会失败
4. 如果完全异步,录制会成功
**改动前**:
- ❌ 录制失败:因为 `indices->to(cpu_device)` 触发同步
**改动后**:
- ✅ 录制成功:使用设备端 CUDA kernel,完全异步
### 测试 2: 设备端输入支持
**目的**:验证 embedding 是否支持设备端输入
**工作原理**:
1. 创建设备端的 `input_ids`
2. 直接调用 `embedding.forward(input_ids)`
3. 检查是否成功且输出在设备上
**改动前**:
- ❌ 可能需要先将输入拷贝到 CPU(同步操作)
**改动后**:
- ✅ 直接支持设备端输入(完全异步)
### 测试 3: 异步操作验证(备用)
**目的**:当 CUDA Graph API 不可用时,使用事件验证异步性
**工作原理**:
1. 使用 `DeviceEvent` 记录操作时间
2. 检查操作是否立即完成(同步)或异步执行
**改动前**:
- ⚠️ 事件立即完成,说明有同步操作
**改动后**:
- ✅ 事件未立即完成,说明是异步操作
---
## 🛠️ 故障排查
### 问题 1: PyTorch 版本不支持 CUDA Graph
**现象**:
```
⚠ PyTorch 版本不支持 torch.cuda.graph,使用简化验证方法
```
**解决**:
- 需要 PyTorch 2.0+ 版本
- 测试会自动降级到简化验证方法
- 简化验证也能检测是否支持图录制
### 问题 2: CUDA 不可用
**现象**:
```
⚠ CUDA 不可用,跳过图录制测试
```
**解决**:
- 确保 CUDA 设备可用
- 测试需要 CUDA 环境
### 问题 3: 测试失败但不确定原因
**检查清单**:
1. ✅ 确认代码已编译(特别是 CUDA 支持)
2. ✅ 确认 CUDA 设备可用
3. ✅ 检查 `src/infinicore/nn/embedding.cc` 是否还有 `to(cpu_device)`
4. ✅ 检查是否有 `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
---
## 💡 快速验证脚本
创建一个简单的验证脚本:
```bash
#!/bin/bash
# quick_check.sh
cd /home/zhuyue/codes/InfiniCore
echo "=== 1. 代码检查 ==="
if grep -q "to(cpu_device)" src/infinicore/nn/embedding.cc; then
echo "❌ 改动前:发现同步操作 to(cpu_device)"
else
echo "✅ 改动后:无同步操作"
fi
echo ""
echo "=== 2. 设备端实现检查 ==="
if [ -f "src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu" ]; then
echo "✅ 改动后:有设备端 CUDA kernel"
else
echo "❌ 改动前:无设备端 CUDA kernel"
fi
echo ""
echo "=== 3. 运行测试 ==="
python test/infinicore/nn/test_embedding_graph_recording.py
```
使用方法:
```bash
chmod +x quick_check.sh
./quick_check.sh
```
---
## 📋 总结
### 改动前特征
| 检查项 | 结果 |
|--------|------|
| 代码中有 `to(cpu_device)` | ✅ 有 |
| 有设备端 CUDA kernel | ❌ 无 |
| 图录制测试 | ❌ 失败 |
| 设备端输入 | ❌ 失败 |
### 改动后特征
| 检查项 | 结果 |
|--------|------|
| 代码中有 `to(cpu_device)` | ❌ 无 |
| 有设备端 CUDA kernel | ✅ 有 |
| 图录制测试 | ✅ 成功 |
| 设备端输入 | ✅ 成功 |
### 最简单的判断方法
**运行测试脚本**,查看 "CUDA Graph 录制" 测试结果:
- ✅ **通过** → 支持图录制(改动后)
- ❌ **失败** → 不支持图录制(改动前)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment