Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
eb34d4d6
Commit
eb34d4d6
authored
Jan 09, 2026
by
wooway777
Browse files
issue/900 - adapt to graph and adjust test script
parent
835209e7
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
334 additions
and
535 deletions
+334
-535
include/infinicore/ops/embedding.hpp
include/infinicore/ops/embedding.hpp
+5
-8
src/infinicore/ops/embedding/embedding.cc
src/infinicore/ops/embedding/embedding.cc
+9
-17
src/infinicore/ops/embedding/embedding_infiniop.cc
src/infinicore/ops/embedding/embedding_infiniop.cc
+29
-34
test/infinicore/graph/test_embedding_graph_recording.py
test/infinicore/graph/test_embedding_graph_recording.py
+291
-0
test/infinicore/nn/EMBEDDING_GRAPH_RECORDING_COMPARISON.md
test/infinicore/nn/EMBEDDING_GRAPH_RECORDING_COMPARISON.md
+0
-159
test/infinicore/nn/HOW_TO_USE_GRAPH_RECORDING_TEST.md
test/infinicore/nn/HOW_TO_USE_GRAPH_RECORDING_TEST.md
+0
-317
No files found.
include/infinicore/ops/embedding.hpp
View file @
eb34d4d6
#pragma once
#pragma once
#include "../device.hpp"
#include "../graph/graph.hpp"
#include "common/op.hpp"
#include "common/op.hpp"
namespace
infinicore
::
op
{
namespace
infinicore
::
op
{
class
Embedding
{
INFINICORE_GRAPH_OP_CLASS
(
Embedding
,
Tensor
,
const
Tensor
&
,
const
Tensor
&
);
public:
using
schema
=
void
(
*
)(
Tensor
,
Tensor
,
Tensor
);
static
void
execute
(
Tensor
out
,
Tensor
input
,
Tensor
weight
);
static
common
::
OpDispatcher
<
schema
>
&
dispatcher
();
};
Tensor
embedding
(
Tensor
input
,
Tensor
weight
);
Tensor
embedding
(
const
Tensor
&
input
,
const
Tensor
&
weight
);
void
embedding_
(
Tensor
out
,
Tensor
input
,
Tensor
weight
);
void
embedding_
(
Tensor
out
,
const
Tensor
&
input
,
const
Tensor
&
weight
);
}
// namespace infinicore::op
}
// namespace infinicore::op
src/infinicore/ops/embedding/embedding.cc
View file @
eb34d4d6
...
@@ -5,27 +5,19 @@
...
@@ -5,27 +5,19 @@
#include <stdexcept>
#include <stdexcept>
namespace
infinicore
::
op
{
namespace
infinicore
::
op
{
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL
(
Embedding
);
common
::
OpDispatcher
<
Embedding
::
schema
>
&
Embedding
::
dispatcher
()
{
Embedding
::
Embedding
(
Tensor
out
,
const
Tensor
&
input
,
const
Tensor
&
weight
)
{
static
common
::
OpDispatcher
<
Embedding
::
schema
>
dispatcher_
;
return
dispatcher_
;
}
void
Embedding
::
execute
(
Tensor
out
,
Tensor
input
,
Tensor
weight
)
{
// Check that all tensors are on the same device
// This is critical: if input is on CPU while out/weight are on GPU,
// passing CPU pointer to CUDA kernel will cause memory access errors
INFINICORE_ASSERT_TENSORS_SAME_DEVICE
(
out
,
input
,
weight
);
INFINICORE_ASSERT_TENSORS_SAME_DEVICE
(
out
,
input
,
weight
);
INFINICORE_GRAPH_OP_DISPATCH
(
out
->
device
().
getType
(),
out
,
input
,
weight
);
}
// Set device context
void
Embedding
::
execute
(
Tensor
out
,
const
Tensor
&
input
,
const
Tensor
&
weight
)
{
infinicore
::
context
::
setDevice
(
out
->
device
());
INFINICORE_GRAPH_OP_RECORD_OR_RUN
(
Embedding
,
out
,
input
,
weight
);
// Use dispatcher to lookup kernel (infiniop implementation)
dispatcher
().
lookup
(
out
->
device
().
getType
())(
out
,
input
,
weight
);
}
}
Tensor
embedding
(
Tensor
input
,
// LongTensor of arbitrary shape containing the indices to extract
Tensor
embedding
(
const
Tensor
&
input
,
// LongTensor of arbitrary shape containing the indices to extract
Tensor
weight
// Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
const
Tensor
&
weight
// Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
)
{
)
{
auto
input_shape
=
input
->
shape
();
auto
input_shape
=
input
->
shape
();
auto
weight_shape
=
weight
->
shape
();
auto
weight_shape
=
weight
->
shape
();
...
@@ -40,7 +32,7 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i
...
@@ -40,7 +32,7 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i
return
inputs_embeds
;
return
inputs_embeds
;
}
}
void
embedding_
(
Tensor
out
,
Tensor
input
,
Tensor
weight
)
{
void
embedding_
(
Tensor
out
,
const
Tensor
&
input
,
const
Tensor
&
weight
)
{
Embedding
::
execute
(
out
,
input
,
weight
);
Embedding
::
execute
(
out
,
input
,
weight
);
}
}
...
...
src/infinicore/ops/embedding/embedding_infiniop.cc
View file @
eb34d4d6
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/embedding.hpp"
#include "infinicore/ops/embedding.hpp"
#include <infiniop.h>
namespace
infinicore
::
op
::
embedding_impl
::
infiniop
{
namespace
infinicore
::
op
::
embedding_impl
::
infiniop
{
thread_local
common
::
OpCache
<
size_t
,
infiniopEmbeddingDescriptor_t
>
caches
(
INFINIOP_CACHABLE_DESCRIPTOR
(
Descriptor
,
Embedding
,
100
);
100
,
// capacity
[](
infiniopEmbeddingDescriptor_t
&
desc
)
{
if
(
desc
!=
nullptr
)
{
INFINICORE_CHECK_ERROR
(
infiniopDestroyEmbeddingDescriptor
(
desc
));
desc
=
nullptr
;
}
});
void
calculate
(
Tensor
out
,
Tensor
input
,
Tensor
weight
)
{
struct
PlannedMeta
{
std
::
shared_ptr
<
Descriptor
>
descriptor
;
graph
::
GraphTensor
out
,
input
,
weight
;
};
void
*
plan
(
Tensor
out
,
const
Tensor
&
input
,
const
Tensor
&
weight
)
{
size_t
seed
=
hash_combine
(
out
,
input
,
weight
);
size_t
seed
=
hash_combine
(
out
,
input
,
weight
);
auto
device
=
context
::
getDevice
();
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE
(
auto
&
cache
=
caches
.
getCache
(
device
);
Descriptor
,
descriptor
,
Embedding
,
seed
,
out
->
desc
(),
input
->
desc
(),
weight
->
desc
());
auto
planned
=
new
PlannedMeta
{
descriptor
,
graph
::
GraphTensor
(
out
),
graph
::
GraphTensor
(
input
),
graph
::
GraphTensor
(
weight
)};
auto
desc_opt
=
cache
.
get
(
se
ed
)
;
return
plann
ed
;
infiniopEmbeddingDescriptor_t
desc
=
nullptr
;
}
if
(
!
desc_opt
)
{
void
run
(
void
*
planned_meta
)
{
INFINICORE_CHECK_ERROR
(
infiniopCreateEmbeddingDescriptor
(
auto
planned
=
reinterpret_cast
<
PlannedMeta
*>
(
planned_meta
);
context
::
getInfiniopHandle
(
device
),
&
desc
,
out
->
desc
(),
input
->
desc
(),
weight
->
desc
()));
cache
.
put
(
seed
,
desc
);
}
else
{
desc
=
*
desc_opt
;
}
INFINICORE_CHECK_ERROR
(
infiniopEmbedding
(
INFINICORE_CHECK_ERROR
(
infiniopEmbedding
(
desc
,
planned
->
descriptor
->
desc
,
out
->
data
(),
planned
->
out
->
data
(),
planned
->
input
->
data
(),
planned
->
weight
->
data
(),
context
::
getStream
()));
input
->
data
(),
}
weight
->
data
(),
context
::
getStream
()));
void
cleanup
(
void
**
planned_meta_ptr
)
{
delete
*
reinterpret_cast
<
PlannedMeta
**>
(
planned_meta_ptr
);
*
planned_meta_ptr
=
nullptr
;
}
}
static
bool
registered
=
[]()
{
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE
(
Embedding
,
&
plan
,
&
run
,
cleanup
);
Embedding
::
dispatcher
().
registerAll
(
&
calculate
,
false
);
return
true
;
}();
}
// namespace infinicore::op::embedding_impl::infiniop
}
// namespace infinicore::op::embedding_impl::infiniop
test/infinicore/
nn
/test_embedding_graph_recording.py
→
test/infinicore/
graph
/test_embedding_graph_recording.py
View file @
eb34d4d6
"""
"""
测试
embedding
是否支持
CUDA Graph
录制
Test if
embedding
supports
CUDA Graph
recording
使用方法:
Usage:
python test/infinicore/nn/test_embedding_graph_recording.py
python test/infinicore/nn/test_embedding_graph_recording.py
关键验证点:
Key verification points:
1.
改动前:
indices->to(cpu_device)
会触发同步的 D2H 拷贝,导致图录制失败
1.
Before modification:
indices->to(cpu_device)
triggers synchronous D2H copy, causing graph recording to fail
2.
改动后:使用设备端 CUDA kernel,完全异步,支持图录制
2.
After modification: Uses device-side CUDA kernel, fully asynchronous, supports graph recording
预期结果:
Expected results:
-
改动前:图录制失败,设备端输入可能失败
-
Before modification: Graph recording fails, device-side input may fail
-
改动后:图录制成功,设备端输入成功
-
After modification: Graph recording succeeds, device-side input succeeds
"""
"""
import
infinicore
import
infinicore
import
torch
import
torch
import
ctypes
def
test_embedding_graph_recording
():
def
test_embedding_graph_recording
():
"""
测试
embedding
是否支持
CUDA Graph
录制
"""
"""
Test if
embedding
supports
CUDA Graph
recording
"""
print
(
"="
*
60
)
print
(
"="
*
60
)
print
(
"
测试
Embedding
图录制支持
"
)
print
(
"
Testing
Embedding
Graph Recording Support
"
)
print
(
"="
*
60
)
print
(
"="
*
60
)
#
检查是否有 CUDA
#
Check if CUDA is available
if
not
torch
.
cuda
.
is_available
():
if
not
torch
.
cuda
.
is_available
():
print
(
"⚠ CUDA
不可用,跳过图录制测试
"
)
print
(
"⚠ CUDA
not available, skipping graph recording test
"
)
return
False
return
False
device
=
infinicore
.
device
(
"cuda"
,
0
)
device
=
infinicore
.
device
(
"cuda"
,
0
)
#
创建
embedding
模块
#
Create
embedding
module
vocab_size
=
1000
vocab_size
=
1000
embedding_dim
=
128
embedding_dim
=
128
embedding
=
infinicore
.
nn
.
Embedding
(
embedding
=
infinicore
.
nn
.
Embedding
(
num_embeddings
=
vocab_size
,
num_embeddings
=
vocab_size
,
embedding_dim
=
embedding_dim
,
embedding_dim
=
embedding_dim
,
dtype
=
infinicore
.
float32
,
dtype
=
infinicore
.
float32
,
device
=
device
device
=
device
,
)
)
#
创建设备端的 input_ids(这是关键:改动前不支持,改动后支持)
#
Create device-side input_ids (key point: unsupported before modification, supported after)
batch_size
=
4
batch_size
=
4
seq_len
=
32
seq_len
=
32
input_ids_device
=
infinicore
.
from_list
(
input_ids_device
=
infinicore
.
from_list
(
[[
i
%
vocab_size
for
i
in
range
(
seq_len
)]
for
_
in
range
(
batch_size
)],
[[
i
%
vocab_size
for
i
in
range
(
seq_len
)]
for
_
in
range
(
batch_size
)],
dtype
=
infinicore
.
int64
,
dtype
=
infinicore
.
int64
,
device
=
device
device
=
device
,
)
)
print
(
f
"
\n
1.
输入张量信息
:"
)
print
(
f
"
\n
1.
Input tensor information
:"
)
print
(
f
" - Shape:
{
input_ids_device
.
shape
}
"
)
print
(
f
" - Shape:
{
input_ids_device
.
shape
}
"
)
print
(
f
" - Device:
{
input_ids_device
.
device
.
type
}
"
)
print
(
f
" - Device:
{
input_ids_device
.
device
.
type
}
"
)
print
(
f
" - Dtype:
{
input_ids_device
.
dtype
}
"
)
print
(
f
" - Dtype:
{
input_ids_device
.
dtype
}
"
)
#
尝试使用
CUDA Graph
录制
#
Attempt
CUDA Graph
recording
print
(
f
"
\n
2.
尝试
CUDA Graph
录制
..."
)
print
(
f
"
\n
2.
Attempting
CUDA Graph
recording
..."
)
#
使用
PyTorch
的
CUDA Graph API
进行测试(更简单可靠)
#
Use
PyTorch
's
CUDA Graph API
for testing (simpler and more reliable)
try
:
try
:
#
设置设备
#
Set device
infinicore
.
set_device
(
device
)
infinicore
.
set_device
(
device
)
#
使用
PyTorch
的
CUDA Graph API
#
Use
PyTorch
's
CUDA Graph API
#
注意:
PyTorch 2.0+
支持
torch.cuda.graph
#
Note:
PyTorch 2.0+
supports
torch.cuda.graph
try
:
try
:
#
方法
1:
使用
PyTorch
的
CUDA Graph
(推荐)
#
Method
1:
Use
PyTorch CUDA Graph
(recommended)
print
(
"
使用
PyTorch CUDA Graph API
测试
..."
)
print
(
"
Using
PyTorch CUDA Graph API
for testing
..."
)
#
创建
warmup
输入
#
Create
warmup
input
warmup_input
=
input_ids_device
warmup_input
=
input_ids_device
# Warmup
(图录制前需要先执行一次,包括内存分配)
# Warmup
(need to execute once before graph recording, including memory allocation)
warmup_output
=
embedding
.
forward
(
warmup_input
)
embedding
.
forward
(
warmup_input
)
infinicore
.
sync_stream
()
#
同步确保 warmup 完成
infinicore
.
sync_stream
()
#
Synchronize to ensure warmup completes
#
预先分配输出张量(CUDA Graph 不支持动态内存分配)
#
Pre-allocate output tensor (CUDA Graph doesn't support dynamic memory allocation)
#
输出形状
: input_shape + [embedding_dim]
#
Output shape
: input_shape + [embedding_dim]
output_shape
=
list
(
input_ids_device
.
shape
)
+
[
embedding_dim
]
output_shape
=
list
(
input_ids_device
.
shape
)
+
[
embedding_dim
]
output
=
infinicore
.
empty
(
output
=
infinicore
.
empty
(
output_shape
,
output_shape
,
dtype
=
embedding
.
weight
.
dtype
,
device
=
device
dtype
=
embedding
.
weight
.
dtype
,
device
=
device
)
)
# Warmup embedding
(确保内存分配完成)
# Warmup embedding
(ensure memory allocation is complete)
import
infinicore.nn.functional
as
F
import
infinicore.nn.functional
as
F
F
.
embedding
(
warmup_input
,
embedding
.
weight
,
out
=
output
)
F
.
embedding
(
warmup_input
,
embedding
.
weight
,
out
=
output
)
infinicore
.
sync_stream
()
infinicore
.
sync_stream
()
#
开始图录制(使用预先分配的
output
)
#
Start graph recording (using pre-allocated
output
)
graph
=
torch
.
cuda
.
CUDAGraph
()
graph
=
torch
.
cuda
.
CUDAGraph
()
with
torch
.
cuda
.
graph
(
graph
):
with
torch
.
cuda
.
graph
(
graph
):
#
使用
embedding
的
out
参数(in-place),传入预先分配的
output
#
Use
embedding
's
out
parameter (in-place), passing pre-allocated
output
F
.
embedding
(
input_ids_device
,
embedding
.
weight
,
out
=
output
)
F
.
embedding
(
input_ids_device
,
embedding
.
weight
,
out
=
output
)
print
(
" ✓
成功完成图录制!
"
)
print
(
" ✓
Graph recording successful!
"
)
print
(
" ✓ Embedding
支持
CUDA Graph
录制
"
)
print
(
" ✓ Embedding
supports
CUDA Graph
recording
"
)
#
验证图可以重复执行
#
Verify graph can be replayed
graph
.
replay
()
graph
.
replay
()
infinicore
.
sync_stream
()
infinicore
.
sync_stream
()
print
(
" ✓
图可以成功重放
"
)
print
(
" ✓
Graph can be successfully replayed
"
)
return
True
return
True
except
AttributeError
:
except
AttributeError
:
# PyTorch 版本可能不支持 torch.cuda.graph
# PyTorch version may not support torch.cuda.graph
print
(
" ⚠ PyTorch 版本不支持 torch.cuda.graph,使用简化验证方法"
)
print
(
" ⚠ PyTorch version doesn't support torch.cuda.graph, using simplified verification method"
)
return
test_embedding_async_verification
(
embedding
,
input_ids_device
)
return
test_embedding_async_verification
(
embedding
,
input_ids_device
)
except
RuntimeError
as
e
:
except
RuntimeError
as
e
:
error_msg
=
str
(
e
)
error_msg
=
str
(
e
)
if
"capture"
in
error_msg
.
lower
()
or
"graph"
in
error_msg
.
lower
():
if
"capture"
in
error_msg
.
lower
()
or
"graph"
in
error_msg
.
lower
():
print
(
f
" ✗ 图录制失败:
{
e
}
"
)
print
(
f
" ✗ Graph recording failed:
{
e
}
"
)
print
(
" ✗ Embedding 不支持 CUDA Graph 录制(可能包含同步操作)"
)
print
(
" ✗ Embedding doesn't support CUDA Graph recording (may contain synchronous operations)"
)
return
False
return
False
else
:
else
:
print
(
f
" ⚠
图录制测试异常
:
{
e
}
"
)
print
(
f
" ⚠
Graph recording test exception
:
{
e
}
"
)
return
test_embedding_async_verification
(
embedding
,
input_ids_device
)
return
test_embedding_async_verification
(
embedding
,
input_ids_device
)
except
Exception
as
e
:
except
Exception
as
e
:
print
(
f
" ⚠
图录制测试异常
:
{
e
}
"
)
print
(
f
" ⚠
Graph recording test exception
:
{
e
}
"
)
print
(
"
使用简化验证方法
..."
)
print
(
"
Using simplified verification method
..."
)
import
traceback
import
traceback
traceback
.
print_exc
()
traceback
.
print_exc
()
return
test_embedding_async_verification
(
embedding
,
input_ids_device
)
return
test_embedding_async_verification
(
embedding
,
input_ids_device
)
def
test_embedding_async_verification
(
embedding
,
input_ids_device
):
def
test_embedding_async_verification
(
embedding
,
input_ids_device
):
"""
"""
简化验证:检查是否有同步操作
Simplified verification: Check if there are synchronous operations
关键检查点:
Key checkpoints:
1.
输入是否可以在设备上(改动前需要 CPU,改动后支持设备)
1.
Whether input can be on device (needed CPU before modification, supports device after)
2.
操作是否完全异步(没有同步点)
2.
Whether operations are fully asynchronous (no synchronization points)
"""
"""
print
(
"
\n
3.
简化验证:检查异步操作支持
"
)
print
(
"
\n
3.
Simplified verification: Checking asynchronous operation support
"
)
#
验证 1: 输入可以在设备上
#
Verification 1: Input can be on device
if
input_ids_device
.
device
.
type
!=
"cuda"
:
if
input_ids_device
.
device
.
type
!=
"cuda"
:
print
(
" ✗
输入不在设备上,无法验证
"
)
print
(
" ✗
Input not on device, cannot verify
"
)
return
False
return
False
print
(
" ✓
输入在设备上
"
)
print
(
" ✓
Input is on device
"
)
#
验证 2: 执行 forward,检查是否有同步操作
#
Verification 2: Execute forward, check for synchronous operations
#
如果改动前,这里会调用 indices->to(cpu_device),触发同步
#
Before modification, this would call indices->to(cpu_device), triggering synchronization
#
如果改动后,直接使用设备端 kernel,完全异步
#
After modification, directly uses device-side kernel, fully asynchronous
try
:
try
:
#
记录开始时间
#
Record start time
start_event
=
infinicore
.
DeviceEvent
(
enable_timing
=
True
)
start_event
=
infinicore
.
DeviceEvent
(
enable_timing
=
True
)
end_event
=
infinicore
.
DeviceEvent
(
enable_timing
=
True
)
end_event
=
infinicore
.
DeviceEvent
(
enable_timing
=
True
)
start_event
.
record
()
start_event
.
record
()
output
=
embedding
.
forward
(
input_ids_device
)
output
=
embedding
.
forward
(
input_ids_device
)
end_event
.
record
()
end_event
.
record
()
#
不立即同步,检查操作是否异步
#
Don't synchronize immediately, check if operation is asynchronous
#
如果操作是异步的,query 应该返回 False(未完成)
#
If operation is asynchronous, query should return False (not completed)
#
如果操作是同步的,可能已经完成
#
If operation is synchronous, may have already completed
#
等待一小段时间
#
Wait a short time
import
time
import
time
time
.
sleep
(
0.001
)
# 1ms
time
.
sleep
(
0.001
)
# 1ms
#
检查事件状态
#
Check event status
is_complete
=
end_event
.
query
()
is_complete
=
end_event
.
query
()
if
not
is_complete
:
if
not
is_complete
:
print
(
" ✓
操作是异步的(事件未立即完成)
"
)
print
(
" ✓
Operation is asynchronous (event not immediately completed)
"
)
else
:
else
:
print
(
" ⚠ 操作可能包含同步点(事件立即完成)"
)
print
(
" ⚠ Operation may contain synchronization points (event immediately completed)"
# 同步并测量时间
)
# Synchronize and measure time
end_event
.
synchronize
()
end_event
.
synchronize
()
elapsed
=
start_event
.
elapsed_time
(
end_event
)
elapsed
=
start_event
.
elapsed_time
(
end_event
)
print
(
f
" ✓ Forward
执行时间
:
{
elapsed
:.
3
f
}
ms"
)
print
(
f
" ✓ Forward
execution time
:
{
elapsed
:.
3
f
}
ms"
)
print
(
f
" ✓
输出形状
:
{
output
.
shape
}
"
)
print
(
f
" ✓
Output shape
:
{
output
.
shape
}
"
)
print
(
f
" ✓
输出设备
:
{
output
.
device
.
type
}
"
)
print
(
f
" ✓
Output device
:
{
output
.
device
.
type
}
"
)
#
验证输出正确性
#
Verify output correctness
embedding_dim
=
embedding
.
embedding_dim
()
embedding_dim
=
embedding
.
embedding_dim
()
expected_shape
=
(
*
input_ids_device
.
shape
,
embedding_dim
)
expected_shape
=
(
*
input_ids_device
.
shape
,
embedding_dim
)
if
output
.
device
.
type
==
"cuda"
and
output
.
shape
==
expected_shape
:
if
output
.
device
.
type
==
"cuda"
and
output
.
shape
==
expected_shape
:
print
(
" ✓
输出在设备上,形状正确
"
)
print
(
" ✓
Output on device, shape correct
"
)
return
True
return
True
else
:
else
:
print
(
f
" ✗ 输出验证失败"
)
print
(
f
" ✗ Output verification failed"
)
print
(
f
" 期望形状:
{
expected_shape
}
, 实际形状:
{
output
.
shape
}
"
)
print
(
print
(
f
" 期望设备: cuda, 实际设备:
{
output
.
device
.
type
}
"
)
f
" Expected shape:
{
expected_shape
}
, actual shape:
{
output
.
shape
}
"
)
print
(
f
" Expected device: cuda, actual device:
{
output
.
device
.
type
}
"
)
return
False
return
False
except
Exception
as
e
:
except
Exception
as
e
:
print
(
f
" ✗
验证失败
:
{
e
}
"
)
print
(
f
" ✗
Verification failed
:
{
e
}
"
)
import
traceback
import
traceback
traceback
.
print_exc
()
traceback
.
print_exc
()
return
False
return
False
def
test_embedding_device_input_support
():
def
test_embedding_device_input_support
():
"""
测试
embedding
是否支持设备端输入
"""
"""
Test if
embedding
supports device-side input
"""
print
(
"
\n
"
+
"="
*
60
)
print
(
"
\n
"
+
"="
*
60
)
print
(
"
测试
Embedding
设备端输入支持
"
)
print
(
"
Testing
Embedding
Device-side Input Support
"
)
print
(
"="
*
60
)
print
(
"="
*
60
)
if
not
torch
.
cuda
.
is_available
():
if
not
torch
.
cuda
.
is_available
():
print
(
"⚠ CUDA
不可用,跳过测试
"
)
print
(
"⚠ CUDA
not available, skipping test
"
)
return
False
return
False
device
=
infinicore
.
device
(
"cuda"
,
0
)
device
=
infinicore
.
device
(
"cuda"
,
0
)
vocab_size
=
100
vocab_size
=
100
embedding_dim
=
64
embedding_dim
=
64
embedding
=
infinicore
.
nn
.
Embedding
(
embedding
=
infinicore
.
nn
.
Embedding
(
num_embeddings
=
vocab_size
,
num_embeddings
=
vocab_size
,
embedding_dim
=
embedding_dim
,
embedding_dim
=
embedding_dim
,
dtype
=
infinicore
.
float32
,
dtype
=
infinicore
.
float32
,
device
=
device
device
=
device
,
)
)
#
测试 1: 设备端输入(改动后支持)
#
Test 1: Device-side input (supported after modification)
print
(
"
\n
测试 1: 设备端输入
"
)
print
(
"
\n
Test 1: Device-side input
"
)
try
:
try
:
input_ids_device
=
infinicore
.
from_list
(
input_ids_device
=
infinicore
.
from_list
(
[[
1
,
2
,
3
,
4
,
5
]],
[[
1
,
2
,
3
,
4
,
5
]],
dtype
=
infinicore
.
int64
,
device
=
device
dtype
=
infinicore
.
int64
,
device
=
device
)
)
output
=
embedding
.
forward
(
input_ids_device
)
output
=
embedding
.
forward
(
input_ids_device
)
print
(
f
" ✓
设备端输入成功
"
)
print
(
f
" ✓
Device-side input successful
"
)
print
(
f
" -
输入设备
:
{
input_ids_device
.
device
.
type
}
"
)
print
(
f
" -
Input device
:
{
input_ids_device
.
device
.
type
}
"
)
print
(
f
" -
输出设备
:
{
output
.
device
.
type
}
"
)
print
(
f
" -
Output device
:
{
output
.
device
.
type
}
"
)
print
(
f
" -
输出形状
:
{
output
.
shape
}
"
)
print
(
f
" -
Output shape
:
{
output
.
shape
}
"
)
return
True
return
True
except
Exception
as
e
:
except
Exception
as
e
:
print
(
f
" ✗
设备端输入失败
:
{
e
}
"
)
print
(
f
" ✗
Device-side input failed
:
{
e
}
"
)
return
False
return
False
def
main
():
def
main
():
"""
主测试函数
"""
"""
Main test function
"""
print
(
"
\n
"
+
"="
*
60
)
print
(
"
\n
"
+
"="
*
60
)
print
(
"Embedding
图录制支持验证
"
)
print
(
"Embedding
Graph Recording Support Verification
"
)
print
(
"="
*
60
)
print
(
"="
*
60
)
results
=
[]
results
=
[]
#
测试 1: 图录制支持
#
Test 1: Graph recording support
result1
=
test_embedding_graph_recording
()
result1
=
test_embedding_graph_recording
()
results
.
append
((
"CUDA Graph
录制
"
,
result1
))
results
.
append
((
"CUDA Graph
Recording
"
,
result1
))
#
测试 2: 设备端输入支持
#
Test 2: Device-side input support
result2
=
test_embedding_device_input_support
()
result2
=
test_embedding_device_input_support
()
results
.
append
((
"
设备端输入
"
,
result2
))
results
.
append
((
"
Device-side Input
"
,
result2
))
#
总结
#
Summary
print
(
"
\n
"
+
"="
*
60
)
print
(
"
\n
"
+
"="
*
60
)
print
(
"
测试结果总结
"
)
print
(
"
Test Results Summary
"
)
print
(
"="
*
60
)
print
(
"="
*
60
)
all_passed
=
True
all_passed
=
True
for
test_name
,
result
in
results
:
for
test_name
,
result
in
results
:
status
=
"✓
通过
"
if
result
else
"✗
失败
"
status
=
"✓
Passed
"
if
result
else
"✗
Failed
"
print
(
f
"
{
test_name
}
:
{
status
}
"
)
print
(
f
"
{
test_name
}
:
{
status
}
"
)
if
not
result
:
if
not
result
:
all_passed
=
False
all_passed
=
False
print
(
"
\n
"
+
"="
*
60
)
print
(
"
\n
"
+
"="
*
60
)
if
all_passed
:
if
all_passed
:
print
(
"✓
所有测试通过!Embedding 支持图录制
"
)
print
(
"✓
All tests passed! Embedding supports graph recording
"
)
else
:
else
:
print
(
"✗
部分测试失败,Embedding 可能不完全支持图录制
"
)
print
(
"✗
Some tests failed, embedding may not fully support graph recording
"
)
print
(
"="
*
60
)
print
(
"="
*
60
)
return
all_passed
return
all_passed
...
...
test/infinicore/nn/EMBEDDING_GRAPH_RECORDING_COMPARISON.md
deleted
100644 → 0
View file @
835209e7
# Embedding 图录制支持对比
## 改动前后对比
### ❌ 改动前:不支持图录制
**关键问题代码**
(在
`nn::Embedding::forward`
中):
```
cpp
// 改动前的实现
Tensor
Embedding
::
forward
(
const
Tensor
&
indices
)
const
{
auto
cpu_device
=
Device
(
Device
::
Type
::
CPU
,
0
);
auto
indices_cpu
=
indices
->
to
(
cpu_device
)
->
contiguous
();
// ❌ 同步操作!
// ... 后续处理
}
```
**问题分析**
:
1.
`indices->to(cpu_device)`
会触发
**同步的 D2H(Device-to-Host)内存拷贝**
2.
CUDA Graph 录制要求所有操作都是
**异步的**
,不能有同步点
3.
同步操作会导致图录制失败或产生错误
**验证方法**
:
```
python
# 改动前:这个操作会失败或产生同步
input_ids_device
=
infinicore
.
from_list
(...,
device
=
"cuda:0"
)
# 设备端输入
output
=
embedding
.
forward
(
input_ids_device
)
# ❌ 内部会同步拷贝到 CPU
```
---
### ✅ 改动后:支持图录制
**关键改进代码**
:
```
cpp
// 改动后的实现
Tensor
Embedding
::
forward
(
const
Tensor
&
indices
)
const
{
Tensor
indices_contiguous
=
indices
->
is_contiguous
()
?
indices
:
indices
->
contiguous
();
return
op
::
embedding
(
indices_contiguous
,
weight_
);
// ✅ 直接使用设备端 kernel
}
```
**改进点**
:
1.
**移除了同步操作**
:不再调用
`indices->to(cpu_device)`
2.
**使用设备端 CUDA kernel**
:通过 InfiniOP 调用
`embeddingKernel`
,完全在设备端执行
3.
**完全异步**
:所有操作都在 CUDA stream 上异步执行
**实现位置**
:
-
CUDA Kernel:
`src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
-
Kernel 启动:使用
`cudaStream_t`
,完全异步
-
无同步点:没有
`cudaDeviceSynchronize()`
或 D2H 拷贝
**验证方法**
:
```
python
# 改动后:这个操作完全异步,支持图录制
input_ids_device
=
infinicore
.
from_list
(...,
device
=
"cuda:0"
)
# 设备端输入
output
=
embedding
.
forward
(
input_ids_device
)
# ✅ 直接使用设备端 kernel,无同步
```
---
## 验证方法
### 方法 1: 代码检查
**检查点**
:
1.
✅ 是否有
`->to(cpu_device)`
调用?
2.
✅ 是否有
`synchronize()`
调用?
3.
✅ 是否有设备端 kernel 实现?
**改动前**
:
```
cpp
// ❌ 有同步操作
auto
indices_cpu
=
indices
->
to
(
cpu_device
)
->
contiguous
();
```
**改动后**
:
```
cpp
// ✅ 无同步操作,直接使用设备端 kernel
return
op
::
embedding
(
indices_contiguous
,
weight_
);
```
### 方法 2: CUDA Graph API 测试
运行测试脚本:
```
bash
python
test
/infinicore/nn/test_embedding_graph_recording.py
```
**预期结果**
:
-
✅ 改动后:图录制成功
-
❌ 改动前:图录制失败(因为同步操作)
### 方法 3: 设备端输入测试
**关键测试**
:
```
python
# 创建设备端输入
input_ids
=
infinicore
.
from_list
([[
1
,
2
,
3
]],
dtype
=
int64
,
device
=
"cuda:0"
)
# 执行 forward
output
=
embedding
.
forward
(
input_ids
)
# 改动前会失败或同步,改动后成功
```
**改动前**
:
-
需要先将
`input_ids`
拷贝到 CPU
-
触发同步操作,无法图录制
**改动后**
:
-
直接使用设备端
`input_ids`
-
完全异步,支持图录制
---
## 技术细节对比
| 特性 | 改动前 | 改动后 |
|------|--------|--------|
|
**输入设备**
| 必须在 CPU | 支持设备端 |
|
**同步操作**
| ❌ 有(D2H拷贝) | ✅ 无 |
|
**Kernel位置**
| CPU 实现 | CUDA kernel |
|
**图录制支持**
| ❌ 不支持 | ✅ 支持 |
|
**Batch维度**
| ✅ 支持 | ✅ 支持 |
|
**性能**
| 较慢(同步开销) | 更快(异步) |
---
## 关键代码位置
### 改动前的问题代码
-
`src/infinicore/nn/embedding.cc`
(旧版本)
-
第58行:
`indices->to(cpu_device)->contiguous()`
❌
### 改动后的实现
-
`src/infinicore/nn/embedding.cc`
(新版本)
-
第48行:
`indices->is_contiguous() ? indices : indices->contiguous()`
✅
-
第52行:
`return op::embedding(indices_contiguous, weight_)`
✅
-
`src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
-
CUDA kernel 实现,完全异步 ✅
-
`src/infinicore/ops/embedding/embedding_infiniop.cc`
-
InfiniOP 包装,调用设备端 kernel ✅
---
## 总结
**改动前的关键问题**
:
-
❌
`indices->to(cpu_device)`
触发同步 D2H 拷贝
-
❌ 无法进行 CUDA Graph 录制
-
❌ 性能较差(同步开销)
**改动后的改进**
:
-
✅ 移除所有同步操作
-
✅ 使用设备端 CUDA kernel
-
✅ 完全支持 CUDA Graph 录制
-
✅ 性能更好(完全异步)
test/infinicore/nn/HOW_TO_USE_GRAPH_RECORDING_TEST.md
deleted
100644 → 0
View file @
835209e7
# Embedding 图录制测试使用指南
## 🚀 快速开始
### 运行测试
```
bash
cd
/home/zhuyue/codes/InfiniCore
python
test
/infinicore/nn/test_embedding_graph_recording.py
```
---
## 📊 改动前后对比
### ❌ 改动前:不支持图录制
#### 1. 运行测试
```
bash
python
test
/infinicore/nn/test_embedding_graph_recording.py
```
#### 2. 预期输出
```
============================================================
Embedding 图录制支持验证
============================================================
============================================================
测试 Embedding 图录制支持
============================================================
1.
输入张量信息:
-
Shape: [4, 32]
-
Device: cuda
-
Dtype: int64
2.
尝试 CUDA Graph 录制...
使用 PyTorch CUDA Graph API 测试...
✗ 图录制失败: [错误信息]
✗ Embedding 不支持 CUDA Graph 录制(可能包含同步操作)
3.
简化验证:检查异步操作支持
✓ 输入在设备上
⚠ 操作可能包含同步点(事件立即完成) ← 关键:说明有同步操作
✓ Forward 执行时间: X.XXX ms
✓ 输出形状: [4, 32, 128]
✓ 输出设备: cuda
✗ 输出验证失败
============================================================
测试 Embedding 设备端输入支持
============================================================
测试 1: 设备端输入
✗ 设备端输入失败: [错误信息]
============================================================
测试结果总结
============================================================
CUDA Graph 录制: ✗ 失败
设备端输入: ✗ 失败
============================================================
✗ 部分测试失败,Embedding 可能不完全支持图录制
============================================================
```
#### 3. 关键失败点
- **图录制失败**:因为代码中有 `indices->to(cpu_device)` 同步操作
- **设备端输入失败**:需要先将输入拷贝到 CPU
- **异步验证显示同步点**:事件立即完成,说明有同步操作
---
### ✅ 改动后:支持图录制
#### 1. 运行测试
```
bash
python test/infinicore/nn/test_embedding_graph_recording.py
```
#### 2. 预期输出
```
============================================================
Embedding 图录制支持验证
============================================================
============================================================
测试 Embedding 图录制支持
============================================================
1.
输入张量信息:
-
Shape: [4, 32]
-
Device: cuda
-
Dtype: int64
2.
尝试 CUDA Graph 录制...
使用 PyTorch CUDA Graph API 测试...
✓ 成功完成图录制!
✓ Embedding 支持 CUDA Graph 录制
✓ 图可以成功重放
============================================================
测试 Embedding 设备端输入支持
============================================================
测试 1: 设备端输入
✓ 设备端输入成功
-
输入设备: cuda
-
输出设备: cuda
-
输出形状: [1, 5, 64]
============================================================
测试结果总结
============================================================
CUDA Graph 录制: ✓ 通过
设备端输入: ✓ 通过
============================================================
✓ 所有测试通过!Embedding 支持图录制
============================================================
```
#### 3. 关键成功点
- **图录制成功**:所有操作都是异步的,无同步点
- **设备端输入成功**:直接支持设备端输入,无需拷贝
- **图可以重放**:验证图录制的正确性
---
## 🔍 如何判断当前是改动前还是改动后?
### 方法 1: 代码检查(最快)
```
bash
# 检查是否有同步操作
grep -n "to(cpu_device)" src/infinicore/nn/embedding.cc
# 结果解读:
# - 有输出 → ❌ 改动前(不支持图录制)
# - 无输出 → ✅ 改动后(支持图录制)
```
### 方法 2: 检查设备端实现
```
bash
# 检查是否有设备端 CUDA kernel
ls src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu
# 结果解读:
# - 不存在 → ❌ 改动前(不支持图录制)
# - 存在 → ✅ 改动后(支持图录制)
```
### 方法 3: 运行测试(最准确)
```
bash
python test/infinicore/nn/test_embedding_graph_recording.py
# 查看 "CUDA Graph 录制" 测试结果:
# - ✓ 通过 → ✅ 改动后(支持图录制)
# - ✗ 失败 → ❌ 改动前(不支持图录制)
```
---
## 📝 测试内容详解
### 测试 1: CUDA Graph 录制
**目的**:验证 embedding 是否可以在 CUDA Graph 中录制
**工作原理**:
1. 使用 PyTorch 的 `torch.cuda.CUDAGraph()` API
2. 在图录制模式下执行 `embedding.forward()`
3. 如果包含同步操作,录制会失败
4. 如果完全异步,录制会成功
**改动前**:
- ❌ 录制失败:因为 `indices->to(cpu_device)` 触发同步
**改动后**:
- ✅ 录制成功:使用设备端 CUDA kernel,完全异步
### 测试 2: 设备端输入支持
**目的**:验证 embedding 是否支持设备端输入
**工作原理**:
1. 创建设备端的 `input_ids`
2. 直接调用 `embedding.forward(input_ids)`
3. 检查是否成功且输出在设备上
**改动前**:
- ❌ 可能需要先将输入拷贝到 CPU(同步操作)
**改动后**:
- ✅ 直接支持设备端输入(完全异步)
### 测试 3: 异步操作验证(备用)
**目的**:当 CUDA Graph API 不可用时,使用事件验证异步性
**工作原理**:
1. 使用 `DeviceEvent` 记录操作时间
2. 检查操作是否立即完成(同步)或异步执行
**改动前**:
- ⚠️ 事件立即完成,说明有同步操作
**改动后**:
- ✅ 事件未立即完成,说明是异步操作
---
## 🛠️ 故障排查
### 问题 1: PyTorch 版本不支持 CUDA Graph
**现象**:
```
⚠ PyTorch 版本不支持 torch.cuda.graph,使用简化验证方法
```
**解决**:
- 需要 PyTorch 2.0+ 版本
- 测试会自动降级到简化验证方法
- 简化验证也能检测是否支持图录制
### 问题 2: CUDA 不可用
**现象**:
```
⚠ CUDA 不可用,跳过图录制测试
```
**解决**:
- 确保 CUDA 设备可用
- 测试需要 CUDA 环境
### 问题 3: 测试失败但不确定原因
**检查清单**:
1. ✅ 确认代码已编译(特别是 CUDA 支持)
2. ✅ 确认 CUDA 设备可用
3. ✅ 检查 `src/infinicore/nn/embedding.cc` 是否还有 `to(cpu_device)`
4. ✅ 检查是否有 `src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu`
---
## 💡 快速验证脚本
创建一个简单的验证脚本:
```
bash
#!/bin/bash
# quick_check.sh
cd /home/zhuyue/codes/InfiniCore
echo "=== 1. 代码检查 ==="
if grep -q "to(cpu_device)" src/infinicore/nn/embedding.cc; then
echo "❌ 改动前:发现同步操作 to(cpu_device)"
else
echo "✅ 改动后:无同步操作"
fi
echo ""
echo "=== 2. 设备端实现检查 ==="
if [ -f "src/infiniop/ops/embedding/nvidia/embedding_nvidia.cu" ]; then
echo "✅ 改动后:有设备端 CUDA kernel"
else
echo "❌ 改动前:无设备端 CUDA kernel"
fi
echo ""
echo "=== 3. 运行测试 ==="
python test/infinicore/nn/test_embedding_graph_recording.py
```
使用方法:
```
bash
chmod +x quick_check.sh
./quick_check.sh
```
---
## 📋 总结
### 改动前特征
| 检查项 | 结果 |
|--------|------|
| 代码中有 `to(cpu_device)` | ✅ 有 |
| 有设备端 CUDA kernel | ❌ 无 |
| 图录制测试 | ❌ 失败 |
| 设备端输入 | ❌ 失败 |
### 改动后特征
| 检查项 | 结果 |
|--------|------|
| 代码中有 `to(cpu_device)` | ❌ 无 |
| 有设备端 CUDA kernel | ✅ 有 |
| 图录制测试 | ✅ 成功 |
| 设备端输入 | ✅ 成功 |
### 最简单的判断方法
**运行测试脚本**,查看 "CUDA Graph 录制" 测试结果:
- ✅ **通过** → 支持图录制(改动后)
- ❌ **失败** → 不支持图录制(改动前)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment