Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
hg-misc-tools
Commits
63d618ba
Commit
63d618ba
authored
May 18, 2026
by
one
Browse files
Add FastPT-C host-side overhead MRE
parent
ff6a4830
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
654 additions
and
0 deletions
+654
-0
projects/fastpt-overhead/CMakeLists.txt
projects/fastpt-overhead/CMakeLists.txt
+64
-0
projects/fastpt-overhead/README.md
projects/fastpt-overhead/README.md
+86
-0
projects/fastpt-overhead/scripts/bench_guard.py
projects/fastpt-overhead/scripts/bench_guard.py
+81
-0
projects/fastpt-overhead/scripts/build.sh
projects/fastpt-overhead/scripts/build.sh
+33
-0
projects/fastpt-overhead/scripts/compare.py
projects/fastpt-overhead/scripts/compare.py
+69
-0
projects/fastpt-overhead/scripts/run_compare.sh
projects/fastpt-overhead/scripts/run_compare.sh
+13
-0
projects/fastpt-overhead/scripts/run_one.sh
projects/fastpt-overhead/scripts/run_one.sh
+48
-0
projects/fastpt-overhead/scripts/run_with_probe.sh
projects/fastpt-overhead/scripts/run_with_probe.sh
+26
-0
projects/fastpt-overhead/src/device_query.cpp
projects/fastpt-overhead/src/device_query.cpp
+77
-0
projects/fastpt-overhead/src/guard_ext.cpp
projects/fastpt-overhead/src/guard_ext.cpp
+26
-0
projects/fastpt-overhead/src/runtime_probe.cpp
projects/fastpt-overhead/src/runtime_probe.cpp
+131
-0
No files found.
projects/fastpt-overhead/CMakeLists.txt
0 → 100644
View file @
63d618ba
# Build definition for the FastPT-C host-side overhead MRE.
#
# Produces two artifacts for the selected BACKEND ("hip" or "cuda"):
#   * bin/device_query  - standalone get-device micro-benchmark
#   * lib/libguard_ext  - PyTorch extension exposing the guard_loop operator
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(FastPTCOverheadMRE LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(BACKEND "hip" CACHE STRING "Backend: hip or cuda")

# Ask the active Python interpreter where PyTorch ships its CMake package.
execute_process(
  COMMAND python3 -c "import torch; print(torch.utils.cmake_prefix_path)"
  RESULT_VARIABLE torch_query_status
  OUTPUT_VARIABLE TORCH_CMAKE_PREFIX_PATH
  OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(torch_query_status EQUAL 0 AND NOT TORCH_CMAKE_PREFIX_PATH STREQUAL "")
  list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PREFIX_PATH}")
else()
  # Not fatal: the caller may already have supplied CMAKE_PREFIX_PATH/Torch_DIR,
  # in which case find_package below still succeeds.
  message(WARNING
    "Could not query torch.utils.cmake_prefix_path via python3 "
    "(status: ${torch_query_status}); relying on the existing CMAKE_PREFIX_PATH")
endif()

find_package(Torch REQUIRED)

# Drop one flag from Torch's exported flags before merging them into ours.
string(REPLACE "-Wno-duplicate-decl-specifier" "" TORCH_CXX_FLAGS "${TORCH_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

# Silence -Wunused-result for every language a target may be compiled as.
function(disable_noisy_warnings target_name)
  target_compile_options(${target_name} PRIVATE
    $<$<COMPILE_LANGUAGE:CXX>:-Wno-unused-result>
    $<$<COMPILE_LANGUAGE:HIP>:-Wno-unused-result>
    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-unused-result>
  )
endfunction()

# Backend-specific toolchain setup. Each branch only records what differs;
# the targets themselves are declared once below.
if(BACKEND STREQUAL "hip")
  enable_language(HIP)
  set(device_language HIP)
  set(backend_definition BACKEND_HIP=1)
elseif(BACKEND STREQUAL "cuda")
  enable_language(CUDA)
  set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA "-I")
  set(CMAKE_CUDA_STANDARD 17)
  set(CMAKE_CUDA_STANDARD_REQUIRED ON)
  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES "60;70;80;90")
  endif()
  set(device_language CUDA)
  set(backend_definition BACKEND_CUDA=1)
else()
  message(FATAL_ERROR "BACKEND must be hip or cuda")
endif()

# device_query.cpp is compiled with the device toolchain; guard_ext.cpp stays CXX.
set_source_files_properties(src/device_query.cpp PROPERTIES LANGUAGE ${device_language})
add_executable(device_query src/device_query.cpp)
add_library(guard_ext SHARED src/guard_ext.cpp)
target_compile_definitions(device_query PRIVATE ${backend_definition})
target_compile_definitions(guard_ext PRIVATE ${backend_definition})
target_compile_options(device_query PRIVATE $<$<COMPILE_LANGUAGE:${device_language}>:-O3>)

target_link_libraries(guard_ext PRIVATE ${TORCH_LIBRARIES})

disable_noisy_warnings(device_query)
disable_noisy_warnings(guard_ext)

# The run scripts look for the artifacts under <build>/bin and <build>/lib.
set_target_properties(device_query PROPERTIES
  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
set_target_properties(guard_ext PROPERTIES
  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
projects/fastpt-overhead/README.md
0 → 100644
View file @
63d618ba
# FastPT-C Host-side Overhead MRE
使用 FastPT 不转码模式适配 MatPL 的过程中发现了一定的性能损失,因此单独建立该项目来观察 FastPT 不转码模式 CUDA 兼容路径带来的 host 侧开销。它不依赖 MatPL 训练流程,只保留两个实验:
1. `device_query`:对比原生 HIP 路径的 `hipGetDevice` 和 FastPT-C 路径的 `cudaGetDevice`。
2. `guard_loop`:通过 PyTorch C++ extension 对比 `c10::hip::HIPGuard` 和 `c10::cuda::CUDAGuard` 的循环调用开销。
第二个实验更接近 MatPL 的性能现象:当 CUDA ABI 的 PyTorch C++ extension 通过 FastPT-C 运行时,频繁使用 c10 device guard/device query 可能产生额外的 host 侧开销。
## 目录结构
```text
.
├── CMakeLists.txt
├── README.md
├── scripts
│   ├── bench_guard.py
│   ├── build.sh
│   ├── compare.py
│   ├── run_compare.sh
│   ├── run_one.sh
│   └── run_with_probe.sh
└── src
    ├── device_query.cpp
    ├── guard_ext.cpp
    └── runtime_probe.cpp
```
## 运行方式
在 DTK/FastPT 容器中执行:
```bash
cd /workspace/tools/fastpt_c_overhead_mre
bash scripts/run_compare.sh
```
脚本会构建并运行两种模式:
- `hip`:原生 DTK/HIP/PyTorch HIP 路径。
- `fastpt-C`:FastPT-C CUDA 兼容路径。
结果会写入:
```text
results/hip/
results/fastpt-C/
results/compare.csv
```
常用参数可以通过环境变量调整:
```bash
DEVICE=0 \
DEVICE_QUERY_LOOPS=1000000 \
DEVICE_QUERY_ROUNDS=7 \
GUARD_STEPS=10000 \
GUARD_WARMUP=1000 \
GUARD_ROUNDS=5 \
GUARD_INNER_LOOPS=0,1,2,4,8,16,32,64 \
bash scripts/run_compare.sh
```
## 结果解读
- 如果 `device_query` 在 `fastpt-C` 下明显更慢,说明 FastPT-C 的 CUDA runtime 兼容调用本身有额外开销。
- 如果 `guard_loop` 的差异随着 `inner_loops` 增大而扩大,说明 c10 CUDA guard/device query 路径已经足以复现主机侧开销。
这个复现的目标是帮助定位 FastPT-C 兼容层的主机侧开销来源;它不是 MatPL 训练性能测试,也不包含历史调查过程中使用过的所有实验分支。
## 可选 probe
如果需要进一步确认 `guard_loop` 中触发了多少 CUDA runtime 调用,可以使用 `LD_PRELOAD` probe:

```bash
cd /workspace/tools/fastpt_c_overhead_mre
bash scripts/run_with_probe.sh fastpt-C 0
```
结果写入:
```text
results-probe/fastpt-C/runtime_probe.csv
```
该 probe 统计 `cudaGetDevice`、`cudaSetDevice`、`hipGetDevice`、`hipSetDevice` 的调用次数、总耗时和平均耗时;默认 `run_compare.sh` 不会使用它。
projects/fastpt-overhead/scripts/bench_guard.py
0 → 100755
View file @
63d618ba
#!/usr/bin/env python3
from
__future__
import
annotations
import
argparse
import
csv
import
statistics
import
sys
import
time
import
torch
def parse_int_list(value: str) -> list[int]:
    """Parse a comma-separated list of loop counts, e.g. ``"0,1,2,4"``.

    Blank items (``"1,,2"`` or a trailing comma) are skipped and surrounding
    whitespace is tolerated.

    Raises:
        ValueError: if an item is not an integer or is negative. A negative
            count would make the per-guard figure meaningless, so it is
            rejected here; argparse reports the ValueError as a usage error.
    """
    counts = [int(item) for item in value.split(",") if item.strip()]
    negatives = [count for count in counts if count < 0]
    if negatives:
        raise ValueError(f"loop counts must be non-negative, got {negatives}")
    return counts
def sync() -> None:
    """Block until work queued on the current device has finished.

    Used around the timed region so that the wall-clock window is bounded
    by a synchronized device on both sides.
    """
    torch.cuda.synchronize()
def main() -> None:
    """Benchmark the ``guard_loop`` extension op and print one CSV row per inner-loop count.

    For every value in ``--inner-loops`` the op is called ``--warmup`` times
    untimed, then ``--rounds`` timed rounds of ``--steps`` calls each are
    recorded. Each round contributes one sample: microseconds per step.
    The CSV goes to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--lib", required=True)
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument(
        "--inner-loops",
        type=parse_int_list,
        default=parse_int_list("0,1,2,4,8,16,32,64"),
    )
    parser.add_argument("--steps", type=int, default=10000)
    parser.add_argument("--warmup", type=int, default=1000)
    parser.add_argument("--rounds", type=int, default=5)
    args = parser.parse_args()

    # steps is used as a divisor and rounds feeds statistics.median/mean;
    # reject values that would otherwise surface as a traceback mid-run.
    if args.steps <= 0:
        parser.error("--steps must be a positive integer")
    if args.rounds <= 0:
        parser.error("--rounds must be a positive integer")

    torch.ops.load_library(args.lib)
    torch.cuda.set_device(args.device)
    # Allocated after set_device so "cuda" resolves to the selected device.
    tensor = torch.empty(1024, device="cuda")
    op = torch.ops.fastpt_c_overhead_mre.guard_loop

    writer = csv.writer(sys.stdout)
    writer.writerow(
        [
            "section",
            "inner_loops",
            "steps",
            "warmup",
            "rounds",
            "median_step_us",
            "mean_step_us",
            "median_per_guard_us",
        ]
    )

    for inner_loops in args.inner_loops:
        for _ in range(args.warmup):
            op(tensor, inner_loops)
        sync()

        values = []
        for _ in range(args.rounds):
            sync()
            start = time.perf_counter_ns()
            for _ in range(args.steps):
                op(tensor, inner_loops)
            sync()
            stop = time.perf_counter_ns()
            # ns for the whole round -> us per step
            values.append((stop - start) / args.steps / 1000.0)

        median_step = statistics.median(values)
        writer.writerow(
            [
                "guard_loop",
                inner_loops,
                args.steps,
                args.warmup,
                args.rounds,
                f"{median_step:.6f}",
                f"{statistics.mean(values):.6f}",
                # inner_loops == 0 has no guards to divide by.
                f"{median_step / inner_loops:.6f}" if inner_loops else "0.000000",
            ]
        )


if __name__ == "__main__":
    main()
projects/fastpt-overhead/scripts/build.sh
0 → 100755
View file @
63d618ba
#!/usr/bin/env bash
# Configure and build the MRE for one mode.
#
# usage: build.sh hip|fastpt-C
#   hip       -> BACKEND=hip,  output in <root>/build-hip
#   fastpt-C  -> BACKEND=cuda, output in <root>/build-fastpt-C (after sourcing fastpt -C)
# JOBS overrides the parallel build width (defaults to nproc).
set -euo pipefail

mode="${1:?usage: build.sh hip|fastpt-C}"
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
root="$(cd "${script_dir}/.." && pwd)"

# Optional environment helper that lives next to the project directory.
source_dtk="${root}/../source_dtk_library_path.sh"
if [[ -f "${source_dtk}" ]]; then
  # shellcheck disable=SC1091
  source "${source_dtk}"
fi

case "${mode}" in
  fastpt-C)
    # The fastpt script is sourced with nounset relaxed, then nounset is restored.
    set +u
    # shellcheck disable=SC1091
    source /usr/local/bin/fastpt -C > /dev/null
    set -u
    backend="cuda"
    ;;
  hip)
    backend="hip"
    ;;
  *)
    echo "usage: build.sh hip|fastpt-C" >&2
    exit 2
    ;;
esac

build_dir="${root}/build-${mode}"

cmake -S "${root}" -B "${build_dir}" \
  -DBACKEND="${backend}" \
  -DCMAKE_BUILD_TYPE=Release
cmake --build "${build_dir}" -j "${JOBS:-$(nproc)}"

echo "BUILD_DONE,${mode},${build_dir}"
projects/fastpt-overhead/scripts/compare.py
0 → 100755
View file @
63d618ba
#!/usr/bin/env python3
from
__future__
import
annotations
import
csv
import
sys
from
pathlib
import
Path
def read_one_row(path: Path) -> dict[str, str]:
    """Read a CSV file that must hold a header plus exactly one data row.

    Args:
        path: CSV file to read.

    Returns:
        The single data row as a ``column -> value`` mapping (all strings).

    Raises:
        RuntimeError: if the file has zero or more than one data row.
    """
    # newline="" is what the csv module asks for when handed a file object.
    with path.open(newline="") as handle:
        rows = list(csv.DictReader(handle))
    if len(rows) != 1:
        raise RuntimeError(f"expected one data row in {path}")
    return rows[0]
def read_guard(path: Path) -> dict[int, dict[str, str]]:
    """Load a guard_loop CSV and index its rows by the ``inner_loops`` column.

    Args:
        path: CSV file with an ``inner_loops`` column.

    Returns:
        Mapping from the integer ``inner_loops`` value to the full row. If a
        value appears more than once, the last row wins.
    """
    with path.open(newline="") as handle:
        return {int(row["inner_loops"]): row for row in csv.DictReader(handle)}
def emit_row(writer: csv.writer, section: str, key: str, hip_us: float, fastpt_us: float) -> None:
    """Write one comparison row: both timings, their difference, and relative figures.

    Args:
        writer: csv writer receiving the row.
        section: benchmark name (first column).
        key: case label within the section (second column).
        hip_us: baseline time in microseconds.
        fastpt_us: FastPT-C time in microseconds.

    The percentage and ratio columns are written as ``"nan"`` when the
    baseline is zero, since there is nothing to divide by.
    """
    delta = fastpt_us - hip_us
    writer.writerow(
        [
            section,
            key,
            f"{hip_us:.9f}",
            f"{fastpt_us:.9f}",
            f"{delta:.9f}",
            f"{delta / hip_us * 100.0:.6f}" if hip_us else "nan",
            f"{fastpt_us / hip_us:.6f}" if hip_us else "nan",
        ]
    )
def main() -> int:
    """Join the hip and fastpt-C result CSVs and print a comparison CSV to stdout.

    The results root is ``sys.argv[1]`` when given, otherwise ``results``.
    It must contain ``hip/`` and ``fastpt-C/`` subdirectories, each with
    ``device_query.csv`` and ``guard_loop.csv``.

    Returns:
        Process exit status (always 0; read errors propagate as exceptions).
    """
    root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("results")
    hip = root / "hip"
    fastpt = root / "fastpt-C"

    writer = csv.writer(sys.stdout)
    writer.writerow(["section", "case", "hip_us", "fastpt_c_us", "delta_us", "delta_pct", "ratio"])

    hip_query = read_one_row(hip / "device_query.csv")
    fastpt_query = read_one_row(fastpt / "device_query.csv")
    emit_row(
        writer,
        "device_query",
        f"{hip_query['api']} vs {fastpt_query['api']}",
        float(hip_query["median_us"]),
        float(fastpt_query["median_us"]),
    )

    hip_guard = read_guard(hip / "guard_loop.csv")
    fastpt_guard = read_guard(fastpt / "guard_loop.csv")

    # Only cases measured in both modes can be compared. Say so on stderr
    # when something is dropped, so a partial run is not mistaken for a full one.
    unmatched = sorted(set(hip_guard) ^ set(fastpt_guard))
    if unmatched:
        print(
            f"warning: inner_loops present in only one mode, skipped: {unmatched}",
            file=sys.stderr,
        )

    for inner_loops in sorted(set(hip_guard) & set(fastpt_guard)):
        emit_row(
            writer,
            "guard_loop",
            str(inner_loops),
            float(hip_guard[inner_loops]["median_step_us"]),
            float(fastpt_guard[inner_loops]["median_step_us"]),
        )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
projects/fastpt-overhead/scripts/run_compare.sh
0 → 100755
View file @
63d618ba
#!/usr/bin/env bash
# Run the benchmark in both modes, then write and print the comparison CSV.
#
# Environment: OUT_ROOT (default <root>/results), DEVICE (default 0).
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
root="$(cd "${script_dir}/.." && pwd)"
out_root="${OUT_ROOT:-${root}/results}"
device="${DEVICE:-0}"

# Baseline first, then the FastPT-C path.
for mode in hip fastpt-C; do
  bash "${script_dir}/run_one.sh" "${mode}" "${device}"
done

python3 "${script_dir}/compare.py" "${out_root}" > "${out_root}/compare.csv"
cat "${out_root}/compare.csv"
projects/fastpt-overhead/scripts/run_one.sh
0 → 100755
View file @
63d618ba
#!/usr/bin/env bash
# Run both experiments for a single mode and store their CSV output.
#
# usage: run_one.sh hip|fastpt-C [device]
# Builds first when the expected artifacts are missing from <root>/build-<mode>.
# Results land in ${OUT_ROOT:-<root>/results}/<mode>/.
set -euo pipefail

mode="${1:?usage: run_one.sh hip|fastpt-C [device]}"
device="${2:-${DEVICE:-0}}"
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
root="$(cd "${script_dir}/.." && pwd)"
out_dir="${OUT_ROOT:-${root}/results}/${mode}"
build_dir="${root}/build-${mode}"

# Optional environment helper that lives next to the project directory.
source_dtk="${root}/../source_dtk_library_path.sh"
if [[ -f "${source_dtk}" ]]; then
  # shellcheck disable=SC1091
  source "${source_dtk}"
fi

case "${mode}" in
  fastpt-C)
    # The fastpt script is sourced with nounset relaxed, then nounset is restored.
    set +u
    # shellcheck disable=SC1091
    source /usr/local/bin/fastpt -C > /dev/null
    set -u
    ;;
  hip)
    ;;
  *)
    echo "usage: run_one.sh hip|fastpt-C [device]" >&2
    exit 2
    ;;
esac

# Build on demand: either artifact missing triggers a full build for this mode.
if [[ ! -x "${build_dir}/bin/device_query" || ! -f "${build_dir}/lib/libguard_ext.so" ]]; then
  bash "${script_dir}/build.sh" "${mode}"
fi

mkdir -p "${out_dir}"

# Experiment 1: raw get-device call cost. Positional args: device loops rounds warmup.
device_query_args=(
  "${device}"
  "${DEVICE_QUERY_LOOPS:-1000000}"
  "${DEVICE_QUERY_ROUNDS:-7}"
  "${DEVICE_QUERY_WARMUP:-10000}"
)
"${build_dir}/bin/device_query" "${device_query_args[@]}" \
  > "${out_dir}/device_query.csv"

# Experiment 2: device-guard construction cost through the PyTorch extension.
guard_args=(
  --device "${device}"
  --lib "${build_dir}/lib/libguard_ext.so"
  --inner-loops "${GUARD_INNER_LOOPS:-0,1,2,4,8,16,32,64}"
  --steps "${GUARD_STEPS:-10000}"
  --warmup "${GUARD_WARMUP:-1000}"
  --rounds "${GUARD_ROUNDS:-5}"
)
python3 "${script_dir}/bench_guard.py" "${guard_args[@]}" \
  > "${out_dir}/guard_loop.csv"

echo "RUN_DONE,${mode},${out_dir}"
projects/fastpt-overhead/scripts/run_with_probe.sh
0 → 100755
View file @
63d618ba
#!/usr/bin/env bash
# Build the LD_PRELOAD runtime probe and run one mode with it injected.
#
# usage: run_with_probe.sh [mode] [device]   (mode defaults to fastpt-C)
# Environment: OUT_ROOT (default <root>/results-probe),
#              FASTPT_MRE_PROBE_LOG (default <out_root>/<mode>/runtime_probe.csv).
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
root="$(cd "${script_dir}/.." && pwd)"
mode="${1:-fastpt-C}"
device="${2:-${DEVICE:-0}}"
out_root="${OUT_ROOT:-${root}/results-probe}"
log="${FASTPT_MRE_PROBE_LOG:-${out_root}/${mode}/runtime_probe.csv}"
probe_lib="${root}/build-probe/libruntime_probe.so"

# The probe is a single translation unit, compiled directly without CMake.
mkdir -p "${root}/build-probe"
c++ -std=c++17 -O2 -fPIC -shared \
  "${root}/src/runtime_probe.cpp" \
  -ldl \
  -o "${probe_lib}"

# Start the log with a fresh header; the probe appends its rows at exit.
mkdir -p "$(dirname "${log}")"
echo "pid,api,calls,total_ns,avg_ns" > "${log}"

# Prepend the probe to any LD_PRELOAD that is already set.
probe_preload="${probe_lib}${LD_PRELOAD:+:${LD_PRELOAD}}"

FASTPT_MRE_PROBE_LOG="${log}" \
OUT_ROOT="${out_root}" \
LD_PRELOAD="${probe_preload}" \
  bash "${script_dir}/run_one.sh" "${mode}" "${device}"

echo "PROBE_LOG,${log}"
projects/fastpt-overhead/src/device_query.cpp
0 → 100644
View file @
63d618ba
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <vector>
#if defined(BACKEND_CUDA)
#include <cuda_runtime_api.h>
#elif defined(BACKEND_HIP)
#include <hip/hip_runtime_api.h>
#else
#error "BACKEND_CUDA or BACKEND_HIP must be defined"
#endif
namespace
{
// Thin backend shims: each returns the runtime's status code as a plain int
// so the benchmark below is written once for both backends.
#if defined(BACKEND_CUDA)

// Name of the API being timed, as reported in the CSV "api" column.
const char* api_name() { return "cudaGetDevice"; }

// Selects `device` as the current device; returns the runtime status as int.
int set_device(int device) {
  const auto status = cudaSetDevice(device);
  return static_cast<int>(status);
}

// Writes the current device into *device; returns the runtime status as int.
int get_device(int* device) {
  const auto status = cudaGetDevice(device);
  return static_cast<int>(status);
}

#else

// Name of the API being timed, as reported in the CSV "api" column.
const char* api_name() { return "hipGetDevice"; }

// Selects `device` as the current device; returns the runtime status as int.
int set_device(int device) {
  const auto status = hipSetDevice(device);
  return static_cast<int>(status);
}

// Writes the current device into *device; returns the runtime status as int.
int get_device(int* device) {
  const auto status = hipGetDevice(device);
  return static_cast<int>(status);
}

#endif
// Returns the median of `values` (taken by value so the caller's order is kept).
//
// For an even number of samples the two middle values are averaged, which
// matches Python's statistics.median used by bench_guard.py. An empty input
// returns 0.0 instead of indexing out of bounds.
double median(std::vector<double> values) {
  if (values.empty()) {
    return 0.0;
  }
  std::sort(values.begin(), values.end());
  const auto mid = values.size() / 2;
  if (values.size() % 2 == 1) {
    return values[mid];
  }
  return (values[mid - 1] + values[mid]) / 2.0;
}
// Arithmetic mean of `values`: the sum accumulated front to back, divided by
// the element count.
double mean(const std::vector<double>& values) {
  double sum = 0.0;
  for (auto it = values.begin(); it != values.end(); ++it) {
    sum += *it;
  }
  const auto count = static_cast<double>(values.size());
  return sum / count;
}
}
// namespace
// Micro-benchmark for the backend's get-device call.
//
// usage: device_query [device] [loops] [rounds] [warmup]
//   device  device index to select before timing      (default 0)
//   loops   get_device calls per timed round, > 0     (default 1000000)
//   rounds  number of timed rounds, > 0               (default 7)
//   warmup  untimed get_device calls before timing    (default 10000)
//
// Prints a CSV header and one row to stdout; each sample is the average
// microseconds per call within one round.
// Exit status: 0 on success, 1 if the device cannot be selected, 2 on bad arguments.
int main(int argc, char** argv) {
  const int device = argc > 1 ? std::atoi(argv[1]) : 0;
  const int loops = argc > 2 ? std::atoi(argv[2]) : 1000000;
  const int rounds = argc > 3 ? std::atoi(argv[3]) : 7;
  const int warmup = argc > 4 ? std::atoi(argv[4]) : 10000;

  // loops is a divisor and rounds decides whether `samples` is non-empty;
  // zero, negative or non-numeric input (atoi yields 0) cannot produce a row.
  if (loops <= 0 || rounds <= 0) {
    std::fprintf(stderr, "usage: device_query [device] [loops>0] [rounds>0] [warmup]\n");
    return 2;
  }

  // volatile accumulator keeps the timed calls from being optimised away.
  volatile int sink = 0;

  // Both runtimes use 0 for success; timing get_device after a failed
  // set_device would not measure the intended device.
  const int set_status = set_device(device);
  if (set_status != 0) {
    std::fprintf(stderr, "set_device(%d) failed with status %d\n", device, set_status);
    return 1;
  }
  sink += set_status;

  int current = 0;
  for (int i = 0; i < warmup; ++i) {
    sink += get_device(&current);
    sink += current;
  }

  std::vector<double> samples;
  samples.reserve(static_cast<std::vector<double>::size_type>(rounds));
  for (int round = 0; round < rounds; ++round) {
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < loops; ++i) {
      sink += get_device(&current);
      sink += current;
    }
    auto stop = std::chrono::steady_clock::now();
    double total_us = std::chrono::duration<double, std::micro>(stop - start).count();
    samples.push_back(total_us / static_cast<double>(loops));
  }

  auto minmax = std::minmax_element(samples.begin(), samples.end());
  std::printf("section,api,loops,warmup,rounds,median_us,mean_us,min_us,max_us,sink\n");
  std::printf("device_query,%s,%d,%d,%d,%.9f,%.9f,%.9f,%.9f,%d\n",
              api_name(),
              loops,
              warmup,
              rounds,
              median(samples),
              mean(samples),
              *minmax.first,
              *minmax.second,
              static_cast<int>(sink));
  return 0;
}
projects/fastpt-overhead/src/guard_ext.cpp
0 → 100644
View file @
63d618ba
#include <ATen/ATen.h>
#include <torch/library.h>
#if defined(BACKEND_CUDA)
#include <c10/cuda/CUDAGuard.h>
#elif defined(BACKEND_HIP)
#include <c10/hip/HIPGuard.h>
#else
#error "BACKEND_CUDA or BACKEND_HIP must be defined"
#endif
// Constructs and destroys a device guard for the tensor's device `loops`
// times, then hands the tensor back unchanged. The guard type follows the
// backend the extension was compiled for.
at::Tensor guard_loop(at::Tensor tensor, int64_t loops) {
#if defined(BACKEND_CUDA)
  using Guard = c10::cuda::CUDAGuard;
#else
  using Guard = c10::hip::HIPGuard;
#endif

  const c10::DeviceIndex device =
      static_cast<c10::DeviceIndex>(tensor.device().index());

  // One guard per iteration; it goes out of scope at the end of the body.
  for (int64_t remaining = loops; remaining > 0; --remaining) {
    Guard guard(device);
  }
  return tensor;
}
// Registers guard_loop in the `fastpt_c_overhead_mre` operator namespace
// under the schema string below. bench_guard.py loads this shared library
// and calls the op as torch.ops.fastpt_c_overhead_mre.guard_loop.
TORCH_LIBRARY
(
fastpt_c_overhead_mre
,
m
)
{
m
.
def
(
"guard_loop(Tensor tensor, int loops) -> Tensor"
,
guard_loop
);
}
projects/fastpt-overhead/src/runtime_probe.cpp
0 → 100644
View file @
63d618ba
#include <atomic>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <dlfcn.h>
#include <unistd.h>
namespace
{
// Function-pointer shapes used to call the real runtime entry points.
// Status codes are carried as plain int.
using cuda_get_device_fn = int (*)(int*);
using cuda_set_device_fn = int (*)(int);
using hip_get_device_fn = int (*)(int*);
using hip_set_device_fn = int (*)(int);

// Per-API accumulator: number of intercepted calls and total nanoseconds
// spent inside the real function.
struct Stats {
  using Counter = std::atomic<unsigned long long>;

  Counter calls{0};
  Counter ns{0};
};

// One accumulator per intercepted entry point.
Stats cuda_get_device_stats;
Stats cuda_set_device_stats;
Stats hip_get_device_stats;
Stats hip_set_device_stats;
// Finds the real implementation of `name`.
//
// Lookup order:
//   1. dlsym(RTLD_NEXT, name) - the next definition after this object.
//   2. each library in `libs` (a nullptr-terminated array of names), opened
//      with dlopen and searched with dlsym.
// A handle whose lookup succeeds is left open on purpose: the returned
// address must stay valid for the life of the process.
//
// Never returns nullptr: if every lookup fails, a diagnostic line is written
// to stderr and the process aborts.
void* resolve_symbol(const char* name, const char* const* libs) {
  if (void* symbol = dlsym(RTLD_NEXT, name)) {
    return symbol;
  }

  for (const char* const* lib = libs; *lib; ++lib) {
    void* handle = dlopen(*lib, RTLD_LAZY | RTLD_LOCAL);
    if (!handle) {
      continue;
    }
    if (void* symbol = dlsym(handle, name)) {
      return symbol;
    }
  }

  // dlerror() returns nullptr when no error is pending; handing that to %s
  // is undefined behaviour, so substitute a placeholder.
  const char* reason = dlerror();
  std::fprintf(stderr, "probe_missing_symbol,%s,%s\n", name, reason ? reason : "unknown");
  std::abort();
}
// Resolves the CUDA runtime function `name` and returns it typed as Fn.
// "libcudart.so" is the fallback library handed to resolve_symbol.
template <typename Fn>
Fn cuda_symbol(const char* name) {
  static constexpr const char* kFallbackLibs[] = {"libcudart.so", nullptr};
  void* const address = resolve_symbol(name, kFallbackLibs);
  return reinterpret_cast<Fn>(address);
}
// Resolves the HIP runtime function `name` and returns it typed as Fn.
// "libamdhip64.so" is the fallback library handed to resolve_symbol.
template <typename Fn>
Fn hip_symbol(const char* name) {
  static constexpr const char* kFallbackLibs[] = {"libamdhip64.so", nullptr};
  void* const address = resolve_symbol(name, kFallbackLibs);
  return reinterpret_cast<Fn>(address);
}
// Invokes call(fn), times it with steady_clock, and folds the result into
// `stats` (one more call, elapsed nanoseconds added). Returns whatever the
// wrapped call returned.
template <typename Fn, typename Call>
int measure(Stats& stats, Fn fn, Call call) {
  using clock = std::chrono::steady_clock;

  const clock::time_point begin = clock::now();
  const int result = call(fn);
  const clock::duration elapsed = clock::now() - begin;

  const auto elapsed_ns =
      std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();

  // Relaxed ordering: these are independent counters.
  stats.calls.fetch_add(1, std::memory_order_relaxed);
  stats.ns.fetch_add(static_cast<unsigned long long>(elapsed_ns),
                     std::memory_order_relaxed);
  return result;
}
// Writes one CSV row "pid,api,calls,total_ns,avg_ns" for `stats` to `out`.
// Nothing is written for an API that was never called.
void print_one(FILE* out, const char* name, const Stats& stats) {
  const unsigned long long calls = stats.calls.load(std::memory_order_relaxed);
  if (calls == 0) {
    return;
  }

  const unsigned long long ns = stats.ns.load(std::memory_order_relaxed);
  const double avg_ns = static_cast<double>(ns) / calls;

  std::fprintf(out,
               "%d,%s,%llu,%llu,%.3f\n",
               static_cast<int>(getpid()),
               name,
               calls,
               ns,
               avg_ns);
}
void
print_summary
()
{
const
char
*
path
=
std
::
getenv
(
"FASTPT_MRE_PROBE_LOG"
);
FILE
*
out
=
path
?
std
::
fopen
(
path
,
"a"
)
:
stderr
;
if
(
!
out
)
{
out
=
stderr
;
}
print_one
(
out
,
"cudaGetDevice"
,
cuda_get_device_stats
);
print_one
(
out
,
"cudaSetDevice"
,
cuda_set_device_stats
);
print_one
(
out
,
"hipGetDevice"
,
hip_get_device_stats
);
print_one
(
out
,
"hipSetDevice"
,
hip_set_device_stats
);
if
(
out
!=
stderr
)
{
std
::
fclose
(
out
);
}
}
// Registration hook: constructing the namespace-scope instance `at_exit`
// passes print_summary to std::atexit, so the summary is written when the
// process exits normally.
struct
AtExit
{
AtExit
()
{
std
::
atexit
(
print_summary
);
}
}
at_exit
;
}
// namespace
extern
"C"
int
cudaGetDevice
(
int
*
device
)
{
static
auto
real
=
cuda_symbol
<
cuda_get_device_fn
>
(
"cudaGetDevice"
);
return
measure
(
cuda_get_device_stats
,
real
,
[
device
](
auto
fn
)
{
return
fn
(
device
);
});
}
extern
"C"
int
cudaSetDevice
(
int
device
)
{
static
auto
real
=
cuda_symbol
<
cuda_set_device_fn
>
(
"cudaSetDevice"
);
return
measure
(
cuda_set_device_stats
,
real
,
[
device
](
auto
fn
)
{
return
fn
(
device
);
});
}
extern
"C"
int
hipGetDevice
(
int
*
device
)
{
static
auto
real
=
hip_symbol
<
hip_get_device_fn
>
(
"hipGetDevice"
);
return
measure
(
hip_get_device_stats
,
real
,
[
device
](
auto
fn
)
{
return
fn
(
device
);
});
}
extern
"C"
int
hipSetDevice
(
int
device
)
{
static
auto
real
=
hip_symbol
<
hip_set_device_fn
>
(
"hipSetDevice"
);
return
measure
(
hip_set_device_stats
,
real
,
[
device
](
auto
fn
)
{
return
fn
(
device
);
});
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment