"vscode:/vscode.git/clone" did not exist on "0b306e512ef9485a78828288f101e64fdca821ac"
Commit 63d618ba authored by one's avatar one
Browse files

Add FastPT-C host-side overhead MRE

parent ff6a4830
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(FastPTCOverheadMRE LANGUAGES CXX)

# Project-wide C++ defaults for every target declared in this file.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# BACKEND selects which runtime the targets are built against ("hip" or "cuda").
set(BACKEND "hip" CACHE STRING "Backend: hip or cuda")
set_property(CACHE BACKEND PROPERTY STRINGS hip cuda)

# Ask the python3 on PATH where its torch installation keeps the CMake package
# files. The exit status is recorded so that a missing interpreter or a missing
# torch module is reported here instead of only surfacing later as an
# unexplained find_package(Torch) failure.
execute_process(
  COMMAND python3 -c "import torch; print(torch.utils.cmake_prefix_path)"
  RESULT_VARIABLE torch_prefix_query_result
  OUTPUT_VARIABLE TORCH_CMAKE_PREFIX_PATH
  OUTPUT_STRIP_TRAILING_WHITESPACE
)
if("${torch_prefix_query_result}" STREQUAL "0" AND NOT "${TORCH_CMAKE_PREFIX_PATH}" STREQUAL "")
  list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PREFIX_PATH}")
else()
  # Not fatal: Torch may still be found through a user-supplied
  # CMAKE_PREFIX_PATH or Torch_DIR.
  message(WARNING
    "Could not query torch.utils.cmake_prefix_path via python3 "
    "(result: ${torch_prefix_query_result}); "
    "relying on CMAKE_PREFIX_PATH / Torch_DIR to locate Torch.")
endif()
find_package(Torch REQUIRED)

# Remove -Wno-duplicate-decl-specifier from Torch's flags before adding them to
# the global C++ flags.
# NOTE(review): presumably the C++ compiler used here rejects or warns about
# that option; confirm the reason.
string(REPLACE "-Wno-duplicate-decl-specifier" "" TORCH_CXX_FLAGS "${TORCH_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
# disable_noisy_warnings(<target>)
#
# Adds -Wno-unused-result as a PRIVATE compile option of <target>. CXX and HIP
# sources receive the flag directly; CUDA sources receive it wrapped in
# -Xcompiler=.
function(disable_noisy_warnings target_name)
  set(unused_result_flag "-Wno-unused-result")
  foreach(direct_lang IN ITEMS CXX HIP)
    target_compile_options(${target_name} PRIVATE
      $<$<COMPILE_LANGUAGE:${direct_lang}>:${unused_result_flag}>
    )
  endforeach()
  target_compile_options(${target_name} PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${unused_result_flag}>
  )
endfunction()
# Declare the two targets for the selected backend:
#   device_query - executable built from src/device_query.cpp, compiled as a
#                  HIP or CUDA source depending on BACKEND.
#   guard_ext    - shared library built from src/guard_ext.cpp (its source
#                  language is not overridden here).
# Each target gets BACKEND_HIP=1 or BACKEND_CUDA=1 so the sources can pick the
# matching runtime headers.
if(BACKEND STREQUAL "hip")
enable_language(HIP)
# Compile device_query.cpp with the HIP language toolchain.
set_source_files_properties(src/device_query.cpp PROPERTIES LANGUAGE HIP)
add_executable(device_query src/device_query.cpp)
add_library(guard_ext SHARED src/guard_ext.cpp)
target_compile_definitions(device_query PRIVATE BACKEND_HIP=1)
target_compile_definitions(guard_ext PRIVATE BACKEND_HIP=1)
target_compile_options(device_query PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-O3>)
elseif(BACKEND STREQUAL "cuda")
enable_language(CUDA)
# Pass SYSTEM include directories to the CUDA compiler with a plain -I.
set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA "-I")
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
# Fallback architecture list when the user did not provide one.
# NOTE(review): this check runs after enable_language(CUDA); CMake may already
# have initialized CMAKE_CUDA_ARCHITECTURES at that point, in which case this
# fallback is never taken - confirm which value is actually used.
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "60;70;80;90")
endif()
# Compile device_query.cpp with the CUDA language toolchain.
set_source_files_properties(src/device_query.cpp PROPERTIES LANGUAGE CUDA)
add_executable(device_query src/device_query.cpp)
add_library(guard_ext SHARED src/guard_ext.cpp)
target_compile_definitions(device_query PRIVATE BACKEND_CUDA=1)
target_compile_definitions(guard_ext PRIVATE BACKEND_CUDA=1)
target_compile_options(device_query PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3>)
else()
message(FATAL_ERROR "BACKEND must be hip or cuda")
endif()
# Only the extension library links against Torch; device_query does not.
target_link_libraries(guard_ext PRIVATE ${TORCH_LIBRARIES})
disable_noisy_warnings(device_query)
disable_noisy_warnings(guard_ext)
# Fixed output locations: <build>/bin for the executable and <build>/lib for
# the shared library.
set_target_properties(device_query PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
)
set_target_properties(guard_ext PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
)
# FastPT-C Host-side Overhead MRE
使用 FastPT 不转码模式适配 MatPL 的过程中发现了一定的性能损失,因此单独建立该项目来观察 FastPT 不转码模式 CUDA 兼容路径带来的 host 侧开销。它不依赖 MatPL 训练流程,只保留两个实验:
1. `device_query`:对比原生 HIP 路径的 `hipGetDevice` 和 FastPT-C 路径的 `cudaGetDevice`。
2. `guard_loop`:通过 PyTorch C++ extension 对比 `c10::hip::HIPGuard` 和 `c10::cuda::CUDAGuard` 的循环调用开销。
第二个实验更接近 MatPL 的性能现象:当 CUDA ABI 的 PyTorch C++ extension 通过 FastPT-C 运行时,频繁使用 c10 device guard/device query 可能产生额外的 host 侧开销。
## 目录结构
```text
.
├── CMakeLists.txt
├── README.md
├── scripts
│   ├── bench_guard.py
│   ├── build.sh
│   ├── compare.py
│   ├── run_compare.sh
│   ├── run_one.sh
│   └── run_with_probe.sh
└── src
    ├── device_query.cpp
    ├── guard_ext.cpp
    └── runtime_probe.cpp
```
## 运行方式
在 DTK/FastPT 容器中执行:
```bash
cd /workspace/tools/fastpt_c_overhead_mre
bash scripts/run_compare.sh
```
脚本会构建并运行两种模式:
- `hip`:原生 DTK/HIP/PyTorch HIP 路径。
- `fastpt-C`:FastPT-C CUDA 兼容路径。
结果会写入:
```text
results/hip/
results/fastpt-C/
results/compare.csv
```
常用参数可以通过环境变量调整:
```bash
DEVICE=0 \
DEVICE_QUERY_LOOPS=1000000 \
DEVICE_QUERY_ROUNDS=7 \
GUARD_STEPS=10000 \
GUARD_WARMUP=1000 \
GUARD_ROUNDS=5 \
GUARD_INNER_LOOPS=0,1,2,4,8,16,32,64 \
bash scripts/run_compare.sh
```
## 结果解读
- 如果 `device_query` 在 `fastpt-C` 下明显更慢,说明 FastPT-C 的 CUDA runtime 兼容调用本身有额外开销。
- 如果 `guard_loop` 的差异随着 `inner_loops` 增大而扩大,说明 c10 CUDA guard/device query 路径已经足以复现主机侧开销。
这个复现的目标是帮助定位 FastPT-C 兼容层的主机侧开销来源;它不是 MatPL 训练性能测试,也不包含历史调查过程中使用过的所有实验分支。
## 可选 probe
如果需要进一步确认 `guard_loop` 中触发了多少 CUDA runtime 调用,可以使用 `LD_PRELOAD` probe:
```bash
cd /workspace/tools/fastpt_c_overhead_mre
bash scripts/run_with_probe.sh fastpt-C 0
```
结果写入:
```text
results-probe/fastpt-C/runtime_probe.csv
```
该 probe 统计 `cudaGetDevice`、`cudaSetDevice`、`hipGetDevice`、`hipSetDevice` 的调用次数、总耗时和平均耗时;默认 `run_compare.sh` 不会使用它。
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import csv
import statistics
import sys
import time
import torch
def parse_int_list(value: str) -> list[int]:
    """Parse a comma-separated string such as ``"0,1,2"`` into a list of ints.

    Fields that are empty or contain only whitespace are skipped. Any other
    field is passed to ``int()``, which raises ``ValueError`` if it is not a
    valid integer.
    """
    numbers = []
    for field in value.split(","):
        if not field.strip():
            continue
        numbers.append(int(field))
    return numbers
def sync() -> None:
    """Call ``torch.cuda.synchronize()`` so timing brackets completed work."""
    torch.cuda.synchronize()
def main() -> None:
    """Benchmark the ``guard_loop`` custom op and print one CSV row per setting.

    Loads the extension library given by ``--lib``, then for every value in
    ``--inner-loops`` runs ``--warmup`` untimed calls followed by ``--rounds``
    timed rounds of ``--steps`` calls each. Every round yields the average
    wall-clock time per call in microseconds; the median and mean over the
    rounds are written to stdout as CSV.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--lib", required=True)
    cli.add_argument("--device", type=int, default=0)
    cli.add_argument("--inner-loops", type=parse_int_list, default=parse_int_list("0,1,2,4,8,16,32,64"))
    cli.add_argument("--steps", type=int, default=10000)
    cli.add_argument("--warmup", type=int, default=1000)
    cli.add_argument("--rounds", type=int, default=5)
    options = cli.parse_args()

    torch.ops.load_library(options.lib)
    torch.cuda.set_device(options.device)
    tensor = torch.empty(1024, device="cuda")
    guard_op = torch.ops.fastpt_c_overhead_mre.guard_loop

    out = csv.writer(sys.stdout)
    header = [
        "section",
        "inner_loops",
        "steps",
        "warmup",
        "rounds",
        "median_step_us",
        "mean_step_us",
        "median_per_guard_us",
    ]
    out.writerow(header)

    for inner_loops in options.inner_loops:
        # Untimed warm-up calls for this setting.
        for _ in range(options.warmup):
            guard_op(tensor, inner_loops)
        sync()

        per_step_us = []
        for _ in range(options.rounds):
            sync()
            began = time.perf_counter_ns()
            for _ in range(options.steps):
                guard_op(tensor, inner_loops)
            sync()
            finished = time.perf_counter_ns()
            # Nanoseconds for the whole round -> microseconds per call.
            per_step_us.append((finished - began) / options.steps / 1000.0)

        median_step = statistics.median(per_step_us)
        if inner_loops:
            per_guard = f"{median_step / inner_loops:.6f}"
        else:
            # No guard is constructed when inner_loops is 0; report zero.
            per_guard = "0.000000"
        out.writerow(
            [
                "guard_loop",
                inner_loops,
                options.steps,
                options.warmup,
                options.rounds,
                f"{median_step:.6f}",
                f"{statistics.mean(per_step_us):.6f}",
                per_guard,
            ]
        )


if __name__ == "__main__":
    main()
#!/usr/bin/env bash
# Configure and build the MRE for one mode.
#
# Usage: build.sh hip|fastpt-C
#
# "hip" builds with -DBACKEND=hip. "fastpt-C" first sources
# /usr/local/bin/fastpt -C and then builds with -DBACKEND=cuda.
# The build tree is <repo>/build-<mode>; JOBS overrides the parallel job count.
set -euo pipefail

mode="${1:?usage: build.sh hip|fastpt-C}"
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
root="$(cd "${script_dir}/.." && pwd)"

# Optional environment script located next to the repository root.
source_dtk="${root}/../source_dtk_library_path.sh"
if [[ -f "${source_dtk}" ]]; then
  # shellcheck disable=SC1091
  source "${source_dtk}"
fi

case "${mode}" in
  fastpt-C)
    # nounset is switched off only while the fastpt script is sourced.
    set +u
    # shellcheck disable=SC1091
    source /usr/local/bin/fastpt -C >/dev/null
    set -u
    backend="cuda"
    ;;
  hip)
    backend="hip"
    ;;
  *)
    echo "usage: build.sh hip|fastpt-C" >&2
    exit 2
    ;;
esac

build_dir="${root}/build-${mode}"
configure_args=(
  -S "${root}"
  -B "${build_dir}"
  -DBACKEND="${backend}"
  -DCMAKE_BUILD_TYPE=Release
)
cmake "${configure_args[@]}"
cmake --build "${build_dir}" -j"${JOBS:-$(nproc)}"
echo "BUILD_DONE,${mode},${build_dir}"
#!/usr/bin/env python3
from __future__ import annotations
import csv
import sys
from pathlib import Path
def read_one_row(path: Path) -> dict[str, str]:
    """Return the single data row of the CSV file at ``path`` as a dict.

    Raises:
        RuntimeError: if the file does not contain exactly one data row.
    """
    with path.open(newline="") as handle:
        records = [record for record in csv.DictReader(handle)]
    if len(records) == 1:
        return records[0]
    raise RuntimeError(f"expected one data row in {path}")
def read_guard(path: Path) -> dict[int, dict[str, str]]:
    """Load a guard-loop CSV and index its rows by integer ``inner_loops``.

    If several rows share the same ``inner_loops`` value, the last one wins.
    """
    by_inner_loops = {}
    with path.open(newline="") as handle:
        for record in csv.DictReader(handle):
            by_inner_loops[int(record["inner_loops"])] = record
    return by_inner_loops
def emit_row(writer: csv.writer, section: str, key: str, hip_us: float, fastpt_us: float) -> None:
    """Write one comparison row for a HIP timing and a FastPT-C timing.

    The row holds: section, key, both timings, the absolute difference
    (``fastpt_us - hip_us``), that difference as a percentage of ``hip_us``,
    and the ratio ``fastpt_us / hip_us``. When ``hip_us`` is zero the
    percentage and ratio columns are written as the literal ``nan``.
    """
    delta = fastpt_us - hip_us
    if hip_us:
        delta_pct = f"{delta / hip_us * 100.0:.6f}"
        ratio = f"{fastpt_us / hip_us:.6f}"
    else:
        delta_pct = "nan"
        ratio = "nan"
    row = [
        section,
        key,
        f"{hip_us:.9f}",
        f"{fastpt_us:.9f}",
        f"{delta:.9f}",
        delta_pct,
        ratio,
    ]
    writer.writerow(row)
def main() -> int:
    """Print a CSV comparing HIP results with FastPT-C results.

    The results root is the first command-line argument, or ``results`` when
    none is given. It must contain ``hip/`` and ``fastpt-C/`` subdirectories,
    each holding ``device_query.csv`` and ``guard_loop.csv``. Guard-loop rows
    are emitted only for ``inner_loops`` values present in both files.
    """
    if len(sys.argv) > 1:
        root = Path(sys.argv[1])
    else:
        root = Path("results")
    hip_dir = root / "hip"
    fastpt_dir = root / "fastpt-C"

    out = csv.writer(sys.stdout)
    out.writerow(["section", "case", "hip_us", "fastpt_c_us", "delta_us", "delta_pct", "ratio"])

    hip_query = read_one_row(hip_dir / "device_query.csv")
    fastpt_query = read_one_row(fastpt_dir / "device_query.csv")
    query_case = f"{hip_query['api']} vs {fastpt_query['api']}"
    emit_row(
        out,
        "device_query",
        query_case,
        float(hip_query["median_us"]),
        float(fastpt_query["median_us"]),
    )

    hip_guard = read_guard(hip_dir / "guard_loop.csv")
    fastpt_guard = read_guard(fastpt_dir / "guard_loop.csv")
    shared_settings = sorted(hip_guard.keys() & fastpt_guard.keys())
    for inner_loops in shared_settings:
        hip_row = hip_guard[inner_loops]
        fastpt_row = fastpt_guard[inner_loops]
        emit_row(
            out,
            "guard_loop",
            str(inner_loops),
            float(hip_row["median_step_us"]),
            float(fastpt_row["median_step_us"]),
        )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
#!/usr/bin/env bash
# Run the benchmarks in both modes (hip, then fastpt-C) and print the comparison.
#
# Environment:
#   OUT_ROOT  results directory (default: <repo>/results)
#   DEVICE    device index passed to run_one.sh (default: 0)
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
root="$(cd "${script_dir}/.." && pwd)"
out_root="${OUT_ROOT:-${root}/results}"
device="${DEVICE:-0}"

for mode in hip fastpt-C; do
  bash "${script_dir}/run_one.sh" "${mode}" "${device}"
done

compare_csv="${out_root}/compare.csv"
python3 "${script_dir}/compare.py" "${out_root}" > "${compare_csv}"
cat "${compare_csv}"
#!/usr/bin/env bash
# Run both benchmarks for a single mode and store their CSV output.
#
# Usage: run_one.sh hip|fastpt-C [device]
#
# Builds the mode first (via build.sh) when its artifacts are missing, then
# writes device_query.csv and guard_loop.csv into ${OUT_ROOT:-<repo>/results}/<mode>.
# Benchmark sizes come from the DEVICE_QUERY_* and GUARD_* environment variables.
set -euo pipefail

mode="${1:?usage: run_one.sh hip|fastpt-C [device]}"
device="${2:-${DEVICE:-0}}"
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
root="$(cd "${script_dir}/.." && pwd)"
out_dir="${OUT_ROOT:-${root}/results}/${mode}"
build_dir="${root}/build-${mode}"

# Optional environment script located next to the repository root.
source_dtk="${root}/../source_dtk_library_path.sh"
if [[ -f "${source_dtk}" ]]; then
  # shellcheck disable=SC1091
  source "${source_dtk}"
fi

case "${mode}" in
  fastpt-C)
    # nounset is switched off only while the fastpt script is sourced.
    set +u
    # shellcheck disable=SC1091
    source /usr/local/bin/fastpt -C >/dev/null
    set -u
    ;;
  hip)
    ;;
  *)
    echo "usage: run_one.sh hip|fastpt-C [device]" >&2
    exit 2
    ;;
esac

query_bin="${build_dir}/bin/device_query"
guard_lib="${build_dir}/lib/libguard_ext.so"
if ! [[ -x "${query_bin}" && -f "${guard_lib}" ]]; then
  bash "${script_dir}/build.sh" "${mode}"
fi

mkdir -p "${out_dir}"

"${query_bin}" \
  "${device}" \
  "${DEVICE_QUERY_LOOPS:-1000000}" \
  "${DEVICE_QUERY_ROUNDS:-7}" \
  "${DEVICE_QUERY_WARMUP:-10000}" \
  > "${out_dir}/device_query.csv"

python3 "${script_dir}/bench_guard.py" \
  --device "${device}" \
  --lib "${guard_lib}" \
  --inner-loops "${GUARD_INNER_LOOPS:-0,1,2,4,8,16,32,64}" \
  --steps "${GUARD_STEPS:-10000}" \
  --warmup "${GUARD_WARMUP:-1000}" \
  --rounds "${GUARD_ROUNDS:-5}" \
  > "${out_dir}/guard_loop.csv"

echo "RUN_DONE,${mode},${out_dir}"
#!/usr/bin/env bash
# Run one mode with the LD_PRELOAD runtime probe attached.
#
# Usage: run_with_probe.sh [mode] [device]     (mode defaults to fastpt-C)
#
# Compiles src/runtime_probe.cpp into <repo>/build-probe/libruntime_probe.so,
# writes the CSV header to the probe log, then invokes run_one.sh with the
# probe prepended to LD_PRELOAD.
set -euo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
root="$(cd "${script_dir}/.." && pwd)"
mode="${1:-fastpt-C}"
device="${2:-${DEVICE:-0}}"
out_root="${OUT_ROOT:-${root}/results-probe}"
log="${FASTPT_MRE_PROBE_LOG:-${out_root}/${mode}/runtime_probe.csv}"

probe_dir="${root}/build-probe"
probe_lib="${probe_dir}/libruntime_probe.so"
mkdir -p "${probe_dir}"
c++ -std=c++17 -O2 -fPIC -shared \
  "${root}/src/runtime_probe.cpp" \
  -ldl \
  -o "${probe_lib}"

# Start the log with its header; the probe appends data rows to this file.
mkdir -p "$(dirname "${log}")"
printf '%s\n' "pid,api,calls,total_ns,avg_ns" > "${log}"

# Keep any LD_PRELOAD entries the caller already had, after the probe.
preload="${probe_lib}${LD_PRELOAD:+:${LD_PRELOAD}}"
FASTPT_MRE_PROBE_LOG="${log}" \
  OUT_ROOT="${out_root}" \
  LD_PRELOAD="${preload}" \
  bash "${script_dir}/run_one.sh" "${mode}" "${device}"

echo "PROBE_LOG,${log}"
#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <vector>
#if defined(BACKEND_CUDA)
#include <cuda_runtime_api.h>
#elif defined(BACKEND_HIP)
#include <hip/hip_runtime_api.h>
#else
#error "BACKEND_CUDA or BACKEND_HIP must be defined"
#endif
namespace {
// Backend-neutral wrappers around the runtime's device-selection API.
// api_name() is the label printed in the CSV output; set_device()/get_device()
// forward to the HIP or CUDA call and return its status converted to int.
#if !defined(BACKEND_CUDA)
const char *api_name() { return "hipGetDevice"; }
int set_device(int device) {
  const auto status = hipSetDevice(device);
  return static_cast<int>(status);
}
int get_device(int *device) {
  const auto status = hipGetDevice(device);
  return static_cast<int>(status);
}
#else
const char *api_name() { return "cudaGetDevice"; }
int set_device(int device) {
  const auto status = cudaSetDevice(device);
  return static_cast<int>(status);
}
int get_device(int *device) {
  const auto status = cudaGetDevice(device);
  return static_cast<int>(status);
}
#endif
// Returns the median of `values`.
//
// The vector is taken by value, so the caller's data is left unsorted.
// For an odd number of samples this is the middle element; for an even number
// it is the average of the two middle elements, matching Python's
// statistics.median that bench_guard.py uses for the guard-loop benchmark.
// An empty input yields 0.0 instead of reading past the end of the vector.
double median(std::vector<double> values) {
  if (values.empty()) {
    return 0.0;
  }
  std::sort(values.begin(), values.end());
  const std::size_t mid = values.size() / 2;
  if (values.size() % 2 == 1) {
    return values[mid];
  }
  return (values[mid - 1] + values[mid]) / 2.0;
}
// Returns the arithmetic mean of `values`: the sum of all elements divided by
// the element count.
double mean(const std::vector<double> &values) {
  double sum = 0.0;
  for (std::size_t i = 0; i < values.size(); ++i) {
    sum += values[i];
  }
  const double count = static_cast<double>(values.size());
  return sum / count;
}
} // namespace
// Times the backend's "get current device" call.
//
// Usage: device_query [device] [loops] [rounds] [warmup]
//   device  device index to select first        (default 0)
//   loops   timed calls per round               (default 1000000)
//   rounds  number of timed rounds              (default 7)
//   warmup  untimed calls before the first round (default 10000)
//
// Prints a CSV header and one data row to stdout; all *_us columns are
// microseconds per call. Exits with 2 when loops or rounds is not positive
// and with 1 when the device cannot be selected, so callers never receive a
// CSV built from a division by zero or from an empty sample set.
int main(int argc, char **argv) {
  const int device = argc > 1 ? std::atoi(argv[1]) : 0;
  const int loops = argc > 2 ? std::atoi(argv[2]) : 1000000;
  const int rounds = argc > 3 ? std::atoi(argv[3]) : 7;
  const int warmup = argc > 4 ? std::atoi(argv[4]) : 10000;

  // loops is a divisor below, and the statistics need at least one sample.
  if (loops <= 0 || rounds <= 0) {
    std::fprintf(stderr,
                 "device_query: loops and rounds must be positive "
                 "(loops=%d, rounds=%d)\n",
                 loops, rounds);
    return 2;
  }

  // `sink` accumulates every status and device value so the timed calls have
  // an observable effect; it is printed in the last CSV column.
  volatile int sink = 0;

  // A zero status is success for both runtimes. Stop on failure instead of
  // benchmarking against a device that was never selected.
  const int set_status = set_device(device);
  if (set_status != 0) {
    std::fprintf(stderr,
                 "device_query: selecting device %d failed with status %d\n",
                 device, set_status);
    return 1;
  }
  sink += set_status;

  int current = 0;
  for (int i = 0; i < warmup; ++i) {
    sink += get_device(&current);
    sink += current;
  }

  std::vector<double> samples;
  samples.reserve(static_cast<std::size_t>(rounds));
  for (int round = 0; round < rounds; ++round) {
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < loops; ++i) {
      sink += get_device(&current);
      sink += current;
    }
    auto stop = std::chrono::steady_clock::now();
    double total_us =
        std::chrono::duration<double, std::micro>(stop - start).count();
    // One sample per round: average microseconds per call.
    samples.push_back(total_us / static_cast<double>(loops));
  }

  auto minmax = std::minmax_element(samples.begin(), samples.end());
  std::printf("section,api,loops,warmup,rounds,median_us,mean_us,min_us,max_us,sink\n");
  std::printf("device_query,%s,%d,%d,%d,%.9f,%.9f,%.9f,%.9f,%d\n",
              api_name(), loops, warmup, rounds, median(samples),
              mean(samples), *minmax.first, *minmax.second,
              static_cast<int>(sink));
  return 0;
}
#include <ATen/ATen.h>
#include <torch/library.h>
#if defined(BACKEND_CUDA)
#include <c10/cuda/CUDAGuard.h>
#elif defined(BACKEND_HIP)
#include <c10/hip/HIPGuard.h>
#else
#error "BACKEND_CUDA or BACKEND_HIP must be defined"
#endif
// Constructs and immediately destroys a c10 device guard `loops` times for the
// device index of `tensor`, then returns `tensor` unchanged. The guard type is
// CUDAGuard or HIPGuard depending on the backend this file was built for.
// A non-positive `loops` constructs no guard.
at::Tensor guard_loop(at::Tensor tensor, int64_t loops) {
#if defined(BACKEND_CUDA)
  using DeviceGuard = c10::cuda::CUDAGuard;
#else
  using DeviceGuard = c10::hip::HIPGuard;
#endif
  const c10::DeviceIndex device =
      static_cast<c10::DeviceIndex>(tensor.device().index());
  for (int64_t remaining = loops; remaining > 0; --remaining) {
    // The guard lives only for this iteration.
    DeviceGuard guard(device);
  }
  return tensor;
}
// Registers guard_loop under the fastpt_c_overhead_mre namespace; bench_guard.py
// reaches it as torch.ops.fastpt_c_overhead_mre.guard_loop.
TORCH_LIBRARY(fastpt_c_overhead_mre, m) {
  m.def("guard_loop(Tensor tensor, int loops) -> Tensor", &guard_loop);
}
#include <atomic>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <dlfcn.h>
#include <unistd.h>
namespace {
// Function-pointer types for the four intercepted runtime entry points.
// Status codes are handled as plain int by the probe.
using cuda_get_device_fn = int (*)(int *);
using cuda_set_device_fn = int (*)(int);
using hip_get_device_fn = int (*)(int *);
using hip_set_device_fn = int (*)(int);
// Per-API counters: number of intercepted calls and the total time spent in
// the forwarded call, in nanoseconds. Both are atomics and are updated with
// relaxed ordering by measure().
struct Stats {
std::atomic<unsigned long long> calls{0};
std::atomic<unsigned long long> ns{0};
};
// One counter set per intercepted function.
Stats cuda_get_device_stats;
Stats cuda_set_device_stats;
Stats hip_get_device_stats;
Stats hip_set_device_stats;
// Finds the real implementation of `name` that this probe forwards to.
//
// Lookup order:
//   1. dlsym(RTLD_NEXT, name) - the next definition after this library.
//   2. Each library in `libs` (a nullptr-terminated array of names), opened
//      with dlopen and searched with dlsym.
//
// Returns the symbol address. If nothing provides the symbol, prints a
// "probe_missing_symbol" line to stderr and aborts the process.
void *resolve_symbol(const char *name, const char *const *libs) {
  if (void *next = dlsym(RTLD_NEXT, name)) {
    return next;
  }
  for (const char *const *lib = libs; *lib; ++lib) {
    void *handle = dlopen(*lib, RTLD_LAZY | RTLD_LOCAL);
    if (!handle) {
      continue;
    }
    // The handle stays open on success so the returned address remains valid.
    if (void *found = dlsym(handle, name)) {
      return found;
    }
  }
  // dlerror() returns NULL when no error is pending (dlsym may return NULL
  // without setting one); passing NULL to %s is undefined behaviour.
  const char *reason = dlerror();
  std::fprintf(stderr, "probe_missing_symbol,%s,%s\n", name,
               reason ? reason : "unknown error");
  std::abort();
}
// Resolves the CUDA runtime symbol `name` and returns it cast to `Fn`.
// libcudart.so is the fallback library handed to resolve_symbol.
template <typename Fn> Fn cuda_symbol(const char *name) {
  static constexpr const char *kFallbackLibs[] = {"libcudart.so", nullptr};
  void *address = resolve_symbol(name, kFallbackLibs);
  return reinterpret_cast<Fn>(address);
}
// Resolves the HIP runtime symbol `name` and returns it cast to `Fn`.
// libamdhip64.so is the fallback library handed to resolve_symbol.
template <typename Fn> Fn hip_symbol(const char *name) {
  static constexpr const char *kFallbackLibs[] = {"libamdhip64.so", nullptr};
  void *address = resolve_symbol(name, kFallbackLibs);
  return reinterpret_cast<Fn>(address);
}
// Invokes `call(fn)`, times it with steady_clock, and records the result in
// `stats`: the call counter goes up by one and the elapsed nanoseconds are
// added to the running total. Returns whatever `call(fn)` returned.
template <typename Fn, typename Call>
int measure(Stats &stats, Fn fn, Call call) {
  using clock = std::chrono::steady_clock;
  const clock::time_point begin = clock::now();
  const int result = call(fn);
  const clock::time_point end = clock::now();
  const std::chrono::nanoseconds elapsed =
      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
  stats.calls.fetch_add(1, std::memory_order_relaxed);
  stats.ns.fetch_add(static_cast<unsigned long long>(elapsed.count()),
                     std::memory_order_relaxed);
  return result;
}
// Writes one CSV row "pid,api,calls,total_ns,avg_ns" for `name` to `out`.
// Nothing is written for an API that was never called.
void print_one(FILE *out, const char *name, const Stats &stats) {
  const auto calls = stats.calls.load(std::memory_order_relaxed);
  if (!calls) {
    return;
  }
  const auto total_ns = stats.ns.load(std::memory_order_relaxed);
  const double avg_ns = static_cast<double>(total_ns) / calls;
  std::fprintf(out, "%d,%s,%llu,%llu,%.3f\n", static_cast<int>(getpid()),
               name, calls, total_ns, avg_ns);
}
// Emits the statistics of all four intercepted APIs.
// Rows are appended to the file named by FASTPT_MRE_PROBE_LOG; when that
// variable is unset or the file cannot be opened they go to stderr instead.
void print_summary() {
  FILE *out = stderr;
  if (const char *path = std::getenv("FASTPT_MRE_PROBE_LOG")) {
    if (FILE *file = std::fopen(path, "a")) {
      out = file;
    }
  }
  print_one(out, "cudaGetDevice", cuda_get_device_stats);
  print_one(out, "cudaSetDevice", cuda_set_device_stats);
  print_one(out, "hipGetDevice", hip_get_device_stats);
  print_one(out, "hipSetDevice", hip_set_device_stats);
  const bool opened_file = out != stderr;
  if (opened_file) {
    std::fclose(out);
  }
}
// Registers print_summary with std::atexit from the constructor of a
// namespace-scope object, i.e. when this library's static objects are set up.
struct AtExit {
  AtExit() { std::atexit(print_summary); }
};
AtExit at_exit;
} // namespace
extern "C" int cudaGetDevice(int *device) {
static auto real = cuda_symbol<cuda_get_device_fn>("cudaGetDevice");
return measure(cuda_get_device_stats, real,
[device](auto fn) { return fn(device); });
}
extern "C" int cudaSetDevice(int device) {
static auto real = cuda_symbol<cuda_set_device_fn>("cudaSetDevice");
return measure(cuda_set_device_stats, real,
[device](auto fn) { return fn(device); });
}
extern "C" int hipGetDevice(int *device) {
static auto real = hip_symbol<hip_get_device_fn>("hipGetDevice");
return measure(hip_get_device_stats, real,
[device](auto fn) { return fn(device); });
}
extern "C" int hipSetDevice(int device) {
static auto real = hip_symbol<hip_set_device_fn>("hipSetDevice");
return measure(hip_set_device_stats, real,
[device](auto fn) { return fn(device); });
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment