Add FastPT-C host-side overhead MRE

63d618ba · one · ff6a4830 · 63d618ba · 63d618ba · 63d618ba
Commit 63d618ba authored May 18, 2026 by one
11 changed files
--- a/projects/fastpt-overhead/CMakeLists.txt
+++ b/projects/fastpt-overhead/CMakeLists.txt
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(FastPTCOverheadMRE LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+# Selects which runtime API the sources are compiled against.
+set(BACKEND "hip" CACHE STRING "Backend: hip or cuda")
+
+# Ask the active Python interpreter where PyTorch keeps its CMake package
+# files. The query is best-effort: if it fails (no python3, no torch), warn and
+# leave CMAKE_PREFIX_PATH untouched instead of appending an empty entry, so a
+# user-supplied CMAKE_PREFIX_PATH / Torch_DIR can still satisfy find_package.
+execute_process(
+    COMMAND python3 -c "import torch; print(torch.utils.cmake_prefix_path)"
+    OUTPUT_VARIABLE TORCH_CMAKE_PREFIX_PATH
+    RESULT_VARIABLE TORCH_CMAKE_PREFIX_RESULT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+if(TORCH_CMAKE_PREFIX_RESULT EQUAL 0 AND NOT TORCH_CMAKE_PREFIX_PATH STREQUAL "")
+    list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PREFIX_PATH}")
+else()
+    message(WARNING
+        "Could not query torch.utils.cmake_prefix_path via python3 "
+        "(result: ${TORCH_CMAKE_PREFIX_RESULT}); relying on the existing "
+        "CMAKE_PREFIX_PATH to locate Torch.")
+endif()
+
+find_package(Torch REQUIRED)
+# Drop one flag from the Torch-provided flags before merging them into the
+# project-wide C++ flags.
+string(REPLACE "-Wno-duplicate-decl-specifier" "" TORCH_CXX_FLAGS "${TORCH_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
+
+# Adds -Wno-unused-result to <target_name> for C++, HIP and CUDA sources.
+# CUDA sources receive the flag through -Xcompiler=.
+function(disable_noisy_warnings target_name)
+    set(unused_result_flag "-Wno-unused-result")
+    target_compile_options(${target_name} PRIVATE
+        "$<$<COMPILE_LANGUAGE:CXX>:${unused_result_flag}>"
+        "$<$<COMPILE_LANGUAGE:HIP>:${unused_result_flag}>"
+        "$<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${unused_result_flag}>"
+    )
+endfunction()
+
+# Create the two targets for the selected backend. In both branches
+# device_query.cpp is compiled as a device-language source (HIP or CUDA) while
+# guard_ext.cpp keeps its default language, and a BACKEND_* macro tells the
+# sources which runtime headers to include.
+if(BACKEND STREQUAL "hip")
+    enable_language(HIP)
+    set_source_files_properties(src/device_query.cpp PROPERTIES LANGUAGE HIP)
+    add_executable(device_query src/device_query.cpp)
+    add_library(guard_ext SHARED src/guard_ext.cpp)
+    target_compile_definitions(device_query PRIVATE BACKEND_HIP=1)
+    target_compile_definitions(guard_ext PRIVATE BACKEND_HIP=1)
+    target_compile_options(device_query PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-O3>)
+elseif(BACKEND STREQUAL "cuda")
+    enable_language(CUDA)
+    # Pass include directories to the CUDA compiler with plain -I.
+    set(CMAKE_INCLUDE_SYSTEM_FLAG_CUDA "-I")
+    set(CMAKE_CUDA_STANDARD 17)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+    # Only provide a default architecture list when the user did not set one.
+    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        set(CMAKE_CUDA_ARCHITECTURES "60;70;80;90")
+    endif()
+    set_source_files_properties(src/device_query.cpp PROPERTIES LANGUAGE CUDA)
+    add_executable(device_query src/device_query.cpp)
+    add_library(guard_ext SHARED src/guard_ext.cpp)
+    target_compile_definitions(device_query PRIVATE BACKEND_CUDA=1)
+    target_compile_definitions(guard_ext PRIVATE BACKEND_CUDA=1)
+    target_compile_options(device_query PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-O3>)
+else()
+    # Any other BACKEND value aborts configuration.
+    message(FATAL_ERROR "BACKEND must be hip or cuda")
+endif()
+
+# Only the extension library links against Torch.
+target_link_libraries(guard_ext PRIVATE ${TORCH_LIBRARIES})
+
+# Both targets get the same warning suppression.
+foreach(mre_target IN ITEMS device_query guard_ext)
+    disable_noisy_warnings(${mre_target})
+endforeach()
+
+# The executable is written to <build>/bin and the shared library to <build>/lib.
+set_property(TARGET device_query PROPERTY
+    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+set_property(TARGET guard_ext PROPERTY
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
--- a/projects/fastpt-overhead/README.md
+++ b/projects/fastpt-overhead/README.md
+# FastPT-C Host-side Overhead MRE
+
+使用 FastPT 不转码模式适配 MatPL 的过程中发现了一定的性能损失，因此单独建立该项目来观察 FastPT 不转码模式 CUDA 兼容路径带来的 host 侧开销。它不依赖 MatPL 训练流程，只保留两个实验：
+
+1. `device_query`：对比原生 HIP 路径的 `hipGetDevice` 和 FastPT-C 路径的 `cudaGetDevice`。
+2. `guard_loop`：通过 PyTorch C++ extension 对比 `c10::hip::HIPGuard` 和 `c10::cuda::CUDAGuard` 的循环调用开销。
+
+第二个实验更接近 MatPL 的性能现象：当 CUDA ABI 的 PyTorch C++ extension 通过 FastPT-C 运行时，频繁使用 c10 device guard/device query 可能产生额外的 host 侧开销。
+
+## 目录结构
+
+```text
+.
+├── CMakeLists.txt
+├── README.md
+├── scripts
+│   ├── bench_guard.py
+│   ├── build.sh
+│   ├── compare.py
+│   ├── run_compare.sh
+│   ├── run_one.sh
+│   └── run_with_probe.sh
+└── src
+    ├── device_query.cpp
+    ├── guard_ext.cpp
+    └── runtime_probe.cpp
+```
+
+## 运行方式
+
+在 DTK/FastPT 容器中执行：
+
+```bash
+cd /workspace/tools/fastpt_c_overhead_mre
+bash scripts/run_compare.sh
+```
+
+脚本会构建并运行两种模式：
+
+- `hip`：原生 DTK/HIP/PyTorch HIP 路径。
+- `fastpt-C`：FastPT-C CUDA 兼容路径。
+
+结果会写入：
+
+```text
+results/hip/
+results/fastpt-C/
+results/compare.csv
+```
+
+常用参数可以通过环境变量调整：
+
+```bash
+DEVICE=0 \
+DEVICE_QUERY_LOOPS=1000000 \
+DEVICE_QUERY_ROUNDS=7 \
+DEVICE_QUERY_WARMUP=10000 \
+GUARD_STEPS=10000 \
+GUARD_WARMUP=1000 \
+GUARD_ROUNDS=5 \
+GUARD_INNER_LOOPS=0,1,2,4,8,16,32,64 \
+bash scripts/run_compare.sh
+
+## 结果解读
+
+- 如果 `device_query` 在 `fastpt-C` 下明显更慢，说明 FastPT-C 的 CUDA runtime 兼容调用本身有额外开销。
+- 如果 `guard_loop` 的差异随着 `inner_loops` 增大而扩大，说明 c10 CUDA guard/device query 路径已经足以复现主机侧开销。
+
+这个复现的目标是帮助定位 FastPT-C 兼容层的主机侧开销来源；它不是 MatPL 训练性能测试，也不包含历史调查过程中使用过的所有实验分支。
+
+## 可选 probe
+
+如果需要进一步确认 `guard_loop` 中触发了多少 CUDA runtime 调用，可以使用 `LD_PRELOAD` probe：
+
+```bash
+cd /workspace/tools/fastpt_c_overhead_mre
+bash scripts/run_with_probe.sh fastpt-C 0
+```
+
+结果写入：
+
+```text
+results-probe/fastpt-C/runtime_probe.csv
+```
+
+该 probe 统计 `cudaGetDevice`、`cudaSetDevice`、`hipGetDevice`、`hipSetDevice` 的调用次数、总耗时和平均耗时；默认 `run_compare.sh` 不会使用它。
--- a/projects/fastpt-overhead/scripts/bench_guard.py
+++ b/projects/fastpt-overhead/scripts/bench_guard.py
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import csv
+import statistics
+import sys
+import time
+
+import torch
+
+
+def parse_int_list(value: str) -> list[int]:
+    return [int(item) for item in value.split(",") if item.strip()]
+
+
+def sync() -> None:
+    """Call ``torch.cuda.synchronize()``; used to bracket the timed regions."""
+    torch.cuda.synchronize()
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lib", required=True)
+    parser.add_argument("--device", type=int, default=0)
+    parser.add_argument("--inner-loops", type=parse_int_list, default=parse_int_list("0,1,2,4,8,16,32,64"))
+    parser.add_argument("--steps", type=int, default=10000)
+    parser.add_argument("--warmup", type=int, default=1000)
+    parser.add_argument("--rounds", type=int, default=5)
+    args = parser.parse_args()
+
+    torch.ops.load_library(args.lib)
+    torch.cuda.set_device(args.device)
+    tensor = torch.empty(1024, device="cuda")
+    op = torch.ops.fastpt_c_overhead_mre.guard_loop
+
+    writer = csv.writer(sys.stdout)
+    writer.writerow(
+        [
+            "section",
+            "inner_loops",
+            "steps",
+            "warmup",
+            "rounds",
+            "median_step_us",
+            "mean_step_us",
+            "median_per_guard_us",
+        ]
+    )
+
+    for inner_loops in args.inner_loops:
+        for _ in range(args.warmup):
+            op(tensor, inner_loops)
+        sync()
+
+        values = []
+        for _ in range(args.rounds):
+            sync()
+            start = time.perf_counter_ns()
+            for _ in range(args.steps):
+                op(tensor, inner_loops)
+            sync()
+            stop = time.perf_counter_ns()
+            values.append((stop - start) / args.steps / 1000.0)
+
+        median_step = statistics.median(values)
+        writer.writerow(
+            [
+                "guard_loop",
+                inner_loops,
+                args.steps,
+                args.warmup,
+                args.rounds,
+                f"{median_step:.6f}",
+                f"{statistics.mean(values):.6f}",
+                f"{median_step / inner_loops:.6f}" if inner_loops else "0.000000",
+            ]
+        )
+
+
+if __name__ == "__main__":
+    main()
--- a/projects/fastpt-overhead/scripts/build.sh
+++ b/projects/fastpt-overhead/scripts/build.sh
+#!/usr/bin/env bash
+# Configure and build the project for one mode.
+# Usage: build.sh hip|fastpt-C
+# The build tree is <project root>/build-<mode>; JOBS overrides the parallelism.
+set -euo pipefail
+
+mode="${1:?usage: build.sh hip|fastpt-C}"
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+root="$(cd "${script_dir}/.." && pwd)"
+
+# Optional environment helper located next to the project root; sourced only
+# when the file exists.
+source_dtk="${root}/../source_dtk_library_path.sh"
+if [[ -f "${source_dtk}" ]]; then
+    # shellcheck disable=SC1091
+    source "${source_dtk}"
+fi
+
+if [[ "${mode}" == "fastpt-C" ]]; then
+    # nounset is switched off while the fastpt script is sourced and restored
+    # afterwards.
+    set +u
+    # shellcheck disable=SC1091
+    source /usr/local/bin/fastpt -C >/dev/null
+    set -u
+    # fastpt-C mode builds the CUDA variant of the sources.
+    backend="cuda"
+elif [[ "${mode}" == "hip" ]]; then
+    backend="hip"
+else
+    echo "usage: build.sh hip|fastpt-C" >&2
+    exit 2
+fi
+
+build_dir="${root}/build-${mode}"
+cmake -S "${root}" -B "${build_dir}" \
+    -DBACKEND="${backend}" \
+    -DCMAKE_BUILD_TYPE=Release
+
+cmake --build "${build_dir}" -j"${JOBS:-$(nproc)}"
+# Marker line reporting the mode and build directory.
+echo "BUILD_DONE,${mode},${build_dir}"
--- a/projects/fastpt-overhead/scripts/compare.py
+++ b/projects/fastpt-overhead/scripts/compare.py
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import csv
+import sys
+from pathlib import Path
+
+
+def read_one_row(path: Path) -> dict[str, str]:
+    with path.open(newline="") as handle:
+        rows = list(csv.DictReader(handle))
+    if len(rows) != 1:
+        raise RuntimeError(f"expected one data row in {path}")
+    return rows[0]
+
+
+def read_guard(path: Path) -> dict[int, dict[str, str]]:
+    with path.open(newline="") as handle:
+        return {int(row["inner_loops"]): row for row in csv.DictReader(handle)}
+
+
+def emit_row(writer: csv.writer, section: str, key: str, hip_us: float, fastpt_us: float) -> None:
+    delta = fastpt_us - hip_us
+    writer.writerow(
+        [
+            section,
+            key,
+            f"{hip_us:.9f}",
+            f"{fastpt_us:.9f}",
+            f"{delta:.9f}",
+            f"{delta / hip_us * 100.0:.6f}" if hip_us else "nan",
+            f"{fastpt_us / hip_us:.6f}" if hip_us else "nan",
+        ]
+    )
+
+
+def main() -> int:
+    root = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("results")
+    hip = root / "hip"
+    fastpt = root / "fastpt-C"
+
+    writer = csv.writer(sys.stdout)
+    writer.writerow(["section", "case", "hip_us", "fastpt_c_us", "delta_us", "delta_pct", "ratio"])
+
+    hip_query = read_one_row(hip / "device_query.csv")
+    fastpt_query = read_one_row(fastpt / "device_query.csv")
+    emit_row(
+        writer,
+        "device_query",
+        f"{hip_query['api']} vs {fastpt_query['api']}",
+        float(hip_query["median_us"]),
+        float(fastpt_query["median_us"]),
+    )
+
+    hip_guard = read_guard(hip / "guard_loop.csv")
+    fastpt_guard = read_guard(fastpt / "guard_loop.csv")
+    for inner_loops in sorted(set(hip_guard) & set(fastpt_guard)):
+        emit_row(
+            writer,
+            "guard_loop",
+            str(inner_loops),
+            float(hip_guard[inner_loops]["median_step_us"]),
+            float(fastpt_guard[inner_loops]["median_step_us"]),
+        )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/projects/fastpt-overhead/scripts/run_compare.sh
+++ b/projects/fastpt-overhead/scripts/run_compare.sh
+#!/usr/bin/env bash
+set -euo pipefail
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+root="$(cd "${script_dir}/.." && pwd)"
+out_root="${OUT_ROOT:-${root}/results}"
+device="${DEVICE:-0}"
+
+bash "${script_dir}/run_one.sh" hip "${device}"
+bash "${script_dir}/run_one.sh" fastpt-C "${device}"
+
+python3 "${script_dir}/compare.py" "${out_root}" > "${out_root}/compare.csv"
+cat "${out_root}/compare.csv"
--- a/projects/fastpt-overhead/scripts/run_one.sh
+++ b/projects/fastpt-overhead/scripts/run_one.sh
+#!/usr/bin/env bash
+# Run both benchmarks for a single mode and store their CSV output.
+# Usage: run_one.sh hip|fastpt-C [device]
+# Results go to ${OUT_ROOT:-<project root>/results}/<mode>/.
+set -euo pipefail
+
+mode="${1:?usage: run_one.sh hip|fastpt-C [device]}"
+# Device precedence: positional argument, then DEVICE, then 0.
+device="${2:-${DEVICE:-0}}"
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+root="$(cd "${script_dir}/.." && pwd)"
+out_dir="${OUT_ROOT:-${root}/results}/${mode}"
+build_dir="${root}/build-${mode}"
+
+# Optional environment helper located next to the project root; sourced only
+# when the file exists.
+source_dtk="${root}/../source_dtk_library_path.sh"
+if [[ -f "${source_dtk}" ]]; then
+    # shellcheck disable=SC1091
+    source "${source_dtk}"
+fi
+
+if [[ "${mode}" == "fastpt-C" ]]; then
+    # nounset is switched off while the fastpt script is sourced and restored
+    # afterwards.
+    set +u
+    # shellcheck disable=SC1091
+    source /usr/local/bin/fastpt -C >/dev/null
+    set -u
+elif [[ "${mode}" != "hip" ]]; then
+    echo "usage: run_one.sh hip|fastpt-C [device]" >&2
+    exit 2
+fi
+
+# Build only when either artifact is missing; existing artifacts are reused
+# as-is.
+if [[ ! -x "${build_dir}/bin/device_query" || ! -f "${build_dir}/lib/libguard_ext.so" ]]; then
+    bash "${script_dir}/build.sh" "${mode}"
+fi
+
+mkdir -p "${out_dir}"
+# Positional arguments: device, loops, rounds, warmup.
+"${build_dir}/bin/device_query" \
+    "${device}" \
+    "${DEVICE_QUERY_LOOPS:-1000000}" \
+    "${DEVICE_QUERY_ROUNDS:-7}" \
+    "${DEVICE_QUERY_WARMUP:-10000}" \
+    > "${out_dir}/device_query.csv"
+
+python3 "${script_dir}/bench_guard.py" \
+    --device "${device}" \
+    --lib "${build_dir}/lib/libguard_ext.so" \
+    --inner-loops "${GUARD_INNER_LOOPS:-0,1,2,4,8,16,32,64}" \
+    --steps "${GUARD_STEPS:-10000}" \
+    --warmup "${GUARD_WARMUP:-1000}" \
+    --rounds "${GUARD_ROUNDS:-5}" \
+    > "${out_dir}/guard_loop.csv"
+
+# Marker line reporting the mode and output directory.
+echo "RUN_DONE,${mode},${out_dir}"
--- a/projects/fastpt-overhead/scripts/run_with_probe.sh
+++ b/projects/fastpt-overhead/scripts/run_with_probe.sh
+#!/usr/bin/env bash
+set -euo pipefail
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+root="$(cd "${script_dir}/.." && pwd)"
+mode="${1:-fastpt-C}"
+device="${2:-${DEVICE:-0}}"
+out_root="${OUT_ROOT:-${root}/results-probe}"
+log="${FASTPT_MRE_PROBE_LOG:-${out_root}/${mode}/runtime_probe.csv}"
+probe_lib="${root}/build-probe/libruntime_probe.so"
+
+mkdir -p "${root}/build-probe"
+c++ -std=c++17 -O2 -fPIC -shared \
+    "${root}/src/runtime_probe.cpp" \
+    -ldl \
+    -o "${probe_lib}"
+
+mkdir -p "$(dirname "${log}")"
+echo "pid,api,calls,total_ns,avg_ns" > "${log}"
+
+FASTPT_MRE_PROBE_LOG="${log}" \
+OUT_ROOT="${out_root}" \
+LD_PRELOAD="${probe_lib}${LD_PRELOAD:+:${LD_PRELOAD}}" \
+bash "${script_dir}/run_one.sh" "${mode}" "${device}"
+
+echo "PROBE_LOG,${log}"
--- a/projects/fastpt-overhead/src/device_query.cpp
+++ b/projects/fastpt-overhead/src/device_query.cpp
+#include <algorithm>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+#if defined(BACKEND_CUDA)
+#include <cuda_runtime_api.h>
+#elif defined(BACKEND_HIP)
+#include <hip/hip_runtime_api.h>
+#else
+#error "BACKEND_CUDA or BACKEND_HIP must be defined"
+#endif
+
+namespace {
+
+// Thin backend wrappers: the rest of the file calls set_device/get_device and
+// never names a CUDA or HIP symbol directly. Runtime status codes are returned
+// as plain ints.
+#if defined(BACKEND_CUDA)
+// Name reported in the "api" CSV column.
+const char *api_name() { return "cudaGetDevice"; }
+int set_device(int device) { return static_cast<int>(cudaSetDevice(device)); }
+int get_device(int *device) { return static_cast<int>(cudaGetDevice(device)); }
+#else
+// Name reported in the "api" CSV column.
+const char *api_name() { return "hipGetDevice"; }
+int set_device(int device) { return static_cast<int>(hipSetDevice(device)); }
+int get_device(int *device) { return static_cast<int>(hipGetDevice(device)); }
+#endif
+
+// Returns the median of `values` (taken by value so the caller's order is
+// untouched). For an even number of samples the two middle elements are
+// averaged. An empty input yields 0.0 instead of reading out of bounds.
+double median(std::vector<double> values) {
+  if (values.empty()) {
+    return 0.0;
+  }
+  std::sort(values.begin(), values.end());
+  const auto mid = values.size() / 2;
+  if (values.size() % 2 != 0) {
+    return values[mid];
+  }
+  return (values[mid - 1] + values[mid]) / 2.0;
+}
+
+// Arithmetic mean of `values`: the running sum divided by the element count.
+double mean(const std::vector<double> &values) {
+  double sum = 0.0;
+  for (auto it = values.begin(); it != values.end(); ++it) {
+    sum += *it;
+  }
+  const auto count = static_cast<double>(values.size());
+  return sum / count;
+}
+
+} // namespace
+
+// Micro-benchmark for the backend's "get current device" call.
+//
+// Usage: device_query [device] [loops] [rounds] [warmup]
+// Defaults: 0, 1000000, 7, 10000. After `warmup` untimed calls, each of
+// `rounds` rounds times `loops` back-to-back get_device calls and records the
+// average time per call in microseconds. One CSV header line and one data line
+// are written to stdout; diagnostics go to stderr.
+//
+// Exit status: 0 on success, 1 if the device could not be selected, 2 on
+// invalid arguments.
+int main(int argc, char **argv) {
+  const int device = argc > 1 ? std::atoi(argv[1]) : 0;
+  const int loops = argc > 2 ? std::atoi(argv[2]) : 1000000;
+  const int rounds = argc > 3 ? std::atoi(argv[3]) : 7;
+  const int warmup = argc > 4 ? std::atoi(argv[4]) : 10000;
+
+  // loops == 0 would divide by zero below and rounds == 0 would leave the
+  // sample vector empty, so reject them before doing any work.
+  if (loops <= 0 || rounds <= 0 || warmup < 0) {
+    std::fprintf(stderr,
+                 "usage: %s [device] [loops>0] [rounds>0] [warmup>=0]\n",
+                 argc > 0 ? argv[0] : "device_query");
+    return 2;
+  }
+
+  // A non-zero status means the device was not selected; stop rather than
+  // benchmark an unintended device.
+  const int set_status = set_device(device);
+  if (set_status != 0) {
+    std::fprintf(stderr, "set_device(%d) failed with runtime status %d\n",
+                 device, set_status);
+    return 1;
+  }
+
+  // Results are accumulated into a volatile so the calls are not optimised out.
+  volatile int sink = 0;
+  sink += set_status;
+
+  int current = 0;
+  for (int i = 0; i < warmup; ++i) {
+    sink += get_device(&current);
+    sink += current;
+  }
+
+  std::vector<double> samples;
+  samples.reserve(static_cast<std::vector<double>::size_type>(rounds));
+  for (int round = 0; round < rounds; ++round) {
+    auto start = std::chrono::steady_clock::now();
+    for (int i = 0; i < loops; ++i) {
+      sink += get_device(&current);
+      sink += current;
+    }
+    auto stop = std::chrono::steady_clock::now();
+    double total_us =
+        std::chrono::duration<double, std::micro>(stop - start).count();
+    samples.push_back(total_us / static_cast<double>(loops));
+  }
+
+  auto minmax = std::minmax_element(samples.begin(), samples.end());
+  std::printf("section,api,loops,warmup,rounds,median_us,mean_us,min_us,max_us,sink\n");
+  std::printf("device_query,%s,%d,%d,%d,%.9f,%.9f,%.9f,%.9f,%d\n",
+              api_name(), loops, warmup, rounds, median(samples),
+              mean(samples), *minmax.first, *minmax.second,
+              static_cast<int>(sink));
+  return 0;
+}
--- a/projects/fastpt-overhead/src/guard_ext.cpp
+++ b/projects/fastpt-overhead/src/guard_ext.cpp
+#include <ATen/ATen.h>
+#include <torch/library.h>
+
+#if defined(BACKEND_CUDA)
+#include <c10/cuda/CUDAGuard.h>
+#elif defined(BACKEND_HIP)
+#include <c10/hip/HIPGuard.h>
+#else
+#error "BACKEND_CUDA or BACKEND_HIP must be defined"
+#endif
+
+// Constructs and destroys a backend device guard `loops` times for the device
+// index of `tensor`, then returns `tensor`. A non-positive `loops` performs no
+// iterations.
+at::Tensor guard_loop(at::Tensor tensor, int64_t loops) {
+#if defined(BACKEND_CUDA)
+  using DeviceGuard = c10::cuda::CUDAGuard;
+#else
+  using DeviceGuard = c10::hip::HIPGuard;
+#endif
+  const auto target = static_cast<c10::DeviceIndex>(tensor.device().index());
+  for (int64_t remaining = loops; remaining > 0; --remaining) {
+    // The guard lives only for this iteration's scope.
+    DeviceGuard guard(target);
+  }
+  return tensor;
+}
+
+// Registers guard_loop under the fastpt_c_overhead_mre namespace; the benchmark
+// script looks it up as torch.ops.fastpt_c_overhead_mre.guard_loop.
+TORCH_LIBRARY(fastpt_c_overhead_mre, m) {
+  m.def("guard_loop(Tensor tensor, int loops) -> Tensor", guard_loop);
+}
--- a/projects/fastpt-overhead/src/runtime_probe.cpp
+++ b/projects/fastpt-overhead/src/runtime_probe.cpp
+#include <atomic>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <dlfcn.h>
+#include <unistd.h>
+
+namespace {
+
+// Function-pointer types used to call the real runtime entry points. Status
+// codes are handled as plain int.
+using cuda_get_device_fn = int (*)(int *);
+using cuda_set_device_fn = int (*)(int);
+using hip_get_device_fn = int (*)(int *);
+using hip_set_device_fn = int (*)(int);
+
+// Per-API counters: number of intercepted calls and their accumulated time in
+// nanoseconds.
+struct Stats {
+  std::atomic<unsigned long long> calls{0};
+  std::atomic<unsigned long long> ns{0};
+};
+
+// One counter set per intercepted function.
+Stats cuda_get_device_stats;
+Stats cuda_set_device_stats;
+Stats hip_get_device_stats;
+Stats hip_set_device_stats;
+
+// Finds the real implementation of `name`.
+//
+// First tries dlsym(RTLD_NEXT, ...). If that fails, each library in the
+// nullptr-terminated array `libs` is opened in turn and searched. Handles of
+// opened libraries are deliberately never closed: the returned address must
+// stay valid for the lifetime of the process.
+//
+// Never returns nullptr: if the symbol cannot be found anywhere, a
+// "probe_missing_symbol" line is written to stderr and the process aborts.
+void *resolve_symbol(const char *name, const char *const *libs) {
+  if (void *next = dlsym(RTLD_NEXT, name)) {
+    return next;
+  }
+
+  for (const char *const *lib = libs; *lib; ++lib) {
+    void *handle = dlopen(*lib, RTLD_LAZY | RTLD_LOCAL);
+    if (!handle) {
+      continue;
+    }
+    if (void *symbol = dlsym(handle, name)) {
+      return symbol;
+    }
+  }
+
+  // dlerror() returns NULL when no error is pending, and passing NULL to %s is
+  // undefined behaviour, so substitute a fixed message in that case.
+  const char *reason = dlerror();
+  std::fprintf(stderr, "probe_missing_symbol,%s,%s\n", name,
+               reason ? reason : "no dlerror message available");
+  std::abort();
+}
+
+// Resolves a CUDA runtime symbol and casts it to `Fn`; "libcudart.so" is the
+// fallback library handed to resolve_symbol.
+template <typename Fn> Fn cuda_symbol(const char *name) {
+  static const char *const fallback_libs[] = {"libcudart.so", nullptr};
+  void *const address = resolve_symbol(name, fallback_libs);
+  return reinterpret_cast<Fn>(address);
+}
+
+// Resolves a HIP runtime symbol and casts it to `Fn`; "libamdhip64.so" is the
+// fallback library handed to resolve_symbol.
+template <typename Fn> Fn hip_symbol(const char *name) {
+  static const char *const fallback_libs[] = {"libamdhip64.so", nullptr};
+  void *const address = resolve_symbol(name, fallback_libs);
+  return reinterpret_cast<Fn>(address);
+}
+
+// Invokes `call(fn)`, times it with steady_clock, and adds one call plus the
+// elapsed nanoseconds to `stats`. Returns whatever `call` returned.
+template <typename Fn, typename Call>
+int measure(Stats &stats, Fn fn, Call call) {
+  using clock = std::chrono::steady_clock;
+
+  const clock::time_point begin = clock::now();
+  const int status = call(fn);
+  const clock::time_point end = clock::now();
+
+  const std::chrono::nanoseconds elapsed =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
+  stats.calls.fetch_add(1, std::memory_order_relaxed);
+  stats.ns.fetch_add(static_cast<unsigned long long>(elapsed.count()),
+                     std::memory_order_relaxed);
+  return status;
+}
+
+// Writes one "pid,api,calls,total_ns,avg_ns" line for `stats` to `out`.
+// Nothing is written for an API that was never called.
+void print_one(FILE *out, const char *name, const Stats &stats) {
+  const unsigned long long call_count =
+      stats.calls.load(std::memory_order_relaxed);
+  const unsigned long long total_ns = stats.ns.load(std::memory_order_relaxed);
+  if (call_count == 0) {
+    return;
+  }
+  const double avg_ns = static_cast<double>(total_ns) / call_count;
+  std::fprintf(out, "%d,%s,%llu,%llu,%.3f\n", static_cast<int>(getpid()), name,
+               call_count, total_ns, avg_ns);
+}
+
+// Appends the per-API summary to the file named by FASTPT_MRE_PROBE_LOG. When
+// the variable is unset or the file cannot be opened, stderr is used instead.
+void print_summary() {
+  FILE *out = stderr;
+  if (const char *path = std::getenv("FASTPT_MRE_PROBE_LOG")) {
+    if (FILE *file = std::fopen(path, "a")) {
+      out = file;
+    }
+  }
+
+  struct Entry {
+    const char *name;
+    const Stats *stats;
+  };
+  const Entry entries[] = {
+      {"cudaGetDevice", &cuda_get_device_stats},
+      {"cudaSetDevice", &cuda_set_device_stats},
+      {"hipGetDevice", &hip_get_device_stats},
+      {"hipSetDevice", &hip_set_device_stats},
+  };
+  for (const Entry &entry : entries) {
+    print_one(out, entry.name, *entry.stats);
+  }
+
+  // Only close what this function opened.
+  if (out != stderr) {
+    std::fclose(out);
+  }
+}
+
+// The constructor of this file-scope object registers print_summary with
+// std::atexit.
+struct AtExit {
+  AtExit() { std::atexit(print_summary); }
+} at_exit;
+
+} // namespace
+
+// Replacement for cudaGetDevice: forwards to the real function (resolved once,
+// on first use) and records the call in cuda_get_device_stats.
+extern "C" int cudaGetDevice(int *device) {
+  static const cuda_get_device_fn real_fn =
+      cuda_symbol<cuda_get_device_fn>("cudaGetDevice");
+  const auto forward = [device](cuda_get_device_fn target) {
+    return target(device);
+  };
+  return measure(cuda_get_device_stats, real_fn, forward);
+}
+
+// Replacement for cudaSetDevice: forwards to the real function (resolved once,
+// on first use) and records the call in cuda_set_device_stats.
+extern "C" int cudaSetDevice(int device) {
+  static const cuda_set_device_fn real_fn =
+      cuda_symbol<cuda_set_device_fn>("cudaSetDevice");
+  const auto forward = [device](cuda_set_device_fn target) {
+    return target(device);
+  };
+  return measure(cuda_set_device_stats, real_fn, forward);
+}
+
+// Replacement for hipGetDevice: forwards to the real function (resolved once,
+// on first use) and records the call in hip_get_device_stats.
+extern "C" int hipGetDevice(int *device) {
+  static const hip_get_device_fn real_fn =
+      hip_symbol<hip_get_device_fn>("hipGetDevice");
+  const auto forward = [device](hip_get_device_fn target) {
+    return target(device);
+  };
+  return measure(hip_get_device_stats, real_fn, forward);
+}
+
+// Replacement for hipSetDevice: forwards to the real function (resolved once,
+// on first use) and records the call in hip_set_device_stats.
+extern "C" int hipSetDevice(int device) {
+  static const hip_set_device_fn real_fn =
+      hip_symbol<hip_set_device_fn>("hipSetDevice");
+  const auto forward = [device](hip_set_device_fn target) {
+    return target(device);
+  };
+  return measure(hip_set_device_stats, real_fn, forward);
+}