基本实现bootstrap功能，所有rank硬件信息共享

85db7de4 · lishen · a4ac3320 · 85db7de4 · 85db7de4 · 85db7de4
Commit 85db7de4 authored Jul 17, 2025 by lishen
8 changed files
--- a/src/include/debug.h
+++ b/src/include/debug.h
@@ -213,7 +213,7 @@ static void scclDebugInit() {
 ////////////////////////////// 打印DEBUG信息 //////////////////////////////
 template <scclDebugLogLevel_t level>
-void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, int line, const char* fmt, ...) {
+void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, const char* filefunc, int line, const char* fmt, ...) {
    if(__atomic_load_n(&scclDebugLevel, __ATOMIC_ACQUIRE) == -1)
        scclDebugInit();
@@ -245,9 +245,9 @@ void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, int line
    char buffer[1024];
    size_t len = 0;
    if constexpr(level == SCCL_LOG_WARN) {
-        len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%d SCCL WARN ", hostname, pid, tid, filepath, line);
+        len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d %s:%s:%d SCCL WARN ", hostname, pid, tid, filepath, filefunc, line);
    } else if constexpr(level == SCCL_LOG_INFO) {
-        len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%d SCCL INFO ", hostname, pid, tid, filepath, line);
+        len = snprintf(buffer, sizeof(buffer), "%s:%d:%d %s:%s:%d SCCL INFO ", hostname, pid, tid, filepath, filefunc, line);
    }
    if(len) {
@@ -262,7 +262,7 @@ void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, int line
 } // namespace debug
-#define WARN(...) debug::scclDebugLog<SCCL_LOG_WARN>(SCCL_LOG_CODEALL, __FILE__, __LINE__, __VA_ARGS__)
+#define WARN(...) debug::scclDebugLog<SCCL_LOG_WARN>(SCCL_LOG_CODEALL, __FILE__, __func__, __LINE__, __VA_ARGS__)
-#define INFO(FLAGS, ...) debug::scclDebugLog<SCCL_LOG_INFO>((FLAGS), __FILE__, __LINE__, __VA_ARGS__)
+#define INFO(FLAGS, ...) debug::scclDebugLog<SCCL_LOG_INFO>((FLAGS), __FILE__, __func__, __LINE__, __VA_ARGS__)
 } // namespace sccl
--- a/src/include/sccl.h
+++ b/src/include/sccl.h
+#pragma once
+#include <hip/hip_runtime_api.h>
+#include <hip/hip_fp16.h>
+/**
+ * @brief Result codes of type scclResult_t; scclSuccess (0) means no error.
+ * @note scclNumResults is the count of result codes, not an error condition of its own.
+ */
+typedef enum {
+    scclSuccess           = 0, /*!< No error */
+    scclUnhandledHipError = 1, /*!< Unhandled HIP error */
+    scclSystemError       = 2, /*!< Unhandled system error */
+    scclInternalError     = 3, /*!< Internal error - please report to the RCCL developers */
+    scclInvalidArgument   = 4, /*!< Invalid argument */
+    scclInvalidUsage      = 5, /*!< Invalid usage */
+    scclRemoteError       = 6, /*!< A remote process exited or a network error occurred */
+    scclInProgress        = 7, /*!< An RCCL operation is in progress */
+    scclNumResults        = 8  /*!< Number of result types */
+} scclResult_t;
--- a/src/utils/asm_ops.h
+++ b/src/utils/asm_ops.h
@@ -63,6 +63,11 @@ __device__ __forceinline__ void trap() {
    __builtin_trap();
 }
+/**
+ * @brief 执行全系统内存屏障（memory fence），确保所有线程都能看到最新的内存状态
+ * @device 该函数仅在设备端（GPU）执行
+ * @note 使用__threadfence_system()实现跨设备的全局内存一致性
+ */
 __device__ __forceinline__ void memory_fence() {
    // __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "");
    __threadfence_system();

--- a/src/utils/param.cpp
+++ b/src/utils/param.cpp
@@ -74,7 +74,7 @@ void setEnvFile(const char* fileName) {
 *
 * 每个找到的配置文件都会被通过setEnvFile函数加载
 */
-static void initEnvFunc() {
+void initEnv() {
    char confFilePath[1024];
    const char* userFile = getenv("SCCL_CONF_FILE");
    if(userFile && strlen(userFile) > 0) {
@@ -92,18 +92,6 @@ static void initEnvFunc() {
    return;
 }
-/**
- * 初始化环境变量（线程安全）
- *
- * 使用pthread_once确保initEnvFunc仅被调用一次
- * 适用于多线程环境下环境变量的初始化
- */
-void initEnv() {
-    static pthread_once_t once = PTHREAD_ONCE_INIT;
-    pthread_once(&once, initEnvFunc);
-    return;
-}
 /**
 * @brief 加载环境变量参数并缓存
 *
@@ -121,7 +109,7 @@ void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
    static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_lock(&mutex);
    if(__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) {
-        const char* str = scclGetEnv(env);
+        const char* str = getenv(env);
        int64_t value   = deftVal;
        if(str && strlen(str) > 0) {
            errno = 0;
@@ -139,20 +127,7 @@ void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
    return;
 }
-/**
+constexpr int SCCL_THREAD_NAMELEN = 16;
- * 获取环境变量的值
- *
- * @param name 环境变量名称
- * @return 环境变量的值，如果未找到则返回NULL
- *
- * @note 该函数会先初始化环境变量
- */
-const char* scclGetEnv(const char* name) {
-    initEnv();
-    return getenv(name);
-}
-#define SCCL_THREAD_NAMELEN 16
 SCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
 /**

--- a/src/utils/param.h
+++ b/src/utils/param.h
@@ -13,9 +13,6 @@ void setEnvFile(const char* fileName);
 // 初始化环境变量
 void initEnv();
-// 获取指定名称的环境变量
-const char* scclGetEnv(const char* name);
 // 加载参数，如果环境变量未设置，则使用默认值
 void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);

--- a/src/utils/thread_pool.cpp
+++ b/src/utils/thread_pool.cpp
+#include <algorithm>
 #include "thread_pool.h"
 namespace sccl {
-ThreadPool::ThreadPool(size_t threads_num) : stop(false) {
+// Maximum number of worker threads the thread pool will create
+static constexpr int THREADS_POOL_MAX_SIZE = 128;
+/**
+ * @brief Thread pool constructor.
+ *
+ * Initializes the queue mutex and condition variable, then creates the worker threads.
+ *
+ * @param threads_num Requested number of worker threads; limited via min() to THREADS_POOL_MAX_SIZE.
+ * @param cpu_cord_offset Added to the worker index to choose the core passed to setThreadAffinity().
+ * @note Every worker is started with ThreadPool::run and receives this pool as its argument.
+ */
+ThreadPool::ThreadPool(size_t threads_num, int cpu_cord_offset) : stop(false), active_tasks(0) {
+    threads_num = min(THREADS_POOL_MAX_SIZE, threads_num); // NOTE(review): unqualified min() with int and size_t arguments - confirm which overload/macro this resolves to
    pthread_mutex_init(&queue_mutex, nullptr);
    pthread_cond_init(&condition, nullptr);
+    // printf("ThreadPool 构造函数");
    for(size_t i = 0; i < threads_num; ++i) {
        pthread_t worker;
        pthread_create(&worker, nullptr, ThreadPool::run, this); // return code is not checked
        workers.push_back(worker);
+        // Pin this worker to core (i + cpu_cord_offset); core 0 is skipped only when the offset is >= 1
+        setThreadAffinity(worker, i + cpu_cord_offset);
    }
 }
+/**
+ * @brief 线程池析构函数
+ *
+ * 负责安全地停止所有工作线程并清理资源：
+ * 1. 设置停止标志并通知所有等待的线程
+ * 2. 等待所有工作线程结束
+ * 3. 销毁互斥锁和条件变量
+ */
 ThreadPool::~ThreadPool() {
    {
        pthread_mutex_lock(&queue_mutex);
@@ -28,7 +54,7 @@ ThreadPool::~ThreadPool() {
    pthread_mutex_destroy(&queue_mutex);
    pthread_cond_destroy(&condition);
 }
 /**
 * @brief 线程池中工作线程的执行函数
 *
@@ -61,6 +87,8 @@ void* ThreadPool::run(void* arg) {
        task(); // 执行任务
        {
            pthread_mutex_lock(&pool->queue_mutex);
+            printf("ThreadPool active_tasks--");
            pool->active_tasks--; // 任务完成减少活动任务计数
            pthread_mutex_unlock(&pool->queue_mutex);
        }
@@ -75,9 +103,22 @@ void* ThreadPool::run(void* arg) {
 */
 bool ThreadPool::allTasksCompleted() {
    pthread_mutex_lock(&queue_mutex); // active_tasks and tasks are read under the queue mutex
+    printf("active_tasks: %d, tasks.size(): %lu\n", active_tasks, tasks.size()); // debug trace: printed on every call, while the mutex is held
    bool completed = (active_tasks == 0) && tasks.empty(); // true only when no task is counted active and the queue is empty
    pthread_mutex_unlock(&queue_mutex);
    return completed;
 }
+/**
+ * Sets the CPU affinity of the given thread to a single core.
+ * @param thread Thread whose affinity mask is set.
+ * @param core_id ID of the only CPU core placed in the affinity mask.
+ */
+void ThreadPool::setThreadAffinity(pthread_t thread, int core_id) {
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset); // start from an empty mask
+    CPU_SET(core_id, &cpuset); // mask contains only core_id
+    pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); // return code is not checked, so a failed bind goes unreported
+}
 } // namespace sccl
--- a/src/utils/thread_pool.h
+++ b/src/utils/thread_pool.h
@@ -12,7 +12,7 @@ namespace sccl {
 class ThreadPool {
 public:
-    ThreadPool(size_t);
+    ThreadPool(size_t, int cpu_cord_offset = 1);
    ~ThreadPool();
    // 将任务加入线程池队列并返回关联的future
@@ -28,6 +28,7 @@ public:
            tasks.push([task]() { (*task)(); });
            active_tasks++; // 新任务增加活动任务计数
+            // printf("ThreadPool active_tasks++");
            pthread_mutex_unlock(&queue_mutex);
            pthread_cond_signal(&condition);
@@ -48,6 +49,9 @@ private:
    int active_tasks;            // 追踪活动任务的数量
    static void* run(void* arg);
+    // 用于设置线程的CPU亲和性
+    void setThreadAffinity(pthread_t thread, int core_id);
 };
 } // namespace sccl
--- a/src/utils/utils.h
+++ b/src/utils/utils.h
-/*************************************************************************
+#pragma once
- * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-#ifndef NCCL_UTILS_H_
-#define NCCL_UTILS_H_
 #include "check.h"
 #include <stdint.h>
@@ -14,6 +7,12 @@
 #include <new>
 namespace sccl {
+static inline void thread_bind_cpu(int coreid) { // Binds the calling thread (pthread_self()) to the single core coreid
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset); // start from an empty mask
+    CPU_SET(coreid, &cpuset); // mask contains only coreid
+    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); // return code is not checked
+}
 // int ncclCudaCompCap();
@@ -533,5 +532,3 @@ namespace sccl {
 // }
 } // namespace sccl
-#endif