Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
lishen01
Sccl
Commits
85db7de4
Commit
85db7de4
authored
Jul 17, 2025
by
lishen
Browse files
基本实现bootstrap功能,所有rank硬件信息共享
parent
a4ac3320
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
88 additions
and
49 deletions
+88
-49
src/include/debug.h
src/include/debug.h
+5
-5
src/include/sccl.h
src/include/sccl.h
+20
-0
src/utils/asm_ops.h
src/utils/asm_ops.h
+5
-0
src/utils/param.cpp
src/utils/param.cpp
+3
-28
src/utils/param.h
src/utils/param.h
+0
-3
src/utils/thread_pool.cpp
src/utils/thread_pool.cpp
+43
-2
src/utils/thread_pool.h
src/utils/thread_pool.h
+5
-1
src/utils/utils.h
src/utils/utils.h
+7
-10
No files found.
src/include/debug.h
View file @
85db7de4
...
...
@@ -213,7 +213,7 @@ static void scclDebugInit() {
////////////////////////////// 打印DEBUG信息 //////////////////////////////
template
<
scclDebugLogLevel_t
level
>
void
scclDebugLog
(
scclDebugLogSubSys_t
pos_flags
,
const
char
*
filepath
,
int
line
,
const
char
*
fmt
,
...)
{
void
scclDebugLog
(
scclDebugLogSubSys_t
pos_flags
,
const
char
*
filepath
,
const
char
*
filefunc
,
int
line
,
const
char
*
fmt
,
...)
{
if
(
__atomic_load_n
(
&
scclDebugLevel
,
__ATOMIC_ACQUIRE
)
==
-
1
)
scclDebugInit
();
...
...
@@ -245,9 +245,9 @@ void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, int line
char
buffer
[
1024
];
size_t
len
=
0
;
if
constexpr
(
level
==
SCCL_LOG_WARN
)
{
len
=
snprintf
(
buffer
,
sizeof
(
buffer
),
"
\n
%s:%d:%d %s:%d SCCL WARN "
,
hostname
,
pid
,
tid
,
filepath
,
line
);
len
=
snprintf
(
buffer
,
sizeof
(
buffer
),
"
\n
%s:%d:%d %s:%
s:%
d SCCL WARN "
,
hostname
,
pid
,
tid
,
filepath
,
filefunc
,
line
);
}
else
if
constexpr
(
level
==
SCCL_LOG_INFO
)
{
len
=
snprintf
(
buffer
,
sizeof
(
buffer
),
"%s:%d:%d %s:%d SCCL INFO "
,
hostname
,
pid
,
tid
,
filepath
,
line
);
len
=
snprintf
(
buffer
,
sizeof
(
buffer
),
"%s:%d:%d %s:%
s:%
d SCCL INFO "
,
hostname
,
pid
,
tid
,
filepath
,
filefunc
,
line
);
}
if
(
len
)
{
...
...
@@ -262,7 +262,7 @@ void scclDebugLog(scclDebugLogSubSys_t pos_flags, const char* filepath, int line
}
// namespace debug
#define WARN(...) debug::scclDebugLog<SCCL_LOG_WARN>(SCCL_LOG_CODEALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) debug::scclDebugLog<SCCL_LOG_INFO>((FLAGS), __FILE__, __LINE__, __VA_ARGS__)
#define WARN(...) debug::scclDebugLog<SCCL_LOG_WARN>(SCCL_LOG_CODEALL, __FILE__,
__func__,
__LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) debug::scclDebugLog<SCCL_LOG_INFO>((FLAGS), __FILE__,
__func__,
__LINE__, __VA_ARGS__)
}
// namespace sccl
src/include/sccl.h
0 → 100644
View file @
85db7de4
#pragma once
#include <hip/hip_runtime_api.h>
#include <hip/hip_fp16.h>
/**
 * @brief SCCL result (status) codes.
 *
 * Every enumerator carries an explicit numeric value. scclNumResults is the
 * number of result types rather than a status of its own.
 */
typedef enum {
    scclSuccess           = 0, /*!< No error */
    scclUnhandledHipError = 1, /*!< Unhandled HIP error */
    scclSystemError       = 2, /*!< Unhandled system error */
    scclInternalError     = 3, /*!< Internal error - please report to the RCCL developers */
    scclInvalidArgument   = 4, /*!< Invalid argument */
    scclInvalidUsage      = 5, /*!< Invalid usage */
    scclRemoteError       = 6, /*!< A remote process exited or a network error occurred */
    scclInProgress        = 7, /*!< An RCCL operation is in progress */
    scclNumResults        = 8  /*!< Number of result types */
} scclResult_t;
src/utils/asm_ops.h
View file @
85db7de4
...
...
@@ -63,6 +63,11 @@ __device__ __forceinline__ void trap() {
__builtin_trap
();
}
/**
* @brief 执行全系统内存屏障(memory fence),确保所有线程都能看到最新的内存状态
* @device 该函数仅在设备端(GPU)执行
* @note 使用__threadfence_system()实现跨设备的全局内存一致性
*/
__device__
__forceinline__
void
memory_fence
()
{
// __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "");
__threadfence_system
();
...
...
src/utils/param.cpp
View file @
85db7de4
...
...
@@ -74,7 +74,7 @@ void setEnvFile(const char* fileName) {
*
* 每个找到的配置文件都会被通过setEnvFile函数加载
*/
static
void
initEnv
Func
()
{
void
initEnv
()
{
char
confFilePath
[
1024
];
const
char
*
userFile
=
getenv
(
"SCCL_CONF_FILE"
);
if
(
userFile
&&
strlen
(
userFile
)
>
0
)
{
...
...
@@ -92,18 +92,6 @@ static void initEnvFunc() {
return
;
}
/**
* 初始化环境变量(线程安全)
*
* 使用pthread_once确保initEnvFunc仅被调用一次
* 适用于多线程环境下环境变量的初始化
*/
void
initEnv
()
{
static
pthread_once_t
once
=
PTHREAD_ONCE_INIT
;
pthread_once
(
&
once
,
initEnvFunc
);
return
;
}
/**
* @brief 加载环境变量参数并缓存
*
...
...
@@ -121,7 +109,7 @@ void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
static
pthread_mutex_t
mutex
=
PTHREAD_MUTEX_INITIALIZER
;
pthread_mutex_lock
(
&
mutex
);
if
(
__atomic_load_n
(
cache
,
__ATOMIC_RELAXED
)
==
uninitialized
)
{
const
char
*
str
=
scclG
et
E
nv
(
env
);
const
char
*
str
=
g
et
e
nv
(
env
);
int64_t
value
=
deftVal
;
if
(
str
&&
strlen
(
str
)
>
0
)
{
errno
=
0
;
...
...
@@ -139,20 +127,7 @@ void scclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
return
;
}
/**
* 获取环境变量的值
*
* @param name 环境变量名称
* @return 环境变量的值,如果未找到则返回NULL
*
* @note 该函数会先初始化环境变量
*/
const
char
*
scclGetEnv
(
const
char
*
name
)
{
initEnv
();
return
getenv
(
name
);
}
#define SCCL_THREAD_NAMELEN 16
constexpr
int
SCCL_THREAD_NAMELEN
=
16
;
SCCL_PARAM
(
SetThreadName
,
"SET_THREAD_NAME"
,
0
);
/**
...
...
src/utils/param.h
View file @
85db7de4
...
...
@@ -13,9 +13,6 @@ void setEnvFile(const char* fileName);
// 初始化环境变量
void
initEnv
();
// 获取指定名称的环境变量
const
char
*
scclGetEnv
(
const
char
*
name
);
// 加载参数,如果环境变量未设置,则使用默认值
void
scclLoadParam
(
char
const
*
env
,
int64_t
deftVal
,
int64_t
uninitialized
,
int64_t
*
cache
);
...
...
src/utils/thread_pool.cpp
View file @
85db7de4
#include <algorithm>
#include "thread_pool.h"
namespace
sccl
{
ThreadPool
::
ThreadPool
(
size_t
threads_num
)
:
stop
(
false
)
{
// 设置线程池最大线程
static
constexpr
int
THREADS_POOL_MAX_SIZE
=
128
;
/**
 * @brief Construct the pool and start its worker threads.
 *
 * Initializes the queue mutex and the condition variable, then creates the
 * workers, each running ThreadPool::run with this pool as its argument.
 * Worker i is bound to CPU core `i + cpu_cord_offset` through
 * setThreadAffinity.
 *
 * @param threads_num     Requested number of workers; clamped to
 *                        THREADS_POOL_MAX_SIZE.
 * @param cpu_cord_offset Core index used for the first worker; later workers
 *                        use consecutive indices.
 *
 * @note If pthread_create fails for a worker, the failure is reported on
 *       stderr and that worker is skipped, so `workers` never holds an
 *       uninitialized thread handle.
 */
ThreadPool::ThreadPool(size_t threads_num, int cpu_cord_offset) : stop(false), active_tasks(0) {
    // Clamp with an explicit comparison: the limit is an int while the request is a size_t.
    if(threads_num > static_cast<size_t>(THREADS_POOL_MAX_SIZE)) {
        threads_num = static_cast<size_t>(THREADS_POOL_MAX_SIZE);
    }

    pthread_mutex_init(&queue_mutex, nullptr);
    pthread_cond_init(&condition, nullptr);

    for(size_t i = 0; i < threads_num; ++i) {
        pthread_t worker;
        const int rc = pthread_create(&worker, nullptr, ThreadPool::run, this);
        if(rc != 0) {
            // `worker` is indeterminate after a failed create; do not store or pin it.
            fprintf(stderr, "ThreadPool: pthread_create failed for worker %zu (error %d)\n", i, rc);
            continue;
        }
        workers.push_back(worker);

        // Pin the worker to its own core, starting at cpu_cord_offset.
        setThreadAffinity(worker, static_cast<int>(i) + cpu_cord_offset);
    }
}
/**
* @brief 线程池析构函数
*
* 负责安全地停止所有工作线程并清理资源:
* 1. 设置停止标志并通知所有等待的线程
* 2. 等待所有工作线程结束
* 3. 销毁互斥锁和条件变量
*/
ThreadPool
::~
ThreadPool
()
{
{
pthread_mutex_lock
(
&
queue_mutex
);
...
...
@@ -28,7 +54,7 @@ ThreadPool::~ThreadPool() {
pthread_mutex_destroy
(
&
queue_mutex
);
pthread_cond_destroy
(
&
condition
);
}
/**
* @brief 线程池中工作线程的执行函数
*
...
...
@@ -61,6 +87,8 @@ void* ThreadPool::run(void* arg) {
task
();
// 执行任务
{
pthread_mutex_lock
(
&
pool
->
queue_mutex
);
printf
(
"ThreadPool active_tasks--"
);
pool
->
active_tasks
--
;
// 任务完成减少活动任务计数
pthread_mutex_unlock
(
&
pool
->
queue_mutex
);
}
...
...
@@ -75,9 +103,22 @@ void* ThreadPool::run(void* arg) {
*/
bool ThreadPool::allTasksCompleted() {
    // Read both the counter and the queue under queue_mutex so the two values
    // are observed together.
    pthread_mutex_lock(&queue_mutex);

    // tasks.size() yields the container's size type (size_t for the standard
    // containers), which is printed portably with %zu rather than %lu.
    printf("active_tasks: %d, tasks.size(): %zu\n", active_tasks, tasks.size());

    // Done only when no task is counted as active and nothing is queued.
    const bool completed = (active_tasks == 0) && tasks.empty();

    pthread_mutex_unlock(&queue_mutex);
    return completed;
}
/**
 * @brief Bind a thread to a single CPU core.
 *
 * Builds a CPU set that contains only `core_id` and applies it to `thread`
 * with pthread_setaffinity_np. The result of that call is not inspected.
 *
 * @param thread  Thread whose CPU affinity is set.
 * @param core_id ID of the CPU core to bind the thread to.
 */
void ThreadPool::setThreadAffinity(pthread_t thread, int core_id) {
    cpu_set_t single_core;
    CPU_ZERO(&single_core);
    CPU_SET(core_id, &single_core);
    pthread_setaffinity_np(thread, sizeof(single_core), &single_core);
}
}
// namespace sccl
src/utils/thread_pool.h
View file @
85db7de4
...
...
@@ -12,7 +12,7 @@ namespace sccl {
class
ThreadPool
{
public:
ThreadPool
(
size_t
);
ThreadPool
(
size_t
,
int
cpu_cord_offset
=
1
);
~
ThreadPool
();
// 将任务加入线程池队列并返回关联的future
...
...
@@ -28,6 +28,7 @@ public:
tasks
.
push
([
task
]()
{
(
*
task
)();
});
active_tasks
++
;
// 新任务增加活动任务计数
// printf("ThreadPool active_tasks++");
pthread_mutex_unlock
(
&
queue_mutex
);
pthread_cond_signal
(
&
condition
);
...
...
@@ -48,6 +49,9 @@ private:
int
active_tasks
;
// 追踪活动任务的数量
static
void
*
run
(
void
*
arg
);
// 用于设置线程的CPU亲和性
void
setThreadAffinity
(
pthread_t
thread
,
int
core_id
);
};
}
// namespace sccl
src/utils/utils.h
View file @
85db7de4
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_UTILS_H_
#define NCCL_UTILS_H_
#pragma once
#include "check.h"
#include <stdint.h>
...
...
@@ -14,6 +7,12 @@
#include <new>
namespace
sccl
{
/**
 * @brief Bind the calling thread to a single CPU core.
 *
 * Builds a CPU set holding only `coreid` and applies it to pthread_self()
 * with pthread_setaffinity_np. The result of that call is not inspected.
 *
 * @param coreid ID of the CPU core to bind the calling thread to.
 */
static inline void thread_bind_cpu(int coreid) {
    cpu_set_t only_core;
    CPU_ZERO(&only_core);
    CPU_SET(coreid, &only_core);
    pthread_setaffinity_np(pthread_self(), sizeof(only_core), &only_core);
}
// int ncclCudaCompCap();
...
...
@@ -533,5 +532,3 @@ namespace sccl {
// }
}
// namespace sccl
#endif
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment