/* infiniccl.h: C declarations for InfiniCCL communicators and all-reduce. */
#ifndef __INFINICCL_API_H__
#define __INFINICCL_API_H__

#include "infinirt.h"

/**
 * Reduction operator selector; infinicclAllReduce takes one of these as its `op` argument.
 *
 * Every enumerator carries an explicit numeric value. Going by the names, the
 * operators are sum, product, maximum, minimum and average respectively.
 */
typedef enum {
    INFINICCL_SUM = 0,  /* sum */
    INFINICCL_PROD = 1, /* product */
    INFINICCL_MAX = 2,  /* maximum */
    INFINICCL_MIN = 3,  /* minimum */
    INFINICCL_AVG = 4,  /* average */
} infinicclReduceOp_t;

/* Communicator type. It is only forward-declared here, so its layout is not
 * visible through this header. */
struct InfinicclComm;

/* Communicator handle: a pointer to struct InfinicclComm, taken by the
 * infiniccl* functions declared in this header. */
typedef struct InfinicclComm *infinicclComm_t;

zhangyue's avatar
zhangyue committed
18
19
20
21
22
23
24
25
26
/**
 * Initialize NCCL communicators (one per device). On Hygon DCU builds (ENABLE_HYGON_API), when
 * device_type is INFINI_DEVICE_HYGON and ndevice is 2/4/6/8, also allocates per-GPU shared buffers
 * (vLLM-style staging + Signal + rank_data) and wires infiniccl_ar::CustomAllreduce automatically;
 * otherwise custom path stays disabled until infinicclCommSetHygonCustomAllreduce is used.
 *
 * Hygon switch: INFINICCL_CUSTOM_ALLREDUCE=0 or off disables that wiring; infinicclAllReduce then
 * uses NCCL only for the same process (see infinicclAllReduce).
 */
27
__INFINI_C __export infiniStatus_t infinicclCommInitAll(
28
29
30
31
32
    infiniDevice_t device_type,
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids);

33
__INFINI_C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm);
34

zhangyue's avatar
zhangyue committed
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/**
 * Hygon DCU only: attach an optional custom allreduce (opaque infiniccl_ar::CustomAllreduce*).
 * Other device types receive INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED.
 * When set, infinicclAllReduce may use it for SUM on f32/f16/bf16 payloads up to 8192 * 1024 bytes
 * (8 MiB); larger or unsupported cases use NCCL.
 *
 * If reg_buffer is non-null, sendbuf is copied to reg_buffer on the same stream before the custom
 * kernel (vLLM-style): fixed IPC-registered buffer for CUDA graph or unregistered sendbuf.
 * reg_buffer_bytes must be >= payload when reg_buffer is used. Pass custom_allreduce == nullptr to clear.
 * Do not call this after commInitAll has already auto-wired Hygon custom allreduce (returns BAD_PARAM).
 */
__INFINI_C __export infiniStatus_t infinicclCommSetHygonCustomAllreduce(
    infinicclComm_t comm,
    void *custom_allreduce,
    void *reg_buffer,
    size_t reg_buffer_bytes);

/**
 * Hygon: optional custom allreduce for small SUM payloads (see comm init / setHygon).
 * Runtime switch: INFINICCL_CUSTOM_ALLREDUCE=0 or off forces NCCL even if custom objects were initialized.
 * Diagnostics to stderr: INFINICCL_CUSTOM_ALLREDUCE_DEBUG=1 (coarse path hints);
 * INFINICCL_CUSTOM_ALLREDUCE_TRACE=1 (first 128 custom kernel invocations and up to 48 NCCL fallbacks after try_custom, per OS process).
 */
58
__INFINI_C __export infiniStatus_t infinicclAllReduce(
59
60
61
62
63
64
65
66
67
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t dataype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream);

#endif