#ifndef __INFINICCL_API_H__ #define __INFINICCL_API_H__ #include "infinirt.h" typedef enum { INFINICCL_SUM = 0, INFINICCL_PROD = 1, INFINICCL_MAX = 2, INFINICCL_MIN = 3, INFINICCL_AVG = 4, } infinicclReduceOp_t; struct InfinicclComm; typedef struct InfinicclComm *infinicclComm_t; /** * Initialize NCCL communicators (one per device). On Hygon DCU builds (ENABLE_HYGON_API), when * device_type is INFINI_DEVICE_HYGON and ndevice is 2/4/6/8, also allocates per-GPU shared buffers * (vLLM-style staging + Signal + rank_data) and wires infiniccl_ar::CustomAllreduce automatically; * otherwise custom path stays disabled until infinicclCommSetHygonCustomAllreduce is used. * * Hygon switch: INFINICCL_CUSTOM_ALLREDUCE=0 or off disables that wiring; infinicclAllReduce then * uses NCCL only for the same process (see infinicclAllReduce). */ __INFINI_C __export infiniStatus_t infinicclCommInitAll( infiniDevice_t device_type, infinicclComm_t *comms, int ndevice, const int *device_ids); __INFINI_C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm); /** * Hygon DCU only: attach an optional custom allreduce (opaque infiniccl_ar::CustomAllreduce*). * Other device types receive INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED. * When set, infinicclAllReduce may use it for SUM on f32/f16/bf16 payloads up to 8192 * 1024 bytes * (8 MiB); larger or unsupported cases use NCCL. * * If reg_buffer is non-null, sendbuf is copied to reg_buffer on the same stream before the custom * kernel (vLLM-style): fixed IPC-registered buffer for CUDA graph or unregistered sendbuf. * reg_buffer_bytes must be >= payload when reg_buffer is used. Pass custom_allreduce == nullptr to clear. * Do not call this after commInitAll has already auto-wired Hygon custom allreduce (returns BAD_PARAM). */ __INFINI_C __export infiniStatus_t infinicclCommSetHygonCustomAllreduce( infinicclComm_t comm, void *custom_allreduce, void *reg_buffer, size_t reg_buffer_bytes); /** * Hygon: optional custom allreduce for small SUM payloads (see comm init / setHygon). * Runtime switch: INFINICCL_CUSTOM_ALLREDUCE=0 or off forces NCCL even if custom objects were initialized. * Diagnostics to stderr: INFINICCL_CUSTOM_ALLREDUCE_DEBUG=1 (coarse path hints); * INFINICCL_CUSTOM_ALLREDUCE_TRACE=1 (first 128 custom kernel invocations and up to 48 NCCL fallbacks after try_custom, per OS process). */ __INFINI_C __export infiniStatus_t infinicclAllReduce( void *sendbuf, void *recvbuf, size_t count, infiniDtype_t dataype, infinicclReduceOp_t op, infinicclComm_t comm, infinirtStream_t stream); #endif