Commit 7dc4e964 authored by wanghan's avatar wanghan
Browse files

Initial commit: RCCL auto-tuning project

parents
/**
* @file: hip_profile_common.h
* @brief: common definitions shared by hip_prof and profile.
* @author: lizhigong
* @date: 2021/05/11
*/
#ifndef HIP_HIP_PROFILE_COMMON_H
#define HIP_HIP_PROFILE_COMMON_H
#include <cstdint>
#include <map>
#include <bitset>
#include <cstring>
/* 32-bit tag type used for hip_prof_rccl_entry::kind. */
typedef uint32_t activity_kind_t;
/* 64-bit id type used for hip_prof_rccl_entry::correlation_id. */
typedef uint64_t activity_correlation_id_t;
/*
 * One profiling record for an RCCL operation: the collective arguments,
 * the values computed for it later (algorithm, protocol, channels, ...),
 * host timestamps and a derived bandwidth figure.
 * NOTE(review): `kind` presumably holds an entry_kind_t value
 * (RCCL_KIND_ID_KERNEL / RCCL_KIND_ID_API) - confirm against the producer.
 */
struct hip_prof_rccl_entry {
activity_kind_t kind;
uint32_t cid;
// NCCL Coll Args
const void* sendbuff;
void* recvbuff;
size_t count;
uint32_t datatype;
uint32_t op;
int rid; // peer for p2p operations
// Computed later
int algorithm;
int protocol;
uint8_t pattern;
int nChannels;
int nThreads;
size_t nBytes;
int chunkSize;
int channelId;
uint64_t elapsed;
/* Anonymous struct: its members are accessed directly on the entry. */
struct {
activity_correlation_id_t correlation_id; /* activity ID (uint64_t) */
uint64_t begin_ns; /* host begin timestamp (uint64_t) */
uint64_t end_ns; /* host end timestamp (uint64_t) */
};
/*
 * The two anonymous structs below share storage; only one view is
 * meaningful for a given record.
 * NOTE(review): presumably selected by `kind` - confirm.
 */
union {
struct {
int device_id; /* device id */
uint64_t queue_id; /* queue id */
};
struct {
uint32_t process_id; /* process id */
uint32_t thread_id; /* thread id */
};
};
uint32_t ret_stat;
const char* kernel_name;
double BW_GBps; // (nBytes/1.0E9) / [(end_ns-begin_ns)/1.0E9]
};
/* Profiler status codes: zero on success, negative values for failures. */
typedef enum {
  STATUS_SUCCESS       =  0,
  STATUS_ERROR         = -1,
  GET_TIMESTAMP_FAILED = -2,
  INVALID_KIND         = -3,
  INVALID_OP           = -4,
  API_RETURN_ERROR     = -5
} prof_error_t;
/* RCCL entry kinds; RCCL_KIND_ID_NUMBER is the count of kinds before it. */
typedef enum {
  RCCL_KIND_ID_KERNEL = 0,
  RCCL_KIND_ID_API    = 1,
  RCCL_KIND_ID_NUMBER = 2
} entry_kind_t;
#endif // HIP_HIP_PROFILE_COMMON_H
\ No newline at end of file
/* Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef NCCL_HSA_EXTRA_H_
#define NCCL_HSA_EXTRA_H_
#include "hsa/hsa.h"
#include <dlfcn.h>
#include <mutex>
#ifdef HYGON_SDMA_FEATURE
#ifdef DEFINE_SDMA_STRUCT
/*
 * Per-queue SDMA descriptor: queue id, signal ids, transfer size,
 * source/destination addresses, start/end timestamps and pointers to the
 * queue's write/read pointers.
 * NOTE(review): only compiled when DEFINE_SDMA_STRUCT is set; the layout
 * presumably has to match the definition used by the HSA runtime - confirm.
 */
typedef struct hsa_sdma_info_s {
uint32_t queue_id;
uint32_t dep_signal;
uint32_t completion_signal;
uint32_t data_size;
uint64_t src_addr;
uint64_t dst_addr;
uint64_t start_ts;
uint64_t end_ts;
uint64_t *wptr;
uint64_t *rptr;
uint64_t reserved[8];
} hsa_sdma_info_t;
/* Flag values for SDMA group queues: default (0) or profiling (1). */
typedef enum hsa_sdma_group_queue_flag_s {
  HSA_SDMA_GROUP_QUEUE_FLAG_DEFAULT   = 0,
  HSA_SDMA_GROUP_QUEUE_FLAG_PROFILING = 1
} hsa_sdma_group_queue_flag_t;
/* Capacity of the sdma_info array in hsa_sdma_group_queue_t. */
#define HSA_MAX_SDMA_QUEUE_NUM 7
/*
 * A group of SDMA queues: `queue_count` plus up to HSA_MAX_SDMA_QUEUE_NUM
 * pointers to per-queue descriptors.
 * NOTE(review): presumably only the first `queue_count` pointers are valid - confirm.
 */
typedef struct hsa_sdma_group_queue_s {
uint32_t queue_count;
hsa_sdma_info_t *sdma_info[HSA_MAX_SDMA_QUEUE_NUM];
} hsa_sdma_group_queue_t;
/*
 * Prototypes of the HSA SDMA extension entry points. Declared here only
 * when DEFINE_SDMA_STRUCT is set; no definitions are visible in this header.
 */
hsa_status_t hsa_ext_submit_sdma_task(hsa_agent_t agent, uint32_t count);
hsa_status_t hsa_ext_get_xhcl_link_count(hsa_agent_t src_agent,
hsa_agent_t dst_agent, uint32_t *link_count);
hsa_status_t hsa_ext_create_sdma_group_queue(hsa_agent_t src_agent,
hsa_agent_t dst_agent, uint32_t size, uint32_t flag,
hsa_sdma_group_queue_t *group_queue);
hsa_status_t hsa_ext_destroy_sdma_group_queue(hsa_agent_t agent);
#endif
/*
 * Function-pointer types for HSA entry points.
 * NOTE(review): presumably used with symbols looked up in the handle
 * returned by getHsaLib(); the lookup itself is not in this header.
 * NOTE(review): hsa_ext_submit_sdma_task_t takes (hsa_agent_t, int flags),
 * while the hsa_ext_submit_sdma_task prototype declared under
 * DEFINE_SDMA_STRUCT takes (hsa_agent_t, uint32_t count) - confirm which
 * signature matches the runtime.
 */
typedef hsa_status_t (*hsa_ext_submit_sdma_task_t)(hsa_agent_t agent, int flags);
typedef hsa_status_t (*hsa_ext_destroy_sdma_group_queue_t)(hsa_agent_t agent);
typedef hsa_status_t (*hsa_agent_get_info_t)(hsa_agent_t agent,hsa_agent_info_t attribute,void* value);
typedef hsa_status_t (*hsa_agent_callback_t)(hsa_agent_t agent, void* data);
typedef hsa_status_t (*hsa_iterate_agents_t)(hsa_status_t (*callback)(hsa_agent_t agent, void* data),void* data);
typedef hsa_status_t (*hsa_ext_get_xhcl_link_count_t)(hsa_agent_t src_agent,hsa_agent_t dst_agent, uint32_t *link_count);
typedef hsa_status_t (*hsa_ext_create_sdma_group_queue_t)(hsa_agent_t src_agent,hsa_agent_t dst_agent, uint32_t size, uint32_t flag,hsa_sdma_group_queue_t *group_queue);
/*
 * Cached dlopen handle of the HSA runtime and the mutex used by getHsaLib().
 * Both are `static` in a header, so every translation unit that includes
 * this file gets its own copy.
 */
static void *hsaLib = nullptr;
static std::mutex hsaMtx;
/*
 * Opens libhsa-runtime64.so with dlopen(RTLD_LAZY).
 *
 * If the RCCL_ROCR_PATH environment variable is set, the library is looked
 * up as "$RCCL_ROCR_PATH/libhsa-runtime64.so"; otherwise the bare file name
 * is passed to dlopen, which applies its normal search rules.
 *
 * Returns the dlopen handle, or nullptr after printing a diagnostic to
 * stderr when the path does not fit the local buffer or dlopen fails.
 */
static void* loadHsaLib() {
  const char *const libName = "libhsa-runtime64.so";
  char path[1024];
  const char *ncclRocrPath = getenv("RCCL_ROCR_PATH");
  /* Never hand a NULL pointer to "%s": that is undefined behaviour. */
  const char *shownPath = (ncclRocrPath != NULL) ? ncclRocrPath : "(unset)";

  int written;
  if (ncclRocrPath == NULL) {
    written = snprintf(path, sizeof(path), "%s", libName);
  } else {
    written = snprintf(path, sizeof(path), "%s/%s", ncclRocrPath, libName);
  }
  /* Refuse a truncated path rather than dlopen()ing some other file. */
  if (written < 0 || (size_t)written >= sizeof(path)) {
    fprintf(stderr, "ROCm runtime library path is too long (RCCL_ROCR_PATH=%s)\n", shownPath);
    return nullptr;
  }

  /* Distinct local name: do not shadow the file-scope hsaLib cache. */
  void *handle = dlopen(path, RTLD_LAZY);
  if (handle == nullptr) {
    const char *error = dlerror();
    fprintf(stderr, "Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s): %s\n",
            path, shownPath, error ? error : "Unknown error");
    return nullptr;
  }
  return handle;
}
/*
 * Returns the cached HSA runtime handle, loading it on first use.
 *
 * The lock is taken before every read of hsaLib: hsaLib is a plain pointer,
 * so reading it outside the lock while another thread assigns it under the
 * lock would be a data race. If loading fails the cache stays nullptr and
 * the next call tries again.
 */
static void* getHsaLib() {
  std::lock_guard<std::mutex> lock(hsaMtx);
  if (hsaLib == nullptr) {
    hsaLib = loadHsaLib();
  }
  return hsaLib;
}
#endif
#endif
#ifndef NCCL_IBV_CORE_H_
#define NCCL_IBV_CORE_H_
/* Basic IB verbs structs. Needed to dynamically load IB verbs functions without
* explicit including of IB verbs header.
*/
#include <stddef.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
/*
 * __attribute_const expands to __attribute__((const)) when __GNUC__ >= 3
 * and to nothing otherwise.
 */
#if __GNUC__ >= 3
# define __attribute_const __attribute__((const))
#else
# define __attribute_const
#endif
/*
 * 16-byte GID, viewable either as raw bytes or as two 64-bit halves
 * (global.subnet_prefix, global.interface_id).
 */
union ibv_gid {
uint8_t raw[16];
struct {
uint64_t subnet_prefix;
uint64_t interface_id;
} global;
};
/* Only defined here when no earlier header has provided container_of. */
#ifndef container_of
/**
* container_of - cast a member of a structure out to the containing structure
* @ptr: the pointer to the member.
* @type: the type of the container struct this is embedded in.
* @member: the name of the member within the struct.
*
* Subtracts offsetof(type, member) bytes from @ptr and casts the result
* to (type *).
*/
#define container_of(ptr, type, member) \
((type *) ((uint8_t *)(ptr) - offsetof(type, member)))
#endif
/* True when field `fld` of `type` starts before byte offset `sz`. */
#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz))
/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1;
/* Node type of an IB device (see ibv_device::node_type). */
enum ibv_node_type {
  IBV_NODE_UNKNOWN = -1,
  IBV_NODE_CA      = 1,
  IBV_NODE_SWITCH  = 2,
  IBV_NODE_ROUTER  = 3,
  IBV_NODE_RNIC    = 4,
  /* Values 5..31 are left free for future node types; experimental
   * node types start at 32.
   */
  IBV_EXP_NODE_TYPE_START = 32,
  IBV_EXP_NODE_MIC        = IBV_EXP_NODE_TYPE_START
};
/* Transport type of an IB device (see ibv_device::transport_type). */
enum ibv_transport_type {
  IBV_TRANSPORT_UNKNOWN = -1,
  IBV_TRANSPORT_IB      = 0,
  IBV_TRANSPORT_IWARP   = 1,
  /* Values 2..31 are left free for future transport types; experimental
   * transport types start at 32.
   */
  IBV_EXP_TRANSPORT_TYPE_START = 32,
  IBV_EXP_TRANSPORT_SCIF       = IBV_EXP_TRANSPORT_TYPE_START
};
/* Device capability bits; each enumerator is a single bit. */
enum ibv_device_cap_flags {
  IBV_DEVICE_RESIZE_MAX_WR         = 1 << 0,
  IBV_DEVICE_BAD_PKEY_CNTR         = 1 << 1,
  IBV_DEVICE_BAD_QKEY_CNTR         = 1 << 2,
  IBV_DEVICE_RAW_MULTI             = 1 << 3,
  IBV_DEVICE_AUTO_PATH_MIG         = 1 << 4,
  IBV_DEVICE_CHANGE_PHY_PORT       = 1 << 5,
  IBV_DEVICE_UD_AV_PORT_ENFORCE    = 1 << 6,
  IBV_DEVICE_CURR_QP_STATE_MOD     = 1 << 7,
  IBV_DEVICE_SHUTDOWN_PORT         = 1 << 8,
  IBV_DEVICE_INIT_TYPE             = 1 << 9,
  IBV_DEVICE_PORT_ACTIVE_EVENT     = 1 << 10,
  IBV_DEVICE_SYS_IMAGE_GUID        = 1 << 11,
  IBV_DEVICE_RC_RNR_NAK_GEN        = 1 << 12,
  IBV_DEVICE_SRQ_RESIZE            = 1 << 13,
  IBV_DEVICE_N_NOTIFY_CQ           = 1 << 14,
  /* bits 15..19 are not assigned here */
  IBV_DEVICE_XRC                   = 1 << 20,
  /* bits 21..28 are not assigned here */
  IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
};
/* Atomic capability level (see ibv_device_attr::atomic_cap). */
enum ibv_atomic_cap {
  IBV_ATOMIC_NONE = 0,
  IBV_ATOMIC_HCA  = 1,
  IBV_ATOMIC_GLOB = 2
};
/*
 * Device attribute record. A pointer to it is the output argument of the
 * query_device context op and of ibv_internal_query_device.
 * Local copy of the IB verbs declaration: field order and types must not
 * change, since the structure is exchanged with the dynamically loaded
 * verbs library.
 */
struct ibv_device_attr {
char fw_ver[64];
uint64_t node_guid;
uint64_t sys_image_guid;
uint64_t max_mr_size;
uint64_t page_size_cap;
uint32_t vendor_id;
uint32_t vendor_part_id;
uint32_t hw_ver;
int max_qp;
int max_qp_wr;
/* NOTE(review): presumably a mask of enum ibv_device_cap_flags bits - confirm. */
int device_cap_flags;
int max_sge;
int max_sge_rd;
int max_cq;
int max_cqe;
int max_mr;
int max_pd;
int max_qp_rd_atom;
int max_ee_rd_atom;
int max_res_rd_atom;
int max_qp_init_rd_atom;
int max_ee_init_rd_atom;
enum ibv_atomic_cap atomic_cap;
int max_ee;
int max_rdd;
int max_mw;
int max_raw_ipv6_qp;
int max_raw_ethy_qp;
int max_mcast_grp;
int max_mcast_qp_attach;
int max_total_mcast_qp_attach;
int max_ah;
int max_fmr;
int max_map_per_fmr;
int max_srq;
int max_srq_wr;
int max_srq_sge;
uint16_t max_pkeys;
uint8_t local_ca_ack_delay;
uint8_t phys_port_cnt;
};
/* MTU codes; the enumerator name carries the size (256 .. 4096). */
enum ibv_mtu {
  IBV_MTU_256  = 1,
  IBV_MTU_512  = 2,
  IBV_MTU_1024 = 3,
  IBV_MTU_2048 = 4,
  IBV_MTU_4096 = 5
};
/* Logical port state (see ibv_port_attr::state). */
enum ibv_port_state {
  IBV_PORT_NOP          = 0,
  IBV_PORT_DOWN         = 1,
  IBV_PORT_INIT         = 2,
  IBV_PORT_ARMED        = 3,
  IBV_PORT_ACTIVE       = 4,
  IBV_PORT_ACTIVE_DEFER = 5
};
/* Link-layer codes (anonymous enum). */
enum {
  IBV_LINK_LAYER_UNSPECIFIED = 0,
  IBV_LINK_LAYER_INFINIBAND  = 1,
  IBV_LINK_LAYER_ETHERNET    = 2,
  /* Values 3..31 are left free for future link layer types; experimental
   * link layers start at 32.
   */
  IBV_EXP_LINK_LAYER_START = 32,
  IBV_EXP_LINK_LAYER_SCIF  = IBV_EXP_LINK_LAYER_START
};
/* Port capability bits; bits 0, 10, 13, 15 and 20..23 are not assigned here. */
enum ibv_port_cap_flags {
  IBV_PORT_SM                        = 1 << 1,
  IBV_PORT_NOTICE_SUP                = 1 << 2,
  IBV_PORT_TRAP_SUP                  = 1 << 3,
  IBV_PORT_OPT_IPD_SUP               = 1 << 4,
  IBV_PORT_AUTO_MIGR_SUP             = 1 << 5,
  IBV_PORT_SL_MAP_SUP                = 1 << 6,
  IBV_PORT_MKEY_NVRAM                = 1 << 7,
  IBV_PORT_PKEY_NVRAM                = 1 << 8,
  IBV_PORT_LED_INFO_SUP              = 1 << 9,
  IBV_PORT_SYS_IMAGE_GUID_SUP        = 1 << 11,
  IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12,
  IBV_PORT_EXTENDED_SPEEDS_SUP       = 1 << 14,
  IBV_PORT_CM_SUP                    = 1 << 16,
  IBV_PORT_SNMP_TUNNEL_SUP           = 1 << 17,
  IBV_PORT_REINIT_SUP                = 1 << 18,
  IBV_PORT_DEVICE_MGMT_SUP           = 1 << 19,
  IBV_PORT_VENDOR_CLASS              = 1 << 24,
  IBV_PORT_CLIENT_REG_SUP            = 1 << 25,
  IBV_PORT_IP_BASED_GIDS             = 1 << 26
};
/*
 * Port attribute record. A pointer to it is the output argument of the
 * query_port context op and of ibv_internal_query_port.
 * Local copy of the IB verbs declaration; keep field order and types.
 */
struct ibv_port_attr {
enum ibv_port_state state;
enum ibv_mtu max_mtu;
enum ibv_mtu active_mtu;
int gid_tbl_len;
/* NOTE(review): presumably a mask of enum ibv_port_cap_flags bits - confirm. */
uint32_t port_cap_flags;
uint32_t max_msg_sz;
uint32_t bad_pkey_cntr;
uint32_t qkey_viol_cntr;
uint16_t pkey_tbl_len;
uint16_t lid;
uint16_t sm_lid;
uint8_t lmc;
uint8_t max_vl_num;
uint8_t sm_sl;
uint8_t subnet_timeout;
uint8_t init_type_reply;
uint8_t active_width;
uint8_t active_speed;
uint8_t phys_state;
/* NOTE(review): presumably one of the IBV_LINK_LAYER_* codes - confirm. */
uint8_t link_layer;
uint8_t reserved;
};
/* Asynchronous event codes (see ibv_async_event::event_type). */
enum ibv_event_type {
  IBV_EVENT_CQ_ERR              = 0,
  IBV_EVENT_QP_FATAL            = 1,
  IBV_EVENT_QP_REQ_ERR          = 2,
  IBV_EVENT_QP_ACCESS_ERR       = 3,
  IBV_EVENT_COMM_EST            = 4,
  IBV_EVENT_SQ_DRAINED          = 5,
  IBV_EVENT_PATH_MIG            = 6,
  IBV_EVENT_PATH_MIG_ERR        = 7,
  IBV_EVENT_DEVICE_FATAL        = 8,
  IBV_EVENT_PORT_ACTIVE         = 9,
  IBV_EVENT_PORT_ERR            = 10,
  IBV_EVENT_LID_CHANGE          = 11,
  IBV_EVENT_PKEY_CHANGE         = 12,
  IBV_EVENT_SM_CHANGE           = 13,
  IBV_EVENT_SRQ_ERR             = 14,
  IBV_EVENT_SRQ_LIMIT_REACHED   = 15,
  IBV_EVENT_QP_LAST_WQE_REACHED = 16,
  IBV_EVENT_CLIENT_REREGISTER   = 17,
  IBV_EVENT_GID_CHANGE          = 18,
  /* Experimental events start at 32, leaving a gap after the
   * standard events above.
   */
  IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32,
  IBV_EXP_EVENT_DCT_ACCESS_ERR    = 33,
  IBV_EXP_EVENT_DCT_REQ_ERR       = 34
};
/*
 * Asynchronous event: the object (or port number) the event refers to,
 * plus the event code.
 * NOTE(review): which `element` member is valid presumably depends on
 * `event_type` - confirm.
 */
struct ibv_async_event {
union {
struct ibv_cq *cq;
struct ibv_qp *qp;
struct ibv_srq *srq;
struct ibv_exp_dct *dct;
int port_num;
/* For source compatibility with the legacy API */
uint32_t xrc_qp_num;
} element;
enum ibv_event_type event_type;
};
/* Work-completion status codes (see ibv_wc::status); 0 is success. */
enum ibv_wc_status {
  IBV_WC_SUCCESS            = 0,
  IBV_WC_LOC_LEN_ERR        = 1,
  IBV_WC_LOC_QP_OP_ERR      = 2,
  IBV_WC_LOC_EEC_OP_ERR     = 3,
  IBV_WC_LOC_PROT_ERR       = 4,
  IBV_WC_WR_FLUSH_ERR       = 5,
  IBV_WC_MW_BIND_ERR        = 6,
  IBV_WC_BAD_RESP_ERR       = 7,
  IBV_WC_LOC_ACCESS_ERR     = 8,
  IBV_WC_REM_INV_REQ_ERR    = 9,
  IBV_WC_REM_ACCESS_ERR     = 10,
  IBV_WC_REM_OP_ERR         = 11,
  IBV_WC_RETRY_EXC_ERR      = 12,
  IBV_WC_RNR_RETRY_EXC_ERR  = 13,
  IBV_WC_LOC_RDD_VIOL_ERR   = 14,
  IBV_WC_REM_INV_RD_REQ_ERR = 15,
  IBV_WC_REM_ABORT_ERR      = 16,
  IBV_WC_INV_EECN_ERR       = 17,
  IBV_WC_INV_EEC_STATE_ERR  = 18,
  IBV_WC_FATAL_ERR          = 19,
  IBV_WC_RESP_TIMEOUT_ERR   = 20,
  IBV_WC_GENERAL_ERR        = 21
};
/*
 * Prototype only; no definition is visible in this header.
 * NOTE(review): presumably returns a printable name for `status` - confirm.
 */
const char *ibv_wc_status_str(enum ibv_wc_status status);
/* Work-completion opcodes (see ibv_wc::opcode). */
enum ibv_wc_opcode {
  IBV_WC_SEND       = 0,
  IBV_WC_RDMA_WRITE = 1,
  IBV_WC_RDMA_READ  = 2,
  IBV_WC_COMP_SWAP  = 3,
  IBV_WC_FETCH_ADD  = 4,
  IBV_WC_BIND_MW    = 5,
  /*
   * IBV_WC_RECV is a single bit (bit 7) so that consumers can detect a
   * receive completion with (opcode & IBV_WC_RECV).
   */
  IBV_WC_RECV               = 1 << 7,
  IBV_WC_RECV_RDMA_WITH_IMM = IBV_WC_RECV + 1
};
/* Work-completion flag bits. */
enum ibv_wc_flags {
  IBV_WC_GRH      = 1 << 0,
  IBV_WC_WITH_IMM = 1 << 1
};
/*
 * Work completion entry; an array of these is the output argument of the
 * poll_cq context op.
 * Local copy of the IB verbs declaration; keep field order and types.
 */
struct ibv_wc {
uint64_t wr_id;
enum ibv_wc_status status;
enum ibv_wc_opcode opcode;
uint32_t vendor_err;
uint32_t byte_len;
uint32_t imm_data; /* in network byte order */
uint32_t qp_num;
uint32_t src_qp;
/* NOTE(review): presumably a mask of enum ibv_wc_flags bits - confirm. */
int wc_flags;
uint16_t pkey_index;
uint16_t slid;
uint8_t sl;
uint8_t dlid_path_bits;
};
/* Memory access permission bits; bits 5..19 are not assigned here. */
enum ibv_access_flags {
  IBV_ACCESS_LOCAL_WRITE      = (1 << 0),
  IBV_ACCESS_REMOTE_WRITE     = (1 << 1),
  IBV_ACCESS_REMOTE_READ      = (1 << 2),
  IBV_ACCESS_REMOTE_ATOMIC    = (1 << 3),
  IBV_ACCESS_MW_BIND          = (1 << 4),
  IBV_ACCESS_RELAXED_ORDERING = (1 << 20)
};
/*
 * Protection domain object, as returned by the alloc_pd context op:
 * owning context plus a 32-bit handle.
 */
struct ibv_pd {
struct ibv_context *context;
uint32_t handle;
};
/* Bits for ibv_xrcd_init_attr::comp_mask. */
enum ibv_xrcd_init_attr_mask {
  IBV_XRCD_INIT_ATTR_FD       = 1 << 0,
  IBV_XRCD_INIT_ATTR_OFLAGS   = 1 << 1,
  IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2
};
/*
 * Arguments for the open_xrcd op of struct verbs_context.
 * NOTE(review): comp_mask presumably holds enum ibv_xrcd_init_attr_mask
 * bits saying which of fd/oflags are set - confirm.
 */
struct ibv_xrcd_init_attr {
uint32_t comp_mask;
int fd;
int oflags;
};
/* XRC domain object, as returned by the open_xrcd op; holds only its owning context. */
struct ibv_xrcd {
struct ibv_context *context;
};
/* Flag bits for memory-region re-registration. */
enum ibv_rereg_mr_flags {
  IBV_REREG_MR_CHANGE_TRANSLATION = 1 << 0,
  IBV_REREG_MR_CHANGE_PD          = 1 << 1,
  IBV_REREG_MR_CHANGE_ACCESS      = 1 << 2,
  IBV_REREG_MR_KEEP_VALID         = 1 << 3
};
/*
 * Memory region object, as returned by the reg_mr context op: owning
 * context and protection domain, the address/length pair, a handle and
 * the local/remote keys.
 */
struct ibv_mr {
struct ibv_context *context;
struct ibv_pd *pd;
void *addr;
size_t length;
uint32_t handle;
uint32_t lkey;
uint32_t rkey;
};
/* Memory window type passed to the alloc_mw context op. */
enum ibv_mw_type {
  IBV_MW_TYPE_1 = 1,
  IBV_MW_TYPE_2 = 2
};
/* Memory window object, as returned by the alloc_mw context op. */
struct ibv_mw {
struct ibv_context *context;
struct ibv_pd *pd;
uint32_t rkey;
};
/* Global routing fields; embedded in ibv_ah_attr as `grh`. */
struct ibv_global_route {
union ibv_gid dgid;
uint32_t flow_label;
uint8_t sgid_index;
uint8_t hop_limit;
uint8_t traffic_class;
};
/*
 * Global route header: packed version/tclass/flow word, payload length,
 * next header, hop limit, then source and destination GIDs.
 * NOTE(review): presumably laid out as on the wire (network byte order) - confirm.
 */
struct ibv_grh {
uint32_t version_tclass_flow;
uint16_t paylen;
uint8_t next_hdr;
uint8_t hop_limit;
union ibv_gid sgid;
union ibv_gid dgid;
};
/*
 * Link rate codes. Enumerators are listed by speed, not by code; the
 * numeric codes are not monotonic in the rate.
 */
enum ibv_rate {
  IBV_RATE_MAX      = 0,
  IBV_RATE_2_5_GBPS = 2,
  IBV_RATE_5_GBPS   = 5,
  IBV_RATE_10_GBPS  = 3,
  IBV_RATE_20_GBPS  = 6,
  IBV_RATE_30_GBPS  = 4,
  IBV_RATE_40_GBPS  = 7,
  IBV_RATE_60_GBPS  = 8,
  IBV_RATE_80_GBPS  = 9,
  IBV_RATE_120_GBPS = 10,
  IBV_RATE_14_GBPS  = 11,
  IBV_RATE_56_GBPS  = 12,
  IBV_RATE_112_GBPS = 13,
  IBV_RATE_168_GBPS = 14,
  IBV_RATE_25_GBPS  = 15,
  IBV_RATE_100_GBPS = 16,
  IBV_RATE_200_GBPS = 17,
  IBV_RATE_300_GBPS = 18
};
/*
 * Rate conversion prototypes. All four are declared with
 * __attribute_const; no definitions are visible in this header.
 */
/**
* ibv_rate_to_mult - Convert the IB rate enum to a multiple of the
* base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be
* converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
* @rate: rate to convert.
*/
int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const;
/**
* mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum.
* @mult: multiple to convert.
*/
enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const;
/**
* ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec.
* For example, IBV_RATE_5_GBPS will return the value 5000.
* @rate: rate to convert.
*/
int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const;
/**
* mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum.
* @mbps: value to convert.
*/
enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const;
/*
 * Address handle attributes; passed to the create_ah context op and
 * embedded twice in ibv_qp_attr (ah_attr, alt_ah_attr).
 * NOTE(review): `grh` is presumably only meaningful when is_global is
 * non-zero, and static_rate presumably holds an enum ibv_rate code - confirm.
 */
struct ibv_ah_attr {
struct ibv_global_route grh;
uint16_t dlid;
uint8_t sl;
uint8_t src_path_bits;
uint8_t static_rate;
uint8_t is_global;
uint8_t port_num;
};
/* Bits for the srq_attr_mask argument of the modify_srq context op. */
enum ibv_srq_attr_mask {
  IBV_SRQ_MAX_WR = 1 << 0,
  IBV_SRQ_LIMIT  = 1 << 1
};
/* Shared receive queue attributes; used by the modify_srq/query_srq context ops. */
struct ibv_srq_attr {
uint32_t max_wr;
uint32_t max_sge;
uint32_t srq_limit;
};
/* Arguments for the create_srq context op: caller context pointer plus attributes. */
struct ibv_srq_init_attr {
void *srq_context;
struct ibv_srq_attr attr;
};
/* SRQ type (see ibv_srq_init_attr_ex::srq_type). */
enum ibv_srq_type {
  IBV_SRQT_BASIC = 0,
  IBV_SRQT_XRC   = 1
};
/* Bits for ibv_srq_init_attr_ex::comp_mask. */
enum ibv_srq_init_attr_mask {
  IBV_SRQ_INIT_ATTR_TYPE     = 1 << 0,
  IBV_SRQ_INIT_ATTR_PD       = 1 << 1,
  IBV_SRQ_INIT_ATTR_XRCD     = 1 << 2,
  IBV_SRQ_INIT_ATTR_CQ       = 1 << 3,
  IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4
};
/*
 * Extended SRQ creation arguments for the create_srq_ex op of
 * struct verbs_context. Starts with the same two fields as
 * ibv_srq_init_attr.
 * NOTE(review): comp_mask presumably holds enum ibv_srq_init_attr_mask
 * bits marking which of the later fields are set - confirm.
 */
struct ibv_srq_init_attr_ex {
void *srq_context;
struct ibv_srq_attr attr;
uint32_t comp_mask;
enum ibv_srq_type srq_type;
struct ibv_pd *pd;
struct ibv_xrcd *xrcd;
struct ibv_cq *cq;
};
/* Queue pair types. IBV_QPT_RAW_PACKET and IBV_QPT_RAW_ETH share value 8. */
enum ibv_qp_type {
  IBV_QPT_RC = 2,
  IBV_QPT_UC = 3,
  IBV_QPT_UD = 4,
  /* XRC compatible code */
  IBV_QPT_XRC        = 5,
  IBV_QPT_RAW_PACKET = 8,
  IBV_QPT_RAW_ETH    = 8,
  IBV_QPT_XRC_SEND   = 9,
  IBV_QPT_XRC_RECV   = 10,
  /* Values 11..31 are left free for future qp types; experimental
   * qp types start at 32.
   */
  IBV_EXP_QP_TYPE_START = 32,
  IBV_EXP_QPT_DC_INI    = IBV_EXP_QP_TYPE_START
};
/* Queue pair capacity limits; embedded in ibv_qp_init_attr(_ex) and ibv_qp_attr as `cap`. */
struct ibv_qp_cap {
uint32_t max_send_wr;
uint32_t max_recv_wr;
uint32_t max_send_sge;
uint32_t max_recv_sge;
uint32_t max_inline_data;
};
/* Arguments for the create_qp context op (also an output of query_qp). */
struct ibv_qp_init_attr {
void *qp_context;
struct ibv_cq *send_cq;
struct ibv_cq *recv_cq;
struct ibv_srq *srq;
struct ibv_qp_cap cap;
enum ibv_qp_type qp_type;
int sq_sig_all;
/* Below is needed for backwards compatibility */
struct ibv_xrc_domain *xrc_domain;
};
/* Bits for ibv_qp_init_attr_ex::comp_mask. */
enum ibv_qp_init_attr_mask {
  IBV_QP_INIT_ATTR_PD       = 1 << 0,
  IBV_QP_INIT_ATTR_XRCD     = 1 << 1,
  IBV_QP_INIT_ATTR_RESERVED = 1 << 2
};
/*
 * Extended QP creation arguments for the create_qp_ex op of
 * struct verbs_context. The first seven fields match ibv_qp_init_attr.
 * NOTE(review): comp_mask presumably holds enum ibv_qp_init_attr_mask
 * bits marking which of pd/xrcd are set - confirm.
 */
struct ibv_qp_init_attr_ex {
void *qp_context;
struct ibv_cq *send_cq;
struct ibv_cq *recv_cq;
struct ibv_srq *srq;
struct ibv_qp_cap cap;
enum ibv_qp_type qp_type;
int sq_sig_all;
uint32_t comp_mask;
struct ibv_pd *pd;
struct ibv_xrcd *xrcd;
};
/* Bits for ibv_qp_open_attr::comp_mask. */
enum ibv_qp_open_attr_mask {
  IBV_QP_OPEN_ATTR_NUM      = 1 << 0,
  IBV_QP_OPEN_ATTR_XRCD     = 1 << 1,
  IBV_QP_OPEN_ATTR_CONTEXT  = 1 << 2,
  IBV_QP_OPEN_ATTR_TYPE     = 1 << 3,
  IBV_QP_OPEN_ATTR_RESERVED = 1 << 4
};
/*
 * Arguments for the open_qp op of struct verbs_context.
 * NOTE(review): comp_mask presumably holds enum ibv_qp_open_attr_mask
 * bits marking which fields are set - confirm.
 */
struct ibv_qp_open_attr {
uint32_t comp_mask;
uint32_t qp_num;
struct ibv_xrcd *xrcd;
void *qp_context;
enum ibv_qp_type qp_type;
};
/* Bits for the attr_mask argument of the modify_qp/query_qp context ops (bits 0..20). */
enum ibv_qp_attr_mask {
  IBV_QP_STATE               = 1 << 0,
  IBV_QP_CUR_STATE           = 1 << 1,
  IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2,
  IBV_QP_ACCESS_FLAGS        = 1 << 3,
  IBV_QP_PKEY_INDEX          = 1 << 4,
  IBV_QP_PORT                = 1 << 5,
  IBV_QP_QKEY                = 1 << 6,
  IBV_QP_AV                  = 1 << 7,
  IBV_QP_PATH_MTU            = 1 << 8,
  IBV_QP_TIMEOUT             = 1 << 9,
  IBV_QP_RETRY_CNT           = 1 << 10,
  IBV_QP_RNR_RETRY           = 1 << 11,
  IBV_QP_RQ_PSN              = 1 << 12,
  IBV_QP_MAX_QP_RD_ATOMIC    = 1 << 13,
  IBV_QP_ALT_PATH            = 1 << 14,
  IBV_QP_MIN_RNR_TIMER       = 1 << 15,
  IBV_QP_SQ_PSN              = 1 << 16,
  IBV_QP_MAX_DEST_RD_ATOMIC  = 1 << 17,
  IBV_QP_PATH_MIG_STATE      = 1 << 18,
  IBV_QP_CAP                 = 1 << 19,
  IBV_QP_DEST_QPN            = 1 << 20
};
/* Queue pair states (see ibv_qp::state and ibv_qp_attr::qp_state). */
enum ibv_qp_state {
  IBV_QPS_RESET   = 0,
  IBV_QPS_INIT    = 1,
  IBV_QPS_RTR     = 2,
  IBV_QPS_RTS     = 3,
  IBV_QPS_SQD     = 4,
  IBV_QPS_SQE     = 5,
  IBV_QPS_ERR     = 6,
  IBV_QPS_UNKNOWN = 7
};
/* Path migration state (see ibv_qp_attr::path_mig_state). */
enum ibv_mig_state {
  IBV_MIG_MIGRATED = 0,
  IBV_MIG_REARM    = 1,
  IBV_MIG_ARMED    = 2
};
/*
 * Queue pair attributes; passed to the modify_qp and query_qp context
 * ops together with an attr_mask.
 * NOTE(review): attr_mask (enum ibv_qp_attr_mask bits) presumably selects
 * which of these fields are read or written - confirm.
 * Local copy of the IB verbs declaration; keep field order and types.
 */
struct ibv_qp_attr {
enum ibv_qp_state qp_state;
enum ibv_qp_state cur_qp_state;
enum ibv_mtu path_mtu;
enum ibv_mig_state path_mig_state;
uint32_t qkey;
uint32_t rq_psn;
uint32_t sq_psn;
uint32_t dest_qp_num;
/* NOTE(review): presumably a mask of enum ibv_access_flags bits - confirm. */
int qp_access_flags;
struct ibv_qp_cap cap;
struct ibv_ah_attr ah_attr;
struct ibv_ah_attr alt_ah_attr;
uint16_t pkey_index;
uint16_t alt_pkey_index;
uint8_t en_sqd_async_notify;
uint8_t sq_draining;
uint8_t max_rd_atomic;
uint8_t max_dest_rd_atomic;
uint8_t min_rnr_timer;
uint8_t port_num;
uint8_t timeout;
uint8_t retry_cnt;
uint8_t rnr_retry;
uint8_t alt_port_num;
uint8_t alt_timeout;
};
/* Send work request opcodes (see ibv_send_wr::opcode). */
enum ibv_wr_opcode {
  IBV_WR_RDMA_WRITE           = 0,
  IBV_WR_RDMA_WRITE_WITH_IMM  = 1,
  IBV_WR_SEND                 = 2,
  IBV_WR_SEND_WITH_IMM        = 3,
  IBV_WR_RDMA_READ            = 4,
  IBV_WR_ATOMIC_CMP_AND_SWP   = 5,
  IBV_WR_ATOMIC_FETCH_AND_ADD = 6
};
/* Send flag bits. */
enum ibv_send_flags {
  IBV_SEND_FENCE     = 1 << 0,
  IBV_SEND_SIGNALED  = 1 << 1,
  IBV_SEND_SOLICITED = 1 << 2,
  IBV_SEND_INLINE    = 1 << 3
};
/* Scatter/gather element: address, length and local key; referenced by sg_list in work requests. */
struct ibv_sge {
uint64_t addr;
uint32_t length;
uint32_t lkey;
};
/*
 * Send work request; passed to the post_send context op (see
 * ibv_post_send). Requests chain through `next`.
 * NOTE(review): which member of `wr` is valid presumably depends on
 * `opcode` and the QP type, and send_flags presumably holds
 * enum ibv_send_flags bits - confirm.
 * Local copy of the IB verbs declaration; keep field order and types.
 */
struct ibv_send_wr {
uint64_t wr_id;
struct ibv_send_wr *next;
struct ibv_sge *sg_list;
int num_sge;
enum ibv_wr_opcode opcode;
int send_flags;
uint32_t imm_data; /* in network byte order */
union {
struct {
uint64_t remote_addr;
uint32_t rkey;
} rdma;
struct {
uint64_t remote_addr;
uint64_t compare_add;
uint64_t swap;
uint32_t rkey;
} atomic;
struct {
struct ibv_ah *ah;
uint32_t remote_qpn;
uint32_t remote_qkey;
} ud;
} wr;
/* qp_type.xrc.remote_srqn and xrc_remote_srq_num share the same storage. */
union {
union {
struct {
uint32_t remote_srqn;
} xrc;
} qp_type;
uint32_t xrc_remote_srq_num;
};
};
/* Receive work request; passed to the post_recv / post_srq_recv context ops. Chains through `next`. */
struct ibv_recv_wr {
uint64_t wr_id;
struct ibv_recv_wr *next;
struct ibv_sge *sg_list;
int num_sge;
};
/* Arguments for the bind_mw context op. */
struct ibv_mw_bind {
uint64_t wr_id;
struct ibv_mr *mr;
void *addr;
size_t length;
int send_flags;
int mw_access_flags;
};
/*
 * Shared receive queue object, as returned by the create_srq context op.
 * Local copy of the IB verbs declaration; the padding and legacy fields
 * must stay in place so the layout is unchanged.
 */
struct ibv_srq {
struct ibv_context *context;
void *srq_context;
struct ibv_pd *pd;
uint32_t handle;
pthread_mutex_t mutex;
pthread_cond_t cond;
uint32_t events_completed;
/* below are for source compatibility with legacy XRC,
* padding based on ibv_srq_legacy.
*/
uint32_t xrc_srq_num_bin_compat_padding;
struct ibv_xrc_domain *xrc_domain_bin_compat_padding;
struct ibv_cq *xrc_cq_bin_compat_padding;
void *ibv_srq_padding;
/* legacy fields */
uint32_t xrc_srq_num;
struct ibv_xrc_domain *xrc_domain;
struct ibv_cq *xrc_cq;
};
/* Not in use in new API, needed for compilation as part of source compat layer.
 * The single flag is bit 31 (0x80000000), which does not fit in a signed
 * 32-bit int.
 */
enum ibv_event_flags {
IBV_XRC_QP_EVENT_FLAG = 0x80000000,
};
/*
 * Queue pair object, as returned by the create_qp context op.
 * ibv_post_send() reaches the provider through qp->context->ops.
 * Local copy of the IB verbs declaration; keep field order and types.
 */
struct ibv_qp {
struct ibv_context *context;
void *qp_context;
struct ibv_pd *pd;
struct ibv_cq *send_cq;
struct ibv_cq *recv_cq;
struct ibv_srq *srq;
uint32_t handle;
uint32_t qp_num;
enum ibv_qp_state state;
enum ibv_qp_type qp_type;
pthread_mutex_t mutex;
pthread_cond_t cond;
uint32_t events_completed;
};
/* Completion channel: owning context, a file descriptor and a reference count. */
struct ibv_comp_channel {
struct ibv_context *context;
int fd;
int refcnt;
};
/*
 * Completion queue object, as returned by the create_cq context op.
 * Local copy of the IB verbs declaration; keep field order and types.
 */
struct ibv_cq {
struct ibv_context *context;
struct ibv_comp_channel *channel;
void *cq_context;
uint32_t handle;
int cqe;
pthread_mutex_t mutex;
pthread_cond_t cond;
uint32_t comp_events_completed;
uint32_t async_events_completed;
};
/* Address handle object, as returned by the create_ah context op. */
struct ibv_ah {
struct ibv_context *context;
struct ibv_pd *pd;
uint32_t handle;
};
/* Flow attribute flag bits. */
enum ibv_flow_flags {
  IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1 << 0,
  IBV_FLOW_ATTR_FLAGS_DONT_TRAP       = 1 << 1
};
/* Flow rule kinds (see ibv_flow_attr::type). */
enum ibv_flow_attr_type {
  /* steering according to rule specifications */
  IBV_FLOW_ATTR_NORMAL      = 0x0,
  /* default unicast and multicast rule -
   * receive all Eth traffic which isn't steered to any QP
   */
  IBV_FLOW_ATTR_ALL_DEFAULT = 0x1,
  /* default multicast rule -
   * receive all Eth multicast traffic which isn't steered to any QP
   */
  IBV_FLOW_ATTR_MC_DEFAULT  = 0x2
};
/* Flow specification kinds; TCP and UDP share the 0x40 group. */
enum ibv_flow_spec_type {
  IBV_FLOW_SPEC_ETH  = 0x20,
  IBV_FLOW_SPEC_IPV4 = 0x30,
  IBV_FLOW_SPEC_TCP  = 0x40,
  IBV_FLOW_SPEC_UDP  = 0x41
};
/* Ethernet match fields; used as both `val` and `mask` in ibv_flow_spec_eth. */
struct ibv_flow_eth_filter {
uint8_t dst_mac[6];
uint8_t src_mac[6];
uint16_t ether_type;
/*
* same layout as 802.1q: prio 3, cfi 1, vlan id 12
*/
uint16_t vlan_tag;
};
/* Ethernet flow specification: type/size header followed by value and mask filters. */
struct ibv_flow_spec_eth {
enum ibv_flow_spec_type type;
uint16_t size;
struct ibv_flow_eth_filter val;
struct ibv_flow_eth_filter mask;
};
/* IPv4 match fields; used as both `val` and `mask` in ibv_flow_spec_ipv4. */
struct ibv_flow_ipv4_filter {
uint32_t src_ip;
uint32_t dst_ip;
};
/* IPv4 flow specification: type/size header followed by value and mask filters. */
struct ibv_flow_spec_ipv4 {
enum ibv_flow_spec_type type;
uint16_t size;
struct ibv_flow_ipv4_filter val;
struct ibv_flow_ipv4_filter mask;
};
/* TCP/UDP port match fields; used as both `val` and `mask` in ibv_flow_spec_tcp_udp. */
struct ibv_flow_tcp_udp_filter {
uint16_t dst_port;
uint16_t src_port;
};
/* TCP/UDP flow specification: type/size header followed by value and mask filters. */
struct ibv_flow_spec_tcp_udp {
enum ibv_flow_spec_type type;
uint16_t size;
struct ibv_flow_tcp_udp_filter val;
struct ibv_flow_tcp_udp_filter mask;
};
/*
 * Any flow specification. Every union member begins with the same
 * (type, size) pair, which is also exposed as `hdr`.
 */
struct ibv_flow_spec {
union {
struct {
enum ibv_flow_spec_type type;
uint16_t size;
} hdr;
struct ibv_flow_spec_eth eth;
struct ibv_flow_spec_ipv4 ipv4;
struct ibv_flow_spec_tcp_udp tcp_udp;
};
};
/*
 * Flow rule header; passed to the create_flow op of struct verbs_context.
 * NOTE(review): presumably followed in memory by `num_of_specs` flow
 * specifications, and `flags` presumably holds enum ibv_flow_flags bits -
 * confirm.
 */
struct ibv_flow_attr {
uint32_t comp_mask;
enum ibv_flow_attr_type type;
uint16_t size;
uint16_t priority;
uint8_t num_of_specs;
uint8_t port;
uint32_t flags;
/* Following are the optional layers according to user request
* struct ibv_flow_spec_xxx [L2]
* struct ibv_flow_spec_yyy [L3/L4]
*/
};
/* Flow object, as returned by the create_flow op and consumed by destroy_flow. */
struct ibv_flow {
uint32_t comp_mask;
struct ibv_context *context;
uint32_t handle;
};
/* Forward declarations needed by the function-pointer signatures below. */
struct ibv_device;
struct ibv_context;
/*
 * Per-device context allocation hooks. verbs_get_device() checks
 * alloc_context to decide whether a device is embedded in a verbs_device.
 */
struct ibv_device_ops {
struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd);
void (*free_context)(struct ibv_context *context);
};
/* Buffer sizes for the name and path arrays in struct ibv_device. */
enum {
  IBV_SYSFS_NAME_MAX = 64,
  IBV_SYSFS_PATH_MAX = 256
};
/*
 * IB device descriptor: context hooks, node/transport type and the
 * device's names and sysfs paths.
 * Local copy of the IB verbs declaration; keep field order and types.
 */
struct ibv_device {
struct ibv_device_ops ops;
enum ibv_node_type node_type;
enum ibv_transport_type transport_type;
/* Name of underlying kernel IB device, eg "mthca0" */
char name[IBV_SYSFS_NAME_MAX];
/* Name of uverbs device, eg "uverbs0" */
char dev_name[IBV_SYSFS_NAME_MAX];
/* Path to infiniband_verbs class device in sysfs */
char dev_path[IBV_SYSFS_PATH_MAX];
/* Path to infiniband class device in sysfs */
char ibdev_path[IBV_SYSFS_PATH_MAX];
};
/*
 * Extended device wrapper. `device` must stay the first member:
 * verbs_get_device() recovers this structure from a struct ibv_device
 * pointer with container_of.
 */
struct verbs_device {
struct ibv_device device; /* Must be first */
size_t sz;
size_t size_of_context;
int (*init_context)(struct verbs_device *device,
struct ibv_context *ctx, int cmd_fd);
void (*uninit_context)(struct verbs_device *device,
struct ibv_context *ctx);
/* future fields added here */
};
/*
 * Table of provider entry points stored in every ibv_context (`ops`).
 * ibv_post_send() dispatches through ops.post_send.
 * Local copy of the IB verbs declaration: the order of the function
 * pointers is part of the layout and must not change.
 */
struct ibv_context_ops {
int (*query_device)(struct ibv_context *context,
struct ibv_device_attr *device_attr);
int (*query_port)(struct ibv_context *context, uint8_t port_num,
struct ibv_port_attr *port_attr);
struct ibv_pd * (*alloc_pd)(struct ibv_context *context);
int (*dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
int access);
struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr,
int flags,
struct ibv_pd *pd, void *addr,
size_t length,
int access);
int (*dereg_mr)(struct ibv_mr *mr);
struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type);
int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw,
struct ibv_mw_bind *mw_bind);
int (*dealloc_mw)(struct ibv_mw *mw);
struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe,
struct ibv_comp_channel *channel,
int comp_vector);
int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc);
int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
void (*cq_event)(struct ibv_cq *cq);
int (*resize_cq)(struct ibv_cq *cq, int cqe);
int (*destroy_cq)(struct ibv_cq *cq);
struct ibv_srq * (*create_srq)(struct ibv_pd *pd,
struct ibv_srq_init_attr *srq_init_attr);
int (*modify_srq)(struct ibv_srq *srq,
struct ibv_srq_attr *srq_attr,
int srq_attr_mask);
int (*query_srq)(struct ibv_srq *srq,
struct ibv_srq_attr *srq_attr);
int (*destroy_srq)(struct ibv_srq *srq);
int (*post_srq_recv)(struct ibv_srq *srq,
struct ibv_recv_wr *recv_wr,
struct ibv_recv_wr **bad_recv_wr);
struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
int attr_mask,
struct ibv_qp_init_attr *init_attr);
int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
int attr_mask);
int (*destroy_qp)(struct ibv_qp *qp);
int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr);
int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr,
struct ibv_recv_wr **bad_wr);
struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr);
int (*destroy_ah)(struct ibv_ah *ah);
int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
uint16_t lid);
int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
uint16_t lid);
void (*async_event)(struct ibv_async_event *event);
};
/*
 * Device context: the device it belongs to, its ops table, command and
 * async file descriptors, completion vector count, a mutex and the
 * abi_compat marker. Embedded as the last member of struct verbs_context.
 */
struct ibv_context {
struct ibv_device *device;
struct ibv_context_ops ops;
int cmd_fd;
int async_fd;
int num_comp_vectors;
pthread_mutex_t mutex;
void *abi_compat;
};
/*
 * Capability bits with 64-bit enumerator values (bits 0..3 and bit 62).
 * NOTE(review): presumably the bits stored in verbs_context::has_comp_mask - confirm.
 */
enum verbs_context_mask {
VERBS_CONTEXT_XRCD = (uint64_t)1 << 0,
VERBS_CONTEXT_SRQ = (uint64_t)1 << 1,
VERBS_CONTEXT_QP = (uint64_t)1 << 2,
VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3,
VERBS_CONTEXT_EXP = (uint64_t)1 << 62
};
/*
 * Extended context that wraps struct ibv_context. New ops are added at
 * the top, so existing members keep their distance from the end of the
 * structure. verbs_set_ctx_op() relies on this: it compares `sz` with
 * the distance from an op to the end of the structure.
 * `sz` must sit immediately before `context`, and `context` must remain
 * the last member.
 */
struct verbs_context {
/* "grows up" - new fields go here */
int (*_reserved_2) (void);
int (*destroy_flow) (struct ibv_flow *flow);
int (*_reserved_1) (void);
struct ibv_flow * (*create_flow) (struct ibv_qp *qp,
struct ibv_flow_attr *flow_attr);
struct ibv_qp * (*open_qp)(struct ibv_context *context,
struct ibv_qp_open_attr *attr);
struct ibv_qp * (*create_qp_ex)(struct ibv_context *context,
struct ibv_qp_init_attr_ex *qp_init_attr_ex);
int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num);
struct ibv_srq * (*create_srq_ex)(struct ibv_context *context,
struct ibv_srq_init_attr_ex *srq_init_attr_ex);
struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context,
struct ibv_xrcd_init_attr *xrcd_init_attr);
int (*close_xrcd)(struct ibv_xrcd *xrcd);
uint64_t has_comp_mask;
size_t sz; /* Must be immediately before struct ibv_context */
struct ibv_context context;/* Must be last field in the struct */
};
/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx)
{
return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ?
NULL : container_of(ctx, struct verbs_context, context);
}
#define verbs_get_ctx_op(ctx, op) ({ \
struct verbs_context *_vctx = verbs_get_ctx(ctx); \
(!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \
!_vctx->op) ? NULL : _vctx; })*/
/*
 * verbs_set_ctx_op - store `ptr` into member `op` of a verbs_context.
 * The store happens only when _vctx is non-NULL and its `sz` is at least
 * the distance from `op` to the end of struct verbs_context, i.e. the
 * context is large enough to contain that member.
 * Uses a GNU statement expression ({ ... }).
 */
#define verbs_set_ctx_op(_vctx, op, ptr) ({ \
struct verbs_context *vctx = _vctx; \
if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \
vctx->op = ptr; })
/*
 * Returns the verbs_device that embeds `dev`, or NULL when the device's
 * ops.alloc_context hook is set. `dev` must not be NULL.
 */
static inline struct verbs_device *verbs_get_device(struct ibv_device *dev)
{
  if (dev->ops.alloc_context) {
    return NULL;
  }
  return container_of(dev, struct verbs_device, device);
}
static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
return qp->context->ops.post_send(qp, wr, bad_wr);
}
#endif // NCCL_IBV_CORE_H_
#ifndef NCCL_IBV_SYMBOLS_H_
#define NCCL_IBV_SYMBOLS_H_
#ifdef NCCL_BUILD_RDMA_CORE
#include <infiniband/verbs.h>
#else
#include "ibvcore.h"
#endif
#include "nccl.h"
/*
 * IB Verbs function pointers.
 * One pointer per verbs entry point used by the wrappers; the table is
 * populated by buildIbvSymbols().
 */
struct ncclIbvSymbols {
int (*ibv_internal_fork_init)(void);
struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
void (*ibv_internal_free_device_list)(struct ibv_device **list);
const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
int (*ibv_internal_close_device)(struct ibv_context *context);
int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);
/* DMA-BUF support */
struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
};
/*
 * Constructs IB verbs symbols per rdma-core linking or dynamic loading mode.
 * Fills the table pointed to by `ibvSymbols`; the result is reported as an
 * ncclResult_t. Prototype only - the definition is not in this header.
 */
ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
#endif // NCCL_IBV_SYMBOLS_H_
/*************************************************************************
* Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
* Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved.
* Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2005 PathScale, Inc. All rights reserved.
*
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_IBVWRAP_H_
#define NCCL_IBVWRAP_H_
#ifdef NCCL_BUILD_RDMA_CORE
#include <infiniband/verbs.h>
#else
#include "ibvcore.h"
#endif
#include "core.h"
#include <sys/types.h>
#include <unistd.h>
/* Return code for IB verbs calls; only the success value (0) is defined. */
typedef enum ibv_return_enum {
  IBV_SUCCESS = 0 /* The operation was successful */
} ibv_return_t;
// Entry point for setting up the IB verbs symbols used by the wrappers below.
// NOTE(review): presumably must succeed before any other wrap_ibv_* call — confirm in the implementation.
ncclResult_t wrap_ibv_symbols(void);
/* NCCL wrappers of IB verbs functions */
// Convention visible in these signatures: wrappers return an ncclResult_t, and
// when the underlying verbs call produces an object it is delivered through the
// leading `ret` out-parameter. The wrap_direct_* variants instead return the
// struct ibv_mr pointer itself.
ncclResult_t wrap_ibv_fork_init(void);
ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list);
const char *wrap_ibv_get_device_name(struct ibv_device *device);
ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device);
ncclResult_t wrap_ibv_close_device(struct ibv_context *context);
ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event);
ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event);
ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr);
ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context);
ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
// Memory-region registration: status-returning and direct (pointer-returning) forms.
ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
/* DMA-BUF support */
// Registration of a memory region identified by a dma-buf file descriptor `fd`.
ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
// Completion channel and completion queue lifetime management.
ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq);
// Polls a completion queue by invoking the provider's poll_cq operation
// through cq->context->ops.
//
// cq          - completion queue to poll
// num_entries - maximum number of work completions to fetch
// wc          - array receiving the work completions
// num_done    - out: number of completions fetched (written only on success)
//
// Returns ncclSuccess when the provider reports a non-negative count, or
// ncclSystemError (after logging a warning) when it reports a negative value.
static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) {
  // The provider returns the number of completions (possibly 0), or a negative number on failure.
  const int polled = cq->context->ops.poll_cq(cq, num_entries, wc);
  if (polled >= 0) {
    *num_done = polled;
    return ncclSuccess;
  }
  WARN("Call to ibv_poll_cq() returned %d", polled);
  return ncclSystemError;
}
// Queue pair lifetime management. The created queue pair is delivered through
// the `ret` out-parameter; all three return an ncclResult_t status.
ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
// Posts a chain of send work requests by invoking the provider's post_send
// operation through qp->context->ops.
//
// qp     - queue pair to post to
// wr     - first work request of the chain to post
// bad_wr - out: set by the provider to the work request that failed
//
// Returns ncclSuccess when the provider returns IBV_SUCCESS (0); otherwise logs
// a warning and returns ncclSystemError.
static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
  // The provider returns 0 on success, or an errno value describing the failure.
  int ret = qp->context->ops.post_send(qp, wr, bad_wr);
  if (ret != IBV_SUCCESS) {
    // Argument order matches the labels: *bad_wr is the failing request, wr is
    // the head of the chain that was submitted.
    WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), *bad_wr, wr);
    return ncclSystemError;
  }
  return ncclSuccess;
}
// Posts a chain of receive work requests by invoking the provider's post_recv
// operation through qp->context->ops.
//
// qp     - queue pair to post to
// wr     - first work request of the chain to post
// bad_wr - out-parameter forwarded unchanged to the provider
//
// Returns ncclSuccess when the provider returns IBV_SUCCESS (0); otherwise logs
// a warning and returns ncclSystemError.
static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) {
  // The provider returns 0 on success, or an errno value describing the failure.
  const int status = qp->context->ops.post_recv(qp, wr, bad_wr);
  if (status == IBV_SUCCESS) {
    return ncclSuccess;
  }
  WARN("ibv_post_recv() failed with error %s", strerror(status));
  return ncclSystemError;
}
// Wrapper for obtaining the string describing an async event type; the string
// pointer is delivered through the `ret` out-parameter.
// NOTE(review): ownership/lifetime of the returned string is not visible here — confirm.
ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
#endif //End include guard
/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INFO_H_
#define NCCL_INFO_H_
#include "nccl.h"
#include "devcomm.h"
#include "collectives.h"
#include "core.h"
#include "utils.h"
#include "strongstream.h"
// Communication pattern identifiers. The underlying type is fixed to uint8_t,
// so a pattern occupies a single byte (see ncclInfo::pattern).
typedef enum : uint8_t {
ncclPatternRing,
ncclPatternRingTwice,
ncclPatternPipelineFrom,
ncclPatternPipelineTo,
ncclPatternTreeUp,
ncclPatternTreeDown,
ncclPatternTreeUpDown,
ncclPatternCollnetChain,
ncclPatternCollnetDirect,
ncclPatternNvls,
ncclPatternNvlsTree,
ncclPatternSend,
ncclPatternRecv
} ncclPattern_t;
// Used to pass NCCL call information between functions.
// The first group of fields mirrors the arguments of the user-facing call; the
// "Computed later" group is filled in after the call has been received
// (nBytes, and possibly count/datatype, are set by ncclInfoSetDerived()).
struct ncclInfo {
ncclFunc_t coll;
const char* opName;
// NCCL Coll Args
const void* sendbuff;
void* recvbuff;
size_t count;
ncclDataType_t datatype;
ncclRedOp_t op;
int root; // peer for p2p operations
ncclComm_t comm;
cudaStream_t stream;
// Algorithm details
int chunkSteps;
int sliceSteps;
// Computed later
ncclDevRedOpFull opFull;
int algorithm;
int protocol;
ncclPattern_t pattern;
int nChannels;
int nThreads;
size_t nBytes;
int nstepsPerLoop;
int nchunksPerLoop;
int chunkSize;
int channelId;
};
// Fills in the size-related fields of `info` that derive from the call
// arguments.
//
// - nBytes is set to count * size of the datatype.
// - For AllGather, Broadcast and AllToAllPivot the operation is re-expressed in
//   bytes: count becomes nBytes and datatype becomes ncclInt8.
// - For AllGather and ReduceScatter the user count is per rank, so nBytes is
//   multiplied by nRanks.
//
// Always returns ncclSuccess.
inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
  const bool treatAsBytes = info->coll == ncclFuncAllGather ||
                            info->coll == ncclFuncBroadcast ||
                            info->coll == ncclFuncAllToAllPivot;
  const bool countIsPerRank = info->coll == ncclFuncAllGather ||
                              info->coll == ncclFuncReduceScatter;

  info->nBytes = info->count * ncclTypeSize(info->datatype);

  if (treatAsBytes) {
    info->count = info->nBytes;
    info->datatype = ncclInt8;
  }

  if (countIsPerRank) {
    info->nBytes *= nRanks; // count is per rank
  }

  return ncclSuccess;
}
// Node describing one collective task. Nodes are chained through `next`
// (ncclTasks::collQueue is an intrusive queue over this member).
struct ncclTaskColl {
struct ncclTaskColl* next;
ncclFunc_t func;
void const* sendbuff;
void* recvbuff;
size_t count;
int root;
ncclDataType_t datatype;
ncclDevRedOpFull op;
int chunkSteps, sliceSteps;
};
// Node describing one point-to-point task. Nodes are chained through `next`
// (ncclTasks::Peer keeps intrusive send/recv queues over this member).
struct ncclTaskP2p {
ncclTaskP2p *next;
void *buff;
size_t bytes;
// Stateful chunk index. If a p2p gets "cut" over two plans this keeps track
// of where it left off.
int chunk;
};
// Singly-linked list node holding one stream (used by ncclTasks::streams).
struct ncclCudaStreamList {
struct ncclCudaStreamList *next;
cudaStream_t stream;
};
// Aggregate of pending collective and point-to-point tasks, together with the
// user streams they were submitted on.
struct ncclTasks {
// Per-peer p2p state: pending send and receive task queues plus "seen" flags.
struct Peer {
bool sendSeen, recvSeen;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
};
// Pending collective tasks.
struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
size_t collBytesTotal;
// Array with one entry per rank.
struct Peer* peers/*[nRanks]*/;
int *p2pSendOrder, *p2pRecvOrder;
int p2pOrderSteps;
// Task counters for collectives and p2p operations.
int nTasksColl, nTasksP2p;
// The list of user streams aggregated over all tasks present.
struct ncclCudaStreamList* streams;
// Keep track of the number of user streams
int numStreams;
// The most recent user stream. Ignored if streams==nullptr
cudaStream_t streamRecent;
// The graph capturing all user streams or invalid if none. Thus we restrict the
// user that all streams must be captured in the same graph or not captured
// at all. Technically we could probably relax this, but that would mean
// collecting a different `ncclTasks` per graph and one for non-graph.
struct ncclCudaGraph capturingGraph;
};
#endif
/*
* Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
*
* See COPYRIGHT for license information
*/
#ifndef NCCL_IPCSOCKET_H
#define NCCL_IPCSOCKET_H
#include "nccl.h"
#include <stdio.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <memory.h>
#include <sys/un.h>
#include <inttypes.h>
// Size in bytes of the buffer holding an IPC socket name.
#define NCCL_IPC_SOCKNAME_LEN 64
// Handle for an IPC socket: the socket file descriptor, its name, and a
// pointer to an externally owned abort flag.
struct ncclIpcSocket {
int fd;
char socketName[NCCL_IPC_SOCKNAME_LEN];
// NOTE(review): presumably polled to interrupt blocking operations — confirm in the implementation.
volatile uint32_t* abortFlag;
};
// IPC socket API. All functions operate on a caller-provided handle and return
// an ncclResult_t status.
// Init takes a (rank, hash) pair and an abort flag pointer; SendFd takes the
// (rank, hash) pair of the destination together with the descriptor to send;
// RecvFd delivers the received descriptor through `fd`.
// NOTE(review): how (rank, hash) maps to a socket name is defined in the implementation — confirm.
ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
#endif /* NCCL_IPCSOCKET_H */
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef MSCCL_KERNEL_H_
#define MSCCL_KERNEL_H_
// Builds the kernel symbol name mscclKernel_<devredop>_<type>_<proto>_<fullOps>.
#define MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps) mscclKernel_##devredop##_##type##_##proto##_##fullOps
// Declares one __global__ kernel entry point for a given reduction op, element
// type, protocol and fullOps flag. The expansion already ends with ';'.
#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, proto, fullOps) \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work);
// Declares the kernels of one (op, type) pair for the three protocols LL, LL128 and Simple.
#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL128, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, Simple, fullOps)
// Declares the kernels of one reduction op for all ten element types
// (six integer types plus half, float, double and rccl_bfloat16).
#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, rccl_bfloat16, fullOps)
// Same as above but restricted to the six integer element types.
#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(devredop, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t, fullOps) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t, fullOps)
// Declares every kernel entry point: Sum, Prod, Min, Max and PreMulSum for all
// element types, and SumPostDiv for integer types only. fullOps is always `false` here.
#define MSCCL_DECL_KERNEL_ENTRY_FUNC() \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum, false) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod, false) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Max, false) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum, false) \
MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv, false)
// Emit the declarations into every translation unit including this header.
MSCCL_DECL_KERNEL_ENTRY_FUNC()
#endif
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef MSCCL_LIFECYCLE_H_
#define MSCCL_LIFECYCLE_H_
#include "enqueue.h"
#include "msccl/msccl_struct.h"
// MSCCL lifecycle API. Semantics are defined in the implementation file; the
// notes below are limited to what the signatures show.
// State queries.
bool mscclEnabled();
// Set / clear / query the "is caller" flag.
// NOTE(review): presumably backed by mscclThreadLocalStatus::mscclIsCallerFlag — confirm.
void mscclSetIsCallerFlag();
void mscclClearIsCallerFlag();
bool mscclIsCaller();
bool mscclAvailable();
// Initialization for a communicator.
ncclResult_t mscclInit(ncclComm_t comm);
ncclResult_t mscclGroupStart();
// Entry point taking the union of arguments of the supported operations
// (buffers, per-peer counts and displacements, count, datatype, root, peer,
// reduction op), the operation kind `mscclFunc`, the communicator and stream.
ncclResult_t mscclEnqueueCheck(
const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
size_t count, ncclDataType_t datatype, int root, int peer, ncclRedOp_t op,
mscclFunc_t mscclFunc, ncclComm_t comm, hipStream_t stream);
ncclResult_t mscclGroupEnd();
ncclResult_t mscclTeardown();
size_t mscclKernMaxLocalSize();
#endif
/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef MSCCL_PARSER_H_
#define MSCCL_PARSER_H_
#include "nccl.h"
#include "debug.h"
#include "checks.h"
#include <stdlib.h>
#include "msccl/msccl_struct.h"
// A few constraints to make the implementation easy
// Maximum string length (buffers are declared with MAX_STR_LEN+1 bytes).
#define MAX_STR_LEN 255
// Maximum number of attributes per XML node.
#define MAX_ATTR_COUNT 16
// Maximum number of child nodes per XML node.
#define MAX_SUBS 1024
// Maximum number of nodes in a parsed XML document.
#define MAX_NODES 4096
// Values for mscclXmlNode::type.
#define NODE_TYPE_NONE 0
#define NODE_TYPE_OPEN 1
#define NODE_TYPE_CLOSE 2
#define NODE_TYPE_SINGLE 3
// One XML element: its tag name, key/value attributes, node type
// (NODE_TYPE_*), parent link and child pointers. All storage is fixed-size.
struct mscclXmlNode {
char name[MAX_STR_LEN+1];
struct {
char key[MAX_STR_LEN+1];
char value[MAX_STR_LEN+1];
} attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
int nAttrs; // number of valid entries in attrs
int type;
struct mscclXmlNode* parent;
struct mscclXmlNode* subs[MAX_SUBS];
int nSubs; // number of valid entries in subs
};
// Fixed-capacity pool of XML nodes. mscclXmlFindTag() scans nodes[0..maxIndex).
struct mscclXml {
struct mscclXmlNode nodes[MAX_NODES];
int maxIndex;
};
// Looks up an attribute of `node` by name.
//
// node     - node whose attributes are searched
// attrName - attribute key to look for (compared over at most MAX_STR_LEN characters)
// index    - out: position of the first matching attribute, or -1 if none matches
//
// Always returns ncclSuccess; "not found" is reported through *index only.
static ncclResult_t mscclXmlGetAttrIndex(struct mscclXmlNode* node, const char* attrName, int* index) {
  *index = -1;
  const int attrCount = node->nAttrs;
  int pos = 0;
  while (pos < attrCount) {
    if (strncmp(node->attrs[pos].key, attrName, MAX_STR_LEN) == 0) {
      *index = pos;
      break;
    }
    pos++;
  }
  return ncclSuccess;
}
// Fetches the value string of an attribute of `node`.
//
// node     - node whose attributes are searched
// attrName - attribute key to look for
// value    - out: pointer to the value stored inside the node, or NULL when the
//            attribute does not exist
//
// Returns ncclSuccess unless the index lookup reports an error.
static ncclResult_t mscclXmlGetAttr(struct mscclXmlNode* node, const char* attrName, const char** value) {
  int attrIdx;
  NCCLCHECK(mscclXmlGetAttrIndex(node, attrName, &attrIdx));
  if (attrIdx == -1) {
    *value = NULL;
  } else {
    *value = node->attrs[attrIdx].value;
  }
  return ncclSuccess;
}
// Fetches the value string of a mandatory attribute of `node`.
//
// node     - node whose attributes are searched
// attrName - attribute key to look for
// value    - out: pointer to the value stored inside the node
//
// Returns ncclSuccess when the attribute exists; otherwise logs a warning and
// returns ncclInternalError (with *value left as NULL).
static ncclResult_t mscclXmlGetAttrStr(struct mscclXmlNode* node, const char* attrName, const char** value) {
  NCCLCHECK(mscclXmlGetAttr(node, attrName, value));
  if (*value != NULL) {
    return ncclSuccess;
  }
  WARN("Attribute %s of node %s not found", attrName, node->name);
  return ncclInternalError;
}
// Reads a mandatory attribute of `node` and converts it to an int.
//
// node     - node whose attributes are searched
// attrName - attribute key to look for
// value    - out: result of strtol() on the attribute text with base 0
//            (so decimal, 0x-prefixed hex and 0-prefixed octal are accepted)
//
// Returns the error from the attribute lookup when the attribute is missing,
// ncclSuccess otherwise. The conversion itself is not validated.
static ncclResult_t mscclXmlGetAttrInt(struct mscclXmlNode* node, const char* attrName, int* value) {
  const char* text;
  NCCLCHECK(mscclXmlGetAttrStr(node, attrName, &text));
  const long parsed = strtol(text, NULL, 0);
  *value = parsed;
  return ncclSuccess;
}
// Reads a mandatory attribute of `node` and converts it to a 64-bit integer.
//
// node     - node whose attributes are searched
// attrName - attribute key to look for
// value    - out: result of strtoll() on the attribute text with base 0
//
// Returns the error from the attribute lookup when the attribute is missing,
// ncclSuccess otherwise. The conversion itself is not validated.
static ncclResult_t mscclXmlGetAttrInt64(struct mscclXmlNode* node, const char* attrName, int64_t* value) {
  const char* text;
  NCCLCHECK(mscclXmlGetAttrStr(node, attrName, &text));
  const long long parsed = strtoll(text, NULL, 0);
  *value = parsed;
  return ncclSuccess;
}
// Finds the first node of `xml` whose name equals `tagName`.
//
// xml     - document to search; nodes[0..maxIndex) are examined in order
// tagName - tag name to match exactly
// node    - out: the first matching node, or NULL when there is none
//
// Always returns ncclSuccess; "not found" is reported through *node only.
static ncclResult_t mscclXmlFindTag(struct mscclXml* xml, const char* tagName, struct mscclXmlNode** node) {
  *node = NULL;
  for (int idx = 0; idx < xml->maxIndex; ++idx) {
    struct mscclXmlNode* candidate = &xml->nodes[idx];
    if (strcmp(candidate->name, tagName) != 0) {
      continue;
    }
    *node = candidate;
    break;
  }
  return ncclSuccess;
}
// Parse an MSCCL algorithm XML file.
// The first form fills a full mscclAlgo for the given rank; the second fills
// only the mscclAlgoMeta summary. Both return an ncclResult_t status.
ncclResult_t mscclGetAlgoFromXmlFile(const char* xmlGraphFile, struct mscclAlgo* algo, int rank);
ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* xmlGraphFile, struct mscclAlgoMeta* algoMeta);
#endif
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef MSCCL_SCHEDULER_H_
#define MSCCL_SCHEDULER_H_
// Operation kinds known to MSCCL. Values are explicit and contiguous;
// mscclNumFuncs is the number of kinds, not an operation itself.
typedef enum { mscclFuncReduce = 0,
mscclFuncBroadcast = 1,
mscclFuncAllReduce = 2,
mscclFuncReduceScatter = 3,
mscclFuncAllGather = 4,
mscclFuncSend = 5,
mscclFuncRecv = 6,
mscclFuncGather = 7,
mscclFuncScatter = 8,
mscclFuncAllToAll = 9,
mscclFuncAllToAllv = 10,
mscclNumFuncs = 11 } mscclFunc_t;
// Parameter block passed to mscclSchedulerInterface::selectAlgo().
// It carries the arguments of the operation, the caller's rank information and
// fields for the scheduler's answer.
struct mscclSchedulerParam {
// Operation arguments (buffers, per-peer counts/displacements, element count and type).
const void* sendBuff;
const size_t* sendCounts;
const size_t* sDisPls;
void* recvBuff;
const size_t* recvCounts;
const size_t* rDisPls;
size_t count;
ncclDataType_t dataType;
int root;
int peer;
ncclRedOp_t op;
mscclFunc_t func;
// Caller's rank and the total number of ranks.
int rank;
int nRanks;
// NOTE(review): presumably outputs of the scheduler (whether an algorithm was
// selected and which one) — confirm against the scheduler implementation.
bool scheduled;
mscclAlgoHandle_t handle;
uint64_t opCount;
};
// Table of entry points a scheduler provides. Every callback returns an
// ncclResult_t status.
typedef struct {
// Name of the scheduler (mainly for logs)
const char* name;
// Load all algorithms
ncclResult_t (*init)();
// Select an algorithm
ncclResult_t (*selectAlgo)(struct mscclSchedulerParam* param);
// Unload all algorithms
ncclResult_t (*teardown)();
} mscclSchedulerInterface;
#endif
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef MSCCL_SETUP_H_
#define MSCCL_SETUP_H_
#include <hip/hip_runtime.h>
#include "comm.h"
#include "msccl/msccl_struct.h"
// MSCCL setup API. Each function returns an ncclResult_t status; the notes
// below are limited to what the signatures show.
// Stream-related setup steps.
ncclResult_t mscclGetCaptureStatus(hipStream_t stream);
ncclResult_t mscclSetupScratch(struct mscclAlgo* hostAlgo, hipStream_t stream);
ncclResult_t mscclSetupSyncFlags(hipStream_t stream);
// Per-algorithm, per-communicator setup steps operating on the host copy of the algorithm.
ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo, ncclComm_t comm);
ncclResult_t mscclSetupCount(struct mscclAlgo* hostAlgo, ncclComm_t comm, size_t count, ncclDataType_t dataType);
ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm, hipStream_t stream);
// Kernel setup; takes both the host and the device copy of the algorithm.
ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count,
ncclDataType_t dataType, ncclRedOp_t op, struct mscclAlgo* hostAlgo, struct mscclAlgo* devAlgo,
ncclComm_t comm, hipStream_t stream);
// Lifetime management of a work FIFO status object.
ncclResult_t mscclInitWorkFifoStatus(mscclWorkFifoStatus* workFifoStatus);
ncclResult_t mscclDestroyWorkFifoStatus(mscclWorkFifoStatus* workFifoStatus);
#endif
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef MSCCL_STATUS_H_
#define MSCCL_STATUS_H_
#include "msccl/msccl_struct.h"
// Accessors returning references to MSCCL state objects.
// NOTE(review): storage duration (global vs. thread-local) is decided in the
// implementation and is not visible here — confirm before relying on it.
mscclStatus& mscclGetStatus();
mscclSavedProxyArgs& mscclGetSavedProxyArgs();
mscclThreadLocalStatus& mscclGetThreadLocalStatus();
#endif
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef MSCCL_STRUCT_H_
#define MSCCL_STRUCT_H_
#include <cstdint>
#include <map>
#include <set>
#include <vector>
#include "devcomm.h"
#include "msccl/msccl_scheduler.h"
// Capacity limits used to size the fixed arrays in the structures below.
#define MSCCL_MAX_NUM_STEPS 64
#define MSCCL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL 32
#define MSCCL_MAX_NUM_THREAD_BLOCKS 64
#define MSCCL_MAX_COUNT 72 // max concurrent number of msccl chunk transmission
#define MSCCL_MAX_REDUCE_FUSION 16
#define MSCCL_MAX_NUM_ALGOS 1024
// Slice and chunk step counts, derived from NCCL_STEPS.
#define MSCCL_SLICESTEPS (NCCL_STEPS/4)
#define MSCCL_CHUNKSTEPS (NCCL_STEPS/2)
// Buffer identifiers (input/output/scratch), as stored in
// mscclTransmission::srcBuffer and ::dstBuffer.
#define MSCCL_INPUT_BUFFER 0
#define MSCCL_OUTPUT_BUFFER 1
#define MSCCL_SCRATCH_BUFFER 2
// Operation identifiers.
// NOTE(review): presumably the values stored in mscclTransmission::type — confirm.
#define MSCCL_SEND 0
#define MSCCL_RECV 1
#define MSCCL_RECV_COPY_SEND 2
#define MSCCL_RECV_REDUCE_SEND 3
#define MSCCL_RECV_REDUCE_COPY 4
#define MSCCL_RECV_REDUCE_COPY_SEND 5
#define MSCCL_LOCAL_COPY 6
#define MSCCL_REDUCE 7
// Description of one step of a thread block's program.
// Layout: six int16_t fields, one byte of two 4-bit buffer selectors and three
// single-byte fields.
struct mscclTransmission {
int16_t dependencePointer; // index to the first dependence
int16_t numDependencies; // dependencePointer+numDependencies indicate the last dependence
int16_t reductionPointer; // where the reduction starts
int16_t numReductions; // number of reductions with the same dst
int16_t srcOffset;
int16_t dstOffset;
uint8_t srcBuffer : 4; // input/output/scratch
uint8_t dstBuffer : 4; // input/output/scratch
int8_t hasDependence;
uint8_t type;
uint8_t count;
}; // 16 bytes
// Compile-time check that the largest value of `count`'s type exceeds MSCCL_MAX_COUNT.
static_assert((1ULL << (8*sizeof(mscclTransmission::count))) - 1 > MSCCL_MAX_COUNT, "MSCCL_MAX_COUNT must representable by datatype of count");
// Program of one thread block: per-step transmissions and dependency
// information, plus the peers and channel the block works with.
// Byte sizes below are for MSCCL_MAX_NUM_STEPS == 64.
struct mscclThreadBlock {
// step is used to index into these arrays
struct mscclTransmission transmissions[MSCCL_MAX_NUM_STEPS]; // 64 * 16 bytes = 1 KB
int8_t dependentBid[MSCCL_MAX_NUM_STEPS]; // -1 if not dependent on any thread block, 64 bytes
int16_t dependentStep[MSCCL_MAX_NUM_STEPS]; // 128 bytes
int16_t reductionSrcOffsets[MSCCL_MAX_NUM_STEPS]; // 128 bytes
int16_t sendPeer;
int16_t recvPeer;
uint16_t nSteps;
int16_t channelId; // associated channel. -1 indicates a thread block with only local copies
}; // 1352 bytes
// The size must be a multiple of 8 bytes.
static_assert(sizeof(struct mscclThreadBlock) % sizeof(uint64_t) == 0, "Sanity check: sizeof(struct mscclThreadBlock) % sizeof(uint64_t) != 0");
// A 64-bit flag followed by three 64-bit padding words (32 bytes in total).
struct mscclFlag {
uint64_t flag;
uint64_t align[3]; // to avoid false sharing
};
// Per-peer transmission statistics of a channel.
struct mscclChannelPeerInfo {
int peer;
// nTransmissionsOfCount[i]: number of transmissions with count i (in terms of msccl chunks)
int nTransmissionsOfCount[MSCCL_MAX_COUNT + 1];
// NOTE(review): presumably the distinct count values that occur, with
// nExistingCounts valid entries — confirm against the parser.
int existingCounts[MSCCL_MAX_COUNT + 1];
int nExistingCounts;
};
// Send and receive peer tables of one channel, each with its own entry count.
struct mscclChannelInfo {
struct mscclChannelPeerInfo sendPeerInfo[MSCCL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL];
int nSendPeers;
struct mscclChannelPeerInfo recvPeerInfo[MSCCL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL];
int nRecvPeers;
};
// Summary information about an algorithm file (filled by
// mscclGetAlgoMetaFromXmlFile()).
struct mscclAlgoMeta {
// Path to algorithm file
std::string filePath;
// number of chunks of input/output in each MSCCL algorithm loop
int nChunksPerLoop;
// number of ranks required by this algorithm
int nRanks;
// size multiplier; nRanks is needed for all-gather, reduce-scatter and all-to-all
int sizeMultiplier;
// MSCCL function type
mscclFunc_t func;
// Min message size allowed for this algorithm.
int64_t minBytes;
// Max message size allowed for this algorithm, 0 for no limit.
int64_t maxBytes;
// Whether this algorithm is suitable for in-place.
bool inPlace;
// Whether this algorithm is suitable for out-of-place.
bool outOfPlace;
};
// Full description of one MSCCL algorithm (filled by
// mscclGetAlgoFromXmlFile()): global parameters, the program of every thread
// block and per-channel peer information.
struct mscclAlgo {
// number of chunks of input/output in each MSCCL algorithm loop
int nChunksPerLoop;
// the protocol that the algorithm needs to use
int protocol;
// number of channels needed by MSCCL algorithm
int nChannels;
// number of ranks required by this algorithm
int nRanks;
// number of necessary thread blocks
int nBlocks;
// number of scratch chunks that MSCCL will use
int nScratchChunks;
// size multiplier; nRanks is needed for all-gather, reduce-scatter and all-to-all
int sizeMultiplier;
// number of steps per chunk for this algorithm
int chunkSteps;
// number of steps per slice for this algorithm
int sliceSteps;
// bid is used as an index into this array
struct mscclThreadBlock mscclTBs[MSCCL_MAX_NUM_THREAD_BLOCKS];
// used to calculate proxy info
struct mscclChannelInfo mscclChannels[MAXCHANNELS];
// Whether the algorithm requires reduce operation
bool hasReduce;
// MSCCL function type
mscclFunc_t func;
// Min message size allowed for this algorithm.
int64_t minBytes;
// Max message size allowed for this algorithm, 0 for no limit.
int64_t maxBytes;
// Whether this algorithm is suitable for in-place.
bool inPlace;
// Whether this algorithm is suitable for out-of-place.
bool outOfPlace;
// Keep a bit mask of used types (max 8 at present)
uint8_t typeMask;
};
// Group state tracked per thread (see mscclThreadLocalStatus::groupStatus).
// NOTE(review): presumably distinguishes "not in a group" from groups containing
// only supported vs. some unsupported operations — confirm in the implementation.
enum mscclGroupStatus {
mscclNoGroup,
mscclGroupSupportedOp,
mscclGroupUnsupportedOp
};
// A scheduler parameter block saved together with owned copies of the
// count/displacement arrays and the communicator and stream it belongs to.
// NOTE(review): presumably the pointer fields of `p` are redirected to the
// saved vectors so they outlive the caller's arrays — confirm.
struct mscclSavedSchedulerParam {
struct mscclSchedulerParam p;
std::vector<size_t> savedSendCounts;
std::vector<size_t> savedSDisPls;
std::vector<size_t> savedRecvCounts;
std::vector<size_t> savedRDisPls;
ncclComm_t comm;
hipStream_t stream;
};
// Capture state tracked per thread (see mscclThreadLocalStatus::captureStatus).
// NOTE(review): presumably refers to HIP graph stream capture — confirm.
enum mscclCaptureStatus {
mscclNoCapture,
mscclNewCapture,
mscclExistingCapture
};
// Pair of (host algorithm, communicator) recorded for proxy handling.
struct mscclProxyArg {
struct mscclAlgo* hostAlgo;
ncclComm_t comm;
// Stores both arguments as-is; no ownership is taken by this constructor.
mscclProxyArg(struct mscclAlgo* hostAlgo, ncclComm_t comm)
: hostAlgo(hostAlgo), comm(comm) {}
};
// Saved proxy arguments, keyed by an unsigned 64-bit id.
// NOTE(review): presumably the key is a capture id (cf. mscclThreadLocalStatus::captureId) — confirm.
typedef std::map<unsigned long long, std::vector<struct mscclProxyArg>> mscclSavedProxyArgs;
// Per-thread MSCCL state (returned by mscclGetThreadLocalStatus()): caller
// flag, group nesting state with the saved parameters, and capture state.
struct mscclThreadLocalStatus {
bool mscclIsCallerFlag;
mscclGroupStatus groupStatus;
int groupDepth;
std::vector<struct mscclSavedSchedulerParam> savedSchedulerParams;
unsigned long long captureId;
mscclCaptureStatus captureStatus;
hipGraph_t graph;
};
// Bookkeeping for a work FIFO: its depth and storage, the completion array,
// and sent/acknowledged counters (overall and per thread block).
// Managed through mscclInitWorkFifoStatus()/mscclDestroyWorkFifoStatus().
struct mscclWorkFifoStatus {
uint64_t workFifoDepth;
struct mscclWork* workFifo;
uint32_t* workFifoDone;
uint32_t workFifoSent;
uint32_t workFifoSentPerThreadBlock[MSCCL_MAX_NUM_THREAD_BLOCKS];
uint32_t workFifoAckdMin;
};
// Work FIFO status objects keyed by an unsigned 64-bit id.
// NOTE(review): presumably one entry per captured graph — confirm.
typedef std::map<unsigned long long, mscclWorkFifoStatus> mscclSavedGraphWorkFifoStatus;
// MSCCL state returned by mscclGetStatus(): loaded algorithms, shared device
// resources, parameters of the current operation, the scheduler plug-in and
// work FIFO bookkeeping.
struct mscclStatus {
// Algorithm handles and the host/device algorithm objects they map to.
std::vector<mscclAlgoHandle_t> freeAlgoHandles;
std::map<mscclAlgoHandle_t, mscclAlgo *> hostAlgos;
std::map<mscclAlgoHandle_t, mscclAlgo *> devAlgos;
// Synchronization flags and scratch buffer.
struct mscclFlag* syncFlags;
void *scratchBuffer;
uint64_t scratchBufferSize;
// Size and step parameters.
size_t nBytes;
int stepSize;
int chunkSteps;
int sliceSteps;
int chunkSize;
int chunkEffectiveSize;
uint32_t workIndex;
uint32_t maxAllowedCount;
ncclDataType_t dataType;
// For each communicator, the set of algorithm handles recorded as connected.
std::map<ncclComm_t, std::set<mscclAlgoHandle_t>> connectedAlgos;
hipStream_t lastStream;
// Scheduler library handle and its interface table.
void* mscclSchedulerLib;
mscclSchedulerInterface* mscclSchedulerPtr;
// Metadata of the known algorithms and a rank -> handle map for each.
// NOTE(review): presumably rankToAlgoHandles is indexed in parallel with algoMetas — confirm.
std::vector<mscclAlgoMeta> algoMetas;
std::vector<std::map<int, mscclAlgoHandle_t>> rankToAlgoHandles;
bool graphEnabled;
bool graphFirstKernel;
bool needsProxy;
// Work FIFO status: a default instance plus a map of additional instances.
mscclWorkFifoStatus defaultWorkFifoStatus;
mscclSavedGraphWorkFifoStatus graphWorkFifoStatus;
};
// mscclWork is declared with 8-byte packing and must have a size that is a
// multiple of 16 bytes (enforced by the static_assert below). It is the third
// parameter of the MSCCL kernel entry points.
#pragma pack(push)
#pragma pack(8)
struct mscclWork {
volatile struct mscclFlag *syncFlags;
void *scratchBuffer;
const void *sendBuff;
void *recvBuff;
uint32_t* workFifoDone;
size_t sizePerMscclChunk;
uint64_t redOpArg; // reduction operator argument; see redOpArgIsPtr
uint32_t workIndex;
uint32_t maxAllowedCount;
uint32_t workFifoDoneAck;
int nChunksPerLoop;
bool hasReduce;
bool redOpArgIsPtr; // whether redOpArg holds a pointer rather than a value
uint32_t fnIndex;
};
static_assert(sizeof(struct mscclWork) % 16 == 0, "mscclWork needs to be 16B aligned");
#pragma pack(pop)
// One thread block's program together with a work item, with `work` forced to
// a 16-byte boundary (checked by the static_assert below).
// NOTE(review): the name suggests this lives in GPU shared memory — confirm in the kernel code.
struct mscclShmemData {
struct mscclThreadBlock mscclTB;
alignas(16) struct mscclWork work;
};
static_assert(offsetof(struct mscclShmemData, work) % 16 == 0, "mscclShmemData.work needs to be 16B aligned");
#endif
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NET_H_
#define NCCL_NET_H_
#include "nccl.h"
#include <stdint.h>
// Maximum size in bytes of a connection handle produced by listen().
#define NCCL_NET_HANDLE_MAXSIZE 128
// Pointer type flags; combined as a bit mask in ncclNetProperties_v6_t::ptrSupport.
#define NCCL_PTR_HOST 0x1
#define NCCL_PTR_CUDA 0x2
#define NCCL_PTR_DMABUF 0x4
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 8
// Log levels passed to the logger callback.
typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
// Subsystem flags; each value is a distinct bit and NCCL_ALL sets every bit.
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;
// printf-style logging callback handed to a plugin's init() function.
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
// Properties of a network device, as reported by getProperties() (v6 layout).
typedef struct {
char* name; // Used mostly for logging.
char* pciPath; // Path to the PCI device in /sys.
uint64_t guid; // Unique identifier for the NIC chip. Important for
// cards with multiple PCI functions (Physical or virtual).
int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
int speed; // Port speed in Mbps.
int port; // Port number.
float latency; // Network latency
int maxComms; // Maximum number of comms we can create
int maxRecvs; // Maximum number of grouped receives.
}ncclNetProperties_v6_t;
// The unversioned name refers to the v6 layout.
typedef ncclNetProperties_v6_t ncclNetProperties_t;
// Function table of a point-to-point network plugin (v6 interface).
// Every callback returns an ncclResult_t status.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
// Same as regMr, for memory identified by a dma-buf file descriptor `fd` and `offset`.
ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v6_t;
// The unversioned name refers to the v6 interface.
typedef ncclNet_v6_t ncclNet_t;
// Name of the symbol a plugin exports for this interface.
#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
// Function table of a collective network plugin (v6 interface).
// Every callback returns an ncclResult_t status.
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
/* DMA-BUF support */
// Same as regMr, for memory identified by a dma-buf file descriptor `fd` and `offset`.
ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v6_t;
// The unversioned name refers to the v6 interface.
typedef ncclCollNet_v6_t ncclCollNet_t;
// Name of the symbol a plugin exports for this interface.
#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6
// v5 struct for backwards compatibility
// Point-to-point network interface, version 5: a name plus a table of
// function pointers, each returning ncclResult_t.
// NOTE(review): presumably matched field-by-field against externally built
// plugins, so entries must not be reordered or retyped - confirm.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
// Note: v5 reports properties using the v6 properties struct.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
// This call must not block for the connection to be established, and instead
// should return successfully with sendComm == NULL with the expectation that
// it will be called again until sendComm != NULL.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect.
// This call must not block for the connection to be established, and instead
// should return successfully with recvComm == NULL with the expectation that
// it will be called again until recvComm != NULL.
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
// The handle produced by regMr is returned through mhandle and later passed
// to deregMr.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
// NOTE(review): the meaning of tag is not visible here; presumably it pairs a
// send with the matching receive - confirm.
ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
// NOTE(review): data, sizes, tags and mhandles are presumably arrays of n
// entries, one per receive buffer - confirm.
ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
// Test whether a request is complete. If sizes is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* sizes);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v5_t;
// v5 struct for backwards compatibility
// Collective-network interface, version 5: a name plus a table of function
// pointers, each returning ncclResult_t. This version has no DMA-BUF
// registration entry.
// NOTE(review): presumably matched field-by-field against externally built
// plugins, so entries must not be reordered or retyped - confirm.
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
// Note: v5 reports properties using the v6 properties struct.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
// The answer is written to *supported; the return value is the call status.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v5_t;
// Device properties as reported through the v4 API.
// Kept only for backwards compatibility; field order and types are fixed.
typedef struct {
  // Device name, used mostly for logging.
  char* name;
  // Path to the PCI device in /sys.
  char* pciPath;
  // Unique identifier for the NIC chip. Important for cards that expose
  // multiple PCI functions (physical or virtual).
  uint64_t guid;
  // Supported pointer types: NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA.
  int ptrSupport;
  // Port speed in Mbps.
  int speed;
  // Port number.
  int port;
  // Maximum number of comms we can create.
  int maxComms;
} ncclNetProperties_v4_t;
// v4 struct for backwards compatibility
// Point-to-point network interface, version 4: a name plus a table of
// function pointers, each returning ncclResult_t. In this version isend and
// irecv take a single buffer and no tag.
// NOTE(review): presumably matched field-by-field against externally built
// plugins, so entries must not be reordered or retyped - confirm.
typedef struct {
// Name of the network (mainly for logs)
const char* name;
// Initialize the network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create a connection.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Connect to a handle and return a sending comm object for that peer.
ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
// Finalize connection establishment after remote peer has called connect
ncclResult_t (*accept)(void* listenComm, void** recvComm);
// Register/Deregister memory. Comm can be either a sendComm or a recvComm.
// Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* comm, void* mhandle);
// Asynchronous send to a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
// Asynchronous recv from a peer.
// May return request == NULL if the call cannot be performed (or would block)
ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free send/recv comm objects
ncclResult_t (*closeSend)(void* sendComm);
ncclResult_t (*closeRecv)(void* recvComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclNet_v4_t;
// v4 struct for backwards compatibility
// Collective-network interface, version 4: a name plus a table of function
// pointers, each returning ncclResult_t. Device properties are reported with
// the v4 properties struct.
// NOTE(review): presumably matched field-by-field against externally built
// plugins, so entries must not be reordered or retyped - confirm.
typedef struct {
// Name of the collective network (mainly for logs)
const char* name;
// Initialize the collective network.
ncclResult_t (*init)(ncclDebugLogger_t logFunction);
// Return the number of adapters capable of doing collective operations.
// If ndev returns 0, all other functions might be set to NULL.
ncclResult_t (*devices)(int* ndev);
// Get various device properties.
ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
// Create a receiving object and provide a handle to connect to it. The
// handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
// between ranks to create connections.
ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
// Create a group for collective operations. handles have been created
// using listen() above. rank indicates caller's rank in the collective network.
ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
// Returns whether a reduction operation on a data type is supported.
// 1 for supported, 0 otherwise.
// The answer is written to *supported; the return value is the call status.
ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
// Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
ncclResult_t (*deregMr)(void* collComm, void* mhandle);
// Performs an asynchronous allreduce operation on the collective group.
// May return request == NULL if the call cannot be performed (or would block).
ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
// Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
// visible to the GPU
ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
// Test whether a request is complete. If size is not NULL, it returns the
// number of bytes sent/received.
ncclResult_t (*test)(void* request, int* done, int* size);
// Close and free collective comm objects
ncclResult_t (*closeColl)(void* collComm);
ncclResult_t (*closeListen)(void* listenComm);
} ncclCollNet_v4_t;
#endif // end include guard
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_INT_NET_H_
#define NCCL_INT_NET_H_
#include "nccl.h"
#include "nccl_net.h"
#include "comm.h"
#include "checks.h"
// Byte buffer large enough for one network handle (NCCL_NET_HANDLE_MAXSIZE bytes).
typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
// NOTE(review): presumably loads/initializes the external network plugin - confirm in the implementation.
ncclResult_t ncclNetPluginInit();
// NOTE(review): presumably selects and initializes the network used by comm - confirm.
ncclResult_t ncclNetInit(struct ncclComm* comm);
// NOTE(review): presumably the net API version in use for comm - confirm.
int ncclNetVersion(struct ncclComm* comm);
// Test whether the current GPU supports GPU Direct RDMA.
// The result is written to *gdrSupport.
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
// Network implementations defined in other translation units.
// NOTE(review): by name, the InfiniBand and socket transports - confirm.
extern ncclNet_t ncclNetIb;
extern ncclNet_t ncclNetSocket;
#endif
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef NPKIT_H_
#define NPKIT_H_
#include <string>
#include <thread>
#include <hip/hip_runtime.h>
#include "npkit/npkit_event.h"
#include "npkit/npkit_struct.h"
#include "common.h"
#include <time.h>
#include <sys/time.h>
// Expands to wall_clock64.
// NOTE(review): presumably the device-side clock read used to timestamp GPU events - confirm at call sites.
#define NPKIT_GET_GPU_TIMESTAMP wall_clock64
// Length of the per-rank static arrays held by class NpKit.
#define RANK_NUM 16
// Number of GPU event buffers; the CPU side uses CHANNEL_NUM + 1.
#define CHANNEL_NUM 32
// NOTE(review): not used in this header; presumably sizes a host-side submit buffer - confirm.
#define HOST_SUBMIT_CHANNEL_BUF 32
// Event-recording facility: every member is static, so the class is used
// without instances. Declares lifecycle calls, CPU event collection and the
// inline device-side collectors that append events to a buffer.
class NpKit {
 public:
  // Buffer counts derived from CHANNEL_NUM; the CPU side has one extra buffer.
  static const uint64_t kNumGpuEventBuffers = CHANNEL_NUM;
  static const uint64_t kNumCpuEventBuffers = CHANNEL_NUM + 1;

  static ncclResult_t Init(int rank);
  static ncclResult_t Dump(const std::string& dump_dir, int rank);
  static ncclResult_t Shutdown(int rank);

  static NpKitEventCollectContext* GetGpuEventCollectContexts(int rank);

  // Appends one event to ctx's buffer. When the buffer already holds
  // kMaxNumGpuEventsPerBuffer events the new event is silently dropped.
  // Negative sizes are recorded as 0.
  static inline __device__ void CollectGpuEvent(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp,
                                                NpKitEventCollectContext* ctx) {
    const uint64_t slot = ctx->event_buffer_head;
    if (slot >= kMaxNumGpuEventsPerBuffer) {
      return;
    }
    const int64_t clamped_size = size < 0 ? 0 : size;
    NpKitEvent& entry = ctx->event_buffer[slot];
    entry.fields.type = type;
    entry.fields.size = clamped_size;
    entry.fields.rsvd = rsvd;
    entry.fields.timestamp = timestamp;
    ctx->event_buffer_head++;
  }

  // Same as CollectGpuEvent but appends to the buffer held in ncclShmem,
  // bounded by LDS_NUM_EVENTS. Compiles to an empty body unless ENABLE_NPKIT
  // is defined.
  static inline __device__ void CollectGpuEventLDS(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp) {
#if defined(ENABLE_NPKIT)
    if (ncclShmem.event_buffer_head >= LDS_NUM_EVENTS) {
      return;
    }
    const int64_t clamped_size = size < 0 ? 0 : size;
    NpKitEvent& entry = ncclShmem.event_buffer[ncclShmem.event_buffer_head];
    entry.fields.type = type;
    entry.fields.size = clamped_size;
    entry.fields.rsvd = rsvd;
    entry.fields.timestamp = timestamp;
    ncclShmem.event_buffer_head++;
#endif
  }

  static void CollectCpuEvent(int rank, uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp, int channel_id);

  static uint64_t *GetCpuTimestamp();
  static uint64_t GetCpuTimeNs();

 private:
  static void CpuTimestampUpdateThread();

  // 1M * 32 * 16B = 512MB per GPU
  static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 20;
  // 64K * 2 (send/recv) * (512/32) = 2M, 2M * 32 * 16B = 1GB per CPU
  static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;

  // Per-rank state, indexed by rank (RANK_NUM slots each).
  static NpKitEvent** gpu_event_buffers_[RANK_NUM];
  static NpKitEvent** cpu_event_buffers_[RANK_NUM];
  static int gpu_rtc_rate_khz[RANK_NUM];
  static NpKitEventCollectContext* gpu_collect_contexts_[RANK_NUM];
  static NpKitEventCollectContext* cpu_collect_contexts_[RANK_NUM];

  // State shared across ranks.
  static uint64_t* cpu_timestamp_;
  static pthread_mutex_t npKitLock;
  static uint64_t rank_;
  static std::thread* cpu_timestamp_update_thread_;
  static volatile bool cpu_timestamp_update_thread_should_stop_;
};
#endif
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef NPKIT_EVENT_H_
#define NPKIT_EVENT_H_
// NpKit event type identifiers. The values run consecutively from 0x0 to
// 0x75, so each fits the 8-bit event type. Most identifiers come as an
// ENTRY/EXIT pair with adjacent values. The grouping comments below are
// derived from the identifier names only.
#define NPKIT_EVENT_INVALID 0x0
// All-reduce, whole-algorithm scope.
#define NPKIT_EVENT_ALL_REDUCE_RING_ENTRY 0x1
#define NPKIT_EVENT_ALL_REDUCE_RING_EXIT 0x2
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY 0x3
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT 0x4
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY 0x5
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT 0x6
// Individual send/recv/copy/reduce primitive operations.
#define NPKIT_EVENT_COPY_SEND_ENTRY 0x7
#define NPKIT_EVENT_COPY_SEND_EXIT 0x8
#define NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY 0x9
#define NPKIT_EVENT_DIRECT_COPY_SEND_EXIT 0xA
#define NPKIT_EVENT_DIRECT_RECV_ENTRY 0xB
#define NPKIT_EVENT_DIRECT_RECV_EXIT 0xC
#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY 0xD
#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT 0xE
#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0xF
#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x10
#define NPKIT_EVENT_DIRECT_SEND_ENTRY 0x11
#define NPKIT_EVENT_DIRECT_SEND_EXIT 0x12
#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY 0x13
#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT 0x14
#define NPKIT_EVENT_RECV_ENTRY 0x15
#define NPKIT_EVENT_RECV_EXIT 0x16
#define NPKIT_EVENT_RECV_COPY_SEND_ENTRY 0x17
#define NPKIT_EVENT_RECV_COPY_SEND_EXIT 0x18
#define NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY 0x19
#define NPKIT_EVENT_RECV_REDUCE_COPY_EXIT 0x1A
#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY 0x1B
#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT 0x1C
#define NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY 0x1D
#define NPKIT_EVENT_RECV_REDUCE_SEND_EXIT 0x1E
#define NPKIT_EVENT_SEND_ENTRY 0x1F
#define NPKIT_EVENT_SEND_EXIT 0x20
#define NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY 0x21
#define NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT 0x22
// Phases inside the Simple / LL / LL128 primitives.
#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY 0x23
#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT 0x24
#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY 0x25
#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT 0x26
#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY 0x27
#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT 0x28
#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY 0x29
#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT 0x2A
#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY 0x2B
#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT 0x2C
#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY 0x2D
#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT 0x2E
// Network send/recv.
#define NPKIT_EVENT_NET_SEND_ENTRY 0x2F
#define NPKIT_EVENT_NET_SEND_EXIT 0x30
#define NPKIT_EVENT_NET_RECV_ENTRY 0x31
#define NPKIT_EVENT_NET_RECV_EXIT 0x32
// Clock synchronization markers (not an ENTRY/EXIT pair).
#define NPKIT_EVENT_TIME_SYNC_GPU 0x33
#define NPKIT_EVENT_TIME_SYNC_CPU 0x34
// All-reduce, per-step scope (ring and tree variants).
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY 0x35
#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT 0x36
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY 0x37
#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT 0x38
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY 0x39
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT 0x3A
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x3B
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT 0x3C
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY 0x3D
#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT 0x3E
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY 0x3F
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT 0x40
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY 0x41
#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT 0x42
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY 0x43
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT 0x44
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY 0x45
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT 0x46
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY 0x47
#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT 0x48
// Send/recv collective.
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY 0x49
#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT 0x4A
#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY 0x4B
#define NPKIT_EVENT_SEND_RECV_SEND_EXIT 0x4C
#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY 0x4D
#define NPKIT_EVENT_SEND_RECV_RECV_EXIT 0x4E
// Standalone identifier; note it lacks the NPKIT_EVENT_ prefix used by the others.
#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME 0x4F
// All-gather (ring).
#define NPKIT_EVENT_ALL_GATHER_RING_ENTRY 0x50
#define NPKIT_EVENT_ALL_GATHER_RING_EXIT 0x51
#define NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY 0x52
#define NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT 0x53
#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY 0x54
#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT 0x55
#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY 0x56
#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT 0x57
// Network request test.
#define NPKIT_EVENT_NET_TEST_ENTRY 0x58
#define NPKIT_EVENT_NET_TEST_EXIT 0x59
// MSCCL.
#define NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY 0x5A
#define NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT 0x5B
#define NPKIT_EVENT_MSCCL_REDUCE_ENTRY 0x5C
#define NPKIT_EVENT_MSCCL_REDUCE_EXIT 0x5D
#define NPKIT_EVENT_MSCCL_SEND_ENTRY 0x5E
#define NPKIT_EVENT_MSCCL_SEND_EXIT 0x5F
#define NPKIT_EVENT_MSCCL_RECV_ENTRY 0x60
#define NPKIT_EVENT_MSCCL_RECV_EXIT 0x61
#define NPKIT_EVENT_MSCCL_RUN_ENTRY 0x62
#define NPKIT_EVENT_MSCCL_RUN_EXIT 0x63
#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY 0x64
#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT 0x65
#define NPKIT_EVENT_MSCCL_INIT_ENTRY 0x66
#define NPKIT_EVENT_MSCCL_INIT_EXIT 0x67
// Host side.
#define NPKIT_EVENT_HOST_ENTRY 0x68
#define NPKIT_EVENT_HOST_EXIT 0x69
// Broadcast (ring).
#define NPKIT_EVENT_BROADCAST_RING_ENTRY 0x6A
#define NPKIT_EVENT_BROADCAST_RING_EXIT 0x6B
#define NPKIT_EVENT_BROADCAST_RING_COPY_SEND_ENTRY 0x6C
#define NPKIT_EVENT_BROADCAST_RING_COPY_SEND_EXIT 0x6D
// Communicator test.
#define NPKIT_EVENT_COMM_TEST_ENTRY 0x6E
#define NPKIT_EVENT_COMM_TEST_EXIT 0x6F
// SDMA copy inside the Simple primitive.
#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_ENTRY 0x70
#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_EXIT 0x71
#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_PAL_ENTRY 0x72
#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_PAL_EXIT 0x73
#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_ENTRY 0x74
#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_EXIT 0x75
#endif
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#ifndef NPKIT_STRUCT_H_
#define NPKIT_STRUCT_H_
#include <cstdint>
#pragma pack(push, 1)
// One recorded event, exactly 16 bytes: a 64-bit word holding three
// bit-fields (type, size, rsvd) followed by a 64-bit timestamp. `bits` gives
// raw access to the same 16 bytes.
//
// All three bit-fields share the uint64_t base type so they are allocated in
// a single 64-bit unit on every compiler. Mixing base types (size was
// previously declared uint32_t) makes the layout depend on the compiler's
// bit-field allocation rules; under pack(1) on GCC/Clang the bit positions
// are the same as before.
union NpKitEvent {
  uint64_t bits[2];
  struct {
    uint64_t type : 8;   // event type identifier (NPKIT_EVENT_*)
    uint64_t size : 32;  // size associated with the event
    uint64_t rsvd : 24;  // reserved / caller-defined
    uint64_t timestamp;  // timestamp supplied by the caller
  } fields;
};
// Buffer sizing elsewhere assumes 16 bytes per event; fail the build rather
// than silently changing the dump format.
static_assert(sizeof(NpKitEvent) == 16, "NpKitEvent must be exactly 16 bytes");

// Write cursor for one event buffer: the buffer base and the index of the
// next free slot.
struct NpKitEventCollectContext {
  NpKitEvent* event_buffer;
  uint64_t event_buffer_head;
};
#pragma pack(pop)
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_NVMLWRAP_H_
#define NCCL_NVMLWRAP_H_
#include "nccl.h"
// NCCL_NVML_DIRECT selects where the NVML declarations come from: non-zero
// includes the real "nvml.h", zero (the default set here) uses the
// declarations extracted below. Uncomment the next line, or define the macro
// on the command line, to use nvml.h directly.
//#define NCCL_NVML_DIRECT 1
#ifndef NCCL_NVML_DIRECT
#define NCCL_NVML_DIRECT 0
#endif
#if NCCL_NVML_DIRECT
#include "nvml.h"
#else
// Dynamically handle dependencies on NVML
/* Extracted from nvml.h */
// Opaque device handle: a pointer to a struct that is never defined here.
typedef struct nvmlDevice_st* nvmlDevice_t;
// Size in bytes of the busId buffer in nvmlPciInfo_t.
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
/* Two-state on/off flag for a feature. */
typedef enum nvmlEnableState_enum {
  NVML_FEATURE_DISABLED = 0, /* the feature is off */
  NVML_FEATURE_ENABLED  = 1  /* the feature is on */
} nvmlEnableState_t;
/* Capabilities that can be queried for an NVLink link. */
typedef enum nvmlNvLinkCapability_enum {
  NVML_NVLINK_CAP_P2P_SUPPORTED  = 0, /* peer-to-peer over NVLink is supported */
  NVML_NVLINK_CAP_SYSMEM_ACCESS  = 1, /* system memory can be accessed */
  NVML_NVLINK_CAP_P2P_ATOMICS    = 2, /* peer-to-peer atomics are supported */
  NVML_NVLINK_CAP_SYSMEM_ATOMICS = 3, /* system memory atomics are supported */
  NVML_NVLINK_CAP_SLI_BRIDGE     = 4, /* SLI is supported over this link */
  NVML_NVLINK_CAP_VALID          = 5, /* the link is supported on this device */
  /* Sentinel: number of capabilities. Must remain the last enumerator. */
  NVML_NVLINK_CAP_COUNT
} nvmlNvLinkCapability_t;
/* Status codes returned by NVML calls. */
typedef enum nvmlReturn_enum {
  NVML_SUCCESS                       = 0,   /* operation succeeded */
  NVML_ERROR_UNINITIALIZED           = 1,   /* NVML was not initialized with nvmlInit() first */
  NVML_ERROR_INVALID_ARGUMENT        = 2,   /* a supplied argument is invalid */
  NVML_ERROR_NOT_SUPPORTED           = 3,   /* requested operation is not available on the target device */
  NVML_ERROR_NO_PERMISSION           = 4,   /* current user lacks permission for the operation */
  NVML_ERROR_ALREADY_INITIALIZED     = 5,   /* deprecated: multiple initializations are allowed via ref counting */
  NVML_ERROR_NOT_FOUND               = 6,   /* a query to find an object was unsuccessful */
  NVML_ERROR_INSUFFICIENT_SIZE       = 7,   /* an input argument is not large enough */
  NVML_ERROR_INSUFFICIENT_POWER      = 8,   /* device's external power cables are not properly attached */
  NVML_ERROR_DRIVER_NOT_LOADED       = 9,   /* NVIDIA driver is not loaded */
  NVML_ERROR_TIMEOUT                 = 10,  /* user-provided timeout passed */
  NVML_ERROR_IRQ_ISSUE               = 11,  /* NVIDIA kernel detected an interrupt issue with a GPU */
  NVML_ERROR_LIBRARY_NOT_FOUND       = 12,  /* NVML shared library couldn't be found or loaded */
  NVML_ERROR_FUNCTION_NOT_FOUND      = 13,  /* local version of NVML doesn't implement this function */
  NVML_ERROR_CORRUPTED_INFOROM       = 14,  /* infoROM is corrupted */
  NVML_ERROR_GPU_IS_LOST             = 15,  /* GPU has fallen off the bus or is otherwise inaccessible */
  NVML_ERROR_RESET_REQUIRED          = 16,  /* GPU requires a reset before it can be used again */
  NVML_ERROR_OPERATING_SYSTEM        = 17,  /* GPU control device is blocked by the operating system/cgroups */
  NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,  /* RM detects a driver/library version mismatch */
  NVML_ERROR_IN_USE                  = 19,  /* operation cannot be performed because the GPU is in use */
  NVML_ERROR_UNKNOWN                 = 999  /* an internal driver error occurred */
} nvmlReturn_t;
// PCI identification of a device. Extracted from nvml.h, so the field order
// and types must stay as they are.
typedef struct nvmlPciInfo_st
{
char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff
unsigned int bus; //!< The bus on which the device resides, 0 to 0xff
unsigned int device; //!< The device's id on the bus, 0 to 31
unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id
// Added in NVML 2.285 API
unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
// NVIDIA reserved for internal use only
unsigned int reserved0;
unsigned int reserved1;
unsigned int reserved2;
unsigned int reserved3;
} nvmlPciInfo_t;
/* Status reported for a peer-to-peer capability query between two devices.
 * The misspelling in NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED is part of the
 * extracted API and is kept as is. */
typedef enum nvmlGpuP2PStatus_enum {
  NVML_P2P_STATUS_OK                         = 0,
  NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED       = 1,
  NVML_P2P_STATUS_GPU_NOT_SUPPORTED          = 2,
  NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED = 3,
  NVML_P2P_STATUS_DISABLED_BY_REGKEY         = 4,
  NVML_P2P_STATUS_NOT_SUPPORTED              = 5,
  NVML_P2P_STATUS_UNKNOWN                    = 6
} nvmlGpuP2PStatus_t;
/* Selects which peer-to-peer capability a status query refers to. */
typedef enum nvmlGpuP2PCapsIndex_enum {
  NVML_P2P_CAPS_INDEX_READ    = 0,
  NVML_P2P_CAPS_INDEX_WRITE   = 1,
  NVML_P2P_CAPS_INDEX_NVLINK  = 2,
  NVML_P2P_CAPS_INDEX_ATOMICS = 3,
  NVML_P2P_CAPS_INDEX_PROP    = 4,
  NVML_P2P_CAPS_INDEX_UNKNOWN = 5
} nvmlGpuP2PCapsIndex_t;
/**
 * Tags which member of a sample value union is meaningful.
 */
typedef enum nvmlValueType_enum {
  NVML_VALUE_TYPE_DOUBLE             = 0,
  NVML_VALUE_TYPE_UNSIGNED_INT       = 1,
  NVML_VALUE_TYPE_UNSIGNED_LONG      = 2,
  NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
  NVML_VALUE_TYPE_SIGNED_LONG_LONG   = 4,
  /* Sentinel: number of value types. Must remain the last enumerator. */
  NVML_VALUE_TYPE_COUNT
} nvmlValueType_t;
/**
 * Storage for one sample value; exactly one member is meaningful at a time.
 */
typedef union nvmlValue_st {
  double             dVal;   /* value held as double */
  unsigned int       uiVal;  /* value held as unsigned int */
  unsigned long      ulVal;  /* value held as unsigned long */
  unsigned long long ullVal; /* value held as unsigned long long */
  signed long long   sllVal; /* value held as signed long long */
} nvmlValue_t;
/**
* Field Identifiers.
*
* All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
*
* Only a subset of IDs is declared here, so the numbering has gaps.
*/
/* NVLink Speed */
#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links
#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device
/**
* Remote device NVLink ID
*
* Link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
*/
#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID
/**
* NVSwitch: connected NVLink count
*/
#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch
/* NOTE(review): by name, NVLink speed / state / version queries; exact semantics are defined by NVML - confirm. */
#define NVML_FI_DEV_NVLINK_GET_SPEED 164
#define NVML_FI_DEV_NVLINK_GET_STATE 165
#define NVML_FI_DEV_NVLINK_GET_VERSION 166
#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
/**
* Information for a Field Value Sample
*
* fieldId (and scopeId where relevant) are inputs set by the caller; the
* remaining members are results. Check nvmlReturn before reading value, and
* use valueType to pick the member of value to read.
*/
typedef struct nvmlFieldValue_st
{
unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above.
unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId.
long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970
long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call.
nvmlValueType_t valueType; //!< Type of the value stored in value
nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS
nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
} nvmlFieldValue_t;
/* End of nvml.h */
#endif // NCCL_NVML_DIRECT
// Capacity of the global device tables declared below.
constexpr int ncclNvmlMaxDevices = 32;
// Per-device entry: the NVML handle plus the compute capability (major, minor).
struct ncclNvmlDeviceInfo {
nvmlDevice_t handle;
int computeCapabilityMajor, computeCapabilityMinor;
};
// Per-device-pair entry: P2P status for the read and write capabilities.
struct ncclNvmlDevicePairInfo {
nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite;
};
// Global tables, defined in another translation unit.
// NOTE(review): presumably ncclNvmlDeviceCount is the number of valid entries
// in ncclNvmlDevices, and the pair table is indexed [device][peer] - confirm.
extern int ncclNvmlDeviceCount;
extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
// Outsiders need only call it if they want to inspect the ncclNvml global
// tables above.
ncclResult_t ncclNvmlEnsureInitialized();
// The functions below return an ncclResult_t status and deliver their result
// through the trailing pointer parameter(s).
// NOTE(review): by name each one wraps the NVML call nvmlDeviceGet<Suffix> - confirm in the implementation.
ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult);
ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
// values is an array of valuesCount entries; each entry carries its own
// fieldId on input and its own nvmlReturn on output.
// NOTE(review): the array interpretation follows the NVML API of the same name - confirm.
ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
#endif // End include guard
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment