Initial commit: RCCL auto-tuning project

7dc4e964 · wanghan · 7dc4e964 · 7dc4e964 · 7dc4e964 · 7dc4e964
Commit 7dc4e964 authored Apr 02, 2026 by wanghan
20 changed files
--- a/rccl/src/include/hipprof/hip_profile_common.h
+++ b/rccl/src/include/hipprof/hip_profile_common.h
+/**
+ *  @file:   hip_profile_common.h
+ *  @brief:  Common definitions shared by hip_prof and profile.
+ *  @author: lizhigong
+ *  @date:   2021/05/11
+ */
+#ifndef HIP_HIP_PROFILE_COMMON_H
+#define HIP_HIP_PROFILE_COMMON_H
+
+#include <cstdint>
+#include <map>
+#include <bitset>
+#include <cstring>
+
+/* 32-bit tag stored in the `kind` field of a profiling record. */
+typedef uint32_t activity_kind_t;
+/* 64-bit identifier stored in the `correlation_id` field of a profiling record. */
+typedef uint64_t activity_correlation_id_t;
+
+/*
+ * Profiling record for a single RCCL operation.
+ *
+ * The record holds the call arguments, a group of fields marked "Computed
+ * later", host begin/end timestamps, and an identity section. The identity
+ * section is an anonymous union that overlays {device_id, queue_id} with
+ * {process_id, thread_id}, so only one of the two pairs can be meaningful
+ * for a given record.
+ * NOTE(review): presumably `kind` selects which pair is valid -- confirm
+ * against the code that fills the record.
+ */
+struct hip_prof_rccl_entry {
+    activity_kind_t kind;
+    uint32_t cid;
+    // NCCL Coll Args
+    const void* sendbuff;
+    void* recvbuff;
+    size_t count;
+    uint32_t datatype;
+    uint32_t op;
+    int rid; // peer for p2p operations
+    // Computed later
+    int algorithm;
+    int protocol;
+    uint8_t pattern;
+    int nChannels;
+    int nThreads;
+    size_t nBytes;
+    int chunkSize;
+    int channelId;
+    uint64_t elapsed; // NOTE(review): unit not visible here (ns would match begin_ns/end_ns) -- confirm
+    struct {
+        activity_correlation_id_t correlation_id; /* activity ID uint64_t */
+        uint64_t begin_ns; /* host begin timestamp uint64_t*/
+        uint64_t end_ns; /* host end timestamp uint64_t */
+    };
+    union {
+        struct {
+            int device_id; /* device id */
+            uint64_t queue_id; /* queue id */
+        };
+        struct {
+            uint32_t process_id; /* process id */
+            uint32_t thread_id; /* thread id */
+        };
+    };
+    uint32_t ret_stat;
+    const char* kernel_name;
+    double BW_GBps; // bandwidth in GB/s: (nBytes/1.0E9)/[(end_ns-begin_ns)/1.0E9]
+};
+
+/* Profiling status codes: 0 means success, every failure code is negative. */
+typedef enum {
+    STATUS_SUCCESS = 0,
+    STATUS_ERROR = -1,
+    GET_TIMESTAMP_FAILED = -2,
+    INVALID_KIND = -3,
+    INVALID_OP = -4,
+    API_RETURN_ERROR = -5,
+} prof_error_t;
+
+/*
+ * rccl entry kinds.
+ * NOTE(review): RCCL_KIND_ID_NUMBER equals the count of the kinds listed
+ * before it, so it looks like a "number of kinds" sentinel rather than a
+ * real kind -- confirm.
+ */
+typedef enum {
+    RCCL_KIND_ID_KERNEL = 0,
+    RCCL_KIND_ID_API = 1,
+    RCCL_KIND_ID_NUMBER =2
+} entry_kind_t;
+
+#endif // HIP_HIP_PROFILE_COMMON_H
\ No newline at end of file
--- a/rccl/src/include/hsa_extra.h
+++ b/rccl/src/include/hsa_extra.h
+/* Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE. */
+#ifndef NCCL_HSA_EXTRA_H_
+#define NCCL_HSA_EXTRA_H_
+#include "hsa/hsa.h"
+#include <dlfcn.h>
+#include <mutex>
+
+#ifdef HYGON_SDMA_FEATURE
+#ifdef DEFINE_SDMA_STRUCT
+/*
+ * Per-queue SDMA descriptor; hsa_sdma_group_queue_t holds an array of
+ * pointers to these.
+ * NOTE(review): the meaning of each field is not established in this header;
+ * the names suggest signal handles, a transfer (src/dst/size), start/end
+ * timestamps and ring write/read pointers -- confirm against the runtime.
+ */
+typedef struct hsa_sdma_info_s {
+  uint32_t queue_id;
+  uint32_t dep_signal;
+  uint32_t completion_signal;
+  uint32_t data_size;
+  uint64_t src_addr;
+  uint64_t dst_addr;
+  uint64_t start_ts;
+  uint64_t end_ts;
+  uint64_t *wptr;
+  uint64_t *rptr;
+  uint64_t reserved[8];
+} hsa_sdma_info_t;
+
+/*
+ * SDMA group-queue flag values.
+ * NOTE(review): presumably passed as the `flag` argument of
+ * hsa_ext_create_sdma_group_queue() -- confirm.
+ */
+typedef enum hsa_sdma_group_queue_flag_s {
+  HSA_SDMA_GROUP_QUEUE_FLAG_DEFAULT   = 0,
+  HSA_SDMA_GROUP_QUEUE_FLAG_PROFILING = 1
+} hsa_sdma_group_queue_flag_t;
+
+/* Capacity of the sdma_info[] pointer array in hsa_sdma_group_queue_t. */
+#define HSA_MAX_SDMA_QUEUE_NUM  7
+
+/*
+ * A group of SDMA queues: up to HSA_MAX_SDMA_QUEUE_NUM descriptor pointers.
+ * NOTE(review): queue_count is presumably the number of valid leading
+ * entries in sdma_info[] -- confirm.
+ */
+typedef struct hsa_sdma_group_queue_s {
+  uint32_t queue_count;
+  hsa_sdma_info_t *sdma_info[HSA_MAX_SDMA_QUEUE_NUM];
+} hsa_sdma_group_queue_t;
+
+/*
+ * Prototypes of the SDMA extension entry points; they are only declared when
+ * DEFINE_SDMA_STRUCT is defined. No definitions appear in this header.
+ *
+ * NOTE(review): the second parameter here is `uint32_t count`, while the
+ * function-pointer type hsa_ext_submit_sdma_task_t in this header declares
+ * it as `int flags`. Confirm which matches the runtime library.
+ */
+hsa_status_t hsa_ext_submit_sdma_task(hsa_agent_t agent, uint32_t count);
+
+/* Writes a link count for the (src_agent, dst_agent) pair through link_count. */
+hsa_status_t hsa_ext_get_xhcl_link_count(hsa_agent_t src_agent,
+  hsa_agent_t dst_agent, uint32_t *link_count);
+
+/* NOTE(review): units of `size` and the accepted `flag` values are not shown here -- confirm. */
+hsa_status_t hsa_ext_create_sdma_group_queue(hsa_agent_t src_agent,
+  hsa_agent_t dst_agent, uint32_t size, uint32_t flag,
+  hsa_sdma_group_queue_t *group_queue);
+
+hsa_status_t hsa_ext_destroy_sdma_group_queue(hsa_agent_t agent);
+
+#endif
+
+/*
+ * Function-pointer types for HSA runtime entry points.
+ * NOTE(review): presumably used with dlsym() on the handle returned by
+ * getHsaLib() -- confirm at the call sites.
+ *
+ * NOTE(review): hsa_ext_submit_sdma_task_t takes `int flags`, but the
+ * hsa_ext_submit_sdma_task() prototype in this header takes `uint32_t count`.
+ * Calling through a pointer whose type differs from the real function is
+ * undefined behaviour; confirm the library signature and align the two.
+ *
+ * hsa_ext_create_sdma_group_queue_t needs hsa_sdma_group_queue_t, which this
+ * header only defines when DEFINE_SDMA_STRUCT is set; otherwise the type has
+ * to come from "hsa/hsa.h".
+ */
+typedef hsa_status_t (*hsa_ext_submit_sdma_task_t)(hsa_agent_t agent, int flags);
+typedef hsa_status_t (*hsa_ext_destroy_sdma_group_queue_t)(hsa_agent_t agent);
+typedef hsa_status_t (*hsa_agent_get_info_t)(hsa_agent_t agent,hsa_agent_info_t attribute,void* value);
+typedef hsa_status_t (*hsa_agent_callback_t)(hsa_agent_t agent, void* data);
+typedef hsa_status_t (*hsa_iterate_agents_t)(hsa_status_t (*callback)(hsa_agent_t agent, void* data),void* data);
+typedef hsa_status_t (*hsa_ext_get_xhcl_link_count_t)(hsa_agent_t src_agent,hsa_agent_t dst_agent, uint32_t *link_count);
+typedef hsa_status_t (*hsa_ext_create_sdma_group_queue_t)(hsa_agent_t src_agent,hsa_agent_t dst_agent, uint32_t size, uint32_t flag,hsa_sdma_group_queue_t *group_queue);
+
+/*
+ * Cached dlopen() handle for the HSA runtime (nullptr until loaded) and the
+ * mutex used by getHsaLib() while initialising it.
+ * Both objects are `static` in a header, so every translation unit that
+ * includes this file gets its own private copy of the handle and the mutex.
+ */
+static void *hsaLib = nullptr;
+static std::mutex hsaMtx;
+
+/*
+ * Open the ROCm HSA runtime shared library with dlopen().
+ *
+ * If the RCCL_ROCR_PATH environment variable is set, the library is opened
+ * as "<RCCL_ROCR_PATH>/libhsa-runtime64.so"; otherwise the bare file name is
+ * passed to dlopen() and the dynamic loader's search path is used.
+ *
+ * Returns the dlopen() handle, or nullptr after printing a diagnostic to
+ * stderr when the path does not fit the local buffer or dlopen() fails.
+ */
+static void* loadHsaLib() {
+  static const char kHsaLibName[] = "libhsa-runtime64.so";
+  char path[1024];
+  const char *ncclRocrPath = getenv("RCCL_ROCR_PATH");
+  /* Never hand a null pointer to "%s": that is undefined behaviour. */
+  const char *rocrPathForLog = ncclRocrPath ? ncclRocrPath : "(null)";
+  int len;
+
+  if (ncclRocrPath == NULL) {
+    len = snprintf(path, sizeof(path), "%s", kHsaLibName);
+  } else {
+    len = snprintf(path, sizeof(path), "%s/%s", ncclRocrPath, kHsaLibName);
+  }
+
+  /* Refuse to dlopen() a silently truncated path. */
+  if (len < 0 || (size_t)len >= sizeof(path)) {
+    fprintf(stderr, "ROCm runtime library path is too long (RCCL_ROCR_PATH=%s)\n",
+      rocrPathForLog);
+    return nullptr;
+  }
+
+  /* Local handle deliberately not named hsaLib, to avoid shadowing the file-scope cache. */
+  void* handle = dlopen(path, RTLD_LAZY);
+  if (handle == nullptr) {
+    const char *error = dlerror();
+    fprintf(stderr, "Failed to find ROCm runtime library in %s (RCCL_ROCR_PATH=%s): %s\n",
+      path, rocrPathForLog, error ? error : "Unknown error");
+    return nullptr;
+  }
+
+  return handle;
+}
+
+/*
+ * Return the cached HSA runtime handle, loading the library on first use.
+ *
+ * The cache is read and written only while holding hsaMtx. The earlier
+ * version tested the plain (non-atomic) pointer before taking the lock,
+ * which is a data race with the store made under the lock.
+ *
+ * Returns nullptr if loading failed; the handle is not cached in that case,
+ * so the next call tries again.
+ */
+static void* getHsaLib() {
+  std::lock_guard<std::mutex> lock(hsaMtx);
+  if (hsaLib == nullptr) {
+    hsaLib = loadHsaLib();
+  }
+  return hsaLib;
+}
+
+#endif
+#endif 
--- a/rccl/src/include/ibvcore.h
+++ b/rccl/src/include/ibvcore.h
+#ifndef NCCL_IBV_CORE_H_
+#define NCCL_IBV_CORE_H_
+
+/* Basic IB verbs structs. Needed to dynamically load IB verbs functions without
+ * explicit including of IB verbs header.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#if __GNUC__ >= 3
+#  define __attribute_const __attribute__((const))
+#else
+#  define __attribute_const
+#endif
+
+union ibv_gid {
+	uint8_t			raw[16];
+	struct {
+		uint64_t	subnet_prefix;
+		uint64_t	interface_id;
+	} global;
+};
+
+#ifndef container_of
+/**
+  * container_of - cast a member of a structure out to the containing structure
+  * @ptr:        the pointer to the member.
+  * @type:       the type of the container struct this is embedded in.
+  * @member:     the name of the member within the struct.
+  *
+ */
+#define container_of(ptr, type, member) \
+	((type *) ((uint8_t *)(ptr) - offsetof(type, member)))
+#endif
+
+/* Non-zero when member `fld` of `type` starts before byte offset `sz`, i.e. offsetof(type, fld) < sz. */
+#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz))
+
+/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
+//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1;
+
+enum ibv_node_type {
+	IBV_NODE_UNKNOWN	= -1,
+	IBV_NODE_CA 		= 1,
+	IBV_NODE_SWITCH,
+	IBV_NODE_ROUTER,
+	IBV_NODE_RNIC,
+
+	/* Leave a gap for future node types before starting with
+	 * experimental node types.
+	 */
+	IBV_EXP_NODE_TYPE_START	= 32,
+	IBV_EXP_NODE_MIC	= IBV_EXP_NODE_TYPE_START
+};
+
+enum ibv_transport_type {
+	IBV_TRANSPORT_UNKNOWN	= -1,
+	IBV_TRANSPORT_IB	= 0,
+	IBV_TRANSPORT_IWARP,
+
+	/* Leave a gap for future transport types before starting with
+	 * experimental transport types.
+	 */
+	IBV_EXP_TRANSPORT_TYPE_START	= 32,
+	IBV_EXP_TRANSPORT_SCIF		= IBV_EXP_TRANSPORT_TYPE_START
+};
+
+enum ibv_device_cap_flags {
+	IBV_DEVICE_RESIZE_MAX_WR	= 1,
+	IBV_DEVICE_BAD_PKEY_CNTR	= 1 <<  1,
+	IBV_DEVICE_BAD_QKEY_CNTR	= 1 <<  2,
+	IBV_DEVICE_RAW_MULTI		= 1 <<  3,
+	IBV_DEVICE_AUTO_PATH_MIG	= 1 <<  4,
+	IBV_DEVICE_CHANGE_PHY_PORT	= 1 <<  5,
+	IBV_DEVICE_UD_AV_PORT_ENFORCE	= 1 <<  6,
+	IBV_DEVICE_CURR_QP_STATE_MOD	= 1 <<  7,
+	IBV_DEVICE_SHUTDOWN_PORT	= 1 <<  8,
+	IBV_DEVICE_INIT_TYPE		= 1 <<  9,
+	IBV_DEVICE_PORT_ACTIVE_EVENT	= 1 << 10,
+	IBV_DEVICE_SYS_IMAGE_GUID	= 1 << 11,
+	IBV_DEVICE_RC_RNR_NAK_GEN	= 1 << 12,
+	IBV_DEVICE_SRQ_RESIZE		= 1 << 13,
+	IBV_DEVICE_N_NOTIFY_CQ		= 1 << 14,
+	IBV_DEVICE_XRC			= 1 << 20,
+	IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29
+};
+
+enum ibv_atomic_cap {
+	IBV_ATOMIC_NONE,
+	IBV_ATOMIC_HCA,
+	IBV_ATOMIC_GLOB
+};
+
+struct ibv_device_attr {
+	char			fw_ver[64];
+	uint64_t		node_guid;
+	uint64_t		sys_image_guid;
+	uint64_t		max_mr_size;
+	uint64_t		page_size_cap;
+	uint32_t		vendor_id;
+	uint32_t		vendor_part_id;
+	uint32_t		hw_ver;
+	int			max_qp;
+	int			max_qp_wr;
+	int			device_cap_flags;
+	int			max_sge;
+	int			max_sge_rd;
+	int			max_cq;
+	int			max_cqe;
+	int			max_mr;
+	int			max_pd;
+	int			max_qp_rd_atom;
+	int			max_ee_rd_atom;
+	int			max_res_rd_atom;
+	int			max_qp_init_rd_atom;
+	int			max_ee_init_rd_atom;
+	enum ibv_atomic_cap	atomic_cap;
+	int			max_ee;
+	int			max_rdd;
+	int			max_mw;
+	int			max_raw_ipv6_qp;
+	int			max_raw_ethy_qp;
+	int			max_mcast_grp;
+	int			max_mcast_qp_attach;
+	int			max_total_mcast_qp_attach;
+	int			max_ah;
+	int			max_fmr;
+	int			max_map_per_fmr;
+	int			max_srq;
+	int			max_srq_wr;
+	int			max_srq_sge;
+	uint16_t		max_pkeys;
+	uint8_t			local_ca_ack_delay;
+	uint8_t			phys_port_cnt;
+};
+
+/*
+ * MTU codes. The enumerator names carry the size (256 ... 4096) while the
+ * values are the small codes 1..5, not the sizes themselves.
+ */
+enum ibv_mtu {
+	IBV_MTU_256  = 1,
+	IBV_MTU_512  = 2,
+	IBV_MTU_1024 = 3,
+	IBV_MTU_2048 = 4,
+	IBV_MTU_4096 = 5
+};
+
+enum ibv_port_state {
+	IBV_PORT_NOP		= 0,
+	IBV_PORT_DOWN		= 1,
+	IBV_PORT_INIT		= 2,
+	IBV_PORT_ARMED		= 3,
+	IBV_PORT_ACTIVE		= 4,
+	IBV_PORT_ACTIVE_DEFER	= 5
+};
+
+enum {
+	IBV_LINK_LAYER_UNSPECIFIED,
+	IBV_LINK_LAYER_INFINIBAND,
+	IBV_LINK_LAYER_ETHERNET,
+
+	/* Leave a gap for future link layer types before starting with
+	 * experimental link layer.
+	 */
+	IBV_EXP_LINK_LAYER_START	= 32,
+	IBV_EXP_LINK_LAYER_SCIF		= IBV_EXP_LINK_LAYER_START
+};
+
+enum ibv_port_cap_flags {
+	IBV_PORT_SM				= 1 <<  1,
+	IBV_PORT_NOTICE_SUP			= 1 <<  2,
+	IBV_PORT_TRAP_SUP			= 1 <<  3,
+	IBV_PORT_OPT_IPD_SUP			= 1 <<  4,
+	IBV_PORT_AUTO_MIGR_SUP			= 1 <<  5,
+	IBV_PORT_SL_MAP_SUP			= 1 <<  6,
+	IBV_PORT_MKEY_NVRAM			= 1 <<  7,
+	IBV_PORT_PKEY_NVRAM			= 1 <<  8,
+	IBV_PORT_LED_INFO_SUP			= 1 <<  9,
+	IBV_PORT_SYS_IMAGE_GUID_SUP		= 1 << 11,
+	IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP	= 1 << 12,
+	IBV_PORT_EXTENDED_SPEEDS_SUP		= 1 << 14,
+	IBV_PORT_CM_SUP				= 1 << 16,
+	IBV_PORT_SNMP_TUNNEL_SUP		= 1 << 17,
+	IBV_PORT_REINIT_SUP			= 1 << 18,
+	IBV_PORT_DEVICE_MGMT_SUP		= 1 << 19,
+	IBV_PORT_VENDOR_CLASS			= 1 << 24,
+	IBV_PORT_CLIENT_REG_SUP			= 1 << 25,
+	IBV_PORT_IP_BASED_GIDS			= 1 << 26,
+};
+
+struct ibv_port_attr {
+	enum ibv_port_state	state;
+	enum ibv_mtu		max_mtu;
+	enum ibv_mtu		active_mtu;
+	int			gid_tbl_len;
+	uint32_t		port_cap_flags;
+	uint32_t		max_msg_sz;
+	uint32_t		bad_pkey_cntr;
+	uint32_t		qkey_viol_cntr;
+	uint16_t		pkey_tbl_len;
+	uint16_t		lid;
+	uint16_t		sm_lid;
+	uint8_t			lmc;
+	uint8_t			max_vl_num;
+	uint8_t			sm_sl;
+	uint8_t			subnet_timeout;
+	uint8_t			init_type_reply;
+	uint8_t			active_width;
+	uint8_t			active_speed;
+	uint8_t			phys_state;
+	uint8_t			link_layer;
+	uint8_t			reserved;
+};
+
+enum ibv_event_type {
+	IBV_EVENT_CQ_ERR,
+	IBV_EVENT_QP_FATAL,
+	IBV_EVENT_QP_REQ_ERR,
+	IBV_EVENT_QP_ACCESS_ERR,
+	IBV_EVENT_COMM_EST,
+	IBV_EVENT_SQ_DRAINED,
+	IBV_EVENT_PATH_MIG,
+	IBV_EVENT_PATH_MIG_ERR,
+	IBV_EVENT_DEVICE_FATAL,
+	IBV_EVENT_PORT_ACTIVE,
+	IBV_EVENT_PORT_ERR,
+	IBV_EVENT_LID_CHANGE,
+	IBV_EVENT_PKEY_CHANGE,
+	IBV_EVENT_SM_CHANGE,
+	IBV_EVENT_SRQ_ERR,
+	IBV_EVENT_SRQ_LIMIT_REACHED,
+	IBV_EVENT_QP_LAST_WQE_REACHED,
+	IBV_EVENT_CLIENT_REREGISTER,
+	IBV_EVENT_GID_CHANGE,
+
+	/* new experimental events start here leaving enough
+	 * room for 14 events which should be enough
+	 */
+	IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32,
+	IBV_EXP_EVENT_DCT_ACCESS_ERR,
+	IBV_EXP_EVENT_DCT_REQ_ERR,
+};
+
+struct ibv_async_event {
+	union {
+		struct ibv_cq  *cq;
+		struct ibv_qp  *qp;
+		struct ibv_srq *srq;
+		struct ibv_exp_dct *dct;
+		int		port_num;
+		/* For source compatible with Legacy API */
+		uint32_t	xrc_qp_num;
+	} element;
+	enum ibv_event_type	event_type;
+};
+
+enum ibv_wc_status {
+	IBV_WC_SUCCESS,
+	IBV_WC_LOC_LEN_ERR,
+	IBV_WC_LOC_QP_OP_ERR,
+	IBV_WC_LOC_EEC_OP_ERR,
+	IBV_WC_LOC_PROT_ERR,
+	IBV_WC_WR_FLUSH_ERR,
+	IBV_WC_MW_BIND_ERR,
+	IBV_WC_BAD_RESP_ERR,
+	IBV_WC_LOC_ACCESS_ERR,
+	IBV_WC_REM_INV_REQ_ERR,
+	IBV_WC_REM_ACCESS_ERR,
+	IBV_WC_REM_OP_ERR,
+	IBV_WC_RETRY_EXC_ERR,
+	IBV_WC_RNR_RETRY_EXC_ERR,
+	IBV_WC_LOC_RDD_VIOL_ERR,
+	IBV_WC_REM_INV_RD_REQ_ERR,
+	IBV_WC_REM_ABORT_ERR,
+	IBV_WC_INV_EECN_ERR,
+	IBV_WC_INV_EEC_STATE_ERR,
+	IBV_WC_FATAL_ERR,
+	IBV_WC_RESP_TIMEOUT_ERR,
+	IBV_WC_GENERAL_ERR
+};
+const char *ibv_wc_status_str(enum ibv_wc_status status);
+
+/*
+ * Work-completion opcodes (the `opcode` member of struct ibv_wc).
+ * IBV_WC_SEND .. IBV_WC_BIND_MW take the values 0..5; IBV_WC_RECV is 128 and
+ * IBV_WC_RECV_RDMA_WITH_IMM is 129, so both have bit 7 set.
+ */
+enum ibv_wc_opcode {
+	IBV_WC_SEND,
+	IBV_WC_RDMA_WRITE,
+	IBV_WC_RDMA_READ,
+	IBV_WC_COMP_SWAP,
+	IBV_WC_FETCH_ADD,
+	IBV_WC_BIND_MW,
+/*
+ * Set value of IBV_WC_RECV so consumers can test if a completion is a
+ * receive by testing (opcode & IBV_WC_RECV).
+ */
+	IBV_WC_RECV			= 1 << 7,
+	IBV_WC_RECV_RDMA_WITH_IMM
+};
+
+enum ibv_wc_flags {
+	IBV_WC_GRH		= 1 << 0,
+	IBV_WC_WITH_IMM		= 1 << 1
+};
+
+struct ibv_wc {
+	uint64_t		wr_id;
+	enum ibv_wc_status	status;
+	enum ibv_wc_opcode	opcode;
+	uint32_t		vendor_err;
+	uint32_t		byte_len;
+	uint32_t		imm_data;	/* in network byte order */
+	uint32_t		qp_num;
+	uint32_t		src_qp;
+	int			wc_flags;
+	uint16_t		pkey_index;
+	uint16_t		slid;
+	uint8_t			sl;
+	uint8_t			dlid_path_bits;
+};
+
+enum ibv_access_flags {
+	IBV_ACCESS_LOCAL_WRITE		= 1,
+	IBV_ACCESS_REMOTE_WRITE		= (1<<1),
+	IBV_ACCESS_REMOTE_READ		= (1<<2),
+	IBV_ACCESS_REMOTE_ATOMIC	= (1<<3),
+	IBV_ACCESS_MW_BIND		= (1<<4),
+	IBV_ACCESS_RELAXED_ORDERING     = (1<<20),
+};
+
+struct ibv_pd {
+	struct ibv_context     *context;
+	uint32_t		handle;
+};
+
+enum ibv_xrcd_init_attr_mask {
+	IBV_XRCD_INIT_ATTR_FD	    = 1 << 0,
+	IBV_XRCD_INIT_ATTR_OFLAGS   = 1 << 1,
+	IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2
+};
+
+struct ibv_xrcd_init_attr {
+	uint32_t comp_mask;
+	int	 fd;
+	int	 oflags;
+};
+
+struct ibv_xrcd {
+	struct ibv_context     *context;
+};
+
+enum ibv_rereg_mr_flags {
+	IBV_REREG_MR_CHANGE_TRANSLATION	= (1 << 0),
+	IBV_REREG_MR_CHANGE_PD		= (1 << 1),
+	IBV_REREG_MR_CHANGE_ACCESS	= (1 << 2),
+	IBV_REREG_MR_KEEP_VALID		= (1 << 3)
+};
+
+struct ibv_mr {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	void		       *addr;
+	size_t			length;
+	uint32_t		handle;
+	uint32_t		lkey;
+	uint32_t		rkey;
+};
+
+enum ibv_mw_type {
+	IBV_MW_TYPE_1			= 1,
+	IBV_MW_TYPE_2			= 2
+};
+
+struct ibv_mw {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	uint32_t		rkey;
+};
+
+struct ibv_global_route {
+	union ibv_gid		dgid;
+	uint32_t		flow_label;
+	uint8_t			sgid_index;
+	uint8_t			hop_limit;
+	uint8_t			traffic_class;
+};
+
+struct ibv_grh {
+	uint32_t		version_tclass_flow;
+	uint16_t		paylen;
+	uint8_t			next_hdr;
+	uint8_t			hop_limit;
+	union ibv_gid		sgid;
+	union ibv_gid		dgid;
+};
+
+/*
+ * Link rate codes. The numeric values are NOT ordered by speed (for example
+ * IBV_RATE_5_GBPS is 5 while IBV_RATE_10_GBPS is 3), so do not compare two
+ * rates with < or >.
+ */
+enum ibv_rate {
+	IBV_RATE_MAX      = 0,
+	IBV_RATE_2_5_GBPS = 2,
+	IBV_RATE_5_GBPS   = 5,
+	IBV_RATE_10_GBPS  = 3,
+	IBV_RATE_20_GBPS  = 6,
+	IBV_RATE_30_GBPS  = 4,
+	IBV_RATE_40_GBPS  = 7,
+	IBV_RATE_60_GBPS  = 8,
+	IBV_RATE_80_GBPS  = 9,
+	IBV_RATE_120_GBPS = 10,
+	IBV_RATE_14_GBPS  = 11,
+	IBV_RATE_56_GBPS  = 12,
+	IBV_RATE_112_GBPS = 13,
+	IBV_RATE_168_GBPS = 14,
+	IBV_RATE_25_GBPS  = 15,
+	IBV_RATE_100_GBPS = 16,
+	IBV_RATE_200_GBPS = 17,
+	IBV_RATE_300_GBPS = 18
+};
+
+/**
+ * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the
+ * base rate of 2.5 Gbit/sec.  For example, IBV_RATE_5_GBPS will be
+ * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
+ * @rate: rate to convert.
+ */
+int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const;
+
+/**
+ * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum.
+ * @mult: multiple to convert.
+ */
+enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const;
+
+/**
+ * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec.
+ * For example, IBV_RATE_5_GBPS will return the value 5000.
+ * @rate: rate to convert.
+ */
+int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const;
+
+/**
+ * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum.
+ * @mbps: value to convert.
+ */
+enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const;
+
+struct ibv_ah_attr {
+	struct ibv_global_route	grh;
+	uint16_t		dlid;
+	uint8_t			sl;
+	uint8_t			src_path_bits;
+	uint8_t			static_rate;
+	uint8_t			is_global;
+	uint8_t			port_num;
+};
+
+enum ibv_srq_attr_mask {
+	IBV_SRQ_MAX_WR	= 1 << 0,
+	IBV_SRQ_LIMIT	= 1 << 1
+};
+
+struct ibv_srq_attr {
+	uint32_t		max_wr;
+	uint32_t		max_sge;
+	uint32_t		srq_limit;
+};
+
+struct ibv_srq_init_attr {
+	void		       *srq_context;
+	struct ibv_srq_attr	attr;
+};
+
+enum ibv_srq_type {
+	IBV_SRQT_BASIC,
+	IBV_SRQT_XRC
+};
+
+enum ibv_srq_init_attr_mask {
+	IBV_SRQ_INIT_ATTR_TYPE		= 1 << 0,
+	IBV_SRQ_INIT_ATTR_PD		= 1 << 1,
+	IBV_SRQ_INIT_ATTR_XRCD		= 1 << 2,
+	IBV_SRQ_INIT_ATTR_CQ		= 1 << 3,
+	IBV_SRQ_INIT_ATTR_RESERVED	= 1 << 4
+};
+
+struct ibv_srq_init_attr_ex {
+	void		       *srq_context;
+	struct ibv_srq_attr	attr;
+
+	uint32_t		comp_mask;
+	enum ibv_srq_type	srq_type;
+	struct ibv_pd	       *pd;
+	struct ibv_xrcd	       *xrcd;
+	struct ibv_cq	       *cq;
+};
+
+/*
+ * Queue pair types. Numbering starts at 2 (RC=2, UC=3, UD=4, XRC=5).
+ * IBV_QPT_RAW_PACKET and IBV_QPT_RAW_ETH are two names for the same value 8,
+ * so they cannot both be used as `case` labels in one switch.
+ * IBV_QPT_XRC_RECV follows IBV_QPT_XRC_SEND and is therefore 10.
+ */
+enum ibv_qp_type {
+	IBV_QPT_RC = 2,
+	IBV_QPT_UC,
+	IBV_QPT_UD,
+	/* XRC compatible code */
+	IBV_QPT_XRC,
+	IBV_QPT_RAW_PACKET = 8,
+	IBV_QPT_RAW_ETH = 8,
+	IBV_QPT_XRC_SEND = 9,
+	IBV_QPT_XRC_RECV,
+
+	/* Leave a gap for future qp types before starting with
+	 * experimental qp types.
+	 */
+	IBV_EXP_QP_TYPE_START	= 32,
+	IBV_EXP_QPT_DC_INI	= IBV_EXP_QP_TYPE_START
+};
+
+struct ibv_qp_cap {
+	uint32_t		max_send_wr;
+	uint32_t		max_recv_wr;
+	uint32_t		max_send_sge;
+	uint32_t		max_recv_sge;
+	uint32_t		max_inline_data;
+};
+
+/* Queue pair creation attributes; this is the type taken by create_qp in struct ibv_context_ops. */
+struct ibv_qp_init_attr {
+	void		       *qp_context;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	struct ibv_qp_cap	cap;
+	enum ibv_qp_type	qp_type;
+	int			sq_sig_all;
+	/* Below is needed for backwards compatibility */
+	struct ibv_xrc_domain  *xrc_domain;
+};
+
+enum ibv_qp_init_attr_mask {
+	IBV_QP_INIT_ATTR_PD		= 1 << 0,
+	IBV_QP_INIT_ATTR_XRCD		= 1 << 1,
+	IBV_QP_INIT_ATTR_RESERVED	= 1 << 2
+};
+
+struct ibv_qp_init_attr_ex {
+	void		       *qp_context;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	struct ibv_qp_cap	cap;
+	enum ibv_qp_type	qp_type;
+	int			sq_sig_all;
+
+	uint32_t		comp_mask;
+	struct ibv_pd	       *pd;
+	struct ibv_xrcd	       *xrcd;
+};
+
+enum ibv_qp_open_attr_mask {
+	IBV_QP_OPEN_ATTR_NUM		= 1 << 0,
+	IBV_QP_OPEN_ATTR_XRCD		= 1 << 1,
+	IBV_QP_OPEN_ATTR_CONTEXT	= 1 << 2,
+	IBV_QP_OPEN_ATTR_TYPE		= 1 << 3,
+	IBV_QP_OPEN_ATTR_RESERVED	= 1 << 4
+};
+
+struct ibv_qp_open_attr {
+	uint32_t		comp_mask;
+	uint32_t		qp_num;
+	struct ibv_xrcd        *xrcd;
+	void		       *qp_context;
+	enum ibv_qp_type	qp_type;
+};
+
+enum ibv_qp_attr_mask {
+	IBV_QP_STATE			= 1 << 	0,
+	IBV_QP_CUR_STATE		= 1 << 	1,
+	IBV_QP_EN_SQD_ASYNC_NOTIFY	= 1 << 	2,
+	IBV_QP_ACCESS_FLAGS		= 1 << 	3,
+	IBV_QP_PKEY_INDEX		= 1 << 	4,
+	IBV_QP_PORT			= 1 << 	5,
+	IBV_QP_QKEY			= 1 << 	6,
+	IBV_QP_AV			= 1 << 	7,
+	IBV_QP_PATH_MTU			= 1 << 	8,
+	IBV_QP_TIMEOUT			= 1 << 	9,
+	IBV_QP_RETRY_CNT		= 1 << 10,
+	IBV_QP_RNR_RETRY		= 1 << 11,
+	IBV_QP_RQ_PSN			= 1 << 12,
+	IBV_QP_MAX_QP_RD_ATOMIC		= 1 << 13,
+	IBV_QP_ALT_PATH			= 1 << 14,
+	IBV_QP_MIN_RNR_TIMER		= 1 << 15,
+	IBV_QP_SQ_PSN			= 1 << 16,
+	IBV_QP_MAX_DEST_RD_ATOMIC	= 1 << 17,
+	IBV_QP_PATH_MIG_STATE		= 1 << 18,
+	IBV_QP_CAP			= 1 << 19,
+	IBV_QP_DEST_QPN			= 1 << 20
+};
+
+enum ibv_qp_state {
+	IBV_QPS_RESET,
+	IBV_QPS_INIT,
+	IBV_QPS_RTR,
+	IBV_QPS_RTS,
+	IBV_QPS_SQD,
+	IBV_QPS_SQE,
+	IBV_QPS_ERR,
+	IBV_QPS_UNKNOWN
+};
+
+enum ibv_mig_state {
+	IBV_MIG_MIGRATED,
+	IBV_MIG_REARM,
+	IBV_MIG_ARMED
+};
+
+struct ibv_qp_attr {
+	enum ibv_qp_state	qp_state;
+	enum ibv_qp_state	cur_qp_state;
+	enum ibv_mtu		path_mtu;
+	enum ibv_mig_state	path_mig_state;
+	uint32_t		qkey;
+	uint32_t		rq_psn;
+	uint32_t		sq_psn;
+	uint32_t		dest_qp_num;
+	int			qp_access_flags;
+	struct ibv_qp_cap	cap;
+	struct ibv_ah_attr	ah_attr;
+	struct ibv_ah_attr	alt_ah_attr;
+	uint16_t		pkey_index;
+	uint16_t		alt_pkey_index;
+	uint8_t			en_sqd_async_notify;
+	uint8_t			sq_draining;
+	uint8_t			max_rd_atomic;
+	uint8_t			max_dest_rd_atomic;
+	uint8_t			min_rnr_timer;
+	uint8_t			port_num;
+	uint8_t			timeout;
+	uint8_t			retry_cnt;
+	uint8_t			rnr_retry;
+	uint8_t			alt_port_num;
+	uint8_t			alt_timeout;
+};
+
+enum ibv_wr_opcode {
+	IBV_WR_RDMA_WRITE,
+	IBV_WR_RDMA_WRITE_WITH_IMM,
+	IBV_WR_SEND,
+	IBV_WR_SEND_WITH_IMM,
+	IBV_WR_RDMA_READ,
+	IBV_WR_ATOMIC_CMP_AND_SWP,
+	IBV_WR_ATOMIC_FETCH_AND_ADD
+};
+
+enum ibv_send_flags {
+	IBV_SEND_FENCE		= 1 << 0,
+	IBV_SEND_SIGNALED	= 1 << 1,
+	IBV_SEND_SOLICITED	= 1 << 2,
+	IBV_SEND_INLINE		= 1 << 3
+};
+
+struct ibv_sge {
+	uint64_t		addr;
+	uint32_t		length;
+	uint32_t		lkey;
+};
+
+struct ibv_send_wr {
+	uint64_t		wr_id;
+	struct ibv_send_wr     *next;
+	struct ibv_sge	       *sg_list;
+	int			num_sge;
+	enum ibv_wr_opcode	opcode;
+	int			send_flags;
+	uint32_t		imm_data;	/* in network byte order */
+	union {
+		struct {
+			uint64_t	remote_addr;
+			uint32_t	rkey;
+		} rdma;
+		struct {
+			uint64_t	remote_addr;
+			uint64_t	compare_add;
+			uint64_t	swap;
+			uint32_t	rkey;
+		} atomic;
+		struct {
+			struct ibv_ah  *ah;
+			uint32_t	remote_qpn;
+			uint32_t	remote_qkey;
+		} ud;
+	} wr;
+	union {
+		union {
+			struct {
+				uint32_t    remote_srqn;
+			} xrc;
+		} qp_type;
+
+		uint32_t		xrc_remote_srq_num;
+	};
+};
+
+struct ibv_recv_wr {
+	uint64_t		wr_id;
+	struct ibv_recv_wr     *next;
+	struct ibv_sge	       *sg_list;
+	int			num_sge;
+};
+
+struct ibv_mw_bind {
+	uint64_t		wr_id;
+	struct ibv_mr	       *mr;
+	void		       *addr;
+	size_t			length;
+	int			send_flags;
+	int			mw_access_flags;
+};
+
+/*
+ * Shared receive queue object. The layout mirrors the library's struct and
+ * must not be reordered: the *_bin_compat_padding members only reserve space
+ * and the trailing "legacy fields" are kept for source compatibility.
+ */
+struct ibv_srq {
+	struct ibv_context     *context;
+	void		       *srq_context;
+	struct ibv_pd	       *pd;
+	uint32_t		handle;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		events_completed;
+
+	/* The fields below are for source compatibility with legacy XRC;
+	 * padding is based on ibv_srq_legacy.
+	 */
+	uint32_t		xrc_srq_num_bin_compat_padding;
+	struct ibv_xrc_domain	*xrc_domain_bin_compat_padding;
+	struct ibv_cq	*xrc_cq_bin_compat_padding;
+	void		*ibv_srq_padding;
+
+	/* legacy fields */
+	uint32_t		xrc_srq_num;
+	struct ibv_xrc_domain	*xrc_domain;
+	struct ibv_cq		*xrc_cq;
+};
+
+/* Not in use in new API, needed for compilation as part of source compat layer */
+enum ibv_event_flags {
+	IBV_XRC_QP_EVENT_FLAG = 0x80000000,
+};
+
+
+
+struct ibv_qp {
+	struct ibv_context     *context;
+	void		       *qp_context;
+	struct ibv_pd	       *pd;
+	struct ibv_cq	       *send_cq;
+	struct ibv_cq	       *recv_cq;
+	struct ibv_srq	       *srq;
+	uint32_t		handle;
+	uint32_t		qp_num;
+	enum ibv_qp_state       state;
+	enum ibv_qp_type	qp_type;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		events_completed;
+};
+
+struct ibv_comp_channel {
+	struct ibv_context     *context;
+	int			fd;
+	int			refcnt;
+};
+
+struct ibv_cq {
+	struct ibv_context     *context;
+	struct ibv_comp_channel *channel;
+	void		       *cq_context;
+	uint32_t		handle;
+	int			cqe;
+
+	pthread_mutex_t		mutex;
+	pthread_cond_t		cond;
+	uint32_t		comp_events_completed;
+	uint32_t		async_events_completed;
+};
+
+struct ibv_ah {
+	struct ibv_context     *context;
+	struct ibv_pd	       *pd;
+	uint32_t		handle;
+};
+
+enum ibv_flow_flags {
+	IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1,
+	IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1,
+};
+
+enum ibv_flow_attr_type {
+	/* steering according to rule specifications */
+	IBV_FLOW_ATTR_NORMAL		= 0x0,
+	/* default unicast and multicast rule -
+	 * receive all Eth traffic which isn't steered to any QP
+	 */
+	IBV_FLOW_ATTR_ALL_DEFAULT	= 0x1,
+	/* default multicast rule -
+	 * receive all Eth multicast traffic which isn't steered to any QP
+	 */
+	IBV_FLOW_ATTR_MC_DEFAULT	= 0x2,
+};
+
+enum ibv_flow_spec_type {
+	IBV_FLOW_SPEC_ETH	= 0x20,
+	IBV_FLOW_SPEC_IPV4	= 0x30,
+	IBV_FLOW_SPEC_TCP	= 0x40,
+	IBV_FLOW_SPEC_UDP	= 0x41,
+};
+
+struct ibv_flow_eth_filter {
+	uint8_t		dst_mac[6];
+	uint8_t		src_mac[6];
+	uint16_t	ether_type;
+	/*
+	 * same layout as 802.1q: prio 3, cfi 1, vlan id 12
+	 */
+	uint16_t	vlan_tag;
+};
+
+struct ibv_flow_spec_eth {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_eth_filter val;
+	struct ibv_flow_eth_filter mask;
+};
+
+struct ibv_flow_ipv4_filter {
+	uint32_t src_ip;
+	uint32_t dst_ip;
+};
+
+struct ibv_flow_spec_ipv4 {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_ipv4_filter val;
+	struct ibv_flow_ipv4_filter mask;
+};
+
+struct ibv_flow_tcp_udp_filter {
+	uint16_t dst_port;
+	uint16_t src_port;
+};
+
+struct ibv_flow_spec_tcp_udp {
+	enum ibv_flow_spec_type  type;
+	uint16_t  size;
+	struct ibv_flow_tcp_udp_filter val;
+	struct ibv_flow_tcp_udp_filter mask;
+};
+
+struct ibv_flow_spec {
+	union {
+		struct {
+			enum ibv_flow_spec_type	type;
+			uint16_t		size;
+		} hdr;
+		struct ibv_flow_spec_eth eth;
+		struct ibv_flow_spec_ipv4 ipv4;
+		struct ibv_flow_spec_tcp_udp tcp_udp;
+	};
+};
+
+struct ibv_flow_attr {
+	uint32_t comp_mask;
+	enum ibv_flow_attr_type type;
+	uint16_t size;
+	uint16_t priority;
+	uint8_t num_of_specs;
+	uint8_t port;
+	uint32_t flags;
+	/* Following are the optional layers according to user request
+	 * struct ibv_flow_spec_xxx [L2]
+	 * struct ibv_flow_spec_yyy [L3/L4]
+	 */
+};
+
+struct ibv_flow {
+	uint32_t	   comp_mask;
+	struct ibv_context *context;
+	uint32_t	   handle;
+};
+
+struct ibv_device;
+struct ibv_context;
+
+struct ibv_device_ops {
+	struct ibv_context *	(*alloc_context)(struct ibv_device *device, int cmd_fd);
+	void			(*free_context)(struct ibv_context *context);
+};
+
+enum {
+	IBV_SYSFS_NAME_MAX	= 64,
+	IBV_SYSFS_PATH_MAX	= 256
+};
+
+struct ibv_device {
+	struct ibv_device_ops	ops;
+	enum ibv_node_type	node_type;
+	enum ibv_transport_type	transport_type;
+	/* Name of underlying kernel IB device, eg "mthca0" */
+	char			name[IBV_SYSFS_NAME_MAX];
+	/* Name of uverbs device, eg "uverbs0" */
+	char			dev_name[IBV_SYSFS_NAME_MAX];
+	/* Path to infiniband_verbs class device in sysfs */
+	char			dev_path[IBV_SYSFS_PATH_MAX];
+	/* Path to infiniband class device in sysfs */
+	char			ibdev_path[IBV_SYSFS_PATH_MAX];
+};
+
+struct verbs_device {
+	struct ibv_device device; /* Must be first */
+	size_t	sz;
+	size_t	size_of_context;
+	int	(*init_context)(struct verbs_device *device,
+				struct ibv_context *ctx, int cmd_fd);
+	void	(*uninit_context)(struct verbs_device *device,
+				struct ibv_context *ctx);
+	/* future fields added here */
+};
+
+struct ibv_context_ops {
+	int			(*query_device)(struct ibv_context *context,
+					      struct ibv_device_attr *device_attr);
+	int			(*query_port)(struct ibv_context *context, uint8_t port_num,
+					      struct ibv_port_attr *port_attr);
+	struct ibv_pd *		(*alloc_pd)(struct ibv_context *context);
+	int			(*dealloc_pd)(struct ibv_pd *pd);
+	struct ibv_mr *		(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length,
+					  int access);
+	struct ibv_mr *		(*rereg_mr)(struct ibv_mr *mr,
+					    int flags,
+					    struct ibv_pd *pd, void *addr,
+					    size_t length,
+					    int access);
+	int			(*dereg_mr)(struct ibv_mr *mr);
+	struct ibv_mw *		(*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type);
+	int			(*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw,
+					   struct ibv_mw_bind *mw_bind);
+	int			(*dealloc_mw)(struct ibv_mw *mw);
+	struct ibv_cq *		(*create_cq)(struct ibv_context *context, int cqe,
+					     struct ibv_comp_channel *channel,
+					     int comp_vector);
+	int			(*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc);
+	int			(*req_notify_cq)(struct ibv_cq *cq, int solicited_only);
+	void			(*cq_event)(struct ibv_cq *cq);
+	int			(*resize_cq)(struct ibv_cq *cq, int cqe);
+	int			(*destroy_cq)(struct ibv_cq *cq);
+	struct ibv_srq *	(*create_srq)(struct ibv_pd *pd,
+					      struct ibv_srq_init_attr *srq_init_attr);
+	int			(*modify_srq)(struct ibv_srq *srq,
+					      struct ibv_srq_attr *srq_attr,
+					      int srq_attr_mask);
+	int			(*query_srq)(struct ibv_srq *srq,
+					     struct ibv_srq_attr *srq_attr);
+	int			(*destroy_srq)(struct ibv_srq *srq);
+	int			(*post_srq_recv)(struct ibv_srq *srq,
+						 struct ibv_recv_wr *recv_wr,
+						 struct ibv_recv_wr **bad_recv_wr);
+	struct ibv_qp *		(*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr);
+	int			(*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+					    int attr_mask,
+					    struct ibv_qp_init_attr *init_attr);
+	int			(*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+					     int attr_mask);
+	int			(*destroy_qp)(struct ibv_qp *qp);
+	int			(*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr,
+					     struct ibv_send_wr **bad_wr);
+	int			(*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr,
+					     struct ibv_recv_wr **bad_wr);
+	struct ibv_ah *		(*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+	int			(*destroy_ah)(struct ibv_ah *ah);
+	int			(*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
+						uint16_t lid);
+	int			(*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid,
+						uint16_t lid);
+	void			(*async_event)(struct ibv_async_event *event);
+};
+
+struct ibv_context {
+	struct ibv_device      *device;
+	struct ibv_context_ops	ops;
+	int			cmd_fd;
+	int			async_fd;
+	int			num_comp_vectors;
+	pthread_mutex_t		mutex;
+	void		       *abi_compat;
+};
+
+/*
+ * Capability bits built from 64-bit shifts.
+ * VERBS_CONTEXT_EXP is bit 62 and does not fit in an int, so this enum needs
+ * an underlying type wider than int: that is well-defined in C++11, but an
+ * extension in C before C23.
+ * NOTE(review): presumably these are the bits stored in
+ * verbs_context::has_comp_mask -- confirm.
+ */
+enum verbs_context_mask {
+	VERBS_CONTEXT_XRCD         = (uint64_t)1 << 0,
+	VERBS_CONTEXT_SRQ          = (uint64_t)1 << 1,
+	VERBS_CONTEXT_QP           = (uint64_t)1 << 2,
+	VERBS_CONTEXT_RESERVED     = (uint64_t)1 << 3,
+	VERBS_CONTEXT_EXP	   = (uint64_t)1 << 62
+};
+
+/*
+ * Extended context: a table of optional operations placed in front of an
+ * embedded struct ibv_context.
+ *
+ * The embedded `context` is the last member and `sz` sits immediately before
+ * it; new operation pointers are added at the top ("grows up"), so existing
+ * members keep their distance from the end of the struct. verbs_set_ctx_op()
+ * relies on this by comparing `sz` with
+ * sizeof(struct verbs_context) - offsetof(struct verbs_context, op).
+ * Do not reorder members.
+ */
+struct verbs_context {
+	/*  "grows up" - new fields go here */
+	int (*_reserved_2) (void);
+	int (*destroy_flow) (struct ibv_flow *flow);
+	int (*_reserved_1) (void);
+	struct ibv_flow * (*create_flow) (struct ibv_qp *qp,
+					  struct ibv_flow_attr *flow_attr);
+	struct ibv_qp * (*open_qp)(struct ibv_context *context,
+			struct ibv_qp_open_attr *attr);
+	struct ibv_qp * (*create_qp_ex)(struct ibv_context *context,
+			struct ibv_qp_init_attr_ex *qp_init_attr_ex);
+	int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num);
+	struct ibv_srq * (*create_srq_ex)(struct ibv_context *context,
+			struct ibv_srq_init_attr_ex *srq_init_attr_ex);
+	struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context,
+			struct ibv_xrcd_init_attr *xrcd_init_attr);
+	int  (*close_xrcd)(struct ibv_xrcd *xrcd);
+	uint64_t has_comp_mask;
+	size_t   sz;	/* Must be immediately before struct ibv_context */
+	struct ibv_context context;/* Must be last field in the struct */
+};
+
+/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/
+/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx)
+{
+	return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ?
+		NULL : container_of(ctx, struct verbs_context, context);
+}
+
+#define verbs_get_ctx_op(ctx, op) ({ \
+	struct verbs_context *_vctx = verbs_get_ctx(ctx); \
+	(!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \
+	!_vctx->op) ? NULL : _vctx; })*/
+
+/*
+ * Assign `ptr` to member `op` of the verbs_context `_vctx`, but only when
+ * `_vctx` is non-NULL and its recorded size `sz` is at least
+ * sizeof(struct verbs_context) - offsetof(struct verbs_context, op),
+ * i.e. the context is large enough to contain that member.
+ * Written as a GNU statement expression `({ ... })`; `_vctx` is evaluated once.
+ */
+#define verbs_set_ctx_op(_vctx, op, ptr) ({ \
+	struct verbs_context *vctx = _vctx; \
+	if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \
+		vctx->op = ptr; })
+
+/*
+ * Map an ibv_device to the verbs_device that embeds it as member `device`.
+ * Returns NULL when dev->ops.alloc_context is set; otherwise returns the
+ * enclosing struct computed with container_of(). `dev` is dereferenced
+ * unconditionally, so it must not be NULL.
+ * NOTE(review): presumably a non-NULL alloc_context marks a device that is
+ * not wrapped in a verbs_device -- confirm against the provider code.
+ */
+static inline struct verbs_device *verbs_get_device(struct ibv_device *dev)
+{
+	return (dev->ops.alloc_context) ?
+		NULL : container_of(dev, struct verbs_device, device);
+}
+
+/* Forward a send work-request list to the post_send entry of the QP's context
+ * operation table and hand back its return value unchanged. */
+static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
+  int rc = qp->context->ops.post_send(qp, wr, bad_wr);
+  return rc;
+}
+
+#endif  // NCCL_IBV_CORE_H_
--- a/rccl/src/include/ibvsymbols.h
+++ b/rccl/src/include/ibvsymbols.h
+#ifndef NCCL_IBV_SYMBOLS_H_
+#define NCCL_IBV_SYMBOLS_H_
+
+#ifdef NCCL_BUILD_RDMA_CORE
+#include <infiniband/verbs.h>
+#else
+#include "ibvcore.h"
+#endif
+
+#include "nccl.h"
+
+/*
+ * Table of function pointers, one per IB verbs entry point used here.
+ * Each member is named ibv_internal_<name>. The table is populated by
+ * buildIbvSymbols(), declared below.
+ */
+struct ncclIbvSymbols {
+  int (*ibv_internal_fork_init)(void);
+  struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices);
+  void (*ibv_internal_free_device_list)(struct ibv_device **list);
+  const char * (*ibv_internal_get_device_name)(struct ibv_device *device);
+  struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device);
+  int (*ibv_internal_close_device)(struct ibv_context *context);
+  int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event);
+  void (*ibv_internal_ack_async_event)(struct ibv_async_event *event);
+  int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr);
+  int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
+  int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
+  int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
+  struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context);
+  int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd);
+  struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access);
+  struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access);
+  /* DMA-BUF support */
+  struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
+  int (*ibv_internal_dereg_mr)(struct ibv_mr *mr);
+  struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
+  int (*ibv_internal_destroy_cq)(struct ibv_cq *cq);
+  struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+  int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+  int (*ibv_internal_destroy_qp)(struct ibv_qp *qp);
+  const char * (*ibv_internal_event_type_str)(enum ibv_event_type event);
+};
+
+/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */
+ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols);
+
+#endif  // NCCL_IBV_SYMBOLS_H_
--- a/rccl/src/include/ibvwrap.h
+++ b/rccl/src/include/ibvwrap.h
+/*************************************************************************
+ * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2004, 2011-2012 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc.  All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc.  All rights reserved.
+ *
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_IBVWRAP_H_
+#define NCCL_IBVWRAP_H_
+
+#ifdef NCCL_BUILD_RDMA_CORE
+#include <infiniband/verbs.h>
+#else
+#include "ibvcore.h"
+#endif
+
+#include "core.h"
+#include <sys/types.h>
+#include <unistd.h>
+
+// Return code that the inline post_send/post_recv wrappers in this header
+// compare verbs results against; any other value is reported as a failure.
+typedef enum ibv_return_enum
+{
+    IBV_SUCCESS = 0,                   //!< The operation was successful
+} ibv_return_t;
+
+ncclResult_t wrap_ibv_symbols(void);
+/* NCCL wrappers of IB verbs functions */
+ncclResult_t wrap_ibv_fork_init(void);
+ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices);
+ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list);
+const char *wrap_ibv_get_device_name(struct ibv_device *device);
+ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device);
+ncclResult_t wrap_ibv_close_device(struct ibv_context *context);
+ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event);
+ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event);
+ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr);
+ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr);
+ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid);
+ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr);
+ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context);
+ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd);
+ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access);
+struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access);
+ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access);
+/* DMA-BUF support */
+ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
+struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access);
+ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr);
+ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context);
+ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel);
+ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector);
+ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq);
+/* Poll a completion queue through the context's poll_cq entry.
+ * On success stores the number of completions (possibly 0) in *num_done and
+ * returns ncclSuccess; a negative poll result is logged and mapped to
+ * ncclSystemError, leaving *num_done untouched. */
+static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) {
+  /* poll_cq yields the number of wcs (or 0) on success, a negative number otherwise */
+  const int polled = cq->context->ops.poll_cq(cq, num_entries, wc);
+  if (polled >= 0) {
+    *num_done = polled;
+    return ncclSuccess;
+  }
+  WARN("Call to ibv_poll_cq() returned %d", polled);
+  return ncclSystemError;
+}
+ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr);
+ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp);
+
+/* Post a list of send work requests through the context's post_send entry.
+ *   qp     - queue pair to post on
+ *   wr     - first work request of the list
+ *   bad_wr - out: set by the provider to the work request that failed
+ * Returns ncclSuccess when post_send returns 0 (IBV_SUCCESS); otherwise logs
+ * the errno-style code and returns ncclSystemError. */
+static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) {
+  int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/
+  if (ret != IBV_SUCCESS) {
+    /* Argument order matches the labels: "Bad WR" is the failing request reported
+     * through bad_wr, "First WR" is the head of the list that was submitted. */
+    WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), *bad_wr, wr);
+    return ncclSystemError;
+  }
+  return ncclSuccess;
+}
+
+/* Post a list of receive work requests through the context's post_recv entry.
+ * A zero result maps to ncclSuccess; any other value is treated as an
+ * errno-style code, logged, and mapped to ncclSystemError. */
+static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) {
+  /* post_recv returns 0 on success, or an errno value describing the failure */
+  const int status = qp->context->ops.post_recv(qp, wr, bad_wr);
+  if (status == IBV_SUCCESS) return ncclSuccess;
+  WARN("ibv_post_recv() failed with error %s", strerror(status));
+  return ncclSystemError;
+}
+
+ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event);
+
+#endif //End include guard
--- a/rccl/src/include/info.h
+++ b/rccl/src/include/info.h
+/*************************************************************************
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INFO_H_
+#define NCCL_INFO_H_
+
+#include "nccl.h"
+#include "devcomm.h"
+#include "collectives.h"
+#include "core.h"
+#include "utils.h"
+#include "strongstream.h"
+
+// Communication pattern identifier, stored in one byte (underlying type
+// uint8_t). Values are implicitly numbered from 0 in declaration order.
+// Held in ncclInfo::pattern.
+typedef enum : uint8_t {
+  ncclPatternRing,
+  ncclPatternRingTwice,
+  ncclPatternPipelineFrom,
+  ncclPatternPipelineTo,
+  ncclPatternTreeUp,
+  ncclPatternTreeDown,
+  ncclPatternTreeUpDown,
+  ncclPatternCollnetChain,
+  ncclPatternCollnetDirect,
+  ncclPatternNvls,
+  ncclPatternNvlsTree,
+  ncclPatternSend,
+  ncclPatternRecv
+} ncclPattern_t;
+
+// Used to pass NCCL call information between functions.
+// The first group of fields mirrors the arguments of the user-facing call;
+// the "Computed later" group is filled in afterwards (nBytes, and possibly
+// count/datatype, are set by ncclInfoSetDerived() below).
+struct ncclInfo {
+  ncclFunc_t coll;
+  const char* opName;
+  // NCCL Coll Args
+  const void* sendbuff;
+  void* recvbuff;
+  size_t count;
+  ncclDataType_t datatype;
+  ncclRedOp_t op;
+  int root; // peer for p2p operations
+  ncclComm_t comm;
+  cudaStream_t stream;
+  // Algorithm details
+  int chunkSteps;
+  int sliceSteps;
+  // Computed later
+  ncclDevRedOpFull opFull;
+  int algorithm;
+  int protocol;
+  ncclPattern_t pattern;
+  int nChannels;
+  int nThreads;
+  size_t nBytes; // count * ncclTypeSize(datatype); multiplied by nRanks for AllGather/ReduceScatter
+  int nstepsPerLoop;
+  int nchunksPerLoop;
+  int chunkSize;
+  int channelId;
+};
+
+// Fill in the size-related derived fields of `info`.
+//  - nBytes becomes count * element size.
+//  - AllGather, Broadcast and AllToAllPivot are re-expressed as byte
+//    transfers: count becomes the byte count and datatype becomes ncclInt8.
+//  - For AllGather and ReduceScatter, count is per rank, so nBytes is scaled
+//    by nRanks.
+// Always returns ncclSuccess.
+inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) {
+  const ncclFunc_t coll = info->coll;
+  const bool treatAsBytes =
+      (coll == ncclFuncAllGather) || (coll == ncclFuncBroadcast) || (coll == ncclFuncAllToAllPivot);
+  const bool countIsPerRank =
+      (coll == ncclFuncAllGather) || (coll == ncclFuncReduceScatter);
+
+  info->nBytes = info->count * ncclTypeSize(info->datatype);
+  if (treatAsBytes) {
+    info->count = info->nBytes;
+    info->datatype = ncclInt8;
+  }
+  if (countIsPerRank) {
+    info->nBytes *= nRanks; // count is per rank
+  }
+  return ncclSuccess;
+}
+
+// One queued collective operation. Nodes are chained through `next`, which is
+// the link member used by ncclTasks::collQueue.
+struct ncclTaskColl {
+  struct ncclTaskColl* next;
+  ncclFunc_t func;
+  void const* sendbuff;
+  void* recvbuff;
+  size_t count;
+  int root;
+  ncclDataType_t datatype;
+  ncclDevRedOpFull op;
+  int chunkSteps, sliceSteps;
+};
+// One queued point-to-point transfer. Nodes are chained through `next`, which
+// is the link member used by the per-peer send/recv queues in ncclTasks::Peer.
+struct ncclTaskP2p {
+  ncclTaskP2p *next;
+  void *buff;
+  size_t bytes;
+  // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track
+  // of where it left off.
+  int chunk;
+};
+
+struct ncclCudaStreamList {
+  struct ncclCudaStreamList *next;
+  cudaStream_t stream;
+};
+// Aggregate of pending work: a queue of collectives, per-peer queues of
+// point-to-point transfers, and bookkeeping about the user streams involved.
+struct ncclTasks {
+  // Per-peer p2p state: a send queue and a receive queue of ncclTaskP2p nodes,
+  // plus one "seen" flag for each direction.
+  struct Peer {
+    bool sendSeen, recvSeen;
+    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> sendQueue;
+    struct ncclIntruQueue<struct ncclTaskP2p, &ncclTaskP2p::next> recvQueue;
+  };
+  struct ncclIntruQueue<ncclTaskColl, &ncclTaskColl::next> collQueue;
+  size_t collBytesTotal;
+  struct Peer* peers/*[nRanks]*/;
+  int *p2pSendOrder, *p2pRecvOrder;
+  int p2pOrderSteps;
+  int nTasksColl, nTasksP2p;
+
+  // The list of user streams aggregated over all tasks present.
+  struct ncclCudaStreamList* streams;
+  // Keep track of the number of user streams
+  int numStreams;
+  // The most recent user stream. Ignored if streams==nullptr
+  cudaStream_t streamRecent;
+  // The graph capturing all user streams or invalid if none. Thus we restrict the
+  // user that all streams must be captured in the same graph or not captured
+  // at all. Technically we could probably relax this, but that would mean
+  // collecting a different `ncclTasks` per graph and one for non-graph.
+  struct ncclCudaGraph capturingGraph;
+};
+
+#endif
--- a/rccl/src/include/ipcsocket.h
+++ b/rccl/src/include/ipcsocket.h
+/*
+ * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See COPYRIGHT for license information
+ */
+
+#ifndef NCCL_IPCSOCKET_H
+#define NCCL_IPCSOCKET_H
+
+#include "nccl.h"
+#include <stdio.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <memory.h>
+#include <sys/un.h>
+#include <inttypes.h>
+
+#define NCCL_IPC_SOCKNAME_LEN 64
+
+// Handle used by the ncclIpcSocket* functions declared below: a file
+// descriptor, a name buffer of NCCL_IPC_SOCKNAME_LEN bytes, and a pointer to a
+// volatile abort flag supplied at init time.
+// NOTE(review): presumably the flag is polled to abort blocking calls -- confirm
+// in the implementation.
+struct ncclIpcSocket {
+  int fd;
+  char socketName[NCCL_IPC_SOCKNAME_LEN];
+  volatile uint32_t* abortFlag;
+};
+
+ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag);
+ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
+
+ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
+ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash);
+
+#endif /* NCCL_IPCSOCKET_H */
--- a/rccl/src/include/msccl/msccl_kernel.h
+++ b/rccl/src/include/msccl/msccl_kernel.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef MSCCL_KERNEL_H_
+#define MSCCL_KERNEL_H_
+
+// Builds the kernel symbol name mscclKernel_<devredop>_<type>_<proto>_<fullOps>
+// by token pasting.
+#define MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps) mscclKernel_##devredop##_##type##_##proto##_##fullOps
+
+// Declares one __global__ kernel entry point for a given
+// (reduction op, element type, protocol, fullOps) combination.
+#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, proto, fullOps) \
+__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, proto, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work);
+
+// Expands the declaration above once per protocol: LL, LL128 and Simple.
+#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, LL128, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE_PROTO(devredop, type, Simple, fullOps)
+
+// Expands over all ten element types (integer and floating point).
+#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, rccl_bfloat16, fullOps)
+
+// Same as above but restricted to the six integer element types.
+#define MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(devredop, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t, fullOps) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t, fullOps)
+
+// Declares every kernel entry point: Sum, Prod, Min, Max and PreMulSum for all
+// types, SumPostDiv for integer types only; fullOps is always `false` here.
+// That is 5*10*3 + 1*6*3 = 168 declarations.
+#define MSCCL_DECL_KERNEL_ENTRY_FUNC() \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum, false) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod, false) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(Max, false) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum, false) \
+  MSCCL_DECL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv, false)
+
+MSCCL_DECL_KERNEL_ENTRY_FUNC()
+
+#endif
--- a/rccl/src/include/msccl/msccl_lifecycle.h
+++ b/rccl/src/include/msccl/msccl_lifecycle.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef MSCCL_LIFECYCLE_H_
+#define MSCCL_LIFECYCLE_H_
+
+#include "enqueue.h"
+
+#include "msccl/msccl_struct.h"
+
+bool mscclEnabled();
+
+void mscclSetIsCallerFlag();
+void mscclClearIsCallerFlag();
+bool mscclIsCaller();
+
+bool mscclAvailable();
+
+ncclResult_t mscclInit(ncclComm_t comm);
+
+ncclResult_t mscclGroupStart();
+
+ncclResult_t mscclEnqueueCheck(
+    const void* sendbuff, const size_t sendcounts[], const size_t sdispls[],
+    void* recvbuff, const size_t recvcounts[], const size_t rdispls[],
+    size_t count, ncclDataType_t datatype, int root, int peer, ncclRedOp_t op,
+    mscclFunc_t mscclFunc, ncclComm_t comm, hipStream_t stream);
+
+ncclResult_t mscclGroupEnd();
+
+ncclResult_t mscclTeardown();
+
+size_t mscclKernMaxLocalSize();
+
+#endif
--- a/rccl/src/include/msccl/msccl_parser.h
+++ b/rccl/src/include/msccl/msccl_parser.h
+/*************************************************************************
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef MSCCL_PARSER_H_
+#define MSCCL_PARSER_H_
+
+#include "nccl.h"
+#include "debug.h"
+#include "checks.h"
+#include <stdlib.h>
+
+#include "msccl/msccl_struct.h"
+
+// A few constraints to make the implementation easy
+#define MAX_STR_LEN 255
+#define MAX_ATTR_COUNT 16
+#define MAX_SUBS 1024
+#define MAX_NODES 4096
+
+#define NODE_TYPE_NONE 0
+#define NODE_TYPE_OPEN 1
+#define NODE_TYPE_CLOSE 2
+#define NODE_TYPE_SINGLE 3
+
+// One XML element with fixed-capacity storage: a name, up to MAX_ATTR_COUNT
+// key/value attributes (each string at most MAX_STR_LEN characters plus the
+// terminator), a NODE_TYPE_* tag, a parent link and up to MAX_SUBS children.
+struct mscclXmlNode {
+  char name[MAX_STR_LEN+1];
+  struct {
+    char key[MAX_STR_LEN+1];
+    char value[MAX_STR_LEN+1];
+  } attrs[MAX_ATTR_COUNT+1]; // Need an extra one to consume extra params
+  int nAttrs; // number of valid entries in attrs; bounds the lookup in mscclXmlGetAttrIndex()
+  int type;
+  struct mscclXmlNode* parent;
+  struct mscclXmlNode* subs[MAX_SUBS];
+  int nSubs;
+};
+
+// Fixed pool of MAX_NODES XML nodes. mscclXmlFindTag() scans entries
+// [0, maxIndex), so maxIndex acts as the count of nodes in use.
+// Note: each node embeds large arrays, so this struct is very large.
+struct mscclXml {
+  struct mscclXmlNode nodes[MAX_NODES];
+  int maxIndex;
+};
+
+// Look up attribute `attrName` on `node`.
+// Stores the position of the first attribute whose key matches (compared over
+// at most MAX_STR_LEN characters) in *index, or -1 when none matches.
+// Always returns ncclSuccess; "not found" is signalled only through *index.
+static ncclResult_t mscclXmlGetAttrIndex(struct mscclXmlNode* node, const char* attrName, int* index) {
+  *index = -1;
+  const int count = node->nAttrs;
+  int pos = 0;
+  while (pos < count && strncmp(node->attrs[pos].key, attrName, MAX_STR_LEN) != 0) {
+    pos++;
+  }
+  if (pos < count) {
+    *index = pos;
+  }
+  return ncclSuccess;
+}
+
+// Fetch the value string of attribute `attrName` on `node`.
+// *value points into the node's own storage, or is NULL when the attribute
+// does not exist. A missing attribute is not an error here.
+static ncclResult_t mscclXmlGetAttr(struct mscclXmlNode* node, const char* attrName, const char** value) {
+  int attrIdx;
+  NCCLCHECK(mscclXmlGetAttrIndex(node, attrName, &attrIdx));
+  if (attrIdx == -1) {
+    *value = NULL;
+  } else {
+    *value = node->attrs[attrIdx].value;
+  }
+  return ncclSuccess;
+}
+
+// Like mscclXmlGetAttr(), but the attribute is mandatory: when it is missing a
+// warning is logged and ncclInternalError is returned.
+static ncclResult_t mscclXmlGetAttrStr(struct mscclXmlNode* node, const char* attrName, const char** value) {
+  NCCLCHECK(mscclXmlGetAttr(node, attrName, value));
+  if (*value != NULL) {
+    return ncclSuccess;
+  }
+  WARN("Attribute %s of node %s not found", attrName, node->name);
+  return ncclInternalError;
+}
+// Read a mandatory attribute and parse it as an integer with strtol() using
+// base 0 (so decimal, 0x-prefixed hex and 0-prefixed octal are accepted).
+// The long result is narrowed to int on assignment; malformed text is not
+// detected. Fails only when the attribute is missing.
+static ncclResult_t mscclXmlGetAttrInt(struct mscclXmlNode* node, const char* attrName, int* value) {
+  const char* text;
+  NCCLCHECK(mscclXmlGetAttrStr(node, attrName, &text));
+  const long parsed = strtol(text, NULL, 0);
+  *value = parsed;
+  return ncclSuccess;
+}
+
+// Read a mandatory attribute and parse it as a 64-bit integer with strtoll()
+// using base 0. Malformed text is not detected; fails only when the attribute
+// is missing.
+static ncclResult_t mscclXmlGetAttrInt64(struct mscclXmlNode* node, const char* attrName, int64_t* value) {
+  const char* text;
+  NCCLCHECK(mscclXmlGetAttrStr(node, attrName, &text));
+  const long long parsed = strtoll(text, NULL, 0);
+  *value = parsed;
+  return ncclSuccess;
+}
+
+// Find the first node among xml->nodes[0 .. maxIndex) whose name equals
+// `tagName`. *node receives that node, or NULL when there is no match.
+// Always returns ncclSuccess.
+static ncclResult_t mscclXmlFindTag(struct mscclXml* xml, const char* tagName, struct mscclXmlNode** node) {
+  *node = NULL;
+  int i = 0;
+  while (i < xml->maxIndex) {
+    struct mscclXmlNode* candidate = &xml->nodes[i];
+    if (strcmp(candidate->name, tagName) == 0) {
+      *node = candidate;
+      break;
+    }
+    i++;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t mscclGetAlgoFromXmlFile(const char* xmlGraphFile, struct mscclAlgo* algo, int rank);
+
+ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* xmlGraphFile, struct mscclAlgoMeta* algoMeta);
+
+#endif
--- a/rccl/src/include/msccl/msccl_scheduler.h
+++ b/rccl/src/include/msccl/msccl_scheduler.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef MSCCL_SCHEDULER_H_
+#define MSCCL_SCHEDULER_H_
+
+// Identifiers of the operations MSCCL distinguishes, numbered 0..10.
+// mscclNumFuncs (11) is the count of real entries, not an operation itself.
+typedef enum { mscclFuncReduce             =  0,
+               mscclFuncBroadcast          =  1,
+               mscclFuncAllReduce          =  2,
+               mscclFuncReduceScatter      =  3,
+               mscclFuncAllGather          =  4,
+               mscclFuncSend               =  5,
+               mscclFuncRecv               =  6,
+               mscclFuncGather             =  7,
+               mscclFuncScatter            =  8,
+               mscclFuncAllToAll           =  9,
+               mscclFuncAllToAllv          =  10,
+               mscclNumFuncs               =  11 } mscclFunc_t;
+
+// Argument bundle passed to mscclSchedulerInterface::selectAlgo().
+// The buffer/count/displacement/type/root/peer/op/func fields parallel the
+// parameters of mscclEnqueueCheck().
+// NOTE(review): `scheduled` and `handle` look like outputs filled in by the
+// scheduler -- confirm against the scheduler implementation.
+struct mscclSchedulerParam {
+  const void* sendBuff;
+  const size_t* sendCounts;
+  const size_t* sDisPls;
+  void* recvBuff;
+  const size_t* recvCounts;
+  const size_t* rDisPls;
+  size_t count;
+  ncclDataType_t dataType;
+  int root;
+  int peer;
+  ncclRedOp_t op;
+  mscclFunc_t func;
+  int rank;
+  int nRanks;
+  bool scheduled;
+  mscclAlgoHandle_t handle;
+  uint64_t opCount;
+};
+
+// Scheduler plug-in interface: a name plus three callbacks, each returning an
+// ncclResult_t.
+typedef struct {
+  // Name of the scheduler (mainly for logs)
+  const char* name;
+  // Load all algorithms
+  ncclResult_t (*init)();
+  // Select an algorithm
+  ncclResult_t (*selectAlgo)(struct mscclSchedulerParam* param);
+  // Unload all algorithms
+  ncclResult_t (*teardown)();
+} mscclSchedulerInterface;
+
+#endif
--- a/rccl/src/include/msccl/msccl_setup.h
+++ b/rccl/src/include/msccl/msccl_setup.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef MSCCL_SETUP_H_
+#define MSCCL_SETUP_H_
+
+#include <hip/hip_runtime.h>
+
+#include "comm.h"
+#include "msccl/msccl_struct.h"
+
+ncclResult_t mscclGetCaptureStatus(hipStream_t stream);
+
+ncclResult_t mscclSetupScratch(struct mscclAlgo* hostAlgo, hipStream_t stream);
+
+ncclResult_t mscclSetupSyncFlags(hipStream_t stream);
+
+ncclResult_t mscclSetupConnections(struct mscclAlgo* hostAlgo, ncclComm_t comm);
+
+ncclResult_t mscclSetupCount(struct mscclAlgo* hostAlgo, ncclComm_t comm, size_t count, ncclDataType_t dataType);
+
+ncclResult_t mscclSetupProxy(struct mscclAlgo* hostAlgo, ncclComm_t comm, hipStream_t stream);
+
+ncclResult_t mscclSetupKernel(const void* sendBuff, void* recvBuff, size_t count,
+    ncclDataType_t dataType, ncclRedOp_t op, struct mscclAlgo* hostAlgo, struct mscclAlgo* devAlgo,
+    ncclComm_t comm, hipStream_t stream);
+
+ncclResult_t mscclInitWorkFifoStatus(mscclWorkFifoStatus* workFifoStatus);
+
+ncclResult_t mscclDestroyWorkFifoStatus(mscclWorkFifoStatus* workFifoStatus);
+
+#endif
--- a/rccl/src/include/msccl/msccl_status.h
+++ b/rccl/src/include/msccl/msccl_status.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef MSCCL_STATUS_H_
+#define MSCCL_STATUS_H_
+
+#include "msccl/msccl_struct.h"
+
+mscclStatus& mscclGetStatus();
+
+mscclSavedProxyArgs& mscclGetSavedProxyArgs();
+
+mscclThreadLocalStatus& mscclGetThreadLocalStatus();
+
+#endif
--- a/rccl/src/include/msccl/msccl_struct.h
+++ b/rccl/src/include/msccl/msccl_struct.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef MSCCL_STRUCT_H_
+#define MSCCL_STRUCT_H_
+
+#include <cstdint>
+#include <map>
+#include <set>
+#include <vector>
+#include "devcomm.h"
+#include "msccl/msccl_scheduler.h"
+
+#define MSCCL_MAX_NUM_STEPS 64
+#define MSCCL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL 32
+#define MSCCL_MAX_NUM_THREAD_BLOCKS 64
+#define MSCCL_MAX_COUNT 72 // max concurrent number of msccl chunk transmission
+#define MSCCL_MAX_REDUCE_FUSION 16
+#define MSCCL_MAX_NUM_ALGOS 1024
+
+#define MSCCL_SLICESTEPS (NCCL_STEPS/4)
+#define MSCCL_CHUNKSTEPS (NCCL_STEPS/2)
+
+#define MSCCL_INPUT_BUFFER 0
+#define MSCCL_OUTPUT_BUFFER 1
+#define MSCCL_SCRATCH_BUFFER 2
+
+#define MSCCL_SEND 0
+#define MSCCL_RECV 1
+#define MSCCL_RECV_COPY_SEND 2
+#define MSCCL_RECV_REDUCE_SEND 3
+#define MSCCL_RECV_REDUCE_COPY 4
+#define MSCCL_RECV_REDUCE_COPY_SEND 5
+#define MSCCL_LOCAL_COPY 6
+#define MSCCL_REDUCE 7
+
+// One step of a thread block's program, packed into 16 bytes:
+// six int16_t fields (12 bytes), two 4-bit buffer selectors sharing one byte,
+// and three further one-byte fields.
+struct mscclTransmission {
+  int16_t dependencePointer; // index to the first dependence
+  int16_t numDependencies; // dependencePointer+numDependencies indicate the last dependence
+  int16_t reductionPointer; // where the reduction starts
+  int16_t numReductions; // number of reductions with the same dst
+  int16_t srcOffset;
+  int16_t dstOffset;
+  uint8_t srcBuffer : 4; // input/output/scratch
+  uint8_t dstBuffer : 4; // input/output/scratch
+  int8_t hasDependence;
+  uint8_t type;
+  uint8_t count; // must be able to hold MSCCL_MAX_COUNT (checked by the static_assert below)
+}; // 16 bytes
+
+static_assert((1ULL << (8*sizeof(mscclTransmission::count))) - 1 > MSCCL_MAX_COUNT, "MSCCL_MAX_COUNT must representable by datatype of count");
+
+// Program of a single thread block: per-step transmissions plus the
+// dependency and reduction tables they index into.
+// Byte sizes below are for MSCCL_MAX_NUM_STEPS == 64 (its value in this file).
+struct mscclThreadBlock {
+  // step is used to index into these arrays
+  struct mscclTransmission transmissions[MSCCL_MAX_NUM_STEPS]; // 64 * 16 B = 1024 bytes
+  int8_t dependentBid[MSCCL_MAX_NUM_STEPS]; // -1 if not dependent on any thread block, 64 bytes
+  int16_t dependentStep[MSCCL_MAX_NUM_STEPS]; // 128 bytes
+  int16_t reductionSrcOffsets[MSCCL_MAX_NUM_STEPS]; // 128 bytes
+  int16_t sendPeer;
+  int16_t recvPeer;
+  uint16_t nSteps;
+  int16_t channelId; // associated channel. -1 indicates a thread block with only local copies
+}; // 1352 bytes (1024 + 64 + 128 + 128 + 8)
+
+static_assert(sizeof(struct mscclThreadBlock) % sizeof(uint64_t) == 0, "Sanity check: sizeof(struct mscclThreadBlock) % sizeof(uint64_t) != 0");
+
+// A 64-bit flag padded with three more 64-bit words, so each flag occupies
+// 32 bytes of its own.
+struct mscclFlag {
+  uint64_t flag;
+  uint64_t align[3]; // to avoid false sharing
+};
+
+// Per-peer transmission statistics for one channel. Both arrays have
+// MSCCL_MAX_COUNT + 1 entries so that they can be indexed by any count in
+// 0..MSCCL_MAX_COUNT.
+struct mscclChannelPeerInfo {
+  int peer;
+  // nTransmissionsOfCount[i]: number of transmissions with count i (in terms of msccl chunks)
+  int nTransmissionsOfCount[MSCCL_MAX_COUNT + 1];
+  int existingCounts[MSCCL_MAX_COUNT + 1];
+  int nExistingCounts;
+};
+
+// Send-side and receive-side peer tables of one channel, each holding up to
+// MSCCL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL entries with its own element count.
+struct mscclChannelInfo {
+  struct mscclChannelPeerInfo sendPeerInfo[MSCCL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL];
+  int nSendPeers;
+  struct mscclChannelPeerInfo recvPeerInfo[MSCCL_MAX_NUM_THREAD_BLOCKS_PER_CHANNEL];
+  int nRecvPeers;
+};
+
+// Metadata describing one algorithm file; filled in by
+// mscclGetAlgoMetaFromXmlFile(), declared in msccl_parser.h.
+struct mscclAlgoMeta {
+  // Path to algorithm file
+  std::string filePath;
+  // number of chunks of input/output in each MSCCL algorithm loop
+  int nChunksPerLoop;
+  // number of ranks required by this algorithm
+  int nRanks;
+  // size multiplier: the size must be multiplied by nRanks for all-gather,
+  // reduce-scatter and all-to-all
+  int sizeMultiplier;
+  // MSCCL function type
+  mscclFunc_t func;
+  // Min message size allowed for this algorithm.
+  int64_t minBytes;
+  // Max message size allowed for this algorithm, 0 for no limit.
+  int64_t maxBytes;
+  // Whether this algorithm is suitable for in-place.
+  bool inPlace;
+  // Whether this algorithm is suitable for out-of-place.
+  bool outOfPlace;
+};
+
+// Full description of one MSCCL algorithm: scalar parameters, the program of
+// every thread block and the per-channel peer information.
+// Filled in by mscclGetAlgoFromXmlFile(), declared in msccl_parser.h.
+struct mscclAlgo {
+  // number of chunks of input/output in each MSCCL algorithm loop
+  int nChunksPerLoop;
+  // the protocol that the algorithm needs to use
+  int protocol;
+  // number of channels needed by MSCCL algorithm
+  int nChannels;
+  // number of ranks required by this algorithm
+  int nRanks;
+  // number of necessary thread blocks
+  int nBlocks;
+  // number of scratch chunks that MSCCL will use
+  int nScratchChunks;
+  // size multiplier: the size must be multiplied by nRanks for all-gather,
+  // reduce-scatter and all-to-all
+  int sizeMultiplier;
+  // number of steps per chunk for this algorithm
+  int chunkSteps;
+  // number of steps per slice for this algorithm
+  int sliceSteps;
+  // bid is used as an index into this array
+  struct mscclThreadBlock mscclTBs[MSCCL_MAX_NUM_THREAD_BLOCKS];
+  // used to calculate proxy info
+  struct mscclChannelInfo mscclChannels[MAXCHANNELS];
+  // Whether the algorithm requires reduce operation
+  bool hasReduce;
+  // MSCCL function type
+  mscclFunc_t func;
+  // Min message size allowed for this algorithm.
+  int64_t minBytes;
+  // Max message size allowed for this algorithm, 0 for no limit.
+  int64_t maxBytes;
+  // Whether this algorithm is suitable for in-place.
+  bool inPlace;
+  // Whether this algorithm is suitable for out-of-place.
+  bool outOfPlace;
+  // Keep a bit mask of used types (max 8 at present, one per bit of uint8_t)
+  uint8_t typeMask;
+};
+
+enum mscclGroupStatus {
+  mscclNoGroup,
+  mscclGroupSupportedOp,
+  mscclGroupUnsupportedOp
+};
+
+struct mscclSavedSchedulerParam {
+  struct mscclSchedulerParam p;
+  std::vector<size_t> savedSendCounts;
+  std::vector<size_t> savedSDisPls;
+  std::vector<size_t> savedRecvCounts;
+  std::vector<size_t> savedRDisPls;
+  ncclComm_t comm;
+  hipStream_t stream;
+};
+
+enum mscclCaptureStatus {
+  mscclNoCapture,
+  mscclNewCapture,
+  mscclExistingCapture
+};
+
+// Pair of (host-side algorithm, communicator). Instances are stored in the
+// vectors of mscclSavedProxyArgs, keyed by an unsigned long long id.
+struct mscclProxyArg {
+  struct mscclAlgo* hostAlgo;
+  ncclComm_t comm;
+  // Stores both pointers as given; no ownership is taken by this struct.
+  mscclProxyArg(struct mscclAlgo* hostAlgo, ncclComm_t comm) 
+    : hostAlgo(hostAlgo), comm(comm) {}
+};
+
+typedef std::map<unsigned long long, std::vector<struct mscclProxyArg>> mscclSavedProxyArgs;
+
+struct mscclThreadLocalStatus {
+  bool mscclIsCallerFlag;
+  mscclGroupStatus groupStatus;
+  int groupDepth;
+  std::vector<struct mscclSavedSchedulerParam> savedSchedulerParams;
+  unsigned long long captureId;
+  mscclCaptureStatus captureStatus;
+  hipGraph_t graph;
+};
+
+// Bookkeeping for one work FIFO of mscclWork entries: its depth, the FIFO and
+// "done" arrays, and sent/acknowledged counters, including one sent counter per
+// thread block (MSCCL_MAX_NUM_THREAD_BLOCKS entries).
+// Set up and torn down by mscclInitWorkFifoStatus() /
+// mscclDestroyWorkFifoStatus(), declared in msccl_setup.h.
+struct mscclWorkFifoStatus {
+  uint64_t workFifoDepth;
+  struct mscclWork* workFifo;
+  uint32_t* workFifoDone;
+  uint32_t workFifoSent;
+  uint32_t workFifoSentPerThreadBlock[MSCCL_MAX_NUM_THREAD_BLOCKS];
+  uint32_t workFifoAckdMin;
+};
+
+typedef std::map<unsigned long long, mscclWorkFifoStatus> mscclSavedGraphWorkFifoStatus;
+
+struct mscclStatus {
+  std::vector<mscclAlgoHandle_t> freeAlgoHandles;
+  std::map<mscclAlgoHandle_t, mscclAlgo *> hostAlgos;
+  std::map<mscclAlgoHandle_t, mscclAlgo *> devAlgos;
+  struct mscclFlag* syncFlags;
+  void *scratchBuffer;
+  uint64_t scratchBufferSize;
+  size_t nBytes;
+  int stepSize;
+  int chunkSteps;
+  int sliceSteps;
+  int chunkSize;
+  int chunkEffectiveSize;
+  uint32_t workIndex;
+  uint32_t maxAllowedCount;
+  ncclDataType_t dataType;
+  std::map<ncclComm_t, std::set<mscclAlgoHandle_t>> connectedAlgos;
+  hipStream_t lastStream;
+  void* mscclSchedulerLib;
+  mscclSchedulerInterface* mscclSchedulerPtr;
+  std::vector<mscclAlgoMeta> algoMetas;
+  std::vector<std::map<int, mscclAlgoHandle_t>> rankToAlgoHandles;
+  bool graphEnabled;
+  bool graphFirstKernel;
+  bool needsProxy;
+  mscclWorkFifoStatus defaultWorkFifoStatus;
+  mscclSavedGraphWorkFifoStatus graphWorkFifoStatus;
+};
+
+#pragma pack(push)
+#pragma pack(8)
+
+// Per-launch work descriptor passed to the MSCCL kernels (third kernel
+// parameter in msccl_kernel.h). Declared under #pragma pack(8); the
+// static_assert that follows requires sizeof(mscclWork) to be a multiple of
+// 16, so adding or reordering fields must preserve that.
+struct mscclWork {
+  volatile struct mscclFlag *syncFlags;
+  void *scratchBuffer;
+  const void *sendBuff;
+  void *recvBuff;
+  uint32_t* workFifoDone;
+  size_t sizePerMscclChunk;
+  uint64_t redOpArg;
+  uint32_t workIndex;
+  uint32_t maxAllowedCount;
+  uint32_t workFifoDoneAck;
+  int nChunksPerLoop;
+  bool hasReduce;
+  bool redOpArgIsPtr;
+  uint32_t fnIndex;
+};
+static_assert(sizeof(struct mscclWork) % 16 == 0, "mscclWork needs to be 16B aligned");
+
+#pragma pack(pop)
+
+// A thread block's program together with its work descriptor. `work` is
+// forced to a 16-byte boundary with alignas(16); the static_assert that
+// follows verifies its offset.
+// NOTE(review): the name suggests this lives in GPU shared memory -- confirm
+// at the kernel definition.
+struct mscclShmemData {
+  struct mscclThreadBlock mscclTB;
+  alignas(16) struct mscclWork work;
+};
+static_assert(offsetof(struct mscclShmemData, work) % 16 == 0, "mscclShmemData.work needs to be 16B aligned");
+
+#endif
--- a/rccl/src/include/nccl_net.h
+++ b/rccl/src/include/nccl_net.h
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_H_
+#define NCCL_NET_H_
+
+#include "nccl.h"
+#include <stdint.h>
+
+// Upper bound, in bytes, on the connection handle produced by listen().
+#define NCCL_NET_HANDLE_MAXSIZE 128
+
+// Memory-type flags. They are OR-ed together in the ptrSupport property and
+// passed as the `type` argument of regMr()/regMrDmaBuf().
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+#define NCCL_PTR_DMABUF 0x4
+
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 8
+
+// Severity levels; this is the type of the `level` argument of ncclDebugLogger_t.
+typedef enum {
+  NCCL_LOG_NONE    = 0,
+  NCCL_LOG_VERSION = 1,
+  NCCL_LOG_WARN    = 2,
+  NCCL_LOG_INFO    = 3,
+  NCCL_LOG_ABORT   = 4,
+  NCCL_LOG_TRACE   = 5
+} ncclDebugLogLevel;
+// Subsystem selectors. Each named subsystem owns one bit; NCCL_ALL sets every bit.
+// NOTE(review): presumably these are combined into the `flags` argument of
+// ncclDebugLogger_t -- confirm against the logger implementation.
+typedef enum {
+  NCCL_INIT   = 0x001,
+  NCCL_COLL   = 0x002,
+  NCCL_P2P    = 0x004,
+  NCCL_SHM    = 0x008,
+  NCCL_NET    = 0x010,
+  NCCL_GRAPH  = 0x020,
+  NCCL_TUNING = 0x040,
+  NCCL_ENV    = 0x080,
+  NCCL_ALLOC  = 0x100,
+  NCCL_CALL   = 0x200,
+  NCCL_PROXY  = 0x400,
+  NCCL_NVLS   = 0x800,
+  NCCL_ALL    = ~0
+} ncclDebugLogSubSys;
+
+// Logging callback type; the init() entry of every net/collnet table below
+// receives one. Arguments: severity level, a flags word, the source file and
+// line, then a format string followed by its variadic arguments.
+typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
+
+// Device properties filled in by getProperties(), version 6 layout.
+// Compared with ncclNetProperties_v4_t it adds `latency` and `maxRecvs`.
+typedef struct {
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  float latency;  // Network latency (unit not stated in this header).
+  int maxComms;   // Maximum number of comms we can create
+  int maxRecvs;   // Maximum number of grouped receives.
+}ncclNetProperties_v6_t;
+
+// Unversioned alias; currently the v6 layout.
+typedef ncclNetProperties_v6_t ncclNetProperties_t;
+
+// Point-to-point network interface, version 6: a table of function pointers,
+// each returning ncclResult_t. It differs from ncclNet_v5_t only by the added
+// regMrDmaBuf entry.
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: same role as regMr, with an extra offset and file descriptor and a size_t size. */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. Takes n buffers with parallel sizes/tags/mhandles arrays.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v6_t;
+
+// Unversioned alias; currently the v6 table.
+typedef ncclNet_v6_t ncclNet_t;
+
+// Identifier paired with the v6 net table.
+// NOTE(review): presumably the symbol a plugin library exports -- confirm against the loader.
+#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6
+
+// Collective network interface, version 6: a table of function pointers, each
+// returning ncclResult_t. It differs from ncclCollNet_v5_t only by the added
+// regMrDmaBuf entry.
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  /* DMA-BUF support: same role as regMr, with an extra offset and file descriptor and a size_t size. */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v6_t;
+
+// Unversioned alias; currently the v6 table.
+typedef ncclCollNet_v6_t ncclCollNet_t;
+
+// Identifier paired with the v6 collnet table.
+// NOTE(review): presumably the symbol a plugin library exports -- confirm against the loader.
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6
+
+// v5 struct for backwards compatibility.
+// Same entries as ncclNet_v6_t minus regMrDmaBuf; getProperties still takes the
+// v6 properties struct.
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer. Takes n buffers with parallel sizes/tags/mhandles arrays.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v5_t;
+
+// v5 struct for backwards compatibility.
+// Same entries as ncclCollNet_v6_t minus regMrDmaBuf; getProperties still takes
+// the v6 properties struct.
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v5_t;
+
+// v4 struct for backwards compatibility.
+// Same members as ncclNetProperties_v6_t except that `latency` and `maxRecvs`
+// are absent.
+typedef struct {
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  int maxComms;   // Maximum number of comms we can create
+} ncclNetProperties_v4_t;
+
+// v4 struct for backwards compatibility.
+// Differences from ncclNet_v5_t visible here: getProperties takes the v4
+// properties struct, isend has no tag argument, and irecv/iflush/test operate
+// on a single buffer and size instead of arrays.
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v4_t;
+
+// v4 struct for backwards compatibility.
+// Identical to ncclCollNet_v5_t except that getProperties takes the v4
+// properties struct.
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v4_t;
+
+#endif // end include guard
--- a/rccl/src/include/net.h
+++ b/rccl/src/include/net.h
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INT_NET_H_
+#define NCCL_INT_NET_H_
+
+#include "nccl.h"
+#include "nccl_net.h"
+#include "comm.h"
+#include "checks.h"
+
+// Fixed-size byte buffer able to hold one network connection handle
+// (NCCL_NET_HANDLE_MAXSIZE bytes).
+typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
+
+// Network layer entry points; declared here, defined elsewhere.
+ncclResult_t ncclNetPluginInit();
+ncclResult_t ncclNetInit(struct ncclComm* comm);
+int ncclNetVersion(struct ncclComm* comm);
+
+// Test whether the current GPU supports GPU Direct RDMA.
+// NOTE(review): presumably the answer is reported through gdrSupport -- confirm in the definition.
+ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
+
+// Network tables defined in other translation units.
+extern ncclNet_t ncclNetIb;
+extern ncclNet_t ncclNetSocket;
+
+#endif
--- a/rccl/src/include/npkit/npkit.h
+++ b/rccl/src/include/npkit/npkit.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef NPKIT_H_
+#define NPKIT_H_
+
+#include <string>
+#include <thread>
+
+#include <hip/hip_runtime.h>
+
+#include "npkit/npkit_event.h"
+#include "npkit/npkit_struct.h"
+#include "common.h"
+
+#include <time.h>
+#include <sys/time.h>
+
+// Name of the function used to obtain a GPU timestamp for NPKit events.
+#define NPKIT_GET_GPU_TIMESTAMP wall_clock64
+// Length of the per-rank static arrays declared in class NpKit.
+#define RANK_NUM     16
+// Basis for NpKit::kNumGpuEventBuffers and NpKit::kNumCpuEventBuffers.
+#define CHANNEL_NUM  32
+// Not referenced elsewhere in this header.
+#define HOST_SUBMIT_CHANNEL_BUF  32
+
+// Static-only interface to the NPKit event buffers. The two device-side
+// collectors are defined inline here; every other member is defined out of line.
+class NpKit {
+ public:
+  // GPU event buffer count (equal to CHANNEL_NUM).
+  static const uint64_t kNumGpuEventBuffers = CHANNEL_NUM;
+
+  // CPU event buffer count (CHANNEL_NUM plus one).
+  static const uint64_t kNumCpuEventBuffers = CHANNEL_NUM + 1;
+
+  // Lifecycle entry points, all taking the rank they act on.
+  static ncclResult_t Init(int rank);
+
+  static ncclResult_t Dump(const std::string& dump_dir, int rank);
+
+  static ncclResult_t Shutdown(int rank);
+
+  static NpKitEventCollectContext* GetGpuEventCollectContexts(int rank);
+
+  // Appends one event to ctx's buffer. The event is dropped without any
+  // report once the head index has reached kMaxNumGpuEventsPerBuffer.
+  // Negative sizes are recorded as 0.
+  static inline __device__ void CollectGpuEvent(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp,
+                                                NpKitEventCollectContext* ctx) {
+    const uint64_t slot = ctx->event_buffer_head;
+    if (slot >= kMaxNumGpuEventsPerBuffer) {
+      return;
+    }
+    NpKitEvent& entry = ctx->event_buffer[slot];
+    entry.fields.type = type;
+    entry.fields.size = size < 0 ? 0 : size;
+    entry.fields.rsvd = rsvd;
+    entry.fields.timestamp = timestamp;
+    ctx->event_buffer_head++;
+  }
+
+  // Same as CollectGpuEvent but writes into ncclShmem's event buffer, bounded
+  // by LDS_NUM_EVENTS. Compiles to an empty function unless ENABLE_NPKIT is defined.
+  static inline __device__ void CollectGpuEventLDS(uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp) {
+#if defined(ENABLE_NPKIT)
+    if (ncclShmem.event_buffer_head >= LDS_NUM_EVENTS) {
+      return;
+    }
+    NpKitEvent& entry = ncclShmem.event_buffer[ncclShmem.event_buffer_head];
+    entry.fields.type = type;
+    entry.fields.size = size < 0 ? 0 : size;
+    entry.fields.rsvd = rsvd;
+    entry.fields.timestamp = timestamp;
+    ncclShmem.event_buffer_head++;
+#endif
+  }
+
+  // Host-side counterpart of the collectors above; defined out of line.
+  static void CollectCpuEvent(int rank, uint8_t type, int64_t size, uint32_t rsvd, uint64_t timestamp, int channel_id);
+
+  static uint64_t *GetCpuTimestamp();
+  static uint64_t GetCpuTimeNs();
+
+ private:
+  // Body of the thread held in cpu_timestamp_update_thread_ (by its name; defined out of line).
+  static void CpuTimestampUpdateThread();
+
+  // Per-buffer GPU capacity: 1M events. 1M * 32 * 16B = 512MB per GPU.
+  static const uint64_t kMaxNumGpuEventsPerBuffer = 1ULL << 20;
+
+  // Per-buffer CPU capacity: 64K * 2 (send/recv) * (512/32) = 2M events,
+  // 2M * 32 * 16B = 1GB per CPU.
+  static const uint64_t kMaxNumCpuEventsPerBuffer = 1ULL << 21;
+
+  // Per-rank tables, RANK_NUM entries each.
+  static NpKitEvent** gpu_event_buffers_[RANK_NUM];
+  static NpKitEvent** cpu_event_buffers_[RANK_NUM];
+  static int gpu_rtc_rate_khz[RANK_NUM];
+
+  static NpKitEventCollectContext* gpu_collect_contexts_[RANK_NUM];
+  static NpKitEventCollectContext* cpu_collect_contexts_[RANK_NUM];
+  static uint64_t* cpu_timestamp_;
+  static pthread_mutex_t npKitLock;
+
+  static uint64_t rank_;
+
+  // Timestamp thread handle and its stop flag.
+  static std::thread* cpu_timestamp_update_thread_;
+  static volatile bool cpu_timestamp_update_thread_should_stop_;
+};
+
+#endif
--- a/rccl/src/include/npkit/npkit_event.h
+++ b/rccl/src/include/npkit/npkit_event.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef NPKIT_EVENT_H_
+#define NPKIT_EVENT_H_
+
+// NPKit event type identifiers. The values run consecutively from 0x0 to 0x75
+// with no gaps or duplicates. NpKitEvent stores the type in an 8-bit field, so
+// any new value must stay below 0x100.
+#define NPKIT_EVENT_INVALID                                     0x0
+
+// AllReduce, per algorithm.
+#define NPKIT_EVENT_ALL_REDUCE_RING_ENTRY                       0x1
+#define NPKIT_EVENT_ALL_REDUCE_RING_EXIT                        0x2
+#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY                0x3
+#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT                 0x4
+#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY                 0x5
+#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT                  0x6
+
+// Primitive operations (send/recv/copy/reduce combinations).
+#define NPKIT_EVENT_COPY_SEND_ENTRY                             0x7
+#define NPKIT_EVENT_COPY_SEND_EXIT                              0x8
+#define NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY                      0x9
+#define NPKIT_EVENT_DIRECT_COPY_SEND_EXIT                       0xA
+#define NPKIT_EVENT_DIRECT_RECV_ENTRY                           0xB
+#define NPKIT_EVENT_DIRECT_RECV_EXIT                            0xC
+#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY                 0xD
+#define NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT                  0xE
+#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY          0xF
+#define NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT           0x10
+#define NPKIT_EVENT_DIRECT_SEND_ENTRY                           0x11
+#define NPKIT_EVENT_DIRECT_SEND_EXIT                            0x12
+#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY               0x13
+#define NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT                0x14
+#define NPKIT_EVENT_RECV_ENTRY                                  0x15
+#define NPKIT_EVENT_RECV_EXIT                                   0x16
+#define NPKIT_EVENT_RECV_COPY_SEND_ENTRY                        0x17
+#define NPKIT_EVENT_RECV_COPY_SEND_EXIT                         0x18
+#define NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY                      0x19
+#define NPKIT_EVENT_RECV_REDUCE_COPY_EXIT                       0x1A
+#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY                 0x1B
+#define NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT                  0x1C
+#define NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY                      0x1D
+#define NPKIT_EVENT_RECV_REDUCE_SEND_EXIT                       0x1E
+#define NPKIT_EVENT_SEND_ENTRY                                  0x1F
+#define NPKIT_EVENT_SEND_EXIT                                   0x20
+#define NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY                      0x21
+#define NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT                       0x22
+
+// Simple-protocol primitive internals.
+#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY                 0x23
+#define NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT                  0x24
+#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY      0x25
+#define NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT       0x26
+
+// LL-protocol primitive internals.
+#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY                     0x27
+#define NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT                      0x28
+#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY                  0x29
+#define NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT                   0x2A
+
+// LL128-protocol primitive internals.
+#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY                  0x2B
+#define NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT                   0x2C
+#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY               0x2D
+#define NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT                0x2E
+
+// Network send.
+#define NPKIT_EVENT_NET_SEND_ENTRY                              0x2F
+#define NPKIT_EVENT_NET_SEND_EXIT                               0x30
+
+// Network receive.
+#define NPKIT_EVENT_NET_RECV_ENTRY                              0x31
+#define NPKIT_EVENT_NET_RECV_EXIT                               0x32
+
+// Clock synchronisation markers.
+#define NPKIT_EVENT_TIME_SYNC_GPU                               0x33
+#define NPKIT_EVENT_TIME_SYNC_CPU                               0x34
+
+// AllReduce ring, per step.
+#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY                  0x35
+#define NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT                   0x36
+#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY      0x37
+#define NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT       0x38
+#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY  0x39
+#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT   0x3A
+#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY 0x3B
+#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT  0x3C
+#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY           0x3D
+#define NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT            0x3E
+
+// AllReduce tree (up/down), per phase.
+#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY         0x3F
+#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT          0x40
+#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY      0x41
+#define NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT       0x42
+
+// AllReduce tree (split), per phase.
+#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY    0x43
+#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT     0x44
+#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY          0x45
+#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT           0x46
+#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY       0x47
+#define NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT        0x48
+
+// SendRecv.
+#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY                  0x49
+#define NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT                   0x4A
+#define NPKIT_EVENT_SEND_RECV_SEND_ENTRY                        0x4B
+#define NPKIT_EVENT_SEND_RECV_SEND_EXIT                         0x4C
+#define NPKIT_EVENT_SEND_RECV_RECV_ENTRY                        0x4D
+#define NPKIT_EVENT_SEND_RECV_RECV_EXIT                         0x4E
+
+// Note: this one name lacks the EVENT_ infix used by every other identifier here.
+#define NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME                    0x4F
+
+// AllGather ring.
+#define NPKIT_EVENT_ALL_GATHER_RING_ENTRY                       0x50
+#define NPKIT_EVENT_ALL_GATHER_RING_EXIT                        0x51
+#define NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY                  0x52
+#define NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT                   0x53
+#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY        0x54
+#define NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT         0x55
+#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY           0x56
+#define NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT            0x57
+
+// Network request test.
+#define NPKIT_EVENT_NET_TEST_ENTRY                              0x58
+#define NPKIT_EVENT_NET_TEST_EXIT                               0x59
+
+// MSCCL.
+#define NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY                      0x5A
+#define NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT                       0x5B
+#define NPKIT_EVENT_MSCCL_REDUCE_ENTRY                          0x5C
+#define NPKIT_EVENT_MSCCL_REDUCE_EXIT                           0x5D
+#define NPKIT_EVENT_MSCCL_SEND_ENTRY                          	0x5E
+#define NPKIT_EVENT_MSCCL_SEND_EXIT                           	0x5F
+#define NPKIT_EVENT_MSCCL_RECV_ENTRY                            0x60
+#define NPKIT_EVENT_MSCCL_RECV_EXIT                             0x61
+#define NPKIT_EVENT_MSCCL_RUN_ENTRY                             0x62
+#define NPKIT_EVENT_MSCCL_RUN_EXIT                              0x63
+#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY                0x64
+#define NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT                 0x65
+#define NPKIT_EVENT_MSCCL_INIT_ENTRY                            0x66
+#define NPKIT_EVENT_MSCCL_INIT_EXIT                             0x67
+
+// Host side.
+#define NPKIT_EVENT_HOST_ENTRY                                  0x68
+#define NPKIT_EVENT_HOST_EXIT                                   0x69
+
+// Broadcast ring.
+#define NPKIT_EVENT_BROADCAST_RING_ENTRY                        0x6A
+#define NPKIT_EVENT_BROADCAST_RING_EXIT                         0x6B
+#define NPKIT_EVENT_BROADCAST_RING_COPY_SEND_ENTRY              0x6C
+#define NPKIT_EVENT_BROADCAST_RING_COPY_SEND_EXIT               0x6D
+
+// Comm test.
+#define NPKIT_EVENT_COMM_TEST_ENTRY                             0x6E
+#define NPKIT_EVENT_COMM_TEST_EXIT                              0x6F
+
+// SDMA copy inside the simple-protocol primitives.
+#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_ENTRY                 0x70
+#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_EXIT                  0x71
+
+#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_PAL_ENTRY             0x72
+#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_PAL_EXIT              0x73
+
+#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_ENTRY                 0x74
+#define NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_EXIT                  0x75
+
+#endif
--- a/rccl/src/include/npkit/npkit_struct.h
+++ b/rccl/src/include/npkit/npkit_struct.h
+/*************************************************************************
+ * Copyright (c) Microsoft Corporation.
+ * Licensed under the MIT License.
+ ************************************************************************/
+
+#ifndef NPKIT_STRUCT_H_
+#define NPKIT_STRUCT_H_
+
+#include <cstdint>
+
+#pragma pack(push, 1)
+
+// One profiling record, readable either as two raw 64-bit words (`bits`) or as
+// named fields (`fields`). The record is 16 bytes: one 64-bit word holding the
+// three bit-fields, followed by a 64-bit timestamp.
+union NpKitEvent {
+  uint64_t bits[2];
+  struct {
+    // All three bit-fields use the same base type so that they share a single
+    // 64-bit unit (8 + 32 + 24 = 64 bits). Declaring `size` as uint32_t made
+    // the 16-byte layout hold only because of the surrounding #pragma pack(1):
+    // without it GCC/Clang place a uint32_t bit-field on a 4-byte boundary and
+    // the record grows to 24 bytes.
+    uint64_t type : 8;   // event identifier; the collectors pass a uint8_t
+    uint64_t size : 32;  // 32-bit size value; the device collectors store 0 for negative inputs
+    uint64_t rsvd : 24;  // 24-bit field; the collectors pass a uint32_t, so higher bits are cut off
+    uint64_t timestamp;
+  } fields;
+};
+// Buffer sizing elsewhere assumes 16-byte events; fail the build if the layout drifts.
+static_assert(sizeof(union NpKitEvent) == 2 * sizeof(uint64_t), "NpKitEvent must be exactly 16 bytes");
+
+// Write cursor over one event buffer: the buffer base plus the index of the
+// next free slot. NpKit::CollectGpuEvent writes at event_buffer[event_buffer_head]
+// and then increments event_buffer_head.
+struct NpKitEventCollectContext {
+  NpKitEvent* event_buffer;
+  uint64_t event_buffer_head;
+};
+
+#pragma pack(pop)
+
+#endif
--- a/rccl/src/include/nvmlwrap.h
+++ b/rccl/src/include/nvmlwrap.h
+/*************************************************************************
+ * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NVMLWRAP_H_
+#define NCCL_NVMLWRAP_H_
+
+#include "nccl.h"
+
+// Define NCCL_NVML_DIRECT as 1 (by enabling the line below or from the build)
+// to include nvml.h directly; the default of 0 selects the local declarations
+// that follow.
+//#define NCCL_NVML_DIRECT 1
+#ifndef NCCL_NVML_DIRECT
+#define NCCL_NVML_DIRECT 0
+#endif
+
+#if NCCL_NVML_DIRECT
+#include "nvml.h"
+#else
+// Dynamically handle dependencies on NVML
+
+/* Extracted from nvml.h */
+// Device handle: a pointer to a struct type that is only forward-declared here.
+typedef struct nvmlDevice_st* nvmlDevice_t;
+// NOTE(review): presumably the byte size of a PCI bus-id string buffer -- confirm against nvml.h.
+#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE   16
+
+// Two-state feature switch (disabled = 0, enabled = 1).
+typedef enum nvmlEnableState_enum
+{
+    NVML_FEATURE_DISABLED    = 0,     //!< Feature disabled
+    NVML_FEATURE_ENABLED     = 1      //!< Feature enabled
+} nvmlEnableState_t;
+
+// Selects which NVLink capability to query; passed as the `capability`
+// argument of ncclNvmlDeviceGetNvLinkCapability(). The explicit values are
+// contiguous from 0, so NVML_NVLINK_CAP_COUNT evaluates to 6.
+typedef enum nvmlNvLinkCapability_enum
+{
+    NVML_NVLINK_CAP_P2P_SUPPORTED = 0,     // P2P over NVLink is supported
+    NVML_NVLINK_CAP_SYSMEM_ACCESS = 1,     // Access to system memory is supported
+    NVML_NVLINK_CAP_P2P_ATOMICS   = 2,     // P2P atomics are supported
+    NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3,     // System memory atomics are supported
+    NVML_NVLINK_CAP_SLI_BRIDGE    = 4,     // SLI is supported over this link
+    NVML_NVLINK_CAP_VALID         = 5,     // Link is supported on this device
+    // should be last
+    NVML_NVLINK_CAP_COUNT
+} nvmlNvLinkCapability_t;
+
+// NVML status codes. Values 0..19 are contiguous; NVML_ERROR_UNKNOWN is the
+// out-of-sequence value 999. Besides being a return code type, it is also
+// stored per sample in nvmlFieldValue_t::nvmlReturn.
+typedef enum nvmlReturn_enum
+{
+    NVML_SUCCESS = 0,                   //!< The operation was successful
+    NVML_ERROR_UNINITIALIZED = 1,       //!< NVML was not first initialized with nvmlInit()
+    NVML_ERROR_INVALID_ARGUMENT = 2,    //!< A supplied argument is invalid
+    NVML_ERROR_NOT_SUPPORTED = 3,       //!< The requested operation is not available on target device
+    NVML_ERROR_NO_PERMISSION = 4,       //!< The current user does not have permission for operation
+    NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting
+    NVML_ERROR_NOT_FOUND = 6,           //!< A query to find an object was unsuccessful
+    NVML_ERROR_INSUFFICIENT_SIZE = 7,   //!< An input argument is not large enough
+    NVML_ERROR_INSUFFICIENT_POWER = 8,  //!< A device's external power cables are not properly attached
+    NVML_ERROR_DRIVER_NOT_LOADED = 9,   //!< NVIDIA driver is not loaded
+    NVML_ERROR_TIMEOUT = 10,            //!< User provided timeout passed
+    NVML_ERROR_IRQ_ISSUE = 11,          //!< NVIDIA Kernel detected an interrupt issue with a GPU
+    NVML_ERROR_LIBRARY_NOT_FOUND = 12,  //!< NVML Shared Library couldn't be found or loaded
+    NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function
+    NVML_ERROR_CORRUPTED_INFOROM = 14,  //!< infoROM is corrupted
+    NVML_ERROR_GPU_IS_LOST = 15,        //!< The GPU has fallen off the bus or has otherwise become inaccessible
+    NVML_ERROR_RESET_REQUIRED = 16,     //!< The GPU requires a reset before it can be used again
+    NVML_ERROR_OPERATING_SYSTEM = 17,   //!< The GPU control device has been blocked by the operating system/cgroups
+    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,   //!< RM detects a driver/library version mismatch
+    NVML_ERROR_IN_USE = 19,             //!< An operation cannot be performed because the GPU is currently in use
+    NVML_ERROR_UNKNOWN = 999            //!< An internal driver error occurred
+} nvmlReturn_t;
+
+// PCI identification record for a device. In this header it is passed by
+// pointer to ncclNvmlDeviceGetNvLinkRemotePciInfo(). The member order and
+// the trailing reserved words are part of the layout; do not reorder.
+typedef struct nvmlPciInfo_st
+{
+    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (&amp; NULL terminator)
+    unsigned int domain;             //!< The PCI domain on which the device's bus resides, 0 to 0xffff
+    unsigned int bus;                //!< The bus on which the device resides, 0 to 0xff
+    unsigned int device;             //!< The device's id on the bus, 0 to 31
+    unsigned int pciDeviceId;        //!< The combined 16-bit device id and 16-bit vendor id
+
+    // Added in NVML 2.285 API
+    unsigned int pciSubSystemId;     //!< The 32-bit Sub System Device ID
+
+    // NVIDIA reserved for internal use only
+    unsigned int reserved0;
+    unsigned int reserved1;
+    unsigned int reserved2;
+    unsigned int reserved3;
+} nvmlPciInfo_t;
+
+/* Status reported for one P2P capability between a pair of devices.
+ * It is the type of the `p2pStatus` argument of ncclNvmlDeviceGetP2PStatus()
+ * and of the p2pStatusRead / p2pStatusWrite members of ncclNvmlDevicePairInfo.
+ * Only NVML_P2P_STATUS_OK has an explicit value (0); the remaining
+ * enumerators take the implicit values 1..6 in declaration order.
+ * NOTE(review): the "NOT_SUPPORED" spelling is part of the identifier and
+ * presumably mirrors nvml.h -- do not rename without checking all users. */
+typedef enum nvmlGpuP2PStatus_enum
+{
+    NVML_P2P_STATUS_OK     = 0,
+    NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
+    NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
+    NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
+    NVML_P2P_STATUS_DISABLED_BY_REGKEY,
+    NVML_P2P_STATUS_NOT_SUPPORTED,
+    NVML_P2P_STATUS_UNKNOWN
+} nvmlGpuP2PStatus_t;
+
+/* Selects which P2P capability to query; passed as the `p2pIndex` argument
+ * of ncclNvmlDeviceGetP2PStatus(). Enumerators are numbered 0..5 in
+ * declaration order (only the first value is explicit). */
+typedef enum nvmlGpuP2PCapsIndex_enum
+{
+    NVML_P2P_CAPS_INDEX_READ = 0,
+    NVML_P2P_CAPS_INDEX_WRITE,
+    NVML_P2P_CAPS_INDEX_NVLINK,
+    NVML_P2P_CAPS_INDEX_ATOMICS,
+    NVML_P2P_CAPS_INDEX_PROP,
+    NVML_P2P_CAPS_INDEX_UNKNOWN
+} nvmlGpuP2PCapsIndex_t;
+
+/**
+ * Type tag for a sample value: stored in nvmlFieldValue_t::valueType to
+ * describe the type of the value held in nvmlFieldValue_t::value
+ * (an nvmlValue_t union). NVML_VALUE_TYPE_COUNT evaluates to 5.
+ */
+typedef enum nvmlValueType_enum
+{
+    NVML_VALUE_TYPE_DOUBLE = 0,
+    NVML_VALUE_TYPE_UNSIGNED_INT = 1,
+    NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
+    NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
+    NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
+
+    // Keep this last
+    NVML_VALUE_TYPE_COUNT
+}nvmlValueType_t;
+
+
+/**
+ * Union holding a single sample value in one of five representations.
+ * The union itself carries no tag; in nvmlFieldValue_t the accompanying
+ * valueType member records the type of the stored value.
+ */
+typedef union nvmlValue_st
+{
+    double dVal;                    //!< If the value is double
+    unsigned int uiVal;             //!< If the value is unsigned int
+    unsigned long ulVal;            //!< If the value is unsigned long
+    unsigned long long ullVal;      //!< If the value is unsigned long long
+    signed long long sllVal;        //!< If the value is signed long long
+}nvmlValue_t;
+
+/**
+ * Field Identifiers.
+ *
+ * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
+ * These constants are the values placed in nvmlFieldValue_t::fieldId.
+ * The IDs declared in this header are not contiguous (90, 91, 146, 147, 164-166).
+ */
+
+/* NVLink Speed */
+#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90  //!< Common NVLink Speed in MBps for active links
+#define NVML_FI_DEV_NVLINK_LINK_COUNT        91  //!< Number of NVLinks present on the device
+
+/**
+ * Remote device NVLink ID
+ *
+ * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
+ */
+#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID     146 //!< Remote device NVLink ID
+
+/**
+ * NVSwitch: connected NVLink count
+ */
+#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT   147  //!< Number of NVLinks connected to NVSwitch
+
+/* NOTE(review): judging by the names these query NVLink speed, state and
+ * version respectively -- confirm the exact semantics against nvml.h. */
+#define NVML_FI_DEV_NVLINK_GET_SPEED                  164
+#define NVML_FI_DEV_NVLINK_GET_STATE                  165
+#define NVML_FI_DEV_NVLINK_GET_VERSION                166
+#define NVML_FI_MAX 167 //!< One greater than the largest field ID defined above
+
+/**
+ * Information for a Field Value Sample.
+ *
+ * One element of the `values` array passed to ncclNvmlDeviceGetFieldValues().
+ * fieldId must be filled in before the call (and scopeId where the field
+ * requires it, e.g. a link ID); nvmlReturn must be checked before reading
+ * value, whose type is described by valueType.
+ */
+typedef struct nvmlFieldValue_st
+{
+    unsigned int fieldId;       //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above.
+    unsigned int scopeId;       //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId.
+    long long timestamp;        //!< CPU Timestamp of this value in microseconds since 1970
+    long long latencyUsec;      //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call.
+    nvmlValueType_t valueType;  //!< Type of the value stored in value
+    nvmlReturn_t nvmlReturn;    //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS
+    nvmlValue_t value;          //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS
+} nvmlFieldValue_t;
+
+/* End of nvml.h */
+#endif // NCCL_NVML_DIRECT
+
+// Compile-time capacity of the global device tables declared below.
+constexpr int ncclNvmlMaxDevices = 32;
+// Per-device entry: the NVML device handle plus the device's compute
+// capability, split into major and minor numbers.
+struct ncclNvmlDeviceInfo {
+  nvmlDevice_t handle;
+  int computeCapabilityMajor, computeCapabilityMinor;
+};
+// Per-device-pair entry: P2P status for reads and for writes.
+struct ncclNvmlDevicePairInfo {
+  nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite;
+};
+// Global tables; only declared here (extern), defined in another translation unit.
+// NOTE(review): ncclNvmlDeviceCount is presumably the number of valid entries
+// in ncclNvmlDevices, and ncclNvmlDevicePairs is presumably indexed by
+// [device a][device b] -- confirm against the defining source file.
+extern int ncclNvmlDeviceCount;
+extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices];
+extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices];
+
+// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly.
+// Outsiders need only call it if they want to inspect the ncclNvml global
+// tables above.
+// Every function below reports its outcome as an ncclResult_t.
+ncclResult_t ncclNvmlEnsureInitialized();
+
+// NOTE(review): the names and signatures parallel the NVML entry points of the
+// same name without the "nccl" prefix, so these are presumably thin wrappers
+// that write their result through the trailing pointer argument(s) -- confirm
+// against the implementation file.
+ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
+ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index);
+ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
+ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult);
+ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
+ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus);
+// `values` is an array of `valuesCount` entries; see nvmlFieldValue_t for
+// which members must be set before the call.
+ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
+
+#endif // End include guard