#include #include #include #include #include "mpi.h" #include "net.h" #include "bootstrap.h" #include "hardware_utils.h" using namespace sccl; int main(int argc, char* argv[]) { int rank, nranks; int tag1, src, dst, cnt; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nranks); MPI_Comm_rank(MPI_COMM_WORLD, &rank); printf("rank=%d, nranks=%d\n", rank, nranks); // ----------------------------------------------------------------------- // INFO(SCCL_LOG_TOPO, "Bootstrap ...\n"); struct scclRankInfo* rank_info; struct sccl::hardware::topology::bootstrap::scclBootstrapComm* comm; SCCLCHECK(scclCalloc(&rank_info, 1)); SCCLCHECK(scclCalloc(&comm, 1)); rank_info->rank = rank; rank_info->nRanks = nranks; rank_info->localRanks = 2; rank_info->hipDev = rank % rank_info->localRanks; auto sccl_bootstrap = new sccl::hardware::topology::bootstrap::scclBootstrap(rank_info, comm); SCCLCHECK(sccl_bootstrap->bootstrapInitCheck()); sccl::hardware::topology::bootstrap::printUniqueInfo(comm->unique_info); int cuda_id; HIPCHECK(hipGetDevice(&cuda_id)); printf("rank=%d, cuda_id=%d\n", rank, cuda_id); MPI_Finalize(); } /* 单机执行 SCCL_DEBUG_LEVEL=ABORT mpirun --allow-run-as-root -np 4 1_mpi_init SCCL_DEBUG_LEVEL=INFO SCCL_DEBUG_SUBSYS=ALL mpirun --allow-run-as-root -np 2 1_mpi_init 跨机执行 SCCL_DEBUG_LEVEL=ABORT mpirun --allow-run-as-root --hostfile hostfile -np 16 ./1_mpi_init SCCL_DEBUG_LEVEL=ABORT SCCL_DEBUG_SUBSYS=BOOTSTRAP mpirun --allow-run-as-root --hostfile hostfile2 -np 4 ./1_mpi_init */