#include "core.h"
#include "devcomm.h"
#include "comm.h"
#include "topo.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <strings.h>

namespace sccl {
namespace hardware {
namespace topology {
namespace detect {

SCCL_PARAM(Nthreads, "NTHREADS", -2);
SCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);

/**
 * Resolve a thread-count setting from a user-supplied value.
 *
 * @param name     Setting name, used only in warning messages.
 * @param env      User-supplied value; any value <= 0 selects `def`.
 * @param min      Smallest accepted value; smaller positive values are raised to it.
 * @param max      Largest accepted value; larger values are lowered to it.
 * @param def      Value returned when `env` is not positive.
 * @param WarpSize Required granularity; a positive `env` that is not a multiple
 *                 of it is replaced by `max`.
 * @return The validated thread count.
 */
static int getNthreads(const char* name, int env, int min, int max, int def, int WarpSize) {
    int nt = env;
    if(nt > 0) {
        if(nt % WarpSize != 0) {
            WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WarpSize);
            nt = max;
        } else if(nt > max) {
            WARN("Invalid %s %d (maximum %d).", name, nt, max);
            nt = max;
        } else if(nt < min) {
            WARN("Invalid %s %d (minimum %d).", name, nt, min);
            nt = min;
        }
    } else {
        nt = def;
    }
    return nt;
}

/**
 * Parse a comma-separated selection list into an enable/disable table.
 *
 * Without a leading '^' every entry starts at 0 and each listed name is set to 1.
 * With a leading '^' every entry starts at 1 and each listed name is set to 0.
 * Names are compared case-insensitively against `elems`; unknown names and empty
 * tokens (e.g. from ",,") are ignored.
 *
 * The string is scanned in place, so no temporary copy is allocated and there is
 * no allocation failure to handle.
 *
 * @param str    NUL-terminated list; must not be NULL.
 * @param elems  Array of `nelems` known names.
 * @param nelems Number of entries in `elems` and `list`.
 * @param list   Output table of `nelems` flags.
 * @return scclSuccess.
 */
scclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
    int def, set;
    if(str[0] == '^') {
        def = 1;
        set = 0;
        str++;
    } else {
        def = 0;
        set = 1;
    }
    for(int i = 0; i < nelems; i++)
        list[i] = def;

    const char* cursor = str;
    while(*cursor != '\0') {
        // Length of the current token: everything up to the next ',' or the end.
        size_t tokLen = strcspn(cursor, ",");
        if(tokLen > 0) {
            for(int i = 0; i < nelems; i++) {
                // Whole-token match only: same length and same characters ignoring case.
                if(strlen(elems[i]) == tokLen && strncasecmp(cursor, elems[i], tokLen) == 0)
                    list[i] = set;
            }
        }
        cursor += tokLen;
        if(*cursor == ',')
            cursor++;
    }
    return scclSuccess;
}

// Latencies in us, Bandwidths in GB/s
// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple }
static const float baseLat[SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS] = {{12.0, 12.0, 17.0},
                                                                       {12.0, 12.0, 17.0}, // Tree, Ring
                                                                       {12.0, 12.0, 17.0},
                                                                       {12.0, 12.0, 17.0}, // Collnet Direct, Chain
                                                                       {0, 0, 0},
                                                                       {0, 0, 0}}; // NVLS, NVLS Tree

// NVLink, PCI, Network
#define SCCL_HW_NVLINK 0
#define SCCL_HW_PCI 1
#define SCCL_HW_NET 2

// Number of entries in each correction-factor row. scclTopoGetAlgoTime indexes a row
// with log2i(nBytes >> 6) and clamps the index to the last entry.
static const int correctionFactorEntries = 27;

// One tuning profile; the active profile is selected with comm->topo->tuning.
struct tuningModel {
    float hwLat[3][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];   // indexed by SCCL_HW_*, algorithm, protocol
    float bwRatio[2][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS]; // [0]: nNodes <= 2, [1]: more than 2 nodes
    float treeCorrectionFactor[SCCL_NUM_PROTOCOLS][correctionFactorEntries];
    float ringCorrectionFactor[SCCL_NUM_PROTOCOLS][correctionFactorEntries];
};

static struct tuningModel tuning_model_0{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {0.8, 1.4, 2.5},
             /* Ring (LL/LL128/Simple)*/ {0.8, 2.2, 3.6},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 0.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 1.4},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {11.8, 18.2, 20.8},
             /* Ring (LL/LL128/Simple)*/ {9.5, 19.8, 15.1},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 11.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 18.2},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.28, 0.22, 0.91},
             /* Ring (LL/LL128/Simple)*/ {0.31, 0.34, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.04, 0.22, 0.95},
             /* Ring (LL/LL128/Simple)*/ {0.04, 0.34, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            {0.1, 0.2, 0.1, 0.1, 0.9, 0.3, 0.4, 0.1, 0.2, 0.4, 0.2, 0.1, 0.3, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1},
            {0.1, 0.3, 1.0, 0.1, 0.5, 1.0, 0.9, 1.0, 1.0, 1.0, 0.3, 0.1, 0.4, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2},
            // { 0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, },
            {0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4},
        },

    .ringCorrectionFactor =
        {
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.2, 0.3, 0.5, 0.3, 0.1, 0.5, 0.5, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1},
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3},
            {1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
        },
};

static struct tuningModel tuning_model_1{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
             /* Ring (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 4.5},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 4.5},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {33.0, 33.0, 15.8},
             /* Ring (LL/LL128/Simple)*/ {5.1, 5.1, 68.8},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 15.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 15.8},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.30, 1.00, 0.99},
             /* Ring (LL/LL128/Simple)*/ {0.31, 1.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.15, 1.00, 0.42},
             /* Ring (LL/LL128/Simple)*/ {0.20, 1.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            {0.5, 0.4, 0.7, 0.6, 1.0, 1.0, 0.5, 0.4, 0.1, 0.5, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1},
            {0.5, 0.4, 0.7, 0.6, 1.0, 1.0, 0.5, 0.4, 0.1, 0.5, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1},
            // { 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.3, 0.3, },
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 0.4, 0.4, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1},
        },

    .ringCorrectionFactor =
        {
            {1.0, 0.5, 1.0, 1.0, 0.6, 0.7, 1.0, 1.0, 0.2, 1.0, 0.9, 0.7, 1.0, 1.0, 1.0, 0.9, 0.9, 0.8, 0.8, 0.7, 0.6, 0.5, 0.5, 0.3, 0.2, 0.1, 0.1},
            {1.0, 0.5, 1.0, 1.0, 0.6, 0.7, 1.0, 1.0, 0.2, 1.0, 0.9, 0.7, 1.0, 1.0, 1.0, 0.9, 0.9, 0.8, 0.8, 0.7, 0.6, 0.5, 0.5, 0.3, 0.2, 0.1, 0.1},
            {0.3, 1.0, 0.3, 0.1, 0.1, 0.1, 0.3, 0.7, 1.0, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.9, 1.0, 1.0, 1.0, 1.0},
        },
};

static struct tuningModel tuning_model_2{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
             /* Ring (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 4.5},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 4.5},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {27.9, 27.9, 15.8},
             /* Ring (LL/LL128/Simple)*/ {12.1, 12.1, 68.8},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 15.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 15.8},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.30, 1.00, 0.99},
             /* Ring (LL/LL128/Simple)*/ {0.31, 1.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.07, 1.00, 0.42},
             /* Ring (LL/LL128/Simple)*/ {0.08, 1.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            {0.1, 0.4, 0.3, 0.3, 0.2, 0.4, 0.5, 0.1, 0.1, 0.6, 0.7, 0.7, 0.8, 1.0, 0.9, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2},
            {0.1, 0.4, 0.3, 0.3, 0.2, 0.4, 0.5, 0.1, 0.1, 0.6, 0.7, 0.7, 0.8, 1.0, 0.9, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2},
            // { 1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.9, 0.9, 1.0, 1.0, 1.0, },
            {1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 0.4, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4},
        },

    .ringCorrectionFactor =
        {
            {0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1},
            {0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1},
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5, 0.6, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
        },
};

static struct tuningModel tuning_model_3{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {0.8, 0.0, 2.5},
             /* Ring (LL/LL128/Simple)*/ {0.8, 0.0, 3.6},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 0.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 0.0},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {12.5, 0.0, 22.4},
             /* Ring (LL/LL128/Simple)*/ {9.5, 0.0, 19.8},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 12.5},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 0.0},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.20, 0.00, 1.75},
             /* Ring (LL/LL128/Simple)*/ {0.20, 0.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.20, 0.00, 0.96},
             /* Ring (LL/LL128/Simple)*/ {0.20, 0.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.2, 1.0, 0.9, 1.0, 0.6, 0.4, 0.6, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2},
            {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
            // { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.8, 0.9, 0.7, 0.7, },
            {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 0.4, 0.3, 0.3, 0.3, 0.4, 0.3, 0.3},
        },

    .ringCorrectionFactor =
        {
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.2, 0.1, 0.4, 0.4, 0.2, 0.2, 0.3, 0.7, 0.5, 0.4, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2},
            {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0},
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
        },
};

static struct tuningModel tuning_model_4{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {0.8, 1.4, 2.5},
             /* Ring (LL/LL128/Simple)*/ {0.8, 2.2, 3.6},
             /* CollNetDirect (Simple)*/ {0.8, 1.4, 2.5},
             /* CollNetChain (Simple)*/ {0.8, 1.4, 2.5},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {32.2, 34.4, 47.6},
             /* Ring (LL/LL128/Simple)*/ {35.4, 87.8, 209.2},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 47.6},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 47.6},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.16, 1.09, 1.61},
             /* Ring (LL/LL128/Simple)*/ {0.15, 0.41, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.16, 1.09, 1.08},
             /* Ring (LL/LL128/Simple)*/ {0.15, 0.41, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    // NOTE(review): both candidate rows for the third protocol are commented out here, so
    // that row is zero-initialised and scclTopoGetAlgoTime reports Tree with that protocol
    // as unavailable for this model. Confirm this is intended.
    .treeCorrectionFactor =
        {
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1, 0.1, 0.2, 0.4, 0.6, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1},
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.2, 1.0, 0.5, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2},
            // { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.5, 0.6, 0.6, 0.5, 0.6, 0.6, 0.6, 0.7, },
            // { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, },
        },

    .ringCorrectionFactor =
        {
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1},
            {0.4, 0.5, 0.5, 0.4, 0.4, 0.4, 0.4, 0.2, 0.2, 0.1, 0.3, 1.0, 1.0, 0.7, 0.8, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.5, 0.4, 0.3, 0.3},
            {0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.5, 0.1, 0.7, 0.2, 0.4, 0.4, 0.6, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0},
        },
};

// All tuning profiles, indexed by comm->topo->tuning.
static struct tuningModel rcclTuningModel[] = {
    tuning_model_0,
    tuning_model_1,
    tuning_model_2,
    tuning_model_3,
    tuning_model_4,
};

/* Array indexes used below */
#define VOLTA_COMPCAP_IDX 0
#define AMPERE_COMPCAP_IDX 1
#define HOPPER_COMPCAP_IDX 2

// LL128 max BW per channel
static const double llMaxBws[3][3] = {
    /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4},
    /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
    /* Hopper-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}};

static const double perChMaxRingLL128Bws[3][3] = {
    /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
    /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
    /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
};
static const double perChMaxTreeLL128Bws[3][3] = {
    /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
    /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
    /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
};
static const double perChMaxTreeBws[3][3] = {
    /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
    /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
    /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
};

// Network post overhead in ns (1000 = 1 us)
SCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);

/**
 * Return the per-step network post overhead.
 *
 * A NET_OVERHEAD parameter other than -2 wins and is scaled by 0.001. Otherwise the
 * value depends on the CPU reported by scclTopoCpuType: 2.0 for x86 AMD, 1.0 for
 * everything else.
 *
 * The CPU query is checked explicitly instead of through SCCLCHECK: this function
 * returns a float, so an early return of a result code would be read as a latency.
 * On failure the default of 1.0 is used.
 */
static float getNetOverhead(struct scclComm* comm) {
    if(scclParamNetOverhead() != -2)
        return scclParamNetOverhead() * .001;

    int cpuArch, cpuVendor, cpuModel;
    if(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel) != scclSuccess)
        return 1.0;

    if(cpuArch == SCCL_TOPO_CPU_ARCH_X86 && cpuVendor == SCCL_TOPO_CPU_VENDOR_INTEL)
        return 1.0;
    if(cpuArch == SCCL_TOPO_CPU_ARCH_X86 && cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD)
        return 2.0;
    return 1.0;
}

/**
 * Fill the communicator's tuning tables.
 *
 * Writes comm->maxThreads, then (for nRanks > 1) comm->bandwidths and comm->latencies
 * for every function/algorithm/protocol combination, applies the SCCL_PROTO / SCCL_ALGO
 * environment overrides and support checks by zeroing the bandwidth of disabled
 * combinations, logs the resulting table on rank 0, and finally sets
 * comm->threadThresholds (optionally overridden by SCCL_THREAD_THRESHOLDS).
 *
 * @param comm       Communicator to tune; its topo, nNodes, nRanks, rank, WarpSize and
 *                   collNetSupport fields are read.
 * @param minCompCap Minimum compute capability across ranks.
 * @param maxCompCap Not used by this function.
 * @param graphs     Per-algorithm topology graphs, indexed by SCCL_ALGO_*.
 * @return scclSuccess, or the error propagated by SCCLCHECK from a failed helper.
 */
scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs) {
    int simpleDefaultThreads = (graphs[SCCL_ALGO_RING]->bwIntra * graphs[SCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : SCCL_SIMPLE_MAX_NTHREADS;
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE] =
        getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] =
        getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL] =
        comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_LL] =
            getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL128] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL128] = getNthreads(
        "SCCL_LL128_NTHREADS", scclParamLl128Nthreads(), 4 * comm->WarpSize, SCCL_LL128_MAX_NTHREADS, SCCL_LL128_MAX_NTHREADS, comm->WarpSize);

    int nNodes = comm->nNodes;
    int nRanks = comm->nRanks;
    if(nRanks <= 1)
        return scclSuccess;

    int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
    int cpuArch, cpuVendor, cpuModel;
    SCCLCHECK(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
    int index2 = nNodes <= 2 ? nNodes - 1 : 2;
    // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
    int index1 = nNodes == 1 ? compCapIndex : cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
    // The four per-channel limits and ppn below are looked up but not referenced again
    // in this function; the bandwidth model relies on the bwRatio tables instead.
    double llMaxBw             = llMaxBws[index1][index2];
    double perChMaxTreeBw      = perChMaxTreeBws[compCapIndex][index2];
    double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
    double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
    // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
    // if (cpuArch == SCCL_TOPO_CPU_ARCH_POWER) hwLat[SCCL_HW_PCI][SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = hwLat[SCCL_HW_PCI][SCCL_ALGO_RING][SCCL_PROTO_SIMPLE];
    float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount

    // Active tuning profile; looked up once instead of on every table access.
    const struct tuningModel& model = rcclTuningModel[comm->topo->tuning];

    int intraHw[SCCL_NUM_ALGORITHMS], hw[SCCL_NUM_ALGORITHMS];
    for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
        intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? SCCL_HW_NVLINK : SCCL_HW_PCI;
    for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
        hw[a] = nNodes == 1 ? intraHw[a] : SCCL_HW_NET;

    for(int coll = 0; coll < SCCL_NUM_FUNCTIONS; coll++) {
        int nsteps = coll == scclFuncAllReduce ? 2 * (nRanks - 1) : coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nRanks - 1 : nRanks;
        int nInterSteps = coll == scclFuncAllReduce                                     ? (nNodes > 1 ? 2 * nNodes : 0)
                          : coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nNodes - 1
                                                                                       : nNodes;
        for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
            // Only AllReduce is modelled for non-ring algorithms.
            if(coll == scclFuncBroadcast && a != SCCL_ALGO_RING)
                continue;
            if(coll == scclFuncReduce && a != SCCL_ALGO_RING)
                continue;
            if(coll == scclFuncReduceScatter && a != SCCL_ALGO_RING)
                continue;
            if(coll == scclFuncAllGather && a != SCCL_ALGO_RING)
                continue;

            for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                if((a == SCCL_ALGO_NVLS || a == SCCL_ALGO_NVLS_TREE) && p != SCCL_PROTO_SIMPLE)
                    continue;
                int collnet = (a == SCCL_ALGO_COLLNET_DIRECT || a == SCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
                float bw    = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
                float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
                // INFO(SCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", scclAlgoStr[a], scclProtoStr[p], busBw,
                // comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);

                // Various model refinements
                if(nNodes <= 2)
                    busBw *= model.bwRatio[0][a][p];
                else
                    busBw *= model.bwRatio[1][a][p];
                if(a == SCCL_ALGO_COLLNET_DIRECT && p == SCCL_PROTO_SIMPLE && minCompCap >= 90)
                    busBw *= .85;

                // Convert bus BW to algorithm BW
                float ratio;
                if(a == SCCL_ALGO_RING)
                    ratio = (1.0 * nRanks) / nsteps;
                else if(a == SCCL_ALGO_NVLS)
                    ratio = 5.0 / 6.0;
                else if(a == SCCL_ALGO_NVLS_TREE)
                    ratio = .70 * nNodes / (2 * (nNodes - 1));
                else
                    ratio = .5;
                comm->bandwidths[coll][a][p] = busBw * ratio;

                comm->latencies[coll][a][p] = baseLat[a][p];
                float intraLat              = model.hwLat[intraHw[a]][a][p];
                float interLat              = graphs[a]->latencyInter ? graphs[a]->latencyInter : model.hwLat[SCCL_HW_NET][a][p];
                // if (nNodes > 1 && p == SCCL_PROTO_LL) intraLat *= 1.8;
                if(p == SCCL_PROTO_SIMPLE)
                    interLat += graphs[a]->latencyInter;

                if(a == SCCL_ALGO_RING) {
                    float lat = model.hwLat[hw[a]][a][p];
                    if((coll == scclFuncReduce || coll == scclFuncBroadcast)) {
                        if(graphs[a]->sameChannels) {
                            comm->latencies[coll][a][p] += lat;
                        } else {
                            if(p == SCCL_PROTO_SIMPLE)
                                lat = model.hwLat[hw[a]][SCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
                            comm->latencies[coll][a][p] += nsteps * lat;
                        }
                    } else {
                        // Inter-node rings still have to launch nsteps * net overhead.
                        float netOverhead = 0.0;
                        if(nNodes > 1) {
                            netOverhead = getNetOverhead(comm);
                            if(p == SCCL_PROTO_SIMPLE)
                                netOverhead *= 3;
                        }
                        intraLat = std::max(intraLat, netOverhead);
                        comm->latencies[coll][a][p] += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat;
                    }
                } else if(a == SCCL_ALGO_TREE) {
                    comm->latencies[coll][a][p] += 2 * ((nRanks / nNodes - 1) * intraLat + log2i(nNodes) * interLat);
                } else if(a == SCCL_ALGO_COLLNET_DIRECT) {
                    comm->latencies[coll][a][p] += 2 * (std::min(1, (nRanks / nNodes - 1)) * intraLat + (nRanks / nNodes - 1) * 0.5) +
                                                   interLat; // Add 0.5 arity serialization latency
                } else if(a == SCCL_ALGO_COLLNET_CHAIN) {
                    comm->latencies[coll][a][p] += 2 * (nRanks / nNodes - 1) * intraLat + interLat;
                } else if(a == SCCL_ALGO_NVLS) {
                    if(nNodes > 1)
                        comm->latencies[coll][a][p] += model.hwLat[SCCL_HW_NET][a][p];
                } else if(a == SCCL_ALGO_NVLS_TREE) {
                    comm->latencies[coll][a][p] += 2 * (nNodes - 1) * model.hwLat[SCCL_HW_NET][a][p];
                }
            }
        }
    }

    // Protocols/Algorithms enable/disable, and user overrides.
    // All are enabled except ll128 which is enabled by default only in certain cases.
    // In protoEnable, 2 means "decide automatically" (only meaningful for LL128 below).
    int protoEnable[SCCL_NUM_PROTOCOLS] = {1, 2, 1};
    int algoEnable[SCCL_NUM_ALGORITHMS] = {1, 1, 1, 1, 1, 1};

    const char* protoStr = getenv("SCCL_PROTO");
    if(protoStr) {
        INFO(SCCL_ENV, "SCCL_PROTO set by environment to %s", protoStr);
        SCCLCHECK(parseList(protoStr, scclProtoStr, SCCL_NUM_PROTOCOLS, protoEnable));
    }
    const char* algoStr = getenv("SCCL_ALGO");
    if(algoStr) {
        INFO(SCCL_ENV, "SCCL_ALGO set by environment to %s", algoStr);
        SCCLCHECK(parseList(algoStr, scclAlgoStr, SCCL_NUM_ALGORITHMS, algoEnable));
    }

    if(comm->nNodes == 1)
        algoEnable[SCCL_ALGO_NVLS_TREE] = 0;

    // Disable CollNet if it is not supported
    if(comm->collNetSupport == 0) {
        algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
        algoEnable[SCCL_ALGO_COLLNET_CHAIN]  = 0;
        if(comm->nNodes > 1)
            algoEnable[SCCL_ALGO_NVLS] = 0;
        // If user has hard set SCCL_ALGO=COLLNET, ignore it
        if(algoEnable[SCCL_ALGO_RING] == 0 && algoEnable[SCCL_ALGO_TREE] == 0 && algoEnable[SCCL_ALGO_NVLS] == 0 &&
           algoEnable[SCCL_ALGO_NVLS_TREE] == 0) {
            algoEnable[SCCL_ALGO_RING] = algoEnable[SCCL_ALGO_TREE] = 1;
            if(comm->rank == 0)
                WARN("CollNet is not supported or fails to initialize, ignoring SCCL_ALGO=COLLNET");
        }
    } else {
        // Disable CollNet+Direct if not on an NVSwitch system
        int nvsCount = 0;
        SCCLCHECK(scclTopoGetNvsCount(comm->topo, &nvsCount));
        if(nvsCount == 0)
            algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
    }

    for(int c = 0; c < SCCL_NUM_FUNCTIONS; c++)
        for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
            for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                // Resolve the protocol switch; the "automatic" value 2 is only handled for LL128.
                int pEnable = protoEnable[p];
                if(pEnable == 2 && p == SCCL_PROTO_LL128) {
#if defined(ENABLE_LL128)
                    // Enable LL128 by default only on gfx90a with available tuning table
                    pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
                                      (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled)
                                  ? 1
                                  : 0;
#else
                    pEnable = 0;
#endif
                }
                if(pEnable == 0)
                    comm->bandwidths[c][a][p] = 0;
                // Never disable ring for non-allreduce operations. That allows to run real apps with SCCL_ALGO=TREE.
                if(a == SCCL_ALGO_RING && c != scclFuncAllReduce)
                    continue;
                if(algoEnable[a] == 0)
                    comm->bandwidths[c][a][p] = 0;
            }

    if(comm->rank == 0) {
        // Every append is bounded by the space left in `line`, so an unexpectedly wide
        // value truncates the log line instead of overrunning the buffer.
        char line[1024];
        for(int block = 0; block < 2; block++) {
            snprintf(line, sizeof(line), " Algorithm |");
            for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                int a       = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                size_t used = strlen(line);
                snprintf(line + used, sizeof(line) - used, " %14s %14s %14s |", "", scclAlgoStr[a], "");
            }
            INFO(SCCL_TUNING, "%s", line);

            snprintf(line, sizeof(line), " Protocol |");
            for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                    size_t used = strlen(line);
                    snprintf(line + used, sizeof(line) - used, " %14s |", scclProtoStr[p]);
                }
            }
            INFO(SCCL_TUNING, "%s", line);

            snprintf(line, sizeof(line), " Max NThreads |");
            for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                    size_t used = strlen(line);
                    snprintf(line + used, sizeof(line) - used, " %14d |", comm->maxThreads[a][p]);
                }
            }
            INFO(SCCL_TUNING, "%s", line);

            for(int c = 0; c < SCCL_NUM_FUNCTIONS; c++) {
                snprintf(line, sizeof(line), "%13s |", scclFuncStr[c]);
                for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                    int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                    for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                        size_t used = strlen(line);
                        snprintf(line + used, sizeof(line) - used, "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
                    }
                }
                INFO(SCCL_TUNING, "%s", line);
            }
        }
    }

    // Set per-thread amount of work before we increase nThreads and nChannels
    for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
        comm->threadThresholds[a][SCCL_PROTO_LL]     = SCCL_LL_THREAD_THRESHOLD;
        comm->threadThresholds[a][SCCL_PROTO_LL128]  = SCCL_LL128_THREAD_THRESHOLD;
        comm->threadThresholds[a][SCCL_PROTO_SIMPLE] = SCCL_SIMPLE_THREAD_THRESHOLD;
    }
    comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL] *= nRanks;
    comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] = 256;
    comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE]  = 256;

    // Override defaults with user env
    char* str = getenv("SCCL_THREAD_THRESHOLDS");
    if(str) {
        INFO(SCCL_ENV, "SCCL_THREAD_THRESHOLDS set by environment to %s", str);
        // Entries the user did not supply (or that fail to parse) stay at -2 and are
        // skipped below, so a partial list is accepted on purpose.
        ssize_t t[2][SCCL_NUM_PROTOCOLS] = {{-2, -2, -2}, {-2, -2, -2}};
        // %zd is the conversion that matches ssize_t.
        sscanf(str, "%zd %zd %zd %zd %zd %zd", t[0], t[0] + 1, t[0] + 2, t[1], t[1] + 1, t[1] + 2);
        for(int a = 0; a < 2; a++) {
            for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                if(t[a][p] >= 0)
                    comm->threadThresholds[a][p] = t[a][p];
            }
        }
    }

    INFO(SCCL_INIT,
         "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL],
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL128],
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL128],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE]);
    return scclSuccess;
}

/**
 * Estimate the time of one operation for a given algorithm/protocol pair.
 *
 * The estimate is `latency * latCount + nBytes / (1000 * bandwidth)`, using the tables
 * filled by scclTopoTuneModel. For Tree, and for Ring when nNodes > 1, the bandwidth is
 * first scaled by the active profile's size-dependent correction factor.
 *
 * @param info       Operation description; comm, coll and nBytes are read.
 * @param algorithm  SCCL_ALGO_* index.
 * @param protocol   SCCL_PROTO_* index.
 * @param numPipeOps Number of pipelined operations, used to scale the latency term.
 * @param time       Output; set to -1.0 when the combination has no usable bandwidth.
 * @return scclSuccess.
 */
scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
    float bw  = info->comm->bandwidths[info->coll][algorithm][protocol];
    float lat = info->comm->latencies[info->coll][algorithm][protocol];
    if(bw == 0) {
        *time = -1.0;
        return scclSuccess;
    }

    // Size bucket for the correction tables, clamped to the valid row range.
    // NOTE(review): log2i is defined elsewhere; presumably it returns floor(log2(x)) — confirm.
    int logSize = log2i(info->nBytes >> 6);
    if(logSize < 0)
        logSize = 0;
    if(logSize > correctionFactorEntries - 1)
        logSize = correctionFactorEntries - 1;

    const struct tuningModel& model = rcclTuningModel[info->comm->topo->tuning];
    if(algorithm == SCCL_ALGO_TREE) {
        bw *= model.treeCorrectionFactor[protocol][logSize];
    } else if(algorithm == SCCL_ALGO_RING && info->comm->nNodes > 1) {
        bw *= model.ringCorrectionFactor[protocol][logSize];
    }

    // A zero correction factor leaves no usable bandwidth; report the combination as
    // unavailable rather than dividing by zero below.
    if(bw == 0) {
        *time = -1.0;
        return scclSuccess;
    }

    // Tree pipelining saves latency in aggregation cases
    int latCount = algorithm == SCCL_ALGO_RING ? numPipeOps : DIVUP(numPipeOps, SCCL_MAX_WORK_ELEMENTS);
    *time        = lat * latCount + (info->nBytes) / (1000 * bw);
    return scclSuccess;
}

} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl