#include "core.h"
#include "devcomm.h"
#include "comm.h"
#include "topo.h"

namespace sccl {
namespace hardware {
namespace topology {
namespace detect {

// Tunable thread-count parameters, read in scclTopoTuneModel through
// scclParamNthreads() and scclParamLl128Nthreads(). The default of -2 is
// non-positive, which getNthreads() treats as "not set" and replaces with its
// own default. Presumably backed by the SCCL_NTHREADS / SCCL_LL128_NTHREADS
// environment variables (those names appear in the warnings) - TODO confirm
// against the SCCL_PARAM macro definition.
SCCL_PARAM(Nthreads, "NTHREADS", -2);
SCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);

/**
 * Validate a user-requested thread count and clamp it to the allowed range.
 *
 * @param name      Parameter name used in warning messages.
 * @param env       Requested value; any value <= 0 means "not set".
 * @param min       Smallest accepted thread count.
 * @param max       Largest accepted thread count.
 * @param def       Value returned when the request is not set.
 * @param WarpSize  Required granularity; a request must be a multiple of it.
 * @return def when unset; max when the request is not a multiple of WarpSize
 *         or exceeds max; min when it is below min; otherwise the request.
 */
static int getNthreads(const char* name, int env, int min, int max, int def, int WarpSize) {
    // Nothing requested: use the caller-supplied default untouched.
    if(env <= 0)
        return def;

    // A request that is not warp-aligned is replaced by the maximum.
    if(env % WarpSize != 0) {
        WARN("Invalid %s %d (must be a multiple of %d)", name, env, WarpSize);
        return max;
    }

    if(env > max) {
        WARN("Invalid %s %d (maximum %d).", name, env, max);
        return max;
    }

    if(env < min) {
        WARN("Invalid %s %d (minimum %d).", name, env, min);
        return min;
    }

    return env;
}

/**
 * Parse a comma-separated list of names into per-element enable flags.
 *
 * A leading '^' inverts the meaning: every element starts enabled (1) and the
 * listed names are disabled (0). Without it every element starts disabled (0)
 * and the listed names are enabled (1). Names are matched case-insensitively
 * against whole tokens; empty tokens (e.g. ",," or a trailing comma) and
 * tokens that match no element are ignored.
 *
 * The string is scanned in place, so no temporary copy is allocated and there
 * is no allocation-failure path.
 *
 * @param str     NUL-terminated list; must not be NULL.
 * @param elems   Array of nelems candidate names.
 * @param nelems  Number of entries in elems and list.
 * @param list    Output array of nelems flags, fully overwritten.
 * @return scclSuccess.
 */
scclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) {
    int def, set;
    if(str[0] == '^') {
        def = 1;
        set = 0;
        str++;
    } else {
        def = 0;
        set = 1;
    }
    for(int i = 0; i < nelems; i++)
        list[i] = def;

    const char* cur = str;
    while(*cur != '\0') {
        // Length of the current token: everything up to the next ',' or the end.
        size_t len = strcspn(cur, ",");
        if(len > 0) {
            for(int i = 0; i < nelems; i++) {
                // Require equal length so that a token only matches a whole name,
                // never a prefix of one.
                if(strlen(elems[i]) == len && strncasecmp(cur, elems[i], len) == 0)
                    list[i] = set;
            }
        }
        cur += len;
        if(*cur == ',')
            cur++; // step over the separator
    }
    return scclSuccess;
}

// Latencies in us, Bandwidths in GB/s
// Base latency table indexed as baseLat[algorithm][protocol]; scclTopoTuneModel
// uses it as the starting value of comm->latencies[coll][a][p].
// Every row lists the protocols in the order { LL, LL128, Simple }; the
// trailing comments name the algorithm of each row.
static const float baseLat[SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS] = {{12.0, 12.0, 17.0},
                                                                       {12.0, 12.0, 17.0}, // Tree, Ring
                                                                       {12.0, 12.0, 17.0},
                                                                       {12.0, 12.0, 17.0}, // Collnet Direct, Chain
                                                                       {0, 0, 0},
                                                                       {0, 0, 0}}; // NVLS, NVLS Tree

// NVLink, PCI, Network
// Values used for the first index of tuningModel::hwLat.
#define SCCL_HW_NVLINK 0
#define SCCL_HW_PCI 1
#define SCCL_HW_NET 2

// One set of tuning coefficients. scclTopoTuneModel selects a set with
// rcclTuningModel[comm->topo->tuning].
struct tuningModel {
    // Hardware latency, indexed [SCCL_HW_*][algorithm][protocol].
    float hwLat[3][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];
    // Bus-bandwidth multiplier, indexed [0 when nNodes <= 2, 1 otherwise][algorithm][protocol].
    float bwRatio[2][SCCL_NUM_ALGORITHMS][SCCL_NUM_PROTOCOLS];
    // Per-protocol correction tables with 27 entries each. They are not read in
    // the part of the file visible here; presumably indexed by log2 of the
    // message size - TODO confirm against the consumer.
    float treeCorrectionFactor[SCCL_NUM_PROTOCOLS][27];
    float ringCorrectionFactor[SCCL_NUM_PROTOCOLS][27];
};

// Tuning coefficient set 0 (first entry of rcclTuningModel).
// hwLat is laid out as [link type][algorithm][protocol] and bwRatio as
// [node-count class][algorithm][protocol]; each innermost row lists the
// protocols in the order LL, LL128, Simple. The correction-factor tables hold
// one 27-entry row per protocol, in the same protocol order.
static struct tuningModel tuning_model_0{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {0.8, 1.4, 2.5},
             /* Ring (LL/LL128/Simple)*/ {0.8, 2.2, 3.6},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 0.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 1.4},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {11.8, 18.2, 20.8},
             /* Ring (LL/LL128/Simple)*/ {9.5, 19.8, 15.1},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 11.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 18.2},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.28, 0.22, 0.91},
             /* Ring (LL/LL128/Simple)*/ {0.31, 0.34, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.04, 0.22, 0.95},
             /* Ring (LL/LL128/Simple)*/ {0.04, 0.34, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            { // LL
                0.1, 0.2, 0.1, 0.1, 0.9, 0.3, 0.4, 0.1, 0.2, 0.4, 0.2, 0.1, 0.3, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
            },
            { // LL128
                0.1, 0.3, 1.0, 0.1, 0.5, 1.0, 0.9, 1.0, 1.0, 1.0, 0.3, 0.1, 0.4, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
            },
            // Disabled alternative for the Simple row below; it differs from the
            // active row only in entries 19..26 (0-based).
            //    { 0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, },
            { // Simple
                0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4,
            },
        },

    .ringCorrectionFactor =
        {
            { // LL
                0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.2, 0.3, 0.5, 0.3, 0.1, 0.5, 0.5, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
            },
            { // LL128
                0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3,
            },
            { // Simple
                1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            },
        },
};

// Tuning coefficient set 1 (second entry of rcclTuningModel).
// hwLat is laid out as [link type][algorithm][protocol] and bwRatio as
// [node-count class][algorithm][protocol]; each innermost row lists the
// protocols in the order LL, LL128, Simple. The correction-factor tables hold
// one 27-entry row per protocol, in the same protocol order. In this set the
// LL and LL128 correction rows are identical.
static struct tuningModel tuning_model_1{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
             /* Ring (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 4.5},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 4.5},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {33.0, 33.0, 15.8},
             /* Ring (LL/LL128/Simple)*/ {5.1, 5.1, 68.8},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 15.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 15.8},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.30, 1.00, 0.99},
             /* Ring (LL/LL128/Simple)*/ {0.31, 1.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.15, 1.00, 0.42},
             /* Ring (LL/LL128/Simple)*/ {0.20, 1.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            { // LL
                0.5, 0.4, 0.7, 0.6, 1.0, 1.0, 0.5, 0.4, 0.1, 0.5, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1,
            },
            { // LL128
                0.5, 0.4, 0.7, 0.6, 1.0, 1.0, 0.5, 0.4, 0.1, 0.5, 0.4, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.6, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1,
            },
            // Disabled alternative for the Simple row below; it differs from the
            // active row only in entries 19..26 (0-based).
            //    { 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.5, 0.3, 0.3, },
            { // Simple
                0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.4, 0.5, 0.1, 0.6, 1.0, 1.0, 1.0, 0.6, 0.5, 0.7, 1.0, 1.0, 1.0, 0.4, 0.4, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1,
            },
        },

    .ringCorrectionFactor =
        {
            { // LL
                1.0, 0.5, 1.0, 1.0, 0.6, 0.7, 1.0, 1.0, 0.2, 1.0, 0.9, 0.7, 1.0, 1.0, 1.0, 0.9, 0.9, 0.8, 0.8, 0.7, 0.6, 0.5, 0.5, 0.3, 0.2, 0.1, 0.1,
            },
            { // LL128
                1.0, 0.5, 1.0, 1.0, 0.6, 0.7, 1.0, 1.0, 0.2, 1.0, 0.9, 0.7, 1.0, 1.0, 1.0, 0.9, 0.9, 0.8, 0.8, 0.7, 0.6, 0.5, 0.5, 0.3, 0.2, 0.1, 0.1,
            },
            { // Simple
                0.3, 1.0, 0.3, 0.1, 0.1, 0.1, 0.3, 0.7, 1.0, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.9, 1.0, 1.0, 1.0, 1.0,
            },
        },
};

// Tuning coefficient set 2 (third entry of rcclTuningModel).
// hwLat is laid out as [link type][algorithm][protocol] and bwRatio as
// [node-count class][algorithm][protocol]; each innermost row lists the
// protocols in the order LL, LL128, Simple. The correction-factor tables hold
// one 27-entry row per protocol, in the same protocol order. In this set the
// LL and LL128 correction rows are identical.
static struct tuningModel tuning_model_2{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
             /* Ring (LL/LL128/Simple)*/ {1.5, 1.5, 4.5},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 4.5},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 4.5},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {27.9, 27.9, 15.8},
             /* Ring (LL/LL128/Simple)*/ {12.1, 12.1, 68.8},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 15.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 15.8},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.30, 1.00, 0.99},
             /* Ring (LL/LL128/Simple)*/ {0.31, 1.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.07, 1.00, 0.42},
             /* Ring (LL/LL128/Simple)*/ {0.08, 1.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            { // LL
                0.1, 0.4, 0.3, 0.3, 0.2, 0.4, 0.5, 0.1, 0.1, 0.6, 0.7, 0.7, 0.8, 1.0, 0.9, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
            },
            { // LL128
                0.1, 0.4, 0.3, 0.3, 0.2, 0.4, 0.5, 0.1, 0.1, 0.6, 0.7, 0.7, 0.8, 1.0, 0.9, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
            },
            // Disabled alternative for the Simple row below; it differs from the
            // active row only in entries 19..26 (0-based).
            //    { 1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.9, 0.9, 1.0, 1.0, 1.0, },
            { // Simple
                1.0, 0.1, 0.1, 0.1, 0.1, 0.2, 0.3, 0.5, 0.1, 0.6, 0.9, 0.8, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 0.4, 0.3, 0.4, 0.4, 0.4, 0.4, 0.4,
            },
        },

    .ringCorrectionFactor =
        {
            { // LL
                0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
            },
            { // LL128
                0.1, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 1.0, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
            },
            { // Simple
                0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.5, 0.6, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            },
        },
};

// Tuning coefficient set 3 (fourth entry of rcclTuningModel).
// hwLat is laid out as [link type][algorithm][protocol] and bwRatio as
// [node-count class][algorithm][protocol]; each innermost row lists the
// protocols in the order LL, LL128, Simple. The correction-factor tables hold
// one 27-entry row per protocol, in the same protocol order.
// In this set the LL128 bwRatio entries and both LL128 correction rows are
// explicitly zero.
static struct tuningModel tuning_model_3{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {0.8, 0.0, 2.5},
             /* Ring (LL/LL128/Simple)*/ {0.8, 0.0, 3.6},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 0.8},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 0.0},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {12.5, 0.0, 22.4},
             /* Ring (LL/LL128/Simple)*/ {9.5, 0.0, 19.8},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 12.5},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 0.0},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.20, 0.00, 1.75},
             /* Ring (LL/LL128/Simple)*/ {0.20, 0.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.20, 0.00, 0.96},
             /* Ring (LL/LL128/Simple)*/ {0.20, 0.00, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            { // LL
                0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.2, 1.0, 0.9, 1.0, 0.6, 0.4, 0.6, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
            },
            { // LL128 (all zero in this set)
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            },
            // Disabled alternative for the Simple row below; it differs from the
            // active row only in entries 19..26 (0-based).
            //    { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.8, 0.9, 0.7, 0.7, },
            { // Simple
                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4, 0.4, 0.3, 0.3, 0.3, 0.4, 0.3, 0.3,
            },
        },

    .ringCorrectionFactor =
        {
            { // LL
                0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.2, 0.1, 0.4, 0.4, 0.2, 0.2, 0.3, 0.7, 0.5, 0.4, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
            },
            { // LL128 (all zero in this set)
                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
            },
            { // Simple
                0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            },
        },
};

// Tuning coefficient set 4 (fifth entry of rcclTuningModel).
// hwLat is laid out as [link type][algorithm][protocol] and bwRatio as
// [node-count class][algorithm][protocol]; each innermost row lists the
// protocols in the order LL, LL128, Simple. The correction-factor tables hold
// one 27-entry row per protocol, in the same protocol order.
static struct tuningModel tuning_model_4{
    .hwLat =
        {
            /* NVLINK */
            {/* Tree (LL/LL128/Simple)*/ {0.8, 1.4, 2.5},
             /* Ring (LL/LL128/Simple)*/ {0.8, 2.2, 3.6},
             /* CollNetDirect (Simple)*/ {0.8, 1.4, 2.5},
             /* CollNetChain (Simple)*/ {0.8, 1.4, 2.5},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* PCI */
            {/* Tree (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* Ring (LL/LL128/Simple)*/ {2.2, 2.2, 5.7},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 5.7},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 5.7},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* NET */
            {/* Tree (LL/LL128/Simple)*/ {32.2, 34.4, 47.6},
             /* Ring (LL/LL128/Simple)*/ {35.4, 87.8, 209.2},
             /* CollNetDirect (Simple)*/ {0.0, 0.0, 47.6},
             /* CollNetChain (Simple)*/ {0.0, 0.0, 47.6},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .bwRatio =
        {
            /* 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.16, 1.09, 1.61},
             /* Ring (LL/LL128/Simple)*/ {0.15, 0.41, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
            /* more than 2 nodes */
            {/* Tree (LL/LL128/Simple)*/ {0.16, 1.09, 1.08},
             /* Ring (LL/LL128/Simple)*/ {0.15, 0.41, 1.00},
             /* CollNetDirect (Simple)*/ {0.00, 0.00, 1.00},
             /* CollNetChain (Simple)*/ {0.00, 0.00, 1.00},
             /* NVLS */ {0, 0, 0},
             /* NVLS Tree */ {0, 0, 0}},
        },

    .treeCorrectionFactor =
        {
            { // LL
                0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1, 0.1, 0.2, 0.4, 0.6, 0.5, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
            },
            { // LL128
                0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.2, 1.0, 0.5, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2,
            },
            // Disabled alternative for the Simple row below; it differs from the
            // active row only in entries 19..26 (0-based).
            //    { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.5, 0.6, 0.6, 0.5, 0.6, 0.6, 0.6, 0.7, },
            // The Simple row has to be spelled out: when it is omitted the
            // aggregate initializer zero-fills treeCorrectionFactor for the
            // Simple protocol, unlike the Simple row of tuning models 0-3.
            { // Simple
                1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.4, 0.3, 0.3, 0.1, 0.1, 1.0, 1.0, 0.7, 0.5, 0.6, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3,
            },
        },

    .ringCorrectionFactor =
        {
            { // LL
                0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
            },
            { // LL128
                0.4, 0.5, 0.5, 0.4, 0.4, 0.4, 0.4, 0.2, 0.2, 0.1, 0.3, 1.0, 1.0, 0.7, 0.8, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.5, 0.4, 0.3, 0.3,
            },
            { // Simple
                0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.5, 0.1, 0.7, 0.2, 0.4, 0.4, 0.6, 0.7, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
            },
        },
};

// Table of tuning coefficient sets, each element initialized from the
// corresponding tuning_model_N object. scclTopoTuneModel indexes it with
// comm->topo->tuning.
// NOTE(review): no range check of that index against the five entries is
// visible in this part of the file - confirm it is validated where it is set.
static struct tuningModel rcclTuningModel[] = {
    tuning_model_0,
    tuning_model_1,
    tuning_model_2,
    tuning_model_3,
    tuning_model_4,
};

/* Array indexes used below */
// Row selectors for the bandwidth tables. scclTopoTuneModel picks HOPPER for
// minCompCap >= 90, AMPERE for minCompCap >= 80 and VOLTA otherwise.
#define VOLTA_COMPCAP_IDX 0
#define AMPERE_COMPCAP_IDX 1
#define HOPPER_COMPCAP_IDX 2

// In all four tables the column is chosen by node count in scclTopoTuneModel:
// column 0 for one node, 1 for two nodes, 2 for more than two nodes.
// NOTE(review): the values are loaded into locals in scclTopoTuneModel, but no
// use of those locals appears in the portion of the function visible here.

// LL128 max BW per channel
// Row selection for llMaxBws differs from the other tables: with one node the
// compute-capability index is used; with several nodes the row is 1 when the
// CPU vendor is AMD and 0 otherwise.
static const double llMaxBws[3][3] = {
    /* Volta-N1/Intel-N2/Intel-N4 */ {39.0, 39.0, 20.4},
    /* Ampere-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0},
    /* Hopper-N1/AMD-N2/AMD-N4 */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}};

// Rows of the three tables below are selected by the compute-capability index.
static const double perChMaxRingLL128Bws[3][3] = {
    /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
    /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
    /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7},
};
static const double perChMaxTreeLL128Bws[3][3] = {
    /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0},
    /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0},
    /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0},
};
static const double perChMaxTreeBws[3][3] = {
    /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0},
    /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8},
    /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0},
};

// Network post overhead in ns (1000 = 1 us)
SCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);

/**
 * Return the per-step network overhead applied to inter-node ring latencies.
 *
 * When the NetOverhead parameter differs from its -2 default, the parameter
 * value scaled by 0.001 is returned. Otherwise the value depends on the CPU
 * reported for comm->topo: 2.0 for an x86 AMD CPU, 1.0 for everything else.
 *
 * NOTE(review): SCCLCHECK is used inside a float-returning function; if the
 * macro returns the error code on failure, that code would be handed back as
 * the overhead value - confirm against the macro definition.
 */
static float getNetOverhead(struct scclComm* comm) {
    // An explicit parameter value takes precedence over CPU detection.
    if(scclParamNetOverhead() != -2) {
        return scclParamNetOverhead() * .001;
    }

    int arch, vendor, model;
    SCCLCHECK(scclTopoCpuType(comm->topo, &arch, &vendor, &model));

    // Only x86 AMD hosts get the higher overhead; x86 Intel and all other
    // CPUs share the same value.
    const int isAmdX86 = (arch == SCCL_TOPO_CPU_ARCH_X86) && (vendor == SCCL_TOPO_CPU_VENDOR_AMD);
    if(isAmdX86) {
        return 2.0;
    }
    return 1.0;
}

scclResult_t scclTopoTuneModel(struct scclComm* comm, int minCompCap, int maxCompCap, struct scclTopoGraph** graphs) {
    int simpleDefaultThreads = (graphs[SCCL_ALGO_RING]->bwIntra * graphs[SCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : SCCL_SIMPLE_MAX_NTHREADS;
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE] =
        getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] =
        getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL] =
        comm->maxThreads[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_LL] =
            getNthreads("SCCL_NTHREADS", scclParamNthreads(), 4 * comm->WarpSize, SCCL_MAX_NTHREADS, SCCL_MAX_NTHREADS, comm->WarpSize);
    comm->maxThreads[SCCL_ALGO_RING][SCCL_PROTO_LL128] = comm->maxThreads[SCCL_ALGO_TREE][SCCL_PROTO_LL128] =
        getNthreads("SCCL_LL128_NTHREADS", scclParamLl128Nthreads(), 4 * comm->WarpSize, SCCL_LL128_MAX_NTHREADS, SCCL_LL128_MAX_NTHREADS, comm->WarpSize);

    int nNodes = comm->nNodes;
    int nRanks = comm->nRanks;
    if(nRanks <= 1)
        return scclSuccess;

    int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX;
    int cpuArch, cpuVendor, cpuModel;
    SCCLCHECK(scclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
    int index2 = nNodes <= 2 ? nNodes - 1 : 2;
    // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
    int index1                 = nNodes == 1 ? compCapIndex : cpuVendor == SCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0;
    double llMaxBw             = llMaxBws[index1][index2];
    double perChMaxTreeBw      = perChMaxTreeBws[compCapIndex][index2];
    double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2];
    double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2];
    // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
    // if (cpuArch == SCCL_TOPO_CPU_ARCH_POWER) hwLat[SCCL_HW_PCI][SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE] = hwLat[SCCL_HW_PCI][SCCL_ALGO_RING][SCCL_PROTO_SIMPLE];
    float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount

    int intraHw[SCCL_NUM_ALGORITHMS], hw[SCCL_NUM_ALGORITHMS];
    for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
        intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? SCCL_HW_NVLINK : SCCL_HW_PCI;
    for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
        hw[a] = nNodes == 1 ? intraHw[a] : SCCL_HW_NET;

    for(int coll = 0; coll < SCCL_NUM_FUNCTIONS; coll++) {
        int nsteps      = coll == scclFuncAllReduce ? 2 * (nRanks - 1) : coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nRanks - 1 : nRanks;
        int nInterSteps = coll == scclFuncAllReduce                                    ? (nNodes > 1 ? 2 * nNodes : 0)
                          : coll == scclFuncReduceScatter || coll == scclFuncAllGather ? nNodes - 1
                                                                                       : nNodes;

        for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
            if(coll == scclFuncBroadcast && a != SCCL_ALGO_RING)
                continue;
            if(coll == scclFuncReduce && a != SCCL_ALGO_RING)
                continue;
            if(coll == scclFuncReduceScatter && a != SCCL_ALGO_RING)
                continue;
            if(coll == scclFuncAllGather && a != SCCL_ALGO_RING)
                continue;

            for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                if((a == SCCL_ALGO_NVLS || a == SCCL_ALGO_NVLS_TREE) && p != SCCL_PROTO_SIMPLE)
                    continue;
                int collnet = (a == SCCL_ALGO_COLLNET_DIRECT || a == SCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
                float bw    = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
                float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw;
                // INFO(SCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", scclAlgoStr[a], scclProtoStr[p], busBw,
                // comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter);

                // Various model refinements
                if(nNodes <= 2)
                    busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
                else
                    busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[1][a][p];

                if(a == SCCL_ALGO_COLLNET_DIRECT && p == SCCL_PROTO_SIMPLE && minCompCap >= 90)
                    busBw *= .85;

                // Convert bus BW to algorithm BW
                float ratio;
                if(a == SCCL_ALGO_RING)
                    ratio = (1.0 * nRanks) / nsteps;
                else if(a == SCCL_ALGO_NVLS)
                    ratio = 5.0 / 6.0;
                else if(a == SCCL_ALGO_NVLS_TREE)
                    ratio = .70 * nNodes / (2 * (nNodes - 1));
                else
                    ratio = .5;
                comm->bandwidths[coll][a][p] = busBw * ratio;

                comm->latencies[coll][a][p] = baseLat[a][p];
                float intraLat              = rcclTuningModel[comm->topo->tuning].hwLat[intraHw[a]][a][p];
                float interLat              = graphs[a]->latencyInter ? graphs[a]->latencyInter : rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
                // if (nNodes > 1 && p == SCCL_PROTO_LL) intraLat *= 1.8;
                if(p == SCCL_PROTO_SIMPLE)
                    interLat += graphs[a]->latencyInter;

                if(a == SCCL_ALGO_RING) {
                    float lat = rcclTuningModel[comm->topo->tuning].hwLat[hw[a]][a][p];
                    if((coll == scclFuncReduce || coll == scclFuncBroadcast)) {
                        if(graphs[a]->sameChannels) {
                            comm->latencies[coll][a][p] += lat;
                        } else {
                            if(p == SCCL_PROTO_SIMPLE)
                                lat = rcclTuningModel[comm->topo->tuning]
                                          .hwLat[hw[a]][SCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
                            comm->latencies[coll][a][p] += nsteps * lat;
                        }
                    } else {
                        // Inter-node rings still have to launch nsteps * net overhead.
                        float netOverhead = 0.0;
                        if(nNodes > 1) {
                            netOverhead = getNetOverhead(comm);
                            if(p == SCCL_PROTO_SIMPLE)
                                netOverhead *= 3;
                        }
                        intraLat = std::max(intraLat, netOverhead);
                        comm->latencies[coll][a][p] += (nsteps - nInterSteps) * intraLat + nInterSteps * interLat;
                    }
                } else if(a == SCCL_ALGO_TREE) {
                    comm->latencies[coll][a][p] += 2 * ((nRanks / nNodes - 1) * intraLat + log2i(nNodes) * interLat);
                } else if(a == SCCL_ALGO_COLLNET_DIRECT) {
                    comm->latencies[coll][a][p] +=
                        2 * (std::min(1, (nRanks / nNodes - 1)) * intraLat + (nRanks / nNodes - 1) * 0.5) + interLat; // Add 0.5 arity serialization latency
                } else if(a == SCCL_ALGO_COLLNET_CHAIN) {
                    comm->latencies[coll][a][p] += 2 * (nRanks / nNodes - 1) * intraLat + interLat;
                } else if(a == SCCL_ALGO_NVLS) {
                    if(nNodes > 1)
                        comm->latencies[coll][a][p] += rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
                } else if(a == SCCL_ALGO_NVLS_TREE) {
                    comm->latencies[coll][a][p] += 2 * (nNodes - 1) * rcclTuningModel[comm->topo->tuning].hwLat[SCCL_HW_NET][a][p];
                }
            }
        }
    }

    // Protocols/Algorithms enable/disable, and user overrides.
    // All are enabled except ll128 which is enabled by default only in certain cases.
    int protoEnable[SCCL_NUM_PROTOCOLS] = {1, 2, 1};
    int algoEnable[SCCL_NUM_ALGORITHMS] = {1, 1, 1, 1, 1, 1};

    const char* protoStr = getenv("SCCL_PROTO");
    if(protoStr) {
        INFO(SCCL_ENV, "SCCL_PROTO set by environment to %s", protoStr);
        SCCLCHECK(parseList(protoStr, scclProtoStr, SCCL_NUM_PROTOCOLS, protoEnable));
    }
    const char* algoStr = getenv("SCCL_ALGO");
    if(algoStr) {
        INFO(SCCL_ENV, "SCCL_ALGO set by environment to %s", algoStr);
        SCCLCHECK(parseList(algoStr, scclAlgoStr, SCCL_NUM_ALGORITHMS, algoEnable));
    }

    if(comm->nNodes == 1)
        algoEnable[SCCL_ALGO_NVLS_TREE] = 0;

    // Disable CollNet if it is not supported
    if(comm->collNetSupport == 0) {
        algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
        algoEnable[SCCL_ALGO_COLLNET_CHAIN]  = 0;
        if(comm->nNodes > 1)
            algoEnable[SCCL_ALGO_NVLS] = 0;
        // If user has hard set SCCL_ALGO=COLLNET, ignore it
        if(algoEnable[SCCL_ALGO_RING] == 0 && algoEnable[SCCL_ALGO_TREE] == 0 && algoEnable[SCCL_ALGO_NVLS] == 0 && algoEnable[SCCL_ALGO_NVLS_TREE] == 0) {
            algoEnable[SCCL_ALGO_RING] = algoEnable[SCCL_ALGO_TREE] = 1;
            if(comm->rank == 0)
                WARN("CollNet is not supported or fails to initialize, ignoring SCCL_ALGO=COLLNET");
        }
    } else {
        // Disable CollNet+Direct if not on an NVSwitch system
        int nvsCount = 0;
        SCCLCHECK(scclTopoGetNvsCount(comm->topo, &nvsCount));
        if(nvsCount == 0)
            algoEnable[SCCL_ALGO_COLLNET_DIRECT] = 0;
    }

    for(int c = 0; c < SCCL_NUM_FUNCTIONS; c++)
        for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++)
            for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                // Disable LL protocol on gfx11xx
                int pEnable = protoEnable[p];
                if(pEnable == 2 && p == SCCL_PROTO_LL128) {
#if defined(ENABLE_LL128)
                    // Enable LL128 by default only on gfx90a with available tuning table
                    pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
                                      (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx90a") && comm->topo->ll128Enabled)
                                  ? 1
                                  : 0;
#else
                    pEnable = 0;
#endif
                }
                if(pEnable == 0)
                    comm->bandwidths[c][a][p] = 0;
                // Never disable ring for non-allreduce operations. That allows to run real apps with SCCL_ALGO=TREE.
                if(a == SCCL_ALGO_RING && c != scclFuncAllReduce)
                    continue;
                if(algoEnable[a] == 0)
                    comm->bandwidths[c][a][p] = 0;
            }

    if(comm->rank == 0) {
        char line[1024];
        for(int block = 0; block < 2; block++) {
            sprintf(line, "  Algorithm   |");
            for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                sprintf(line + strlen(line), " %14s   %14s   %14s |", "", scclAlgoStr[a], "");
            }
            INFO(SCCL_TUNING, "%s", line);
            sprintf(line, "  Protocol    |");
            for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                    sprintf(line + strlen(line), " %14s |", scclProtoStr[p]);
                }
            }
            INFO(SCCL_TUNING, "%s", line);
            sprintf(line, " Max NThreads |");
            for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                    sprintf(line + strlen(line), " %14d |", comm->maxThreads[a][p]);
                }
            }
            INFO(SCCL_TUNING, "%s", line);
            for(int c = 0; c < SCCL_NUM_FUNCTIONS; c++) {
                sprintf(line, "%13s |", scclFuncStr[c]);
                for(int ba = 0; ba < SCCL_NUM_ALGORITHMS / 2; ba++) {
                    int a = block * SCCL_NUM_ALGORITHMS / 2 + ba;
                    for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                        sprintf(line + strlen(line), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
                    }
                }
                INFO(SCCL_TUNING, "%s", line);
            }
        }
    }

    // Set per-thread amount of work before we increase nThreads and nChannels
    for(int a = 0; a < SCCL_NUM_ALGORITHMS; a++) {
        comm->threadThresholds[a][SCCL_PROTO_LL]     = SCCL_LL_THREAD_THRESHOLD;
        comm->threadThresholds[a][SCCL_PROTO_LL128]  = SCCL_LL128_THREAD_THRESHOLD;
        comm->threadThresholds[a][SCCL_PROTO_SIMPLE] = SCCL_SIMPLE_THREAD_THRESHOLD;
    }
    comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL] *= nRanks;
    comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE] = 256;
    comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE]  = 256;

    // Override defaults with user env
    char* str = getenv("SCCL_THREAD_THRESHOLDS");
    if(str) {
        INFO(SCCL_ENV, "SCCL_THREAD_THRESHOLDS set by environment to %s", str);
        ssize_t t[2][SCCL_NUM_PROTOCOLS] = {{-2, -2, -2}, {-2, -2, -2}};
        sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0] + 1, t[0] + 2, t[1], t[1] + 1, t[1] + 2);
        for(int a = 0; a < 2; a++) {
            for(int p = 0; p < SCCL_NUM_PROTOCOLS; p++) {
                if(t[a][p] >= 0)
                    comm->threadThresholds[a][p] = t[a][p];
            }
        }
    }

    INFO(SCCL_INIT,
         "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL],
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_LL128],
         comm->threadThresholds[SCCL_ALGO_TREE][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_LL128],
         comm->threadThresholds[SCCL_ALGO_RING][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_COLLNET_DIRECT][SCCL_PROTO_SIMPLE],
         comm->threadThresholds[SCCL_ALGO_COLLNET_CHAIN][SCCL_PROTO_SIMPLE]);
    return scclSuccess;
}

// Estimate the time needed to run the collective described by `info` with the
// given algorithm/protocol pair.
//
//   info       - operation descriptor; reads info->comm (bandwidths, latencies,
//                topo->tuning, nNodes), info->coll and info->nBytes.
//   algorithm  - first index into the per-communicator tuning tables.
//   protocol   - second index into the per-communicator tuning tables.
//   numPipeOps - number of pipelined operations used to scale the latency term.
//   time       - output; receives -1.0 when the pair has zero bandwidth (i.e. it
//                was disabled), otherwise lat * latCount + nBytes / (1000 * bw).
//                NOTE(review): presumably microseconds, given the file's
//                "Latencies in us, Bandwidths in GB/s" convention - confirm.
//
// Always returns scclSuccess.
scclResult_t scclTopoGetAlgoTime(struct scclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) {
    float bandwidth = info->comm->bandwidths[info->coll][algorithm][protocol];
    float latency   = info->comm->latencies[info->coll][algorithm][protocol];

    // A zero bandwidth marks an unusable algorithm/protocol combination.
    if(bandwidth == 0) {
        *time = -1.0;
        return scclSuccess;
    }

    // Size bucket: log2 of the message size in 64-byte units. Buckets past 26
    // all share the last correction-factor entry.
    int sizeBucket = log2i(info->nBytes >> 6);
    if(sizeBucket >= 27) {
        sizeBucket = 26;
    }

    // Apply the per-size correction factor. Tree is always corrected; ring is
    // corrected only when the communicator spans more than one node.
    if(algorithm == SCCL_ALGO_TREE) {
        bandwidth *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][sizeBucket];
    } else if(algorithm == SCCL_ALGO_RING && info->comm->nNodes > 1) {
        bandwidth *= rcclTuningModel[info->comm->topo->tuning].ringCorrectionFactor[protocol][sizeBucket];
    }

    // Tree pipelining saves latency in aggregation cases: ring pays the latency
    // once per pipelined op, the other algorithms once per group of
    // SCCL_MAX_WORK_ELEMENTS ops (rounded up).
    int latCount;
    if(algorithm == SCCL_ALGO_RING) {
        latCount = numPipeOps;
    } else {
        latCount = DIVUP(numPipeOps, SCCL_MAX_WORK_ELEMENTS);
    }

    *time = latency * latCount + (info->nBytes) / (1000 * bandwidth);
    return scclSuccess;
}

} // namespace detect
} // namespace topology
} // namespace hardware
} // namespace sccl