Commit c0dad530 authored by wangkaixiong

init
#!/bin/bash
mpirun -H master:8,node1:8 --prefix /opt/mpi -np 16 --allow-run-as-root \
--mca plm_rsh_args "-p 2222" \
-x NCCL_DEBUG=WARN \
-x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-x HSA_FORCE_FINE_GRAIN_PCIE=1 \
-x NCCL_MAX_NCHANNELS=20 \
-x NCCL_MIN_NCHANNELS=20 \
-x NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10 \
./build/all_reduce_perf -b 7618 -e 1G -f 2 -g 1 -d half
#-x NCCL_IB_QP_PER_CONNECTION=4 \
#-x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 \
# -x NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10 \
# --mca plm rsh \
# --mca plm_rsh_agent ssh \
# -x NCCL_GRAPH_FILE=./topo.xml \
# -x NCCL_ALGO=tree,ring \
# --mca plm_rsh_args "-2 -o StrictHostKeyChecking=no" \ # communicate over SSH
# -x NCCL_DEBUG=INFO # emit debug logs to help locate problems
# -x NCCL_IB_GID_INDEX=3 # if the IB fabric spans multiple subnets, specify the GID index
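# For reference, a minimal single-node sketch of the same benchmark (GPU count,
# binary path and message sizes are assumptions to adapt to the local setup):
#   mpirun -np 1 --allow-run-as-root ./build/all_reduce_perf -b 8 -e 1G -f 2 -g 8 -d half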
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
def pytest_addoption(parser):
parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile")
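# Usage sketch (the test module names below are assumptions about this repository's layout):
#   pytest -v test_allreduce.py --hostfile=./hosts
# When --hostfile is omitted, the MPI-based tests fall back to a plain local mpirun.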
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_gather_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("AllGather test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"--hostfile", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]  # memory_type is not parameterized for the MPI tests, so no -y flag here
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("AllGather test error(s) detected.")
assert rccl_test.returncode == 0
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("AllReduce test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"--hostfile", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]  # memory_type is not parameterized for the MPI tests, so no -y flag here
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("AllReduce test error(s) detected.")
assert rccl_test.returncode == 0
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/broadcast_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("Broadcast test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"--hostfile", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]  # memory_type is not parameterized for the MPI tests, so no -y flag here
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("Broadcast test error(s) detected.")
assert rccl_test.returncode == 0
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("Reduce test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"--hostfile", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]  # memory_type is not parameterized for the MPI tests, so no -y flag here
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("Reduce test error(s) detected.")
assert rccl_test.returncode == 0
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_scatter_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("ReduceScatter test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"--hostfile", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]  # memory_type is not parameterized for the MPI tests, so no -y flag here
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("ReduceScatter test error(s) detected.")
assert rccl_test.returncode == 0
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
#include ../../makefiles/common.mk
.PHONY: all clean
BUILDDIR := $(abspath ../../build)
DST_DIR := $(BUILDDIR)/test/verifiable
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/openmpi
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
NCCL_HOME ?= ""
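# Usage sketch (variables as defined above; the paths are assumptions for a typical install):
#   make MPI=1 MPI_HOME=/usr/lib/openmpi    # build with Open MPI support
#   make DEBUG=1 VERBOSE=1                  # unoptimized, verbose build with debug symbols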
HIPCC = $(ROCM_PATH)/bin/hipcc
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
ifeq ($(DEBUG), 0)
HIPCUFLAGS += -O3
else
HIPCUFLAGS += -O0 -g -ggdb3
endif
ifeq ($(VERBOSE), 0)
.SILENT:
endif
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
endif
LIBRARIES += rccl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
clean:
rm -rf $(DST_DIR)
TEST_VERIFIABLE_SRCDIR := .
TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
include verifiable.mk
self_test: $(DST_DIR)/self_test
$(DST_DIR)/self_test: verifiable.cu verifiable.h
@printf "Linking %s\n" $@
@mkdir -p $(DST_DIR)
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Generate parameters for our error bound model of floating point average
* (sum of scaled values) by sampling sums of random sequences for each
* floating point type.
*
* The model has parameters "coef" and "power", where for two floats a & b,
* they are close enough if and only if:
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
*
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
*
* Compile with hipcc for this HIP port (the original CUDA version used:
*   nvcc -gencode=arch=compute_80,code=sm_80).
*/
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdint>
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
using std::uint64_t;
using std::uint32_t;
using bfloat16 = hip_bfloat16;
template<typename T>
struct float_traits;
template<>
struct float_traits<float> {
static constexpr int mantissa_bits = 23;
static constexpr int exponent_bits = 8;
using uint_t = uint32_t;
__device__ static float make(double x) { return (float)x; }
__device__ static float make(uint64_t x) { return (float)x; }
__device__ static double todouble(float x) { return x; }
__device__ static float add(float a, float b) { return a+b; }
__device__ static float mul(float a, float b) { return a*b; }
};
template<>
struct float_traits<double> {
static constexpr int mantissa_bits = 52;
static constexpr int exponent_bits = 11;
using uint_t = uint64_t;
__device__ static double make(double x) { return x; }
__device__ static double make(uint64_t x) { return (double)x; }
__device__ static double todouble(double x) { return x; }
__device__ static double add(double a, double b) { return a+b; }
__device__ static double mul(double a, double b) { return a*b; }
};
template<>
struct float_traits<__half> {
static constexpr int mantissa_bits = 10;
static constexpr int exponent_bits = 5;
using uint_t = uint16_t;
__device__ static __half make(double x) { return __float2half((float)x); }
__device__ static __half make(uint64_t x) { return __int2half_rn(x); }
__device__ static double todouble(__half x) { return __half2float(x); }
__device__ static __half add(__half a, __half b) { return __hadd(a, b); }
__device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
};
template<>
struct float_traits<bfloat16> {
static constexpr int mantissa_bits = 7;
static constexpr int exponent_bits = 8;
using uint_t = uint16_t;
__device__ static bfloat16 make(double x) { return bfloat16(x); }
__device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
__device__ static double todouble(bfloat16 x) { return double(x); }
__device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
__device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
};
template<typename F>
__device__ int compare(F a, F b) {
union { typename float_traits<F>::uint_t ua; F fa; };
union { typename float_traits<F>::uint_t ub; F fb; };
ua=0; ub=0;
fa=a; fb=b;
//std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
return ua < ub ? ub-ua : ua-ub;
}
struct xoshiro256ss {
uint64_t s[4];
__device__ xoshiro256ss(int seed) {
constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
for(int i=0; i < 4; i++)
s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
}
__device__ uint64_t operator()() {
auto rol64 = [](uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
};
uint64_t const result = rol64(s[1] * 5, 7) * 9;
uint64_t const t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rol64(s[3], 45);
return result;
}
};
static __device__ int __reduce_max_sync(unsigned int mask, int value)
{
//We ignore the mask, since all bits are set whenever this is called in the
//test code below.
int width = warpSize;
for (unsigned int i = warpSize; i; i >>= 1) {
value = max(__shfl_down(value, i, width), value);
}
return value;
}
template<typename F>
__global__ void kernel() {
using traits = float_traits<F>;
constexpr int samps = 4<<10;
__shared__ F accf[samps];
__shared__ double accd[samps];
xoshiro256ss rng(threadIdx.x);
float expo_avg = 1;
for(int pass=0; pass < 2; pass++) {
F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
int err_max = 0;
float coef = 0;
double expo_sum = 0;
int expo_n = 0;
int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
//for(int round=0; round < 2; round++) {
for(int i=threadIdx.x; i < samps; i += blockDim.x) {
accf[i] = (F)0;
accd[i] = 0;
}
__syncthreads();
for(int r=0; r < max_ranks; r++) {
int err = 0;
for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
F f = traits::make(d);
accf[i] = traits::add(accf[i], traits::mul(scalar, f));
accd[i] += traits::todouble(f);
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
err = err > e ? err : e;
}
err = __reduce_max_sync(-1u, err);
err_max = err_max > err ? err_max : err;
if (r >= 2) {
// err = 1 + coef*pow(r,expo)
float c = float(err-1)/powf(float(r), expo_avg);
coef = coef > c ? coef : c;
}
if (r >= 2) {
double expo = log2f(1+err_max)/log2f(r);
expo_sum += expo;
expo_n++;
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
}
}
}
if(pass==0)
expo_avg = expo_sum/expo_n;
else if(threadIdx.x == 0)
printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
}
}
int main() {
std::printf("type=float:\n");
kernel<float><<<1,32>>>();
hipDeviceSynchronize();
std::printf("\ntype=half:\n");
kernel<half><<<1,32>>>();
hipDeviceSynchronize();
std::printf("\ntype=bfloat16:\n");
kernel<bfloat16><<<1,32>>>();
hipDeviceSynchronize();
return 0;
}
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
//#pragma nv_diag_suppress declared_but_not_referenced
#include "verifiable.h"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#include "rccl/rccl.h"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
#define HAVE_ncclBfloat16 1
#else
#define HAVE_ncclBfloat16 0
#endif
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
#define HAVE_ncclAvg 1
#else
#define HAVE_ncclAvg 0
#endif
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
#define HAVE_ncclPreMulSum 1
#else
#define HAVE_ncclPreMulSum 0
#endif
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdint>
#include <cmath>
#include <unistd.h>
using std::size_t;
using std::int8_t;
using std::int16_t;
using std::int32_t;
using std::int64_t;
using std::uint8_t;
using std::uint16_t;
using std::uint32_t;
using std::uint64_t;
////////////////////////////////////////////////////////////////////////////////
namespace {
template<typename T>
__device__ unsigned long long bitsOf(T x) {
union { unsigned long long ull; T val; } u;
u.ull = 0;
u.val = x;
return u.ull;
}
__host__ __device__ uint64_t mixBits(uint64_t x) {
union { uint32_t u32[2]; uint64_t u64; };
u64 = x;
u32[1] += 1;
u32[0] ^= u32[1];
u64 *= 0x9e3779b97f4a7c13u;
u32[0] ^= u32[1]<<16 ^ u32[1]>>16;
return u64;
}
__host__ __device__ uint64_t hashOf(uint64_t a, uint64_t b=0) {
a += uint64_t(1)<<32;
a += b;
a ^= a>>32;
a *= 0x9e3779b97f4a7c13u;
a += b>>16 ^ b<<48;
a ^= a>>32;
a *= 0xc4ceb9fe1a85ec53u;
return a;
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
template<typename T>
struct IsIntegral: std::is_integral<T> {};
template<>
struct IsIntegral<__half>: std::false_type {};
#if RCCL_BFLOAT16 == 1
template<>
struct IsIntegral<hip_bfloat16>: std::false_type {};
#endif
}
////////////////////////////////////////////////////////////////////////////////
// Hide a value from arithmetic optimizations. Hopefully the compiler cannot detect
// that this is equivalent to the identity function.
template<typename T>
__host__ __device__ T inhibit(T x) {
union { uint64_t u64; T val; };
u64 = 0;
val = x;
u64 *= 0x0000000100000001u;
u64 *= 0xffffffff00000001u;
return val;
}
////////////////////////////////////////////////////////////////////////////////
namespace {
template<typename Y, typename X>
__host__ __device__ Y castTo(X x) {
return Y(x);
}
template<typename Y>
__host__ __device__ Y castTo(float x) {
return Y(x);
}
template<>
__host__ __device__ __half castTo<__half>(float x) {
return __float2half(x);
}
#if RCCL_BFLOAT16 == 1
template<>
__host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
return hip_bfloat16(x);
}
#endif
}
////////////////////////////////////////////////////////////////////////////////
// The reduction functions
namespace {
struct ReduceNil {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T>
__host__ __device__ T operator()(T a, T /*b*/) const { return a; }
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceSum {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()+T())>
__host__ __device__ T operator()(T a, T b) const { return a + b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __float2half(__half2float(a) + __half2float(b));
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceProd {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()*T())>
__host__ __device__ T operator()(T a, T b) const { return a * b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __float2half(__half2float(a) * __half2float(b));
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceMin {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()<T())>
__host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __half2float(a) < __half2float(b) ? a : b;
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return static_cast<float>(a) < static_cast<float>(b) ? a : b;
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceMax {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()>T())>
__host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __half2float(a) > __half2float(b) ? a : b;
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return static_cast<float>(a) > static_cast<float>(b) ? a : b;
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReducePreMulSum {
template<typename T>
__host__ __device__ T preOp(T x, int rank_me) const {
return ReduceProd()(x, ncclVerifiablePremulScalar<T>(rank_me));
}
template<typename T>
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
template<typename T, bool integral = IsIntegral<T>::value>
struct ReduceAvg_Base;
template<typename T>
struct ReduceAvg_Base<T, /*integral=*/true> {
int rank_n;
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
__host__ __device__ T postOp(T x) const { return x/rank_n; }
};
template<typename T>
struct ReduceAvg_Base<T, /*integral=*/false> {
int rank_n;
__host__ __device__ T preOp(T x, int /*rank_me*/) const {
using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
return ReduceProd()(inhibit(castTo<T>(T1(1)/T1(rank_n))), inhibit(x));
}
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceAvg {
int rank_n;
template<typename T>
__host__ __device__ T preOp(T x, int rank_me) const {
return ReduceAvg_Base<T>{rank_n}.preOp(x, rank_me);
}
template<typename T>
__host__ __device__ T operator()(T a, T b) const {
return ReduceAvg_Base<T>{rank_n}(a, b);
}
template<typename T>
__host__ __device__ T postOp(T x) const {
return ReduceAvg_Base<T>{rank_n}.postOp(x);
}
};
}
////////////////////////////////////////////////////////////////////////////////
namespace {
template<typename T>
struct FloatLayout;
template<>
struct FloatLayout<float> {
static constexpr int exponent_bits = 8, mantissa_bits = 23;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
template<>
struct FloatLayout<double> {
static constexpr int exponent_bits = 11, mantissa_bits = 52;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
template<>
struct FloatLayout<__half> {
static constexpr int exponent_bits = 5, mantissa_bits = 10;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
#if RCCL_BFLOAT16 == 1
template<>
struct FloatLayout<hip_bfloat16> {
static constexpr int exponent_bits = 8, mantissa_bits = 7;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
#endif
template<typename T>
__host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) {
union { T ans; uint64_t bits; };
bits = sign;
bits <<= FloatLayout<T>::exponent_bits;
bits |= exp;
bits <<= FloatLayout<T>::mantissa_bits;
bits |= mant;
return ans;
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// High bits of multiplication are useful for generating bounded random values
// from unbounded random values. For instance, given X a totally random 32-bit
// integer, `umul32hi(X,n)` will be totally random within [0,n).
__host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
#if HIP_VERSION > 50200000
return __umulhi(a, b);
#else
return uint64_t(a)*b >> 32;
#endif
}
__host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
#if HIP_VERSION > 50200000
return __umul64hi(a, b);
#else
return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
#endif
}
__host__ __device__ int clz32(int x) {
#if HIP_VERSION > 50200000
return __clz(x);
#else
return x==0 ? 32 : __builtin_clz(x);
#endif
}
__host__ __device__ int clz64(long long x) {
#if HIP_VERSION > 50200000
return __clzll(x);
#else
return x==0 ? 64 : __builtin_clzll(x);
#endif
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// Returns a wildly permuted rank index. Useful when we want exactly N random
// ranks to exhibit some behavior: we can just test whether
// `shuffleRank(rank_n, rank_me, rng) < N`. Note that rank_n > 0 must be true
// for well defined results. This mixes the bits of rng.
__host__ __device__ int shuffleRank(int rank_n, int rank_me, uint64_t &rng) {
uint32_t a = uint32_t(rng);
uint32_t b = uint32_t(rng>>32);
rng = mixBits(rng);
uint32_t r = rank_me;
// round down rank_n to largest pow2, then subtract 1
uint32_t n2 = (~uint32_t(0)>>1) >> clz32(rank_n);
// These are 1:1 functions modulo 2^n:
// f(x) = x*a + b : for odd a, any b
// f(x) = (x*x + x)/2
// So we apply both to the bottom n2+1 ranks, then rotate the top
// (rank_n-n2-1) to the bottom and apply both again.
if(r <= n2) {
// shuffle bottom n2+1 ranks
r = (r*(a|1) + b) & n2;
r = (r*r + r)/2 & n2;
// rotate top to bottom
r += rank_n - (n2+1);
}
else
r -= n2+1; // rotate top to bottom
if(r <= n2) {
// shuffle bottom n2+1 again
r = (r*(b|1) + a) & n2;
r = (r*r + r)/2 & n2;
}
return r;
}
}
namespace {
// Generate wild integers x and y such that if every rank submits its x into a
// summation the result will be y with y <= y_max. Ranks should be shuffled
// before calling.
template<typename Uint>
__host__ __device__ void genSumXY(
int rank_n, int rank_me, uint64_t &rng, Uint y_max, Uint &x, Uint &y,
bool avoid_y=false // if true then returned y will not equal given y
) {
static_assert(std::is_unsigned<Uint>::value, "Type must be unsigned integral.");
{ // Pick y as a random value in [y_max/2, y_max]
Uint d, y_min = (y_max+1)/2;
if(8*sizeof(Uint) > 32)
d = umul64hi(rng, y_max/2 + (avoid_y ? 0 : 1));
else
d = umul32hi(uint32_t(rng), y_max/2 + (avoid_y ? 0 : 1));
Uint y1 = (avoid_y ? y+1 : y_min) + d;
y = y1 - (avoid_y && (y1 < y_min || y_max < y1) ? y_max/2 : 0);
}
rng = mixBits(rng);
unsigned r = unsigned(rank_me);
unsigned rn = unsigned(rank_n);
// Partition our rn ranks into pn distinct subsets each of size rn/pn. If each
// rank submits 1+p (where p is the 0-based partition index) then the sum will be:
// (rn/pn) * pn*(pn+1)/2
// So set this equal to our desired sum y and solve for pn.
// (rn/pn) * pn*(pn+1)/2 = y
// rn*(pn+1)/2 = y
// pn = 2*(y/rn)-1
Uint pn = rn == 1 ? 1 : 2*(y/rn) - 1;
// In the case where rn is huge (compared to y) use only one partition meaning
// that all rn ranks will submit 1 (since p=0).
pn = pn == 0 ? 1 : pn;
// Can't have more partitions than ranks.
pn = rn < pn ? rn : pn;
// Compute sum of contribution from pn partitions where each submits p+1.
Uint p_sum;
if(y_max <= ~uint32_t(0)>>1) // compile time known
p_sum = Uint(uint32_t(pn)*uint32_t(pn+1)/2);
else
p_sum = Uint(uint64_t(pn)*uint64_t(pn+1)/2);
// Let s be the number of ranks per partition. This is either rn/pn as we
// intended, or y/p_sum if that's smaller to prevent overshooting our target y.
uint32_t s = y/p_sum < rn/pn ? y/p_sum : rn/pn;
x = (s != 0 && r/s < pn) ? 1 + r/s : 0; // First s*pn ranks contribute partition index +1.
x += r == rn-1 ? y - s*p_sum : 0; // Last rank contributes discrepancy.
}
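// Worked instance of the partitioning scheme above (illustrative only, not part
// of the algorithm): suppose rn=8 ranks and the chosen target is y=12. Then
// pn = 2*(12/8)-1 = 1, p_sum = 1, and s = min(12/1, 8/1) = 8, so ranks 0..6 each
// contribute x=1 while rank 7 contributes 1 + (y - s*p_sum) = 5, giving a total
// of 7*1 + 5 = 12 = y.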
}
namespace {
template<typename T>
__host__ __device__ T genInOutFloatSum(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index,
bool same_sign
) {
constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
uint64_t rng = hashOf(seed, index);
int y_sign = rng & 1;
int x_sign = y_sign;
int xy_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
rng = mixBits(rng);
rank_me = shuffleRank(rank_n, rank_me, rng);
// If we're using mixed signs then partition into evens and odds.
int subrank_n = same_sign ? rank_n : (rank_n+1)/2;
int subrank_me = same_sign ? rank_me : rank_me/2;
uintmant_t x0_mant, y0_mant;
genSumXY(subrank_n, subrank_me, rng, max_mant, x0_mant, y0_mant);
if (!same_sign && (rank_n+0)/2 != 0) {
uintmant_t x1_mant, y1_mant = y0_mant;
// Avoid generating y1_mant == y0_mant so we don't have to worry about
// signed zero as the result.
genSumXY((rank_n+0)/2, rank_me/2, rng, max_mant, x1_mant, y1_mant, /*avoid_y=*/true);
y_sign ^= y0_mant < y1_mant ? 1 : 0;
y0_mant = (y0_mant < y1_mant ? -1 : 1)*(y0_mant - y1_mant);
x_sign ^= rank_me%2;
x0_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
}
uintmant_t ans_mant = input_not_output ? x0_mant : y0_mant;
if(ans_mant == 0)
return T(0.0f);
else {
int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
int ans_sign = input_not_output ? x_sign : y_sign;
int ans_exp = xy_exp - shift;
ans_mant <<= shift;
return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
}
}
}
namespace {
template<typename T>
__host__ __device__ T genInOutFloatPreMulSum(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
uint64_t rng = hashOf(seed, index);
int y_sign = rng & 1;
int y_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
rng = mixBits(rng);
int subrank_me0 = shuffleRank((rank_n+1)/2, rank_me/2, rng);
int subrank_me1 = shuffleRank((rank_n+0)/2, rank_me/2, rng);
// when ncclVerifiablePremulScalar() = 1.0 (rank_me%2 == 0)
uintmant_t x0_mant, y0_mant;
genSumXY((rank_n+1)/2, subrank_me0, rng, max_mant>>1, x0_mant, y0_mant);
// when ncclVerifiablePremulScalar() = 2.0 (rank_me%2 == 1)
uintmant_t x1_mant=0, y1_mant=0;
if((rank_n+0)/2 != 0)
genSumXY((rank_n+0)/2, subrank_me1, rng, max_mant>>2, x1_mant, y1_mant);
uintmant_t x_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
uintmant_t y_mant = y0_mant + 2*y1_mant;
uintmant_t ans_mant = input_not_output ? x_mant : y_mant;
if(ans_mant == 0)
return T(0.0f);
else {
int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
int ans_sign = y_sign;
int ans_exp = y_exp - shift;
ans_mant <<= shift;
return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
}
}
}
namespace {
template<typename T>
__host__ __device__ T genInOutFloatProd(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
// Three kinds of contributions (values for x):
// 1) x = random value: only one rank does this
// 2) x = 2^n: random positive n
// 3) x = 1
// Since only one rank submits a random value, the result of the product
// will have the same mantissa as that value but with an exponent incorporating
// the sum of the exponents from case (2)
uint64_t rng = hashOf(seed, index);
rank_me = shuffleRank(rank_n, rank_me, rng);
int y_sign = (rank_n/2)%2;
int x_sign = rank_me%2;
constexpr unsigned max_exp = -1 + (1<<(FloatLayout<T>::exponent_bits-1));
unsigned x_exp=0, y_exp=0;
genSumXY(rank_n, rank_me, rng, max_exp, x_exp, y_exp);
x_exp += FloatLayout<T>::exponent_bias;
y_exp += FloatLayout<T>::exponent_bias;
constexpr uint64_t mant_mask = (uint64_t(1)<<FloatLayout<T>::mantissa_bits)-1;
uint64_t y_mant = rng & mant_mask;
if (y_mant == 0) y_mant = 1;
return makeFloat<T>(
input_not_output ? x_sign : y_sign,
input_not_output ? x_exp : y_exp,
!input_not_output || rank_me==0 ? y_mant : 0
);
}
}
////////////////////////////////////////////////////////////////////////////////
// What follows is lots of overloads for genInput/genOutput to generate data
namespace {
// General case for integral data for all ops but ReduceNil/premulsum
template<typename T, typename ReduceFn,
typename = typename std::enable_if<
!std::is_same<ReduceFn, ReduceNil>::value
>::type>
__host__ __device__ void genInput(
T &ans, ReduceFn, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::true_type /*integral*/
) {
(void)rank_n; // silence unused warnings
union { uint64_t bits; T tmp; };
bits = uint64_t(-1)>>(64 - 8*sizeof(T));
bits &= hashOf(index ^ index<<16 ^ rank_me, seed);
// make sure we never return 0 in products
ans = std::is_same<ReduceFn, ReduceProd>::value && bits == 0 ? T(1) : tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Dumb/generic case for genOutput just reduces results of genInput
namespace {
template<typename T, typename ReduceFn, bool IsIntegral>
__host__ __device__ void genOutput(
T &ans, ReduceFn op, int rank_n, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
T acc = genInput<T>(op, rank_n, 0, seed, index);
acc = op.preOp(acc, 0);
for(int r=1; r < rank_n; r++)
acc = op(acc, op.preOp(genInput<T>(op, rank_n, r, seed, index), r));
ans = op.postOp(acc);
}
}
////////////////////////////////////////////////////////////////////////////////
// Nil reduction (byte copy functions). Optimized to assume rank_n=1
namespace {
template<typename T, bool IsIntegral>
__host__ __device__ void genInput(
T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
(void)rank_n, (void)rank_me; // silence unused warnings
union { uint64_t bits; T tmp; };
bits = mixBits(seed ^ index);
bits >>= 64 - 8*sizeof(T);
bits &= uint64_t(-1)>>(64 - 8*sizeof(T));
ans = tmp;
}
template<typename T, typename ReduceFn, bool IsIntegral>
__host__ __device__ void genOutput(
T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
ans = genInput<T>(op, rank_n, 0, seed, index);
}
}
////////////////////////////////////////////////////////////////////////////////
// Sum of float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/false);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceSum, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/false);
}
}
////////////////////////////////////////////////////////////////////////////////
// Product of float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceProd, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatProd<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceProd, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatProd<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
}
}
////////////////////////////////////////////////////////////////////////////////
// PreMulSum of int/float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::true_type integral
) {
genInput(ans, ReduceSum(), rank_n, rank_me, seed, index, integral);
}
// No genOutput overload specific to premulsum(int), just use generic case.
template<typename T>
__host__ __device__ void genInput(
T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReducePreMulSum, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
}
}
/////////////////////////////////////////////////////////////////////////////////
// Average of float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
//ans = ReduceProd()(ans, T1(1)/T1(rank_n));
ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
}
}
/////////////////////////////////////////////////////////////////////////////////
// min/max of float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceMin, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type integral
) {
genInput<T>(ans, ReduceMax(), rank_n, rank_me, seed, index, integral);
}
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceMax, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
(void)rank_n; // silence unused warnings
constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
int sign = rng & 1;
rng ^= rng>>1;
int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1);
exp += 1<<(FloatLayout<T>::exponent_bits-2);
rng ^= rng >> FloatLayout<T>::exponent_bits;
uint64_t mant = rng & mant_mask;
ans = makeFloat<T>(sign, exp, mant);
}
// No genOutput overload specific to floating point min/max, just use generic case.
}
///////////////////////////////////////////////////////////////////////////////
// Entry API for genInput/genOutput
namespace {
template<typename T, typename ReduceFn>
__host__ __device__ T genInput(
ReduceFn op, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
T ans;
genInput(ans, op, rank_n, rank_me, seed, index,
std::integral_constant<bool, IsIntegral<T>::value>());
return ans;
}
template<typename T, typename ReduceFn>
__host__ __device__ T genOutput(
ReduceFn op, int rank_n, uint64_t seed, intptr_t index
) {
T ans;
genOutput(ans, op, rank_n, seed, index,
std::integral_constant<bool, IsIntegral<T>::value>());
return ans;
}
}
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
template<typename T, typename ReduceFn>
__global__ void prepareInput2(
T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
while(i < i1) {
elts[i] = genInput<T>(op, rank_n, rank_me, seed, elt_ix0+i);
#if 0
T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
#endif
i += blockDim.x;
}
}
template<typename ReduceOp>
void prepareInput1(
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
#define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
switch(elt_ty) {
case ncclInt8: CASE_TY(int8_t)
case ncclUint8: CASE_TY(uint8_t)
case ncclInt32: CASE_TY(int32_t)
case ncclUint32: CASE_TY(uint32_t)
case ncclInt64: CASE_TY(int64_t)
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
#endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
default: assert(0);
}
#undef CASE_TY
}
}
void ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
#define CASE_OP(op) \
if(rank_n == 1) \
prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
else \
prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
break;
switch(red_op) {
case ncclSum: CASE_OP(ReduceSum())
case ncclMin: CASE_OP(ReduceMin())
case ncclMax: CASE_OP(ReduceMax())
case ncclProd: CASE_OP(ReduceProd())
#if HAVE_ncclAvg
case ncclAvg: CASE_OP(ReduceAvg{rank_n})
#endif
#if HAVE_ncclPreMulSum
default: CASE_OP(ReducePreMulSum())
#endif
}
#undef CASE_OP
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
template<typename T, typename ReduceFn>
__global__ void prepareExpected2(
T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
uint64_t seed, intptr_t elt_ix0
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
while(i < i1) {
elts[i] = genOutput<T>(op, rank_n, seed, elt_ix0+i);
#if 0
printf("prepareExpected2 seed=0x%llx ix=%lld x=%g elts=%p\n",
(long long)seed, (long long)(elt_ix0+i), (float)elts[i], elts);
#endif
i += blockDim.x;
}
}
template<typename ReduceOp>
void prepareExpected1(
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
#define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
switch(elt_ty) {
case ncclInt8: CASE_TY(int8_t)
case ncclUint8: CASE_TY(uint8_t)
case ncclInt32: CASE_TY(int32_t)
case ncclUint32: CASE_TY(uint32_t)
case ncclInt64: CASE_TY(int64_t)
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
#endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
default: assert(0);
}
#undef CASE_TY
}
}
void ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
#define CASE_OP(op) \
if(rank_n == 1) \
prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
else \
prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
break;
switch(red_op) {
case ncclSum: CASE_OP(ReduceSum())
case ncclMin: CASE_OP(ReduceMin())
case ncclMax: CASE_OP(ReduceMax())
case ncclProd: CASE_OP(ReduceProd())
#if HAVE_ncclAvg
case ncclAvg: CASE_OP(ReduceAvg{rank_n})
#endif
#if HAVE_ncclPreMulSum
default: CASE_OP(ReducePreMulSum())
#endif
}
#undef CASE_OP
}
#endif
////////////////////////////////////////////////////////////////////////////////
namespace {
/* How we compare floating point values when exactness is impossible is interesting.
* First, we take note that simply reinterpreting integer bits as floating point
* gives us a monotonic mapping which exponentially spaces out floats. Thus
* consecutive integers encode consecutive floats. In general, using integer
 * subtraction on the bit patterns of two floats gives us an integer which is the
 * logarithm of their relative difference. But, if the floats always have similar
 * exponents, then the integer difference is actually proportional to the
 * relative error (this is because we are counting hops in the mantissa bits only,
 * not the exponent bits). So a cheap way to check if two floats are relatively
 * close is: abs(intBits(a) - intBits(b)) < tolerance. The following formula
* calculates such a tolerance for a summation of n floats. This formula
* was derived by inspecting the maximum observed integer difference over many
* random runs of summation. The parameter values were computed by the
* companion program "inexact_regress.cu".
*/
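/* Worked example (numbers derived from the formula below): for ncclFloat32 and
 * rank_n = 16 the tolerance evaluates to 1 + unsigned(1.25f * 16^0.51) = 6,
 * i.e. a summed result may land up to 6 representable floats away from the
 * reference value before it is counted as bad. The check itself is simply
 *   calcDelta(got, expected) <= tolerance
 * using the bit-pattern distance computed by calcDelta() further down.
 */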
__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
float power, coef;
switch(elt_ty) {
case ncclFloat32:
case ncclFloat64:
power = .51f;
coef = 1.25f;
break;
case ncclFloat16:
power = .91f;
coef = .75f;
break;
#if HAVE_ncclBfloat16
case ncclBfloat16:
power = .91f;
coef = .66f;
break;
#endif
}
  #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
return 1 + unsigned(coef*powf(float(rank_n), power));
#else
return 1 + unsigned(coef*std::pow(float(rank_n), power));
#endif
}
template<typename T>
__host__ __device__ uint64_t calcDelta(T a, T b) {
union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y;
x.t = a;
y.t = b;
switch(sizeof(T)) {
case 1: return x.i1 < y.i1 ? y.i1 - x.i1 : x.i1 - y.i1;
case 2: return x.i2 < y.i2 ? y.i2 - x.i2 : x.i2 - y.i2;
case 4: return x.i4 < y.i4 ? y.i4 - x.i4 : x.i4 - y.i4;
default: return x.i8 < y.i8 ? y.i8 - x.i8 : x.i8 - y.i8;
}
}
}
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
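  // Compare results against a precomputed expected buffer. The caller passes both
  // buffers reinterpreted as unsigned integers, so the tolerance check is a plain
  // bit-pattern distance; each thread's failure count is accumulated into
  // *bad_elt_n with atomicAdd.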
template<typename T>
__global__ void verifyPrepared(
T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
int64_t bad = 0;
while(i < i1) {
T a = results[i], b = expected[i];
T delta = a < b ? b - a : a - b;
bad += tolerance < delta ? 1 : 0;
#if 0
if(tolerance < delta) {
printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]);
}
#endif
i += blockDim.x;
}
//asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
}
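  // Same check as verifyPrepared, but the expected value is regenerated on the
  // fly with genOutput instead of being read from a buffer.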
template<typename T, typename Uint, typename ReduceFn>
__global__ void verifyInline2(
T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
int64_t bad = 0;
while(i < i1) {
union { T t; Uint u; } a, b;
a.t = results[i];
b.t = genOutput<T>(op, rank_n, seed, elt_ix0+i);
Uint delta = a.u < b.u ? b.u - a.u : a.u - b.u;
bad += tolerance < delta ? 1 : 0;
#if 0
T input = genInput<T>(op, rank_n, 0, seed, elt_ix0+i);
if(tolerance < delta) {
printf("verifyInline2 fail T=%d ix=%lld got=%g exp=%g input=%g\n",
std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
} else {
printf("verifyInline2 pass T=%d ix=%lld got=%g exp=%g input=%g\n",
std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
}
#endif
i += blockDim.x;
}
//asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
atomicAdd((unsigned long*)bad_elt_n, (unsigned long)bad);
}
template<typename T, typename Uint>
void verifyInline1(
T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
) {
#define CASE_OP(op) \
if(rank_n == 1) \
verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
else \
verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
break;
switch(red_op) {
case ncclSum: CASE_OP(ReduceSum())
case ncclMin: CASE_OP(ReduceMin())
case ncclMax: CASE_OP(ReduceMax())
case ncclProd: CASE_OP(ReduceProd())
#if HAVE_ncclAvg
case ncclAvg: CASE_OP(ReduceAvg{rank_n})
#endif
#if HAVE_ncclPreMulSum
default: CASE_OP(ReducePreMulSum())
#endif
}
#undef CASE_OP
}
}
void ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, hipStream_t stream
) {
bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
#if HAVE_ncclBfloat16
floating |= elt_ty == ncclBfloat16;
#endif
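  // Inputs are crafted so that every reduction except a floating-point average
  // reproduces bit-exact results; only ncclAvg on float types gets a nonzero tolerance.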
unsigned tolerance = 0;
#if HAVE_ncclAvg
if (floating && red_op == ncclAvg)
tolerance = calcSumFloatTolerance(rank_n, elt_ty);
#endif
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
*bad_elt_n = 0;
#define CASE_TY(T, Uint) { \
if(expected != nullptr) { \
verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
} else { \
verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
} \
} break;
switch(elt_ty) {
case ncclInt8: CASE_TY(int8_t, uint8_t)
case ncclUint8: CASE_TY(uint8_t, uint8_t)
case ncclInt32: CASE_TY(int32_t, uint32_t)
case ncclUint32: CASE_TY(uint32_t, uint32_t)
case ncclInt64: CASE_TY(int64_t, uint64_t)
case ncclUint64: CASE_TY(uint64_t, uint64_t)
case ncclFloat16: CASE_TY(__half, uint16_t)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
#endif
case ncclFloat32: CASE_TY(float, uint32_t)
case ncclFloat64: CASE_TY(double, uint64_t)
default: assert(0);
}
#undef CASE_TY
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if SELF_TEST
#include <iostream>
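// Self-test: for each type, reduction op, and rank count, recompute the reduction
// on-device from the generated inputs and check that it matches genOutput within
// the same tolerance the verifier uses.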
template<typename T, typename Op>
__device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
//if(!std::is_same<T,half>::value) return;
//if(!std::is_same<Op,ReduceProd>::value) return;
//if(rank_n!=3) return;
unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
uint64_t seed = 0xc8e2bed69766d533;
for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
//if(ix!=387) continue;
T y = genOutput<T>(op, rank_n, seed, ix);
T sum;
for(int r=0; r < rank_n; r++) {
T x = genInput<T>(op, rank_n, r, seed, ix);
x = op.preOp(x, r);
sum = r==0 ? x : op(sum, inhibit(x));
//std::printf("x = %llx, sum = %llx\n", bitsOf(x), bitsOf(sum));
}
sum = op.postOp(sum);
if(tolerance < calcDelta(sum, y)) {
printf(
//"%10g != %10g : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
"%llx != %llx : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
*(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix
);
}
}
}
template<typename T>
__device__ void sweep1(int ty, char const *tyname) {
for(int i=0; i < 10; i++) {
int rank_n = (1<<i) + i;
sweep2<T>(ty, tyname, ReduceSum(), "sum", rank_n);
sweep2<T>(ty, tyname, ReduceProd(), "prod", rank_n);
sweep2<T>(ty, tyname, ReduceMin(), "min", rank_n);
sweep2<T>(ty, tyname, ReduceMax(), "max", rank_n);
sweep2<T>(ty, tyname, ReducePreMulSum(), "premulsum", rank_n);
sweep2<T>(ty, tyname, ReduceAvg{rank_n}, "avg", rank_n);
}
}
__global__ void sweep() {
sweep1<int8_t>(ncclInt8, "int8");
sweep1<uint8_t>(ncclUint8, "uint8");
sweep1<int32_t>(ncclInt32, "int32");
sweep1<uint32_t>(ncclUint32, "uint32");
sweep1<int64_t>(ncclInt64, "int64");
sweep1<uint64_t>(ncclUint64, "uint64");
sweep1<__half>(ncclFloat16, "half");
#if HAVE_ncclBfloat16
sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
#endif
sweep1<float>(ncclFloat32, "float");
sweep1<double>(ncclFloat64, "double");
}
int main(int arg_n, char **args) {
std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
hipSetDevice(0);
sweep<<<1,512>>>();
hipDeviceSynchronize();
return 0;
}
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef _d41d8cd98f00b204e9800998ecf8427e
#define _d41d8cd98f00b204e9800998ecf8427e
#include <hip/hip_runtime.h>
#include <stdint.h>
/* Routines for launching kernels that verify reduction results. A significant
* feature of these routines is they carefully craft floating point input
* to produce exactly predictable output.
*
* int elt_ty: actually just a ncclDataType_t
*
* int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically
* created, these are encoded as the value ncclNumOps and their scalar is
* assumed to be `ncclVerifiablePremulScalar(rank_me)`
*
* uint64_t seed: arbitrary 64-bits to use in seeding the random values
*
* intptr_t elt_ix0: index of first element pointed to by elts when generating
* random values. This makes it possible to generate subsequences independently
* as well as in aggregate.
*
* int rank_n: Number of contributions into the reduction. Non-reduction
* collectives like broadcast, gather, etc will always set this to one.
*
* int rank_me: Index of this contribution
*/
// Use this as the local scalar for PreMulSum ops
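// (even ranks contribute with scalar 1.0, odd ranks with 2.0)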
template<typename T>
__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
return T(rank_me%2 == 0 ? 1.0f : 2.0f);
}
// Enqueue kernel to generate data which is to be reduced.
void ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
);
// Enqueue kernel to generate expected results of reduction.
void ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
);
// Enqueue kernel to verify reduced data matches expectation. The number of
// failed elements is written to bad_elt_n, which must be in pinned host memory
// (e.g. allocated with hipHostMalloc) so the kernel can update it directly.
// If `expected == nullptr` then the expected results are generated on-the-fly
// which can be costly. Thus if you plan to run the same reduction multiple
// times it is advantageous to precompute the expected values with
// ncclVerifiablePrepareExpected and pass them as `expected` here.
void ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, hipStream_t stream
);
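/* Illustrative call sequence (a sketch only; sendbuf, recvbuf, expbuf, count,
 * nranks, myrank, seed, host_bad, and stream are placeholder names, not part of
 * this API):
 *
 *   ncclVerifiablePrepareInput(sendbuf, count, ncclFloat32, ncclSum,
 *                              nranks, myrank, seed, 0, stream);
 *   ncclVerifiablePrepareExpected(expbuf, count, ncclFloat32, ncclSum,
 *                                 nranks, seed, 0, stream);
 *   // ... run the collective under test, producing recvbuf ...
 *   ncclVerifiableVerify(recvbuf, expbuf, count, ncclFloat32, ncclSum,
 *                        nranks, seed, 0, host_bad, stream);
 *   hipStreamSynchronize(stream);   // *host_bad now holds the failure count
 */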
#endif
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
# We require both of the following paths to be set when including this makefile
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS)
@printf "Compiling %s\n" $@
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu