Commit 441833d3 authored by Seth Howell
Browse files

third-party: Update tests to use upstream NVSHMEM



NVSHMEM 3.3 and above support the host-side features
in the patch.

Note: Removed recv queue support
Signed-off-by: Seth Howell <sethh@nvidia.com>
parent 898269fa
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include "utils.cuh" #include "utils.cuh"
#ifndef DISABLE_NVSHMEM #ifndef DISABLE_NVSHMEM
#include "nvshmem.h"
#include "ibgda_device.cuh" #include "ibgda_device.cuh"
#endif #endif
......
...@@ -29,26 +29,42 @@ if __name__ == '__main__': ...@@ -29,26 +29,42 @@ if __name__ == '__main__':
sources.extend(['csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu']) sources.extend(['csrc/kernels/internode.cu', 'csrc/kernels/internode_ll.cu'])
include_dirs.extend([f'{nvshmem_dir}/include']) include_dirs.extend([f'{nvshmem_dir}/include'])
library_dirs.extend([f'{nvshmem_dir}/lib']) library_dirs.extend([f'{nvshmem_dir}/lib'])
nvcc_dlink.extend(['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem']) nvcc_dlink.extend(['-dlink', f'-L{nvshmem_dir}/lib', '-lnvshmem_device'])
extra_link_args.extend(['-l:libnvshmem.a', '-l:nvshmem_bootstrap_uid.so', f'-Wl,-rpath,{nvshmem_dir}/lib']) extra_link_args.extend(['-l:libnvshmem_host.so', '-l:libnvshmem_device.a', f'-Wl,-rpath,{nvshmem_dir}/lib'])
if int(os.getenv('DISABLE_SM90_FEATURES', 0)): if int(os.getenv('DISABLE_SM90_FEATURES', 0)):
# Prefer A100 # Prefer A100
print("Not using SM_90")
os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '8.0') os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '8.0')
# Disable some SM90 features: FP8, launch methods, and TMA # Disable some SM90 features: FP8, launch methods, and TMA
cxx_flags.append('-DDISABLE_SM90_FEATURES') cxx_flags.append('-DDISABLE_SM90_FEATURES')
nvcc_flags.append('-DDISABLE_SM90_FEATURES') nvcc_flags.append('-DDISABLE_SM90_FEATURES')
# Add architecture flags to nvcc_dlink for the final linking step
if len(nvcc_dlink) > 0:
nvcc_dlink.extend([
'-gencode=arch=compute_80,code=sm_80',
'-gencode=arch=compute_80,code=compute_80'
])
# Disable internode and low-latency kernels # Disable internode and low-latency kernels
assert disable_nvshmem assert disable_nvshmem
else: else:
# Prefer H800 series # Prefer H800 series
os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '9.0') os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '9.0')
print("Using SM_90")
# CUDA 12 flags # CUDA 12 flags
nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10']) nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])
# Add architecture flags to nvcc_dlink for the final linking step
if len(nvcc_dlink) > 0:
nvcc_dlink.extend([
'-gencode=arch=compute_90,code=sm_90',
'-gencode=arch=compute_90,code=compute_90'
])
# Disable LD/ST tricks, as some CUDA version does not support `.L1::no_allocate` # Disable LD/ST tricks, as some CUDA version does not support `.L1::no_allocate`
if os.environ['TORCH_CUDA_ARCH_LIST'].strip() != '9.0': if os.environ['TORCH_CUDA_ARCH_LIST'].strip() != '9.0':
assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1 assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
......
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment