Stricter conditions for aggressive PTX instructions

004d6f9b · Chenggang Zhao · 7de7464e · 004d6f9b · 004d6f9b · 004d6f9b
Commit 004d6f9b authored Jun 27, 2025 by Chenggang Zhao
Show whitespace changes
Inline Side-by-side

Showing 3 changed files with 7 additions and 5 deletions

README.md README.md +1 -0

csrc/kernels/utils.cuh csrc/kernels/utils.cuh +1 -1

setup.py setup.py +5 -4

No files found.
--- a/README.md
+++ b/README.md
@@ -303,6 +303,7 @@ For two-micro-batch overlapping, you can refer to the following figure. With our
  - [ ] Internode kernels
  - [ ] Low-latency kernels
 - [ ] SM-free kernels and refactors
+- [ ] Fully remove undefined-behavior PTX instructions
 ## Notices

--- a/csrc/kernels/utils.cuh
+++ b/csrc/kernels/utils.cuh
@@ -145,7 +145,7 @@ __device__  __forceinline__ int64_t ld_volatile_global(const uint64_t *ptr) {
 #ifndef DISABLE_AGGRESSIVE_PTX_INSTRS
 #define LD_NC_FUNC "ld.global.nc.L1::no_allocate.L2::256B"
 #else
-#define LD_NC_FUNC "ld.volatile.global"
+#define LD_NC_FUNC "ld.volatile.global.L2::256B"
 #endif
 // `ld.global.nc.L1::no_allocate` will be translated into `LDG.E.NA.[width].CONSTANT` in SASS

--- a/setup.py
+++ b/setup.py
@@ -42,10 +42,6 @@ if __name__ == '__main__':
        # Disable internode and low-latency kernels
        assert disable_nvshmem
-        # Disable LD/ST tricks, as some CUDA version does not support `.L1::no_allocate`
-        assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
-        os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'
    else:
        # Prefer H800 series
        os.environ['TORCH_CUDA_ARCH_LIST'] = os.getenv('TORCH_CUDA_ARCH_LIST', '9.0')
@@ -53,6 +49,11 @@ if __name__ == '__main__':
        # CUDA 12 flags
        nvcc_flags.extend(['-rdc=true', '--ptxas-options=--register-usage-level=10'])
+    # Disable LD/ST tricks, as some CUDA version does not support `.L1::no_allocate`
+    if os.environ['TORCH_CUDA_ARCH_LIST'].strip() != '9.0':
+        assert int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', 1)) == 1
+        os.environ['DISABLE_AGGRESSIVE_PTX_INSTRS'] = '1'
    # Disable aggressive PTX instructions
    if int(os.getenv('DISABLE_AGGRESSIVE_PTX_INSTRS', '0')):
        cxx_flags.append('-DDISABLE_AGGRESSIVE_PTX_INSTRS')