[build] fix compute capability arch flags, add PTX, handle PTX (#591)

* fix arch flags, add PTX
* bug fix

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>

[build] fix compute capability arch flags, add PTX, handle PTX (#591)
* fix arch flags, add PTX
* bug fix

Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
8a184b6b · Stas Bekman · GitHub · 0518252d · 8a184b6b
Unverified Commit 8a184b6b authored Dec 11, 2020 by Stas Bekman Committed by GitHub Dec 11, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 3 deletions

op_builder/builder.py op_builder/builder.py +6 -3

No files found.
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -221,7 +221,7 @@ class CUDAOpBuilder(OpBuilder):

        1. `TORCH_CUDA_ARCH_LIST` takes priority over `cross_compile_archs`.
        2. If neither is set default compute capabilities will be used
-        3. Under `jit_mode` compute capabilities of all visible cards will be used.
+        3. Under `jit_mode` compute capabilities of all visible cards will be used plus PTX

        Format:

@@ -243,6 +243,7 @@ class CUDAOpBuilder(OpBuilder):
                if cc not in ccs:
                    ccs.append(cc)
            ccs = sorted(ccs)
+            ccs[-1] += '+PTX'
        else:
            # Cross-compile mode, compile for various architectures
            # env override takes priority
@@ -260,8 +261,10 @@ class CUDAOpBuilder(OpBuilder):

        args = []
        for cc in ccs:
-            cc = cc.replace('.', '')
-            args.append(f'-gencode=arch=compute_{cc},code=compute_{cc}')
+            num = cc[0] + cc[2]
+            args.append(f'-gencode=arch=compute_{num},code=sm_{num}')
+            if cc.endswith('+PTX'):
+                args.append(f'-gencode=arch=compute_{num},code=compute_{num}')

        return args