still working on windows CI problem

9f9d5b79 · yan.yan · ccb1f1db · 9f9d5b79 · 9f9d5b79 · 9f9d5b79
Commit 9f9d5b79 authored Oct 19, 2021 by yan.yan
5 changed files
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -15,7 +15,7 @@ jobs:
    runs-on: windows-latest
    strategy:
      matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] 
+        python-version: ['3.7', '3.8', '3.9', '3.10'] 
        cuda-version: ['10.2', '11.1', '11.4']
    steps:
      - uses: actions/checkout@master
@@ -64,7 +64,7 @@ jobs:
    runs-on: ubuntu-20.04
    strategy:
      matrix:
-        python-version: ['3.8'] # this version is only used for upload.
+        python-version: ['3.7', '3.8', '3.9', '3.10'] # this version is only used for upload.
        cuda-version: ['102', '111', '114']

    steps:
@@ -88,7 +88,7 @@ jobs:
          PLAT: manylinux2014_x86_64
        if: (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags')) && (env.CUDA_VERSION != '') ) || env.CUDA_VERSION == '114'
        run: |
-          docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -v `pwd`:/io $DOCKER_IMAGE bash -c "/io/tools/build-wheels.sh"
+          docker run --rm -e PLAT=$PLAT -e CUMM_CUDA_VERSION=${{ matrix.cuda-version }} -e SPCONV_PYTHON_LIST=${{env.PYTHON_VERSION}} -v `pwd`:/io $DOCKER_IMAGE bash -c "/io/tools/build-wheels.sh"

      - name: Publish a Python distribution to PyPI
        if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')

--- a/.gitignore
+++ b/.gitignore
@@ -109,3 +109,5 @@ venv.bak/
 .vscode

 __version__.py
+
+wheelhouse_tmp
\ No newline at end of file
--- a/spconv/build.py
+++ b/spconv/build.py
@@ -20,7 +20,8 @@ from pccm.utils import project_is_editable, project_is_installed
 from .constants import PACKAGE_NAME, PACKAGE_ROOT

 if project_is_installed(PACKAGE_NAME) and project_is_editable(PACKAGE_NAME):
-    from cumm.gemm.main import GemmMainUnitTest, SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
+    from spconv.core import SHUFFLE_SIMT_PARAMS, SHUFFLE_VOLTA_PARAMS, SHUFFLE_TURING_PARAMS
+    from cumm.gemm.main import GemmMainUnitTest
    from spconv.csrc.sparse.all import SpconvOps
    cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
    cu.namespace = "cumm.gemm.main"

--- a/spconv/core.py
+++ b/spconv/core.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from enum import Enum
-
+from cumm.gemm.main import gen_shuffle_params, GemmAlgoParams
+from cumm.gemm import kernel
+from typing import List
+from cumm.gemm.algospec.core import TensorOpParams
 class ConvAlgo(Enum):
    Native = "Native"
    MaskImplicitGemm = "MaskImplicitGemm"
@@ -25,3 +28,177 @@ class AlgoHint(Enum):
    BackwardInput = 0b010
    BackwardWeight = 0b100

+# we can't add more kernels here because build in github action is very slow.
+# TODO two step build: build gemm kernels first, then bind for every python
+
+SHUFFLE_SIMT_PARAMS: List[GemmAlgoParams] = [
+    *gen_shuffle_params(
+        (64, 128, 32), (32, 64, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.SimtDP4A, None),
+    *gen_shuffle_params(
+        (128, 64, 32), (64, 32, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.SimtDP4A, None),
+    *gen_shuffle_params(
+        (128, 128, 32),
+        (32, 64, 32), ["s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.SimtDP4A, None),
+    # *gen_shuffle_params(
+    #     (128, 128, 32),
+    #     (64, 32, 32), ["s8,s8,s8,s32,s32", "s8,s8,s32,s32,s32"], 2,
+    #     kernel.GemmAlgo.SimtDP4A, None),
+    *gen_shuffle_params(
+        (64, 64, 32), (32, 32, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.SimtDP4A, None),
+    *gen_shuffle_params(
+        (64, 256, 8),
+        (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (64, 256, 8),
+    #     (64, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 128, 16),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 512, 8),
+        (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (128, 128, 8),
+    #     (64, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (128, 128, 8),
+        (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 128, 8),
+        (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (64, 128, 8),
+    #     (64, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (128, 64, 8),
+    #     (32, 64, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (128, 64, 8),
+        (64, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 64, 8),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 64, 16),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 32, 16),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 32, 32),
+        (32, 32, 8), ["f32,f32,f32,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # fall back kernels if mat is misaligned for half
+    # *gen_shuffle_params(
+    #     (128, 128, 8),
+    #     (32, 64, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 64, 32),
+        (32, 32, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (32, 32, 32),
+        (32, 32, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    # *gen_shuffle_params(
+    #     (64, 64, 16),
+    #     (32, 32, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 128, 16),
+        (32, 64, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+    *gen_shuffle_params(
+        (64, 64, 8),
+        (32, 32, 8), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2, kernel.GemmAlgo.Simt, None),
+]
+
+SHUFFLE_VOLTA_PARAMS: List[GemmAlgoParams] = [
+    *gen_shuffle_params(
+        (64, 64, 32),
+        (32, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    # *gen_shuffle_params(
+    #     (128, 128, 32),
+    #     (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+    #     kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    *gen_shuffle_params(
+        (128, 256, 32),
+        (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    *gen_shuffle_params(
+        (256, 128, 32),
+        (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    *gen_shuffle_params(
+        (128, 64, 32),
+        (64, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+    *gen_shuffle_params(
+        (64, 128, 32),
+        (32, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Volta, TensorOpParams((8, 8, 4))),
+]
+# SHUFFLE_VOLTA_PARAMS = []
+SHUFFLE_TURING_PARAMS: List[GemmAlgoParams] = [
+    *gen_shuffle_params(
+        (64, 64, 32),
+        (32, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (128, 128, 32),
+        (32, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    # *gen_shuffle_params(
+    #     (128, 128, 32),
+    #     (64, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+    #     kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (64, 64, 64),
+        (32, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (64, 128, 64),
+        (32, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (128, 256, 32),
+        (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (256, 128, 32),
+        (64, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (128, 64, 32),
+        (64, 32, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (64, 128, 32),
+        (32, 64, 32), ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((16, 8, 8))),
+    *gen_shuffle_params(
+        (64, 64, 32), (32, 32, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (128, 128, 32),
+        (32, 64, 32), ["s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    # *gen_shuffle_params(
+    #     (128, 128, 32),
+    #     (64, 32, 32), ["s8,s8,s8,s32,s32", "s8,s8,s32,s32,s32"], 2,
+    #     kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (128, 256, 32),
+        (64, 64, 32), ["s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (256, 128, 32),
+        (64, 64, 32), ["s8,s8,s32,s32,s32"], 2,
+        kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (128, 64, 32), (64, 32, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+    *gen_shuffle_params(
+        (64, 128, 32), (32, 64, 32), ["s8,s8,s32,s32,s32"],
+        2, kernel.GemmAlgo.Turing, TensorOpParams((8, 8, 16))),
+]
--- a/tools/build-wheels.sh
+++ b/tools/build-wheels.sh
@@ -27,12 +27,22 @@ function repair_wheel {

 export SPCONV_DISABLE_JIT="1"
 export CUMM_CUDA_ARCH_LIST="all"
+# export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
 # Compile wheels, we only support 3.6-3.10.
 # "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
-"/opt/python/cp37-cp37m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
-"/opt/python/cp38-cp38/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
-"/opt/python/cp39-cp39/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
-"/opt/python/cp310-cp310/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
+
+for PYVER in ${SPCONV_PYTHON_LIST//;/ }
+do
+    PYVER2=`echo "$PYVER" | sed 's/\.//'`
+    PYVER_CP="cp$PYVER2-cp$PYVER2"
+    if [ "$PYVER2" = "36" ]; then
+        PYVER_CP="cp$PYVER2-cp${PYVER2}m"
+    fi
+    if [ "$PYVER2" = "37" ]; then
+        PYVER_CP="cp$PYVER2-cp${PYVER2}m"
+    fi
+    "/opt/python/$PYVER_CP/bin/pip" wheel /io/  -v --no-deps -w /io/wheelhouse_tmp
+done

 # Bundle external shared libraries into the wheels
 for whl in /io/wheelhouse_tmp/*.whl; do