fix CI problem

77a7981a · yan.yan · d4de767e · 77a7981a · 77a7981a · 77a7981a
Commit 77a7981a authored Sep 25, 2022 by yan.yan
7 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog
+## [2.2.2] - 2022-9-25
+### Fixed 
+- Fix CI problem: the main function was too long and caused OOM in the CI VM.
+
 ## [2.2.1] - 2022-9-25
 ### Fixed 
 - Fix build problem

--- a/pyproject.toml
+++ b/pyproject.toml
 [build-system]
-requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.3"]
+requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.4"]
 build-backend = "setuptools.build_meta"
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,7 @@ NAME = 'spconv'
 RELEASE_NAME = NAME
 deps = ["cumm"]
 cuda_ver = os.environ.get("CUMM_CUDA_VERSION", "")
+
 # is_ci_build = cuda_ver != ""
 # if not cuda_ver:
 #     nvcc_version = subprocess.check_output(["nvcc", "--version"
@@ -35,12 +36,12 @@ cuda_ver = os.environ.get("CUMM_CUDA_VERSION", "")
 #     cuda_ver = version_str

 if cuda_ver:
-    cuda_ver = cuda_ver.replace(".", "") # 10.2 to 102
+    cuda_ver_str = cuda_ver.replace(".", "") # 10.2 to 102

-    RELEASE_NAME += "-cu{}".format(cuda_ver)
-    deps = ["cumm-cu{}>=0.3.2".format(cuda_ver)]
+    RELEASE_NAME += "-cu{}".format(cuda_ver_str)
+    deps = ["cumm-cu{}>=0.3.4".format(cuda_ver_str)]
 else:
-    deps = ["cumm>=0.3.2"]
+    deps = ["cumm>=0.3.4"]



@@ -176,8 +177,9 @@ if disable_jit is not None and disable_jit == "1":
    cu.namespace = "cumm.gemm.main"
    std = "c++17"
    if cuda_ver:
-        cuda_ver_number = int(cuda_ver)
-        if cuda_ver_number < 110:
+        cuda_ver_vec = list(map(int, cuda_ver.split(".")))
+        cuda_ver_tuple = (cuda_ver_vec[0], cuda_ver_vec[1])
+        if cuda_ver_tuple[0] < 11:
            std = "c++14" 
        else:
            std = "c++17"

--- a/spconv/csrc/sparse/all.py
+++ b/spconv/csrc/sparse/all.py
@@ -29,6 +29,7 @@ from .gather import GatherCPU
 from .alloc import ExternalAllocator, ThrustAllocator
 from spconv.constants import SPCONV_DIRECT_TABLE_HASH_SIZE_SCALE, AllocKeys
 import re
+import os 

 class CustomThrustLib(pccm.Class):
    def __init__(self):
@@ -131,7 +132,12 @@ class SpconvOps(pccm.Class):
        define_str = "\n".join(defines)
        self.add_global_code(define_str)
        self.build_meta.add_global_cflags("cl", "/DNOMINMAX")
-        # self.build_meta.add_global_cflags("nvcc", "-w")
+        cuda_ver = os.environ.get("CUMM_CUDA_VERSION", "")
+        if cuda_ver:
+            cuda_ver_vec = list(map(int, cuda_ver.split(".")))
+            cuda_ver_tuple = (cuda_ver_vec[0], cuda_ver_vec[1])
+            if cuda_ver_tuple[0] < 11:
+                self.build_meta.add_global_cflags("nvcc", "-w")

        # for name in dir(AllocKeys):
        #     if not name.startswith("__"):

--- a/spconv/csrc/sparse/convops.py
+++ b/spconv/csrc/sparse/convops.py
@@ -591,6 +591,7 @@ class GemmTunerSimple(pccm.ParameterizedClass):
                finally_algos.push_back(desp);
            }}
        }}
+        std::sort(finally_algos.begin(), finally_algos.end(), [](auto a, auto b){{return a.min_arch > b.min_arch;}});
        return finally_algos;
        """)
        return code.ret("std::vector<tv::gemm::GemmAlgoDesp>",
@@ -702,9 +703,9 @@ class GemmTunerSimple(pccm.ParameterizedClass):
            trans_c, arch, shuffle_type, use_tf32);
        auto c_ = c.clone_whole_storage();
        std::vector<GemmTuneResult> all_profile_res;
-        std::vector<int> splitk_tests;
+        std::unordered_set<int> splitk_tests;
        std::vector<float> times;
-
+        float min_time = -1;
        for (auto& desp : avail){{
            tv::gemm::GemmParams params;
            if (desp.is_nvrtc || prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end()){{
@@ -722,12 +723,18 @@ class GemmTunerSimple(pccm.ParameterizedClass):
            params.stream = stream_int;
            if (desp.split_k_serial() && (hint & {AlgoHint.BackwardWeight.value})){{
                splitk_tests = {{{', '.join(map(str, SPCONV_BWD_SPLITK))}}};
+                splitk_tests.insert(int(a.dim(0)) / std::min(1 << 10, int(a.dim(0))));
+                splitk_tests.insert(int(a.dim(0)) / std::min(1 << 11, int(a.dim(0))));
+                splitk_tests.insert(int(a.dim(0)) / std::min(1 << 12, int(a.dim(0))));
            }} else {{
                splitk_tests = {{1}};
            }}
-            for (auto spk : splitk_tests){{
+            std::vector<int> splitk_tests_vec(splitk_tests.begin(), splitk_tests.end());
+            std::sort(splitk_tests_vec.begin(), splitk_tests_vec.end(), [](auto a, auto b){{return a > b;}});
+            for (auto spk : splitk_tests_vec){{
                float total_time = 0.0;
                params.split_k_slices = spk;
+                int actual_run = 0;
                for (int j = 0; j < num_run; ++j){{
                    auto ev_start = tv::CUDAEvent();
                    auto ev_end = tv::CUDAEvent();
@@ -736,11 +743,22 @@ class GemmTunerSimple(pccm.ParameterizedClass):
                    ev_end.record(stream_int);
                    if (j > 0){{
                        // skip first run
-                        total_time += tv::CUDAEvent::sync_and_duration(ev_start, ev_end);
+                        auto cur_time = tv::CUDAEvent::sync_and_duration(ev_start, ev_end);
+                        total_time += cur_time;
+                        actual_run++;
+                        if (min_time > 0 && cur_time > min_time * 1.5){{
+                            // early skip for slow kernels
+                            break;
+                        }}
                    }}
                }}
-                total_time /= (num_run - 1);
+                total_time /= actual_run;
                times.push_back(total_time);
+                if (min_time < 0){{
+                    min_time = total_time;
+                }}else{{
+                    min_time = std::min(min_time, total_time);
+                }}
                all_profile_res.push_back(GemmTuneResult(desp, arch, spk));
            }}
        }}
@@ -1078,6 +1096,7 @@ class ConvTunerSimple(pccm.ParameterizedClass):
                finally_algos.push_back(desp);
            }}
        }}
+        std::sort(finally_algos.begin(), finally_algos.end(), [](auto a, auto b){{return a.min_arch > b.min_arch;}});
        return finally_algos;
        """)
        return code.ret("std::vector<tv::gemm::ConvAlgoDesp>",
@@ -1145,9 +1164,10 @@ class ConvTunerSimple(pccm.ParameterizedClass):
        int channel_c = inp.dim(1);

        std::vector<ConvTuneResult> all_profile_res;
-        std::vector<int> splitk_tests;
+        std::unordered_set<int> splitk_tests;
        std::vector<float> times;
        tv::gemm::ConvOpType op_type_cpp = static_cast<tv::gemm::ConvOpType>(op_type);
+        float min_time = -1;
        for (auto& desp : avail){{
            tv::gemm::ConvParams params({NDIM_DONT_CARE}, op_type_cpp, tv::CUDAKernelTimer(false));
            if (desp.is_nvrtc || prebuilt_names_.find(desp.__repr__()) == prebuilt_names_.end()){{
@@ -1176,12 +1196,18 @@ class ConvTunerSimple(pccm.ParameterizedClass):

            if (desp.split_k_serial() && (op_type_cpp == tv::gemm::ConvOpType::kBackwardWeight)){{
                splitk_tests = {{{', '.join(map(str, SPCONV_BWD_SPLITK))}}};
+                splitk_tests.insert(int(inp.dim(0)) / std::min(1 << 10, int(inp.dim(0))));
+                splitk_tests.insert(int(inp.dim(0)) / std::min(1 << 11, int(inp.dim(0))));
+                splitk_tests.insert(int(inp.dim(0)) / std::min(1 << 12, int(inp.dim(0))));
            }} else {{
                splitk_tests = {{1}};
            }}
-            for (auto spk : splitk_tests){{
+            std::vector<int> splitk_tests_vec(splitk_tests.begin(), splitk_tests.end());
+            std::sort(splitk_tests_vec.begin(), splitk_tests_vec.end(), [](auto a, auto b){{return a > b;}});
+            for (auto spk : splitk_tests_vec){{
                float total_time = 0.0;
                params.split_k_slices = spk;
+                int actual_run = 0;
                for (int j = 0; j < num_run; ++j){{
                    auto ev_start = tv::CUDAEvent();
                    auto ev_end = tv::CUDAEvent();
@@ -1190,11 +1216,22 @@ class ConvTunerSimple(pccm.ParameterizedClass):
                    ev_end.record(stream_int);
                    if (j > 0){{
                        // skip first run
-                        total_time += tv::CUDAEvent::sync_and_duration(ev_start, ev_end);
+                        auto cur_time = tv::CUDAEvent::sync_and_duration(ev_start, ev_end);
+                        total_time += cur_time;
+                        actual_run++;
+                        if (min_time > 0 && cur_time > min_time * 1.5){{
+                            // early skip for slow kernels
+                            break;
                        }}
                    }}
-                total_time /= (num_run - 1);
+                }}
+                total_time /= actual_run;
                times.push_back(total_time);
+                if (min_time < 0){{
+                    min_time = total_time;
+                }}else{{
+                    min_time = std::min(min_time, total_time);
+                }}
                all_profile_res.push_back(ConvTuneResult(desp, arch, spk));
            }}
        }}

--- a/tools/build-wheels-dev.sh
+++ b/tools/build-wheels-dev.sh
+#!/bin/bash
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e -u -x
+
+function repair_wheel {
+    wheel="$1"
+    outpath="$2"
+    if ! auditwheel show "$wheel"; then
+        echo "Skipping non-platform wheel $wheel"
+    else
+        auditwheel repair "$wheel" --plat "$PLAT" -w "$outpath"
+    fi
+}
+gcc -v
+export SPCONV_DISABLE_JIT="1"
+export CUMM_CUDA_ARCH_LIST="7.5"
+# export SPCONV_PYTHON_LIST="3.7;3.8;3.9;3.10"
+# Compile wheels for every version in SPCONV_PYTHON_LIST (must be set by the caller); only Python 3.6-3.10 are supported.
+# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
+
+for PYVER in ${SPCONV_PYTHON_LIST//;/ }
+do
+    PYVER2=`echo "$PYVER" | sed 's/\.//'`
+    PYVER_CP="cp$PYVER2-cp$PYVER2"
+    if [ "$PYVER2" = "36" ]; then
+        PYVER_CP="cp$PYVER2-cp${PYVER2}m"
+    fi
+    if [ "$PYVER2" = "37" ]; then
+        PYVER_CP="cp$PYVER2-cp${PYVER2}m"
+    fi
+    if [[ $PYVER2 == *"311"* ]]; then
+        PYVER_CP="cp311-cp311"
+    fi
+
+    "/opt/python/$PYVER_CP/bin/pip" wheel /io/  -v --no-deps -w /io/wheelhouse_tmp
+done
+
+# Bundle external shared libraries into the wheels
+for whl in /io/wheelhouse_tmp/*.whl; do
+    repair_wheel "$whl" /io/dist
+done
+
+rm -rf /io/wheelhouse_tmp
\ No newline at end of file
--- a/version.txt
+++ b/version.txt
-2.2.1
+2.2.2