Merge pull request #157 from mit-han-lab/dev

Fix missing third_party & merge main branch of dev repo

Merge pull request #157 from mit-han-lab/dev
Fix missing third_party & merge main branch of dev repo
67723598 · Zhekai Zhang · GitHub · e1c5f3e4 · 75dad570 · 67723598
Unverified Commit 67723598 authored Mar 07, 2025 by Zhekai Zhang Committed by GitHub Mar 07, 2025
12 changed files
--- a/setup.py
+++ b/setup.py
@@ -129,7 +129,7 @@ if __name__ == "__main__":
    for target in sm_targets:
        NVCC_FLAGS += ["-gencode", f"arch=compute_{target},code=sm_{target}"]
-    NVCC_MSVC_FLAGS = ["-Xcompiler", "/Zc:__cplusplus", "-Xcompiler", "/FS"]
+    NVCC_MSVC_FLAGS = ["-Xcompiler", "/Zc:__cplusplus", "-Xcompiler", "/FS", "-Xcompiler", "/bigobj"]
    nunchaku_extension = CUDAExtension(
        name="nunchaku._C",

--- a/src/Module.h
+++ b/src/Module.h
@@ -215,6 +215,11 @@ struct LayerOffloadHelper {
        if (offload) {
            streamCompute = std::make_unique<CUDAStreamWrapper>();
            streamLoad = std::make_unique<CUDAStreamWrapper>();
+            needWorkaround = checkWorkaround();
+            if (needWorkaround) {
+                spdlog::debug("Offloading helper: use WDDM workaround");
+            }
        }
    }
@@ -240,6 +245,7 @@ private:
                funcCompute(layer);
                nextComputeDone = std::make_unique<CUDAEventWrapper>();
                checkCUDA(cudaEventRecord(nextComputeDone->event, getCurrentCUDAStream()));
+                workaroundFlush();
            }
            {
@@ -253,10 +259,13 @@ private:
                }
                nextLoadDone = std::make_unique<CUDAEventWrapper>();
                checkCUDA(cudaEventRecord(nextLoadDone->event, getCurrentCUDAStream()));
+                workaroundFlush();
            }
            eventComputeDone = std::move(nextComputeDone);
            eventLoadDone = std::move(nextLoadDone);
+            workaroundSynchronize();
        }
    }
@@ -266,4 +275,35 @@ private:
        }
        checkCUDA(cudaStreamWaitEvent(getCurrentCUDAStream(), event->event));
    }
+    // WDDM prevents multiple streams from running concurrently;
+    // use flush and synchronize to work around this.
+    bool needWorkaround;
+    static bool checkWorkaround() { // decides whether the WDDM workaround is enabled; result is stored in needWorkaround by the constructor
+        if (char *env = getenv("NUNCHAKU_OFFLOAD_WDDM_WORKAROUND")) { // an explicit environment setting takes precedence over the platform default
+            if (std::string(env) == "1") {
+                return true; // "1": force the workaround on
+            } else if (std::string(env) == "0") {
+                return false; // "0": force the workaround off
+            }
+        } // unset, or any value other than "0"/"1": fall through to the platform default
+    #ifdef _WIN32
+        return true; // default when _WIN32 is defined: enabled
+    #else
+        return false; // default otherwise: disabled
+    #endif
+    }
+    void workaroundFlush() { // queries the current CUDA stream, but only when the workaround is enabled
+        if (!needWorkaround) {
+            return; // workaround disabled: no-op
+        }
+        cudaStreamQuery(getCurrentCUDAStream()); // result is discarded (not wrapped in checkCUDA); NOTE(review): presumably issued only to push queued work to the driver — confirm
+    }
+    void workaroundSynchronize() { // waits on eventComputeDone, but only when the workaround is enabled
+        if (!needWorkaround) {
+            return; // workaround disabled: no-op
+        }
+        checkCUDA(cudaEventSynchronize(eventComputeDone->event)); // dereferences eventComputeDone unconditionally; NOTE(review): assumes it is non-null at every call site — confirm
+    }
 };
\ No newline at end of file
--- a/Block-Sparse-Attention @ 0d23f715
+++ b/Block-Sparse-Attention @ 0d23f715
+Subproject commit 0d23f715690c5171fd93679de8afd149376db167
--- a/third_party/Block-Sparse-Attention/.gitkeep
+++ b/third_party/Block-Sparse-Attention/.gitkeep
--- a/cutlass @ a75b4ac4
+++ b/cutlass @ a75b4ac4
+Subproject commit a75b4ac483166189a45290783cb0a18af5ff0ea5
--- a/third_party/cutlass/.gitkeep
+++ b/third_party/cutlass/.gitkeep
--- a/json @ 63258397
+++ b/json @ 63258397
+Subproject commit 63258397761b3dd96dd171e5a5ad5aa915834c35
--- a/third_party/json/.gitkeep
+++ b/third_party/json/.gitkeep
--- a/mio @ 8b6b7d87
+++ b/mio @ 8b6b7d87
+Subproject commit 8b6b7d878c89e81614d05edca7936de41ccdd2da
--- a/third_party/mio/.gitkeep
+++ b/third_party/mio/.gitkeep
--- a/spdlog @ 27cb4c76
+++ b/spdlog @ 27cb4c76
+Subproject commit 27cb4c76708608465c413f6d0e6b8d99a4d84302
--- a/third_party/spdlog/.gitkeep
+++ b/third_party/spdlog/.gitkeep