Commit 75dad570 authored by Zhekai Zhang's avatar Zhekai Zhang
Browse files

[major] merge dev repo v0.1.4

parent 9cb2307d
...@@ -129,7 +129,7 @@ if __name__ == "__main__": ...@@ -129,7 +129,7 @@ if __name__ == "__main__":
for target in sm_targets: for target in sm_targets:
NVCC_FLAGS += ["-gencode", f"arch=compute_{target},code=sm_{target}"] NVCC_FLAGS += ["-gencode", f"arch=compute_{target},code=sm_{target}"]
NVCC_MSVC_FLAGS = ["-Xcompiler", "/Zc:__cplusplus", "-Xcompiler", "/FS"] NVCC_MSVC_FLAGS = ["-Xcompiler", "/Zc:__cplusplus", "-Xcompiler", "/FS", "-Xcompiler", "/bigobj"]
nunchaku_extension = CUDAExtension( nunchaku_extension = CUDAExtension(
name="nunchaku._C", name="nunchaku._C",
......
...@@ -215,6 +215,11 @@ struct LayerOffloadHelper { ...@@ -215,6 +215,11 @@ struct LayerOffloadHelper {
if (offload) { if (offload) {
streamCompute = std::make_unique<CUDAStreamWrapper>(); streamCompute = std::make_unique<CUDAStreamWrapper>();
streamLoad = std::make_unique<CUDAStreamWrapper>(); streamLoad = std::make_unique<CUDAStreamWrapper>();
needWorkaround = checkWorkaround();
if (needWorkaround) {
spdlog::debug("Offloading helper: use WDDM workaround");
}
} }
} }
...@@ -240,6 +245,7 @@ private: ...@@ -240,6 +245,7 @@ private:
funcCompute(layer); funcCompute(layer);
nextComputeDone = std::make_unique<CUDAEventWrapper>(); nextComputeDone = std::make_unique<CUDAEventWrapper>();
checkCUDA(cudaEventRecord(nextComputeDone->event, getCurrentCUDAStream())); checkCUDA(cudaEventRecord(nextComputeDone->event, getCurrentCUDAStream()));
workaroundFlush();
} }
{ {
...@@ -253,10 +259,13 @@ private: ...@@ -253,10 +259,13 @@ private:
} }
nextLoadDone = std::make_unique<CUDAEventWrapper>(); nextLoadDone = std::make_unique<CUDAEventWrapper>();
checkCUDA(cudaEventRecord(nextLoadDone->event, getCurrentCUDAStream())); checkCUDA(cudaEventRecord(nextLoadDone->event, getCurrentCUDAStream()));
workaroundFlush();
} }
eventComputeDone = std::move(nextComputeDone); eventComputeDone = std::move(nextComputeDone);
eventLoadDone = std::move(nextLoadDone); eventLoadDone = std::move(nextLoadDone);
workaroundSynchronize();
} }
} }
...@@ -266,4 +275,35 @@ private: ...@@ -266,4 +275,35 @@ private:
} }
checkCUDA(cudaStreamWaitEvent(getCurrentCUDAStream(), event->event)); checkCUDA(cudaStreamWaitEvent(getCurrentCUDAStream(), event->event));
} }
// WDDM prevents multiple streams run concurrently
// use flush and synchronize to work around
bool needWorkaround;
static bool checkWorkaround() {
if (char *env = getenv("NUNCHAKU_OFFLOAD_WDDM_WORKAROUND")) {
if (std::string(env) == "1") {
return true;
} else if (std::string(env) == "0") {
return false;
}
}
#ifdef _WIN32
return true;
#else
return false;
#endif
}
void workaroundFlush() {
if (!needWorkaround) {
return;
}
cudaStreamQuery(getCurrentCUDAStream());
}
void workaroundSynchronize() {
if (!needWorkaround) {
return;
}
checkCUDA(cudaEventSynchronize(eventComputeDone->event));
}
}; };
\ No newline at end of file
Subproject commit 0d23f715690c5171fd93679de8afd149376db167
Subproject commit a75b4ac483166189a45290783cb0a18af5ff0ea5
Subproject commit 63258397761b3dd96dd171e5a5ad5aa915834c35
Subproject commit 8b6b7d878c89e81614d05edca7936de41ccdd2da
Subproject commit 27cb4c76708608465c413f6d0e6b8d99a4d84302
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment