[Bugfix] Flash attention arches not getting set properly (#9062)

22482e49 · Lucas Wilkinson · GitHub · 3d826d2c · 22482e49
Unverified Commit 22482e49 authored Oct 04, 2024 by Lucas Wilkinson Committed by GitHub Oct 04, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 0 deletions

CMakeLists.txt CMakeLists.txt +11 -0

No files found.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -482,6 +482,17 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
  return()
 endif ()
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in CMake syntax (75-real, 89-virtual, etc.). Since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per-file basis),
+# we need to manually set VLLM_GPU_ARCHES here.
+if(VLLM_GPU_LANG STREQUAL "CUDA")  # skipped for any non-CUDA GPU language
+  foreach(_ARCH ${CUDA_ARCHS})  # unquoted: one iteration per list element
+    string(REPLACE "." "" _ARCH "${_ARCH}")  # drop dots (e.g. "8.9" -> "89")
+    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")  # appends "<arch>-real"
+  endforeach()
+endif()
 #
 # Build vLLM flash attention from source
 #