"requirements-common.txt" did not exist on "71d63ed72e8e528e3131af79b2c1ad853065ccb8"
flashmla.cmake 5.07 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
include(FetchContent)

# If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory 
# instead of downloading.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{FLASH_MLA_SRC_DIR})
  set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR})
endif()

if(FLASH_MLA_SRC_DIR)
  FetchContent_Declare(
        flashmla 
        SOURCE_DIR ${FLASH_MLA_SRC_DIR}
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
  )
else()
  FetchContent_Declare(
        flashmla
21
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
Matthew Bonanni's avatar
Matthew Bonanni committed
22
        GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
23
24
25
26
27
28
29
30
31
32
33
34
35
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
  )
endif()


FetchContent_MakeAvailable(flashmla)
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")

# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with 
# sm90a
36
37

set(SUPPORT_ARCHS)
38
39
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3)
    list(APPEND SUPPORT_ARCHS "9.0a")
40
endif()
41
42
43
44
45
46
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.9)
    # CUDA 12.9 has introduced "Family-Specific Architecture Features"
    # this supports all compute_10x family
    list(APPEND SUPPORT_ARCHS "10.0f")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
    list(APPEND SUPPORT_ARCHS "10.0a")
47
48
49
50
51
endif()


cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
if(FLASH_MLA_ARCHS)
52
    message(STATUS "FlashMLA CUDA architectures: ${FLASH_MLA_ARCHS}")
53
54
55
    set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
    list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")

56
    set(FlashMLA_SOURCES
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
        ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
        ${flashmla_SOURCE_DIR}/csrc/pybind.cpp
        ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
        ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
        ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
        ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
        ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
        ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
    )

    set(FlashMLA_Extension_SOURCES
        ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
74
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
75
    )
76
77

    set(FlashMLA_INCLUDES
78
79
80
81
82
83
84
85
86
87
        ${flashmla_SOURCE_DIR}/csrc
        ${flashmla_SOURCE_DIR}/csrc/sm90
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
        ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
    )

    set(FlashMLA_Extension_INCLUDES
        ${flashmla_SOURCE_DIR}/csrc
        ${flashmla_SOURCE_DIR}/csrc/sm90
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
88
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
89
90
        ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
    )
91
92
93
94
95

    set_gencode_flags_for_srcs(
        SRCS "${FlashMLA_SOURCES}"
        CUDA_ARCHS "${FLASH_MLA_ARCHS}")

96
97
98
99
    set_gencode_flags_for_srcs(
        SRCS "${FlashMLA_Extension_SOURCES}"
        CUDA_ARCHS "${FLASH_MLA_ARCHS}")

100
    define_extension_target(
101
102
103
104
105
106
107
108
109
        _flashmla_C
        DESTINATION vllm
        LANGUAGE ${VLLM_GPU_LANG}
        SOURCES ${FlashMLA_SOURCES}
        COMPILE_FLAGS ${VLLM_GPU_FLAGS}
        ARCHITECTURES ${VLLM_GPU_ARCHES}
        INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
        USE_SABI 3
        WITH_SOABI)
110
111
112
113
114
115
116

    # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
    # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
    target_compile_options(_flashmla_C PRIVATE
        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)

117
    define_extension_target(
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
        _flashmla_extension_C
        DESTINATION vllm
        LANGUAGE ${VLLM_GPU_LANG}
        SOURCES ${FlashMLA_Extension_SOURCES}
        COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
        ARCHITECTURES ${VLLM_GPU_ARCHES}
        INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
        USE_SABI 3
        WITH_SOABI)

    # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
    # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
    target_compile_options(_flashmla_extension_C PRIVATE
        $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
        $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
133
else()
134
135
    message(STATUS "FlashMLA will not compile: unsupported CUDA architecture ${CUDA_ARCHS}")
    # Create empty targets for setup.py on unsupported systems
136
    add_custom_target(_flashmla_C)
137
    add_custom_target(_flashmla_extension_C)
138
139
endif()