# CMakeLists.txt

# Copyright (c) OpenMMLab. All rights reserved.

# target_link_directories(), which this file calls on the Llama target, was
# introduced in CMake 3.13. Declaring an older minimum lets configuration
# start on CMake 3.8-3.12 and then fail with an "unknown command" error, so
# state the real requirement here.
cmake_minimum_required(VERSION 3.13)

# Currently disabled; kept for reference.
#add_subdirectory(fused_multi_head_attention)

# CUDA is located through the legacy FindCUDA module. The CUDAToolkit package
# lookup is left commented out as the alternative.
#find_package(CUDAToolkit REQUIRED)
find_package(CUDA REQUIRED)

# Static library `Llama`, built from the C++ (.cc) and CUDA (.cu) sources
# listed below. The listing order is kept as-is.
add_library(Llama STATIC
    LlamaV2.cc
    LlamaBatch.cc
    BlockManager.cc
    SequenceManager.cc
    LlamaWeight.cc
    LlamaDecoderLayerWeight.cc
    LlamaFfnLayer.cc
    unified_decoder.cc
    unified_attention_layer.cc
    llama_kernels.cu
    llama_decoder_kernels.cu
    llama_utils.cu
)

# Append -fPIC to the C++ and CUDA compile flags. These are directory-scoped
# variables, so every target defined in this directory picks the flag up,
# not only Llama.
string(APPEND CMAKE_CXX_FLAGS " -fPIC")
string(APPEND CMAKE_CUDA_FLAGS " -fPIC")

# Target-scoped properties, currently disabled.
# NOTE(review): the reason these were turned off in favour of the raw flags
# above is not visible in this file - confirm before re-enabling.
#set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE  ON)
#set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
# Add the 3rdparty/composable_kernel directory to the linker search path.
# The path is relative, so CMake resolves it against the current source
# directory. PUBLIC also forwards it to targets that link against Llama.
target_link_directories(Llama
    PUBLIC
        ../../../../3rdparty/composable_kernel/
)

# Libraries Llama links against. PUBLIC propagates each of them to consumers
# of Llama. The order below is the order handed to the linker, so it is left
# untouched.
target_link_libraries(Llama
    PUBLIC
        cudart
        gemm_s4_f16
        cublasMMWrapper
        DynamicDecodeLayer
        activation_kernels
        decoder_masked_multihead_attention
        decoder_multihead_attention
        bert_preprocess_kernels
        decoding_kernels
        unfused_attention_kernels
        custom_ar_kernels
        custom_ar_comm
        gpt_kernels
        tensor
        memory_utils
        nccl_utils
        cuda_utils
        logger
        gemm_multiB_int4
        # llama_fmha  (currently disabled)
)

# Placeholder for building and linking flash_attention2 on non-MSVC
# toolchains. Both statements inside are commented out, so this branch
# currently does nothing.
if(NOT MSVC)
    #add_subdirectory(flash_attention2)
    #target_link_libraries(Llama PUBLIC flash_attention2)
endif()

# Standalone executable `llama_gemm`, built from llama_gemm.cc.
add_executable(llama_gemm
    llama_gemm.cc
)

# Link dependencies, in the original order. `-lrocblas` is passed to the
# linker as a raw flag rather than as a CMake target.
target_link_libraries(llama_gemm
    PUBLIC
        -lrocblas
        cudart
        gpt_gemm_func
        memory_utils
        cuda_utils
        logger
)

# Install the binary into lmdeploy/bin under the top-level source directory.
install(
    TARGETS llama_gemm
    DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin
)

# Optional unit test. Catch2 (version 3) is looked up quietly, and the test
# executable is only defined when the package was found.
find_package(Catch2 3 QUIET)
if(Catch2_FOUND)
    add_executable(test_cache_manager
        test_cache_manager.cc
    )
    target_link_libraries(test_cache_manager
        PRIVATE
            Llama
            Catch2::Catch2WithMain
    )
endif()