/opt/dtk-23.10/cuda/bin/nvcc -lineinfo -Xcompiler -Wall -ldl --gpu-max-threads-per-block=1024 -gencode=arch=compute_70,code=\"sm_70,compute_70\" -gencode=arch=compute_75,code=\"sm_75,compute_75\" -gencode=arch=compute_80,code=\"sm_80,compute_80\" -gencode=arch=compute_86,code=\"sm_86,compute_86\" -DWMMA --expt-extended-lambda --expt-relaxed-constexpr --std=c++17 -DCUDA_PTX_FP8_F2FP_ENABLED -fPIC -O2 -g -DNDEBUG -Xcompiler -O3 -DCUDA_PTX_FP8_F2FP_ENABLED --use_fast_math -Xcompiler=-fPIC -Wno-deprecated-gpu-targets -shared -dlink CMakeFiles/decoder_multihead_attention.dir/decoder_multihead_attention.cu.o CMakeFiles/decoder_multihead_attention.dir/kv_cache.cu.o -o CMakeFiles/decoder_multihead_attention.dir/cmake_device_link.o -L/opt/mpi/lib
