/opt/dtk-23.10/cuda/bin/nvcc -fPIC -DWMMA -O2 -g -DNDEBUG -O3 -fuse-ld=gold -shared -Wl,-soname,libtransformer-shared.so -o lib/libtransformer-shared.so src/turbomind/layers/sampling_layers/CMakeFiles/BaseSamplingLayer.dir/BaseSamplingLayer.cc.o src/turbomind/layers/CMakeFiles/DynamicDecodeLayer.dir/DynamicDecodeLayer.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/LlamaV2.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/LlamaBatch.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/BlockManager.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/SequenceManager.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/LlamaWeight.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/LlamaDecoderLayerWeight.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/LlamaFfnLayer.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/unified_decoder.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/unified_attention_layer.cc.o src/turbomind/models/llama/CMakeFiles/Llama.dir/llama_kernels.cu.o src/turbomind/models/llama/CMakeFiles/Llama.dir/llama_decoder_kernels.cu.o src/turbomind/models/llama/CMakeFiles/Llama.dir/llama_utils.cu.o src/turbomind/triton_backend/llama/CMakeFiles/LlamaTritonBackend.dir/LlamaTritonModel.cc.o src/turbomind/triton_backend/llama/CMakeFiles/LlamaTritonBackend.dir/LlamaTritonModelInstance.cc.o src/turbomind/layers/sampling_layers/CMakeFiles/TopKSamplingLayer.dir/TopKSamplingLayer.cu.o src/turbomind/layers/sampling_layers/CMakeFiles/TopPSamplingLayer.dir/TopPSamplingLayer.cu.o src/turbomind/triton_backend/CMakeFiles/TransformerTritonBackend.dir/transformer_triton_backend.cpp.o src/turbomind/kernels/CMakeFiles/activation_kernels.dir/activation_kernels.cu.o src/turbomind/kernels/CMakeFiles/ban_bad_words.dir/ban_bad_words.cu.o src/turbomind/kernels/CMakeFiles/bert_preprocess_kernels.dir/bert_preprocess_kernels.cu.o src/turbomind/utils/CMakeFiles/cublasAlgoMap.dir/cublasAlgoMap.cc.o src/turbomind/utils/CMakeFiles/cublasMMWrapper.dir/cublasMMWrapper.cc.o src/turbomind/utils/CMakeFiles/cuda_utils.dir/cuda_utils.cc.o src/turbomind/utils/CMakeFiles/custom_ar_comm.dir/custom_ar_comm.cc.o src/turbomind/kernels/CMakeFiles/custom_ar_kernels.dir/custom_ar_kernels.cu.o src/turbomind/kernels/decoder_multihead_attention/CMakeFiles/decoder_multihead_attention.dir/decoder_multihead_attention.cu.o src/turbomind/kernels/decoder_multihead_attention/CMakeFiles/decoder_multihead_attention.dir/kv_cache.cu.o src/turbomind/kernels/CMakeFiles/decoder_masked_multihead_attention.dir/decoder_masked_multihead_attention/decoder_masked_multihead_attention_128.cu.o src/turbomind/kernels/CMakeFiles/decoder_masked_multihead_attention.dir/decoder_masked_multihead_attention.cu.o src/turbomind/kernels/CMakeFiles/decoding_kernels.dir/decoding_kernels.cu.o src/turbomind/kernels/CMakeFiles/gpt_kernels.dir/gpt_kernels.cu.o src/turbomind/kernels/CMakeFiles/logprob_kernels.dir/logprob_kernels.cu.o src/turbomind/utils/CMakeFiles/logger.dir/logger.cc.o src/turbomind/utils/CMakeFiles/memory_utils.dir/memory_utils.cu.o src/turbomind/utils/CMakeFiles/mpi_utils.dir/mpi_utils.cc.o src/turbomind/utils/CMakeFiles/nccl_utils.dir/nccl_utils.cc.o src/turbomind/utils/CMakeFiles/nvtx_utils.dir/nvtx_utils.cc.o src/turbomind/kernels/CMakeFiles/sampling_penalty_kernels.dir/sampling_penalty_kernels.cu.o src/turbomind/kernels/CMakeFiles/sampling_topk_kernels.dir/sampling_topk_kernels.cu.o src/turbomind/kernels/CMakeFiles/sampling_topp_kernels.dir/sampling_topp_kernels.cu.o src/turbomind/kernels/CMakeFiles/stop_criteria.dir/stop_criteria_kernels.cu.o src/turbomind/utils/CMakeFiles/tensor.dir/Tensor.cc.o src/turbomind/kernels/CMakeFiles/unfused_attention_kernels.dir/unfused_attention_kernels.cu.o src/turbomind/utils/CMakeFiles/word_list.dir/word_list.cc.o -L/opt/mpi/lib -L/opt/dtk-23.10/cuda/targets/x86_64-linux/lib/stubs -Wl,-rpath,/opt/mpi/lib: -lmpi /opt/dtk-23.10/cuda/lib64/libnccl.so -lcudart -lcublas -lcurand -ldl