Unverified Commit 4c9959f6 authored by Chen Xin, committed by GitHub
Browse files

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
...@@ -311,6 +311,39 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf, ...@@ -311,6 +311,39 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
} }
} }
#ifdef _MSC_VER
// MSVC does not support the GNU case-range extension ("case K_MIN ... K_MAX:")
// used by the non-MSVC CASE_K below, so the dispatch is emulated with an
// if-range guard; the caller wraps the CASE_K chain in a do { ... } while (0)
// and the trailing `break` exits it once a matching range has launched.
// Stage-2 dynamic shared memory is sized with K_MAX (K_MAX ids + K_MAX vals),
// so every k in [K_MIN, K_MAX] shares one launch configuration.
// NOTE: comments cannot be placed inside the macro body — a // comment before
// a trailing backslash would splice the next line into the comment.
#define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
if (K_MIN <= max_top_k && max_top_k <= K_MAX) { \
topk_stage1<T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \
<<<batch_size * BLOCKS_PER_BEAM_, BLOCK_SIZE_1_, 0, stream>>>(log_probs, \
temp_log_probs, \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
finished, \
max_top_k, \
top_ks, \
vocab_size, \
end_ids, \
skip_decode); \
topk_stage2_sampling<T, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_> \
<<<batch_size, BLOCK_SIZE_2_, K_MAX * sizeof(int) + K_MAX * sizeof(float), stream>>>(topk_tmp_id_buf, \
topk_tmp_val_buf, \
ids, \
sequence_length, \
finished, \
cum_log_probs, \
output_log_probs, \
max_top_k, \
top_ks, \
top_p, \
top_ps, \
curandstate, \
end_ids, \
vocab_size, \
skip_decode); \
break; \
}
#else
#define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \ #define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
case K_MIN ... K_MAX: \ case K_MIN ... K_MAX: \
topk_stage1<T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \ topk_stage1<T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \
...@@ -341,6 +374,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf, ...@@ -341,6 +374,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
vocab_size, \ vocab_size, \
skip_decode); \ skip_decode); \
break; break;
#endif
template<typename T> template<typename T>
void invokeBatchTopKSampling(void* workspace, void invokeBatchTopKSampling(void* workspace,
...@@ -385,6 +419,15 @@ void invokeBatchTopKSampling(void* workspace, ...@@ -385,6 +419,15 @@ void invokeBatchTopKSampling(void* workspace,
int* topk_tmp_id_buf = (int*)(temp_log_probs + temp_log_probs_buf_size); int* topk_tmp_id_buf = (int*)(temp_log_probs + temp_log_probs_buf_size);
T* topk_tmp_val_buf = (T*)(topk_tmp_id_buf + topk_tmp_ids_buf_size); T* topk_tmp_val_buf = (T*)(topk_tmp_id_buf + topk_tmp_ids_buf_size);
#ifdef _MSC_VER
do {
CASE_K(1, 16, 128, 128, 8);
CASE_K(17, 32, 256, 128, 8);
CASE_K(33, 64, 256, 256, 8);
CASE_K(65, 1024, 256, 256, 8);
throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k));
} while (0);
#else
switch (max_top_k) { switch (max_top_k) {
CASE_K(1, 16, 128, 128, 8); CASE_K(1, 16, 128, 128, 8);
CASE_K(17, 32, 256, 128, 8); CASE_K(17, 32, 256, 128, 8);
...@@ -393,6 +436,7 @@ void invokeBatchTopKSampling(void* workspace, ...@@ -393,6 +436,7 @@ void invokeBatchTopKSampling(void* workspace,
default: default:
throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k)); throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k));
} }
#endif
} }
#undef CASE_K #undef CASE_K
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "src/turbomind/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/stop_criteria_kernels.h" #include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
...@@ -94,7 +95,7 @@ void invokeStopWordsCriterion(const int* output_ids, ...@@ -94,7 +95,7 @@ void invokeStopWordsCriterion(const int* output_ids,
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
// Check if we have sampled a word from the stop_words list. If so, stop the sequence. // Check if we have sampled a word from the stop_words list. If so, stop the sequence.
dim3 block, grid; dim3 block, grid;
block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL); block.x = min((unsigned long)((stop_words_len + 32 - 1) / 32) * 32, 256UL);
grid.x = (stop_words_len + block.x - 1) / block.x; grid.x = (stop_words_len + block.x - 1) / block.x;
grid.y = batch_size * beam_width; grid.y = batch_size * beam_width;
...@@ -150,7 +151,11 @@ void invokeLengthCriterion(bool* finished, ...@@ -150,7 +151,11 @@ void invokeLengthCriterion(bool* finished,
length_criterion<<<grid, block, 0, stream>>>( length_criterion<<<grid, block, 0, stream>>>(
finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step); finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step);
#ifdef _MSC_VER
cudaStreamSynchronize(stream);
#else
while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {}; while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {};
#endif
sync_check_cuda_error(); sync_check_cuda_error();
*should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width; *should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width;
......
...@@ -16,8 +16,9 @@ cmake_minimum_required(VERSION 3.8) ...@@ -16,8 +16,9 @@ cmake_minimum_required(VERSION 3.8)
add_subdirectory(sampling_layers) add_subdirectory(sampling_layers)
find_package(CUDAToolkit REQUIRED)
add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc) add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart TopKSamplingLayer target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart TopKSamplingLayer
TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils) TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#pragma once #pragma once
#include "src/turbomind/utils/cuda_fp8_utils.h" #include "src/turbomind/utils/cuda_fp8_utils.h"
#include "stdlib.h" #include "stdlib.h"
#include <cstdint>
namespace turbomind { namespace turbomind {
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "src/turbomind/kernels/stop_criteria_kernels.h" #include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
namespace turbomind { namespace turbomind {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "src/turbomind/layers/FfnLayer.h" #include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h" #include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/nvtx_utils.h" #include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind { namespace turbomind {
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h" #include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
......
...@@ -14,17 +14,19 @@ ...@@ -14,17 +14,19 @@
cmake_minimum_required(VERSION 3.8) cmake_minimum_required(VERSION 3.8)
find_package(CUDAToolkit REQUIRED)
add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc) add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC -lcudart sampling_penalty_kernels memory_utils) target_link_libraries(BaseSamplingLayer PUBLIC CUDA::cudart sampling_penalty_kernels memory_utils)
add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu) add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels) target_link_libraries(TopKSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels)
add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu) add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels) target_link_libraries(TopPSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h" #include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#pragma once #pragma once
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
namespace turbomind { namespace turbomind {
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h" #include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#pragma once #pragma once
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/macro.h"
namespace turbomind { namespace turbomind {
......
#pragma once
// Portability shims for building on Windows/MSVC.
// __PRETTY_FUNCTION__ is a GCC/Clang extension (not a macro on GCC, hence the
// !defined(__GNUC__) guard); map it to MSVC's closest equivalent, __FUNCSIG__,
// so logging such as TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__) builds everywhere.
#if !defined(__PRETTY_FUNCTION__) && !defined(__GNUC__)
#define __PRETTY_FUNCTION__ __FUNCSIG__
#endif
// `uint` is a POSIX/glibc convenience typedef that Windows headers lack.
typedef unsigned int uint;
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "src/turbomind/macro.h"
#include <string> #include <string>
#include <vector> #include <vector>
......
...@@ -3,10 +3,34 @@ ...@@ -3,10 +3,34 @@
#pragma once #pragma once
#include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/logger.h"
#ifndef _MSC_VER
#include <pthread.h> #include <pthread.h>
#endif
namespace turbomind { namespace turbomind {
#ifdef _MSC_VER
// Windows stub for the pthread-based Barrier below: pthread_barrier_t is not
// available with MSVC. This build assumes a single participating thread
// (FT_CHECK enforces count == 1 at construction), so wait() is a no-op.
class Barrier {
public:
    // count: number of threads expected at the barrier; must be 1 on Windows.
    Barrier(unsigned count)
    {
        TM_LOG_INFO("Barrier(%d)", (int)count);
        FT_CHECK(count == 1);
    }

    // Non-copyable and non-movable, matching the pthread-backed variant.
    Barrier(const Barrier&) = delete;
    Barrier& operator=(const Barrier&) = delete;
    Barrier(Barrier&&) noexcept = delete;
    Barrier& operator=(Barrier&&) noexcept = delete;

    // No-op: with a single participant there is nothing to wait for.
    void wait() {}

    ~Barrier() {}
};
#else
class Barrier { class Barrier {
public: public:
Barrier(unsigned count) Barrier(unsigned count)
...@@ -34,4 +58,6 @@ private: ...@@ -34,4 +58,6 @@ private:
pthread_barrier_t barrier_{}; pthread_barrier_t barrier_{};
}; };
#endif
} // namespace turbomind } // namespace turbomind
...@@ -4,6 +4,8 @@ cmake_minimum_required(VERSION 3.8) ...@@ -4,6 +4,8 @@ cmake_minimum_required(VERSION 3.8)
add_subdirectory(fused_multi_head_attention) add_subdirectory(fused_multi_head_attention)
find_package(CUDAToolkit REQUIRED)
add_library(Llama STATIC add_library(Llama STATIC
LlamaV2.cc LlamaV2.cc
LlamaBatch.cc LlamaBatch.cc
...@@ -20,7 +22,7 @@ add_library(Llama STATIC ...@@ -20,7 +22,7 @@ add_library(Llama STATIC
llama_utils.cu) llama_utils.cu)
set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(Llama PUBLIC -lcudart target_link_libraries(Llama PUBLIC CUDA::cudart
gemm_s4_f16 gemm_s4_f16
cublasMMWrapper cublasMMWrapper
DynamicDecodeLayer DynamicDecodeLayer
...@@ -40,4 +42,4 @@ target_link_libraries(Llama PUBLIC -lcudart ...@@ -40,4 +42,4 @@ target_link_libraries(Llama PUBLIC -lcudart
llama_fmha) llama_fmha)
add_executable(llama_gemm llama_gemm.cc) add_executable(llama_gemm llama_gemm.cc)
target_link_libraries(llama_gemm PUBLIC -lcudart gpt_gemm_func memory_utils cuda_utils logger) target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/decoding_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/Request.h"
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h" #include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h" #include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h" #include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/llama_utils.h"
...@@ -265,40 +266,41 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr ...@@ -265,40 +266,41 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
// flash attention // flash attention
using AttentionOp = FlashAttentionOp<T>; using AttentionOp = FlashAttentionOp<T>;
using Layout = typename AttentionOp::AttentionLayout; using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = int(local_head_num_ * max_q_len * size_per_head_), Layout layout_q{
.stride_seq = int(size_per_head_), int(local_head_num_ * max_q_len * size_per_head_), int(size_per_head_), int(max_q_len * size_per_head_)};
.stride_head = int(max_q_len * size_per_head_)}; Layout layout_k{int(local_head_num_ * max_seq_len * size_per_head_),
Layout layout_k{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_), int(size_per_head_),
.stride_seq = int(size_per_head_), int(max_seq_len * size_per_head_),
.stride_head = int(max_seq_len * size_per_head_), false,
.batch_seqs_offset = int(cache_layer_offset), int(cache_layer_offset),
.batch_seqs = key_cache_ptrs}; key_cache_ptrs};
Layout layout_v{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_), Layout layout_v{int(local_head_num_ * max_seq_len * size_per_head_),
.stride_seq = int(size_per_head_), int(size_per_head_),
.stride_head = int(max_seq_len * size_per_head_), int(max_seq_len * size_per_head_),
.batch_seqs_offset = int(cache_layer_offset), false,
.batch_seqs = val_cache_ptrs}; int(cache_layer_offset),
val_cache_ptrs};
Layout layout_o{ Layout layout_o{
.stride_batch = int(local_head_num_ * max_q_len * size_per_head_), int(local_head_num_ * max_q_len * size_per_head_),
.stride_seq = int(local_head_num_ * size_per_head_), int(local_head_num_ * size_per_head_),
.stride_head = int(size_per_head_), int(size_per_head_),
.use_seqlens = true, true,
}; };
size_t group_size = size_t(local_head_num_ / local_kv_head_num_); size_t group_size = size_t(local_head_num_ / local_kv_head_num_);
AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_); AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
typename AttentionOp::Params attn_params{.attn_out = qkv_buf_3_, typename AttentionOp::Params attn_params{qkv_buf_3_,
.query = q_buf_2_, q_buf_2_,
.key = k_cache_buf_, k_cache_buf_,
.val = v_cache_buf_, v_cache_buf_,
.mask = attention_mask, attention_mask,
.out_accum = qk_buf_float_, qk_buf_float_,
.cu_seqlens_q = cu_seqlens, cu_seqlens,
.cu_seqlens_k = nullptr, nullptr,
.group_size = group_size, group_size,
.layout_q = layout_q, layout_q,
.layout_k = layout_k, layout_k,
.layout_v = layout_v, layout_v,
.layout_o = layout_o}; layout_o};
// //
flash_attention(attn_params, stream_); flash_attention(attn_params, stream_);
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "src/turbomind/models/llama/LlamaContextDecoder.h" #include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h" #include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h" #include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc // https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/turbomind/models/llama/LlamaDecoder.h" #include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/llama_utils.h"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment