Unverified Commit 4c9959f6 authored by Chen Xin, committed by GitHub

Support Windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
......@@ -311,6 +311,39 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
}
}
#ifdef _MSC_VER
#define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
if (K_MIN <= max_top_k && max_top_k <= K_MAX) { \
topk_stage1<T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \
<<<batch_size * BLOCKS_PER_BEAM_, BLOCK_SIZE_1_, 0, stream>>>(log_probs, \
temp_log_probs, \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
finished, \
max_top_k, \
top_ks, \
vocab_size, \
end_ids, \
skip_decode); \
topk_stage2_sampling<T, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_> \
<<<batch_size, BLOCK_SIZE_2_, K_MAX * sizeof(int) + K_MAX * sizeof(float), stream>>>(topk_tmp_id_buf, \
topk_tmp_val_buf, \
ids, \
sequence_length, \
finished, \
cum_log_probs, \
output_log_probs, \
max_top_k, \
top_ks, \
top_p, \
top_ps, \
curandstate, \
end_ids, \
vocab_size, \
skip_decode); \
break; \
}
#else
#define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
case K_MIN ... K_MAX: \
topk_stage1<T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \
......@@ -341,6 +374,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
vocab_size, \
skip_decode); \
break;
#endif
template<typename T>
void invokeBatchTopKSampling(void* workspace,
......@@ -385,6 +419,15 @@ void invokeBatchTopKSampling(void* workspace,
int* topk_tmp_id_buf = (int*)(temp_log_probs + temp_log_probs_buf_size);
T* topk_tmp_val_buf = (T*)(topk_tmp_id_buf + topk_tmp_ids_buf_size);
#ifdef _MSC_VER
do {
CASE_K(1, 16, 128, 128, 8);
CASE_K(17, 32, 256, 128, 8);
CASE_K(33, 64, 256, 256, 8);
CASE_K(65, 1024, 256, 256, 8);
throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k));
} while (0);
#else
switch (max_top_k) {
CASE_K(1, 16, 128, 128, 8);
CASE_K(17, 32, 256, 128, 8);
......@@ -393,6 +436,7 @@ void invokeBatchTopKSampling(void* workspace,
default:
throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k));
}
#endif
}
#undef CASE_K
......
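Note on the CASE_K change above: MSVC does not implement GCC's case-range syntax (case K_MIN ... K_MAX:), so the _MSC_VER branch rewrites the dispatch as an if-chain wrapped in do { ... } while (0), which lets the macro's trailing break exit the chain the same way it exits the switch on GCC/Clang. Below is a minimal, self-contained sketch of that pattern; launch_top_k, CASE_K_SKETCH, and dispatch_top_k are illustrative placeholders, not names from the diff.

#include <stdexcept>

// Stand-in for the real topk_stage1 / topk_stage2_sampling kernel launches.
static void launch_top_k(int /*block_size_1*/, int /*block_size_2*/, int /*blocks_per_beam*/) {}

#ifdef _MSC_VER
// MSVC: no case ranges, so emulate each case with an if-block; `break`
// then exits the enclosing do/while(0) instead of a switch.
#define CASE_K_SKETCH(K_MIN, K_MAX, B1, B2, BPB)                              \
    if ((K_MIN) <= max_top_k && max_top_k <= (K_MAX)) {                       \
        launch_top_k(B1, B2, BPB);                                            \
        break;                                                                \
    }
#else
// GCC/Clang: case ranges are a GNU extension.
#define CASE_K_SKETCH(K_MIN, K_MAX, B1, B2, BPB)                              \
    case K_MIN ... K_MAX:                                                     \
        launch_top_k(B1, B2, BPB);                                            \
        break;
#endif

void dispatch_top_k(int max_top_k)
{
#ifdef _MSC_VER
    do {
        CASE_K_SKETCH(1, 16, 128, 128, 8);
        CASE_K_SKETCH(17, 1024, 256, 256, 8);
        throw std::domain_error("top-k kernel supports 1<=k<=1024");
    } while (0);
#else
    switch (max_top_k) {
        CASE_K_SKETCH(1, 16, 128, 128, 8);
        CASE_K_SKETCH(17, 1024, 256, 256, 8);
        default:
            throw std::domain_error("top-k kernel supports 1<=k<=1024");
    }
#endif
}
#undef CASE_K_SKETCH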
......@@ -16,6 +16,7 @@
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
......@@ -94,7 +95,7 @@ void invokeStopWordsCriterion(const int* output_ids,
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
// Check if we have sampled a word from the stop_words list. If so, stop the sequence.
dim3 block, grid;
block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL);
block.x = min((unsigned long)((stop_words_len + 32 - 1) / 32) * 32, 256UL);
grid.x = (stop_words_len + block.x - 1) / block.x;
grid.y = batch_size * beam_width;
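The cast added to the min() call above works around an LP64/LLP64 difference: stop_words_len is presumably a size_t, which is the same width as unsigned long on 64-bit Linux but wider than unsigned long on 64-bit Windows, where min(size_t, 256UL) then fails to resolve. A self-contained illustration of the same type mismatch using std::min (the kernel file uses the CUDA min overloads, but the issue is identical); block_dim_for is an illustrative name.

#include <algorithm>
#include <cstddef>

// LP64 Linux: size_t == unsigned long, so both std::min arguments agree.
// LLP64 Windows: size_t is 64-bit, unsigned long is 32-bit, so template
// argument deduction fails without the explicit cast.
unsigned long block_dim_for(std::size_t stop_words_len)
{
    // return std::min(((stop_words_len + 32 - 1) / 32) * 32, 256UL);  // ambiguous on MSVC
    return std::min((unsigned long)((stop_words_len + 32 - 1) / 32) * 32, 256UL);
}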
......@@ -150,7 +151,11 @@ void invokeLengthCriterion(bool* finished,
length_criterion<<<grid, block, 0, stream>>>(
finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step);
#ifdef _MSC_VER
cudaStreamSynchronize(stream);
#else
while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {};
#endif
sync_check_cuda_error();
*should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width;
......
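On the length-criterion change above: the Linux path spins on a pinned host flag until the kernel overwrites it, while the Windows branch calls cudaStreamSynchronize, which blocks the host until every operation already queued on the stream, including the length_criterion kernel and its write to the pinned buffer, has completed. A hedged sketch of the host-side pattern; h_pinned_flag and wait_for_pinned_result are illustrative names for the pinned buffer and the surrounding code.

#include <cuda_runtime.h>

// h_pinned_flag points at pinned (cudaHostAlloc'd) memory that the kernel
// changes from the sentinel -1 to its final value when it finishes.
void wait_for_pinned_result(volatile int* h_pinned_flag, cudaStream_t stream)
{
#ifdef _MSC_VER
    // Windows: block until all work queued on the stream has run, which
    // guarantees the pinned buffer holds the kernel's result.
    cudaStreamSynchronize(stream);
#else
    // Original Linux path: spin until the kernel flips the sentinel value.
    while (h_pinned_flag[0] == -1) {}
#endif
}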
......@@ -16,8 +16,9 @@ cmake_minimum_required(VERSION 3.8)
add_subdirectory(sampling_layers)
find_package(CUDAToolkit REQUIRED)
add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart TopKSamplingLayer
target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart TopKSamplingLayer
TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
......@@ -17,6 +17,7 @@
#pragma once
#include "src/turbomind/utils/cuda_fp8_utils.h"
#include "stdlib.h"
#include <cstdint>
namespace turbomind {
......
......@@ -19,6 +19,7 @@
#include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
......
......@@ -16,6 +16,7 @@
#include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind {
......
......@@ -18,6 +18,7 @@
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h"
......
......@@ -14,17 +14,19 @@
cmake_minimum_required(VERSION 3.8)
find_package(CUDAToolkit REQUIRED)
add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC -lcudart sampling_penalty_kernels memory_utils)
target_link_libraries(BaseSamplingLayer PUBLIC CUDA::cudart sampling_penalty_kernels memory_utils)
add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels)
target_link_libraries(TopKSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels)
add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
target_link_libraries(TopPSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
......@@ -20,6 +20,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
......
......@@ -18,6 +18,7 @@
#pragma once
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/memory_utils.h"
namespace turbomind {
......
......@@ -22,6 +22,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h"
......
......@@ -18,6 +18,7 @@
#pragma once
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/macro.h"
namespace turbomind {
......
#pragma once
#if !defined(__PRETTY_FUNCTION__) && !defined(__GNUC__)
#define __PRETTY_FUNCTION__ __FUNCSIG__
#endif
typedef unsigned int uint;
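The macro.h shims above let GCC-flavored code compile unchanged under MSVC: __PRETTY_FUNCTION__ (a GCC/Clang predefined identifier used throughout the TM_LOG_DEBUG calls) is mapped to MSVC's __FUNCSIG__, and the POSIX-style uint alias that MSVC's headers do not provide is typedef'd explicitly. A small self-contained example of code that relies on both shims; log_shape is illustrative, not from the diff.

#include <cstdio>

// Mirrors src/turbomind/macro.h: MSVC predefines neither __PRETTY_FUNCTION__
// nor the POSIX uint alias, so supply both.
#if !defined(__PRETTY_FUNCTION__) && !defined(__GNUC__)
#define __PRETTY_FUNCTION__ __FUNCSIG__
#endif
typedef unsigned int uint;

// Illustrative caller: builds with GCC/Clang and, with the shims, with MSVC too.
uint log_shape(uint rows, uint cols)
{
    std::printf("%s: %u x %u\n", __PRETTY_FUNCTION__, rows, cols);
    return rows * cols;
}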
......@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include "src/turbomind/macro.h"
#include <string>
#include <vector>
......
......@@ -3,10 +3,34 @@
#pragma once
#include "src/turbomind/utils/logger.h"
#ifndef _MSC_VER
#include <pthread.h>
#endif
namespace turbomind {
#ifdef _MSC_VER
class Barrier {
public:
Barrier(unsigned count)
{
TM_LOG_INFO("Barrier(%d)", (int)count);
FT_CHECK(count == 1);
}
Barrier(const Barrier&) = delete;
Barrier& operator=(const Barrier&) = delete;
Barrier(Barrier&&) noexcept = delete;
Barrier& operator=(Barrier&&) noexcept = delete;
void wait() {}
~Barrier() {}
};
#else
class Barrier {
public:
Barrier(unsigned count)
......@@ -34,4 +58,6 @@ private:
pthread_barrier_t barrier_{};
};
#endif
} // namespace turbomind
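With pthread-win32 removed there is no pthread_barrier_t on Windows, so the MSVC Barrier above degenerates to a stub that only tolerates a single participant (FT_CHECK(count == 1)) and whose wait() is a no-op. A self-contained sketch of that fallback shape, using assert in place of the project's FT_CHECK and a hypothetical NoopBarrier name:

#include <cassert>

// Sketch of the Windows fallback: with only one participant there is nothing
// to rendezvous with, so construction just validates the count and wait()
// returns immediately.
class NoopBarrier {
public:
    explicit NoopBarrier(unsigned count)
    {
        assert(count == 1 && "Windows build assumes a single participant");
    }
    NoopBarrier(const NoopBarrier&)            = delete;
    NoopBarrier& operator=(const NoopBarrier&) = delete;
    void wait() {}  // no-op: nothing else to wait for
};

int main()
{
    NoopBarrier barrier(1);
    // ... the single rank does its work ...
    barrier.wait();  // returns immediately
    return 0;
}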
......@@ -4,6 +4,8 @@ cmake_minimum_required(VERSION 3.8)
add_subdirectory(fused_multi_head_attention)
find_package(CUDAToolkit REQUIRED)
add_library(Llama STATIC
LlamaV2.cc
LlamaBatch.cc
......@@ -20,7 +22,7 @@ add_library(Llama STATIC
llama_utils.cu)
set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(Llama PUBLIC -lcudart
target_link_libraries(Llama PUBLIC CUDA::cudart
gemm_s4_f16
cublasMMWrapper
DynamicDecodeLayer
......@@ -40,4 +42,4 @@ target_link_libraries(Llama PUBLIC -lcudart
llama_fmha)
add_executable(llama_gemm llama_gemm.cc)
target_link_libraries(llama_gemm PUBLIC -lcudart gpt_gemm_func memory_utils cuda_utils logger)
target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
......@@ -2,6 +2,7 @@
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/kernels/decoding_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/models/llama/Request.h"
......
......@@ -22,6 +22,7 @@
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
......@@ -265,40 +266,41 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
// flash attention
using AttentionOp = FlashAttentionOp<T>;
using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = int(local_head_num_ * max_q_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_q_len * size_per_head_)};
Layout layout_k{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_seq_len * size_per_head_),
.batch_seqs_offset = int(cache_layer_offset),
.batch_seqs = key_cache_ptrs};
Layout layout_v{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_seq_len * size_per_head_),
.batch_seqs_offset = int(cache_layer_offset),
.batch_seqs = val_cache_ptrs};
Layout layout_q{
int(local_head_num_ * max_q_len * size_per_head_), int(size_per_head_), int(max_q_len * size_per_head_)};
Layout layout_k{int(local_head_num_ * max_seq_len * size_per_head_),
int(size_per_head_),
int(max_seq_len * size_per_head_),
false,
int(cache_layer_offset),
key_cache_ptrs};
Layout layout_v{int(local_head_num_ * max_seq_len * size_per_head_),
int(size_per_head_),
int(max_seq_len * size_per_head_),
false,
int(cache_layer_offset),
val_cache_ptrs};
Layout layout_o{
.stride_batch = int(local_head_num_ * max_q_len * size_per_head_),
.stride_seq = int(local_head_num_ * size_per_head_),
.stride_head = int(size_per_head_),
.use_seqlens = true,
int(local_head_num_ * max_q_len * size_per_head_),
int(local_head_num_ * size_per_head_),
int(size_per_head_),
true,
};
size_t group_size = size_t(local_head_num_ / local_kv_head_num_);
AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
typename AttentionOp::Params attn_params{.attn_out = qkv_buf_3_,
.query = q_buf_2_,
.key = k_cache_buf_,
.val = v_cache_buf_,
.mask = attention_mask,
.out_accum = qk_buf_float_,
.cu_seqlens_q = cu_seqlens,
.cu_seqlens_k = nullptr,
.group_size = group_size,
.layout_q = layout_q,
.layout_k = layout_k,
.layout_v = layout_v,
.layout_o = layout_o};
typename AttentionOp::Params attn_params{qkv_buf_3_,
q_buf_2_,
k_cache_buf_,
v_cache_buf_,
attention_mask,
qk_buf_float_,
cu_seqlens,
nullptr,
group_size,
layout_q,
layout_k,
layout_v,
layout_o};
//
flash_attention(attn_params, stream_);
......
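The layout and parameter structs above were previously filled with designated initializers (.stride_batch = ..., .attn_out = ...), which became standard only in C++20; GCC and Clang tolerate them as an extension in earlier modes, but MSVC does not, which is presumably why the diff switches to positional aggregate initialization. The cost is that arguments must follow the members' declaration order and intermediate defaults (such as the false for use_seqlens) must be spelled out. A simplified, self-contained sketch of the trade-off; LayoutSketch and build_layouts are stand-ins, not the real AttentionOp::AttentionLayout type.

// Simplified stand-in for the attention layout aggregate; member order
// matters for positional initialization.
struct LayoutSketch {
    int   stride_batch;
    int   stride_seq;
    int   stride_head;
    bool  use_seqlens       = false;
    int   batch_seqs_offset = 0;
    void* batch_seqs        = nullptr;
};

void build_layouts(void** cache_ptrs)
{
    // Designated form (GCC/Clang extension before C++20):
    //   LayoutSketch k{.stride_batch = 4096, .stride_seq = 128,
    //                  .stride_head = 2048, .batch_seqs = cache_ptrs};
    //
    // Portable positional form used by the diff: members appear in
    // declaration order, so the default false/0 values are written out.
    LayoutSketch k{4096, 128, 2048, false, 0, cache_ptrs};
    LayoutSketch o{4096, 512, 128, true};
    (void)k;
    (void)o;
}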
......@@ -21,6 +21,7 @@
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
......
......@@ -20,6 +20,7 @@
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
......