Unverified Commit 4c9959f6 authored by Chen Xin, committed by GitHub
Browse files

Support windows platform (#209)

* __PRETTY_FUNCTION__

* CASE_K

* uint

* remove not

* HALF_FLT_MAX

* struct init

* port utils

* better build pthread-win32

* port kernels

* port utils/gemm_test

* hide windows header

* port models

* port examples && triton_backend && unittests

* update build readme

* fix lint

* fix lint

* fix lint

* fix lint

* fix lint

* fix build

* fix build

* cmake version

* fix typos

* update ci

* port kernels/gemm_s_f16

* update ci

* fix ci

* use cudaStreamSynchronize instead of volatile check

* remove gettimeofday

* remove pthread-win32

* remove dirent.h

* update pre-commit

* update

* remove todo

* fix include

* fix build

* fix build

* fix build ci

* fix github action trigger

* update README

* fix linux-build ci

* remove windows folder

* fix lint

* update readme
parent 0d21f366
...@@ -311,6 +311,39 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf, ...@@ -311,6 +311,39 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
} }
} }
#ifdef _MSC_VER
// MSVC does not support the GNU case-range extension ("case K_MIN ... K_MAX:")
// used by the non-MSVC CASE_K below, so the dispatch is emulated with an
// if-range guard; the caller wraps the CASE_K chain in a do { ... } while (0)
// and the trailing `break` exits it once a matching range has launched.
// Stage-2 dynamic shared memory is sized with K_MAX (K_MAX ids + K_MAX vals),
// so every k in [K_MIN, K_MAX] shares one launch configuration.
// NOTE: comments cannot be placed inside the macro body — a // comment before
// a trailing backslash would splice the next line into the comment.
#define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
if (K_MIN <= max_top_k && max_top_k <= K_MAX) { \
topk_stage1<T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \
<<<batch_size * BLOCKS_PER_BEAM_, BLOCK_SIZE_1_, 0, stream>>>(log_probs, \
temp_log_probs, \
topk_tmp_id_buf, \
topk_tmp_val_buf, \
finished, \
max_top_k, \
top_ks, \
vocab_size, \
end_ids, \
skip_decode); \
topk_stage2_sampling<T, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_> \
<<<batch_size, BLOCK_SIZE_2_, K_MAX * sizeof(int) + K_MAX * sizeof(float), stream>>>(topk_tmp_id_buf, \
topk_tmp_val_buf, \
ids, \
sequence_length, \
finished, \
cum_log_probs, \
output_log_probs, \
max_top_k, \
top_ks, \
top_p, \
top_ps, \
curandstate, \
end_ids, \
vocab_size, \
skip_decode); \
break; \
}
#else
#define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \ #define CASE_K(K_MIN, K_MAX, BLOCK_SIZE_1_, BLOCK_SIZE_2_, BLOCKS_PER_BEAM_) \
case K_MIN ... K_MAX: \ case K_MIN ... K_MAX: \
topk_stage1<T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \ topk_stage1<T, BLOCK_SIZE_1_, BLOCKS_PER_BEAM_> \
...@@ -341,6 +374,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf, ...@@ -341,6 +374,7 @@ __global__ void topk_stage2_sampling(const int* __restrict topk_tmp_id_buf,
vocab_size, \ vocab_size, \
skip_decode); \ skip_decode); \
break; break;
#endif
template<typename T> template<typename T>
void invokeBatchTopKSampling(void* workspace, void invokeBatchTopKSampling(void* workspace,
...@@ -385,6 +419,15 @@ void invokeBatchTopKSampling(void* workspace, ...@@ -385,6 +419,15 @@ void invokeBatchTopKSampling(void* workspace,
int* topk_tmp_id_buf = (int*)(temp_log_probs + temp_log_probs_buf_size); int* topk_tmp_id_buf = (int*)(temp_log_probs + temp_log_probs_buf_size);
T* topk_tmp_val_buf = (T*)(topk_tmp_id_buf + topk_tmp_ids_buf_size); T* topk_tmp_val_buf = (T*)(topk_tmp_id_buf + topk_tmp_ids_buf_size);
#ifdef _MSC_VER
do {
CASE_K(1, 16, 128, 128, 8);
CASE_K(17, 32, 256, 128, 8);
CASE_K(33, 64, 256, 256, 8);
CASE_K(65, 1024, 256, 256, 8);
throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k));
} while (0);
#else
switch (max_top_k) { switch (max_top_k) {
CASE_K(1, 16, 128, 128, 8); CASE_K(1, 16, 128, 128, 8);
CASE_K(17, 32, 256, 128, 8); CASE_K(17, 32, 256, 128, 8);
...@@ -393,6 +436,7 @@ void invokeBatchTopKSampling(void* workspace, ...@@ -393,6 +436,7 @@ void invokeBatchTopKSampling(void* workspace,
default: default:
throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k)); throw std::domain_error(fmtstr("top-k kernel supports 1<=k<=1024 but got k=%d", max_top_k));
} }
#endif
} }
#undef CASE_K #undef CASE_K
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "src/turbomind/kernels/reduce_kernel_utils.cuh" #include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/stop_criteria_kernels.h" #include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
...@@ -94,7 +95,7 @@ void invokeStopWordsCriterion(const int* output_ids, ...@@ -94,7 +95,7 @@ void invokeStopWordsCriterion(const int* output_ids,
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
// Check if we have sampled a word from the stop_words list. If so, stop the sequence. // Check if we have sampled a word from the stop_words list. If so, stop the sequence.
dim3 block, grid; dim3 block, grid;
block.x = min(((stop_words_len + 32 - 1) / 32) * 32, 256UL); block.x = min((unsigned long)((stop_words_len + 32 - 1) / 32) * 32, 256UL);
grid.x = (stop_words_len + block.x - 1) / block.x; grid.x = (stop_words_len + block.x - 1) / block.x;
grid.y = batch_size * beam_width; grid.y = batch_size * beam_width;
...@@ -150,7 +151,11 @@ void invokeLengthCriterion(bool* finished, ...@@ -150,7 +151,11 @@ void invokeLengthCriterion(bool* finished,
length_criterion<<<grid, block, 0, stream>>>( length_criterion<<<grid, block, 0, stream>>>(
finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step); finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step);
#ifdef _MSC_VER
cudaStreamSynchronize(stream);
#else
while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {}; while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {};
#endif
sync_check_cuda_error(); sync_check_cuda_error();
*should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width; *should_stop = h_pinned_finished_sum_[0] == batch_size * beam_width;
......
...@@ -16,8 +16,9 @@ cmake_minimum_required(VERSION 3.8) ...@@ -16,8 +16,9 @@ cmake_minimum_required(VERSION 3.8)
add_subdirectory(sampling_layers) add_subdirectory(sampling_layers)
find_package(CUDAToolkit REQUIRED)
add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc) add_library(DynamicDecodeLayer STATIC DynamicDecodeLayer.cc)
set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET DynamicDecodeLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart TopKSamplingLayer target_link_libraries(DynamicDecodeLayer PUBLIC CUDA::cudart TopKSamplingLayer
TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils) TopPSamplingLayer ban_bad_words stop_criteria gpt_kernels tensor nvtx_utils)
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#pragma once #pragma once
#include "src/turbomind/utils/cuda_fp8_utils.h" #include "src/turbomind/utils/cuda_fp8_utils.h"
#include "stdlib.h" #include "stdlib.h"
#include <cstdint>
namespace turbomind { namespace turbomind {
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "src/turbomind/kernels/stop_criteria_kernels.h" #include "src/turbomind/kernels/stop_criteria_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
namespace turbomind { namespace turbomind {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "src/turbomind/layers/FfnLayer.h" #include "src/turbomind/layers/FfnLayer.h"
#include "src/turbomind/kernels/transpose_int8_kernels.h" #include "src/turbomind/kernels/transpose_int8_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/nvtx_utils.h" #include "src/turbomind/utils/nvtx_utils.h"
namespace turbomind { namespace turbomind {
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/kernels/sampling_penalty_kernels.h" #include "src/turbomind/kernels/sampling_penalty_kernels.h"
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/cuda_utils.h" #include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
......
...@@ -14,17 +14,19 @@ ...@@ -14,17 +14,19 @@
cmake_minimum_required(VERSION 3.8) cmake_minimum_required(VERSION 3.8)
find_package(CUDAToolkit REQUIRED)
add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc) add_library(BaseSamplingLayer STATIC BaseSamplingLayer.cc)
set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET BaseSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET BaseSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(BaseSamplingLayer PUBLIC -lcudart sampling_penalty_kernels memory_utils) target_link_libraries(BaseSamplingLayer PUBLIC CUDA::cudart sampling_penalty_kernels memory_utils)
add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu) add_library(TopKSamplingLayer STATIC TopKSamplingLayer.cu)
set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET TopKSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET TopKSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopKSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels) target_link_libraries(TopKSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels)
add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu) add_library(TopPSamplingLayer STATIC TopPSamplingLayer.cu)
set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET TopPSamplingLayer PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET TopPSamplingLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(TopPSamplingLayer PUBLIC -lcudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels) target_link_libraries(TopPSamplingLayer PUBLIC CUDA::cudart BaseSamplingLayer sampling_topk_kernels sampling_topp_kernels)
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h" #include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopKSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#pragma once #pragma once
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
namespace turbomind { namespace turbomind {
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "src/turbomind/kernels/sampling_topk_kernels.h" #include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/kernels/sampling_topp_kernels.h" #include "src/turbomind/kernels/sampling_topp_kernels.h"
#include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/TopPSamplingLayer.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/memory_utils.h" #include "src/turbomind/utils/memory_utils.h"
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#pragma once #pragma once
#include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h" #include "src/turbomind/layers/sampling_layers/BaseSamplingLayer.h"
#include "src/turbomind/macro.h"
namespace turbomind { namespace turbomind {
......
#pragma once
// Portability shims for building on Windows/MSVC.
// __PRETTY_FUNCTION__ is a GCC/Clang extension (not a macro on GCC, hence the
// !defined(__GNUC__) guard); map it to MSVC's closest equivalent, __FUNCSIG__,
// so logging such as TM_LOG_DEBUG("%s", __PRETTY_FUNCTION__) builds everywhere.
#if !defined(__PRETTY_FUNCTION__) && !defined(__GNUC__)
#define __PRETTY_FUNCTION__ __FUNCSIG__
#endif
// `uint` is a POSIX/glibc convenience typedef that Windows headers lack.
typedef unsigned int uint;
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "src/turbomind/macro.h"
#include <string> #include <string>
#include <vector> #include <vector>
......
...@@ -3,10 +3,34 @@ ...@@ -3,10 +3,34 @@
#pragma once #pragma once
#include "src/turbomind/utils/logger.h" #include "src/turbomind/utils/logger.h"
#ifndef _MSC_VER
#include <pthread.h> #include <pthread.h>
#endif
namespace turbomind { namespace turbomind {
#ifdef _MSC_VER
// Windows stub for the pthread-based Barrier below: pthread_barrier_t is not
// available with MSVC. This build assumes a single participating thread
// (FT_CHECK enforces count == 1 at construction), so wait() is a no-op.
class Barrier {
public:
    // count: number of threads expected at the barrier; must be 1 on Windows.
    Barrier(unsigned count)
    {
        TM_LOG_INFO("Barrier(%d)", (int)count);
        FT_CHECK(count == 1);
    }

    // Non-copyable and non-movable, matching the pthread-backed variant.
    Barrier(const Barrier&) = delete;
    Barrier& operator=(const Barrier&) = delete;
    Barrier(Barrier&&) noexcept = delete;
    Barrier& operator=(Barrier&&) noexcept = delete;

    // No-op: with a single participant there is nothing to wait for.
    void wait() {}

    ~Barrier() {}
};
#else
class Barrier { class Barrier {
public: public:
Barrier(unsigned count) Barrier(unsigned count)
...@@ -34,4 +58,6 @@ private: ...@@ -34,4 +58,6 @@ private:
pthread_barrier_t barrier_{}; pthread_barrier_t barrier_{};
}; };
#endif
} // namespace turbomind } // namespace turbomind
...@@ -4,6 +4,8 @@ cmake_minimum_required(VERSION 3.8) ...@@ -4,6 +4,8 @@ cmake_minimum_required(VERSION 3.8)
add_subdirectory(fused_multi_head_attention) add_subdirectory(fused_multi_head_attention)
find_package(CUDAToolkit REQUIRED)
add_library(Llama STATIC add_library(Llama STATIC
LlamaV2.cc LlamaV2.cc
LlamaBatch.cc LlamaBatch.cc
...@@ -20,7 +22,7 @@ add_library(Llama STATIC ...@@ -20,7 +22,7 @@ add_library(Llama STATIC
llama_utils.cu) llama_utils.cu)
set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON) set_property(TARGET Llama PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) set_property(TARGET Llama PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(Llama PUBLIC -lcudart target_link_libraries(Llama PUBLIC CUDA::cudart
gemm_s4_f16 gemm_s4_f16
cublasMMWrapper cublasMMWrapper
DynamicDecodeLayer DynamicDecodeLayer
...@@ -40,4 +42,4 @@ target_link_libraries(Llama PUBLIC -lcudart ...@@ -40,4 +42,4 @@ target_link_libraries(Llama PUBLIC -lcudart
llama_fmha) llama_fmha)
add_executable(llama_gemm llama_gemm.cc) add_executable(llama_gemm llama_gemm.cc)
target_link_libraries(llama_gemm PUBLIC -lcudart gpt_gemm_func memory_utils cuda_utils logger) target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#include "src/turbomind/models/llama/LlamaBatch.h" #include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/kernels/decoding_kernels.h" #include "src/turbomind/kernels/decoding_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaV2.h" #include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/models/llama/Request.h" #include "src/turbomind/models/llama/Request.h"
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h" #include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h" #include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h" #include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h" #include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/llama_utils.h"
...@@ -265,40 +266,41 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr ...@@ -265,40 +266,41 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
// flash attention // flash attention
using AttentionOp = FlashAttentionOp<T>; using AttentionOp = FlashAttentionOp<T>;
using Layout = typename AttentionOp::AttentionLayout; using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = int(local_head_num_ * max_q_len * size_per_head_), Layout layout_q{
.stride_seq = int(size_per_head_), int(local_head_num_ * max_q_len * size_per_head_), int(size_per_head_), int(max_q_len * size_per_head_)};
.stride_head = int(max_q_len * size_per_head_)}; Layout layout_k{int(local_head_num_ * max_seq_len * size_per_head_),
Layout layout_k{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_), int(size_per_head_),
.stride_seq = int(size_per_head_), int(max_seq_len * size_per_head_),
.stride_head = int(max_seq_len * size_per_head_), false,
.batch_seqs_offset = int(cache_layer_offset), int(cache_layer_offset),
.batch_seqs = key_cache_ptrs}; key_cache_ptrs};
Layout layout_v{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_), Layout layout_v{int(local_head_num_ * max_seq_len * size_per_head_),
.stride_seq = int(size_per_head_), int(size_per_head_),
.stride_head = int(max_seq_len * size_per_head_), int(max_seq_len * size_per_head_),
.batch_seqs_offset = int(cache_layer_offset), false,
.batch_seqs = val_cache_ptrs}; int(cache_layer_offset),
val_cache_ptrs};
Layout layout_o{ Layout layout_o{
.stride_batch = int(local_head_num_ * max_q_len * size_per_head_), int(local_head_num_ * max_q_len * size_per_head_),
.stride_seq = int(local_head_num_ * size_per_head_), int(local_head_num_ * size_per_head_),
.stride_head = int(size_per_head_), int(size_per_head_),
.use_seqlens = true, true,
}; };
size_t group_size = size_t(local_head_num_ / local_kv_head_num_); size_t group_size = size_t(local_head_num_ / local_kv_head_num_);
AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_); AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
typename AttentionOp::Params attn_params{.attn_out = qkv_buf_3_, typename AttentionOp::Params attn_params{qkv_buf_3_,
.query = q_buf_2_, q_buf_2_,
.key = k_cache_buf_, k_cache_buf_,
.val = v_cache_buf_, v_cache_buf_,
.mask = attention_mask, attention_mask,
.out_accum = qk_buf_float_, qk_buf_float_,
.cu_seqlens_q = cu_seqlens, cu_seqlens,
.cu_seqlens_k = nullptr, nullptr,
.group_size = group_size, group_size,
.layout_q = layout_q, layout_q,
.layout_k = layout_k, layout_k,
.layout_v = layout_v, layout_v,
.layout_o = layout_o}; layout_o};
// //
flash_attention(attn_params, stream_); flash_attention(attn_params, stream_);
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "src/turbomind/models/llama/LlamaContextDecoder.h" #include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h" #include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h" #include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h" #include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc // https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/turbomind/models/llama/LlamaDecoder.h" #include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h" #include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h" #include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h" #include "src/turbomind/models/llama/llama_utils.h"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment