change num_thread to 256

8ee4ae1f · zhuwenwen · e9aa4ff0 · 8ee4ae1f · 8ee4ae1f
Commit 8ee4ae1f authored Jul 02, 2024 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

README.md README.md +1 -1

csrc/attention/attention_kernels.cu csrc/attention/attention_kernels.cu +2 -2

No files found.
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
 + 若使用 pip install 下载安装过慢，可添加源：-i https://pypi.tuna.tsinghua.edu.cn/simple/

 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.4.3；
+- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.5.0；

 ## Known Issue
 - 无

--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -689,7 +689,7 @@ __global__ void paged_attention_v2_reduce_kernel(
 // TODO(woosuk): Tune NUM_THREADS.
 template <typename T, typename CACHE_T, int BLOCK_SIZE,
          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
-          int NUM_THREADS = 128>
+          int NUM_THREADS = 256>
 void paged_attention_v1_launcher(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int num_kv_heads, float scale,
@@ -844,7 +844,7 @@ void paged_attention_v1(

 template <typename T, typename CACHE_T, int BLOCK_SIZE,
          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
-          int NUM_THREADS = 128, int PARTITION_SIZE = 512>
+          int NUM_THREADS = 256, int PARTITION_SIZE = 512>
 void paged_attention_v2_launcher(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,