Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f48954a4
Commit
f48954a4
authored
Jun 12, 2024
by
zhuwenwen
Browse files
merge v0.5.0
parents
1dba29d3
8f89d720
Changes
253
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
267 additions
and
170 deletions
+267
-170
cmake/cpu_extension.cmake
cmake/cpu_extension.cmake
+6
-6
cmake/utils.cmake
cmake/utils.cmake
+8
-3
collect_env.py
collect_env.py
+7
-0
csrc/activation_kernels.cu
csrc/activation_kernels.cu
+1
-1
csrc/attention/attention_kernels.cu
csrc/attention/attention_kernels.cu
+18
-16
csrc/cache.h
csrc/cache.h
+9
-5
csrc/cache_kernels.cu
csrc/cache_kernels.cu
+8
-5
csrc/cpu/attention.cpp
csrc/cpu/attention.cpp
+14
-12
csrc/cpu/cache.cpp
csrc/cpu/cache.cpp
+8
-5
csrc/cpu/cpu_types.hpp
csrc/cpu/cpu_types.hpp
+1
-1
csrc/cpu/layernorm.cpp
csrc/cpu/layernorm.cpp
+2
-2
csrc/cpu/pos_encoding.cpp
csrc/cpu/pos_encoding.cpp
+52
-51
csrc/cpu/pybind.cpp
csrc/cpu/pybind.cpp
+0
-44
csrc/cpu/torch_bindings.cpp
csrc/cpu/torch_bindings.cpp
+106
-0
csrc/cuda_compat.h
csrc/cuda_compat.h
+4
-0
csrc/cuda_utils.h
csrc/cuda_utils.h
+2
-4
csrc/cuda_utils_kernels.cu
csrc/cuda_utils_kernels.cu
+3
-3
csrc/custom_all_reduce.cu
csrc/custom_all_reduce.cu
+14
-8
csrc/dispatch_utils.h
csrc/dispatch_utils.h
+1
-1
csrc/layernorm_kernels.cu
csrc/layernorm_kernels.cu
+3
-3
No files found.
cmake/cpu_extension.cmake
View file @
f48954a4
...
@@ -73,7 +73,7 @@ set(VLLM_EXT_SRC
...
@@ -73,7 +73,7 @@ set(VLLM_EXT_SRC
"csrc/cpu/cache.cpp"
"csrc/cpu/cache.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/layernorm.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/pos_encoding.cpp"
"csrc/cpu/
pybind
.cpp"
)
"csrc/cpu/
torch_bindings
.cpp"
)
define_gpu_extension_target
(
define_gpu_extension_target
(
_C
_C
...
@@ -81,10 +81,10 @@ define_gpu_extension_target(
...
@@ -81,10 +81,10 @@ define_gpu_extension_target(
LANGUAGE CXX
LANGUAGE CXX
SOURCES
${
VLLM_EXT_SRC
}
SOURCES
${
VLLM_EXT_SRC
}
COMPILE_FLAGS
${
CXX_COMPILE_FLAGS
}
COMPILE_FLAGS
${
CXX_COMPILE_FLAGS
}
USE_SABI 3
WITH_SOABI
WITH_SOABI
)
)
add_custom_target
(
default
)
add_custom_target
(
default
)
message
(
STATUS
"Enabling C extension."
)
message
(
STATUS
"Enabling C extension."
)
add_dependencies
(
default _C
)
add_dependencies
(
default _C
)
cmake/utils.cmake
View file @
f48954a4
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
macro
(
find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS
)
macro
(
find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS
)
file
(
REAL_PATH
${
EXECUTABLE
}
EXECUTABLE
)
file
(
REAL_PATH
${
EXECUTABLE
}
EXECUTABLE
)
set
(
Python_EXECUTABLE
${
EXECUTABLE
}
)
set
(
Python_EXECUTABLE
${
EXECUTABLE
}
)
find_package
(
Python COMPONENTS Interpreter Development.Module
)
find_package
(
Python COMPONENTS Interpreter Development.Module
Development.SABIModule
)
if
(
NOT Python_FOUND
)
if
(
NOT Python_FOUND
)
message
(
FATAL_ERROR
"Unable to find python matching:
${
EXECUTABLE
}
."
)
message
(
FATAL_ERROR
"Unable to find python matching:
${
EXECUTABLE
}
."
)
endif
()
endif
()
...
@@ -295,6 +295,7 @@ endmacro()
...
@@ -295,6 +295,7 @@ endmacro()
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
# LIBRARIES <libraries> - Extra link libraries.
# LIBRARIES <libraries> - Extra link libraries.
# WITH_SOABI - Generate library with python SOABI suffix name.
# WITH_SOABI - Generate library with python SOABI suffix name.
# USE_SABI <version> - Use python stable api <version>
#
#
# Note: optimization level/debug info is set via cmake build type.
# Note: optimization level/debug info is set via cmake build type.
#
#
...
@@ -302,7 +303,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
...
@@ -302,7 +303,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
cmake_parse_arguments
(
PARSE_ARGV 1
cmake_parse_arguments
(
PARSE_ARGV 1
GPU
GPU
"WITH_SOABI"
"WITH_SOABI"
"DESTINATION;LANGUAGE"
"DESTINATION;LANGUAGE
;USE_SABI
"
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES"
)
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES"
)
# Add hipify preprocessing step when building with HIP/ROCm.
# Add hipify preprocessing step when building with HIP/ROCm.
...
@@ -316,7 +317,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
...
@@ -316,7 +317,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
set
(
GPU_WITH_SOABI
)
set
(
GPU_WITH_SOABI
)
endif
()
endif
()
Python_add_library
(
${
GPU_MOD_NAME
}
MODULE
"
${
GPU_SOURCES
}
"
${
GPU_WITH_SOABI
}
)
if
(
GPU_USE_SABI
)
Python_add_library
(
${
GPU_MOD_NAME
}
MODULE USE_SABI
${
GPU_USE_SABI
}
${
GPU_WITH_SOABI
}
"
${
GPU_SOURCES
}
"
)
else
()
Python_add_library
(
${
GPU_MOD_NAME
}
MODULE
${
GPU_WITH_SOABI
}
"
${
GPU_SOURCES
}
"
)
endif
()
if
(
GPU_LANGUAGE STREQUAL
"HIP"
)
if
(
GPU_LANGUAGE STREQUAL
"HIP"
)
# Make this target dependent on the hipify preprocessor step.
# Make this target dependent on the hipify preprocessor step.
...
...
collect_env.py
View file @
f48954a4
...
@@ -64,6 +64,7 @@ DEFAULT_CONDA_PATTERNS = {
...
@@ -64,6 +64,7 @@ DEFAULT_CONDA_PATTERNS = {
"triton"
,
"triton"
,
"optree"
,
"optree"
,
"nccl"
,
"nccl"
,
"transformers"
,
}
}
DEFAULT_PIP_PATTERNS
=
{
DEFAULT_PIP_PATTERNS
=
{
...
@@ -75,6 +76,7 @@ DEFAULT_PIP_PATTERNS = {
...
@@ -75,6 +76,7 @@ DEFAULT_PIP_PATTERNS = {
"optree"
,
"optree"
,
"onnx"
,
"onnx"
,
"nccl"
,
"nccl"
,
"transformers"
,
}
}
...
@@ -601,6 +603,11 @@ Versions of relevant libraries:
...
@@ -601,6 +603,11 @@ Versions of relevant libraries:
{conda_packages}
{conda_packages}
"""
.
strip
()
"""
.
strip
()
# both the above code and the following code use `strip()` to
# remove leading/trailing whitespaces, so we need to add a newline
# in between to separate the two sections
env_info_fmt
+=
"
\n
"
env_info_fmt
+=
"""
env_info_fmt
+=
"""
ROCM Version: {rocm_version}
ROCM Version: {rocm_version}
Neuron SDK Version: {neuron_sdk_version}
Neuron SDK Version: {neuron_sdk_version}
...
...
csrc/activation_kernels.cu
View file @
f48954a4
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/
extension
.h>
#include <torch/
all
.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAGuard.h>
#include <cmath>
#include <cmath>
...
...
csrc/attention/attention_kernels.cu
View file @
f48954a4
...
@@ -17,7 +17,7 @@
...
@@ -17,7 +17,7 @@
* limitations under the License.
* limitations under the License.
*/
*/
#include <torch/
extension
.h>
#include <torch/
all
.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAGuard.h>
#include <algorithm>
#include <algorithm>
...
@@ -809,15 +809,16 @@ void paged_attention_v1(
...
@@ -809,15 +809,16 @@ void paged_attention_v1(
key_cache
,
// [num_blocks, num_heads, head_size/x, block_size, x]
key_cache
,
// [num_blocks, num_heads, head_size/x, block_size, x]
torch
::
Tensor
&
torch
::
Tensor
&
value_cache
,
// [num_blocks, num_heads, head_size, block_size]
value_cache
,
// [num_blocks, num_heads, head_size, block_size]
int
num_kv_heads
,
// [num_heads]
int
64_t
num_kv_heads
,
// [num_heads]
float
scale
,
double
scale
,
torch
::
Tensor
&
block_tables
,
// [num_seqs, max_num_blocks_per_seq]
torch
::
Tensor
&
block_tables
,
// [num_seqs, max_num_blocks_per_seq]
torch
::
Tensor
&
seq_lens
,
// [num_seqs]
torch
::
Tensor
&
seq_lens
,
// [num_seqs]
int
block_size
,
int
max_seq_len
,
int
64_t
block_size
,
int
64_t
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
float
kv_scale
,
const
int
tp_rank
,
const
std
::
string
&
kv_cache_dtype
,
double
kv_scale
,
const
int64_t
tp_rank
,
const
int
blocksparse_local_blocks
,
const
int
blocksparse_vert_stride
,
const
int64_t
blocksparse_local_blocks
,
const
int
blocksparse_block_size
,
const
int
blocksparse_head_sliding_step
)
{
const
int64_t
blocksparse_vert_stride
,
const
int64_t
blocksparse_block_size
,
const
int64_t
blocksparse_head_sliding_step
)
{
const
bool
is_block_sparse
=
(
blocksparse_vert_stride
>
1
);
const
bool
is_block_sparse
=
(
blocksparse_vert_stride
>
1
);
DISPATCH_BY_KV_CACHE_DTYPE
(
query
.
dtype
(),
kv_cache_dtype
,
DISPATCH_BY_KV_CACHE_DTYPE
(
query
.
dtype
(),
kv_cache_dtype
,
...
@@ -973,15 +974,16 @@ void paged_attention_v2(
...
@@ -973,15 +974,16 @@ void paged_attention_v2(
key_cache
,
// [num_blocks, num_heads, head_size/x, block_size, x]
key_cache
,
// [num_blocks, num_heads, head_size/x, block_size, x]
torch
::
Tensor
&
torch
::
Tensor
&
value_cache
,
// [num_blocks, num_heads, head_size, block_size]
value_cache
,
// [num_blocks, num_heads, head_size, block_size]
int
num_kv_heads
,
// [num_heads]
int
64_t
num_kv_heads
,
// [num_heads]
float
scale
,
double
scale
,
torch
::
Tensor
&
block_tables
,
// [num_seqs, max_num_blocks_per_seq]
torch
::
Tensor
&
block_tables
,
// [num_seqs, max_num_blocks_per_seq]
torch
::
Tensor
&
seq_lens
,
// [num_seqs]
torch
::
Tensor
&
seq_lens
,
// [num_seqs]
int
block_size
,
int
max_seq_len
,
int
64_t
block_size
,
int
64_t
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
float
kv_scale
,
const
int
tp_rank
,
const
std
::
string
&
kv_cache_dtype
,
double
kv_scale
,
const
int64_t
tp_rank
,
const
int
blocksparse_local_blocks
,
const
int
blocksparse_vert_stride
,
const
int64_t
blocksparse_local_blocks
,
const
int
blocksparse_block_size
,
const
int
blocksparse_head_sliding_step
)
{
const
int64_t
blocksparse_vert_stride
,
const
int64_t
blocksparse_block_size
,
const
int64_t
blocksparse_head_sliding_step
)
{
const
bool
is_block_sparse
=
(
blocksparse_vert_stride
>
1
);
const
bool
is_block_sparse
=
(
blocksparse_vert_stride
>
1
);
DISPATCH_BY_KV_CACHE_DTYPE
(
query
.
dtype
(),
kv_cache_dtype
,
DISPATCH_BY_KV_CACHE_DTYPE
(
query
.
dtype
(),
kv_cache_dtype
,
CALL_V2_LAUNCHER_BLOCK_SIZE
)
CALL_V2_LAUNCHER_BLOCK_SIZE
)
...
...
csrc/cache.h
View file @
f48954a4
#pragma once
#pragma once
#include <torch/
extension
.h>
#include <torch/
all
.h>
#include <map>
#include <map>
#include <vector>
#include <vector>
...
@@ -8,14 +8,18 @@
...
@@ -8,14 +8,18 @@
void
swap_blocks
(
torch
::
Tensor
&
src
,
torch
::
Tensor
&
dst
,
void
swap_blocks
(
torch
::
Tensor
&
src
,
torch
::
Tensor
&
dst
,
const
torch
::
Tensor
&
block_mapping
);
const
torch
::
Tensor
&
block_mapping
);
void
copy_blocks
(
std
::
vector
<
torch
::
Tensor
>&
key_caches
,
// Note: the key_caches and value_caches vectors are constant but
std
::
vector
<
torch
::
Tensor
>&
value_caches
,
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void
copy_blocks
(
std
::
vector
<
torch
::
Tensor
>
const
&
key_caches
,
std
::
vector
<
torch
::
Tensor
>
const
&
value_caches
,
const
torch
::
Tensor
&
block_mapping
);
const
torch
::
Tensor
&
block_mapping
);
void
reshape_and_cache
(
torch
::
Tensor
&
key
,
torch
::
Tensor
&
value
,
void
reshape_and_cache
(
torch
::
Tensor
&
key
,
torch
::
Tensor
&
value
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
value_cache
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
value_cache
,
torch
::
Tensor
&
slot_mapping
,
torch
::
Tensor
&
slot_mapping
,
const
std
::
string
&
kv_cache_dtype
,
const
float
kv_scale
);
const
std
::
string
&
kv_cache_dtype
,
const
double
kv_scale
);
void
reshape_and_cache_flash
(
torch
::
Tensor
&
key
,
torch
::
Tensor
&
value
,
void
reshape_and_cache_flash
(
torch
::
Tensor
&
key
,
torch
::
Tensor
&
value
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
key_cache
,
...
@@ -25,4 +29,4 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
...
@@ -25,4 +29,4 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
// Just for unittest
// Just for unittest
void
convert_fp8
(
torch
::
Tensor
&
dst_cache
,
torch
::
Tensor
&
src_cache
,
void
convert_fp8
(
torch
::
Tensor
&
dst_cache
,
torch
::
Tensor
&
src_cache
,
const
float
scale
,
const
std
::
string
&
kv_cache_dtype
);
const
double
scale
,
const
std
::
string
&
kv_cache_dtype
);
csrc/cache_kernels.cu
View file @
f48954a4
#include <torch/
extension
.h>
#include <torch/
all
.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAGuard.h>
...
@@ -95,8 +95,11 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
...
@@ -95,8 +95,11 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
}
// namespace vllm
}
// namespace vllm
void
copy_blocks
(
std
::
vector
<
torch
::
Tensor
>&
key_caches
,
// Note: the key_caches and value_caches vectors are constant but
std
::
vector
<
torch
::
Tensor
>&
value_caches
,
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void
copy_blocks
(
std
::
vector
<
torch
::
Tensor
>
const
&
key_caches
,
std
::
vector
<
torch
::
Tensor
>
const
&
value_caches
,
const
torch
::
Tensor
&
block_mapping
)
{
const
torch
::
Tensor
&
block_mapping
)
{
int
num_layers
=
key_caches
.
size
();
int
num_layers
=
key_caches
.
size
();
TORCH_CHECK
(
num_layers
==
value_caches
.
size
());
TORCH_CHECK
(
num_layers
==
value_caches
.
size
());
...
@@ -255,7 +258,7 @@ void reshape_and_cache(
...
@@ -255,7 +258,7 @@ void reshape_and_cache(
torch
::
Tensor
&
torch
::
Tensor
&
value_cache
,
// [num_blocks, num_heads, head_size, block_size]
value_cache
,
// [num_blocks, num_heads, head_size, block_size]
torch
::
Tensor
&
slot_mapping
,
// [num_tokens]
torch
::
Tensor
&
slot_mapping
,
// [num_tokens]
const
std
::
string
&
kv_cache_dtype
,
const
float
kv_scale
)
{
const
std
::
string
&
kv_cache_dtype
,
const
double
kv_scale
)
{
int
num_tokens
=
key
.
size
(
0
);
int
num_tokens
=
key
.
size
(
0
);
int
num_heads
=
key
.
size
(
1
);
int
num_heads
=
key
.
size
(
1
);
int
head_size
=
key
.
size
(
2
);
int
head_size
=
key
.
size
(
2
);
...
@@ -334,7 +337,7 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,
...
@@ -334,7 +337,7 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,
// Only for testing.
// Only for testing.
void
convert_fp8
(
torch
::
Tensor
&
dst_cache
,
torch
::
Tensor
&
src_cache
,
void
convert_fp8
(
torch
::
Tensor
&
dst_cache
,
torch
::
Tensor
&
src_cache
,
const
float
kv_scale
,
const
std
::
string
&
kv_cache_dtype
)
{
const
double
kv_scale
,
const
std
::
string
&
kv_cache_dtype
)
{
torch
::
Device
src_device
=
src_cache
.
device
();
torch
::
Device
src_device
=
src_cache
.
device
();
torch
::
Device
dst_device
=
dst_cache
.
device
();
torch
::
Device
dst_device
=
dst_cache
.
device
();
TORCH_CHECK
(
src_device
.
is_cuda
(),
"src must be on a GPU"
)
TORCH_CHECK
(
src_device
.
is_cuda
(),
"src must be on a GPU"
)
...
...
csrc/cpu/attention.cpp
View file @
f48954a4
...
@@ -420,12 +420,13 @@ void paged_attention_v1_impl_launcher(
...
@@ -420,12 +420,13 @@ void paged_attention_v1_impl_launcher(
void
paged_attention_v1
(
void
paged_attention_v1
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
query
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
out
,
torch
::
Tensor
&
query
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
value_cache
,
int
num_kv_heads
,
float
scale
,
torch
::
Tensor
&
value_cache
,
int64_t
num_kv_heads
,
double
scale
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
seq_lens
,
int
block_size
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
seq_lens
,
int64_t
block_size
,
int
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
int64_t
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
float
kv_scale
,
const
int
tp_rank
,
const
std
::
string
&
kv_cache_dtype
,
double
kv_scale
,
const
int64_t
tp_rank
,
const
int
blocksparse_local_blocks
,
const
int
blocksparse_vert_stride
,
const
int64_t
blocksparse_local_blocks
,
const
int
blocksparse_block_size
,
const
int
blocksparse_head_sliding_step
)
{
const
int64_t
blocksparse_vert_stride
,
const
int64_t
blocksparse_block_size
,
const
int64_t
blocksparse_head_sliding_step
)
{
TORCH_CHECK
(
kv_scale
==
1.0
f
);
TORCH_CHECK
(
kv_scale
==
1.0
f
);
TORCH_CHECK
(
blocksparse_vert_stride
<=
1
,
TORCH_CHECK
(
blocksparse_vert_stride
<=
1
,
"CPU backend does not support blocksparse attention yet."
);
"CPU backend does not support blocksparse attention yet."
);
...
@@ -738,12 +739,13 @@ void paged_attention_v2_impl_launcher(
...
@@ -738,12 +739,13 @@ void paged_attention_v2_impl_launcher(
void
paged_attention_v2
(
void
paged_attention_v2
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
exp_sums
,
torch
::
Tensor
&
max_logits
,
torch
::
Tensor
&
out
,
torch
::
Tensor
&
exp_sums
,
torch
::
Tensor
&
max_logits
,
torch
::
Tensor
&
tmp_out
,
torch
::
Tensor
&
query
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
tmp_out
,
torch
::
Tensor
&
query
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
value_cache
,
int
num_kv_heads
,
float
scale
,
torch
::
Tensor
&
value_cache
,
int64_t
num_kv_heads
,
double
scale
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
seq_lens
,
int
block_size
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
seq_lens
,
int64_t
block_size
,
int
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
int64_t
max_seq_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
float
kv_scale
,
const
int
tp_rank
,
const
std
::
string
&
kv_cache_dtype
,
double
kv_scale
,
const
int64_t
tp_rank
,
const
int
blocksparse_local_blocks
,
const
int
blocksparse_vert_stride
,
const
int64_t
blocksparse_local_blocks
,
const
int
blocksparse_block_size
,
const
int
blocksparse_head_sliding_step
)
{
const
int64_t
blocksparse_vert_stride
,
const
int64_t
blocksparse_block_size
,
const
int64_t
blocksparse_head_sliding_step
)
{
TORCH_CHECK
(
kv_scale
==
1.0
f
);
TORCH_CHECK
(
kv_scale
==
1.0
f
);
TORCH_CHECK
(
blocksparse_vert_stride
<=
1
,
TORCH_CHECK
(
blocksparse_vert_stride
<=
1
,
"CPU backend does not support blocksparse attention yet."
);
"CPU backend does not support blocksparse attention yet."
);
...
...
csrc/cpu/cache.cpp
View file @
f48954a4
...
@@ -5,8 +5,8 @@
...
@@ -5,8 +5,8 @@
namespace
{
namespace
{
template
<
typename
scalar_t
>
template
<
typename
scalar_t
>
void
copy_blocks_cpu_impl
(
std
::
vector
<
torch
::
Tensor
>&
key_caches
,
void
copy_blocks_cpu_impl
(
std
::
vector
<
torch
::
Tensor
>
const
&
key_caches
,
std
::
vector
<
torch
::
Tensor
>&
value_caches
,
std
::
vector
<
torch
::
Tensor
>
const
&
value_caches
,
const
torch
::
Tensor
&
mapping_pairs
,
const
torch
::
Tensor
&
mapping_pairs
,
const
int
element_num_per_block
,
const
int
element_num_per_block
,
const
int
layer_num
)
{
const
int
layer_num
)
{
...
@@ -82,8 +82,11 @@ void reshape_and_cache_cpu_impl(
...
@@ -82,8 +82,11 @@ void reshape_and_cache_cpu_impl(
}
}
};
// namespace
};
// namespace
void
copy_blocks
(
std
::
vector
<
torch
::
Tensor
>&
key_caches
,
// Note: the key_caches and value_caches vectors are constant but
std
::
vector
<
torch
::
Tensor
>&
value_caches
,
// not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code.
void
copy_blocks
(
std
::
vector
<
torch
::
Tensor
>
const
&
key_caches
,
std
::
vector
<
torch
::
Tensor
>
const
&
value_caches
,
const
torch
::
Tensor
&
block_mapping
)
{
const
torch
::
Tensor
&
block_mapping
)
{
unsigned
num_layers
=
key_caches
.
size
();
unsigned
num_layers
=
key_caches
.
size
();
TORCH_CHECK
(
num_layers
==
value_caches
.
size
());
TORCH_CHECK
(
num_layers
==
value_caches
.
size
());
...
@@ -104,7 +107,7 @@ void copy_blocks(std::vector<torch::Tensor>& key_caches,
...
@@ -104,7 +107,7 @@ void copy_blocks(std::vector<torch::Tensor>& key_caches,
void
reshape_and_cache
(
torch
::
Tensor
&
key
,
torch
::
Tensor
&
value
,
void
reshape_and_cache
(
torch
::
Tensor
&
key
,
torch
::
Tensor
&
value
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
value_cache
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
value_cache
,
torch
::
Tensor
&
slot_mapping
,
torch
::
Tensor
&
slot_mapping
,
const
std
::
string
&
kv_cache_dtype
,
float
kv_scale
)
{
const
std
::
string
&
kv_cache_dtype
,
double
kv_scale
)
{
TORCH_CHECK
(
kv_scale
==
1.0
f
);
TORCH_CHECK
(
kv_scale
==
1.0
f
);
int
num_tokens
=
key
.
size
(
0
);
int
num_tokens
=
key
.
size
(
0
);
...
...
csrc/cpu/cpu_types.hpp
View file @
f48954a4
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
#define CPU_TYPES_HPP
#define CPU_TYPES_HPP
#include <immintrin.h>
#include <immintrin.h>
#include <torch/
extension
.h>
#include <torch/
all
.h>
namespace
vec_op
{
namespace
vec_op
{
...
...
csrc/cpu/layernorm.cpp
View file @
f48954a4
...
@@ -88,7 +88,7 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
...
@@ -88,7 +88,7 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
}
// namespace
}
// namespace
void
rms_norm
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
,
torch
::
Tensor
&
weight
,
void
rms_norm
(
torch
::
Tensor
&
out
,
torch
::
Tensor
&
input
,
torch
::
Tensor
&
weight
,
float
epsilon
)
{
double
epsilon
)
{
int
hidden_size
=
input
.
size
(
-
1
);
int
hidden_size
=
input
.
size
(
-
1
);
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
...
@@ -102,7 +102,7 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
...
@@ -102,7 +102,7 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
}
}
void
fused_add_rms_norm
(
torch
::
Tensor
&
input
,
torch
::
Tensor
&
residual
,
void
fused_add_rms_norm
(
torch
::
Tensor
&
input
,
torch
::
Tensor
&
residual
,
torch
::
Tensor
&
weight
,
float
epsilon
)
{
torch
::
Tensor
&
weight
,
double
epsilon
)
{
int
hidden_size
=
input
.
size
(
-
1
);
int
hidden_size
=
input
.
size
(
-
1
);
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
...
...
csrc/cpu/pos_encoding.cpp
View file @
f48954a4
...
@@ -21,18 +21,13 @@ void rotary_embedding_impl(
...
@@ -21,18 +21,13 @@ void rotary_embedding_impl(
constexpr
int
VEC_ELEM_NUM
=
scalar_vec_t
::
get_elem_num
();
constexpr
int
VEC_ELEM_NUM
=
scalar_vec_t
::
get_elem_num
();
const
int
embed_dim
=
rot_dim
/
2
;
const
int
embed_dim
=
rot_dim
/
2
;
TORCH_CHECK
(
embed_dim
%
VEC_ELEM_NUM
==
0
);
bool
flag
=
(
embed_dim
%
VEC_ELEM_NUM
==
0
);
const
int
loop_upper
=
flag
?
embed_dim
:
embed_dim
-
VEC_ELEM_NUM
;
#pragma omp parallel for
auto
compute_loop
=
[
&
](
const
int64_t
token_head
,
const
scalar_t
*
cache_ptr
,
for
(
int
token_idx
=
0
;
token_idx
<
num_tokens
;
++
token_idx
)
{
scalar_t
*
qk
)
{
int64_t
pos
=
positions
[
token_idx
];
int
j
=
0
;
const
scalar_t
*
cache_ptr
=
cos_sin_cache
+
pos
*
rot_dim
;
for
(;
j
<
loop_upper
;
j
+=
VEC_ELEM_NUM
)
{
for
(
int
i
=
0
;
i
<
num_heads
;
++
i
)
{
const
int
head_idx
=
i
;
const
int64_t
token_head
=
token_idx
*
query_stride
+
head_idx
*
head_size
;
for
(
int
j
=
0
;
j
<
embed_dim
;
j
+=
VEC_ELEM_NUM
)
{
const
int
rot_offset
=
j
;
const
int
rot_offset
=
j
;
const
int
x_index
=
rot_offset
;
const
int
x_index
=
rot_offset
;
const
int
y_index
=
embed_dim
+
rot_offset
;
const
int
y_index
=
embed_dim
+
rot_offset
;
...
@@ -43,8 +38,8 @@ void rotary_embedding_impl(
...
@@ -43,8 +38,8 @@ void rotary_embedding_impl(
const
scalar_vec_t
cos
(
cache_ptr
+
x_index
);
const
scalar_vec_t
cos
(
cache_ptr
+
x_index
);
const
scalar_vec_t
sin
(
cache_ptr
+
y_index
);
const
scalar_vec_t
sin
(
cache_ptr
+
y_index
);
const
scalar_vec_t
q_x
(
q
uery
+
out_x
);
const
scalar_vec_t
q_x
(
q
k
+
out_x
);
const
scalar_vec_t
q_y
(
q
uery
+
out_y
);
const
scalar_vec_t
q_y
(
q
k
+
out_y
);
vec_op
::
FP32Vec8
fp32_cos
(
cos
);
vec_op
::
FP32Vec8
fp32_cos
(
cos
);
vec_op
::
FP32Vec8
fp32_sin
(
sin
);
vec_op
::
FP32Vec8
fp32_sin
(
sin
);
...
@@ -53,41 +48,47 @@ void rotary_embedding_impl(
...
@@ -53,41 +48,47 @@ void rotary_embedding_impl(
vec_op
::
FP32Vec8
fp32_q_y
(
q_y
);
vec_op
::
FP32Vec8
fp32_q_y
(
q_y
);
auto
out1
=
fp32_q_x
*
fp32_cos
-
fp32_q_y
*
fp32_sin
;
auto
out1
=
fp32_q_x
*
fp32_cos
-
fp32_q_y
*
fp32_sin
;
scalar_vec_t
(
out1
).
save
(
q
uery
+
out_x
);
scalar_vec_t
(
out1
).
save
(
q
k
+
out_x
);
auto
out2
=
fp32_q_y
*
fp32_cos
+
fp32_q_x
*
fp32_sin
;
auto
out2
=
fp32_q_y
*
fp32_cos
+
fp32_q_x
*
fp32_sin
;
scalar_vec_t
(
out2
).
save
(
query
+
out_y
);
scalar_vec_t
(
out2
).
save
(
qk
+
out_y
);
}
}
}
if
(
!
flag
)
{
for
(
int
i
=
0
;
i
<
num_kv_heads
;
++
i
)
{
for
(;
j
<
embed_dim
;
++
j
)
{
const
int
head_idx
=
i
;
const
int
x_index
=
j
;
const
int64_t
token_head
=
token_idx
*
key_stride
+
head_idx
*
head_size
;
const
int
y_index
=
embed_dim
+
j
;
for
(
int
j
=
0
;
j
<
embed_dim
;
j
+=
VEC_ELEM_NUM
)
{
const
int
rot_offset
=
j
;
const
int
x_index
=
rot_offset
;
const
int
y_index
=
embed_dim
+
rot_offset
;
const
int64_t
out_x
=
token_head
+
x_index
;
const
int64_t
out_x
=
token_head
+
x_index
;
const
int64_t
out_y
=
token_head
+
y_index
;
const
int64_t
out_y
=
token_head
+
y_index
;
const
scalar_vec_t
cos
(
cache_ptr
+
x_index
)
;
const
float
fp32_cos
=
cache_ptr
[
x_index
]
;
const
scalar_vec_t
sin
(
cache_ptr
+
y_index
)
;
const
float
fp32_sin
=
cache_ptr
[
y_index
]
;
const
scalar_vec_t
k_x
(
key
+
out_x
)
;
const
float
fp32_q_x
=
qk
[
out_x
]
;
const
scalar_vec_t
k_y
(
key
+
out_y
)
;
const
float
fp32_q_y
=
qk
[
out_y
]
;
vec_op
::
FP32Vec8
fp32_cos
(
cos
);
qk
[
out_x
]
=
fp32_q_x
*
fp32_cos
-
fp32_q_y
*
fp32_sin
;
vec_op
::
FP32Vec8
fp32_sin
(
sin
);
qk
[
out_y
]
=
fp32_q_y
*
fp32_cos
+
fp32_q_x
*
fp32_sin
;
}
}
};
vec_op
::
FP32Vec8
fp32_k_x
(
k_x
);
#pragma omp parallel for
vec_op
::
FP32Vec8
fp32_k_y
(
k_y
);
for
(
int
token_idx
=
0
;
token_idx
<
num_tokens
;
++
token_idx
)
{
int64_t
pos
=
positions
[
token_idx
];
const
scalar_t
*
cache_ptr
=
cos_sin_cache
+
pos
*
rot_dim
;
auto
out1
=
fp32_k_x
*
fp32_cos
-
fp32_k_y
*
fp32_sin
;
for
(
int
i
=
0
;
i
<
num_heads
;
++
i
)
{
scalar_vec_t
(
out1
).
save
(
key
+
out_x
);
const
int
head_idx
=
i
;
auto
out2
=
fp32_k_y
*
fp32_cos
+
fp32_k_x
*
fp32_sin
;
const
int64_t
token_head
=
scalar_vec_t
(
out2
).
save
(
key
+
out_y
);
token_idx
*
query_stride
+
head_idx
*
head_size
;
compute_loop
(
token_head
,
cache_ptr
,
query
);
}
}
for
(
int
i
=
0
;
i
<
num_kv_heads
;
++
i
)
{
const
int
head_idx
=
i
;
const
int64_t
token_head
=
token_idx
*
key_stride
+
head_idx
*
head_size
;
compute_loop
(
token_head
,
cache_ptr
,
key
);
}
}
}
}
}
}
...
@@ -167,7 +168,7 @@ void rotary_embedding_gptj_impl(
...
@@ -167,7 +168,7 @@ void rotary_embedding_gptj_impl(
};
// namespace
};
// namespace
void
rotary_embedding
(
torch
::
Tensor
&
positions
,
torch
::
Tensor
&
query
,
void
rotary_embedding
(
torch
::
Tensor
&
positions
,
torch
::
Tensor
&
query
,
torch
::
Tensor
&
key
,
int
head_size
,
torch
::
Tensor
&
key
,
int
64_t
head_size
,
torch
::
Tensor
&
cos_sin_cache
,
bool
is_neox
)
{
torch
::
Tensor
&
cos_sin_cache
,
bool
is_neox
)
{
int
num_tokens
=
query
.
numel
()
/
query
.
size
(
-
1
);
int
num_tokens
=
query
.
numel
()
/
query
.
size
(
-
1
);
int
rot_dim
=
cos_sin_cache
.
size
(
1
);
int
rot_dim
=
cos_sin_cache
.
size
(
1
);
...
...
csrc/cpu/pybind.cpp
deleted
100644 → 0
View file @
1dba29d3
#include "cache.h"
#include "cuda_utils.h"
#include "ops.h"
#include <torch/extension.h>
PYBIND11_MODULE
(
TORCH_EXTENSION_NAME
,
m
)
{
// vLLM custom ops
pybind11
::
module
ops
=
m
.
def_submodule
(
"ops"
,
"vLLM custom operators"
);
// Attention ops
ops
.
def
(
"paged_attention_v1"
,
&
paged_attention_v1
,
"Compute the attention between an input query and the cached "
"keys/values using PagedAttention."
);
ops
.
def
(
"paged_attention_v2"
,
&
paged_attention_v2
,
"PagedAttention V2."
);
// Activation ops
ops
.
def
(
"silu_and_mul"
,
&
silu_and_mul
,
"Activation function used in SwiGLU."
);
ops
.
def
(
"gelu_and_mul"
,
&
gelu_and_mul
,
"Activation function used in GeGLU with `none` approximation."
);
ops
.
def
(
"gelu_tanh_and_mul"
,
&
gelu_tanh_and_mul
,
"Activation function used in GeGLU with `tanh` approximation."
);
ops
.
def
(
"gelu_new"
,
&
gelu_new
,
"GELU implementation used in GPT-2."
);
ops
.
def
(
"gelu_fast"
,
&
gelu_fast
,
"Approximate GELU implementation."
);
// Layernorm
ops
.
def
(
"rms_norm"
,
&
rms_norm
,
"Apply Root Mean Square (RMS) Normalization to the input tensor."
);
ops
.
def
(
"fused_add_rms_norm"
,
&
fused_add_rms_norm
,
"In-place fused Add and RMS Normalization"
);
// Rotary embedding
ops
.
def
(
"rotary_embedding"
,
&
rotary_embedding
,
"Apply GPT-NeoX or GPT-J style rotary embedding to query and key"
);
// Cache ops
pybind11
::
module
cache_ops
=
m
.
def_submodule
(
"cache_ops"
,
"vLLM cache ops"
);
cache_ops
.
def
(
"swap_blocks"
,
&
swap_blocks
,
"Swap in (out) the cache blocks from src to dst"
);
cache_ops
.
def
(
"copy_blocks"
,
&
copy_blocks
,
"Copy the cache blocks from src to dst"
);
cache_ops
.
def
(
"reshape_and_cache"
,
&
reshape_and_cache
,
"Reshape the key and value tensors and cache them"
);
}
csrc/cpu/torch_bindings.cpp
0 → 100644
View file @
f48954a4
#include "cache.h"
#include "ops.h"
#include "registration.h"
#include <torch/library.h>
TORCH_LIBRARY_EXPAND
(
TORCH_EXTENSION_NAME
,
ops
)
{
// vLLM custom ops
// Attention ops
// Compute the attention between an input query and the cached keys/values
// using PagedAttention.
ops
.
def
(
"paged_attention_v1("
" Tensor! out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float kv_scale, int tp_rank,"
" int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()"
);
ops
.
impl
(
"paged_attention_v1"
,
torch
::
kCPU
,
&
paged_attention_v1
);
// PagedAttention V2.
ops
.
def
(
"paged_attention_v2("
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
" Tensor tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float kv_scale, int tp_rank,"
" int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()"
);
ops
.
impl
(
"paged_attention_v2"
,
torch
::
kCPU
,
&
paged_attention_v2
);
// Activation ops
// Activation function used in SwiGLU.
ops
.
def
(
"silu_and_mul(Tensor! out, Tensor input) -> ()"
);
ops
.
impl
(
"silu_and_mul"
,
torch
::
kCPU
,
&
silu_and_mul
);
// Activation function used in GeGLU with `none` approximation.
ops
.
def
(
"gelu_and_mul(Tensor! out, Tensor input) -> ()"
);
ops
.
impl
(
"gelu_and_mul"
,
torch
::
kCPU
,
&
gelu_and_mul
);
// Activation function used in GeGLU with `tanh` approximation.
ops
.
def
(
"gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()"
);
ops
.
impl
(
"gelu_tanh_and_mul"
,
torch
::
kCPU
,
&
gelu_tanh_and_mul
);
// GELU implementation used in GPT-2.
ops
.
def
(
"gelu_new(Tensor! out, Tensor input) -> ()"
);
ops
.
impl
(
"gelu_new"
,
torch
::
kCPU
,
&
gelu_new
);
// Approximate GELU implementation.
ops
.
def
(
"gelu_fast(Tensor! out, Tensor input) -> ()"
);
ops
.
impl
(
"gelu_fast"
,
torch
::
kCPU
,
&
gelu_fast
);
// Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor.
ops
.
def
(
"rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
"()"
);
ops
.
impl
(
"rms_norm"
,
torch
::
kCPU
,
&
rms_norm
);
// In-place fused Add and RMS Normalization.
ops
.
def
(
"fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
"float epsilon) -> ()"
);
ops
.
impl
(
"fused_add_rms_norm"
,
torch
::
kCPU
,
&
fused_add_rms_norm
);
// Rotary embedding
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
ops
.
def
(
"rotary_embedding(Tensor positions, Tensor! query,"
" Tensor! key, int head_size,"
" Tensor cos_sin_cache, bool is_neox) -> ()"
);
ops
.
impl
(
"rotary_embedding"
,
torch
::
kCPU
,
&
rotary_embedding
);
}
// Operator library for the KV-cache helpers. The library name is
// TORCH_EXTENSION_NAME with `_cache_ops` appended (via CONCAT), and every
// schema declared below is bound to its CPU kernel.
// NOTE(review): `Tensor!` / `Tensor[]!` presumably mark arguments the kernel
// writes to in place -- confirm against the schema conventions used elsewhere.
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // ---- swap_blocks ---------------------------------------------------------
  // Swaps cache blocks in (or out) from `src` to `dst`; `block_mapping`
  // is passed alongside the two tensors.
  cache_ops.def(
      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::dispatch(torch::kCPU, &swap_blocks));

  // ---- copy_blocks ---------------------------------------------------------
  // Copies cache blocks from source to destination across the given lists of
  // key caches and value caches.
  cache_ops.def(
      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
      "block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::dispatch(torch::kCPU, &copy_blocks));

  // ---- reshape_and_cache ---------------------------------------------------
  // Reshapes the incoming key/value tensors and stores them into the key and
  // value caches; also receives `slot_mapping`, `kv_cache_dtype` and
  // `kv_scale`.
  cache_ops.def(
      "reshape_and_cache(Tensor key, Tensor value,"
      "                  Tensor! key_cache, Tensor! value_cache,"
      "                  Tensor slot_mapping,"
      "                  str kv_cache_dtype,"
      "                  float kv_scale) -> ()");
  cache_ops.impl("reshape_and_cache",
                 torch::dispatch(torch::kCPU, &reshape_and_cache));
}
// Invokes the project's REGISTER_EXTENSION macro with the extension name
// supplied by the build (TORCH_EXTENSION_NAME).
// NOTE(review): presumably this emits the Python module entry point so the
// compiled library can be imported -- the macro definition is not visible
// here; confirm in the header that declares it.
REGISTER_EXTENSION
(
TORCH_EXTENSION_NAME
)
csrc/cuda_compat.h
View file @
f48954a4
...
@@ -19,8 +19,12 @@
...
@@ -19,8 +19,12 @@
#ifndef USE_ROCM
#ifndef USE_ROCM
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
__shfl_xor_sync(uint32_t(-1), var, lane_mask)
__shfl_xor_sync(uint32_t(-1), var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
__shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
#else
#else
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
__shfl_xor(var, lane_mask, width)
#endif
#endif
#ifndef USE_ROCM
#ifndef USE_ROCM
...
...
csrc/cuda_utils.h
View file @
f48954a4
#pragma once
#pragma once
#
in
clude <torch/extension.h>
in
t64_t
get_device_attribute
(
int64_t
attribute
,
int64_t
device_id
);
int
get_device_attribute
(
int
attribute
,
int
device_id
);
int64_t
get_max_shared_memory_per_block_device_attribute
(
int64_t
device_id
);
int
get_max_shared_memory_per_block_device_attribute
(
int
device_id
);
csrc/cuda_utils_kernels.cu
View file @
f48954a4
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime_api.h>
#endif
#endif
int
get_device_attribute
(
int
attribute
,
int
device_id
)
{
int
64_t
get_device_attribute
(
int
64_t
attribute
,
int
64_t
device_id
)
{
int
device
,
value
;
int
device
,
value
;
if
(
device_id
<
0
)
{
if
(
device_id
<
0
)
{
cudaGetDevice
(
&
device
);
cudaGetDevice
(
&
device
);
...
@@ -14,8 +14,8 @@ int get_device_attribute(int attribute, int device_id) {
...
@@ -14,8 +14,8 @@ int get_device_attribute(int attribute, int device_id) {
return
value
;
return
value
;
}
}
int
get_max_shared_memory_per_block_device_attribute
(
int
device_id
)
{
int
64_t
get_max_shared_memory_per_block_device_attribute
(
int
64_t
device_id
)
{
int
attribute
;
int
64_t
attribute
;
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
// cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
// cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
...
...
csrc/custom_all_reduce.cu
View file @
f48954a4
#include <ATen/cuda/Exceptions.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/
extension
.h>
#include <torch/
all
.h>
#include "custom_all_reduce.cuh"
#include "custom_all_reduce.cuh"
// fake pointer type
// fake pointer type
, must match fptr_t type in ops.h
using
fptr_t
=
u
int64_t
;
using
fptr_t
=
int64_t
;
static_assert
(
sizeof
(
void
*
)
==
sizeof
(
fptr_t
));
static_assert
(
sizeof
(
void
*
)
==
sizeof
(
fptr_t
));
fptr_t
init_custom_ar
(
torch
::
Tensor
&
meta
,
torch
::
Tensor
&
rank_data
,
fptr_t
init_custom_ar
(
torch
::
Tensor
&
meta
,
torch
::
Tensor
&
rank_data
,
const
std
::
vector
<
std
::
string
>&
handles
,
const
std
::
vector
<
std
::
string
>&
handles
,
const
std
::
vector
<
int64_t
>&
offsets
,
int
rank
,
const
std
::
vector
<
int64_t
>&
offsets
,
int
64_t
rank
,
bool
full_nvlink
)
{
bool
full_nvlink
)
{
int
world_size
=
offsets
.
size
();
int
world_size
=
offsets
.
size
();
if
(
world_size
>
8
)
if
(
world_size
>
8
)
...
@@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
...
@@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
t
.
numel
()
*
t
.
element_size
());
t
.
numel
()
*
t
.
element_size
());
}
}
bool
should_custom_ar
(
torch
::
Tensor
&
inp
,
int
max_size
,
int
world_size
,
bool
should_custom_ar
(
torch
::
Tensor
&
inp
,
int
64_t
max_size
,
int
64_t
world_size
,
bool
full_nvlink
)
{
bool
full_nvlink
)
{
auto
inp_size
=
inp
.
numel
()
*
inp
.
element_size
();
auto
inp_size
=
inp
.
numel
()
*
inp
.
element_size
();
// custom allreduce requires input byte size to be multiples of 16
// custom allreduce requires input byte size to be multiples of 16
...
@@ -125,7 +125,7 @@ void dispose(fptr_t _fa) {
...
@@ -125,7 +125,7 @@ void dispose(fptr_t _fa) {
delete
fa
;
delete
fa
;
}
}
int
meta_size
()
{
return
sizeof
(
vllm
::
Signal
);
}
int
64_t
meta_size
()
{
return
sizeof
(
vllm
::
Signal
);
}
void
register_buffer
(
fptr_t
_fa
,
torch
::
Tensor
&
t
,
void
register_buffer
(
fptr_t
_fa
,
torch
::
Tensor
&
t
,
const
std
::
vector
<
std
::
string
>&
handles
,
const
std
::
vector
<
std
::
string
>&
handles
,
...
@@ -134,10 +134,16 @@ void register_buffer(fptr_t _fa, torch::Tensor& t,
...
@@ -134,10 +134,16 @@ void register_buffer(fptr_t _fa, torch::Tensor& t,
fa
->
register_buffer
(
handles
,
offsets
,
t
.
data_ptr
());
fa
->
register_buffer
(
handles
,
offsets
,
t
.
data_ptr
());
}
}
std
::
pair
<
std
::
vector
<
uint8_t
>
,
std
::
vector
<
int64_t
>>
get_graph_buffer_ipc_meta
(
std
::
tuple
<
torch
::
Tensor
,
std
::
vector
<
int64_t
>>
get_graph_buffer_ipc_meta
(
fptr_t
_fa
)
{
fptr_t
_fa
)
{
auto
fa
=
reinterpret_cast
<
vllm
::
CustomAllreduce
*>
(
_fa
);
auto
fa
=
reinterpret_cast
<
vllm
::
CustomAllreduce
*>
(
_fa
);
return
fa
->
get_graph_buffer_ipc_meta
();
auto
[
handle_bytes
,
offsets
]
=
fa
->
get_graph_buffer_ipc_meta
();
auto
options
=
torch
::
TensorOptions
().
dtype
(
torch
::
kUInt8
).
device
(
torch
::
kCPU
);
auto
handles
=
torch
::
empty
({
static_cast
<
int64_t
>
(
handle_bytes
.
size
())},
options
);
std
::
memcpy
(
handles
.
data_ptr
(),
handle_bytes
.
data
(),
handle_bytes
.
size
());
return
{
handles
,
std
::
move
(
offsets
)};
}
}
void
register_graph_buffers
(
fptr_t
_fa
,
const
std
::
vector
<
std
::
string
>&
handles
,
void
register_graph_buffers
(
fptr_t
_fa
,
const
std
::
vector
<
std
::
string
>&
handles
,
...
...
csrc/dispatch_utils.h
View file @
f48954a4
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
*/
*/
#pragma once
#pragma once
#include <torch/
extension
.h>
#include <torch/
all
.h>
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
...
...
csrc/layernorm_kernels.cu
View file @
f48954a4
#include <torch/
extension
.h>
#include <torch/
all
.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAGuard.h>
...
@@ -291,7 +291,7 @@ fused_add_rms_norm_kernel(
...
@@ -291,7 +291,7 @@ fused_add_rms_norm_kernel(
void
rms_norm
(
torch
::
Tensor
&
out
,
// [..., hidden_size]
void
rms_norm
(
torch
::
Tensor
&
out
,
// [..., hidden_size]
torch
::
Tensor
&
input
,
// [..., hidden_size]
torch
::
Tensor
&
input
,
// [..., hidden_size]
torch
::
Tensor
&
weight
,
// [hidden_size]
torch
::
Tensor
&
weight
,
// [hidden_size]
float
epsilon
)
{
double
epsilon
)
{
int
hidden_size
=
input
.
size
(
-
1
);
int
hidden_size
=
input
.
size
(
-
1
);
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
...
@@ -319,7 +319,7 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
...
@@ -319,7 +319,7 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size]
void
fused_add_rms_norm
(
torch
::
Tensor
&
input
,
// [..., hidden_size]
void
fused_add_rms_norm
(
torch
::
Tensor
&
input
,
// [..., hidden_size]
torch
::
Tensor
&
residual
,
// [..., hidden_size]
torch
::
Tensor
&
residual
,
// [..., hidden_size]
torch
::
Tensor
&
weight
,
// [hidden_size]
torch
::
Tensor
&
weight
,
// [hidden_size]
float
epsilon
)
{
double
epsilon
)
{
int
hidden_size
=
input
.
size
(
-
1
);
int
hidden_size
=
input
.
size
(
-
1
);
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
int
num_tokens
=
input
.
numel
()
/
hidden_size
;
...
...
Prev
1
2
3
4
5
6
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment