Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
afd0da21
Commit
afd0da21
authored
Feb 03, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.7.1' into v0.7.1-dev
parents
1a11f127
4f4d427a
Changes
587
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
140 additions
and
29 deletions
+140
-29
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
+2
-2
csrc/quantization/marlin/sparse/common/mma.h
csrc/quantization/marlin/sparse/common/mma.h
+2
-2
csrc/rocm/attention.cu
csrc/rocm/attention.cu
+15
-10
csrc/rocm/ops.h
csrc/rocm/ops.h
+3
-3
csrc/rocm/torch_bindings.cpp
csrc/rocm/torch_bindings.cpp
+1
-1
csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
+1
-1
csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
+2
-2
csrc/torch_bindings.cpp
csrc/torch_bindings.cpp
+23
-4
docs/Makefile
docs/Makefile
+4
-0
docs/README.md
docs/README.md
+1
-0
docs/requirements-docs.txt
docs/requirements-docs.txt
+4
-1
docs/source/_static/custom.js
docs/source/_static/custom.js
+21
-1
docs/source/api/engine/async_llm_engine.md
docs/source/api/engine/async_llm_engine.md
+0
-0
docs/source/api/engine/index.md
docs/source/api/engine/index.md
+2
-2
docs/source/api/engine/llm_engine.md
docs/source/api/engine/llm_engine.md
+0
-0
docs/source/api/inference_params.md
docs/source/api/inference_params.md
+21
-0
docs/source/api/model/adapters.md
docs/source/api/model/adapters.md
+9
-0
docs/source/api/model/index.md
docs/source/api/model/index.md
+11
-0
docs/source/api/model/interfaces.md
docs/source/api/model/interfaces.md
+9
-0
docs/source/api/model/interfaces_base.md
docs/source/api/model/interfaces_base.md
+9
-0
No files found.
Too many changes to show.
To preserve performance only
587 of 587+
files are displayed.
Plain diff
Email patch
csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
View file @
afd0da21
...
...
@@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) {
static
constexpr
uint32_t
HI
=
0x00f000f0
;
static
constexpr
uint32_t
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
uint32_t
t0
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
uint32_t
t1
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
uint32_t
t0
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
uint32_t
t1
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
static
constexpr
uint32_t
SUB
=
0x64086408
;
...
...
csrc/quantization/marlin/sparse/common/mma.h
View file @
afd0da21
...
...
@@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) {
const
int
HI
=
0x00f000f0
;
const
int
EX
=
0x64006400
;
// Guarantee that the `(a & b) | c` operations are LOP3s.
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
int
lo
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
LO
,
EX
);
int
hi
=
lop3
<
(
0xf0
&
0xcc
)
|
0xaa
>
(
q
,
HI
,
EX
);
// We want signed int4 outputs, hence we fuse the `-8` symmetric zero point
// directly into `SUB` and `ADD`.
const
int
SUB
=
0x64086408
;
...
...
csrc/rocm/attention.cu
View file @
afd0da21
...
...
@@ -218,7 +218,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
scalar_t
*
__restrict__
out
,
// [num_seqs, num_heads, max_num_partitions,
// head_size]
scalar_t
*
__restrict__
final_out
,
// [num_seqs, num_heads, head_size]
int
max_ctx_blocks
,
float
k_scale
,
float
v_scale
)
{
int
max_ctx_blocks
,
const
float
*
k_scale
_ptr
,
const
float
*
v_scale
_ptr
)
{
constexpr
int
NWARPS
=
NUM_THREADS
/
WARP_SIZE
;
const
int
warpid
=
threadIdx
.
x
/
WARP_SIZE
;
const
int
laneid
=
threadIdx
.
x
%
WARP_SIZE
;
...
...
@@ -406,7 +406,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
// Vlocalb8[h][b * BLOCK_SIZE / 8 + d] = v_ptrh8be[d];
const
_B8x8
Vlocalb8
=
v_ptrh8be
[
d
];
Vlocal
[
h
][
b
*
BLOCK_SIZE
/
8
+
d
]
=
scaled_convert_b8x8
<
scalar_t
,
KV_DTYPE
>
(
Vlocalb8
,
v_scale
);
scaled_convert_b8x8
<
scalar_t
,
KV_DTYPE
>
(
Vlocalb8
,
*
v_scale
_ptr
);
}
}
}
...
...
@@ -416,7 +416,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
#pragma unroll
for
(
int
d
=
0
;
d
<
KHELOOP
;
d
++
)
{
Klocal
[
d
]
=
scaled_convert_b8x8
<
scalar_t
,
KV_DTYPE
>
(
Klocalb8
[
d
],
k_scale
);
scaled_convert_b8x8
<
scalar_t
,
KV_DTYPE
>
(
Klocalb8
[
d
],
*
k_scale
_ptr
);
}
}
...
...
@@ -890,7 +890,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel(
scalar_t
*
__restrict__
out
,
// [num_seqs, num_heads, max_num_partitions,
// head_size]
scalar_t
*
__restrict__
final_out
,
// [num_seqs, num_heads, head_size]
int
max_ctx_blocks
,
float
k_scale
,
float
v_scale
)
{
int
max_ctx_blocks
,
const
float
*
k_scale
,
const
float
*
v_scale
)
{
UNREACHABLE_CODE
}
...
...
@@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const
scalar_t
*
__restrict__
tmp_out
,
// [num_seqs, num_heads,
// max_num_partitions, head_size]
const
int
*
__restrict__
context_lens
,
// [num_seqs]
const
int
max_num_partitions
){
UNREACHABLE_CODE
}
const
int
max_num_partitions
)
{
UNREACHABLE_CODE
}
#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support
...
...
@@ -919,7 +921,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \
k_scale, v_scale);
k_scale
_ptr
, v_scale
_ptr
);
template
<
typename
T
,
typename
KVT
,
vllm
::
Fp8KVCacheDataType
KV_DTYPE
,
int
BLOCK_SIZE
,
int
HEAD_SIZE
,
int
PARTITION_SIZE
=
512
>
...
...
@@ -928,8 +930,8 @@ void paged_attention_custom_launcher(
torch
::
Tensor
&
tmp_out
,
torch
::
Tensor
&
query
,
torch
::
Tensor
&
key_cache
,
torch
::
Tensor
&
value_cache
,
const
int
num_kv_heads
,
float
scale
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
context_lens
,
int
max_context_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
float
k_scale
,
float
v_scale
)
{
int
max_context_len
,
const
std
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
torch
::
Tensor
&
k_scale
,
torch
::
Tensor
&
v_scale
)
{
int
num_seqs
=
query
.
size
(
0
);
int
num_heads
=
query
.
size
(
1
);
int
head_size
=
query
.
size
(
2
);
...
...
@@ -953,6 +955,8 @@ void paged_attention_custom_launcher(
KVT
*
value_cache_ptr
=
reinterpret_cast
<
KVT
*>
(
value_cache
.
data_ptr
());
int
*
block_tables_ptr
=
block_tables
.
data_ptr
<
int
>
();
int
*
context_lens_ptr
=
context_lens
.
data_ptr
<
int
>
();
const
float
*
k_scale_ptr
=
reinterpret_cast
<
const
float
*>
(
k_scale
.
data_ptr
());
const
float
*
v_scale_ptr
=
reinterpret_cast
<
const
float
*>
(
v_scale
.
data_ptr
());
const
int
max_ctx_blocks
=
DIVIDE_ROUND_UP
(
max_context_len
,
BLOCK_SIZE
);
const
int
max_num_partitions
=
...
...
@@ -1086,8 +1090,9 @@ void paged_attention(
torch
::
Tensor
&
block_tables
,
// [num_seqs, max_num_blocks_per_seq]
torch
::
Tensor
&
context_lens
,
// [num_seqs]
int64_t
block_size
,
int64_t
max_context_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
double
k_scale
,
double
v_scale
)
{
const
std
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
torch
::
Tensor
&
k_scale
,
torch
::
Tensor
&
v_scale
)
{
const
int
head_size
=
query
.
size
(
2
);
if
(
kv_cache_dtype
==
"auto"
)
{
if
(
query
.
dtype
()
==
at
::
ScalarType
::
Half
)
{
...
...
csrc/rocm/ops.h
View file @
afd0da21
...
...
@@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
double
scale
,
torch
::
Tensor
&
block_tables
,
torch
::
Tensor
&
context_lens
,
int64_t
block_size
,
int64_t
max_context_len
,
const
c10
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
double
k_scale
,
double
v_scale
);
const
std
::
optional
<
torch
::
Tensor
>&
alibi_slopes
,
const
std
::
string
&
kv_cache_dtype
,
torch
::
Tensor
&
k_scale
,
torch
::
Tensor
&
v_scale
);
csrc/rocm/torch_bindings.cpp
View file @
afd0da21
...
...
@@ -27,7 +27,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
" int max_context_len,"
" Tensor? alibi_slopes,"
" str kv_cache_dtype,"
"
float
k_scale,
float
v_scale) -> ()"
);
"
Tensor
k_scale,
Tensor
v_scale) -> ()"
);
rocm_ops
.
impl
(
"paged_attention"
,
torch
::
kCUDA
,
&
paged_attention
);
}
...
...
csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
View file @
afd0da21
...
...
@@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a,
torch
::
Tensor
const
&
bt_meta
,
torch
::
Tensor
const
&
a_scales
,
torch
::
Tensor
const
&
b_scales
,
c10
::
optional
<
torch
::
Tensor
>
const
&
bias
)
{
std
::
optional
<
torch
::
Tensor
>
const
&
bias
)
{
TORCH_CHECK
(
a_scales
.
dtype
()
==
torch
::
kFloat32
);
TORCH_CHECK
(
b_scales
.
dtype
()
==
torch
::
kFloat32
);
if
(
bias
)
{
...
...
csrc/sparse/cutlass/sparse_scaled_mm_entry.cu
View file @
afd0da21
...
...
@@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
torch
::
Tensor
const
&
e
,
torch
::
Tensor
const
&
a_scales
,
torch
::
Tensor
const
&
b_scales
,
c10
::
optional
<
torch
::
Tensor
>
const
&
bias
);
std
::
optional
<
torch
::
Tensor
>
const
&
bias
);
#endif
void
cutlass_scaled_sparse_mm
(
torch
::
Tensor
&
c
,
torch
::
Tensor
const
&
a
,
...
...
@@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
torch
::
Tensor
const
&
bt_meta
,
torch
::
Tensor
const
&
a_scales
,
torch
::
Tensor
const
&
b_scales
,
c10
::
optional
<
torch
::
Tensor
>
const
&
bias
)
{
std
::
optional
<
torch
::
Tensor
>
const
&
bias
)
{
// Checks for conformality
TORCH_CHECK
(
a
.
dim
()
==
2
&&
bt_nzs
.
dim
()
==
2
&&
c
.
dim
()
==
2
);
TORCH_CHECK
(
c
.
size
(
1
)
==
bt_nzs
.
size
(
0
)
&&
bt_nzs
.
size
(
1
)
*
2
==
a
.
size
(
1
)
&&
...
...
csrc/torch_bindings.cpp
View file @
afd0da21
...
...
@@ -30,7 +30,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype,
float
k_scale,
float
v_scale,"
" str kv_cache_dtype,
Tensor
k_scale,
Tensor
v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()"
);
...
...
@@ -44,7 +44,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype,
float
k_scale,
float
v_scale,"
" str kv_cache_dtype,
Tensor
k_scale,
Tensor
v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()"
);
...
...
@@ -208,6 +208,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops
.
def
(
"silu_and_mul(Tensor! out, Tensor input) -> ()"
);
ops
.
impl
(
"silu_and_mul"
,
torch
::
kCUDA
,
&
silu_and_mul
);
ops
.
def
(
"mul_and_silu(Tensor! out, Tensor input) -> ()"
);
ops
.
impl
(
"mul_and_silu"
,
torch
::
kCUDA
,
&
mul_and_silu
);
// Activation function used in GeGLU with `none` approximation.
ops
.
def
(
"gelu_and_mul(Tensor! out, Tensor input) -> ()"
);
ops
.
impl
(
"gelu_and_mul"
,
torch
::
kCUDA
,
&
gelu_and_mul
);
...
...
@@ -511,6 +514,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops
.
def
(
"cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool"
);
ops
.
impl
(
"cutlass_scaled_mm_supports_fp8"
,
&
cutlass_scaled_mm_supports_fp8
);
// Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
// for CUDA devices of the given capability.
ops.def(
    "cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
    "bool");
// Bind the block-FP8 predicate itself. Binding `cutlass_scaled_mm_supports_fp8`
// here (as the op registered just before this one does) would make this op
// answer the plain FP8 support question instead of the block-quantization one.
// NOTE(review): assumes `cutlass_scaled_mm_supports_block_fp8(int64_t)` is
// declared next to `cutlass_scaled_mm_supports_fp8` (e.g. in ops.h) — confirm.
ops.impl("cutlass_scaled_mm_supports_block_fp8",
         &cutlass_scaled_mm_supports_block_fp8);
// Check if cutlass sparse scaled_mm is supported for CUDA devices of the
// given capability
ops
.
def
(
...
...
@@ -636,7 +646,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" Tensor! key_cache, Tensor! value_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
"
float
k_scale,
float
v_scale) -> ()"
);
"
Tensor
k_scale,
Tensor
v_scale) -> ()"
);
cache_ops
.
impl
(
"reshape_and_cache"
,
torch
::
kCUDA
,
&
reshape_and_cache
);
// Reshape the key and value tensors and cache them.
...
...
@@ -646,7 +656,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" Tensor! value_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
"
float
k_scale,
float
v_scale) -> ()"
);
"
Tensor
k_scale,
Tensor
v_scale) -> ()"
);
cache_ops
.
impl
(
"reshape_and_cache_flash"
,
torch
::
kCUDA
,
&
reshape_and_cache_flash
);
...
...
@@ -666,6 +676,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
" str kv_cache_dtype) -> ()"
);
cache_ops
.
impl
(
"write_cache_multi_layers"
,
torch
::
kCUDA
,
&
write_cache_multi_layers
);
// Concat kv_c and k_pe and cache them.
cache_ops
.
def
(
"concat_and_cache_mla(Tensor kv_c, Tensor k_pe,"
" Tensor! kv_cache,"
" Tensor slot_mapping,"
" str kv_cache_dtype,"
" Tensor scale) -> ()"
);
cache_ops
.
impl
(
"concat_and_cache_mla"
,
torch
::
kCUDA
,
&
concat_and_cache_mla
);
// Convert the key and value cache to fp8 data type.
cache_ops
.
def
(
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
...
...
docs/Makefile
View file @
afd0da21
...
...
@@ -18,3 +18,7 @@ help:
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%
:
Makefile
@
$(SPHINXBUILD)
-M
$@
"
$(SOURCEDIR)
"
"
$(BUILDDIR)
"
$(SPHINXOPTS)
$(O)
# Explicit `clean` target: first run the Sphinx builder in "make mode" with the
# `clean` action, then delete the getting_started/examples directory that lives
# under $(SOURCEDIR).
clean:
	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
	rm -rf "$(SOURCEDIR)/getting_started/examples"
docs/README.md
View file @
afd0da21
...
...
@@ -16,4 +16,5 @@ make html
```
bash
python
-m
http.server
-d
build/html/
```
Launch your browser and open localhost:8000.
docs/requirements-docs.txt
View file @
afd0da21
sphinx==6.2.1
sphinx-argparse==0.4.0
sphinx-book-theme==1.0.1
sphinx-copybutton==0.5.2
sphinx-design==0.6.1
sphinx-togglebutton==0.3.2
myst-parser==3.0.1
sphinx-argparse==0.4.0
msgspec
cloudpickle
...
...
@@ -19,3 +21,4 @@ openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entr
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
requests
zmq
docs/source/_static/custom.js
View file @
afd0da21
// Add RunLLM widget
document
.
addEventListener
(
"
DOMContentLoaded
"
,
function
()
{
var
script
=
document
.
createElement
(
"
script
"
);
script
.
type
=
"
module
"
;
...
...
@@ -15,4 +16,23 @@ document.addEventListener("DOMContentLoaded", function () {
script
.
async
=
true
;
document
.
head
.
appendChild
(
script
);
});
\ No newline at end of file
});
// Update URL search params when tab is clicked
document.addEventListener("DOMContentLoaded", function () {
  // Write the clicked tab's sync group/id pair into the current URL's query
  // string. replaceState is used, so no extra history entry is created.
  const syncTabToURL = (label) => {
    const group = label.getAttribute("data-sync-group");
    const id = label.getAttribute("data-sync-id");
    // Tabs missing either attribute leave the URL untouched.
    if (!group || !id) {
      return;
    }
    const current = new URL(window.location);
    current.searchParams.set(group, id);
    window.history.replaceState(null, "", current);
  };

  // Attach the click handler to every tab label found on the page.
  for (const label of document.querySelectorAll(".sd-tab-label")) {
    label.addEventListener("click", () => syncTabToURL(label));
  }
});
docs/source/
dev
/engine/async_llm_engine.md
→
docs/source/
api
/engine/async_llm_engine.md
View file @
afd0da21
File moved
docs/source/
dev
/engine/
engine_
index.md
→
docs/source/
api
/engine/index.md
View file @
afd0da21
...
...
@@ -8,10 +8,10 @@
.. currentmodule:: vllm.engine
```
```
{toctree}
:::
{toctree}
:caption: Engines
:maxdepth: 2
llm_engine
async_llm_engine
```
:::
docs/source/
dev
/engine/llm_engine.md
→
docs/source/
api
/engine/llm_engine.md
View file @
afd0da21
File moved
docs/source/api/inference_params.md
0 → 100644
View file @
afd0da21
# Inference Parameters
Inference parameters for vLLM APIs.
(sampling-params)=
## Sampling Parameters
```
{eval-rst}
.. autoclass:: vllm.SamplingParams
:members:
```
(pooling-params)=
## Pooling Parameters
```
{eval-rst}
.. autoclass:: vllm.PoolingParams
:members:
```
docs/source/api/model/adapters.md
0 → 100644
View file @
afd0da21
# Model Adapters
## Module Contents
```
{eval-rst}
.. automodule:: vllm.model_executor.models.adapters
:members:
:member-order: bysource
```
docs/source/api/model/index.md
0 → 100644
View file @
afd0da21
# Model Development
## Submodules
:::{toctree}
:maxdepth: 1
interfaces_base
interfaces
adapters
:::
docs/source/api/model/interfaces.md
0 → 100644
View file @
afd0da21
# Optional Interfaces
## Module Contents
```
{eval-rst}
.. automodule:: vllm.model_executor.models.interfaces
:members:
:member-order: bysource
```
docs/source/api/model/interfaces_base.md
0 → 100644
View file @
afd0da21
# Base Model Interfaces
## Module Contents
```
{eval-rst}
.. automodule:: vllm.model_executor.models.interfaces_base
:members:
:member-order: bysource
```
Prev
1
…
3
4
5
6
7
8
9
10
11
…
30
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment