Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f48954a4
Commit
f48954a4
authored
Jun 12, 2024
by
zhuwenwen
Browse files
merge v0.5.0
parents
1dba29d3
8f89d720
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1282 additions
and
275 deletions
+1282
-275
vllm/model_executor/guided_decoding/outlines_decoding.py
vllm/model_executor/guided_decoding/outlines_decoding.py
+12
-19
vllm/model_executor/guided_decoding/outlines_logits_processors.py
...el_executor/guided_decoding/outlines_logits_processors.py
+36
-26
vllm/model_executor/layers/activation.py
vllm/model_executor/layers/activation.py
+21
-13
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
...14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+138
-0
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
...=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+146
-0
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
...=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+61
-55
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
...=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+146
-0
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
...=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+44
-44
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
...=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+146
-0
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+45
-41
vllm/model_executor/layers/layernorm.py
vllm/model_executor/layers/layernorm.py
+6
-4
vllm/model_executor/layers/linear.py
vllm/model_executor/layers/linear.py
+43
-3
vllm/model_executor/layers/logits_processor.py
vllm/model_executor/layers/logits_processor.py
+3
-2
vllm/model_executor/layers/quantization/__init__.py
vllm/model_executor/layers/quantization/__init__.py
+4
-1
vllm/model_executor/layers/quantization/bitsandbytes.py
vllm/model_executor/layers/quantization/bitsandbytes.py
+175
-0
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
...ers/quantization/compressed_tensors/compressed_tensors.py
+51
-36
vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
...ayers/quantization/compressed_tensors/schemes/__init__.py
+2
-0
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
...d_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
+85
-0
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
...d_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+4
-31
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
..._executor/layers/quantization/compressed_tensors/utils.py
+114
-0
No files found.
vllm/model_executor/guided_decoding/outlines_decoding.py
View file @
f48954a4
import
asyncio
import
asyncio
import
concurrent.futures
import
concurrent.futures
from
copy
import
copy
from
enum
import
Enum
from
enum
import
Enum
from
functools
import
lru_cache
from
json
import
dumps
as
json_dumps
from
json
import
dumps
as
json_dumps
from
re
import
escape
as
regex_escape
from
re
import
escape
as
regex_escape
from
typing
import
Tuple
,
Union
from
typing
import
Tuple
,
Union
...
@@ -54,8 +52,10 @@ global_thread_pool = None # used for generating logits processor fsm
...
@@ -54,8 +52,10 @@ global_thread_pool = None # used for generating logits processor fsm
async
def
get_outlines_guided_decoding_logits_processor
(
async
def
get_outlines_guided_decoding_logits_processor
(
request
:
Union
[
CompletionRequest
,
ChatCompletionRequest
],
request
:
Union
[
CompletionRequest
,
tokenizer
)
->
Union
[
JSONLogitsProcessor
,
RegexLogitsProcessor
,
None
]:
ChatCompletionRequest
],
tokenizer
:
PreTrainedTokenizerBase
)
->
Union
[
JSONLogitsProcessor
,
RegexLogitsProcessor
,
CFGLogitsProcessor
,
None
]:
"""
"""
Given an OpenAI-compatible request, check for guided decoding parameters
Given an OpenAI-compatible request, check for guided decoding parameters
and get the necessary logits processor for the given guide.
and get the necessary logits processor for the given guide.
...
@@ -64,7 +64,7 @@ async def get_outlines_guided_decoding_logits_processor(
...
@@ -64,7 +64,7 @@ async def get_outlines_guided_decoding_logits_processor(
"""
"""
global
global_thread_pool
global
global_thread_pool
guide
,
mode
=
_get_guide_and_mode
(
request
)
guide
,
mode
=
_get_guide_and_mode
(
request
)
if
not
guide
:
if
not
guide
or
not
mode
:
return
None
return
None
if
global_thread_pool
is
None
:
if
global_thread_pool
is
None
:
...
@@ -72,15 +72,9 @@ async def get_outlines_guided_decoding_logits_processor(
...
@@ -72,15 +72,9 @@ async def get_outlines_guided_decoding_logits_processor(
max_workers
=
2
)
max_workers
=
2
)
loop
=
asyncio
.
get_running_loop
()
loop
=
asyncio
.
get_running_loop
()
result
=
await
loop
.
run_in_executor
(
global_thread_pool
,
return
await
loop
.
run_in_executor
(
global_thread_pool
,
_get_cached_logits_processor
,
guide
,
_get_logits_processor
,
guide
,
tokenizer
,
tokenizer
,
mode
,
mode
,
request
.
guided_whitespace_pattern
)
request
.
guided_whitespace_pattern
)
logits_processor
=
copy
(
result
)
# reset logits processor's internal state
logits_processor
.
init_state
()
return
logits_processor
def
_get_guide_and_mode
(
def
_get_guide_and_mode
(
...
@@ -115,11 +109,10 @@ def _get_guide_and_mode(
...
@@ -115,11 +109,10 @@ def _get_guide_and_mode(
return
None
,
None
return
None
,
None
@
lru_cache
(
maxsize
=
32
)
def
_get_logits_processor
(
def
_get_cached_logits_processor
(
guide
:
str
,
guide
:
str
,
tokenizer
:
PreTrainedTokenizerBase
,
mode
:
GuidedDecodingMode
,
tokenizer
:
PreTrainedTokenizerBase
,
whitespace_pattern
:
Union
[
str
,
None
]
mode
:
GuidedDecodingMode
,
)
->
Union
[
JSONLogitsProcessor
,
RegexLogitsProcessor
,
CFGLogitsProcessor
]:
whitespace_pattern
:
Union
[
str
,
None
]):
if
mode
==
GuidedDecodingMode
.
JSON
:
if
mode
==
GuidedDecodingMode
.
JSON
:
return
JSONLogitsProcessor
(
guide
,
tokenizer
,
whitespace_pattern
)
return
JSONLogitsProcessor
(
guide
,
tokenizer
,
whitespace_pattern
)
elif
mode
==
GuidedDecodingMode
.
REGEX
or
mode
==
GuidedDecodingMode
.
CHOICE
:
elif
mode
==
GuidedDecodingMode
.
REGEX
or
mode
==
GuidedDecodingMode
.
CHOICE
:
...
...
vllm/model_executor/guided_decoding/outlines_logits_processors.py
View file @
f48954a4
...
@@ -21,7 +21,7 @@ from functools import lru_cache
...
@@ -21,7 +21,7 @@ from functools import lru_cache
from
typing
import
Callable
,
DefaultDict
,
Dict
,
List
,
Union
from
typing
import
Callable
,
DefaultDict
,
Dict
,
List
,
Union
import
torch
import
torch
from
outlines.fsm.
fsm
import
CFG
FSM
,
FSM
,
RegexFSM
from
outlines.fsm.
guide
import
CFG
Guide
,
Generate
,
Guide
,
RegexGuide
,
Write
from
outlines.fsm.json_schema
import
build_regex_from_schema
from
outlines.fsm.json_schema
import
build_regex_from_schema
from
pydantic
import
BaseModel
from
pydantic
import
BaseModel
from
transformers
import
PreTrainedTokenizerBase
from
transformers
import
PreTrainedTokenizerBase
...
@@ -29,28 +29,32 @@ from transformers import PreTrainedTokenizerBase
...
@@ -29,28 +29,32 @@ from transformers import PreTrainedTokenizerBase
class
BaseLogitsProcessor
:
class
BaseLogitsProcessor
:
def
__init__
(
self
):
def
__init__
(
self
,
guide
:
Guide
):
# Child class should use initialize in their init.
self
.
_guide
:
Guide
=
guide
self
.
fsm
:
FSM
self
.
_fsm_state
:
DefaultDict
[
int
,
int
]
=
defaultdict
(
int
)
def
init_state
(
self
):
"""Initialize the FSM states."""
self
.
fsm_state
:
DefaultDict
[
int
,
int
]
=
defaultdict
(
int
)
def
__call__
(
self
,
input_ids
:
List
[
int
],
def
__call__
(
self
,
input_ids
:
List
[
int
],
scores
:
torch
.
Tensor
)
->
torch
.
Tensor
:
scores
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""Use the FSM to bias the logits before sampling the next token."""
"""Use the FSM to bias the logits before sampling the next token."""
seq_id
=
hash
(
tuple
(
input_ids
))
seq_id
=
hash
(
tuple
(
input_ids
))
if
len
(
input_ids
)
==
0
:
if
len
(
input_ids
)
>
0
:
self
.
init_state
()
else
:
last_token
=
input_ids
[
-
1
]
last_token
=
input_ids
[
-
1
]
last_seq_id
=
hash
(
tuple
(
input_ids
[:
-
1
]))
last_seq_id
=
hash
(
tuple
(
input_ids
[:
-
1
]))
self
.
fsm_state
[
seq_id
]
=
self
.
fsm
.
next_state
(
self
.
_fsm_state
[
seq_id
]
=
self
.
_guide
.
get_next_state
(
self
.
fsm_state
[
last_seq_id
],
last_token
)
state
=
self
.
_fsm_state
[
last_seq_id
],
token_id
=
last_token
)
instruction
=
self
.
_guide
.
get_next_instruction
(
state
=
self
.
_fsm_state
[
seq_id
])
allowed_tokens
=
self
.
fsm
.
allowed_token_ids
(
self
.
fsm_state
[
seq_id
])
if
type
(
instruction
)
==
Generate
:
allowed_tokens
=
instruction
.
tokens
elif
type
(
instruction
)
==
Write
:
# TODO: support fast forward tokens
allowed_tokens
=
[
instruction
.
tokens
[
0
]]
else
:
raise
TypeError
(
f
"Unsupported instruction type
{
type
(
instruction
)
}
"
)
mask
=
torch
.
full
((
scores
.
shape
[
-
1
],
),
mask
=
torch
.
full
((
scores
.
shape
[
-
1
],
),
-
math
.
inf
,
-
math
.
inf
,
...
@@ -62,6 +66,13 @@ class BaseLogitsProcessor:
...
@@ -62,6 +66,13 @@ class BaseLogitsProcessor:
class
RegexLogitsProcessor
(
BaseLogitsProcessor
):
class
RegexLogitsProcessor
(
BaseLogitsProcessor
):
@
classmethod
@
lru_cache
(
maxsize
=
32
)
def
_get_guide
(
cls
,
regex_string
:
str
,
tokenizer
:
PreTrainedTokenizerBase
)
->
Guide
:
tokenizer
=
_adapt_tokenizer
(
tokenizer
)
return
RegexGuide
(
regex_string
,
tokenizer
)
def
__init__
(
self
,
regex_string
:
str
,
tokenizer
:
PreTrainedTokenizerBase
):
def
__init__
(
self
,
regex_string
:
str
,
tokenizer
:
PreTrainedTokenizerBase
):
"""Compile the FSM that drives the regex-structured generation.
"""Compile the FSM that drives the regex-structured generation.
...
@@ -73,9 +84,8 @@ class RegexLogitsProcessor(BaseLogitsProcessor):
...
@@ -73,9 +84,8 @@ class RegexLogitsProcessor(BaseLogitsProcessor):
The model's tokenizer
The model's tokenizer
"""
"""
tokenizer
=
_adapt_tokenizer
(
tokenizer
)
super
().
__init__
(
fsm
=
RegexFSM
(
regex_string
,
tokenizer
)
RegexLogitsProcessor
.
_get_guide
(
regex_string
,
tokenizer
))
self
.
fsm
=
fsm
class
JSONLogitsProcessor
(
RegexLogitsProcessor
):
class
JSONLogitsProcessor
(
RegexLogitsProcessor
):
...
@@ -115,6 +125,12 @@ class JSONLogitsProcessor(RegexLogitsProcessor):
...
@@ -115,6 +125,12 @@ class JSONLogitsProcessor(RegexLogitsProcessor):
class
CFGLogitsProcessor
(
BaseLogitsProcessor
):
class
CFGLogitsProcessor
(
BaseLogitsProcessor
):
@
classmethod
@
lru_cache
(
maxsize
=
32
)
def
_get_guide
(
cls
,
cfg
:
str
,
tokenizer
:
PreTrainedTokenizerBase
)
->
Guide
:
tokenizer
=
_adapt_tokenizer
(
tokenizer
)
return
CFGGuide
(
cfg
,
tokenizer
)
def
__init__
(
self
,
cfg
:
str
,
tokenizer
:
PreTrainedTokenizerBase
):
def
__init__
(
self
,
cfg
:
str
,
tokenizer
:
PreTrainedTokenizerBase
):
"""Compile the FSM that drives the context free grammar generation.
"""Compile the FSM that drives the context free grammar generation.
...
@@ -126,17 +142,11 @@ class CFGLogitsProcessor(BaseLogitsProcessor):
...
@@ -126,17 +142,11 @@ class CFGLogitsProcessor(BaseLogitsProcessor):
The model's tokenizer
The model's tokenizer
"""
"""
tokenizer
=
_adapt_tokenizer
(
tokenizer
)
super
().
__init__
(
CFGLogitsProcessor
.
_get_guide
(
cfg
,
tokenizer
))
fsm
=
CFGFSM
(
cfg
,
tokenizer
)
self
.
_guide
=
self
.
_guide
.
copy
()
self
.
fsm
=
fsm
def
init_state
(
self
):
"""Initialize state with a CFGFSM copy."""
super
().
init_state
()
self
.
fsm
=
self
.
fsm
.
copy
()
@
lru_cache
@
lru_cache
(
maxsize
=
32
)
def
_adapt_tokenizer
(
tokenizer
:
PreTrainedTokenizerBase
):
def
_adapt_tokenizer
(
tokenizer
:
PreTrainedTokenizerBase
):
"""Adapt vLLM's tokenizer to use to compile the FSM.
"""Adapt vLLM's tokenizer to use to compile the FSM.
...
...
vllm/model_executor/layers/activation.py
View file @
f48954a4
...
@@ -6,14 +6,14 @@ import torch
...
@@ -6,14 +6,14 @@ import torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
vllm
import
_custom_ops
as
ops
from
vllm.distributed
import
(
divide
,
get_tensor_model_parallel_rank
,
from
vllm.distributed
import
(
divide
,
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
)
get_tensor_model_parallel_world_size
)
from
vllm.model_executor.custom_op
import
CustomOp
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
class
SiluAndMul
(
nn
.
Module
):
class
SiluAndMul
(
CustomOp
):
"""An activation function for SwiGLU.
"""An activation function for SwiGLU.
The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
...
@@ -23,12 +23,14 @@ class SiluAndMul(nn.Module):
...
@@ -23,12 +23,14 @@ class SiluAndMul(nn.Module):
return: (num_tokens, d) or (batch_size, seq_len, d)
return: (num_tokens, d) or (batch_size, seq_len, d)
"""
"""
def
_
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward
_native
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""PyTorch-native implementation equivalent to forward()."""
"""PyTorch-native implementation equivalent to forward()."""
d
=
x
.
shape
[
-
1
]
//
2
d
=
x
.
shape
[
-
1
]
//
2
return
F
.
silu
(
x
[...,
:
d
])
*
x
[...,
d
:]
return
F
.
silu
(
x
[...,
:
d
])
*
x
[...,
d
:]
def
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward_cuda
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
from
vllm
import
_custom_ops
as
ops
d
=
x
.
shape
[
-
1
]
//
2
d
=
x
.
shape
[
-
1
]
//
2
output_shape
=
(
x
.
shape
[:
-
1
]
+
(
d
,
))
output_shape
=
(
x
.
shape
[:
-
1
]
+
(
d
,
))
out
=
torch
.
empty
(
output_shape
,
dtype
=
x
.
dtype
,
device
=
x
.
device
)
out
=
torch
.
empty
(
output_shape
,
dtype
=
x
.
dtype
,
device
=
x
.
device
)
...
@@ -36,7 +38,7 @@ class SiluAndMul(nn.Module):
...
@@ -36,7 +38,7 @@ class SiluAndMul(nn.Module):
return
out
return
out
class
GeluAndMul
(
nn
.
Module
):
class
GeluAndMul
(
CustomOp
):
"""An activation function for GeGLU.
"""An activation function for GeGLU.
The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
...
@@ -52,12 +54,14 @@ class GeluAndMul(nn.Module):
...
@@ -52,12 +54,14 @@ class GeluAndMul(nn.Module):
if
approximate
not
in
(
"none"
,
"tanh"
):
if
approximate
not
in
(
"none"
,
"tanh"
):
raise
ValueError
(
f
"Unknown approximate mode:
{
approximate
}
"
)
raise
ValueError
(
f
"Unknown approximate mode:
{
approximate
}
"
)
def
_
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward
_native
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""PyTorch-native implementation equivalent to forward()."""
"""PyTorch-native implementation equivalent to forward()."""
d
=
x
.
shape
[
-
1
]
//
2
d
=
x
.
shape
[
-
1
]
//
2
return
F
.
gelu
(
x
[...,
:
d
],
approximate
=
self
.
approximate
)
*
x
[...,
d
:]
return
F
.
gelu
(
x
[...,
:
d
],
approximate
=
self
.
approximate
)
*
x
[...,
d
:]
def
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward_cuda
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
from
vllm
import
_custom_ops
as
ops
d
=
x
.
shape
[
-
1
]
//
2
d
=
x
.
shape
[
-
1
]
//
2
output_shape
=
(
x
.
shape
[:
-
1
]
+
(
d
,
))
output_shape
=
(
x
.
shape
[:
-
1
]
+
(
d
,
))
out
=
torch
.
empty
(
output_shape
,
dtype
=
x
.
dtype
,
device
=
x
.
device
)
out
=
torch
.
empty
(
output_shape
,
dtype
=
x
.
dtype
,
device
=
x
.
device
)
...
@@ -71,28 +75,32 @@ class GeluAndMul(nn.Module):
...
@@ -71,28 +75,32 @@ class GeluAndMul(nn.Module):
return
f
'approximate=
{
repr
(
self
.
approximate
)
}
'
return
f
'approximate=
{
repr
(
self
.
approximate
)
}
'
class
NewGELU
(
nn
.
Module
):
class
NewGELU
(
CustomOp
):
def
_
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward
_native
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""PyTorch-native implementation equivalent to forward()."""
"""PyTorch-native implementation equivalent to forward()."""
c
=
math
.
sqrt
(
2.0
/
math
.
pi
)
c
=
math
.
sqrt
(
2.0
/
math
.
pi
)
return
0.5
*
x
*
(
1.0
+
torch
.
tanh
(
c
*
return
0.5
*
x
*
(
1.0
+
torch
.
tanh
(
c
*
(
x
+
0.044715
*
torch
.
pow
(
x
,
3.0
))))
(
x
+
0.044715
*
torch
.
pow
(
x
,
3.0
))))
def
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward_cuda
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
from
vllm
import
_custom_ops
as
ops
out
=
torch
.
empty_like
(
x
)
out
=
torch
.
empty_like
(
x
)
ops
.
gelu_new
(
out
,
x
)
ops
.
gelu_new
(
out
,
x
)
return
out
return
out
class
FastGELU
(
nn
.
Module
):
class
FastGELU
(
CustomOp
):
def
_
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward
_native
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""PyTorch-native implementation equivalent to forward()."""
"""PyTorch-native implementation equivalent to forward()."""
return
0.5
*
x
*
(
1.0
+
torch
.
tanh
(
x
*
0.7978845608
*
return
0.5
*
x
*
(
1.0
+
torch
.
tanh
(
x
*
0.7978845608
*
(
1.0
+
0.044715
*
x
*
x
)))
(
1.0
+
0.044715
*
x
*
x
)))
def
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward_cuda
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
from
vllm
import
_custom_ops
as
ops
out
=
torch
.
empty_like
(
x
)
out
=
torch
.
empty_like
(
x
)
ops
.
gelu_fast
(
out
,
x
)
ops
.
gelu_fast
(
out
,
x
)
return
out
return
out
...
...
vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
0 → 100644
View file @
f48954a4
{
"1"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
5
},
"2"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"4"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"8"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"16"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"24"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"32"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"48"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"96"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
2
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
3
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
0 → 100644
View file @
f48954a4
{
"1"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
5
},
"2"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
5
},
"4"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"8"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
5
},
"16"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"24"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"32"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"48"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"96"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"256"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
5
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
4
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
View file @
f48954a4
{
{
"1"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
5
},
},
"2"
:
{
"2"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
},
"4"
:
{
"4"
:
{
"BLOCK_SIZE_M"
:
16
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
4
},
},
"8"
:
{
"8"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
5
"num_stages"
:
4
},
},
"16"
:
{
"16"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
6
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
5
"num_stages"
:
3
},
},
"24"
:
{
"24"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
5
"num_stages"
:
3
},
},
"32"
:
{
"32"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
25
6
,
"BLOCK_SIZE_N"
:
6
4
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
6
,
"GROUP_SIZE_M"
:
6
4
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
3
},
},
"48"
:
{
"48"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
25
6
,
"BLOCK_SIZE_N"
:
6
4
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
6
4
,
"GROUP_SIZE_M"
:
1
6
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
3
"num_stages"
:
3
},
},
"64"
:
{
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
25
6
,
"BLOCK_SIZE_N"
:
6
4
,
"BLOCK_SIZE_K"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
3
},
},
"96"
:
{
"96"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
2
"num_stages"
:
3
},
},
"128"
:
{
"128"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
3
"num_stages"
:
3
},
},
"256"
:
{
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
5
"num_stages"
:
3
},
},
"512"
:
{
"512"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
6
4
,
"GROUP_SIZE_M"
:
1
6
,
"num_warps"
:
4
,
"num_warps"
:
8
,
"num_stages"
:
2
"num_stages"
:
4
},
},
"1024"
:
{
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_warps"
:
8
,
"num_stages"
:
4
"num_stages"
:
4
},
},
...
@@ -109,7 +115,7 @@
...
@@ -109,7 +115,7 @@
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_warps"
:
8
,
"num_stages"
:
4
"num_stages"
:
4
},
},
...
@@ -125,7 +131,7 @@
...
@@ -125,7 +131,7 @@
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_warps"
:
8
,
"num_stages"
:
4
"num_stages"
:
4
},
},
...
...
vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
0 → 100644
View file @
f48954a4
{
"1"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
5
},
"2"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"4"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"8"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
5
},
"16"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
5
},
"24"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
4
,
"num_stages"
:
5
},
"32"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"48"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"96"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
3
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
4
}
}
vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
View file @
f48954a4
{
{
"1"
:
{
"1"
:
{
"BLOCK_SIZE_M"
:
1
6
,
"BLOCK_SIZE_M"
:
6
4
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
4
},
},
"2"
:
{
"2"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
5
},
},
"4"
:
{
"4"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
6
4
,
"BLOCK_SIZE_K"
:
25
6
,
"GROUP_SIZE_M"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
5
},
},
"8"
:
{
"8"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
4
},
},
"16"
:
{
"16"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
5
},
},
"24"
:
{
"24"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
25
6
,
"BLOCK_SIZE_N"
:
6
4
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
3
},
},
"32"
:
{
"32"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
6
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
4
},
},
"48"
:
{
"48"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
4
},
},
"64"
:
{
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
6
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
4
},
},
"96"
:
{
"96"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
32
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
32
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
4
},
},
"128"
:
{
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
6
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
4
},
},
"256"
:
{
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_warps"
:
4
,
"num_stages"
:
4
"num_stages"
:
3
},
},
"512"
:
{
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
6
,
"GROUP_SIZE_M"
:
6
4
,
"num_warps"
:
8
,
"num_warps"
:
8
,
"num_stages"
:
4
"num_stages"
:
4
},
},
...
@@ -107,7 +107,7 @@
...
@@ -107,7 +107,7 @@
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_warps"
:
8
,
"num_stages"
:
4
"num_stages"
:
4
},
},
...
@@ -115,7 +115,7 @@
...
@@ -115,7 +115,7 @@
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_warps"
:
8
,
"num_stages"
:
4
"num_stages"
:
4
},
},
...
@@ -139,7 +139,7 @@
...
@@ -139,7 +139,7 @@
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
6
4
,
"GROUP_SIZE_M"
:
1
6
,
"num_warps"
:
8
,
"num_warps"
:
8
,
"num_stages"
:
4
"num_stages"
:
4
}
}
...
...
vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
0 → 100644
View file @
f48954a4
{
"1"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
64
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
5
},
"2"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
16
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"4"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"8"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
64
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
5
},
"16"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"24"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"32"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"48"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"64"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"96"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"128"
:
{
"BLOCK_SIZE_M"
:
64
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
256
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
4
,
"num_stages"
:
4
},
"256"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
128
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
1
,
"num_warps"
:
8
,
"num_stages"
:
5
},
"512"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"1024"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
64
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"1536"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"2048"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
4
},
"3072"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
3
},
"4096"
:
{
"BLOCK_SIZE_M"
:
128
,
"BLOCK_SIZE_N"
:
256
,
"BLOCK_SIZE_K"
:
128
,
"GROUP_SIZE_M"
:
32
,
"num_warps"
:
8
,
"num_stages"
:
4
}
}
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
f48954a4
...
@@ -10,7 +10,6 @@ import triton.language as tl
...
@@ -10,7 +10,6 @@ import triton.language as tl
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.utils
import
is_hip
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -308,6 +307,30 @@ def get_moe_configs(E: int, N: int,
...
@@ -308,6 +307,30 @@ def get_moe_configs(E: int, N: int,
return
None
return
None
def get_default_config(
    M: int,
    E: int,
    N: int,
    K: int,
    topk: int,
    dtype: Optional[str],
) -> Dict[str, int]:
    """Return the fallback Triton tile configuration for the fused MoE kernel.

    Used when no tuned configuration is available. Only ``M`` and ``E`` are
    consulted: when ``M <= E`` a small-tile configuration is returned,
    otherwise the larger general-purpose one.

    Args:
        M: Compared against ``E`` to pick the configuration (presumably the
            number of input rows/tokens -- confirm against the caller).
        E: Compared against ``M`` (presumably the number of experts --
            confirm against the caller).
        N: Accepted for interface symmetry; not used here.
        K: Accepted for interface symmetry; not used here.
        topk: Accepted for interface symmetry; not used here.
        dtype: Accepted for interface symmetry; not used here.

    Returns:
        A new dict with the keys ``BLOCK_SIZE_M``, ``BLOCK_SIZE_N``,
        ``BLOCK_SIZE_K`` and ``GROUP_SIZE_M``.
    """
    # Small-M case: use the narrow tile shape with no grouping.
    if M <= E:
        return {
            'BLOCK_SIZE_M': 16,
            'BLOCK_SIZE_N': 32,
            'BLOCK_SIZE_K': 64,
            'GROUP_SIZE_M': 1
        }

    # General case.
    return {
        'BLOCK_SIZE_M': 64,
        'BLOCK_SIZE_N': 64,
        'BLOCK_SIZE_K': 32,
        'GROUP_SIZE_M': 8
    }
def
fused_topk
(
def
fused_topk
(
hidden_states
:
torch
.
Tensor
,
hidden_states
:
torch
.
Tensor
,
gating_output
:
torch
.
Tensor
,
gating_output
:
torch
.
Tensor
,
...
@@ -319,34 +342,26 @@ def fused_topk(
...
@@ -319,34 +342,26 @@ def fused_topk(
M
,
_
=
hidden_states
.
shape
M
,
_
=
hidden_states
.
shape
if
is_hip
():
topk_weights
=
torch
.
empty
(
M
,
# The MoE kernels are not yet supported on ROCm.
routing_weights
=
torch
.
softmax
(
gating_output
,
dim
=-
1
,
dtype
=
torch
.
float32
)
topk_weights
,
topk_ids
=
torch
.
topk
(
routing_weights
,
topk
,
dim
=-
1
)
else
:
import
vllm._moe_C
as
moe_kernels
topk_weights
=
torch
.
empty
(
M
,
topk
,
dtype
=
torch
.
float32
,
device
=
hidden_states
.
device
)
topk_ids
=
torch
.
empty
(
M
,
topk
,
topk
,
dtype
=
torch
.
in
t32
,
dtype
=
torch
.
floa
t32
,
device
=
hidden_states
.
device
)
device
=
hidden_states
.
device
)
token_expert_indicies
=
torch
.
empty
(
M
,
topk_ids
=
torch
.
empty
(
M
,
topk
,
topk
,
dtype
=
torch
.
int32
,
dtype
=
torch
.
int32
,
device
=
hidden_states
.
device
)
device
=
hidden_states
.
device
)
moe_kernels
.
topk_softmax
(
token_expert_indicies
=
torch
.
empty
(
M
,
topk_weights
,
topk
,
topk_ids
,
dtype
=
torch
.
int32
,
token_expert_indicies
,
device
=
hidden_states
.
device
)
gating_output
.
float
(),
# TODO(woosuk): Optimize this.
ops
.
topk_softmax
(
)
topk_weights
,
del
token_expert_indicies
# Not used. Will be used in the future.
topk_ids
,
token_expert_indicies
,
gating_output
.
float
(),
# TODO(woosuk): Optimize this.
)
del
token_expert_indicies
# Not used. Will be used in the future.
if
renormalize
:
if
renormalize
:
topk_weights
=
topk_weights
/
topk_weights
.
sum
(
dim
=-
1
,
keepdim
=
True
)
topk_weights
=
topk_weights
/
topk_weights
.
sum
(
dim
=-
1
,
keepdim
=
True
)
return
topk_weights
,
topk_ids
return
topk_weights
,
topk_ids
...
@@ -390,20 +405,9 @@ def fused_experts(hidden_states: torch.Tensor,
...
@@ -390,20 +405,9 @@ def fused_experts(hidden_states: torch.Tensor,
config
=
configs
[
min
(
configs
.
keys
(),
key
=
lambda
x
:
abs
(
x
-
M
))]
config
=
configs
[
min
(
configs
.
keys
(),
key
=
lambda
x
:
abs
(
x
-
M
))]
else
:
else
:
# Else use the default config
# Else use the default config
config
=
{
config
=
get_default_config
(
M
,
E
,
N
,
w1
.
shape
[
2
],
'BLOCK_SIZE_M'
:
64
,
topk_ids
.
shape
[
1
],
'BLOCK_SIZE_N'
:
64
,
"float8"
if
use_fp8
else
None
)
'BLOCK_SIZE_K'
:
32
,
'GROUP_SIZE_M'
:
8
}
if
M
<=
E
:
config
=
{
'BLOCK_SIZE_M'
:
16
,
'BLOCK_SIZE_N'
:
32
,
'BLOCK_SIZE_K'
:
64
,
'GROUP_SIZE_M'
:
1
}
intermediate_cache1
=
torch
.
empty
((
M
,
topk_ids
.
shape
[
1
],
N
),
intermediate_cache1
=
torch
.
empty
((
M
,
topk_ids
.
shape
[
1
],
N
),
device
=
hidden_states
.
device
,
device
=
hidden_states
.
device
,
...
...
vllm/model_executor/layers/layernorm.py
View file @
f48954a4
...
@@ -4,10 +4,10 @@ from typing import Optional, Tuple, Union
...
@@ -4,10 +4,10 @@ from typing import Optional, Tuple, Union
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
vllm
import
_custom_ops
as
ops
from
vllm
.model_executor.custom_op
import
CustomOp
class
RMSNorm
(
nn
.
Module
):
class
RMSNorm
(
CustomOp
):
"""Root mean square normalization.
"""Root mean square normalization.
Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
...
@@ -23,7 +23,7 @@ class RMSNorm(nn.Module):
...
@@ -23,7 +23,7 @@ class RMSNorm(nn.Module):
self
.
weight
=
nn
.
Parameter
(
torch
.
ones
(
hidden_size
))
self
.
weight
=
nn
.
Parameter
(
torch
.
ones
(
hidden_size
))
self
.
variance_epsilon
=
eps
self
.
variance_epsilon
=
eps
def
_
forward
(
def
forward
_native
(
self
,
self
,
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
residual
:
Optional
[
torch
.
Tensor
]
=
None
,
residual
:
Optional
[
torch
.
Tensor
]
=
None
,
...
@@ -43,11 +43,13 @@ class RMSNorm(nn.Module):
...
@@ -43,11 +43,13 @@ class RMSNorm(nn.Module):
else
:
else
:
return
x
,
residual
return
x
,
residual
def
forward
(
def
forward
_cuda
(
self
,
self
,
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
residual
:
Optional
[
torch
.
Tensor
]
=
None
,
residual
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
Union
[
torch
.
Tensor
,
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]]:
)
->
Union
[
torch
.
Tensor
,
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]]:
from
vllm
import
_custom_ops
as
ops
if
residual
is
not
None
:
if
residual
is
not
None
:
ops
.
fused_add_rms_norm
(
ops
.
fused_add_rms_norm
(
x
,
x
,
...
...
vllm/model_executor/layers/linear.py
View file @
f48954a4
from
abc
import
abstractmethod
from
abc
import
abstractmethod
from
typing
import
List
,
Optional
from
typing
import
Dict
,
List
,
Optional
,
Tuple
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
...
@@ -29,6 +29,21 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
...
@@ -29,6 +29,21 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
return
shard_size
*
marlin_tile_size
,
shard_offset
*
marlin_tile_size
return
shard_size
*
marlin_tile_size
,
shard_offset
*
marlin_tile_size
def adjust_bitsandbytes_shard(param: Parameter,
                              qkv_offsets: Dict[str, Tuple[int, int]],
                              loaded_shard_id: str) -> Tuple[int, int]:
    """Rescale a shard's offset and size to the quantized parameter's rows.

    Args:
        param: Quantized parameter; only ``param.data.shape[0]`` is read.
        qkv_offsets: Maps each shard id to an ``(offset, size)`` pair in the
            original layout. The ``"total"`` entry supplies the original
            total length as its first element.
        loaded_shard_id: Key into ``qkv_offsets`` selecting the shard.

    Returns:
        ``(quantized_size, quantized_offset)`` -- note that size comes first.
    """
    unquantized_total = qkv_offsets["total"][0]
    shard_offset, shard_size = qkv_offsets[loaded_shard_id]
    packed_total = param.data.shape[0]

    def _rescale(value: int) -> int:
        # Multiply before the floor division, so rounding happens only once.
        return value * packed_total // unquantized_total

    return _rescale(shard_size), _rescale(shard_offset)
class
LinearMethodBase
(
QuantizeMethodBase
):
class
LinearMethodBase
(
QuantizeMethodBase
):
"""Base class for different (maybe quantized) linear methods."""
"""Base class for different (maybe quantized) linear methods."""
...
@@ -40,7 +55,7 @@ class LinearMethodBase(QuantizeMethodBase):
...
@@ -40,7 +55,7 @@ class LinearMethodBase(QuantizeMethodBase):
**
extra_weight_attrs
):
**
extra_weight_attrs
):
"""Create weights for a linear layer.
"""Create weights for a linear layer.
The weights will be set as attributes of the layer.
The weights will be set as attributes of the layer.
Args:
Args:
layer: The layer that is using the LinearMethodBase factory.
layer: The layer that is using the LinearMethodBase factory.
input_size_per_partition: Size of the weight input dim on rank X.
input_size_per_partition: Size of the weight input dim on rank X.
...
@@ -433,6 +448,12 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
...
@@ -433,6 +448,12 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
# Special case for Marlin.
# Special case for Marlin.
shard_size
,
shard_offset
=
adjust_marlin_shard
(
shard_size
,
shard_offset
=
adjust_marlin_shard
(
param
,
shard_size
,
shard_offset
)
param
,
shard_size
,
shard_offset
)
use_bitsandbytes
=
getattr
(
param
,
"use_bitsandbytes"
,
False
)
if
use_bitsandbytes
:
shard_size
=
loaded_weight
.
shape
[
output_dim
]
shard_offset
=
loaded_weight
.
shape
[
output_dim
]
*
\
loaded_shard_id
if
self
.
use_llama_nn
:
if
self
.
use_llama_nn
:
param_data_
=
param_data
.
narrow
(
output_dim
,
shard_offset
,
param_data_
=
param_data
.
narrow
(
output_dim
,
shard_offset
,
...
@@ -440,6 +461,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
...
@@ -440,6 +461,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
else
:
else
:
param_data
=
param_data
.
narrow
(
output_dim
,
shard_offset
,
param_data
=
param_data
.
narrow
(
output_dim
,
shard_offset
,
shard_size
)
shard_size
)
start_idx
=
tp_rank
*
shard_size
start_idx
=
tp_rank
*
shard_size
loaded_weight
=
loaded_weight
.
narrow
(
output_dim
,
start_idx
,
loaded_weight
=
loaded_weight
.
narrow
(
output_dim
,
start_idx
,
shard_size
)
shard_size
)
...
@@ -645,12 +668,29 @@ class QKVParallelLinear(ColumnParallelLinear):
...
@@ -645,12 +668,29 @@ class QKVParallelLinear(ColumnParallelLinear):
shard_size
,
shard_offset
=
adjust_marlin_shard
(
shard_size
,
shard_offset
=
adjust_marlin_shard
(
param
,
shard_size
,
shard_offset
)
param
,
shard_size
,
shard_offset
)
use_bitsandbytes
=
getattr
(
param
,
"use_bitsandbytes"
,
False
)
if
use_bitsandbytes
:
orig_qkv_offsets
=
{
"q"
:
(
0
,
self
.
num_heads
*
self
.
head_size
),
"k"
:
(
self
.
num_heads
*
self
.
head_size
,
self
.
num_kv_heads
*
self
.
head_size
),
"v"
:
((
self
.
num_heads
+
self
.
num_kv_heads
)
*
self
.
head_size
,
self
.
num_kv_heads
*
self
.
head_size
),
"total"
:
((
self
.
num_heads
+
2
*
self
.
num_kv_heads
)
*
self
.
head_size
,
0
)
}
shard_size
,
shard_offset
=
adjust_bitsandbytes_shard
(
param
,
orig_qkv_offsets
,
loaded_shard_id
)
if
self
.
use_llama_nn
:
if
self
.
use_llama_nn
:
param_data_
=
param_data
.
narrow
(
output_dim
,
shard_offset
,
param_data_
=
param_data
.
narrow
(
output_dim
,
shard_offset
,
shard_size
)
shard_size
)
else
:
else
:
param_data
=
param_data
.
narrow
(
output_dim
,
shard_offset
,
param_data
=
param_data
.
narrow
(
output_dim
,
shard_offset
,
shard_size
)
shard_size
)
if
loaded_shard_id
==
"q"
:
if
loaded_shard_id
==
"q"
:
shard_id
=
tp_rank
shard_id
=
tp_rank
else
:
else
:
...
...
vllm/model_executor/layers/logits_processor.py
View file @
f48954a4
...
@@ -21,7 +21,7 @@ class LogitsProcessor(nn.Module):
...
@@ -21,7 +21,7 @@ class LogitsProcessor(nn.Module):
def
__init__
(
self
,
def
__init__
(
self
,
vocab_size
:
int
,
vocab_size
:
int
,
org_vocab_size
:
Optional
[
int
]
=
None
,
org_vocab_size
:
Optional
[
int
]
=
None
,
scale
:
Optional
[
float
]
=
1.0
,
scale
:
float
=
1.0
,
logits_as_input
:
bool
=
False
)
->
None
:
logits_as_input
:
bool
=
False
)
->
None
:
"""
"""
Args:
Args:
...
@@ -52,7 +52,8 @@ class LogitsProcessor(nn.Module):
...
@@ -52,7 +52,8 @@ class LogitsProcessor(nn.Module):
logits
=
self
.
_get_logits
(
hidden_states
,
embedding
,
embedding_bias
)
logits
=
self
.
_get_logits
(
hidden_states
,
embedding
,
embedding_bias
)
if
logits
is
not
None
:
if
logits
is
not
None
:
logits
*=
self
.
scale
if
self
.
scale
!=
1.0
:
logits
*=
self
.
scale
# Apply logits processors (if any).
# Apply logits processors (if any).
logits
=
_apply_logits_processors
(
logits
,
sampling_metadata
)
logits
=
_apply_logits_processors
(
logits
,
sampling_metadata
)
...
...
vllm/model_executor/layers/quantization/__init__.py
View file @
f48954a4
...
@@ -4,6 +4,8 @@ from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
...
@@ -4,6 +4,8 @@ from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.layers.quantization.awq
import
AWQConfig
from
vllm.model_executor.layers.quantization.base_config
import
(
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
QuantizationConfig
)
from
vllm.model_executor.layers.quantization.bitsandbytes
import
(
BitsAndBytesConfig
)
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensorsConfig
)
CompressedTensorsConfig
)
from
vllm.model_executor.layers.quantization.deepspeedfp
import
(
from
vllm.model_executor.layers.quantization.deepspeedfp
import
(
...
@@ -29,7 +31,8 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
...
@@ -29,7 +31,8 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
"gptq_marlin"
:
GPTQMarlinConfig
,
"gptq_marlin"
:
GPTQMarlinConfig
,
"gptq"
:
GPTQConfig
,
"gptq"
:
GPTQConfig
,
"squeezellm"
:
SqueezeLLMConfig
,
"squeezellm"
:
SqueezeLLMConfig
,
"sparseml"
:
CompressedTensorsConfig
,
"compressed-tensors"
:
CompressedTensorsConfig
,
"bitsandbytes"
:
BitsAndBytesConfig
,
}
}
...
...
vllm/model_executor/layers/quantization/bitsandbytes.py
0 → 100644
View file @
f48954a4
from
typing
import
Any
,
Dict
,
List
,
Optional
import
torch
from
torch.nn.parameter
import
Parameter
from
vllm.model_executor.layers.linear
import
(
LinearBase
,
LinearMethodBase
,
set_weight_attrs
)
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
class BitsAndBytesConfig(QuantizationConfig):
    """Config class for BitsAndBytes Quantization.

    Reference: https://arxiv.org/abs/2305.14314
    """

    def __init__(
        self,
        adapter_name_or_path: str,
        target_modules: List[str],
    ) -> None:
        """Store the adapter location and the list of target module names.

        Args:
            adapter_name_or_path: Value read from the adapter config; an
                empty string makes ``from_config`` fall back to its default
                target modules.
            target_modules: Names of the modules this config applies to.
        """
        self.adapter_name_or_path = adapter_name_or_path
        self.target_modules = target_modules

    def __repr__(self) -> str:
        # The closing parenthesis was previously missing from the output.
        return (
            f"BitsAndBytesConfig(adapter_name_or_path={self.adapter_name_or_path})"
        )

    @classmethod
    def get_name(cls) -> str:
        """Return the name this quantization method is registered under."""
        return "bitsandbytes"

    @classmethod
    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
        """Return the activation dtypes accepted by this method."""
        return [torch.float32, torch.float16, torch.bfloat16]

    @classmethod
    def get_min_capability(cls) -> int:
        """Return the minimum capability value (70).

        NOTE(review): presumably CUDA compute capability 7.0 encoded as
        major * 10 + minor -- confirm against the base class contract.
        """
        return 70

    @staticmethod
    def get_config_filenames() -> List[str]:
        """Return the file names searched for this method's configuration."""
        return [
            "adapter_config.json",
        ]

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig":
        """Build a config object from a parsed configuration dict.

        ``adapter_name_or_path`` is always looked up. When it is the empty
        string the built-in default target modules are used; otherwise
        ``target_modules`` is looked up in ``config`` as well. Lookups go
        through ``cls.get_from_keys`` (defined outside this class).
        """
        adapter_name = cls.get_from_keys(config, ["adapter_name_or_path"])
        default_target_modules = [
            "gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj",
            "o_proj"
        ]
        if adapter_name == "":
            target_modules = default_target_modules
        else:
            target_modules = cls.get_from_keys(config, ["target_modules"])
        return cls(adapter_name, target_modules)

    def get_quant_method(
            self,
            layer: torch.nn.Module) -> Optional["BitsAndBytesLinearMethod"]:
        """Return a ``BitsAndBytesLinearMethod`` for linear layers, else None."""
        if isinstance(layer, LinearBase):
            return BitsAndBytesLinearMethod(self)
        return None

    def get_scaled_act_names(self) -> List[str]:
        """Return the activation names reported as scaled for this method."""
        return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]
class BitsAndBytesLinearMethod(LinearMethodBase):
    """Linear method for BitsAndBytes.

    Args:
       quant_config: The BitsAndBytes quantization config.
    """

    # Oldest accepted bitsandbytes release as (major, minor, patch).
    _MIN_BNB_VERSION = (0, 42, 0)

    @staticmethod
    def _version_key(version: str) -> tuple:
        """Turn a dotted version string into a 3-tuple of ints.

        Only the leading digits of each of the first three dot-separated
        components are used, so suffixes such as ``rc1`` or ``.dev0`` are
        ignored. Missing components count as zero.
        """
        import re

        numbers = []
        for piece in version.split(".")[:3]:
            match = re.match(r"\d+", piece)
            if match is None:
                break
            numbers.append(int(match.group()))
        numbers.extend([0] * (3 - len(numbers)))
        return tuple(numbers)

    def __init__(self, quant_config: BitsAndBytesConfig):
        """Check that a recent enough bitsandbytes is importable.

        Raises:
            ImportError: If bitsandbytes is missing or older than 0.42.0.
        """
        try:
            import bitsandbytes

            # Compare numerically: a plain string comparison is
            # lexicographic and would reject e.g. "0.100.0" as older than
            # "0.42.0".
            installed = self._version_key(bitsandbytes.__version__)
            if installed < self._MIN_BNB_VERSION:
                raise ImportError("bitsandbytes version is wrong. Please "
                                  "install bitsandbytes>=0.42.0.")
        except ImportError as err:
            raise ImportError("Please install bitsandbytes>=0.42.0 via "
                              "`pip install bitsandbytes>=0.42.0` to use "
                              "bitsandbytes quantizer.") from err

        self.quant_config = quant_config

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: List[int], input_size: int,
                       output_size: int, params_dtype: torch.dtype,
                       **extra_weight_attrs):
        """Register the packed ``qweight`` parameter on ``layer``.

        The parameter is an uninitialised ``uint8`` tensor of shape
        ``[input_size_per_partition * sum(output_partition_sizes) //
        quant_ratio, 1]``, where ``quant_ratio`` is the bit width of
        ``params_dtype`` divided by the bit width of ``uint8``.

        Raises:
            ValueError: If the element count is not divisible by
                ``quant_ratio``.
        """
        uint8_bits = torch.iinfo(torch.uint8).bits
        if params_dtype.is_floating_point:
            quant_ratio = torch.finfo(params_dtype).bits // uint8_bits
        else:
            quant_ratio = torch.iinfo(params_dtype).bits // uint8_bits

        num_elements = input_size_per_partition * sum(output_partition_sizes)
        if num_elements % quant_ratio != 0:
            raise ValueError("The input size is not aligned with the quantized "
                             "weight shape. ")
        qweight = Parameter(
            torch.empty(
                num_elements // quant_ratio,
                1,
                dtype=torch.uint8,
            ),
            requires_grad=False,
        )

        set_weight_attrs(
            qweight,
            {
                "input_dim": 0,
                # In bitsandbytes, a tensor of shape [n, m] is quantized to
                # [n * m / pack_ratio, 1], so the output_dim is 0.
                "output_dim": 0,
                "pack_factor": quant_ratio,
                "use_bitsandbytes": True,
            })
        layer.register_parameter("qweight", qweight)
        set_weight_attrs(qweight, extra_weight_attrs)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the 4-bit matmul shard by shard and concatenate the outputs.

        ``x`` is cast to bfloat16 for the computation and the result is cast
        back to the dtype of ``x``. ``bias``, when given, is added afterwards.
        ``layer.qweight`` must carry ``bnb_quant_state`` (indexed by shard
        number) and ``bnb_shard_offsets`` attributes; these are set outside
        this class.
        """
        # Only load the bitsandbytes module when needed.
        from bitsandbytes import matmul_4bit

        original_type = x.dtype
        bf_x = x.to(torch.bfloat16)

        qweight = layer.qweight
        quant_states = qweight.bnb_quant_state
        offsets = qweight.bnb_shard_offsets

        out_dim_0 = x.shape[0]
        out_dim_1 = sum(state.shape[0] for state in quant_states.values())
        out = torch.empty(out_dim_0,
                          out_dim_1,
                          dtype=torch.bfloat16,
                          device=x.device)

        current_index = 0
        for i in range(len(quant_states)):
            output_size = quant_states[i].shape[0]
            # It is more efficient to use out kwarg like
            # matmul_4bit(..., out = ...).  Infeasible now due to the bug
            # https://github.com/TimDettmers/bitsandbytes/issues/1235.
            # Need to change after the bug is fixed.
            out[:, current_index:current_index + output_size] = matmul_4bit(
                bf_x, qweight[offsets[i]:offsets[i + 1]].t(), quant_states[i])

            current_index += output_size

        out = out.to(original_type)

        if bias is not None:
            out += bias

        return out
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
View file @
f48954a4
from
typing
import
Any
,
Dict
,
List
,
Optional
from
typing
import
Any
,
Dict
,
List
,
Optional
import
torch
import
torch
from
pydantic
import
BaseModel
from
vllm.model_executor.layers.linear
import
LinearBase
,
LinearMethodBase
from
vllm.model_executor.layers.linear
import
LinearBase
,
LinearMethodBase
from
vllm.model_executor.layers.quantization.base_config
import
(
# noqa: E501
from
vllm.model_executor.layers.quantization.base_config
import
(
# noqa: E501
QuantizationConfig
)
QuantizationConfig
)
from
vllm.model_executor.layers.quantization.compressed_tensors.schemes
import
(
from
vllm.model_executor.layers.quantization.compressed_tensors.schemes
import
(
CompressedTensorsScheme
,
CompressedTensorsW8A8StaticTensor
)
CompressedTensorsScheme
,
CompressedTensorsW8A8DynamicToken
,
CompressedTensorsW8A8StaticTensor
)
from
vllm.model_executor.layers.quantization.compressed_tensors.utils
import
(
QuantizationArgs
,
QuantizationStrategy
,
find_first_name_or_class_match
)
class
CompressedTensorsConfig
(
QuantizationConfig
):
class
CompressedTensorsConfig
(
QuantizationConfig
):
...
@@ -47,10 +51,12 @@ class CompressedTensorsConfig(QuantizationConfig):
...
@@ -47,10 +51,12 @@ class CompressedTensorsConfig(QuantizationConfig):
targets
=
quant_config
.
get
(
"targets"
)
targets
=
quant_config
.
get
(
"targets"
)
for
target
in
targets
:
for
target
in
targets
:
layer_quant_details
[
target
]
=
{}
layer_quant_details
[
target
]
=
{}
layer_quant_details
[
target
][
"weight"
]
=
quant_config
.
get
(
layer_quant_details
[
target
][
"weights"
)
"weight"
]
=
QuantizationArgs
.
parse_obj
(
layer_quant_details
[
target
][
"input"
]
=
quant_config
.
get
(
quant_config
.
get
(
"weights"
))
"input_activations"
)
layer_quant_details
[
target
][
"input"
]
=
QuantizationArgs
.
parse_obj
(
quant_config
.
get
(
"input_activations"
))
return
cls
(
layer_quant_details
=
layer_quant_details
,
ignore
=
ignore
)
return
cls
(
layer_quant_details
=
layer_quant_details
,
ignore
=
ignore
)
...
@@ -58,40 +64,46 @@ class CompressedTensorsConfig(QuantizationConfig):
...
@@ -58,40 +64,46 @@ class CompressedTensorsConfig(QuantizationConfig):
def
get_config_filenames
(
cls
)
->
List
[
str
]:
def
get_config_filenames
(
cls
)
->
List
[
str
]:
return
[]
return
[]
def
_get_schema
(
self
,
weight_quant
:
Dict
,
input_quant
:
Dict
):
def
_is_static_tensor_w8a8
(
self
,
weight_quant
:
BaseModel
,
# TODO: Refactor as additional cases are supported
input_quant
:
BaseModel
)
->
bool
:
is_8_bits
=
weight_quant
.
num_bits
==
input_quant
.
num_bits
==
8
weight_bit
=
weight_quant
.
get
(
"num_bits"
)
is_tensor
=
(
weight_quant
.
strategy
==
input_quant
.
strategy
==
input_bit
=
input_quant
.
get
(
"num_bits"
)
QuantizationStrategy
.
TENSOR
.
value
)
is_symmetric
=
weight_quant
.
symmetric
and
input_quant
.
symmetric
weight_strategy
=
weight_quant
.
get
(
"strategy"
)
is_static
=
not
weight_quant
.
dynamic
and
not
input_quant
.
dynamic
input_strategy
=
input_quant
.
get
(
"strategy"
)
return
is_8_bits
and
is_tensor
and
is_symmetric
and
is_static
weight_symmetric
=
weight_quant
.
get
(
"symmetric"
)
input_symmetric
=
input_quant
.
get
(
"symmetric"
)
def
_is_dynamic_token_w8a8
(
self
,
weight_quant
:
BaseModel
,
input_quant
:
BaseModel
)
->
bool
:
is_8_bits
=
weight_quant
.
num_bits
==
input_quant
.
num_bits
==
8
is_token_tensor
=
(
weight_quant
.
strategy
==
QuantizationStrategy
.
TENSOR
.
value
)
and
(
input_quant
.
strategy
==
QuantizationStrategy
.
TOKEN
.
value
)
is_symmetric
=
weight_quant
.
symmetric
and
input_quant
.
symmetric
is_dynamic
=
not
weight_quant
.
dynamic
and
input_quant
.
dynamic
return
is_8_bits
and
is_token_tensor
and
is_symmetric
and
is_dynamic
def
_get_schema
(
self
,
weight_quant
:
BaseModel
,
input_quant
:
BaseModel
)
->
"CompressedTensorsScheme"
:
if
self
.
_is_static_tensor_w8a8
(
weight_quant
,
input_quant
):
return
CompressedTensorsW8A8StaticTensor
()
is_8_bits
=
weight_bit
==
input_bit
==
8
if
self
.
_is_dynamic_token_w8a8
(
weight_quant
,
input_quant
):
is_tensor
=
weight_strategy
==
input_strategy
==
"tensor"
return
CompressedTensorsW8A8DynamicToken
()
is_symmetric
=
weight_symmetric
and
input_symmetric
if
is_8_bits
and
is_tensor
and
is_symmetric
and
\
raise
NotImplementedError
(
"Scheme not supported."
)
torch
.
cuda
.
is_available
():
# CompressedTensorsW8A8StaticTensor only supports CUDA path for
# now.
return
CompressedTensorsW8A8StaticTensor
()
raise
NotImplementedError
(
"Scheme not supported. Only CUDA, 8-bit static symmtetric "
"per tensor quantization is currently supported"
)
def
get_scheme
(
self
,
layer
:
torch
.
nn
.
Module
)
->
"CompressedTensorsScheme"
:
def
get_scheme
(
self
,
layer
:
torch
.
nn
.
Module
)
->
"CompressedTensorsScheme"
:
# TODO: update with matching function from `compressed_tensors`
layer_type_name
=
find_first_name_or_class_match
(
layer_type_name
=
None
name
=
""
,
layer_name_class
=
type
(
layer
).
__name__
.
lower
()
module
=
layer
,
for
target
in
self
.
layer_quant_details
:
targets
=
self
.
layer_quant_details
.
keys
(),
if
target
.
lower
()
in
layer_name_class
:
check_contains
=
True
)
layer_type_name
=
target
break
if
layer_type_name
is
None
:
if
layer_type_name
is
None
:
raise
ValueError
(
f
"Could not matching target for layer
{
layer
}
"
)
raise
ValueError
(
f
"Could not matching target for layer
{
layer
}
"
)
...
@@ -117,7 +129,9 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
...
@@ -117,7 +129,9 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
**
extra_weight_attrs
):
**
extra_weight_attrs
):
"""
"""
Use the CompressedTensorsScheme associated with each layer to create
Use the CompressedTensorsScheme associated with each layer to create
the necessary parameters for the layer.
the necessary parameters for the layer. See LinearMethodBase for param
details
"""
"""
weight_loader
=
extra_weight_attrs
.
get
(
"weight_loader"
)
weight_loader
=
extra_weight_attrs
.
get
(
"weight_loader"
)
...
@@ -139,7 +153,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
...
@@ -139,7 +153,8 @@ class CompressedTensorsLinearMethod(LinearMethodBase):
"""
"""
Use the output of create_weights and the CompressedTensorsScheme
Use the output of create_weights and the CompressedTensorsScheme
associated with the layer to apply the forward pass with the
associated with the layer to apply the forward pass with the
layer input.
layer input. See LinearMethodBase for param details
"""
"""
if
bias
is
not
None
:
if
bias
is
not
None
:
...
...
vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
View file @
f48954a4
from
.compressed_tensors_scheme
import
CompressedTensorsScheme
# noqa: F401
from
.compressed_tensors_scheme
import
CompressedTensorsScheme
# noqa: F401
from
.compressed_tensors_unquantized
import
(
# noqa: F401
from
.compressed_tensors_unquantized
import
(
# noqa: F401
CompressedTensorsUnquantized
)
CompressedTensorsUnquantized
)
from
.compressed_tensors_w8a8_dynamictoken
import
(
# noqa: F401, E501
CompressedTensorsW8A8DynamicToken
)
from
.compressed_tensors_w8a8_statictensor
import
(
# noqa: F401, E501
from
.compressed_tensors_w8a8_statictensor
import
(
# noqa: F401, E501
CompressedTensorsW8A8StaticTensor
)
CompressedTensorsW8A8StaticTensor
)
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
0 → 100644
View file @
f48954a4
from
typing
import
Callable
,
List
,
Tuple
,
Union
import
torch
from
torch.nn
import
Parameter
from
vllm
import
_custom_ops
as
custom_ops
from
vllm.model_executor.layers.quantization.compressed_tensors.schemes
import
(
CompressedTensorsScheme
)
from
vllm.model_executor.utils
import
set_weight_attrs
__all__
=
[
"CompressedTensorsW8A8DynamicToken"
]
class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
    """Scheme with int8 weights and activations quantized at call time.

    ``create_weights`` registers an int8 ``weight`` together with
    ``weight_scale`` and ``weight_zero_point``; ``apply_weights`` quantizes
    the input with ``scaled_int8_quant`` and multiplies with
    ``cutlass_scaled_mm_dq``.
    """

    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
        """Map a shard id (``"q"``/``"k"``/``"v"`` or an int) to an index."""
        if not isinstance(shard_id, int):
            assert isinstance(shard_id, str)
            qkv_idxs = {"q": 0, "k": 1, "v": 2}
            assert shard_id in qkv_idxs
            shard_id = qkv_idxs[shard_id]
        return shard_id

    def scales_shard_splitter(
            self, param: torch.Tensor, loaded_weight: torch.Tensor,
            shard_id: Union[str, int],
            logical_widths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Select the slice of ``param`` for one shard and expand its scale.

        Returns:
            ``(param[offset:offset + size], loaded_weight.repeat(size))``
            where ``offset`` is the sum of the widths before the shard and
            ``size`` is the shard's own width.
        """
        index = self._shard_id_as_int(shard_id)
        start = sum(logical_widths[:index])
        width = logical_widths[index]
        # Repeat the loaded scale so that it fills the whole slice.
        expanded = loaded_weight.repeat(width)
        return param[start:start + width], expanded

    def create_weights(self, layer: torch.nn.Module,
                       output_partition_sizes: List[int],
                       input_size_per_partition: int,
                       params_dtype: torch.dtype, weight_loader: Callable,
                       **kwargs):
        """Register ``weight``, ``weight_scale`` and ``weight_zero_point``.

        ``weight_scale`` has a single element when there is exactly one
        output partition, otherwise one element per output row.
        ``params_dtype`` and ``**kwargs`` are not used here.
        """
        # NOTE(review): the original comment here said single-value scales
        # must stay on the CPU for performance and CUDA Graphs compatibility
        # (see CompressedTensorsW8A8StaticTensor.create_weights). Nothing in
        # this method pins a device -- confirm whether that still applies.
        total_output_size = sum(output_partition_sizes)
        if len(output_partition_sizes) != 1:
            weight_scale_dim = sum(output_partition_sizes)
        else:
            weight_scale_dim = 1

        weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                      requires_grad=False)
        weight_scale = Parameter(torch.empty(weight_scale_dim,
                                             dtype=torch.float32),
                                 requires_grad=False)
        weight = Parameter(torch.empty(total_output_size,
                                       input_size_per_partition,
                                       dtype=torch.int8),
                           requires_grad=False)

        loader_attr = {"weight_loader": weight_loader}

        # Quantized weight matrix.
        layer.register_parameter("weight", weight)
        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
        set_weight_attrs(weight, loader_attr)
        set_weight_attrs(weight, {"logical_widths": output_partition_sizes})

        # Weight scale(s), loaded through the shard splitter above.
        layer.register_parameter("weight_scale", weight_scale)
        set_weight_attrs(weight_scale, loader_attr)
        set_weight_attrs(
            weight_scale, {
                "shard_splitter": self.scales_shard_splitter,
                "logical_widths": output_partition_sizes
            })

        # Weight zero point.
        layer.register_parameter("weight_zero_point", weight_zero_point)
        set_weight_attrs(weight_zero_point, loader_attr)

    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
        """Quantize ``x`` and multiply it with the transposed layer weight.

        The result is requested in the dtype of ``x``.
        """
        quantized_input, input_scales = custom_ops.scaled_int8_quant(x)
        return custom_ops.cutlass_scaled_mm_dq(quantized_input,
                                               layer.weight.t(),
                                               input_scales,
                                               layer.weight_scale, x.dtype)
vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
View file @
f48954a4
...
@@ -41,46 +41,19 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
...
@@ -41,46 +41,19 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
# TODO: remove zero_point parameters once the configs given remove them
# TODO: remove zero_point parameters once the configs given remove them
# Note on input/weight scales and zero_points
#
# When the scales have a single value, it is required that they be
# on the CPU for 2 reasons,
# 1. Performance:
# When the scales (input_scale/weight_scales) have only a single
# value, we perform a scalar broadcast of that value during the
# quant/dequant operations. The "quant" and the "gemm+dequant"
# kernels accept the Scalar by-value. These tensors are allocated
# on the CPU in order to avoid the GPU-to-CPU copy when passing
# by-value.
#
# 2. CUDA Graphs:
# CUDA Graphs don't support GPU-to-CPU copy operations during
# stream capture.
#
# TODO: zero-points are not supported yet. But we expect a similar
# pattern.
is_tensor_partitioned
=
len
(
output_partition_sizes
)
!=
1
is_tensor_partitioned
=
len
(
output_partition_sizes
)
!=
1
weight_scale_dim
=
sum
(
weight_scale_dim
=
sum
(
output_partition_sizes
)
if
is_tensor_partitioned
else
1
output_partition_sizes
)
if
is_tensor_partitioned
else
1
weight_scale_device
=
"cpu"
if
weight_scale_dim
==
1
else
"cuda"
input_scale
=
Parameter
(
torch
.
empty
(
1
,
input_scale
=
Parameter
(
torch
.
empty
(
1
,
dtype
=
torch
.
float32
),
device
=
"cpu"
,
dtype
=
torch
.
float32
),
requires_grad
=
False
)
requires_grad
=
False
)
input_zero_point
=
Parameter
(
torch
.
empty
(
1
,
input_zero_point
=
Parameter
(
torch
.
empty
(
1
,
dtype
=
torch
.
int8
),
device
=
"cpu"
,
dtype
=
torch
.
int8
),
requires_grad
=
False
)
requires_grad
=
False
)
weight_scale
=
Parameter
(
torch
.
empty
(
weight_scale_dim
,
weight_scale
=
Parameter
(
torch
.
empty
(
weight_scale_dim
,
device
=
weight_scale_device
,
dtype
=
torch
.
float32
),
dtype
=
torch
.
float32
),
requires_grad
=
False
)
requires_grad
=
False
)
weight_zero_point
=
Parameter
(
torch
.
empty
(
1
,
weight_zero_point
=
Parameter
(
torch
.
empty
(
1
,
dtype
=
torch
.
int8
),
device
=
"cpu"
,
dtype
=
torch
.
int8
),
requires_grad
=
False
)
requires_grad
=
False
)
weight
=
Parameter
(
torch
.
empty
(
sum
(
output_partition_sizes
),
weight
=
Parameter
(
torch
.
empty
(
sum
(
output_partition_sizes
),
...
@@ -124,7 +97,7 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
...
@@ -124,7 +97,7 @@ class CompressedTensorsW8A8StaticTensor(CompressedTensorsScheme):
act_scale
=
layer
.
input_scale
act_scale
=
layer
.
input_scale
# Input quantize
# Input quantize
x_q
=
custom_ops
.
static_
scaled_int8_quant
(
x
,
act_scale
[
0
].
item
()
)
x_q
,
_
=
custom_ops
.
scaled_int8_quant
(
x
,
act_scale
)
return
custom_ops
.
cutlass_scaled_mm_dq
(
x_q
,
weight
.
t
(),
act_scale
,
return
custom_ops
.
cutlass_scaled_mm_dq
(
x_q
,
weight
.
t
(),
act_scale
,
weight_scale
,
x
.
dtype
)
weight_scale
,
x
.
dtype
)
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
0 → 100644
View file @
f48954a4
import
re
from
enum
import
Enum
from
typing
import
Any
,
Dict
,
Iterable
,
Optional
from
pydantic
import
BaseModel
,
Field
from
torch.nn
import
Module
class QuantizationType(str, Enum):
    """
    Numeric families a value can be quantized to.

    Members also subclass ``str``, so each one compares equal to its raw
    string value.
    """

    INT = "int"
    FLOAT = "float"
class QuantizationStrategy(str, Enum):
    """
    Available quantization strategy options.

    Members also subclass ``str``, so each one compares equal to its raw
    string value.
    """

    TENSOR = "tensor"
    CHANNEL = "channel"
    GROUP = "group"
    BLOCK = "block"
    TOKEN = "token"
class QuantizationArgs(BaseModel):
    """
    User-facing arguments that define a quantization config for weights
    or activations.

    :param num_bits: quantization bit depth
    :param type: dtype to quantize to, either int or float
    :param symmetric: whether or not the quantization scale is symmetric
    :param group_size: group length to use for the group strategy
    :param strategy: string determining the scope of scale/zero-point to
        apply
    :param block_structure: 2d block structure to use for the block
        strategy, must be of the format "2x4", "8x16", etc.
    :param dynamic: set True to perform dynamic quantization - values
        will not be calibrated during the calibration phase; instead,
        new quantization ranges are observed with every sample during
        inference. Defaults to False for static quantization. Note that
        enabling dynamic quantization will change the default observer
        to a memoryless one
    :param observer: name of the class used to compute the quantization
        params (scale and zero-point)
    :param observer_kwargs: optional kwargs passed directly to the torch
        quantization Observers constructor, excluding quantization range
        or symmetry
    """

    # Field order is kept exactly as declared originally.
    num_bits: int = 8
    type: QuantizationType = QuantizationType.INT
    symmetric: bool = True
    group_size: Optional[int] = None
    strategy: Optional[QuantizationStrategy] = None
    block_structure: Optional[str] = None
    dynamic: bool = False
    observer: str = Field(
        default="minmax",
        description=("The class to use to compute the quantization param - "
                     "scale and zero-point'"),
    )
    # default_factory gives every instance its own fresh dict.
    observer_kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description=(
            "optional dict of kwargs to be passed directly to torch quantization "
            "Observers constructor excluding quantization range or symmetry"),
    )
def find_first_name_or_class_match(
        name: str,
        module: Module,
        targets: Iterable[str],
        check_contains: bool = False) -> Optional[str]:
    """
    Resolve which configured target applies to a given model layer.

    The layer name is tried first; only when that yields nothing is the
    layer's class name tried. Returns None when neither produces a match.

    :param name: layer name
    :param module: torch.nn.Module
    :param targets: list of targets to match the layer against
    :param check_contains: whether or not to do a substring match; it is
        forwarded only to the class-name lookup, never to the layer-name
        lookup
    """
    matched_by_name = _find_first_match(name, targets)
    if matched_by_name:
        return matched_by_name

    # Fall back to the class name, honouring check_contains.
    class_name = module.__class__.__name__
    return _find_first_match(class_name, targets, check_contains)
def
_find_first_match
(
value
:
str
,
targets
:
Iterable
[
str
],
check_contains
:
bool
=
False
)
->
Optional
[
str
]:
"""
Returns first element of target that matches value either
exactly or as a regex after 're:'. If check_contains is set to True,
additionally checks if the target string is contained within the value.
:param value: string to compare the list of targets against
:param targets: list of targets to match the layer against
:param check_contains: whether or not to do a substring match
"""
for
target
in
targets
:
if
target
.
startswith
(
"re:"
):
pattern
=
target
[
3
:]
if
re
.
match
(
pattern
,
value
):
return
target
elif
check_contains
:
if
target
.
lower
()
in
value
.
lower
():
return
target
elif
target
==
value
:
return
target
return
None
Prev
1
…
7
8
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment