sglang · Commit 959735fc (unverified), authored Dec 11, 2024 by Lianmin Zheng, committed via GitHub on Dec 11, 2024

Fix model loader for more quantization formats (#2448)

Parent: f6772394

Showing 3 changed files with 54 additions and 2 deletions (+54, -2):

  python/sglang/srt/models/llama.py   +22  -0
  python/sglang/srt/models/qwen2.py   +20  -0
  python/sglang/srt/server_args.py    +12  -2
python/sglang/srt/models/llama.py  (+22, -0)

@@ -294,6 +294,28 @@ class LlamaModel(nn.Module):
 class LlamaForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     def __init__(
         self,
         config: LlamaConfig,
 ...
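For context on how class attributes like these are typically consumed: bitsandbytes-quantized checkpoints store the q/k/v and gate/up projections as separate tensors, while sglang's Llama modules keep them fused into qkv_proj and gate_up_proj. The sketch below is a hypothetical helper, not code from this commit; it only illustrates the name-to-shard remapping that bitsandbytes_stacked_params_mapping encodes.

# Minimal sketch (assumed loader behavior, not sglang's actual weight loader):
# each checkpoint weight name is rewritten to the fused parameter name plus the
# shard index it should be copied into.

bitsandbytes_stacked_params_mapping = {
    # shard_name, weight_name, index
    "q_proj": ("qkv_proj", 0),
    "k_proj": ("qkv_proj", 1),
    "v_proj": ("qkv_proj", 2),
    "gate_proj": ("gate_up_proj", 0),
    "up_proj": ("gate_up_proj", 1),
}


def resolve_fused_target(checkpoint_name: str):
    """Return (parameter_name, shard_index) for a checkpoint weight name.

    Weights with no stacked counterpart (e.g. ".o_proj." or ".down_proj.")
    load 1:1, signalled here by shard_index = None.
    """
    for shard_name, (fused_name, shard_index) in bitsandbytes_stacked_params_mapping.items():
        if shard_name in checkpoint_name:
            return checkpoint_name.replace(shard_name, fused_name), shard_index
    return checkpoint_name, None


# Example:
# resolve_fused_target("model.layers.0.self_attn.k_proj.weight")
# -> ("model.layers.0.self_attn.qkv_proj.weight", 1)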
python/sglang/srt/models/qwen2.py  (+20, -0)

@@ -267,6 +267,26 @@ class Qwen2Model(nn.Module):
 class Qwen2ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
     def __init__(
         self,
         config: Qwen2Config,
 ...
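The Qwen2 change mirrors the Llama one, minus the column_parallel_weights_modules attribute. As a rough reading of default_bitsandbytes_target_modules: each entry is a dotted substring, and a parameter is a quantization target if its fully qualified name contains one of them. The helper below is an illustration under that assumption, not part of the diff.

# Hypothetical helper showing how the target-module list can be matched against
# parameter names; only the list contents come from this commit.

default_bitsandbytes_target_modules = [
    ".gate_proj.",
    ".down_proj.",
    ".up_proj.",
    ".q_proj.",
    ".k_proj.",
    ".v_proj.",
    ".o_proj.",
]


def is_bnb_target(param_name: str) -> bool:
    """True if the named parameter would be treated as a bitsandbytes target."""
    return any(target in param_name for target in default_bitsandbytes_target_modules)


assert is_bnb_target("model.layers.3.mlp.down_proj.weight")
assert not is_bnb_target("model.embed_tokens.weight")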
python/sglang/srt/server_args.py  (+12, -2)

@@ -283,7 +283,15 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=["auto", "pt", "safetensors", "npcache", "dummy", "gguf"],
+            choices=[
+                "auto",
+                "pt",
+                "safetensors",
+                "npcache",
+                "dummy",
+                "gguf",
+                "bitsandbytes",
+            ],
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
 ...

@@ -294,7 +302,9 @@ class ServerArgs:
             "a numpy cache to speed up the loading. "
             '"dummy" will initialize the weights with random values, '
             "which is mainly for profiling."
-            '"gguf" will load the weights in the gguf format. ',
+            '"gguf" will load the weights in the gguf format. '
+            '"bitsandbytes" will load the weights using bitsandbytes '
+            "quantization.",
         )
         parser.add_argument(
             "--trust-remote-code",
 ...
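The net effect on the CLI is that --load-format now accepts "bitsandbytes" alongside the existing formats. Below is a standalone argparse sketch of the updated option; the choices and help wording are taken from the diff, while the parser object and the "auto" default are placeholders standing in for ServerArgs.load_format, not sglang's actual code.

# Illustrative reconstruction of the updated --load-format option.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--load-format",
    type=str,
    default="auto",  # placeholder for ServerArgs.load_format
    choices=[
        "auto",
        "pt",
        "safetensors",
        "npcache",
        "dummy",
        "gguf",
        "bitsandbytes",
    ],
    help='"bitsandbytes" will load the weights using bitsandbytes quantization.',
)

args = parser.parse_args(["--load-format", "bitsandbytes"])
print(args.load_format)  # bitsandbytes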