Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5fe6bf29
Unverified
Commit
5fe6bf29
authored
Jan 21, 2025
by
Nicolò Lucchesi
Committed by
GitHub
Jan 21, 2025
Browse files
[BugFix] Fix GGUF tp>1 when vocab_size is not divisible by 64 (#12230)
Signed-off-by:
NickLucche
<
nlucches@redhat.com
>
parent
d4b62d46
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
2 deletions
+12
-2
tests/models/decoder_only/language/test_gguf.py
tests/models/decoder_only/language/test_gguf.py
+10
-0
vllm/model_executor/layers/vocab_parallel_embedding.py
vllm/model_executor/layers/vocab_parallel_embedding.py
+2
-2
No files found.
tests/models/decoder_only/language/test_gguf.py
View file @
5fe6bf29
...
...
@@ -66,12 +66,20 @@ STARCODER_CONFIG = GGUFTestConfig(
gguf_filename
=
"starcoder2-3b.Q6_K.gguf"
,
)
DOLPHIN_CONFIG
=
GGUFTestConfig
(
# Test VocabParallelEmbedding sharding issue.
original_model
=
"cognitivecomputations/TinyDolphin-2.8-1.1b"
,
gguf_repo
=
"tsunemoto/TinyDolphin-2.8-1.1b-GGUF"
,
gguf_filename
=
"tinydolphin-2.8-1.1b.Q6_K.gguf"
,
)
MODELS
=
[
LLAMA_CONFIG
,
QWEN2_CONFIG
,
PHI3_CONFIG
,
GPT2_CONFIG
,
STABLELM_CONFIG
,
DOLPHIN_CONFIG
# STARCODER_CONFIG, # broken
]
...
...
@@ -107,6 +115,7 @@ def test_models(
# Run unquantized model.
with
vllm_runner
(
model_name
=
model
.
original_model
,
enforce_eager
=
True
,
# faster tests
dtype
=
dtype
,
max_model_len
=
MAX_MODEL_LEN
,
tensor_parallel_size
=
tp_size
)
as
original_model
:
...
...
@@ -115,6 +124,7 @@ def test_models(
# Run gguf model.
with
vllm_runner
(
model_name
=
model
.
gguf_model
,
enforce_eager
=
True
,
tokenizer_name
=
model
.
original_model
,
dtype
=
dtype
,
max_model_len
=
MAX_MODEL_LEN
,
...
...
vllm/model_executor/layers/vocab_parallel_embedding.py
View file @
5fe6bf29
...
...
@@ -355,7 +355,7 @@ class VocabParallelEmbedding(torch.nn.Module):
elif
isinstance
(
param
,
UninitializedParameter
):
shape
=
list
(
loaded_weight
.
shape
)
if
output_dim
is
not
None
:
shape
[
output_dim
]
=
shape
[
output_dim
]
//
self
.
tp_size
shape
[
output_dim
]
=
self
.
num_embeddings_per_partition
param
.
materialize
(
tuple
(
shape
),
dtype
=
loaded_weight
.
dtype
)
# If parameter does not have output dim, then it should
...
...
@@ -381,7 +381,7 @@ class VocabParallelEmbedding(torch.nn.Module):
else
:
assert
loaded_weight
.
shape
[
output_dim
]
==
self
.
org_vocab_size
# Copy the data.
# Copy the data. Select chunk corresponding to current shard.
loaded_weight
=
loaded_weight
.
narrow
(
output_dim
,
start_idx
,
shard_size
)
if
current_platform
.
is_hpu
():
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment