Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a9944aab
Unverified
Commit
a9944aab
authored
May 15, 2025
by
omahs
Committed by
GitHub
May 15, 2025
Browse files
fix: typos (#18151)
Signed-off-by:
omahs
<
73983677+omahs@users.noreply.github.com
>
parent
a8f5aec2
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
18 additions
and
18 deletions
+18
-18
csrc/attention/attention_kernels.cuh
csrc/attention/attention_kernels.cuh
+2
-2
examples/offline_inference/chat_with_tools.py
examples/offline_inference/chat_with_tools.py
+2
-2
tests/lora/test_lora_huggingface.py
tests/lora/test_lora_huggingface.py
+1
-1
tests/model_executor/weight_utils.py
tests/model_executor/weight_utils.py
+3
-3
vllm/config.py
vllm/config.py
+1
-1
vllm/lora/ops/triton_ops/lora_expand_op.py
vllm/lora/ops/triton_ops/lora_expand_op.py
+1
-1
vllm/model_executor/layers/mamba/mamba_mixer2.py
vllm/model_executor/layers/mamba/mamba_mixer2.py
+1
-1
vllm/model_executor/models/granite_speech.py
vllm/model_executor/models/granite_speech.py
+2
-2
vllm/model_executor/models/phi4mm_audio.py
vllm/model_executor/models/phi4mm_audio.py
+4
-4
vllm/v1/request.py
vllm/v1/request.py
+1
-1
No files found.
csrc/attention/attention_kernels.cuh
View file @
a9944aab
...
@@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(
...
@@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(
// Load the query to registers.
// Load the query to registers.
// Each thread in a thread group has a different part of the query.
// Each thread in a thread group has a different part of the query.
// For example, if the
the
thread group size is 4, then the first thread in
// For example, if the thread group size is 4, then the first thread in
// the group has 0, 4, 8, ... th vectors of the query, and the second thread
// the group has 0, 4, 8, ... th vectors of the query, and the second thread
// has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
// has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
// q is split from a qkv tensor, it may not be contiguous.
// q is split from a qkv tensor, it may not be contiguous.
...
@@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(
...
@@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(
// Load a key to registers.
// Load a key to registers.
// Each thread in a thread group has a different part of the key.
// Each thread in a thread group has a different part of the key.
// For example, if the
the
thread group size is 4, then the first thread in
// For example, if the thread group size is 4, then the first thread in
// the group has 0, 4, 8, ... th vectors of the key, and the second thread
// the group has 0, 4, 8, ... th vectors of the key, and the second thread
// has 1, 5, 9, ... th vectors of the key, and so on.
// has 1, 5, 9, ... th vectors of the key, and so on.
for
(
int
i
=
0
;
i
<
NUM_TOKENS_PER_THREAD_GROUP
;
i
++
)
{
for
(
int
i
=
0
;
i
<
NUM_TOKENS_PER_THREAD_GROUP
;
i
++
)
{
...
...
examples/offline_inference/chat_with_tools.py
View file @
a9944aab
...
@@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
...
@@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
"partly cloudly, with highs in the 90's."
)
"partly cloudly, with highs in the 90's."
)
tool_funtions
=
{
"get_current_weather"
:
get_current_weather
}
tool_fun
c
tions
=
{
"get_current_weather"
:
get_current_weather
}
tools
=
[{
tools
=
[{
"type"
:
"function"
,
"type"
:
"function"
,
...
@@ -122,7 +122,7 @@ messages.append({
...
@@ -122,7 +122,7 @@ messages.append({
# above defined function
# above defined function
tool_calls
=
json
.
loads
(
output
)
tool_calls
=
json
.
loads
(
output
)
tool_answers
=
[
tool_answers
=
[
tool_funtions
[
call
[
'name'
]](
**
call
[
'arguments'
])
for
call
in
tool_calls
tool_fun
c
tions
[
call
[
'name'
]](
**
call
[
'arguments'
])
for
call
in
tool_calls
]
]
# append the answer as a tool message and let the LLM give you an answer
# append the answer as a tool message and let the LLM give you an answer
...
...
tests/lora/test_lora_huggingface.py
View file @
a9944aab
...
@@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
...
@@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
lora_path
=
get_adapter_absolute_path
(
lora_name
)
lora_path
=
get_adapter_absolute_path
(
lora_name
)
# lora loading should work for either absolute path and hugg
g
ingface id.
# lora loading should work for either absolute path and huggingface id.
peft_helper
=
PEFTHelper
.
from_local_dir
(
lora_path
,
4096
)
peft_helper
=
PEFTHelper
.
from_local_dir
(
lora_path
,
4096
)
lora_model
=
LoRAModel
.
from_local_checkpoint
(
lora_model
=
LoRAModel
.
from_local_checkpoint
(
lora_path
,
lora_path
,
...
...
tests/model_executor/weight_utils.py
View file @
a9944aab
...
@@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
...
@@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
try
:
try
:
# enable hf hub transfer if available
# enable hf hub transfer if available
import
hf_transfer
# type: ignore # noqa
import
hf_transfer
# type: ignore # noqa
HF_TRANFER_ACTIVE
=
True
HF_TRAN
S
FER_ACTIVE
=
True
except
ImportError
:
except
ImportError
:
HF_TRANFER_ACTIVE
=
False
HF_TRAN
S
FER_ACTIVE
=
False
assert
(
huggingface_hub
.
constants
.
HF_HUB_ENABLE_HF_TRANSFER
==
assert
(
huggingface_hub
.
constants
.
HF_HUB_ENABLE_HF_TRANSFER
==
HF_TRANFER_ACTIVE
)
HF_TRAN
S
FER_ACTIVE
)
def
test_download_weights_from_hf
():
def
test_download_weights_from_hf
():
...
...
vllm/config.py
View file @
a9944aab
...
@@ -297,7 +297,7 @@ class ModelConfig:
...
@@ -297,7 +297,7 @@ class ModelConfig:
- 1K -> 1024
\n
- 1K -> 1024
\n
- 25.6k -> 25,600"""
- 25.6k -> 25,600"""
spec_target_max_model_len
:
Optional
[
int
]
=
None
spec_target_max_model_len
:
Optional
[
int
]
=
None
"""Specify the
the
maximum length for spec decoding draft models."""
"""Specify the maximum length for spec decoding draft models."""
quantization
:
Optional
[
QuantizationMethods
]
=
None
quantization
:
Optional
[
QuantizationMethods
]
=
None
"""Method used to quantize the weights. If `None`, we first check the
"""Method used to quantize the weights. If `None`, we first check the
`quantization_config` attribute in the model config file. If that is
`quantization_config` attribute in the model config file. If that is
...
...
vllm/lora/ops/triton_ops/lora_expand_op.py
View file @
a9944aab
...
@@ -153,7 +153,7 @@ def _lora_expand(
...
@@ -153,7 +153,7 @@ def _lora_expand(
lora_token_start_loc (torch.Tensor): A cumulative sum of
lora_token_start_loc (torch.Tensor): A cumulative sum of
num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
lora_token_start_loc[i], along with num_tokens_per_lora[i]
lora_token_start_loc[i], along with num_tokens_per_lora[i]
identifies the
the
region in token_indices_sorted_by_lora_ids that
identifies the region in token_indices_sorted_by_lora_ids that
LoRA lora_ids[i] should process.
LoRA lora_ids[i] should process.
lora_ids (torch.Tensor): LoRA ids to process.
lora_ids (torch.Tensor): LoRA ids to process.
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates
...
...
vllm/model_executor/layers/mamba/mamba_mixer2.py
View file @
a9944aab
...
@@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader(
...
@@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader(
)
->
LoaderFunction
:
)
->
LoaderFunction
:
"""Create a weight loader for mamba v2. This ensures that the projections
"""Create a weight loader for mamba v2. This ensures that the projections
are correctly sharded so that they can be split into x, B, C. It also
are correctly sharded so that they can be split into x, B, C. It also
ensures th
e the
all the groups corresponding to a head shard is placed
ensures th
at
all the groups corresponding to a head shard is placed
together with it.
together with it.
"""
"""
...
...
vllm/model_executor/models/granite_speech.py
View file @
a9944aab
...
@@ -21,7 +21,7 @@
...
@@ -21,7 +21,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
"""Inference-only IBM Granite spee
e
ch model."""
"""Inference-only IBM Granite speech model."""
import
math
import
math
from
collections.abc
import
Iterable
,
Mapping
from
collections.abc
import
Iterable
,
Mapping
from
typing
import
Optional
,
TypedDict
,
Union
from
typing
import
Optional
,
TypedDict
,
Union
...
@@ -626,7 +626,7 @@ class GraniteSpeechForConditionalGeneration(
...
@@ -626,7 +626,7 @@ class GraniteSpeechForConditionalGeneration(
audio_embed_sizes
:
torch
.
Tensor
,
audio_embed_sizes
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
"""Calculate the input features mask, which will generally be used
"""Calculate the input features mask, which will generally be used
to mask the
the
padded features for all entries in the batch except
to mask the padded features for all entries in the batch except
for those with the most audio features.
for those with the most audio features.
Args:
Args:
...
...
vllm/model_executor/models/phi4mm_audio.py
View file @
a9944aab
...
@@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module):
...
@@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module):
if set to True, use GLULinear module,
if set to True, use GLULinear module,
otherwise, used GLUPointWiseConv module.
otherwise, used GLUPointWiseConv module.
default to False.
default to False.
attention_inn
n
er_dim: int, optional
attention_inner_dim: int, optional
if equal to -1, attention dim for linears k/q/v is
if equal to -1, attention dim for linears k/q/v is
equal to d_model. otherwise attention_inn
n
er_dim is used.
equal to d_model. otherwise attention_inner_dim is used.
default -1.
default -1.
attention_glu_type: str, optional
attention_glu_type: str, optional
activation function for glu used in the multihead attention,
activation function for glu used in the multihead attention,
...
@@ -148,7 +148,7 @@ class ConformerEncoderLayer(nn.Module):
...
@@ -148,7 +148,7 @@ class ConformerEncoderLayer(nn.Module):
conv_glu_type
=
"sigmoid"
,
conv_glu_type
=
"sigmoid"
,
bias_in_glu
=
True
,
bias_in_glu
=
True
,
linear_glu_in_convm
=
False
,
linear_glu_in_convm
=
False
,
attention_inn
n
er_dim
=-
1
,
attention_inner_dim
=-
1
,
attention_glu_type
=
"swish"
,
attention_glu_type
=
"swish"
,
activation_checkpointing
=
""
,
activation_checkpointing
=
""
,
export
=
False
,
export
=
False
,
...
@@ -169,7 +169,7 @@ class ConformerEncoderLayer(nn.Module):
...
@@ -169,7 +169,7 @@ class ConformerEncoderLayer(nn.Module):
n_head
,
n_head
,
d_model
,
d_model
,
dropout_rate
,
dropout_rate
,
attention_inn
n
er_dim
,
attention_inner_dim
,
attention_glu_type
,
attention_glu_type
,
bias_in_glu
,
bias_in_glu
,
use_pt_scaled_dot_product_attention
=
use_pt_scaled_dot_product_attention
=
...
...
vllm/v1/request.py
View file @
a9944aab
...
@@ -72,7 +72,7 @@ class Request:
...
@@ -72,7 +72,7 @@ class Request:
assert
len
(
self
.
mm_inputs
)
==
len
(
self
.
mm_hashes
)
assert
len
(
self
.
mm_inputs
)
==
len
(
self
.
mm_hashes
)
# Read-only views
# Read-only views
# Prevent directly appending to
the
these lists since
# Prevent directly appending to these lists since
# they should also be updated simultaneously.
# they should also be updated simultaneously.
self
.
output_token_ids
=
ConstantList
(
self
.
_output_token_ids
)
self
.
output_token_ids
=
ConstantList
(
self
.
_output_token_ids
)
self
.
all_token_ids
=
ConstantList
(
self
.
_all_token_ids
)
self
.
all_token_ids
=
ConstantList
(
self
.
_all_token_ids
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment