Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a9944aab
Unverified
Commit
a9944aab
authored
May 15, 2025
by
omahs
Committed by
GitHub
May 15, 2025
Browse files
fix: typos (#18151)
Signed-off-by:
omahs
<
73983677+omahs@users.noreply.github.com
>
parent
a8f5aec2
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
18 additions
and
18 deletions
+18
-18
csrc/attention/attention_kernels.cuh
csrc/attention/attention_kernels.cuh
+2
-2
examples/offline_inference/chat_with_tools.py
examples/offline_inference/chat_with_tools.py
+2
-2
tests/lora/test_lora_huggingface.py
tests/lora/test_lora_huggingface.py
+1
-1
tests/model_executor/weight_utils.py
tests/model_executor/weight_utils.py
+3
-3
vllm/config.py
vllm/config.py
+1
-1
vllm/lora/ops/triton_ops/lora_expand_op.py
vllm/lora/ops/triton_ops/lora_expand_op.py
+1
-1
vllm/model_executor/layers/mamba/mamba_mixer2.py
vllm/model_executor/layers/mamba/mamba_mixer2.py
+1
-1
vllm/model_executor/models/granite_speech.py
vllm/model_executor/models/granite_speech.py
+2
-2
vllm/model_executor/models/phi4mm_audio.py
vllm/model_executor/models/phi4mm_audio.py
+4
-4
vllm/v1/request.py
vllm/v1/request.py
+1
-1
No files found.
csrc/attention/attention_kernels.cuh
View file @
a9944aab
...
...
@@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(
// Load the query to registers.
// Each thread in a thread group has a different part of the query.
// For example, if the
the
thread group size is 4, then the first thread in
// For example, if the thread group size is 4, then the first thread in
// the group has 0, 4, 8, ... th vectors of the query, and the second thread
// has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
// q is split from a qkv tensor, it may not be contiguous.
...
...
@@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(
// Load a key to registers.
// Each thread in a thread group has a different part of the key.
// For example, if the
the
thread group size is 4, then the first thread in
// For example, if the thread group size is 4, then the first thread in
// the group has 0, 4, 8, ... th vectors of the key, and the second thread
// has 1, 5, 9, ... th vectors of the key, and so on.
for
(
int
i
=
0
;
i
<
NUM_TOKENS_PER_THREAD_GROUP
;
i
++
)
{
...
...
examples/offline_inference/chat_with_tools.py
View file @
a9944aab
...
...
@@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
"partly cloudly, with highs in the 90's."
)
tool_funtions
=
{
"get_current_weather"
:
get_current_weather
}
tool_fun
c
tions
=
{
"get_current_weather"
:
get_current_weather
}
tools
=
[{
"type"
:
"function"
,
...
...
@@ -122,7 +122,7 @@ messages.append({
# above defined function
tool_calls
=
json
.
loads
(
output
)
tool_answers
=
[
tool_funtions
[
call
[
'name'
]](
**
call
[
'arguments'
])
for
call
in
tool_calls
tool_fun
c
tions
[
call
[
'name'
]](
**
call
[
'arguments'
])
for
call
in
tool_calls
]
# append the answer as a tool message and let the LLM give you an answer
...
...
tests/lora/test_lora_huggingface.py
View file @
a9944aab
...
...
@@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
lora_path
=
get_adapter_absolute_path
(
lora_name
)
# lora loading should work for either absolute path and hugg
g
ingface id.
# lora loading should work for either absolute path and huggingface id.
peft_helper
=
PEFTHelper
.
from_local_dir
(
lora_path
,
4096
)
lora_model
=
LoRAModel
.
from_local_checkpoint
(
lora_path
,
...
...
tests/model_executor/weight_utils.py
View file @
a9944aab
...
...
@@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
try
:
# enable hf hub transfer if available
import
hf_transfer
# type: ignore # noqa
HF_TRANFER_ACTIVE
=
True
HF_TRAN
S
FER_ACTIVE
=
True
except
ImportError
:
HF_TRANFER_ACTIVE
=
False
HF_TRAN
S
FER_ACTIVE
=
False
assert
(
huggingface_hub
.
constants
.
HF_HUB_ENABLE_HF_TRANSFER
==
HF_TRANFER_ACTIVE
)
HF_TRAN
S
FER_ACTIVE
)
def
test_download_weights_from_hf
():
...
...
vllm/config.py
View file @
a9944aab
...
...
@@ -297,7 +297,7 @@ class ModelConfig:
- 1K -> 1024
\n
- 25.6k -> 25,600"""
spec_target_max_model_len
:
Optional
[
int
]
=
None
"""Specify the
the
maximum length for spec decoding draft models."""
"""Specify the maximum length for spec decoding draft models."""
quantization
:
Optional
[
QuantizationMethods
]
=
None
"""Method used to quantize the weights. If `None`, we first check the
`quantization_config` attribute in the model config file. If that is
...
...
vllm/lora/ops/triton_ops/lora_expand_op.py
View file @
a9944aab
...
...
@@ -153,7 +153,7 @@ def _lora_expand(
lora_token_start_loc (torch.Tensor): A cumulative sum of
num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that
lora_token_start_loc[i], along with num_tokens_per_lora[i]
identifies the
the
region in token_indices_sorted_by_lora_ids that
identifies the region in token_indices_sorted_by_lora_ids that
LoRA lora_ids[i] should process.
lora_ids (torch.Tensor): LoRA ids to process.
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates
...
...
vllm/model_executor/layers/mamba/mamba_mixer2.py
View file @
a9944aab
...
...
@@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader(
)
->
LoaderFunction
:
"""Create a weight loader for mamba v2. This ensures that the projections
are correctly sharded so that they can be split into x, B, C. It also
ensures th
e the
all the groups corresponding to a head shard is placed
ensures th
at
all the groups corresponding to a head shard is placed
together with it.
"""
...
...
vllm/model_executor/models/granite_speech.py
View file @
a9944aab
...
...
@@ -21,7 +21,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only IBM Granite spee
e
ch model."""
"""Inference-only IBM Granite speech model."""
import
math
from
collections.abc
import
Iterable
,
Mapping
from
typing
import
Optional
,
TypedDict
,
Union
...
...
@@ -626,7 +626,7 @@ class GraniteSpeechForConditionalGeneration(
audio_embed_sizes
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
"""Calculate the input features mask, which will generally be used
to mask the
the
padded features for all entries in the batch except
to mask the padded features for all entries in the batch except
for those with the most audio features.
Args:
...
...
vllm/model_executor/models/phi4mm_audio.py
View file @
a9944aab
...
...
@@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module):
if set to True, use GLULinear module,
otherwise, used GLUPointWiseConv module.
default to False.
attention_inn
n
er_dim: int, optional
attention_inner_dim: int, optional
if equal to -1, attention dim for linears k/q/v is
equal to d_model. otherwise attention_inn
n
er_dim is used.
equal to d_model. otherwise attention_inner_dim is used.
default -1.
attention_glu_type: str, optional
activation function for glu used in the multihead attention,
...
...
@@ -148,7 +148,7 @@ class ConformerEncoderLayer(nn.Module):
conv_glu_type
=
"sigmoid"
,
bias_in_glu
=
True
,
linear_glu_in_convm
=
False
,
attention_inn
n
er_dim
=-
1
,
attention_inner_dim
=-
1
,
attention_glu_type
=
"swish"
,
activation_checkpointing
=
""
,
export
=
False
,
...
...
@@ -169,7 +169,7 @@ class ConformerEncoderLayer(nn.Module):
n_head
,
d_model
,
dropout_rate
,
attention_inn
n
er_dim
,
attention_inner_dim
,
attention_glu_type
,
bias_in_glu
,
use_pt_scaled_dot_product_attention
=
...
...
vllm/v1/request.py
View file @
a9944aab
...
...
@@ -72,7 +72,7 @@ class Request:
assert
len
(
self
.
mm_inputs
)
==
len
(
self
.
mm_hashes
)
# Read-only views
# Prevent directly appending to
the
these lists since
# Prevent directly appending to these lists since
# they should also be updated simultaneously.
self
.
output_token_ids
=
ConstantList
(
self
.
_output_token_ids
)
self
.
all_token_ids
=
ConstantList
(
self
.
_all_token_ids
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment