Unverified commit 6c046382, authored by Harry Mellor and committed by GitHub
Browse files

Fix per file ruff ignores related to line length (#26262)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 91ac7f76
...@@ -916,8 +916,9 @@ class Qwen3NextDecoderLayer(nn.Module): ...@@ -916,8 +916,9 @@ class Qwen3NextDecoderLayer(nn.Module):
) )
else: else:
assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), ( assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), (
f"shape must be the same {len(hidden_states.shape)}, {len(self.ffn_layer_scale.shape)}" f"shape must be the same {len(hidden_states.shape)}, "
) # noqa: E501 f"{len(self.ffn_layer_scale.shape)}"
)
hidden_states = hidden_states * ( hidden_states = hidden_states * (
self.ffn_layer_scale.to(hidden_states.dtype) + 1 self.ffn_layer_scale.to(hidden_states.dtype) + 1
) )
......
...@@ -255,8 +255,8 @@ def is_rocm_aiter_fp8bmm_enabled() -> bool: ...@@ -255,8 +255,8 @@ def is_rocm_aiter_fp8bmm_enabled() -> bool:
if is_rocm_aiter_fp8bmm_enabled(): if is_rocm_aiter_fp8bmm_enabled():
from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501 # isort: skip from aiter.ops.triton.batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant import ( # noqa: E501
batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm, batched_gemm_a8w8_a_per_token_group_prequant_w_per_batched_tensor_quant as aiter_triton_fp8_bmm, # noqa: E501
) )
def dynamic_per_batched_tensor_quant( def dynamic_per_batched_tensor_quant(
...@@ -1284,8 +1284,10 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ...@@ -1284,8 +1284,10 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1), actual_seq_lens_q=prefill.query_seq_lens.view(-1, 1, 1, 1),
actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1), actual_seq_lens_kv=prefill.query_seq_lens.view(-1, 1, 1, 1),
causal=True, causal=True,
return_lse=True, # do not support False for now # Do not support False for now
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU. return_lse=True,
# Indicates actual_seq_lens are on GPU or CPU.
is_cuda_graph_compatible=True,
) )
if return_softmax_lse: if return_softmax_lse:
return output, lse return output, lse
...@@ -1342,7 +1344,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ...@@ -1342,7 +1344,8 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
), ),
causal=False, causal=False,
return_lse=True, return_lse=True,
is_cuda_graph_compatible=True, # Indicates actual_seq_lens are on GPU or CPU. # Indicates actual_seq_lens are on GPU or CPU.
is_cuda_graph_compatible=True,
) )
def process_weights_after_loading(self, act_dtype: torch.dtype): def process_weights_after_loading(self, act_dtype: torch.dtype):
......
...@@ -872,10 +872,13 @@ def wait_for_engine_startup( ...@@ -872,10 +872,13 @@ def wait_for_engine_startup(
EngineHandshakeMetadata( EngineHandshakeMetadata(
addresses=addresses, addresses=addresses,
parallel_config={ parallel_config={
"data_parallel_master_ip": parallel_config.data_parallel_master_ip, k: getattr(parallel_config, k)
"data_parallel_master_port": parallel_config.data_parallel_master_port, for k in (
"_data_parallel_master_port_list": parallel_config._data_parallel_master_port_list, "data_parallel_master_ip",
"data_parallel_size": parallel_config.data_parallel_size, "data_parallel_master_port",
"_data_parallel_master_port_list",
"data_parallel_size",
)
}, },
) )
) )
......
...@@ -345,13 +345,15 @@ def report_usage_stats( ...@@ -345,13 +345,15 @@ def report_usage_stats(
from vllm.model_executor.model_loader import get_architecture_class_name from vllm.model_executor.model_loader import get_architecture_class_name
parallel_config = vllm_config.parallel_config
usage_message.report_usage( usage_message.report_usage(
get_architecture_class_name(vllm_config.model_config), get_architecture_class_name(vllm_config.model_config),
usage_context, usage_context,
extra_kvs={ extra_kvs={
# Common configuration # Common configuration
"dtype": str(vllm_config.model_config.dtype), "dtype": str(vllm_config.model_config.dtype),
"tensor_parallel_size": vllm_config.parallel_config.tensor_parallel_size, "tensor_parallel_size": parallel_config.tensor_parallel_size,
"block_size": vllm_config.cache_config.block_size, "block_size": vllm_config.cache_config.block_size,
"gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization, "gpu_memory_utilization": vllm_config.cache_config.gpu_memory_utilization,
"kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes, "kv_cache_memory_bytes": vllm_config.cache_config.kv_cache_memory_bytes,
...@@ -362,7 +364,7 @@ def report_usage_stats( ...@@ -362,7 +364,7 @@ def report_usage_stats(
"enable_lora": bool(vllm_config.lora_config), "enable_lora": bool(vllm_config.lora_config),
"enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching, "enable_prefix_caching": vllm_config.cache_config.enable_prefix_caching,
"enforce_eager": vllm_config.model_config.enforce_eager, "enforce_eager": vllm_config.model_config.enforce_eager,
"disable_custom_all_reduce": vllm_config.parallel_config.disable_custom_all_reduce, "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
}, },
) )
......
...@@ -3391,7 +3391,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -3391,7 +3391,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
attn_metadata[ubid][layer_name] = attn_metadata_i attn_metadata[ubid][layer_name] = attn_metadata_i
else: else:
assert type(attn_metadata) is dict assert type(attn_metadata) is dict
attn_metadata_i = attn_group.get_metadata_builder().build_for_cudagraph_capture( metadata_builder = attn_group.get_metadata_builder()
attn_metadata_i = metadata_builder.build_for_cudagraph_capture(
common_attn_metadata common_attn_metadata
) )
for layer_name in attn_group.layer_names: for layer_name in attn_group.layer_names:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment