Unverified Commit 6a895197 authored by Jiayi Yan's avatar Jiayi Yan Committed by GitHub
Browse files

[Bugfix][CI] fix typos (#34934)


Signed-off-by: default avatar1195343015 <1195343015@qq.com>
Signed-off-by: default avatarJiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8c760b6a
...@@ -162,7 +162,7 @@ Ns = [1024] ...@@ -162,7 +162,7 @@ Ns = [1024]
TOPKs = [4, 1] TOPKs = [4, 1]
Es = [32] Es = [32]
DTYPEs = [torch.bfloat16] DTYPEs = [torch.bfloat16]
FUSED_MOE_CHUNK_SIZEs = [None, 16] FUSED_MOE_CHUNK_SIZES = [None, 16]
def is_nyi_config(config: Config) -> bool: def is_nyi_config(config: Config) -> bool:
...@@ -192,7 +192,7 @@ def generate_valid_test_cases( ...@@ -192,7 +192,7 @@ def generate_valid_test_cases(
DTYPEs, DTYPEs,
MK_QUANT_CONFIGS, MK_QUANT_CONFIGS,
product(prepare_finalize_types, MK_FUSED_EXPERT_TYPES), product(prepare_finalize_types, MK_FUSED_EXPERT_TYPES),
FUSED_MOE_CHUNK_SIZEs, FUSED_MOE_CHUNK_SIZES,
): ):
total = total + 1 total = total + 1
...@@ -266,7 +266,7 @@ def test_modular_kernel_combinations_multigpu( ...@@ -266,7 +266,7 @@ def test_modular_kernel_combinations_multigpu(
if cuda_device_count_stateless() < world_size: if cuda_device_count_stateless() < world_size:
pytest.skip( pytest.skip(
f"Not enough GPUs available to run, got " f"Not enough GPUs available to run, got "
f"{cuda_device_count_stateless()} exepected " f"{cuda_device_count_stateless()} expected "
f"{world_size}." f"{world_size}."
) )
......
...@@ -87,7 +87,7 @@ MSGS = [ ...@@ -87,7 +87,7 @@ MSGS = [
{ {
"role": "user", "role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs " "content": "Could you please rewrite the below article? \n\n My English needs "
"improvving, maybe I make errors.", "improving, maybe I make errors.",
}, },
{ {
"role": "assistant", "role": "assistant",
...@@ -98,7 +98,7 @@ MSGS = [ ...@@ -98,7 +98,7 @@ MSGS = [
"type": "function", "type": "function",
"function": { "function": {
"name": "rewrite", "name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe ' "arguments": '{"text":"My English needs improving, maybe '
'I make errors."}', 'I make errors."}',
}, },
} }
......
...@@ -14,7 +14,7 @@ MAX_MODEL_LEN = 512 ...@@ -14,7 +14,7 @@ MAX_MODEL_LEN = 512
# Example from https://huggingface.co/BAAI/bge-m3 # Example from https://huggingface.co/BAAI/bge-m3
sentences_1 = ["What is BGE M3?", "Defination of BM25"] sentences_1 = ["What is BGE M3?", "Definition of BM25"]
sentences_2 = [ sentences_2 = [
"BGE M3 is an embedding model supporting dense retrieval, " "BGE M3 is an embedding model supporting dense retrieval, "
"lexical matching and multi-vector interaction.", "lexical matching and multi-vector interaction.",
......
...@@ -719,7 +719,7 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -719,7 +719,7 @@ def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
# Convert to tuple or None # Convert to tuple or None
all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None
# Include hiden_states for compatibility with hidden_states_to_seq_logprobs() # Include hidden_states for compatibility with hidden_states_to_seq_logprobs()
return BaseModelOutputWithPast( return BaseModelOutputWithPast(
last_hidden_state=hidden_states, last_hidden_state=hidden_states,
past_key_values=past_key_values, past_key_values=past_key_values,
...@@ -1226,7 +1226,7 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner": ...@@ -1226,7 +1226,7 @@ def voxtral_patch_hf_runner(hf_model: "HfRunner") -> "HfRunner":
dicts (accepting ``url``, ``path``, or ``base64`` audio) rather than dicts (accepting ``url``, ``path``, or ``base64`` audio) rather than
the standard ``processor(text=, audio=, sampling_rate=)`` interface. the standard ``processor(text=, audio=, sampling_rate=)`` interface.
2. HfRunner.get_inputs cannot handle multi-audio per prompt because it 2. HfRunner.get_inputs cannot handle multi-audio per prompt because it
mis-unpacks ``[(arr1, sr1), (arr2, sr2)]`` via a ``len == 2`` check. incorrectly unpacks ``[(arr1, sr1), (arr2, sr2)]`` via a ``len == 2`` check.
We override ``get_inputs`` to build conversation dicts and call We override ``get_inputs`` to build conversation dicts and call
``apply_chat_template`` directly, bypassing both issues. We also wrap ``apply_chat_template`` directly, bypassing both issues. We also wrap
......
...@@ -25,7 +25,7 @@ def set_test_environment(): ...@@ -25,7 +25,7 @@ def set_test_environment():
os.environ["FLASHINFER_NVCC_THREADS"] = "16" os.environ["FLASHINFER_NVCC_THREADS"] = "16"
# Overide the backbone layers to 4 for faster startup # Override the backbone layers to 4 for faster startup
HF_OVERRIDE_TEXT = { HF_OVERRIDE_TEXT = {
"num_layers": 4, "num_layers": 4,
"num_hidden_layers": 4, "num_hidden_layers": 4,
......
...@@ -206,8 +206,8 @@ def test_resolve_chat_template_kwargs(sample_json_schema, model, expected_kwargs ...@@ -206,8 +206,8 @@ def test_resolve_chat_template_kwargs(sample_json_schema, model, expected_kwargs
chat_template_kwargs = { chat_template_kwargs = {
# both unused # both unused
"unsed_kwargs_1": 123, "unused_kwargs_1": 123,
"unsed_kwargs_2": "abc", "unused_kwargs_2": "abc",
# should not appear # should not appear
"chat_template": "{% Hello world! %}", "chat_template": "{% Hello world! %}",
"tokenize": True, "tokenize": True,
......
...@@ -853,7 +853,7 @@ def test_vllm_config_defaults_are_none(): ...@@ -853,7 +853,7 @@ def test_vllm_config_defaults_are_none():
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_id", "compiliation_config", "optimization_level"), ("model_id", "compilation_config", "optimization_level"),
[ [
( (
None, None,
...@@ -895,7 +895,7 @@ def test_vllm_config_defaults_are_none(): ...@@ -895,7 +895,7 @@ def test_vllm_config_defaults_are_none():
("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O3), ("RedHatAI/DeepSeek-V2.5-1210-FP8", CompilationConfig(), OptimizationLevel.O3),
], ],
) )
def test_vllm_config_defaults(model_id, compiliation_config, optimization_level): def test_vllm_config_defaults(model_id, compilation_config, optimization_level):
"""Test that optimization-level defaults are correctly applied.""" """Test that optimization-level defaults are correctly applied."""
model_config = None model_config = None
...@@ -903,12 +903,12 @@ def test_vllm_config_defaults(model_id, compiliation_config, optimization_level) ...@@ -903,12 +903,12 @@ def test_vllm_config_defaults(model_id, compiliation_config, optimization_level)
model_config = ModelConfig(model_id) model_config = ModelConfig(model_id)
vllm_config = VllmConfig( vllm_config = VllmConfig(
model_config=model_config, model_config=model_config,
compilation_config=compiliation_config, compilation_config=compilation_config,
optimization_level=optimization_level, optimization_level=optimization_level,
) )
else: else:
vllm_config = VllmConfig( vllm_config = VllmConfig(
compilation_config=compiliation_config, compilation_config=compilation_config,
optimization_level=optimization_level, optimization_level=optimization_level,
) )
# Use the global optimization level defaults # Use the global optimization level defaults
......
...@@ -106,7 +106,7 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser): ...@@ -106,7 +106,7 @@ def test_extract_tool_calls_no_tools(seed_oss_tool_parser):
@pytest.mark.parametrize( @pytest.mark.parametrize(
ids=[ ids=[
"tool_call_0_thinking_budget", "tool_call_0_thinking_budget",
"tool_call_512_thinkg_budget", "tool_call_512_thinking_budget",
"tool_call_unlimited_thinking_budget", "tool_call_unlimited_thinking_budget",
], ],
argnames=["model_output", "expected_tool_calls", "expected_content"], argnames=["model_output", "expected_tool_calls", "expected_content"],
...@@ -308,7 +308,7 @@ def stream_delta_message_generator( ...@@ -308,7 +308,7 @@ def stream_delta_message_generator(
@pytest.mark.parametrize( @pytest.mark.parametrize(
ids=[ ids=[
"tool_call_0_thinking_budget", "tool_call_0_thinking_budget",
"tool_call_512_thinkg_budget", "tool_call_512_thinking_budget",
"tool_call_unlimited_thinking_budget", "tool_call_unlimited_thinking_budget",
], ],
argnames=["model_output", "expected_tool_calls", "expected_content"], argnames=["model_output", "expected_tool_calls", "expected_content"],
......
...@@ -34,10 +34,10 @@ def test_list_filtered_repo_files( ...@@ -34,10 +34,10 @@ def test_list_filtered_repo_files(
subfolder.mkdir() subfolder.mkdir()
(path_tmp_dir / "json_file.json").touch() (path_tmp_dir / "json_file.json").touch()
(path_tmp_dir / "correct_2.txt").touch() (path_tmp_dir / "correct_2.txt").touch()
(path_tmp_dir / "uncorrect.txt").touch() (path_tmp_dir / "incorrect.txt").touch()
(path_tmp_dir / "uncorrect.jpeg").touch() (path_tmp_dir / "incorrect.jpeg").touch()
(subfolder / "correct.txt").touch() (subfolder / "correct.txt").touch()
(subfolder / "uncorrect_sub.txt").touch() (subfolder / "incorrect_sub.txt").touch()
def _glob_path() -> list[str]: def _glob_path() -> list[str]:
return [ return [
...@@ -86,7 +86,7 @@ def test_one_filtered_repo_files(allow_patterns: list[str], expected_bool: bool) ...@@ -86,7 +86,7 @@ def test_one_filtered_repo_files(allow_patterns: list[str], expected_bool: bool)
path_tmp_dir = Path(tmp_dir) path_tmp_dir = Path(tmp_dir)
subfolder = path_tmp_dir / "subfolder" subfolder = path_tmp_dir / "subfolder"
subfolder.mkdir() subfolder.mkdir()
(path_tmp_dir / "uncorrect.jpeg").touch() (path_tmp_dir / "incorrect.jpeg").touch()
(subfolder / "correct.txt").touch() (subfolder / "correct.txt").touch()
def _glob_path() -> list[str]: def _glob_path() -> list[str]:
......
...@@ -308,7 +308,7 @@ def test_free_kv_cache_block_queue_append_n(): ...@@ -308,7 +308,7 @@ def test_free_kv_cache_block_queue_append_n():
# Create an empty FreeKVCacheBlockQueue # Create an empty FreeKVCacheBlockQueue
invalid_queue = FreeKVCacheBlockQueue([]) invalid_queue = FreeKVCacheBlockQueue([])
# set prev_free_block to None and this will cause assertation in append_n # set prev_free_block to None and this will cause assertion in append_n
invalid_queue.fake_free_list_tail.prev_free_block = None invalid_queue.fake_free_list_tail.prev_free_block = None
with pytest.raises(AssertionError): with pytest.raises(AssertionError):
# Append 1 block # Append 1 block
......
...@@ -2304,22 +2304,22 @@ def test_block_lookup_cache_single_block_per_key(): ...@@ -2304,22 +2304,22 @@ def test_block_lookup_cache_single_block_per_key():
assert cache.get_one_block(key0) is block0 assert cache.get_one_block(key0) is block0
assert cache.get_one_block(key1) is block1 assert cache.get_one_block(key1) is block1
assert cache.get_one_block(key2) is None assert cache.get_one_block(key2) is None
# No block poped due to block_id mismatch # No block popped due to block_id mismatch
assert cache.pop(key0, 100) is None assert cache.pop(key0, 100) is None
assert cache.get_one_block(key0) is block0 assert cache.get_one_block(key0) is block0
assert cache.get_one_block(key1) is block1 assert cache.get_one_block(key1) is block1
assert cache.get_one_block(key2) is None assert cache.get_one_block(key2) is None
# block poped with (key0, block ID 0) # block popped with (key0, block ID 0)
assert cache.pop(key0, 0) is block0 assert cache.pop(key0, 0) is block0
assert cache.get_one_block(key0) is None assert cache.get_one_block(key0) is None
assert cache.get_one_block(key1) is block1 assert cache.get_one_block(key1) is block1
assert cache.get_one_block(key2) is None assert cache.get_one_block(key2) is None
# No block poped due to block_id mismatch # No block popped due to block_id mismatch
assert cache.pop(key0, 1) is None assert cache.pop(key0, 1) is None
assert cache.get_one_block(key0) is None assert cache.get_one_block(key0) is None
assert cache.get_one_block(key1) is block1 assert cache.get_one_block(key1) is block1
assert cache.get_one_block(key2) is None assert cache.get_one_block(key2) is None
# block poped with (key1, block ID 1) # block popped with (key1, block ID 1)
assert cache.pop(key1, 1) is block1 assert cache.pop(key1, 1) is block1
assert cache.get_one_block(key0) is None assert cache.get_one_block(key0) is None
assert cache.get_one_block(key1) is None assert cache.get_one_block(key1) is None
......
...@@ -140,7 +140,7 @@ def _mock_draft_token_ids( ...@@ -140,7 +140,7 @@ def _mock_draft_token_ids(
return DraftTokenIds(req_ids=request_ids, draft_token_ids=sampled_token_ids) return DraftTokenIds(req_ids=request_ids, draft_token_ids=sampled_token_ids)
def _chech_valid_scheduler_output( def _check_valid_scheduler_output(
scheduler_output: SchedulerOutput, scheduler_output: SchedulerOutput,
seen_request_ids: set[str], seen_request_ids: set[str],
seen_mm_hashes: set[str], seen_mm_hashes: set[str],
...@@ -242,7 +242,7 @@ def test_priority_scheduling_blast( ...@@ -242,7 +242,7 @@ def test_priority_scheduling_blast(
) )
scheduler.add_request(req) scheduler.add_request(req)
scheduler_output = scheduler.schedule() scheduler_output = scheduler.schedule()
_chech_valid_scheduler_output( _check_valid_scheduler_output(
scheduler_output, seen_request_ids, seen_mm_hashes scheduler_output, seen_request_ids, seen_mm_hashes
) )
model_output = _mock_execute_model( model_output = _mock_execute_model(
......
...@@ -1116,7 +1116,7 @@ def _step_until_done( ...@@ -1116,7 +1116,7 @@ def _step_until_done(
def _step_until_kv_transfer_finished(scheduler: Scheduler, req_ids: list[str]): def _step_until_kv_transfer_finished(scheduler: Scheduler, req_ids: list[str]):
"""Cycle requests through a KV transfer cyle.""" """Cycle requests through a KV transfer cycle."""
# Requests should first transition to WAITING_FOR_REMOTE_KVS # Requests should first transition to WAITING_FOR_REMOTE_KVS
output = scheduler.schedule() output = scheduler.schedule()
...@@ -2714,7 +2714,7 @@ def _assert_right_encoder_inputs( ...@@ -2714,7 +2714,7 @@ def _assert_right_encoder_inputs(
if expected_total_reqs == 0: if expected_total_reqs == 0:
return return
# Number of expected enocder inputs should match number of requests # Number of expected encoder inputs should match number of requests
if expected_encoder_inputs: if expected_encoder_inputs:
assert check_exist and requests is not None # only support expect input exist assert check_exist and requests is not None # only support expect input exist
assert len(requests) == len(expected_encoder_inputs) assert len(requests) == len(expected_encoder_inputs)
...@@ -2964,7 +2964,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector): ...@@ -2964,7 +2964,7 @@ def test_ec_connector_with_partial_cache_hit_multi_round(use_kv_connector):
) )
scheduler.update_from_output(output, model_output) scheduler.update_from_output(output, model_output)
# request1 is finished after outputing 1 token # request1 is finished after outputting 1 token
# Finish request # Finish request
scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED) scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED)
...@@ -3060,14 +3060,14 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector): ...@@ -3060,14 +3060,14 @@ def test_ec_connector_schedule_multiple_requests(cache_exist, use_kv_connector):
for request in requests: for request in requests:
scheduler.add_request(request) scheduler.add_request(request)
# Set up to test different encoder cache exsistence scenario after preemption # Set up to test different encoder cache existence scenario after preemption
# Order of getting encoder cache should be: local cache -> connector-> compute # Order of getting encoder cache should be: local cache -> connector-> compute
scheduler.ec_connector.update_state_after_alloc = Mock( scheduler.ec_connector.update_state_after_alloc = Mock(
wraps=scheduler.ec_connector.update_state_after_alloc wraps=scheduler.ec_connector.update_state_after_alloc
) )
if cache_exist == "local": if cache_exist == "local":
# Allocate cache to cache manager manually to mimick # Allocate cache to cache manager manually to mimic
for req in requests: for req in requests:
scheduler.encoder_cache_manager.allocate(req, 0) scheduler.encoder_cache_manager.allocate(req, 0)
else: else:
...@@ -3384,13 +3384,13 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( ...@@ -3384,13 +3384,13 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption(
pooler_output=[], pooler_output=[],
) )
# Finish the requests to make room for the preempted requests to resume # Finish the requests to make room for the preempted requests to resume
# req_high is finished after outputing 2 tokens # req_high is finished after outputting 2 tokens
scheduler.update_from_output(output, model_output) scheduler.update_from_output(output, model_output)
scheduler.finish_requests( scheduler.finish_requests(
request_high.request_id, RequestStatus.FINISHED_LENGTH_CAPPED request_high.request_id, RequestStatus.FINISHED_LENGTH_CAPPED
) )
# Set up to test different encoder cache exsistence scenario after preemption # Set up to test different encoder cache existence scenario after preemption
# Order of getting encoder cache should be: local cache -> connector-> compute # Order of getting encoder cache should be: local cache -> connector-> compute
# By default, the cache should still exist in local in this test case # By default, the cache should still exist in local in this test case
if cache_exist != "local": if cache_exist != "local":
...@@ -3483,7 +3483,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto ...@@ -3483,7 +3483,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
ec_role="ec_consumer", ec_role="ec_consumer",
) )
# Limit the number of availiable slots of EncoderCacheManager # Limit the number of available slots of EncoderCacheManager
scheduler.encoder_cache_manager = EncoderCacheManager(cache_size=32) scheduler.encoder_cache_manager = EncoderCacheManager(cache_size=32)
# Create MM request1 # Create MM request1
...@@ -3574,7 +3574,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto ...@@ -3574,7 +3574,7 @@ def test_ec_connector_allocate_encoder_tokens_with_external_load(use_kv_connecto
) )
scheduler.update_from_output(output, model_output) scheduler.update_from_output(output, model_output)
# request1 is finished after outputing 1 token # request1 is finished after outputting 1 token
# Finish request # Finish request
scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED) scheduler.finish_requests(request1.request_id, RequestStatus.FINISHED_LENGTH_CAPPED)
assert scheduler.get_num_unfinished_requests() == 1 assert scheduler.get_num_unfinished_requests() == 1
......
...@@ -76,11 +76,11 @@ def get_fake_sample_fn() -> SamplerOutput: ...@@ -76,11 +76,11 @@ def get_fake_sample_fn() -> SamplerOutput:
), ),
logprobs_tensors=None, logprobs_tensors=None,
) )
accpeted_tokens = prompt_token_ids[ accepted_tokens = prompt_token_ids[
first_token_id_index : first_token_id_index first_token_id_index : first_token_id_index
+ min(num_accepted_tokens, logits.shape[0]) + min(num_accepted_tokens, logits.shape[0])
] ]
sampled_token_ids = accpeted_tokens sampled_token_ids = accepted_tokens
return SamplerOutput( return SamplerOutput(
sampled_token_ids=torch.tensor( sampled_token_ids=torch.tensor(
[sampled_token_ids], device="cuda", dtype=torch.int32 [sampled_token_ids], device="cuda", dtype=torch.int32
......
...@@ -911,7 +911,7 @@ def test_structured_output_with_structural_tag(backend: str): ...@@ -911,7 +911,7 @@ def test_structured_output_with_structural_tag(backend: str):
), ),
) )
prompt = "Hello and repete hello 10 times, do not say anything else. Only say hello hello hello, now start" prompt = "Hello and repeat hello 10 times, do not say anything else. Only say hello hello hello, now start"
outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True) outputs = llm.generate(prompt, sampling_params=sampling_params, use_tqdm=True)
assert outputs is not None assert outputs is not None
for output in outputs: for output in outputs:
......
...@@ -99,7 +99,7 @@ def _setup_kv_transfer_request(request, remote_host="127.0.0.1", fake_port=4789) ...@@ -99,7 +99,7 @@ def _setup_kv_transfer_request(request, remote_host="127.0.0.1", fake_port=4789)
return request return request
class FakeMorIIOWrapper: class FakeMoRIIOWrapper:
# A fake MoRIIOWrapper for testing purposes # A fake MoRIIOWrapper for testing purposes
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
pass pass
...@@ -168,7 +168,7 @@ class FakeMorIIOWrapper: ...@@ -168,7 +168,7 @@ class FakeMorIIOWrapper:
pass pass
class FakeMorIIOConnectorWorker(MoRIIOConnectorWorker): class FakeMoRIIOConnectorWorker(MoRIIOConnectorWorker):
# Define a fake remote engine id for testing # Define a fake remote engine id for testing
REMOTE_ENGINE_ID = "remote_engine" REMOTE_ENGINE_ID = "remote_engine"
...@@ -373,7 +373,7 @@ def test_read_mode_loads_remote_block_ids(moriio_read_mode): ...@@ -373,7 +373,7 @@ def test_read_mode_loads_remote_block_ids(moriio_read_mode):
# Set remote block ids to be fetched. # Set remote block ids to be fetched.
request.kv_transfer_params["remote_block_ids"] = block_list request.kv_transfer_params["remote_block_ids"] = block_list
# Remote Prefill, triggers MorIIOConnectorMetadata. # Remote Prefill, triggers MoRIIOConnectorMetadata.
scheduler_output = scheduler.schedule() scheduler_output = scheduler.schedule()
kv_connector_metadata = scheduler_output.kv_connector_metadata kv_connector_metadata = scheduler_output.kv_connector_metadata
...@@ -451,7 +451,7 @@ def test_register_kv_caches(mock_parallel_groups): ...@@ -451,7 +451,7 @@ def test_register_kv_caches(mock_parallel_groups):
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
connector = MoRIIOConnector(vllm_config, KVConnectorRole.WORKER) connector = MoRIIOConnector(vllm_config, KVConnectorRole.WORKER)
connector.connector_worker = FakeMorIIOConnectorWorker( connector.connector_worker = FakeMoRIIOConnectorWorker(
vllm_config, connector.engine_id, hand_shake_latency=0 vllm_config, connector.engine_id, hand_shake_latency=0
) )
...@@ -528,7 +528,7 @@ def test_moriio_handshake_returns_metadata(mock_parallel_groups): ...@@ -528,7 +528,7 @@ def test_moriio_handshake_returns_metadata(mock_parallel_groups):
with ( with (
patch( patch(
"vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine.MoRIIOWrapper", "vllm.distributed.kv_transfer.kv_connector.v1.moriio.moriio_engine.MoRIIOWrapper",
FakeMorIIOWrapper, FakeMoRIIOWrapper,
), ),
): ):
handshake_port = _find_free_port() handshake_port = _find_free_port()
......
...@@ -460,9 +460,9 @@ class FakeNixlConnectorWorker(NixlConnectorWorker): ...@@ -460,9 +460,9 @@ class FakeNixlConnectorWorker(NixlConnectorWorker):
# When remote tp_size > local tp_size, handshake with multiple # When remote tp_size > local tp_size, handshake with multiple
# remote ranks. # remote ranks.
num_hanshakes = 1 if tp_ratio > 0 else -tp_ratio num_handshakes = 1 if tp_ratio > 0 else -tp_ratio
remote_agents: dict[int, str] = {} remote_agents: dict[int, str] = {}
for remote_tp_rank in range(num_hanshakes): for remote_tp_rank in range(num_handshakes):
remote_agent_name = self.add_remote_agent( remote_agent_name = self.add_remote_agent(
NixlAgentMetadata( NixlAgentMetadata(
engine_id=self.REMOTE_ENGINE_ID, engine_id=self.REMOTE_ENGINE_ID,
...@@ -688,7 +688,7 @@ class TestNixlHandshake: ...@@ -688,7 +688,7 @@ class TestNixlHandshake:
) )
check_handshake(2) check_handshake(2)
# NOTE flexiblity: a second remote with higher number of ranks is # NOTE flexibility: a second remote with higher number of ranks is
# discovered. This is not a scenario we actively support right now, but # discovered. This is not a scenario we actively support right now, but
# the connector allows it. # the connector allows it.
worker.REMOTE_ENGINE_ID = "remote_engine_2" worker.REMOTE_ENGINE_ID = "remote_engine_2"
...@@ -1766,7 +1766,7 @@ def test_aborted_request_removed_from_worker_in_batch(default_vllm_config, dist_ ...@@ -1766,7 +1766,7 @@ def test_aborted_request_removed_from_worker_in_batch(default_vllm_config, dist_
req = create_request(request_id=1, do_remote_decode=True, max_tokens=1) req = create_request(request_id=1, do_remote_decode=True, max_tokens=1)
scheduler.add_request(req) scheduler.add_request(req)
# First scheduling pass - examinate build_connector_meta output # First scheduling pass - examine build_connector_meta output
sched_out = scheduler.schedule() sched_out = scheduler.schedule()
kv_meta = sched_out.kv_connector_metadata kv_meta = sched_out.kv_connector_metadata
assert kv_meta is not None assert kv_meta is not None
......
...@@ -36,7 +36,7 @@ SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT ...@@ -36,7 +36,7 @@ SAMPLE_PROMPT = BatchLogprobsComposition.SAMPLE_PROMPT
# non-associative and sensitive to batch geometry. The ref LLM (no spec # non-associative and sensitive to batch geometry. The ref LLM (no spec
# decode, default scheduling) and the spec-decode LLM (chunked prefill, # decode, default scheduling) and the spec-decode LLM (chunked prefill,
# different effective batch sizes) follow different reduction orders, # different effective batch sizes) follow different reduction orders,
# producing numerically divergent logprobs that get mis-attributed to # producing numerically divergent logprobs that get misattributed to
# spec-decode incorrectness. # spec-decode incorrectness.
# #
# Force LLM instances into an identical, deterministic execution # Force LLM instances into an identical, deterministic execution
......
...@@ -726,7 +726,7 @@ def test_frequency_penalties(rejection_sampler): ...@@ -726,7 +726,7 @@ def test_frequency_penalties(rejection_sampler):
spec_tokens = [[1, 1, 1], [], [1, 1, 1]] spec_tokens = [[1, 1, 1], [], [1, 1, 1]]
output_tokens = [[1, 1, 1, 1], [7], [1, 1, 1, 1]] # 1, 7 and 1 are the bonus tokens output_tokens = [[1, 1, 1, 1], [7], [1, 1, 1, 1]] # 1, 7 and 1 are the bonus tokens
num_requsts = len(spec_tokens) num_requests = len(spec_tokens)
logits = create_logits_tensor(output_tokens, token_idx_to_override=15) logits = create_logits_tensor(output_tokens, token_idx_to_override=15)
metadata = create_sampling_metadata( metadata = create_sampling_metadata(
all_greedy=True, all_greedy=True,
...@@ -734,8 +734,8 @@ def test_frequency_penalties(rejection_sampler): ...@@ -734,8 +734,8 @@ def test_frequency_penalties(rejection_sampler):
spec_token_ids=spec_tokens, spec_token_ids=spec_tokens,
prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE), prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE),
frequency_penalties=[1.5, 1.5, 0.7], frequency_penalties=[1.5, 1.5, 0.7],
presence_penalties=[0.0] * num_requsts, presence_penalties=[0.0] * num_requests,
repetition_penalties=[1.0] * num_requsts, repetition_penalties=[1.0] * num_requests,
) )
bonus_token_tensor = torch.tensor( bonus_token_tensor = torch.tensor(
[output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device [output_tokens[i][-1] for i in range(len(output_tokens))], device=logits.device
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment