Unverified Commit d88918e4 authored by lirong's avatar lirong Committed by GitHub
Browse files

[Core] Enable sharded state loader for V1 engine and enhance test coverage (#25308)


Signed-off-by: default avatarpengdrumli <pengdrumli@tencent.com>
parent 3c713a97
......@@ -57,10 +57,19 @@ def llama_3p2_1b_files():
def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
llm_sharded_writer = LLM(model=input_dir, **kwargs)
# Check which engine version is being used
is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")
# Dump worker states to output directory
llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
path=output_dir)
if is_v1_engine:
# For V1 engine, we need to use engine_core.save_sharded_state
print("Using V1 engine save path")
llm_sharded_writer.llm_engine.engine_core.save_sharded_state(
path=output_dir)
else:
# For V0 engine
print("Using V0 engine save path")
model_executor = llm_sharded_writer.llm_engine.model_executor
model_executor.save_sharded_state(path=output_dir)
# Copy metadata files to output directory
for file in os.listdir(input_dir):
......@@ -91,8 +100,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
gpu_memory_utilization = 0.8
input_dir = llama_3p2_1b_files
ctx = mp.get_context("spawn")
# The interface in v1 engine has changed, run in v1 engine will hang.
monkeypatch.setenv("VLLM_USE_V1", "0")
# Run in separate processes for memory & CUDA isolation
with TemporaryDirectory() as output_dir:
......@@ -100,7 +107,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
args=(input_dir, output_dir, weights_patterns),
kwargs=dict(
tensor_parallel_size=tp_size,
distributed_executor_backend="mp",
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=True,
))
......@@ -112,7 +118,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
p = ctx.Process(target=_run_generate,
args=(input_dir, queue),
kwargs=dict(
distributed_executor_backend="mp",
enable_lora=enable_lora,
gpu_memory_utilization=gpu_memory_utilization,
tensor_parallel_size=tp_size,
......@@ -133,7 +138,6 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
p = ctx.Process(target=_run_generate,
args=(output_dir, queue),
kwargs=dict(
distributed_executor_backend="mp",
enable_lora=enable_lora,
gpu_memory_utilization=gpu_memory_utilization,
tensor_parallel_size=tp_size,
......
......@@ -1486,12 +1486,6 @@ class EngineArgs:
#############################################################
# Unsupported Feature Flags on V1.
if self.load_format == "sharded_state":
_raise_or_fallback(
feature_name=f"--load_format {self.load_format}",
recommend_to_remove=False)
return False
if (self.logits_processor_pattern
!= EngineArgs.logits_processor_pattern):
_raise_or_fallback(feature_name="--logits-processor-pattern",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment