Commit a715dfbe authored by zhuwenwen
Browse files

update test_basic_correctness.py

parent 4e06836d
...@@ -74,63 +74,63 @@ def test_models( ...@@ -74,63 +74,63 @@ def test_models(
) )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
    "test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
def test_models_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:
    """Check that vLLM with tensor_parallel_size=2 matches HF greedy output.

    The case is skipped unless ``test_suite`` equals ``TARGET_TEST_SUITE``.
    Environment variables required by a given parametrization are set only
    for the duration of the test and restored afterwards, so they cannot
    leak into later test cases running in the same process.

    Args:
        hf_runner: Fixture used as a context manager to run the HF model.
        vllm_runner: Fixture used as a context manager to run the vLLM model.
        example_prompts: Prompts passed to ``generate_greedy`` on both models.
        model: Model name handed to both runners.
        distributed_executor_backend: Backend name forwarded to vLLM.
        attention_backend: If non-empty, exported as
            ``VLLM_ATTENTION_BACKEND`` while the test runs.
        test_suite: Suite label this parametrization belongs to.
    """
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    # Gather every environment variable this parametrization needs so that
    # all of them can be applied and later undone in one place.
    env_overrides = {}
    adag_case = ("meta-llama/Llama-2-7b-hf", "ray", "", "L4")
    if (model, distributed_executor_backend, attention_backend,
            test_suite) == adag_case:
        # test ray adag
        env_overrides['VLLM_USE_RAY_SPMD_WORKER'] = "1"
        env_overrides['VLLM_USE_RAY_COMPILED_DAG'] = "1"
    if attention_backend:
        env_overrides["VLLM_ATTENTION_BACKEND"] = attention_backend

    # Remember the previous values (None means "was not set") before
    # overriding, so the finally-block can put the process back as it was.
    previous_env = {key: os.environ.get(key) for key in env_overrides}
    os.environ.update(env_overrides)

    try:
        dtype = "half"
        max_tokens = 5

        # NOTE: take care of the order. run vLLM first, and then run HF.
        # vLLM needs a fresh new process without cuda initialization.
        # if we run HF first, the cuda initialization will be done and it
        # will hurt multiprocessing backend with fork method (the default
        # method).
        with vllm_runner(
                model,
                dtype=dtype,
                tensor_parallel_size=2,
                distributed_executor_backend=distributed_executor_backend,
        ) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                      max_tokens)

        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
    finally:
        # Undo only the variables this test touched.
        for key, old_value in previous_env.items():
            if old_value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = old_value
def test_model_with_failure(vllm_runner) -> None: def test_model_with_failure(vllm_runner) -> None:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment