[CI/Test] improve robustness of test (vllm_runner) (#5357)

[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)

[CI/Test] improve robustness of test (vllm_runner) (#5357)
[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
8ea5e44a · youkaichao · GitHub · 9fb900f9 · 8ea5e44a · 8ea5e44a
Unverified commit 8ea5e44a, authored Jun 08, 2024 by youkaichao; committed by GitHub on Jun 08, 2024.
8 changed files
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
-    llm = vllm_runner("facebook/opt-125m", quantization="fp8")
+    with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
        fc1 = model.model.decoder.layers[0].fc1
        assert isinstance(fc1.quant_method, Fp8LinearMethod)
        assert fc1.weight.dtype == torch.float8_e4m3fn
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -2,10 +2,8 @@
 Run `pytest tests/samplers/test_beam_search.py`.
 """
-import gc
 import pytest
-import torch
 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
@@ -34,14 +32,9 @@ def test_beam_search_single_input(
        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
                                                   max_tokens)
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
-                                                   max_tokens)
+                                                       beam_width, max_tokens)
-    del vllm_model
-    # NOTE(woosuk): For some reason, the following GC is required to avoid
-    # GPU OOM errors in the following tests using `vllm_runner`.
-    gc.collect()
-    torch.cuda.empty_cache()
    for i in range(len(example_prompts)):
        hf_output_ids, _ = hf_outputs[i]

--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -22,8 +22,9 @@ def test_ignore_eos(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)
        for prompt in example_prompts:
            ignore_eos_output = vllm_model.model.generate(

--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -14,7 +14,7 @@ def test_logits_processor_force_generate(
    model: str,
    dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
        repeat_times = 2
        enforced_answers = " vLLM"

--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -38,14 +38,14 @@ def test_get_prompt_logprobs(
            max_tokens=max_tokens,
        )
-    vllm_model = vllm_runner(
+    with vllm_runner(
            model,
            dtype=dtype,
            max_logprobs=num_top_logprobs,
            enable_chunked_prefill=enable_chunked_prefill,
            max_num_batched_tokens=max_num_batched_tokens,
            max_num_seqs=max_num_seqs,
-    )
+    ) as vllm_model:
        vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
                                              logprobs=num_top_logprobs,
                                              prompt_logprobs=num_top_logprobs,

--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -17,23 +17,18 @@ def test_ranks(
    num_top_logprobs = 5
    num_prompt_logprobs = 5
-    vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)
+    with vllm_runner(model, dtype=dtype,
+                     max_logprobs=num_top_logprobs) as vllm_model:
        ## Test greedy logprobs ranks
-    vllm_sampling_params = SamplingParams(temperature=0.0,
+        vllm_sampling_params = SamplingParams(
+            temperature=0.0,
            top_p=1.0,
            max_tokens=max_tokens,
            logprobs=num_top_logprobs,
            prompt_logprobs=num_prompt_logprobs)
        vllm_results = vllm_model.generate_w_logprobs(example_prompts,
                                                      vllm_sampling_params)
-    for result in vllm_results:
-        assert result[2] is not None
-        assert len(result[2]) == len(result[0])
-        # check whether all chosen tokens have ranks = 1
-        for token, logprobs in zip(result[0], result[2]):
-            assert token in logprobs
-            assert logprobs[token].rank == 1
        ## Test non-greedy logprobs ranks
        sampling_params = SamplingParams(temperature=1.0,
@@ -42,6 +37,15 @@ def test_ranks(
                                         logprobs=num_top_logprobs,
                                         prompt_logprobs=num_prompt_logprobs)
        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
+    for result in vllm_results:
+        assert result[2] is not None
+        assert len(result[2]) == len(result[0])
+        # check whether all chosen tokens have ranks = 1
+        for token, logprobs in zip(result[0], result[2]):
+            assert token in logprobs
+            assert logprobs[token].rank == 1
    for result in res:
        assert result[2] is not None
        assert len(result[2]) == len(result[0])

--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -17,9 +17,8 @@ RANDOM_SEEDS = list(range(5))
 @pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL, dtype="half")
+    with vllm_runner(MODEL, dtype="half") as vllm_model:
        yield vllm_model
-    del vllm_model
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
-import gc
 import json
 import os
 import subprocess
@@ -7,7 +6,6 @@ from unittest.mock import MagicMock, patch
 import openai
 import pytest
 import ray
-import torch
 from vllm import SamplingParams
 # yapf: disable
@@ -71,15 +69,15 @@ def test_can_deserialize_s3(vllm_runner):
    model_ref = "EleutherAI/pythia-1.4b"
    tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
-    loaded_hf_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                                  load_format="tensorizer",
                                  model_loader_extra_config=TensorizerConfig(
                                      tensorizer_uri=tensorized_path,
                                      num_readers=1,
                                      s3_endpoint="object.ord1.coreweave.com",
-                                  ))
+                                  )) as loaded_hf_model:
-    deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params) # noqa: E501
        assert deserialized_outputs
@@ -87,7 +85,7 @@ def test_can_deserialize_s3(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
        vllm_runner, tmp_path):
-    vllm_model = vllm_runner(model_ref)
+    with vllm_runner(model_ref) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
        key_path = tmp_path / (model_ref + ".key")
        outputs = vllm_model.generate(prompts, sampling_params)
@@ -97,19 +95,15 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
                            config_for_serializing,
                            encryption_key_path=key_path)
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                encryption_keyfile=key_path)
-    loaded_vllm_model = vllm_runner(
+    with vllm_runner(
        model_ref,
        load_format="tensorizer",
-        model_loader_extra_config=config_for_deserializing)
+        model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501
        assert outputs == deserialized_outputs
@@ -124,12 +118,12 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
            serializer = TensorSerializer(stream)
            serializer.write_module(hf_model.model)
-    loaded_hf_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                                  load_format="tensorizer",
                                  model_loader_extra_config=TensorizerConfig(
                                      tensorizer_uri=model_path,
                                      num_readers=1,
-                                  ))
+                                  )) as loaded_hf_model:
        deserialized_outputs = loaded_hf_model.generate_greedy(
            prompts, max_tokens=max_tokens)
@@ -148,16 +142,13 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    test_prompts = create_test_prompts(lora_path)
    # Serialize model before deserializing and binding LoRA adapters
-    vllm_model = vllm_runner(model_ref, )
+    with vllm_runner(model_ref, ) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
        serialize_vllm_model(vllm_model.model.llm_engine,
                            TensorizerConfig(tensorizer_uri=model_path))
-    del vllm_model
+    with vllm_runner(
-    gc.collect()
-    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(
        model_ref,
        load_format="tensorizer",
        model_loader_extra_config=TensorizerConfig(
@@ -170,7 +161,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
        max_cpu_loras=2,
        max_num_seqs=50,
        max_model_len=1000,
-    )
+    ) as loaded_vllm_model:
        process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
        assert loaded_vllm_model
@@ -186,7 +177,7 @@ def test_load_without_tensorizer_load_format(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    ## Serialize model
-    vllm_model = vllm_runner(model_ref, )
+    with vllm_runner(model_ref, ) as vllm_model:
        model_path = tmp_path / (model_ref + ".tensors")
        serialize_vllm_model(vllm_model.model.llm_engine,
@@ -196,10 +187,6 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
            "tensorizer_uri": str(model_path),
        }
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
    ## Start OpenAI API server
    openai_args = [
        "--model", model_ref, "--dtype", "float16", "--load-format",
@@ -260,18 +247,15 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    model_path = tmp_path / (model_ref + ".tensors")
    config = TensorizerConfig(tensorizer_uri=str(model_path))
-    vllm_model = vllm_runner(model_ref)
+    with vllm_runner(model_ref) as vllm_model:
        outputs = vllm_model.generate(prompts, sampling_params)
        serialize_vllm_model(vllm_model.model.llm_engine, config)
        assert is_vllm_tensorized(config)
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                    load_format="tensorizer",
-                                    model_loader_extra_config=config)
+                    model_loader_extra_config=config) as loaded_vllm_model:
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501
        assert outputs == deserialized_outputs