[CI/Test] improve robustness of test (vllm_runner) (#5357)

[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)

[CI/Test] improve robustness of test (vllm_runner) (#5357)
[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
8ea5e44a · youkaichao · GitHub · 9fb900f9 · 8ea5e44a · 8ea5e44a
Unverified commit 8ea5e44a — authored Jun 08, 2024 by youkaichao; committed by GitHub on Jun 08, 2024
8 changed files
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -16,9 +16,9 @@ capability = capability[0] * 10 + capability[1]
    capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
    reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
-    llm = vllm_runner("facebook/opt-125m", quantization="fp8")
+    with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:

-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-    fc1 = model.model.decoder.layers[0].fc1
-    assert isinstance(fc1.quant_method, Fp8LinearMethod)
-    assert fc1.weight.dtype == torch.float8_e4m3fn
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        fc1 = model.model.decoder.layers[0].fc1
+        assert isinstance(fc1.quant_method, Fp8LinearMethod)
+        assert fc1.weight.dtype == torch.float8_e4m3fn
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -2,10 +2,8 @@

 Run `pytest tests/samplers/test_beam_search.py`.
 """
-import gc

 import pytest
-import torch

 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
@@ -34,14 +32,9 @@ def test_beam_search_single_input(
        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
                                                   max_tokens)

-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-    del vllm_model
-    # NOTE(woosuk): For some reason, the following GC is required to avoid
-    # GPU OOM errors in the following tests using `vllm_runner`.
-    gc.collect()
-    torch.cuda.empty_cache()
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
+                                                       beam_width, max_tokens)

    for i in range(len(example_prompts)):
        hf_output_ids, _ = hf_outputs[i]

--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -22,11 +22,12 @@ def test_ignore_eos(
    dtype: str,
    max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)

-    for prompt in example_prompts:
-        ignore_eos_output = vllm_model.model.generate(
-            prompt, sampling_params=sampling_params)
-        output_length = len(ignore_eos_output[0].outputs[0].token_ids)
-        assert output_length == max_tokens
+        for prompt in example_prompts:
+            ignore_eos_output = vllm_model.model.generate(
+                prompt, sampling_params=sampling_params)
+            output_length = len(ignore_eos_output[0].outputs[0].token_ids)
+            assert output_length == max_tokens
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -14,46 +14,46 @@ def test_logits_processor_force_generate(
    model: str,
    dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    tokenizer = vllm_model.model.get_tokenizer()
-    repeat_times = 2
-    enforced_answers = " vLLM"
-    vllm_token_ids = tokenizer.encode(enforced_answers,
-                                      add_special_tokens=False)
-    max_tokens = len(vllm_token_ids) * repeat_times
-
-    def pick_vllm(token_ids, logits):
-        token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)]
-        logits[token_id] = torch.finfo(logits.dtype).max
-        return logits
-
-    params_with_logprobs = SamplingParams(
-        logits_processors=[pick_vllm],
-        prompt_logprobs=3,
-        max_tokens=max_tokens,
-    )
-
-    # test logits_processors when prompt_logprobs is not None
-    vllm_model.model._add_request(
-        example_prompts[0],
-        params=params_with_logprobs,
-    )
-
-    # test prompt_logprobs is not None
-    vllm_model.model._add_request(
-        example_prompts[1],
-        params=SamplingParams(
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        tokenizer = vllm_model.model.get_tokenizer()
+        repeat_times = 2
+        enforced_answers = " vLLM"
+        vllm_token_ids = tokenizer.encode(enforced_answers,
+                                          add_special_tokens=False)
+        max_tokens = len(vllm_token_ids) * repeat_times
+
+        def pick_vllm(token_ids, logits):
+            token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)]
+            logits[token_id] = torch.finfo(logits.dtype).max
+            return logits
+
+        params_with_logprobs = SamplingParams(
+            logits_processors=[pick_vllm],
            prompt_logprobs=3,
            max_tokens=max_tokens,
-        ),
-    )
-
-    # test grouped requests
-    vllm_model.model._add_request(
-        example_prompts[2],
-        params=SamplingParams(max_tokens=max_tokens),
-    )
-
-    outputs = vllm_model.model._run_engine(use_tqdm=False)
-
-    assert outputs[0].outputs[0].text == enforced_answers * repeat_times
+        )
+
+        # test logits_processors when prompt_logprobs is not None
+        vllm_model.model._add_request(
+            example_prompts[0],
+            params=params_with_logprobs,
+        )
+
+        # test prompt_logprobs is not None
+        vllm_model.model._add_request(
+            example_prompts[1],
+            params=SamplingParams(
+                prompt_logprobs=3,
+                max_tokens=max_tokens,
+            ),
+        )
+
+        # test grouped requests
+        vllm_model.model._add_request(
+            example_prompts[2],
+            params=SamplingParams(max_tokens=max_tokens),
+        )
+
+        outputs = vllm_model.model._run_engine(use_tqdm=False)
+
+        assert outputs[0].outputs[0].text == enforced_answers * repeat_times
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -38,21 +38,21 @@ def test_get_prompt_logprobs(
            max_tokens=max_tokens,
        )

-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_logprobs=num_top_logprobs,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
-                                          logprobs=num_top_logprobs,
-                                          prompt_logprobs=num_top_logprobs,
-                                          temperature=0.0,
-                                          detokenize=detokenize)
-    vllm_results = vllm_model.model.generate(
-        example_prompts, sampling_params=vllm_sampling_params)
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_logprobs=num_top_logprobs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
+                                              logprobs=num_top_logprobs,
+                                              prompt_logprobs=num_top_logprobs,
+                                              temperature=0.0,
+                                              detokenize=detokenize)
+        vllm_results = vllm_model.model.generate(
+            example_prompts, sampling_params=vllm_sampling_params)

    # Test whether logprobs are included in the results.
    for result in vllm_results:

--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -17,16 +17,27 @@ def test_ranks(
    num_top_logprobs = 5
    num_prompt_logprobs = 5

-    vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)
-
-    ## Test greedy logprobs ranks
-    vllm_sampling_params = SamplingParams(temperature=0.0,
-                                          top_p=1.0,
-                                          max_tokens=max_tokens,
-                                          logprobs=num_top_logprobs,
-                                          prompt_logprobs=num_prompt_logprobs)
-    vllm_results = vllm_model.generate_w_logprobs(example_prompts,
-                                                  vllm_sampling_params)
+    with vllm_runner(model, dtype=dtype,
+                     max_logprobs=num_top_logprobs) as vllm_model:
+
+        ## Test greedy logprobs ranks
+        vllm_sampling_params = SamplingParams(
+            temperature=0.0,
+            top_p=1.0,
+            max_tokens=max_tokens,
+            logprobs=num_top_logprobs,
+            prompt_logprobs=num_prompt_logprobs)
+        vllm_results = vllm_model.generate_w_logprobs(example_prompts,
+                                                      vllm_sampling_params)
+
+        ## Test non-greedy logprobs ranks
+        sampling_params = SamplingParams(temperature=1.0,
+                                         top_p=1.0,
+                                         max_tokens=max_tokens,
+                                         logprobs=num_top_logprobs,
+                                         prompt_logprobs=num_prompt_logprobs)
+        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
+
    for result in vllm_results:
        assert result[2] is not None
        assert len(result[2]) == len(result[0])
@@ -35,13 +46,6 @@ def test_ranks(
            assert token in logprobs
            assert logprobs[token].rank == 1

-    ## Test non-greedy logprobs ranks
-    sampling_params = SamplingParams(temperature=1.0,
-                                     top_p=1.0,
-                                     max_tokens=max_tokens,
-                                     logprobs=num_top_logprobs,
-                                     prompt_logprobs=num_prompt_logprobs)
-    res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
    for result in res:
        assert result[2] is not None
        assert len(result[2]) == len(result[0])

--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -17,9 +17,8 @@ RANDOM_SEEDS = list(range(5))

 @pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL, dtype="half")
-    yield vllm_model
-    del vllm_model
+    with vllm_runner(MODEL, dtype="half") as vllm_model:
+        yield vllm_model


 @pytest.mark.parametrize("seed", RANDOM_SEEDS)

--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -1,4 +1,3 @@
-import gc
 import json
 import os
 import subprocess
@@ -7,7 +6,6 @@ from unittest.mock import MagicMock, patch
 import openai
 import pytest
 import ray
-import torch

 from vllm import SamplingParams
 # yapf: disable
@@ -71,47 +69,43 @@ def test_can_deserialize_s3(vllm_runner):
    model_ref = "EleutherAI/pythia-1.4b"
    tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"

-    loaded_hf_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                                  load_format="tensorizer",
                                  model_loader_extra_config=TensorizerConfig(
                                      tensorizer_uri=tensorized_path,
                                      num_readers=1,
                                      s3_endpoint="object.ord1.coreweave.com",
-                                  ))
+                                  )) as loaded_hf_model:

-    deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params) # noqa: E501

-    assert deserialized_outputs
+        assert deserialized_outputs


 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
        vllm_runner, tmp_path):
-    vllm_model = vllm_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    key_path = tmp_path / (model_ref + ".key")
-    outputs = vllm_model.generate(prompts, sampling_params)
-
-    config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         config_for_serializing,
-                         encryption_key_path=key_path)
+    with vllm_runner(model_ref) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        key_path = tmp_path / (model_ref + ".key")
+        outputs = vllm_model.generate(prompts, sampling_params)

-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
+        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                            config_for_serializing,
+                            encryption_key_path=key_path)

    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                encryption_keyfile=key_path)

-    loaded_vllm_model = vllm_runner(
+    with vllm_runner(
        model_ref,
        load_format="tensorizer",
-        model_loader_extra_config=config_for_deserializing)
+        model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501

-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501

-    assert outputs == deserialized_outputs
+        assert outputs == deserialized_outputs


 def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
@@ -124,17 +118,17 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
            serializer = TensorSerializer(stream)
            serializer.write_module(hf_model.model)

-    loaded_hf_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                                  load_format="tensorizer",
                                  model_loader_extra_config=TensorizerConfig(
                                      tensorizer_uri=model_path,
                                      num_readers=1,
-                                  ))
+                                  )) as loaded_hf_model:

-    deserialized_outputs = loaded_hf_model.generate_greedy(
-        prompts, max_tokens=max_tokens)
+        deserialized_outputs = loaded_hf_model.generate_greedy(
+            prompts, max_tokens=max_tokens)

-    assert outputs == deserialized_outputs
+        assert outputs == deserialized_outputs


 def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
@@ -148,16 +142,13 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
    test_prompts = create_test_prompts(lora_path)

    # Serialize model before deserializing and binding LoRA adapters
-    vllm_model = vllm_runner(model_ref, )
-    model_path = tmp_path / (model_ref + ".tensors")
+    with vllm_runner(model_ref, ) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")

-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         TensorizerConfig(tensorizer_uri=model_path))
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                            TensorizerConfig(tensorizer_uri=model_path))

-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(
+    with vllm_runner(
        model_ref,
        load_format="tensorizer",
        model_loader_extra_config=TensorizerConfig(
@@ -170,10 +161,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
        max_cpu_loras=2,
        max_num_seqs=50,
        max_model_len=1000,
-    )
-    process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
+    ) as loaded_vllm_model:
+        process_requests(loaded_vllm_model.model.llm_engine, test_prompts)

-    assert loaded_vllm_model
+        assert loaded_vllm_model


 def test_load_without_tensorizer_load_format(vllm_runner):
@@ -186,19 +177,15 @@ def test_load_without_tensorizer_load_format(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
    ## Serialize model
-    vllm_model = vllm_runner(model_ref, )
-    model_path = tmp_path / (model_ref + ".tensors")
-
-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         TensorizerConfig(tensorizer_uri=model_path))
+    with vllm_runner(model_ref, ) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")

-    model_loader_extra_config = {
-        "tensorizer_uri": str(model_path),
-    }
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                            TensorizerConfig(tensorizer_uri=model_path))

-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
+        model_loader_extra_config = {
+            "tensorizer_uri": str(model_path),
+        }

    ## Start OpenAI API server
    openai_args = [
@@ -260,18 +247,15 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
    model_path = tmp_path / (model_ref + ".tensors")
    config = TensorizerConfig(tensorizer_uri=str(model_path))

-    vllm_model = vllm_runner(model_ref)
-    outputs = vllm_model.generate(prompts, sampling_params)
-    serialize_vllm_model(vllm_model.model.llm_engine, config)
+    with vllm_runner(model_ref) as vllm_model:
+        outputs = vllm_model.generate(prompts, sampling_params)
+        serialize_vllm_model(vllm_model.model.llm_engine, config)

-    assert is_vllm_tensorized(config)
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
+        assert is_vllm_tensorized(config)

-    loaded_vllm_model = vllm_runner(model_ref,
-                                    load_format="tensorizer",
-                                    model_loader_extra_config=config)
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+    with vllm_runner(model_ref,
+                    load_format="tensorizer",
+                    model_loader_extra_config=config) as loaded_vllm_model:
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501

-    assert outputs == deserialized_outputs
+        assert outputs == deserialized_outputs