[Misc] unify variable for LLM instance (#20996)

Signed-off-by: Andy Xie <andy.xning@gmail.com>

[Misc] unify variable for LLM instance (#20996)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
d9784107 · Ning Xie · GitHub · e6b90a28 · d9784107 · d9784107
Unverified Commit d9784107 authored Jul 21, 2025 by Ning Xie Committed by GitHub Jul 21, 2025
20 changed files
--- a/docs/configuration/model_resolution.md
+++ b/docs/configuration/model_resolution.md
@@ -14,7 +14,7 @@ For example:
 ```python
 from vllm import LLM

-model = LLM(
+llm = LLM(
    model="cerebras/Cerebras-GPT-1.3B",
    hf_overrides={"architectures": ["GPT2LMHeadModel"]},  # GPT-2
 )

--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
        return tokenizer.apply_chat_template(chat, tokenize=False)


-    model = LLM(
+    llm = LLM(
        model=model_id,
        enable_lora=True,
        max_lora_rank=64,
@@ -329,7 +329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
    }


-    outputs = model.generate(
+    outputs = llm.generate(
        inputs,
        sampling_params=SamplingParams(
            temperature=0.2,

--- a/docs/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -86,8 +86,9 @@ Load and run the model in `vllm`:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
-result = model.generate("Hello my name is")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
+result = llm.generate("Hello my name is")
 print(result[0].outputs[0].text)
 ```

@@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei

 ```python
 from vllm import LLM
-model = LLM("facebook/opt-125m", quantization="fp8")
+
+llm = LLM("facebook/opt-125m", quantization="fp8")
 # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
-result = model.generate("Hello, my name is")
+result = llm.generate("Hello, my name is")
 print(result[0].outputs[0].text)
 ```


--- a/docs/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
 ```

 To evaluate accuracy, you can use `lm_eval`:

--- a/docs/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
 ```

 To evaluate accuracy, you can use `lm_eval`:

--- a/docs/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -174,10 +174,10 @@ You can change the output dimensions of embedding models that support Matryoshka
 ```python
 from vllm import LLM, PoolingParams

-model = LLM(model="jinaai/jina-embeddings-v3", 
+llm = LLM(model="jinaai/jina-embeddings-v3",
          task="embed",
          trust_remote_code=True)
-outputs = model.embed(["Follow the white rabbit."], 
+outputs = llm.embed(["Follow the white rabbit."],
                    pooling_params=PoolingParams(dimensions=32))
 print(outputs[0].outputs)
 ```

--- a/examples/offline_inference/basic/classify.py
+++ b/examples/offline_inference/basic/classify.py
@@ -28,10 +28,10 @@ def main(args: Namespace):

    # Create an LLM.
    # You should pass task="classify" for classification models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

    # Generate logits. The output is a list of ClassificationRequestOutputs.
-    outputs = model.classify(prompts)
+    outputs = llm.classify(prompts)

    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)

--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -31,10 +31,10 @@ def main(args: Namespace):

    # Create an LLM.
    # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
-    outputs = model.embed(prompts)
+    outputs = llm.embed(prompts)

    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)

--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -27,10 +27,10 @@ def main(args: Namespace):

    # Create an LLM.
    # You should pass task="score" for cross-encoder models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

    # Generate scores. The output is a list of ScoringRequestOutputs.
-    outputs = model.score(text_1, texts_2)
+    outputs = llm.score(text_1, texts_2)

    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)

--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
@@ -30,11 +30,11 @@ def main(args: Namespace):

    # Create an LLM.
    # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    # Only text matching task is supported for now. See #16120
-    outputs = model.embed(prompts)
+    outputs = llm.embed(prompts)

    # Print the outputs.
    print("\nGenerated Outputs:")

--- a/examples/offline_inference/embed_matryoshka_fy.py
+++ b/examples/offline_inference/embed_matryoshka_fy.py
@@ -30,10 +30,10 @@ def main(args: Namespace):

    # Create an LLM.
    # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
-    outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
+    outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32))

    # Print the outputs.
    print("\nGenerated Outputs:")

--- a/examples/offline_inference/neuron_speculation.py
+++ b/examples/offline_inference/neuron_speculation.py
@@ -25,7 +25,7 @@ def config_buckets():
    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"


-def initialize_model():
+def initialize_llm():
    """Create an LLM with speculative decoding."""
    return LLM(
        model="openlm-research/open_llama_7b",
@@ -43,9 +43,9 @@ def initialize_model():
    )


-def process_requests(model: LLM, sampling_params: SamplingParams):
+def process_requests(llm: LLM, sampling_params: SamplingParams):
    """Generate texts from prompts and print them."""
-    outputs = model.generate(prompts, sampling_params)
+    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
@@ -53,12 +53,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams):


 def main():
-    """Main function that sets up the model and processes prompts."""
+    """Main function that sets up the llm and processes prompts."""
    config_buckets()
-    model = initialize_model()
+    llm = initialize_llm()
    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, top_k=1)
-    process_requests(model, sampling_params)
+    process_requests(llm, sampling_params)


 if __name__ == "__main__":

--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -140,7 +140,7 @@ datamodule_config = {
 class PrithviMAE:
    def __init__(self):
        print("Initializing PrithviMAE model")
-        self.model = LLM(
+        self.llm = LLM(
            model=os.path.join(os.path.dirname(__file__), "./model"),
            skip_tokenizer_init=True,
            dtype="float32",
@@ -158,7 +158,7 @@ class PrithviMAE:

        prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}

-        outputs = self.model.encode(prompt, use_tqdm=False)
+        outputs = self.llm.encode(prompt, use_tqdm=False)
        print("################ Inference done (it took seconds)  ##############")

        return outputs[0].outputs.data

--- a/examples/offline_inference/qwen3_reranker.py
+++ b/examples/offline_inference/qwen3_reranker.py
@@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
 # Models converted offline using this method can not only be more efficient
 # and support the vllm score API, but also make the init parameters more
 # concise, for example.
-# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
+# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")

 # If you want to load the official original version, the init parameters are
 # as follows.


-def get_model() -> LLM:
+def get_llm() -> LLM:
    """Initializes and returns the LLM model for Qwen3-Reranker."""
    return LLM(
        model=model_name,
@@ -77,8 +77,8 @@ def main() -> None:
    ]
    documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]

-    model = get_model()
-    outputs = model.score(queries, documents)
+    llm = get_llm()
+    outputs = llm.score(queries, documents)

    print("-" * 30)
    print([output.outputs.score for output in outputs])

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

    with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
-        if isinstance(vllm_model.model.llm_engine, LLMEngineV1):
+        if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
            v1_test_failed_model_execution(vllm_model)


 def v1_test_failed_model_execution(vllm_model):

-    engine = vllm_model.model.llm_engine
+    engine = vllm_model.llm.llm_engine
    mocked_execute_model = Mock(
        side_effect=RuntimeError("Mocked Critical Error"))
    engine.engine_core.engine_core.model_executor.execute_model =\

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -81,7 +81,7 @@ def test_chunked_prefill_recompute(
            disable_log_stats=False,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
                < ARTIFICIAL_PREEMPTION_MAX_CNT)

    for i in range(len(example_prompts)):
@@ -118,10 +118,10 @@ def test_preemption(
            distributed_executor_backend=distributed_executor_backend,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
                < ARTIFICIAL_PREEMPTION_MAX_CNT)
        total_preemption = (
-            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
+            vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
@@ -174,12 +174,12 @@ def test_preemption_infeasible(
    ) as vllm_model:
        sampling_params = SamplingParams(max_tokens=max_tokens,
                                         ignore_eos=True)
-        req_outputs = vllm_model.model.generate(
+        req_outputs = vllm_model.llm.generate(
            example_prompts,
            sampling_params=sampling_params,
        )

-        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+        assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
                < ARTIFICIAL_PREEMPTION_MAX_CNT)

    # Verify the request is ignored and not hang.

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -784,7 +784,7 @@ class VllmRunner:
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
-        self.model = LLM(
+        self.llm = LLM(
            model=model_name,
            task=task,
            tokenizer=tokenizer_name,
@@ -854,7 +854,7 @@ class VllmRunner:
                                 videos=videos,
                                 audios=audios)

-        req_outputs = self.model.generate(inputs,
+        req_outputs = self.llm.generate(inputs,
                                        sampling_params=sampling_params,
                                        **kwargs)

@@ -902,7 +902,7 @@ class VllmRunner:
                                 videos=videos,
                                 audios=audios)

-        req_outputs = self.model.generate(inputs,
+        req_outputs = self.llm.generate(inputs,
                                        sampling_params=sampling_params,
                                        **kwargs)

@@ -924,7 +924,7 @@ class VllmRunner:
        '''

        assert sampling_params.logprobs is not None
-        req_outputs = self.model.generate(encoder_decoder_prompts,
+        req_outputs = self.llm.generate(encoder_decoder_prompts,
                                        sampling_params=sampling_params)
        toks_str_logsprobs_prompt_logprobs = (
            self._final_steps_generate_w_logprobs(req_outputs))
@@ -1018,7 +1018,7 @@ class VllmRunner:
                                 videos=videos,
                                 audios=audios)

-        outputs = self.model.beam_search(
+        outputs = self.llm.beam_search(
            inputs,
            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
        returned_outputs = []
@@ -1029,7 +1029,7 @@ class VllmRunner:
        return returned_outputs

    def classify(self, prompts: list[str]) -> list[list[float]]:
-        req_outputs = self.model.classify(prompts)
+        req_outputs = self.llm.classify(prompts)
        return [req_output.outputs.probs for req_output in req_outputs]

    def embed(self,
@@ -1044,11 +1044,11 @@ class VllmRunner:
                                 videos=videos,
                                 audios=audios)

-        req_outputs = self.model.embed(inputs, *args, **kwargs)
+        req_outputs = self.llm.embed(inputs, *args, **kwargs)
        return [req_output.outputs.embedding for req_output in req_outputs]

    def encode(self, prompts: list[str]) -> list[list[float]]:
-        req_outputs = self.model.encode(prompts)
+        req_outputs = self.llm.encode(prompts)
        return [req_output.outputs.data for req_output in req_outputs]

    def score(
@@ -1058,18 +1058,18 @@ class VllmRunner:
        *args,
        **kwargs,
    ) -> list[float]:
-        req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
+        req_outputs = self.llm.score(text_1, text_2, *args, **kwargs)
        return [req_output.outputs.score for req_output in req_outputs]

    def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-        executor = self.model.llm_engine.model_executor
+        executor = self.llm.llm_engine.model_executor
        return executor.apply_model(func)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
-        del self.model
+        del self.llm
        cleanup_dist_env_and_memory()



--- a/tests/core/test_num_computed_tokens_update.py
+++ b/tests/core/test_num_computed_tokens_update.py
@@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
                        num_scheduler_steps=num_scheduler_steps,
                        enable_chunked_prefill=enable_chunked_prefill,
                        enforce_eager=enforce_eager)
-    engine: LLMEngine = runner.model.llm_engine
+    engine: LLMEngine = runner.llm.llm_engine

    # In multi-step + chunked-prefill there is no separate single prompt step.
    # What is scheduled will run for num_scheduler_steps always.

--- a/tests/detokenizer/test_stop_reason.py
+++ b/tests/detokenizer/test_stop_reason.py
@@ -28,7 +28,7 @@ def vllm_model(vllm_runner):
 def test_stop_reason(vllm_model, example_prompts):
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
    stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
-    llm = vllm_model.model
+    llm = vllm_model.llm

    # test stop token
    outputs = llm.generate(example_prompts,

--- a/tests/detokenizer/test_stop_strings.py
+++ b/tests/detokenizer/test_stop_strings.py
@@ -101,42 +101,42 @@ def _stop_token_id(llm):
 def test_stop_strings():
    # If V0, must set enforce_eager=False since we use
    # async output processing below.
-    vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)
+    llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1)

    if envs.VLLM_USE_V1:
-        _stop_basic(vllm_model)
+        _stop_basic(llm)
    else:
-        _set_async_mode(vllm_model, True)
-        _stop_basic(vllm_model)
+        _set_async_mode(llm, True)
+        _stop_basic(llm)

-        _set_async_mode(vllm_model, False)
-        _stop_basic(vllm_model)
+        _set_async_mode(llm, False)
+        _stop_basic(llm)

    if envs.VLLM_USE_V1:
-        _stop_multi_tokens(vllm_model)
+        _stop_multi_tokens(llm)
    else:
-        _set_async_mode(vllm_model, True)
-        _stop_multi_tokens(vllm_model)
+        _set_async_mode(llm, True)
+        _stop_multi_tokens(llm)

-        _set_async_mode(vllm_model, False)
-        _stop_multi_tokens(vllm_model)
+        _set_async_mode(llm, False)
+        _stop_multi_tokens(llm)

    if envs.VLLM_USE_V1:
-        _stop_partial_token(vllm_model)
+        _stop_partial_token(llm)
    else:
-        _set_async_mode(vllm_model, True)
-        _stop_partial_token(vllm_model)
+        _set_async_mode(llm, True)
+        _stop_partial_token(llm)

-        _set_async_mode(vllm_model, False)
-        _stop_partial_token(vllm_model)
+        _set_async_mode(llm, False)
+        _stop_partial_token(llm)

    if envs.VLLM_USE_V1:
        # FIXME: this does not respect include_in_output=False
-        # _stop_token_id(vllm_model)
+        # _stop_token_id(llm)
        pass
    else:
-        _set_async_mode(vllm_model, True)
-        _stop_token_id(vllm_model)
+        _set_async_mode(llm, True)
+        _stop_token_id(llm)

-        _set_async_mode(vllm_model, False)
-        _stop_token_id(vllm_model)
+        _set_async_mode(llm, False)
+        _stop_token_id(llm)