"vscode:/vscode.git/clone" did not exist on "80e71eb8ef551b35eaeef3355c9ecc4e8b31c680"
Unverified Commit d9784107 authored by Ning Xie's avatar Ning Xie Committed by GitHub
Browse files

[Misc] unify variable for LLM instance (#20996)


Signed-off-by: default avatarAndy Xie <andy.xning@gmail.com>
parent e6b90a28
......@@ -186,25 +186,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
loaded_vllm_model = LLM(model=model_ref,
load_format="tensorizer",
enable_lora=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config,
max_num_seqs=13,
tensor_parallel_size=2,
max_loras=2)
loaded_llm = LLM(model=model_ref,
load_format="tensorizer",
enable_lora=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config,
max_num_seqs=13,
tensor_parallel_size=2,
max_loras=2)
tc_as_dict = tensorizer_config.to_serializable()
print("lora adapter created")
assert do_sample(loaded_vllm_model,
assert do_sample(loaded_llm,
sql_lora_files,
tensorizer_config_dict=tc_as_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 1")
assert do_sample(loaded_vllm_model,
assert do_sample(loaded_llm,
sql_lora_files,
tensorizer_config_dict=tc_as_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT
......@@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens(
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
prompt_token_counts = [
len(tokenizer.encode(p)) for p in example_prompts
]
......@@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens(
vllm_prompt_token_count = sum(prompt_token_counts)
_ = vllm_model.generate_greedy(example_prompts, max_tokens)
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get()
......@@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens(
disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
......@@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step(
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
......@@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
disable_log_stats=False,
gpu_memory_utilization=0.3,
served_model_name=served_model_name) as vllm_model:
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metrics_tag_content = stat_logger.labels["model_name"]
if envs.VLLM_CI_USE_S3:
......
......@@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512
......@@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512
......@@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n")
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
assert model_tokenizer.tokenizer_id == model_name
def check_model(model):
......
......@@ -274,7 +274,7 @@ def test_models_preemption_recompute(
Tests that outputs are identical with and w/o preemptions (recompute).
"""
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
scheduler = vllm_model.model.llm_engine.scheduler[0]
scheduler = vllm_model.llm.llm_engine.scheduler[0]
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens)
......
......@@ -238,8 +238,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
load_format="mistral") as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt}
outputs = vllm_model.model.chat([msg],
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat([msg],
sampling_params=SAMPLING_PARAMS)
assert "�" not in outputs[0].outputs[0].text.strip()
......@@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
load_format="mistral") as vllm_model:
msgs = copy.deepcopy(MSGS)
outputs = vllm_model.model.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
outputs = vllm_model.llm.chat(msgs,
tools=TOOLS,
sampling_params=SAMPLING_PARAMS)
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer)
model_output = outputs[0].outputs[0].text.strip()
......@@ -308,7 +308,7 @@ def test_mistral_guided_decoding(
f"Give an example JSON for an employee profile that "
f"fits this schema: {SAMPLE_JSON_SCHEMA}"
}]
outputs = vllm_model.model.chat(messages, sampling_params=params)
outputs = vllm_model.llm.chat(messages, sampling_params=params)
generated_text = outputs[0].outputs[0].text
json_response = json.loads(generated_text)
......
......@@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model):
super().__init__()
self.model = vllm_model
self.llm = vllm_model
self.rng = np.random.default_rng(seed=42)
def encode(
......@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
# issues by randomizing the order.
r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r]
outputs = self.model.embed(sentences, use_tqdm=False)
outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs)
embeds = embeds[np.argsort(r)]
return embeds
......@@ -61,10 +61,10 @@ class VllmMtebEncoder(mteb.Encoder):
queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences]
outputs = self.model.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
outputs = self.llm.score(queries,
corpus,
truncate_prompt_tokens=-1,
use_tqdm=False)
scores = np.array(outputs)
scores = scores[np.argsort(r)]
return scores
......@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
if model_info.architecture:
assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures)
in vllm_model.llm.llm_engine.model_config.architectures)
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
with hf_runner(model_info.name,
is_sentence_transformer=True,
......@@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner,
max_num_seqs=8,
**vllm_extra_kwargs) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
if model_info.architecture:
assert (model_info.architecture in model_config.architectures)
......
......@@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner):
task="embed",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
d_rep = run_llm_encode(
llm,
......@@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
task="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params)
......
......@@ -87,10 +87,10 @@ def test_matryoshka(
task="embed",
dtype=dtype,
max_model_len=None) as vllm_model:
assert vllm_model.model.llm_engine.model_config.is_matryoshka
assert vllm_model.llm.llm_engine.model_config.is_matryoshka
matryoshka_dimensions = (
vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
assert matryoshka_dimensions is not None
if dimensions not in matryoshka_dimensions:
......
......@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, task="embed",
max_model_len=None) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json.
......@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512
with vllm_runner(model_info.name, task="embed",
max_model_len=256) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256
# set 512 < max_model_len <= 2048
......@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
else:
with vllm_runner(model_info.name, task="embed",
max_model_len=1024) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config
model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024
......
......@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,
with vllm_runner(model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.model.encode(
vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
prompt_tokens = vllm_output[0].prompt_token_ids
......@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,
with vllm_runner(model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.model.encode(
vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
prompt_tokens = vllm_output[0].prompt_token_ids
......@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
model_name, task="embed",
max_model_len=max_model_len) as vllm_model:
llm_output = vllm_model.model.encode(
llm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens)
assert llm_output == f"""truncate_prompt_tokens value
......
......@@ -180,8 +180,7 @@ def test_chat(
) as vllm_model:
outputs = []
for msg in MSGS:
output = vllm_model.model.chat(msg,
sampling_params=SAMPLING_PARAMS)
output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
outputs.extend(output)
......@@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
max_model_len=8192,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = vllm_model.model.generate(prompt)
outputs = vllm_model.llm.generate(prompt)
assert len(outputs) == 1, f"{len(outputs)=}"
output: RequestOutput = outputs[0]
......
......@@ -106,7 +106,7 @@ def run_test(
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.model
llm = vllm_model.llm
sampling_params = SamplingParams(
temperature=0,
......
......@@ -85,7 +85,7 @@ def run_test(
enforce_eager=enforce_eager,
task=task,
**vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
if get_stop_token_ids is not None:
......
......@@ -96,7 +96,7 @@ def _run_test(
dtype=dtype,
enforce_eager=True,
max_model_len=8192) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
tokenizer = vllm_model.llm.get_tokenizer()
texts = [
# this is necessary because vllm_model.embed will not apply any
# templating to the prompt, and therefore lacks an image_pad
......
......@@ -56,7 +56,7 @@ def vllm_reranker(
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
) as vllm_model:
outputs = vllm_model.model.score(query, documents)
outputs = vllm_model.llm.score(query, documents)
return [output.outputs.score for output in outputs]
......
......@@ -45,7 +45,7 @@ EXPECTED_STRS_MAP = {
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
model = LLM(
llm = LLM(
model=model_name,
max_model_len=MAX_MODEL_LEN,
trust_remote_code=True,
......@@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:
outputs = model.generate(prompt, params)
outputs = llm.generate(prompt, params)
generations.append(outputs[0].outputs[0].text)
del model
del llm
print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name]
......
......@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
reason="modelopt_fp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None:
model = LLM(
llm = LLM(
model=model_name,
max_model_len=MAX_MODEL_LEN,
trust_remote_code=True,
......@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way.
for prompt in formatted_prompts:
outputs = model.generate(prompt, params)
outputs = llm.generate(prompt, params)
generations.append(outputs[0].outputs[0].text)
del model
del llm
print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name]
......
......@@ -25,25 +25,25 @@ MODEL_LEN_LEN = [
@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
def test_disable_sliding_window(model_len_len, ):
model, sliding_len, full_len = model_len_len
vllm_disabled_model = LLM(model, disable_sliding_window=True)
vllm_disabled_model.generate("Hi my name is")
model_config = vllm_disabled_model.llm_engine.model_config
disabled_llm = LLM(model, disable_sliding_window=True)
disabled_llm.generate("Hi my name is")
model_config = disabled_llm.llm_engine.model_config
assert model_config.max_model_len == sliding_len, (
"Max len expected to equal sliding_len of %s, but got %s", sliding_len,
model_config.max_model_len)
del vllm_disabled_model
del disabled_llm
cleanup_dist_env_and_memory()
vllm_enabled_model = LLM(model,
enforce_eager=True,
disable_sliding_window=False,
enable_prefix_caching=False)
vllm_enabled_model.generate("Hi my name is")
model_config = vllm_enabled_model.llm_engine.model_config
enabled_llm = LLM(model,
enforce_eager=True,
disable_sliding_window=False,
enable_prefix_caching=False)
enabled_llm.generate("Hi my name is")
model_config = enabled_llm.llm_engine.model_config
assert model_config.max_model_len == full_len, (
"Max len expected to equal full_len of %s, but got %s", full_len,
model_config.max_model_len)
del vllm_enabled_model
del enabled_llm
cleanup_dist_env_and_memory()
......@@ -93,8 +93,8 @@ def test_mixed_requests(
# Run all the promopts
greedy_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts,
greedy_params)
req_outputs = vllm_model.llm.generate(example_prompts,
greedy_params)
# Verify number of cached tokens
for i in range(len(req_outputs)):
......@@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model):
max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_batched_tokens,
)
engine: LLMEngine = runner.model.llm_engine
engine: LLMEngine = runner.llm.llm_engine
scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore
engine.scheduler[0] = scheduler
......
......@@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
GPTQLinearMethod)
for name, submodule in (vllm_model.model.llm_engine.model_executor.
for name, submodule in (vllm_model.llm.llm_engine.model_executor.
driver_worker.model_runner.model.named_modules()):
if name == "lm_head":
assert isinstance(submodule.quant_method, linear_method_cls)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment