"vllm/v1/engine/utils.py" did not exist on "7f6c5ee06c4861ae1310f4ea5caaa2104efb4d22"
Unverified commit d9784107, authored by Ning Xie, committed by GitHub
Browse files

[Misc] unify variable for LLM instance (#20996)


Signed-off-by: Andy Xie <andy.xning@gmail.com>
parent e6b90a28
...@@ -186,7 +186,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, ...@@ -186,7 +186,7 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
loaded_vllm_model = LLM(model=model_ref, loaded_llm = LLM(model=model_ref,
load_format="tensorizer", load_format="tensorizer",
enable_lora=True, enable_lora=True,
enforce_eager=True, enforce_eager=True,
...@@ -198,13 +198,13 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, ...@@ -198,13 +198,13 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
tc_as_dict = tensorizer_config.to_serializable() tc_as_dict = tensorizer_config.to_serializable()
print("lora adapter created") print("lora adapter created")
assert do_sample(loaded_vllm_model, assert do_sample(loaded_llm,
sql_lora_files, sql_lora_files,
tensorizer_config_dict=tc_as_dict, tensorizer_config_dict=tc_as_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 1") print("lora 1")
assert do_sample(loaded_vllm_model, assert do_sample(loaded_llm,
sql_lora_files, sql_lora_files,
tensorizer_config_dict=tc_as_dict, tensorizer_config_dict=tc_as_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT lora_id=1) == EXPECTED_LORA_OUTPUT
...@@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens( ...@@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens(
dtype=dtype, dtype=dtype,
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model: gpu_memory_utilization=0.4) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
prompt_token_counts = [ prompt_token_counts = [
len(tokenizer.encode(p)) for p in example_prompts len(tokenizer.encode(p)) for p in example_prompts
] ]
...@@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens( ...@@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens(
vllm_prompt_token_count = sum(prompt_token_counts) vllm_prompt_token_count = sum(prompt_token_counts)
_ = vllm_model.generate_greedy(example_prompts, max_tokens) _ = vllm_model.generate_greedy(example_prompts, max_tokens)
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_prompt_tokens.labels( metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
**stat_logger.labels)._value.get() **stat_logger.labels)._value.get()
...@@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens( ...@@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens(
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.4) as vllm_model: gpu_memory_utilization=0.4) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels( metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get() **stat_logger.labels)._value.get()
vllm_generation_count = 0 vllm_generation_count = 0
...@@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step( ...@@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step(
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels( metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get() **stat_logger.labels)._value.get()
vllm_generation_count = 0 vllm_generation_count = 0
...@@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, ...@@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
disable_log_stats=False, disable_log_stats=False,
gpu_memory_utilization=0.3, gpu_memory_utilization=0.3,
served_model_name=served_model_name) as vllm_model: served_model_name=served_model_name) as vllm_model:
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus']
metrics_tag_content = stat_logger.labels["model_name"] metrics_tag_content = stat_logger.labels["model_name"]
if envs.VLLM_CI_USE_S3: if envs.VLLM_CI_USE_S3:
......
...@@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner): ...@@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that" output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n") " dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file # asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512 assert model_config.encoder_config["max_seq_length"] == 512
...@@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner): ...@@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that" output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n") " dreams for the first time.\n")
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file # asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512 assert model_config.encoder_config["max_seq_length"] == 512
...@@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): ...@@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner):
output = vllm_model.embed("Write a short story about a robot that" output = vllm_model.embed("Write a short story about a robot that"
" dreams for the first time.\n") " dreams for the first time.\n")
model_tokenizer = vllm_model.model.llm_engine.tokenizer model_tokenizer = vllm_model.llm.llm_engine.tokenizer
assert model_tokenizer.tokenizer_id == model_name assert model_tokenizer.tokenizer_id == model_name
def check_model(model): def check_model(model):
......
...@@ -274,7 +274,7 @@ def test_models_preemption_recompute( ...@@ -274,7 +274,7 @@ def test_models_preemption_recompute(
Tests that outputs are identical with and w/o preemptions (recompute). Tests that outputs are identical with and w/o preemptions (recompute).
""" """
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
scheduler = vllm_model.model.llm_engine.scheduler[0] scheduler = vllm_model.llm.llm_engine.scheduler[0]
scheduler.ENABLE_ARTIFICIAL_PREEMPT = True scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy( preempt_vllm_outputs = vllm_model.generate_greedy(
example_prompts, max_tokens) example_prompts, max_tokens)
......
...@@ -238,7 +238,7 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, ...@@ -238,7 +238,7 @@ def test_mistral_symbolic_languages(vllm_runner, model: str,
load_format="mistral") as vllm_model: load_format="mistral") as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS: for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt} msg = {"role": "user", "content": prompt}
outputs = vllm_model.model.chat([msg], outputs = vllm_model.llm.chat([msg],
sampling_params=SAMPLING_PARAMS) sampling_params=SAMPLING_PARAMS)
assert "�" not in outputs[0].outputs[0].text.strip() assert "�" not in outputs[0].outputs[0].text.strip()
...@@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: ...@@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
load_format="mistral") as vllm_model: load_format="mistral") as vllm_model:
msgs = copy.deepcopy(MSGS) msgs = copy.deepcopy(MSGS)
outputs = vllm_model.model.chat(msgs, outputs = vllm_model.llm.chat(msgs,
tools=TOOLS, tools=TOOLS,
sampling_params=SAMPLING_PARAMS) sampling_params=SAMPLING_PARAMS)
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
tool_parser = MistralToolParser(tokenizer) tool_parser = MistralToolParser(tokenizer)
model_output = outputs[0].outputs[0].text.strip() model_output = outputs[0].outputs[0].text.strip()
...@@ -308,7 +308,7 @@ def test_mistral_guided_decoding( ...@@ -308,7 +308,7 @@ def test_mistral_guided_decoding(
f"Give an example JSON for an employee profile that " f"Give an example JSON for an employee profile that "
f"fits this schema: {SAMPLE_JSON_SCHEMA}" f"fits this schema: {SAMPLE_JSON_SCHEMA}"
}] }]
outputs = vllm_model.model.chat(messages, sampling_params=params) outputs = vllm_model.llm.chat(messages, sampling_params=params)
generated_text = outputs[0].outputs[0].text generated_text = outputs[0].outputs[0].text
json_response = json.loads(generated_text) json_response = json.loads(generated_text)
......
...@@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder): ...@@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder):
def __init__(self, vllm_model): def __init__(self, vllm_model):
super().__init__() super().__init__()
self.model = vllm_model self.llm = vllm_model
self.rng = np.random.default_rng(seed=42) self.rng = np.random.default_rng(seed=42)
def encode( def encode(
...@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder): ...@@ -43,7 +43,7 @@ class VllmMtebEncoder(mteb.Encoder):
# issues by randomizing the order. # issues by randomizing the order.
r = self.rng.permutation(len(sentences)) r = self.rng.permutation(len(sentences))
sentences = [sentences[i] for i in r] sentences = [sentences[i] for i in r]
outputs = self.model.embed(sentences, use_tqdm=False) outputs = self.llm.embed(sentences, use_tqdm=False)
embeds = np.array(outputs) embeds = np.array(outputs)
embeds = embeds[np.argsort(r)] embeds = embeds[np.argsort(r)]
return embeds return embeds
...@@ -61,7 +61,7 @@ class VllmMtebEncoder(mteb.Encoder): ...@@ -61,7 +61,7 @@ class VllmMtebEncoder(mteb.Encoder):
queries = [s[0] for s in sentences] queries = [s[0] for s in sentences]
corpus = [s[1] for s in sentences] corpus = [s[1] for s in sentences]
outputs = self.model.score(queries, outputs = self.llm.score(queries,
corpus, corpus,
truncate_prompt_tokens=-1, truncate_prompt_tokens=-1,
use_tqdm=False) use_tqdm=False)
...@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner, ...@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
if model_info.architecture: if model_info.architecture:
assert (model_info.architecture assert (model_info.architecture
in vllm_model.model.llm_engine.model_config.architectures) in vllm_model.llm.llm_engine.model_config.architectures)
vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
MTEB_EMBED_TASKS) MTEB_EMBED_TASKS)
vllm_dtype = vllm_model.model.llm_engine.model_config.dtype vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
with hf_runner(model_info.name, with hf_runner(model_info.name,
is_sentence_transformer=True, is_sentence_transformer=True,
...@@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner, ...@@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner,
max_num_seqs=8, max_num_seqs=8,
**vllm_extra_kwargs) as vllm_model: **vllm_extra_kwargs) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
if model_info.architecture: if model_info.architecture:
assert (model_info.architecture in model_config.architectures) assert (model_info.architecture in model_config.architectures)
......
...@@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner): ...@@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner):
task="embed", task="embed",
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
) as vllm_model: ) as vllm_model:
llm = vllm_model.model llm = vllm_model.llm
d_rep = run_llm_encode( d_rep = run_llm_encode(
llm, llm,
...@@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): ...@@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
task="generate", task="generate",
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
) as vllm_model: ) as vllm_model:
llm = vllm_model.model llm = vllm_model.llm
sampling_params = SamplingParams(temperature=0.0, max_tokens=256) sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params) outputs = llm.generate(input, sampling_params=sampling_params)
......
...@@ -87,10 +87,10 @@ def test_matryoshka( ...@@ -87,10 +87,10 @@ def test_matryoshka(
task="embed", task="embed",
dtype=dtype, dtype=dtype,
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
assert vllm_model.model.llm_engine.model_config.is_matryoshka assert vllm_model.llm.llm_engine.model_config.is_matryoshka
matryoshka_dimensions = ( matryoshka_dimensions = (
vllm_model.model.llm_engine.model_config.matryoshka_dimensions) vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
assert matryoshka_dimensions is not None assert matryoshka_dimensions is not None
if dimensions not in matryoshka_dimensions: if dimensions not in matryoshka_dimensions:
......
...@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor) ...@@ -23,7 +23,7 @@ max_model_len = int(original_max_position_embeddings * factor)
def test_default(model_info, vllm_runner): def test_default(model_info, vllm_runner):
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, task="embed",
max_model_len=None) as vllm_model: max_model_len=None) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
# For nomic-embed-text-v2-moe the length is set to 512 # For nomic-embed-text-v2-moe the length is set to 512
# by sentence_bert_config.json. # by sentence_bert_config.json.
...@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): ...@@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
# set max_model_len <= 512 # set max_model_len <= 512
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, task="embed",
max_model_len=256) as vllm_model: max_model_len=256) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 256 assert model_config.max_model_len == 256
# set 512 < max_model_len <= 2048 # set 512 < max_model_len <= 2048
...@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): ...@@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
else: else:
with vllm_runner(model_info.name, task="embed", with vllm_runner(model_info.name, task="embed",
max_model_len=1024) as vllm_model: max_model_len=1024) as vllm_model:
model_config = vllm_model.model.llm_engine.model_config model_config = vllm_model.llm.llm_engine.model_config
assert model_config.max_model_len == 1024 assert model_config.max_model_len == 1024
......
...@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner, ...@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,
with vllm_runner(model_name, task="embed", with vllm_runner(model_name, task="embed",
max_model_len=max_model_len) as vllm_model: max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.model.encode( vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens) input_str, truncate_prompt_tokens=truncate_prompt_tokens)
prompt_tokens = vllm_output[0].prompt_token_ids prompt_tokens = vllm_output[0].prompt_token_ids
...@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner, ...@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,
with vllm_runner(model_name, task="embed", with vllm_runner(model_name, task="embed",
max_model_len=max_model_len) as vllm_model: max_model_len=max_model_len) as vllm_model:
vllm_output = vllm_model.model.encode( vllm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens) input_str, truncate_prompt_tokens=truncate_prompt_tokens)
prompt_tokens = vllm_output[0].prompt_token_ids prompt_tokens = vllm_output[0].prompt_token_ids
...@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner, ...@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
model_name, task="embed", model_name, task="embed",
max_model_len=max_model_len) as vllm_model: max_model_len=max_model_len) as vllm_model:
llm_output = vllm_model.model.encode( llm_output = vllm_model.llm.encode(
input_str, truncate_prompt_tokens=truncate_prompt_tokens) input_str, truncate_prompt_tokens=truncate_prompt_tokens)
assert llm_output == f"""truncate_prompt_tokens value assert llm_output == f"""truncate_prompt_tokens value
......
...@@ -180,8 +180,7 @@ def test_chat( ...@@ -180,8 +180,7 @@ def test_chat(
) as vllm_model: ) as vllm_model:
outputs = [] outputs = []
for msg in MSGS: for msg in MSGS:
output = vllm_model.model.chat(msg, output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)
sampling_params=SAMPLING_PARAMS)
outputs.extend(output) outputs.extend(output)
...@@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt, ...@@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
max_model_len=8192, max_model_len=8192,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model: ) as vllm_model:
outputs = vllm_model.model.generate(prompt) outputs = vllm_model.llm.generate(prompt)
assert len(outputs) == 1, f"{len(outputs)=}" assert len(outputs) == 1, f"{len(outputs)=}"
output: RequestOutput = outputs[0] output: RequestOutput = outputs[0]
......
...@@ -106,7 +106,7 @@ def run_test( ...@@ -106,7 +106,7 @@ def run_test(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
) as vllm_model: ) as vllm_model:
llm = vllm_model.model llm = vllm_model.llm
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, temperature=0,
......
...@@ -85,7 +85,7 @@ def run_test( ...@@ -85,7 +85,7 @@ def run_test(
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
task=task, task=task,
**vllm_runner_kwargs_) as vllm_model: **vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {} vllm_kwargs: dict[str, Any] = {}
if get_stop_token_ids is not None: if get_stop_token_ids is not None:
......
...@@ -96,7 +96,7 @@ def _run_test( ...@@ -96,7 +96,7 @@ def _run_test(
dtype=dtype, dtype=dtype,
enforce_eager=True, enforce_eager=True,
max_model_len=8192) as vllm_model: max_model_len=8192) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer() tokenizer = vllm_model.llm.get_tokenizer()
texts = [ texts = [
# this is necessary because vllm_model.embed will not apply any # this is necessary because vllm_model.embed will not apply any
# templating to the prompt, and therefore lacks an image_pad # templating to the prompt, and therefore lacks an image_pad
......
...@@ -56,7 +56,7 @@ def vllm_reranker( ...@@ -56,7 +56,7 @@ def vllm_reranker(
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt, limit_mm_per_prompt=limit_mm_per_prompt,
) as vllm_model: ) as vllm_model:
outputs = vllm_model.model.score(query, documents) outputs = vllm_model.llm.score(query, documents)
return [output.outputs.score for output in outputs] return [output.outputs.score for output in outputs]
......
...@@ -45,7 +45,7 @@ EXPECTED_STRS_MAP = { ...@@ -45,7 +45,7 @@ EXPECTED_STRS_MAP = {
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None: def test_models(example_prompts, model_name) -> None:
model = LLM( llm = LLM(
model=model_name, model=model_name,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
trust_remote_code=True, trust_remote_code=True,
...@@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None: ...@@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision, # Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way. # since the expected strs were generated this way.
for prompt in formatted_prompts: for prompt in formatted_prompts:
outputs = model.generate(prompt, params) outputs = llm.generate(prompt, params)
generations.append(outputs[0].outputs[0].text) generations.append(outputs[0].outputs[0].text)
del model del llm
print(model_name, generations) print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name] expected_strs = EXPECTED_STRS_MAP[model_name]
......
...@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = { ...@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
reason="modelopt_fp4 is not supported on this GPU type.") reason="modelopt_fp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("model_name", MODELS)
def test_models(example_prompts, model_name) -> None: def test_models(example_prompts, model_name) -> None:
model = LLM( llm = LLM(
model=model_name, model=model_name,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
trust_remote_code=True, trust_remote_code=True,
...@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None: ...@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
# Note: these need to be run 1 at a time due to numerical precision, # Note: these need to be run 1 at a time due to numerical precision,
# since the expected strs were generated this way. # since the expected strs were generated this way.
for prompt in formatted_prompts: for prompt in formatted_prompts:
outputs = model.generate(prompt, params) outputs = llm.generate(prompt, params)
generations.append(outputs[0].outputs[0].text) generations.append(outputs[0].outputs[0].text)
del model del llm
print(model_name, generations) print(model_name, generations)
expected_strs = EXPECTED_STRS_MAP[model_name] expected_strs = EXPECTED_STRS_MAP[model_name]
......
...@@ -25,25 +25,25 @@ MODEL_LEN_LEN = [ ...@@ -25,25 +25,25 @@ MODEL_LEN_LEN = [
@pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN)
def test_disable_sliding_window(model_len_len, ): def test_disable_sliding_window(model_len_len, ):
model, sliding_len, full_len = model_len_len model, sliding_len, full_len = model_len_len
vllm_disabled_model = LLM(model, disable_sliding_window=True) disabled_llm = LLM(model, disable_sliding_window=True)
vllm_disabled_model.generate("Hi my name is") disabled_llm.generate("Hi my name is")
model_config = vllm_disabled_model.llm_engine.model_config model_config = disabled_llm.llm_engine.model_config
assert model_config.max_model_len == sliding_len, ( assert model_config.max_model_len == sliding_len, (
"Max len expected to equal sliding_len of %s, but got %s", sliding_len, "Max len expected to equal sliding_len of %s, but got %s", sliding_len,
model_config.max_model_len) model_config.max_model_len)
del vllm_disabled_model del disabled_llm
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
vllm_enabled_model = LLM(model, enabled_llm = LLM(model,
enforce_eager=True, enforce_eager=True,
disable_sliding_window=False, disable_sliding_window=False,
enable_prefix_caching=False) enable_prefix_caching=False)
vllm_enabled_model.generate("Hi my name is") enabled_llm.generate("Hi my name is")
model_config = vllm_enabled_model.llm_engine.model_config model_config = enabled_llm.llm_engine.model_config
assert model_config.max_model_len == full_len, ( assert model_config.max_model_len == full_len, (
"Max len expected to equal full_len of %s, but got %s", full_len, "Max len expected to equal full_len of %s, but got %s", full_len,
model_config.max_model_len) model_config.max_model_len)
del vllm_enabled_model del enabled_llm
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
...@@ -93,7 +93,7 @@ def test_mixed_requests( ...@@ -93,7 +93,7 @@ def test_mixed_requests(
# Run all the promopts # Run all the promopts
greedy_params = SamplingParams(temperature=0.0, greedy_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens) max_tokens=max_tokens)
req_outputs = vllm_model.model.generate(example_prompts, req_outputs = vllm_model.llm.generate(example_prompts,
greedy_params) greedy_params)
# Verify number of cached tokens # Verify number of cached tokens
...@@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model): ...@@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model):
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
max_num_seqs=max_num_batched_tokens, max_num_seqs=max_num_batched_tokens,
) )
engine: LLMEngine = runner.model.llm_engine engine: LLMEngine = runner.llm.llm_engine
scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore
engine.scheduler[0] = scheduler engine.scheduler[0] = scheduler
......
...@@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, ...@@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool,
linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else (
GPTQLinearMethod) GPTQLinearMethod)
for name, submodule in (vllm_model.model.llm_engine.model_executor. for name, submodule in (vllm_model.llm.llm_engine.model_executor.
driver_worker.model_runner.model.named_modules()): driver_worker.model_runner.model.named_modules()):
if name == "lm_head": if name == "lm_head":
assert isinstance(submodule.quant_method, linear_method_cls) assert isinstance(submodule.quant_method, linear_method_cls)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment